1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/pci-p2pdma.h> 36 #include <linux/apple-gmux.h> 37 38 #include <drm/drm_aperture.h> 39 #include <drm/drm_atomic_helper.h> 40 #include <drm/drm_crtc_helper.h> 41 #include <drm/drm_fb_helper.h> 42 #include <drm/drm_probe_helper.h> 43 #include <drm/amdgpu_drm.h> 44 #include <linux/device.h> 45 #include <linux/vgaarb.h> 46 #include <linux/vga_switcheroo.h> 47 #include <linux/efi.h> 48 #include "amdgpu.h" 49 #include "amdgpu_trace.h" 50 #include "amdgpu_i2c.h" 51 #include "atom.h" 52 #include "amdgpu_atombios.h" 53 #include "amdgpu_atomfirmware.h" 54 #include "amd_pcie.h" 55 #ifdef CONFIG_DRM_AMDGPU_SI 56 #include "si.h" 57 #endif 58 #ifdef CONFIG_DRM_AMDGPU_CIK 59 #include "cik.h" 60 #endif 61 #include "vi.h" 62 #include "soc15.h" 63 #include "nv.h" 64 #include "bif/bif_4_1_d.h" 65 #include <linux/firmware.h> 66 #include "amdgpu_vf_error.h" 67 68 #include "amdgpu_amdkfd.h" 69 #include "amdgpu_pm.h" 70 71 #include "amdgpu_xgmi.h" 72 #include "amdgpu_ras.h" 73 #include "amdgpu_pmu.h" 74 #include "amdgpu_fru_eeprom.h" 75 #include "amdgpu_reset.h" 76 #include "amdgpu_virt.h" 77 #include "amdgpu_dev_coredump.h" 78 79 #include <linux/suspend.h> 80 #include <drm/task_barrier.h> 81 #include <linux/pm_runtime.h> 82 83 #include <drm/drm_drv.h> 84 85 #if IS_ENABLED(CONFIG_X86) 86 #include <asm/intel-family.h> 87 #endif 88 89 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 96 97 #define AMDGPU_RESUME_MS 2000 98 #define AMDGPU_MAX_RETRY_LIMIT 2 99 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 100 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 101 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 102 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 103 104 static const 
struct drm_driver amdgpu_kms_driver; 105 106 const char *amdgpu_asic_name[] = { 107 "TAHITI", 108 "PITCAIRN", 109 "VERDE", 110 "OLAND", 111 "HAINAN", 112 "BONAIRE", 113 "KAVERI", 114 "KABINI", 115 "HAWAII", 116 "MULLINS", 117 "TOPAZ", 118 "TONGA", 119 "FIJI", 120 "CARRIZO", 121 "STONEY", 122 "POLARIS10", 123 "POLARIS11", 124 "POLARIS12", 125 "VEGAM", 126 "VEGA10", 127 "VEGA12", 128 "VEGA20", 129 "RAVEN", 130 "ARCTURUS", 131 "RENOIR", 132 "ALDEBARAN", 133 "NAVI10", 134 "CYAN_SKILLFISH", 135 "NAVI14", 136 "NAVI12", 137 "SIENNA_CICHLID", 138 "NAVY_FLOUNDER", 139 "VANGOGH", 140 "DIMGREY_CAVEFISH", 141 "BEIGE_GOBY", 142 "YELLOW_CARP", 143 "IP DISCOVERY", 144 "LAST", 145 }; 146 147 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 148 149 /** 150 * DOC: pcie_replay_count 151 * 152 * The amdgpu driver provides a sysfs API for reporting the total number 153 * of PCIe replays (NAKs) 154 * The file pcie_replay_count is used for this and returns the total 155 * number of replays as a sum of the NAKs generated and NAKs received 156 */ 157 158 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 159 struct device_attribute *attr, char *buf) 160 { 161 struct drm_device *ddev = dev_get_drvdata(dev); 162 struct amdgpu_device *adev = drm_to_adev(ddev); 163 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 164 165 return sysfs_emit(buf, "%llu\n", cnt); 166 } 167 168 static DEVICE_ATTR(pcie_replay_count, 0444, 169 amdgpu_device_get_pcie_replay_count, NULL); 170 171 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 172 struct bin_attribute *attr, char *buf, 173 loff_t ppos, size_t count) 174 { 175 struct device *dev = kobj_to_dev(kobj); 176 struct drm_device *ddev = dev_get_drvdata(dev); 177 struct amdgpu_device *adev = drm_to_adev(ddev); 178 ssize_t bytes_read; 179 180 switch (ppos) { 181 case AMDGPU_SYS_REG_STATE_XGMI: 182 bytes_read = amdgpu_asic_get_reg_state( 183 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 184 break; 185 case AMDGPU_SYS_REG_STATE_WAFL: 186 bytes_read = amdgpu_asic_get_reg_state( 187 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 188 break; 189 case AMDGPU_SYS_REG_STATE_PCIE: 190 bytes_read = amdgpu_asic_get_reg_state( 191 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 192 break; 193 case AMDGPU_SYS_REG_STATE_USR: 194 bytes_read = amdgpu_asic_get_reg_state( 195 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 196 break; 197 case AMDGPU_SYS_REG_STATE_USR_1: 198 bytes_read = amdgpu_asic_get_reg_state( 199 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 200 break; 201 default: 202 return -EINVAL; 203 } 204 205 return bytes_read; 206 } 207 208 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 209 AMDGPU_SYS_REG_STATE_END); 210 211 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 212 { 213 int ret; 214 215 if (!amdgpu_asic_get_reg_state_supported(adev)) 216 return 0; 217 218 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 219 220 return ret; 221 } 222 223 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 224 { 225 if (!amdgpu_asic_get_reg_state_supported(adev)) 226 return; 227 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 228 } 229 230 /** 231 * DOC: board_info 232 * 233 * The amdgpu driver provides a sysfs API for giving board related information. 
 * It provides the form factor information in the format
 *
 * type : form factor
 *
 * Possible form factor values
 *
 * - "cem" - PCIE CEM card
 * - "oam" - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
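
/*
 * Illustrative note (not part of the upstream file): when the board_info
 * attribute is visible (dGPU only), reading it from user space returns a
 * single "type : <form factor>" line built by amdgpu_device_get_board_info()
 * above, e.g. (the exact sysfs path is an assumption):
 *
 *   $ cat /sys/bus/pci/devices/<bdf>/board_info
 *   type : oam
 */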

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported);
 * otherwise return 0.
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry)
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
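
/*
 * Illustrative summary (not part of the upstream file): the amdgpu_runtime_pm
 * module parameter selects the runtime power-management mode roughly as
 *
 *    2    -> force BAMACO, falling back to BACO if MACO is not supported
 *    1    -> force BACO when available
 *   -1/-2 -> auto-detect: ATPX (PX) first, then BOCO, then BACO/BAMACO
 *    0    -> runtime pm disabled
 *
 * Callers can then inspect the result, e.g.:
 *
 *   if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
 *           dev_dbg(adev->dev, "no runtime pm mode selected\n");
 */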
/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}
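
/*
 * Illustrative note (not part of the upstream file): MM_INDEX/MM_DATA form an
 * indirect window into VRAM. The low 31 bits of the byte offset go into
 * MM_INDEX (with bit 31 selecting the MM aperture, hence the 0x80000000 OR
 * above), the remaining high bits go into MM_INDEX_HI, and each access to
 * MM_DATA then transfers one dword at that offset. For example, a single
 * dword read at VRAM offset 0x1000 amounts to roughly:
 *
 *   WREG32_NO_KIQ(mmMM_INDEX, 0x1000 | 0x80000000);
 *   val = RREG32_NO_KIQ(mmMM_DATA);
 */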

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}
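
/*
 * Illustrative usage (not part of the upstream file): copying a few dwords out
 * of VRAM. The aperture fast path covers whatever is CPU visible and the
 * MM_INDEX/MM_DATA path covers the remainder; offsets and sizes must stay
 * dword aligned for the MM fallback.
 *
 *   u32 tmp[4];
 *
 *   amdgpu_device_vram_access(adev, 0, tmp, sizeof(tmp), false);
 */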

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: bytes offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: bytes offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
712 */ 713 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 714 { 715 if (amdgpu_device_skip_hw_access(adev)) 716 return; 717 718 if (offset < adev->rmmio_size) 719 writeb(value, adev->rmmio + offset); 720 else 721 BUG(); 722 } 723 724 /** 725 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 726 * 727 * @adev: amdgpu_device pointer 728 * @reg: dword aligned register offset 729 * @v: 32 bit value to write to the register 730 * @acc_flags: access flags which require special behavior 731 * 732 * Writes the value specified to the offset specified. 733 */ 734 void amdgpu_device_wreg(struct amdgpu_device *adev, 735 uint32_t reg, uint32_t v, 736 uint32_t acc_flags) 737 { 738 if (amdgpu_device_skip_hw_access(adev)) 739 return; 740 741 if ((reg * 4) < adev->rmmio_size) { 742 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 743 amdgpu_sriov_runtime(adev) && 744 down_read_trylock(&adev->reset_domain->sem)) { 745 amdgpu_kiq_wreg(adev, reg, v, 0); 746 up_read(&adev->reset_domain->sem); 747 } else { 748 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 749 } 750 } else { 751 adev->pcie_wreg(adev, reg * 4, v); 752 } 753 754 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 755 } 756 757 /** 758 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 759 * 760 * @adev: amdgpu_device pointer 761 * @reg: mmio/rlc register 762 * @v: value to write 763 * @xcc_id: xcc accelerated compute core id 764 * 765 * this function is invoked only for the debugfs register access 766 */ 767 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 768 uint32_t reg, uint32_t v, 769 uint32_t xcc_id) 770 { 771 if (amdgpu_device_skip_hw_access(adev)) 772 return; 773 774 if (amdgpu_sriov_fullaccess(adev) && 775 adev->gfx.rlc.funcs && 776 adev->gfx.rlc.funcs->is_rlcg_access_range) { 777 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 778 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 779 } else if ((reg * 4) >= adev->rmmio_size) { 780 adev->pcie_wreg(adev, reg * 4, v); 781 } else { 782 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 783 } 784 } 785 786 /** 787 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 788 * 789 * @adev: amdgpu_device pointer 790 * @reg: dword aligned register offset 791 * @v: 32 bit value to write to the register 792 * @acc_flags: access flags which require special behavior 793 * @xcc_id: xcc accelerated compute core id 794 * 795 * Writes the value specified to the offset specified. 
796 */ 797 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 798 uint32_t reg, uint32_t v, 799 uint32_t acc_flags, uint32_t xcc_id) 800 { 801 uint32_t rlcg_flag; 802 803 if (amdgpu_device_skip_hw_access(adev)) 804 return; 805 806 if ((reg * 4) < adev->rmmio_size) { 807 if (amdgpu_sriov_vf(adev) && 808 !amdgpu_sriov_runtime(adev) && 809 adev->gfx.rlc.rlcg_reg_access_supported && 810 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 811 GC_HWIP, true, 812 &rlcg_flag)) { 813 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 814 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 815 amdgpu_sriov_runtime(adev) && 816 down_read_trylock(&adev->reset_domain->sem)) { 817 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 818 up_read(&adev->reset_domain->sem); 819 } else { 820 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 821 } 822 } else { 823 adev->pcie_wreg(adev, reg * 4, v); 824 } 825 } 826 827 /** 828 * amdgpu_device_indirect_rreg - read an indirect register 829 * 830 * @adev: amdgpu_device pointer 831 * @reg_addr: indirect register address to read from 832 * 833 * Returns the value of indirect register @reg_addr 834 */ 835 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 836 u32 reg_addr) 837 { 838 unsigned long flags, pcie_index, pcie_data; 839 void __iomem *pcie_index_offset; 840 void __iomem *pcie_data_offset; 841 u32 r; 842 843 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 844 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 845 846 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 847 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 848 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 849 850 writel(reg_addr, pcie_index_offset); 851 readl(pcie_index_offset); 852 r = readl(pcie_data_offset); 853 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 854 855 return r; 856 } 857 858 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 859 u64 reg_addr) 860 { 861 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 862 u32 r; 863 void __iomem *pcie_index_offset; 864 void __iomem *pcie_index_hi_offset; 865 void __iomem *pcie_data_offset; 866 867 if (unlikely(!adev->nbio.funcs)) { 868 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 869 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 870 } else { 871 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 872 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 873 } 874 875 if (reg_addr >> 32) { 876 if (unlikely(!adev->nbio.funcs)) 877 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 878 else 879 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 880 } else { 881 pcie_index_hi = 0; 882 } 883 884 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 885 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 886 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 887 if (pcie_index_hi != 0) 888 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 889 pcie_index_hi * 4; 890 891 writel(reg_addr, pcie_index_offset); 892 readl(pcie_index_offset); 893 if (pcie_index_hi != 0) { 894 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 895 readl(pcie_index_hi_offset); 896 } 897 r = readl(pcie_data_offset); 898 899 /* clear the high bits */ 900 if (pcie_index_hi != 0) { 901 writel(0, pcie_index_hi_offset); 902 readl(pcie_index_hi_offset); 903 } 904 905 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 906 907 return r; 908 } 909 910 /** 911 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 912 * 913 * @adev: 
amdgpu_device pointer 914 * @reg_addr: indirect register address to read from 915 * 916 * Returns the value of indirect register @reg_addr 917 */ 918 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 919 u32 reg_addr) 920 { 921 unsigned long flags, pcie_index, pcie_data; 922 void __iomem *pcie_index_offset; 923 void __iomem *pcie_data_offset; 924 u64 r; 925 926 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 927 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 928 929 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 930 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 931 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 932 933 /* read low 32 bits */ 934 writel(reg_addr, pcie_index_offset); 935 readl(pcie_index_offset); 936 r = readl(pcie_data_offset); 937 /* read high 32 bits */ 938 writel(reg_addr + 4, pcie_index_offset); 939 readl(pcie_index_offset); 940 r |= ((u64)readl(pcie_data_offset) << 32); 941 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 942 943 return r; 944 } 945 946 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 947 u64 reg_addr) 948 { 949 unsigned long flags, pcie_index, pcie_data; 950 unsigned long pcie_index_hi = 0; 951 void __iomem *pcie_index_offset; 952 void __iomem *pcie_index_hi_offset; 953 void __iomem *pcie_data_offset; 954 u64 r; 955 956 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 957 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 958 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 959 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 960 961 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 962 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 963 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 964 if (pcie_index_hi != 0) 965 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 966 pcie_index_hi * 4; 967 968 /* read low 32 bits */ 969 writel(reg_addr, pcie_index_offset); 970 readl(pcie_index_offset); 971 if (pcie_index_hi != 0) { 972 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 973 readl(pcie_index_hi_offset); 974 } 975 r = readl(pcie_data_offset); 976 /* read high 32 bits */ 977 writel(reg_addr + 4, pcie_index_offset); 978 readl(pcie_index_offset); 979 if (pcie_index_hi != 0) { 980 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 981 readl(pcie_index_hi_offset); 982 } 983 r |= ((u64)readl(pcie_data_offset) << 32); 984 985 /* clear the high bits */ 986 if (pcie_index_hi != 0) { 987 writel(0, pcie_index_hi_offset); 988 readl(pcie_index_hi_offset); 989 } 990 991 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 992 993 return r; 994 } 995 996 /** 997 * amdgpu_device_indirect_wreg - write an indirect register address 998 * 999 * @adev: amdgpu_device pointer 1000 * @reg_addr: indirect register offset 1001 * @reg_data: indirect register data 1002 * 1003 */ 1004 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 1005 u32 reg_addr, u32 reg_data) 1006 { 1007 unsigned long flags, pcie_index, pcie_data; 1008 void __iomem *pcie_index_offset; 1009 void __iomem *pcie_data_offset; 1010 1011 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1012 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1013 1014 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1015 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1016 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1017 1018 writel(reg_addr, pcie_index_offset); 1019 
readl(pcie_index_offset); 1020 writel(reg_data, pcie_data_offset); 1021 readl(pcie_data_offset); 1022 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1023 } 1024 1025 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 1026 u64 reg_addr, u32 reg_data) 1027 { 1028 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1029 void __iomem *pcie_index_offset; 1030 void __iomem *pcie_index_hi_offset; 1031 void __iomem *pcie_data_offset; 1032 1033 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1034 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1035 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1036 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1037 else 1038 pcie_index_hi = 0; 1039 1040 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1041 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1042 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1043 if (pcie_index_hi != 0) 1044 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1045 pcie_index_hi * 4; 1046 1047 writel(reg_addr, pcie_index_offset); 1048 readl(pcie_index_offset); 1049 if (pcie_index_hi != 0) { 1050 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1051 readl(pcie_index_hi_offset); 1052 } 1053 writel(reg_data, pcie_data_offset); 1054 readl(pcie_data_offset); 1055 1056 /* clear the high bits */ 1057 if (pcie_index_hi != 0) { 1058 writel(0, pcie_index_hi_offset); 1059 readl(pcie_index_hi_offset); 1060 } 1061 1062 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1063 } 1064 1065 /** 1066 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 1067 * 1068 * @adev: amdgpu_device pointer 1069 * @reg_addr: indirect register offset 1070 * @reg_data: indirect register data 1071 * 1072 */ 1073 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 1074 u32 reg_addr, u64 reg_data) 1075 { 1076 unsigned long flags, pcie_index, pcie_data; 1077 void __iomem *pcie_index_offset; 1078 void __iomem *pcie_data_offset; 1079 1080 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1081 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1082 1083 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1084 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1085 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1086 1087 /* write low 32 bits */ 1088 writel(reg_addr, pcie_index_offset); 1089 readl(pcie_index_offset); 1090 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1091 readl(pcie_data_offset); 1092 /* write high 32 bits */ 1093 writel(reg_addr + 4, pcie_index_offset); 1094 readl(pcie_index_offset); 1095 writel((u32)(reg_data >> 32), pcie_data_offset); 1096 readl(pcie_data_offset); 1097 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1098 } 1099 1100 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1101 u64 reg_addr, u64 reg_data) 1102 { 1103 unsigned long flags, pcie_index, pcie_data; 1104 unsigned long pcie_index_hi = 0; 1105 void __iomem *pcie_index_offset; 1106 void __iomem *pcie_index_hi_offset; 1107 void __iomem *pcie_data_offset; 1108 1109 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1110 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1111 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1112 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1113 1114 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1115 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index 
* 4; 1116 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1117 if (pcie_index_hi != 0) 1118 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1119 pcie_index_hi * 4; 1120 1121 /* write low 32 bits */ 1122 writel(reg_addr, pcie_index_offset); 1123 readl(pcie_index_offset); 1124 if (pcie_index_hi != 0) { 1125 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1126 readl(pcie_index_hi_offset); 1127 } 1128 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1129 readl(pcie_data_offset); 1130 /* write high 32 bits */ 1131 writel(reg_addr + 4, pcie_index_offset); 1132 readl(pcie_index_offset); 1133 if (pcie_index_hi != 0) { 1134 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1135 readl(pcie_index_hi_offset); 1136 } 1137 writel((u32)(reg_data >> 32), pcie_data_offset); 1138 readl(pcie_data_offset); 1139 1140 /* clear the high bits */ 1141 if (pcie_index_hi != 0) { 1142 writel(0, pcie_index_hi_offset); 1143 readl(pcie_index_hi_offset); 1144 } 1145 1146 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1147 } 1148 1149 /** 1150 * amdgpu_device_get_rev_id - query device rev_id 1151 * 1152 * @adev: amdgpu_device pointer 1153 * 1154 * Return device rev_id 1155 */ 1156 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1157 { 1158 return adev->nbio.funcs->get_rev_id(adev); 1159 } 1160 1161 /** 1162 * amdgpu_invalid_rreg - dummy reg read function 1163 * 1164 * @adev: amdgpu_device pointer 1165 * @reg: offset of register 1166 * 1167 * Dummy register read function. Used for register blocks 1168 * that certain asics don't have (all asics). 1169 * Returns the value in the register. 1170 */ 1171 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1172 { 1173 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 1174 BUG(); 1175 return 0; 1176 } 1177 1178 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1179 { 1180 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1181 BUG(); 1182 return 0; 1183 } 1184 1185 /** 1186 * amdgpu_invalid_wreg - dummy reg write function 1187 * 1188 * @adev: amdgpu_device pointer 1189 * @reg: offset of register 1190 * @v: value to write to the register 1191 * 1192 * Dummy register read function. Used for register blocks 1193 * that certain asics don't have (all asics). 1194 */ 1195 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1196 { 1197 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 1198 reg, v); 1199 BUG(); 1200 } 1201 1202 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1203 { 1204 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 1205 reg, v); 1206 BUG(); 1207 } 1208 1209 /** 1210 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1211 * 1212 * @adev: amdgpu_device pointer 1213 * @reg: offset of register 1214 * 1215 * Dummy register read function. Used for register blocks 1216 * that certain asics don't have (all asics). 1217 * Returns the value in the register. 
1218 */ 1219 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 1220 { 1221 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 1222 BUG(); 1223 return 0; 1224 } 1225 1226 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) 1227 { 1228 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1229 BUG(); 1230 return 0; 1231 } 1232 1233 /** 1234 * amdgpu_invalid_wreg64 - dummy reg write function 1235 * 1236 * @adev: amdgpu_device pointer 1237 * @reg: offset of register 1238 * @v: value to write to the register 1239 * 1240 * Dummy register read function. Used for register blocks 1241 * that certain asics don't have (all asics). 1242 */ 1243 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 1244 { 1245 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 1246 reg, v); 1247 BUG(); 1248 } 1249 1250 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) 1251 { 1252 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", 1253 reg, v); 1254 BUG(); 1255 } 1256 1257 /** 1258 * amdgpu_block_invalid_rreg - dummy reg read function 1259 * 1260 * @adev: amdgpu_device pointer 1261 * @block: offset of instance 1262 * @reg: offset of register 1263 * 1264 * Dummy register read function. Used for register blocks 1265 * that certain asics don't have (all asics). 1266 * Returns the value in the register. 1267 */ 1268 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1269 uint32_t block, uint32_t reg) 1270 { 1271 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 1272 reg, block); 1273 BUG(); 1274 return 0; 1275 } 1276 1277 /** 1278 * amdgpu_block_invalid_wreg - dummy reg write function 1279 * 1280 * @adev: amdgpu_device pointer 1281 * @block: offset of instance 1282 * @reg: offset of register 1283 * @v: value to write to the register 1284 * 1285 * Dummy register read function. Used for register blocks 1286 * that certain asics don't have (all asics). 1287 */ 1288 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1289 uint32_t block, 1290 uint32_t reg, uint32_t v) 1291 { 1292 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1293 reg, block, v); 1294 BUG(); 1295 } 1296 1297 /** 1298 * amdgpu_device_asic_init - Wrapper for atom asic_init 1299 * 1300 * @adev: amdgpu_device pointer 1301 * 1302 * Does any asic specific work and then calls atom asic init. 1303 */ 1304 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1305 { 1306 int ret; 1307 1308 amdgpu_asic_pre_asic_init(adev); 1309 1310 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1311 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 1312 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1313 amdgpu_psp_wait_for_bootloader(adev); 1314 ret = amdgpu_atomfirmware_asic_init(adev, true); 1315 return ret; 1316 } else { 1317 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1318 } 1319 1320 return 0; 1321 } 1322 1323 /** 1324 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1325 * 1326 * @adev: amdgpu_device pointer 1327 * 1328 * Allocates a scratch page of VRAM for use by various things in the 1329 * driver. 
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND/OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}
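
/*
 * Illustrative sketch (not part of the upstream file): the register array
 * consumed by amdgpu_device_program_register_sequence() is a flat list of
 * (register, AND mask, OR mask) triplets. The register names below are
 * hypothetical placeholders, not real defines:
 *
 *   static const u32 example_golden_settings[] = {
 *           mmEXAMPLE_REG_A, 0x0000ffff, 0x00001234,
 *           mmEXAMPLE_REG_B, 0xffffffff, 0x00000001,
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *                                           ARRAY_SIZE(example_golden_settings));
 */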

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&adev->wb.lock, flags);
	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long flags;

	wb >>= 3;
	spin_lock_irqsave(&adev->wb.lock, flags);
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
	spin_unlock_irqrestore(&adev->wb.lock, flags);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
1528 */ 1529 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1530 { 1531 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1532 struct pci_bus *root; 1533 struct resource *res; 1534 unsigned int i; 1535 u16 cmd; 1536 int r; 1537 1538 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1539 return 0; 1540 1541 /* Bypass for VF */ 1542 if (amdgpu_sriov_vf(adev)) 1543 return 0; 1544 1545 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1546 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1547 DRM_WARN("System can't access extended configuration space, please check!!\n"); 1548 1549 /* skip if the bios has already enabled large BAR */ 1550 if (adev->gmc.real_vram_size && 1551 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1552 return 0; 1553 1554 /* Check if the root BUS has 64bit memory resources */ 1555 root = adev->pdev->bus; 1556 while (root->parent) 1557 root = root->parent; 1558 1559 pci_bus_for_each_resource(root, res, i) { 1560 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1561 res->start > 0x100000000ull) 1562 break; 1563 } 1564 1565 /* Trying to resize is pointless without a root hub window above 4GB */ 1566 if (!res) 1567 return 0; 1568 1569 /* Limit the BAR size to what is available */ 1570 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1571 rbar_size); 1572 1573 /* Disable memory decoding while we change the BAR addresses and size */ 1574 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1575 pci_write_config_word(adev->pdev, PCI_COMMAND, 1576 cmd & ~PCI_COMMAND_MEMORY); 1577 1578 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1579 amdgpu_doorbell_fini(adev); 1580 if (adev->asic_type >= CHIP_BONAIRE) 1581 pci_release_resource(adev->pdev, 2); 1582 1583 pci_release_resource(adev->pdev, 0); 1584 1585 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1586 if (r == -ENOSPC) 1587 DRM_INFO("Not enough PCI address space for a large BAR."); 1588 else if (r && r != -ENOTSUPP) 1589 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1590 1591 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1592 1593 /* When the doorbell or fb BAR isn't available we have no chance of 1594 * using the device. 1595 */ 1596 r = amdgpu_doorbell_init(adev); 1597 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1598 return -ENODEV; 1599 1600 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1601 1602 return 0; 1603 } 1604 1605 static bool amdgpu_device_read_bios(struct amdgpu_device *adev) 1606 { 1607 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1608 return false; 1609 1610 return true; 1611 } 1612 1613 /* 1614 * GPU helpers function. 1615 */ 1616 /** 1617 * amdgpu_device_need_post - check if the hw need post or not 1618 * 1619 * @adev: amdgpu_device pointer 1620 * 1621 * Check if the asic has been initialized (all asics) at driver startup 1622 * or post is needed if hw reset is performed. 1623 * Returns true if need or false if not. 
1624 */ 1625 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1626 { 1627 uint32_t reg; 1628 1629 if (amdgpu_sriov_vf(adev)) 1630 return false; 1631 1632 if (!amdgpu_device_read_bios(adev)) 1633 return false; 1634 1635 if (amdgpu_passthrough(adev)) { 1636 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1637 * some old smc fw still need driver do vPost otherwise gpu hang, while 1638 * those smc fw version above 22.15 doesn't have this flaw, so we force 1639 * vpost executed for smc version below 22.15 1640 */ 1641 if (adev->asic_type == CHIP_FIJI) { 1642 int err; 1643 uint32_t fw_ver; 1644 1645 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1646 /* force vPost if error occured */ 1647 if (err) 1648 return true; 1649 1650 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1651 release_firmware(adev->pm.fw); 1652 if (fw_ver < 0x00160e00) 1653 return true; 1654 } 1655 } 1656 1657 /* Don't post if we need to reset whole hive on init */ 1658 if (adev->gmc.xgmi.pending_reset) 1659 return false; 1660 1661 if (adev->has_hw_reset) { 1662 adev->has_hw_reset = false; 1663 return true; 1664 } 1665 1666 /* bios scratch used on CIK+ */ 1667 if (adev->asic_type >= CHIP_BONAIRE) 1668 return amdgpu_atombios_scratch_need_asic_init(adev); 1669 1670 /* check MEM_SIZE for older asics */ 1671 reg = amdgpu_asic_get_config_memsize(adev); 1672 1673 if ((reg != 0) && (reg != 0xffffffff)) 1674 return false; 1675 1676 return true; 1677 } 1678 1679 /* 1680 * Check whether seamless boot is supported. 1681 * 1682 * So far we only support seamless boot on DCE 3.0 or later. 1683 * If users report that it works on older ASICS as well, we may 1684 * loosen this. 1685 */ 1686 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1687 { 1688 switch (amdgpu_seamless) { 1689 case -1: 1690 break; 1691 case 1: 1692 return true; 1693 case 0: 1694 return false; 1695 default: 1696 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n", 1697 amdgpu_seamless); 1698 return false; 1699 } 1700 1701 if (!(adev->flags & AMD_IS_APU)) 1702 return false; 1703 1704 if (adev->mman.keep_stolen_vga_memory) 1705 return false; 1706 1707 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1708 } 1709 1710 /* 1711 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1712 * don't support dynamic speed switching. Until we have confirmation from Intel 1713 * that a specific host supports it, it's safer that we keep it disabled for all. 1714 * 1715 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1716 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1717 */ 1718 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1719 { 1720 #if IS_ENABLED(CONFIG_X86) 1721 struct cpuinfo_x86 *c = &cpu_data(0); 1722 1723 /* eGPU change speeds based on USB4 fabric conditions */ 1724 if (dev_is_removable(adev->dev)) 1725 return true; 1726 1727 if (c->x86_vendor == X86_VENDOR_INTEL) 1728 return false; 1729 #endif 1730 return true; 1731 } 1732 1733 /** 1734 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1735 * 1736 * @adev: amdgpu_device pointer 1737 * 1738 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1739 * be set for this device. 1740 * 1741 * Returns true if it should be used or false if not. 
1742 */ 1743 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1744 { 1745 switch (amdgpu_aspm) { 1746 case -1: 1747 break; 1748 case 0: 1749 return false; 1750 case 1: 1751 return true; 1752 default: 1753 return false; 1754 } 1755 if (adev->flags & AMD_IS_APU) 1756 return false; 1757 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) 1758 return false; 1759 return pcie_aspm_enabled(adev->pdev); 1760 } 1761 1762 /* if we get transitioned to only one device, take VGA back */ 1763 /** 1764 * amdgpu_device_vga_set_decode - enable/disable vga decode 1765 * 1766 * @pdev: PCI device pointer 1767 * @state: enable/disable vga decode 1768 * 1769 * Enable/disable vga decode (all asics). 1770 * Returns VGA resource flags. 1771 */ 1772 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1773 bool state) 1774 { 1775 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1776 1777 amdgpu_asic_set_vga_state(adev, state); 1778 if (state) 1779 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1780 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1781 else 1782 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1783 } 1784 1785 /** 1786 * amdgpu_device_check_block_size - validate the vm block size 1787 * 1788 * @adev: amdgpu_device pointer 1789 * 1790 * Validates the vm block size specified via module parameter. 1791 * The vm block size defines number of bits in page table versus page directory, 1792 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1793 * page table and the remaining bits are in the page directory. 1794 */ 1795 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1796 { 1797 /* defines number of bits in page table versus page directory, 1798 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1799 * page table and the remaining bits are in the page directory 1800 */ 1801 if (amdgpu_vm_block_size == -1) 1802 return; 1803 1804 if (amdgpu_vm_block_size < 9) { 1805 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1806 amdgpu_vm_block_size); 1807 amdgpu_vm_block_size = -1; 1808 } 1809 } 1810 1811 /** 1812 * amdgpu_device_check_vm_size - validate the vm size 1813 * 1814 * @adev: amdgpu_device pointer 1815 * 1816 * Validates the vm size in GB specified via module parameter. 1817 * The VM size is the size of the GPU virtual memory space in GB. 
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}
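
/*
 * Illustrative note (not part of the upstream file): amdgpu_smu_memory_pool_size
 * is expressed in units of 256MB (the << 28 above), so for example a module
 * parameter value of 2 on a system with at least ~3GB of RAM yields
 *
 *   adev->pm.smu_prv_buffer_size = 2 << 28 = 512MB
 *
 * while unsupported values or too little system memory leave the pool size at 0.
 */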
1916 */ 1917 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1918 { 1919 int i; 1920 1921 if (amdgpu_sched_jobs < 4) { 1922 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1923 amdgpu_sched_jobs); 1924 amdgpu_sched_jobs = 4; 1925 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1926 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1927 amdgpu_sched_jobs); 1928 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1929 } 1930 1931 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1932 /* gart size must be greater or equal to 32M */ 1933 dev_warn(adev->dev, "gart size (%d) too small\n", 1934 amdgpu_gart_size); 1935 amdgpu_gart_size = -1; 1936 } 1937 1938 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1939 /* gtt size must be greater or equal to 32M */ 1940 dev_warn(adev->dev, "gtt size (%d) too small\n", 1941 amdgpu_gtt_size); 1942 amdgpu_gtt_size = -1; 1943 } 1944 1945 /* valid range is between 4 and 9 inclusive */ 1946 if (amdgpu_vm_fragment_size != -1 && 1947 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1948 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1949 amdgpu_vm_fragment_size = -1; 1950 } 1951 1952 if (amdgpu_sched_hw_submission < 2) { 1953 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1954 amdgpu_sched_hw_submission); 1955 amdgpu_sched_hw_submission = 2; 1956 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1957 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1958 amdgpu_sched_hw_submission); 1959 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1960 } 1961 1962 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1963 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1964 amdgpu_reset_method = -1; 1965 } 1966 1967 amdgpu_device_check_smu_prv_buffer_size(adev); 1968 1969 amdgpu_device_check_vm_size(adev); 1970 1971 amdgpu_device_check_block_size(adev); 1972 1973 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1974 1975 for (i = 0; i < MAX_XCP; i++) 1976 adev->enforce_isolation[i] = !!enforce_isolation; 1977 1978 return 0; 1979 } 1980 1981 /** 1982 * amdgpu_switcheroo_set_state - set switcheroo state 1983 * 1984 * @pdev: pci dev pointer 1985 * @state: vga_switcheroo state 1986 * 1987 * Callback for the switcheroo driver. Suspends or resumes 1988 * the asics before or after it is powered up using ACPI methods. 
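 *
 * This callback is not called directly; it is reached through the
 * amdgpu_switcheroo_ops table below, which the driver registers with
 * vga_switcheroo elsewhere (sketch of the registration done during device
 * init; px indicates PX/ATPX power control support):
 *
 *   vga_switcheroo_register_client(adev->pdev, &amdgpu_switcheroo_ops, px);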
1989 */ 1990 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1991 enum vga_switcheroo_state state) 1992 { 1993 struct drm_device *dev = pci_get_drvdata(pdev); 1994 int r; 1995 1996 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1997 return; 1998 1999 if (state == VGA_SWITCHEROO_ON) { 2000 pr_info("switched on\n"); 2001 /* don't suspend or resume card normally */ 2002 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2003 2004 pci_set_power_state(pdev, PCI_D0); 2005 amdgpu_device_load_pci_state(pdev); 2006 r = pci_enable_device(pdev); 2007 if (r) 2008 DRM_WARN("pci_enable_device failed (%d)\n", r); 2009 amdgpu_device_resume(dev, true); 2010 2011 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2012 } else { 2013 pr_info("switched off\n"); 2014 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2015 amdgpu_device_prepare(dev); 2016 amdgpu_device_suspend(dev, true); 2017 amdgpu_device_cache_pci_state(pdev); 2018 /* Shut down the device */ 2019 pci_disable_device(pdev); 2020 pci_set_power_state(pdev, PCI_D3cold); 2021 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2022 } 2023 } 2024 2025 /** 2026 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2027 * 2028 * @pdev: pci dev pointer 2029 * 2030 * Callback for the switcheroo driver. Check of the switcheroo 2031 * state can be changed. 2032 * Returns true if the state can be changed, false if not. 2033 */ 2034 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2035 { 2036 struct drm_device *dev = pci_get_drvdata(pdev); 2037 2038 /* 2039 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2040 * locking inversion with the driver load path. And the access here is 2041 * completely racy anyway. So don't bother with locking for now. 2042 */ 2043 return atomic_read(&dev->open_count) == 0; 2044 } 2045 2046 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2047 .set_gpu_state = amdgpu_switcheroo_set_state, 2048 .reprobe = NULL, 2049 .can_switch = amdgpu_switcheroo_can_switch, 2050 }; 2051 2052 /** 2053 * amdgpu_device_ip_set_clockgating_state - set the CG state 2054 * 2055 * @dev: amdgpu_device pointer 2056 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2057 * @state: clockgating state (gate or ungate) 2058 * 2059 * Sets the requested clockgating state for all instances of 2060 * the hardware IP specified. 2061 * Returns the error code from the last instance. 2062 */ 2063 int amdgpu_device_ip_set_clockgating_state(void *dev, 2064 enum amd_ip_block_type block_type, 2065 enum amd_clockgating_state state) 2066 { 2067 struct amdgpu_device *adev = dev; 2068 int i, r = 0; 2069 2070 for (i = 0; i < adev->num_ip_blocks; i++) { 2071 if (!adev->ip_blocks[i].status.valid) 2072 continue; 2073 if (adev->ip_blocks[i].version->type != block_type) 2074 continue; 2075 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2076 continue; 2077 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2078 (void *)adev, state); 2079 if (r) 2080 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2081 adev->ip_blocks[i].version->funcs->name, r); 2082 } 2083 return r; 2084 } 2085 2086 /** 2087 * amdgpu_device_ip_set_powergating_state - set the PG state 2088 * 2089 * @dev: amdgpu_device pointer 2090 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2091 * @state: powergating state (gate or ungate) 2092 * 2093 * Sets the requested powergating state for all instances of 2094 * the hardware IP specified. 
2095 * Returns the error code from the last instance. 2096 */ 2097 int amdgpu_device_ip_set_powergating_state(void *dev, 2098 enum amd_ip_block_type block_type, 2099 enum amd_powergating_state state) 2100 { 2101 struct amdgpu_device *adev = dev; 2102 int i, r = 0; 2103 2104 for (i = 0; i < adev->num_ip_blocks; i++) { 2105 if (!adev->ip_blocks[i].status.valid) 2106 continue; 2107 if (adev->ip_blocks[i].version->type != block_type) 2108 continue; 2109 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2110 continue; 2111 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2112 (void *)adev, state); 2113 if (r) 2114 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2115 adev->ip_blocks[i].version->funcs->name, r); 2116 } 2117 return r; 2118 } 2119 2120 /** 2121 * amdgpu_device_ip_get_clockgating_state - get the CG state 2122 * 2123 * @adev: amdgpu_device pointer 2124 * @flags: clockgating feature flags 2125 * 2126 * Walks the list of IPs on the device and updates the clockgating 2127 * flags for each IP. 2128 * Updates @flags with the feature flags for each hardware IP where 2129 * clockgating is enabled. 2130 */ 2131 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2132 u64 *flags) 2133 { 2134 int i; 2135 2136 for (i = 0; i < adev->num_ip_blocks; i++) { 2137 if (!adev->ip_blocks[i].status.valid) 2138 continue; 2139 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2140 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 2141 } 2142 } 2143 2144 /** 2145 * amdgpu_device_ip_wait_for_idle - wait for idle 2146 * 2147 * @adev: amdgpu_device pointer 2148 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2149 * 2150 * Waits for the request hardware IP to be idle. 2151 * Returns 0 for success or a negative error code on failure. 2152 */ 2153 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2154 enum amd_ip_block_type block_type) 2155 { 2156 int i, r; 2157 2158 for (i = 0; i < adev->num_ip_blocks; i++) { 2159 if (!adev->ip_blocks[i].status.valid) 2160 continue; 2161 if (adev->ip_blocks[i].version->type == block_type) { 2162 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 2163 if (r) 2164 return r; 2165 break; 2166 } 2167 } 2168 return 0; 2169 2170 } 2171 2172 /** 2173 * amdgpu_device_ip_is_idle - is the hardware IP idle 2174 * 2175 * @adev: amdgpu_device pointer 2176 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2177 * 2178 * Check if the hardware IP is idle or not. 2179 * Returns true if it the IP is idle, false if not. 2180 */ 2181 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 2182 enum amd_ip_block_type block_type) 2183 { 2184 int i; 2185 2186 for (i = 0; i < adev->num_ip_blocks; i++) { 2187 if (!adev->ip_blocks[i].status.valid) 2188 continue; 2189 if (adev->ip_blocks[i].version->type == block_type) 2190 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 2191 } 2192 return true; 2193 2194 } 2195 2196 /** 2197 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2198 * 2199 * @adev: amdgpu_device pointer 2200 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2201 * 2202 * Returns a pointer to the hardware IP block structure 2203 * if it exists for the asic, otherwise NULL. 
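 *
 * Example lookup (illustrative sketch; handle_gfx10_plus() is hypothetical):
 *
 *   struct amdgpu_ip_block *ip =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 *   if (ip && ip->version->major >= 10)
 *           handle_gfx10_plus(adev);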
2204 */ 2205 struct amdgpu_ip_block * 2206 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2207 enum amd_ip_block_type type) 2208 { 2209 int i; 2210 2211 for (i = 0; i < adev->num_ip_blocks; i++) 2212 if (adev->ip_blocks[i].version->type == type) 2213 return &adev->ip_blocks[i]; 2214 2215 return NULL; 2216 } 2217 2218 /** 2219 * amdgpu_device_ip_block_version_cmp 2220 * 2221 * @adev: amdgpu_device pointer 2222 * @type: enum amd_ip_block_type 2223 * @major: major version 2224 * @minor: minor version 2225 * 2226 * return 0 if equal or greater 2227 * return 1 if smaller or the ip_block doesn't exist 2228 */ 2229 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2230 enum amd_ip_block_type type, 2231 u32 major, u32 minor) 2232 { 2233 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2234 2235 if (ip_block && ((ip_block->version->major > major) || 2236 ((ip_block->version->major == major) && 2237 (ip_block->version->minor >= minor)))) 2238 return 0; 2239 2240 return 1; 2241 } 2242 2243 /** 2244 * amdgpu_device_ip_block_add 2245 * 2246 * @adev: amdgpu_device pointer 2247 * @ip_block_version: pointer to the IP to add 2248 * 2249 * Adds the IP block driver information to the collection of IPs 2250 * on the asic. 2251 */ 2252 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2253 const struct amdgpu_ip_block_version *ip_block_version) 2254 { 2255 if (!ip_block_version) 2256 return -EINVAL; 2257 2258 switch (ip_block_version->type) { 2259 case AMD_IP_BLOCK_TYPE_VCN: 2260 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2261 return 0; 2262 break; 2263 case AMD_IP_BLOCK_TYPE_JPEG: 2264 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2265 return 0; 2266 break; 2267 default: 2268 break; 2269 } 2270 2271 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 2272 ip_block_version->funcs->name); 2273 2274 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2275 2276 return 0; 2277 } 2278 2279 /** 2280 * amdgpu_device_enable_virtual_display - enable virtual display feature 2281 * 2282 * @adev: amdgpu_device pointer 2283 * 2284 * Enabled the virtual display feature if the user has enabled it via 2285 * the module parameter virtual_display. This feature provides a virtual 2286 * display hardware on headless boards or in virtualized environments. 2287 * This function parses and validates the configuration string specified by 2288 * the user and configues the virtual display configuration (number of 2289 * virtual connectors, crtcs, etc.) specified. 
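 *
 * Example configuration strings accepted by the parser below (illustrative):
 *
 *   amdgpu.virtual_display=0000:01:00.0,2   (2 virtual crtcs on that device)
 *   amdgpu.virtual_display=all,1            (1 virtual crtc on every device)
 *
 * Entries are separated by ';' and the crtc count is clamped to 1..6.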
2290 */ 2291 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2292 { 2293 adev->enable_virtual_display = false; 2294 2295 if (amdgpu_virtual_display) { 2296 const char *pci_address_name = pci_name(adev->pdev); 2297 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2298 2299 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2300 pciaddstr_tmp = pciaddstr; 2301 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2302 pciaddname = strsep(&pciaddname_tmp, ","); 2303 if (!strcmp("all", pciaddname) 2304 || !strcmp(pci_address_name, pciaddname)) { 2305 long num_crtc; 2306 int res = -1; 2307 2308 adev->enable_virtual_display = true; 2309 2310 if (pciaddname_tmp) 2311 res = kstrtol(pciaddname_tmp, 10, 2312 &num_crtc); 2313 2314 if (!res) { 2315 if (num_crtc < 1) 2316 num_crtc = 1; 2317 if (num_crtc > 6) 2318 num_crtc = 6; 2319 adev->mode_info.num_crtc = num_crtc; 2320 } else { 2321 adev->mode_info.num_crtc = 1; 2322 } 2323 break; 2324 } 2325 } 2326 2327 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2328 amdgpu_virtual_display, pci_address_name, 2329 adev->enable_virtual_display, adev->mode_info.num_crtc); 2330 2331 kfree(pciaddstr); 2332 } 2333 } 2334 2335 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2336 { 2337 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2338 adev->mode_info.num_crtc = 1; 2339 adev->enable_virtual_display = true; 2340 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2341 adev->enable_virtual_display, adev->mode_info.num_crtc); 2342 } 2343 } 2344 2345 /** 2346 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2347 * 2348 * @adev: amdgpu_device pointer 2349 * 2350 * Parses the asic configuration parameters specified in the gpu info 2351 * firmware and makes them availale to the driver for use in configuring 2352 * the asic. 2353 * Returns 0 on success, -EINVAL on failure. 2354 */ 2355 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2356 { 2357 const char *chip_name; 2358 int err; 2359 const struct gpu_info_firmware_header_v1_0 *hdr; 2360 2361 adev->firmware.gpu_info_fw = NULL; 2362 2363 if (adev->mman.discovery_bin) 2364 return 0; 2365 2366 switch (adev->asic_type) { 2367 default: 2368 return 0; 2369 case CHIP_VEGA10: 2370 chip_name = "vega10"; 2371 break; 2372 case CHIP_VEGA12: 2373 chip_name = "vega12"; 2374 break; 2375 case CHIP_RAVEN: 2376 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2377 chip_name = "raven2"; 2378 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2379 chip_name = "picasso"; 2380 else 2381 chip_name = "raven"; 2382 break; 2383 case CHIP_ARCTURUS: 2384 chip_name = "arcturus"; 2385 break; 2386 case CHIP_NAVI12: 2387 chip_name = "navi12"; 2388 break; 2389 } 2390 2391 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2392 "amdgpu/%s_gpu_info.bin", chip_name); 2393 if (err) { 2394 dev_err(adev->dev, 2395 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2396 chip_name); 2397 goto out; 2398 } 2399 2400 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2401 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2402 2403 switch (hdr->version_major) { 2404 case 1: 2405 { 2406 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2407 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2408 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2409 2410 /* 2411 * Should be droped when DAL no longer needs it. 
2412 */ 2413 if (adev->asic_type == CHIP_NAVI12) 2414 goto parse_soc_bounding_box; 2415 2416 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2417 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2418 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2419 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2420 adev->gfx.config.max_texture_channel_caches = 2421 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2422 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2423 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2424 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2425 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2426 adev->gfx.config.double_offchip_lds_buf = 2427 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2428 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2429 adev->gfx.cu_info.max_waves_per_simd = 2430 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2431 adev->gfx.cu_info.max_scratch_slots_per_cu = 2432 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2433 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2434 if (hdr->version_minor >= 1) { 2435 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2436 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2437 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2438 adev->gfx.config.num_sc_per_sh = 2439 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2440 adev->gfx.config.num_packer_per_sc = 2441 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2442 } 2443 2444 parse_soc_bounding_box: 2445 /* 2446 * soc bounding box info is not integrated in disocovery table, 2447 * we always need to parse it from gpu info firmware if needed. 2448 */ 2449 if (hdr->version_minor == 2) { 2450 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2451 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2452 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2453 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2454 } 2455 break; 2456 } 2457 default: 2458 dev_err(adev->dev, 2459 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2460 err = -EINVAL; 2461 goto out; 2462 } 2463 out: 2464 return err; 2465 } 2466 2467 /** 2468 * amdgpu_device_ip_early_init - run early init for hardware IPs 2469 * 2470 * @adev: amdgpu_device pointer 2471 * 2472 * Early initialization pass for hardware IPs. The hardware IPs that make 2473 * up each asic are discovered each IP's early_init callback is run. This 2474 * is the first stage in initializing the asic. 2475 * Returns 0 on success, negative error code on failure. 
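 *
 * This pass also honours the ip_block_mask debug parameter; for example
 * (illustrative only):
 *
 *   amdgpu.ip_block_mask=0xffffffdf   (bit 5 cleared, so the sixth IP block
 *                                      is marked invalid and skipped later)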
2476 */ 2477 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2478 { 2479 struct amdgpu_ip_block *ip_block; 2480 struct pci_dev *parent; 2481 int i, r; 2482 bool total; 2483 2484 amdgpu_device_enable_virtual_display(adev); 2485 2486 if (amdgpu_sriov_vf(adev)) { 2487 r = amdgpu_virt_request_full_gpu(adev, true); 2488 if (r) 2489 return r; 2490 } 2491 2492 switch (adev->asic_type) { 2493 #ifdef CONFIG_DRM_AMDGPU_SI 2494 case CHIP_VERDE: 2495 case CHIP_TAHITI: 2496 case CHIP_PITCAIRN: 2497 case CHIP_OLAND: 2498 case CHIP_HAINAN: 2499 adev->family = AMDGPU_FAMILY_SI; 2500 r = si_set_ip_blocks(adev); 2501 if (r) 2502 return r; 2503 break; 2504 #endif 2505 #ifdef CONFIG_DRM_AMDGPU_CIK 2506 case CHIP_BONAIRE: 2507 case CHIP_HAWAII: 2508 case CHIP_KAVERI: 2509 case CHIP_KABINI: 2510 case CHIP_MULLINS: 2511 if (adev->flags & AMD_IS_APU) 2512 adev->family = AMDGPU_FAMILY_KV; 2513 else 2514 adev->family = AMDGPU_FAMILY_CI; 2515 2516 r = cik_set_ip_blocks(adev); 2517 if (r) 2518 return r; 2519 break; 2520 #endif 2521 case CHIP_TOPAZ: 2522 case CHIP_TONGA: 2523 case CHIP_FIJI: 2524 case CHIP_POLARIS10: 2525 case CHIP_POLARIS11: 2526 case CHIP_POLARIS12: 2527 case CHIP_VEGAM: 2528 case CHIP_CARRIZO: 2529 case CHIP_STONEY: 2530 if (adev->flags & AMD_IS_APU) 2531 adev->family = AMDGPU_FAMILY_CZ; 2532 else 2533 adev->family = AMDGPU_FAMILY_VI; 2534 2535 r = vi_set_ip_blocks(adev); 2536 if (r) 2537 return r; 2538 break; 2539 default: 2540 r = amdgpu_discovery_set_ip_blocks(adev); 2541 if (r) 2542 return r; 2543 break; 2544 } 2545 2546 if (amdgpu_has_atpx() && 2547 (amdgpu_is_atpx_hybrid() || 2548 amdgpu_has_atpx_dgpu_power_cntl()) && 2549 ((adev->flags & AMD_IS_APU) == 0) && 2550 !dev_is_removable(&adev->pdev->dev)) 2551 adev->flags |= AMD_IS_PX; 2552 2553 if (!(adev->flags & AMD_IS_APU)) { 2554 parent = pcie_find_root_port(adev->pdev); 2555 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2556 } 2557 2558 2559 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2560 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2561 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2562 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2563 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2564 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2565 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2566 2567 total = true; 2568 for (i = 0; i < adev->num_ip_blocks; i++) { 2569 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2570 DRM_WARN("disabled ip block: %d <%s>\n", 2571 i, adev->ip_blocks[i].version->funcs->name); 2572 adev->ip_blocks[i].status.valid = false; 2573 } else { 2574 if (adev->ip_blocks[i].version->funcs->early_init) { 2575 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2576 if (r == -ENOENT) { 2577 adev->ip_blocks[i].status.valid = false; 2578 } else if (r) { 2579 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2580 adev->ip_blocks[i].version->funcs->name, r); 2581 total = false; 2582 } else { 2583 adev->ip_blocks[i].status.valid = true; 2584 } 2585 } else { 2586 adev->ip_blocks[i].status.valid = true; 2587 } 2588 } 2589 /* get the vbios after the asic_funcs are set up */ 2590 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2591 r = amdgpu_device_parse_gpu_info_fw(adev); 2592 if (r) 2593 return r; 2594 2595 /* Read BIOS */ 2596 if (amdgpu_device_read_bios(adev)) { 2597 if (!amdgpu_get_bios(adev)) 2598 return -EINVAL; 2599 2600 r = amdgpu_atombios_init(adev); 2601 if (r) { 2602 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2603 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2604 return r; 2605 } 2606 } 2607 2608 /*get pf2vf msg info at it's earliest time*/ 2609 if (amdgpu_sriov_vf(adev)) 2610 amdgpu_virt_init_data_exchange(adev); 2611 2612 } 2613 } 2614 if (!total) 2615 return -ENODEV; 2616 2617 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2618 if (ip_block->status.valid != false) 2619 amdgpu_amdkfd_device_probe(adev); 2620 2621 adev->cg_flags &= amdgpu_cg_mask; 2622 adev->pg_flags &= amdgpu_pg_mask; 2623 2624 return 0; 2625 } 2626 2627 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2628 { 2629 int i, r; 2630 2631 for (i = 0; i < adev->num_ip_blocks; i++) { 2632 if (!adev->ip_blocks[i].status.sw) 2633 continue; 2634 if (adev->ip_blocks[i].status.hw) 2635 continue; 2636 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2637 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2638 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2639 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2640 if (r) { 2641 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2642 adev->ip_blocks[i].version->funcs->name, r); 2643 return r; 2644 } 2645 adev->ip_blocks[i].status.hw = true; 2646 } 2647 } 2648 2649 return 0; 2650 } 2651 2652 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2653 { 2654 int i, r; 2655 2656 for (i = 0; i < adev->num_ip_blocks; i++) { 2657 if (!adev->ip_blocks[i].status.sw) 2658 continue; 2659 if (adev->ip_blocks[i].status.hw) 2660 continue; 2661 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2662 if (r) { 2663 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2664 adev->ip_blocks[i].version->funcs->name, r); 2665 return r; 2666 } 2667 adev->ip_blocks[i].status.hw = true; 2668 } 2669 2670 return 0; 2671 } 2672 
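/*
 * Illustrative ordering of the two hw_init phases above relative to firmware
 * loading (a sketch of the flow in amdgpu_device_ip_init() below, not
 * additional driver logic):
 *
 *   r = amdgpu_device_ip_hw_init_phase1(adev);   // COMMON, IH (PSP on SR-IOV)
 *   if (!r)
 *           r = amdgpu_device_fw_loading(adev);  // PSP hw_init/resume, SMU fw
 *   if (!r)
 *           r = amdgpu_device_ip_hw_init_phase2(adev); // remaining IP blocks
 */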
2673 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2674 { 2675 int r = 0; 2676 int i; 2677 uint32_t smu_version; 2678 2679 if (adev->asic_type >= CHIP_VEGA10) { 2680 for (i = 0; i < adev->num_ip_blocks; i++) { 2681 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2682 continue; 2683 2684 if (!adev->ip_blocks[i].status.sw) 2685 continue; 2686 2687 /* no need to do the fw loading again if already done*/ 2688 if (adev->ip_blocks[i].status.hw == true) 2689 break; 2690 2691 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2692 r = adev->ip_blocks[i].version->funcs->resume(adev); 2693 if (r) { 2694 DRM_ERROR("resume of IP block <%s> failed %d\n", 2695 adev->ip_blocks[i].version->funcs->name, r); 2696 return r; 2697 } 2698 } else { 2699 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2700 if (r) { 2701 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2702 adev->ip_blocks[i].version->funcs->name, r); 2703 return r; 2704 } 2705 } 2706 2707 adev->ip_blocks[i].status.hw = true; 2708 break; 2709 } 2710 } 2711 2712 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2713 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2714 2715 return r; 2716 } 2717 2718 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2719 { 2720 long timeout; 2721 int r, i; 2722 2723 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2724 struct amdgpu_ring *ring = adev->rings[i]; 2725 2726 /* No need to setup the GPU scheduler for rings that don't need it */ 2727 if (!ring || ring->no_scheduler) 2728 continue; 2729 2730 switch (ring->funcs->type) { 2731 case AMDGPU_RING_TYPE_GFX: 2732 timeout = adev->gfx_timeout; 2733 break; 2734 case AMDGPU_RING_TYPE_COMPUTE: 2735 timeout = adev->compute_timeout; 2736 break; 2737 case AMDGPU_RING_TYPE_SDMA: 2738 timeout = adev->sdma_timeout; 2739 break; 2740 default: 2741 timeout = adev->video_timeout; 2742 break; 2743 } 2744 2745 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, 2746 DRM_SCHED_PRIORITY_COUNT, 2747 ring->num_hw_submission, 0, 2748 timeout, adev->reset_domain->wq, 2749 ring->sched_score, ring->name, 2750 adev->dev); 2751 if (r) { 2752 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2753 ring->name); 2754 return r; 2755 } 2756 r = amdgpu_uvd_entity_init(adev, ring); 2757 if (r) { 2758 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2759 ring->name); 2760 return r; 2761 } 2762 r = amdgpu_vce_entity_init(adev, ring); 2763 if (r) { 2764 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2765 ring->name); 2766 return r; 2767 } 2768 } 2769 2770 amdgpu_xcp_update_partition_sched_list(adev); 2771 2772 return 0; 2773 } 2774 2775 2776 /** 2777 * amdgpu_device_ip_init - run init for hardware IPs 2778 * 2779 * @adev: amdgpu_device pointer 2780 * 2781 * Main initialization pass for hardware IPs. The list of all the hardware 2782 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2783 * are run. sw_init initializes the software state associated with each IP 2784 * and hw_init initializes the hardware associated with each IP. 2785 * Returns 0 on success, negative error code on failure. 
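 *
 * Sketch of how an IP block provides these callbacks (illustrative only; the
 * struct layout follows its use throughout this file, the "foo" names are
 * hypothetical):
 *
 *   static const struct amd_ip_funcs foo_ip_funcs = {
 *           .name    = "foo",
 *           .sw_init = foo_sw_init,
 *           .hw_init = foo_hw_init,
 *   };
 *
 *   const struct amdgpu_ip_block_version foo_ip_block = {
 *           .type  = AMD_IP_BLOCK_TYPE_COMMON,
 *           .major = 1, .minor = 0, .rev = 0,
 *           .funcs = &foo_ip_funcs,
 *   };
 *
 *   amdgpu_device_ip_block_add(adev, &foo_ip_block);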
2786 */ 2787 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2788 { 2789 int i, r; 2790 2791 r = amdgpu_ras_init(adev); 2792 if (r) 2793 return r; 2794 2795 for (i = 0; i < adev->num_ip_blocks; i++) { 2796 if (!adev->ip_blocks[i].status.valid) 2797 continue; 2798 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2799 if (r) { 2800 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2801 adev->ip_blocks[i].version->funcs->name, r); 2802 goto init_failed; 2803 } 2804 adev->ip_blocks[i].status.sw = true; 2805 2806 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2807 /* need to do common hw init early so everything is set up for gmc */ 2808 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2809 if (r) { 2810 DRM_ERROR("hw_init %d failed %d\n", i, r); 2811 goto init_failed; 2812 } 2813 adev->ip_blocks[i].status.hw = true; 2814 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2815 /* need to do gmc hw init early so we can allocate gpu mem */ 2816 /* Try to reserve bad pages early */ 2817 if (amdgpu_sriov_vf(adev)) 2818 amdgpu_virt_exchange_data(adev); 2819 2820 r = amdgpu_device_mem_scratch_init(adev); 2821 if (r) { 2822 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2823 goto init_failed; 2824 } 2825 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2826 if (r) { 2827 DRM_ERROR("hw_init %d failed %d\n", i, r); 2828 goto init_failed; 2829 } 2830 r = amdgpu_device_wb_init(adev); 2831 if (r) { 2832 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2833 goto init_failed; 2834 } 2835 adev->ip_blocks[i].status.hw = true; 2836 2837 /* right after GMC hw init, we create CSA */ 2838 if (adev->gfx.mcbp) { 2839 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2840 AMDGPU_GEM_DOMAIN_VRAM | 2841 AMDGPU_GEM_DOMAIN_GTT, 2842 AMDGPU_CSA_SIZE); 2843 if (r) { 2844 DRM_ERROR("allocate CSA failed %d\n", r); 2845 goto init_failed; 2846 } 2847 } 2848 2849 r = amdgpu_seq64_init(adev); 2850 if (r) { 2851 DRM_ERROR("allocate seq64 failed %d\n", r); 2852 goto init_failed; 2853 } 2854 } 2855 } 2856 2857 if (amdgpu_sriov_vf(adev)) 2858 amdgpu_virt_init_data_exchange(adev); 2859 2860 r = amdgpu_ib_pool_init(adev); 2861 if (r) { 2862 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2863 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2864 goto init_failed; 2865 } 2866 2867 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2868 if (r) 2869 goto init_failed; 2870 2871 r = amdgpu_device_ip_hw_init_phase1(adev); 2872 if (r) 2873 goto init_failed; 2874 2875 r = amdgpu_device_fw_loading(adev); 2876 if (r) 2877 goto init_failed; 2878 2879 r = amdgpu_device_ip_hw_init_phase2(adev); 2880 if (r) 2881 goto init_failed; 2882 2883 /* 2884 * retired pages will be loaded from eeprom and reserved here, 2885 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2886 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2887 * for I2C communication which only true at this point. 2888 * 2889 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2890 * failure from bad gpu situation and stop amdgpu init process 2891 * accordingly. For other failed cases, it will still release all 2892 * the resource and print error message, rather than returning one 2893 * negative value to upper level. 
2894 * 2895 * Note: theoretically, this should be called before all vram allocations 2896 * to protect retired page from abusing 2897 */ 2898 r = amdgpu_ras_recovery_init(adev); 2899 if (r) 2900 goto init_failed; 2901 2902 /** 2903 * In case of XGMI grab extra reference for reset domain for this device 2904 */ 2905 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2906 if (amdgpu_xgmi_add_device(adev) == 0) { 2907 if (!amdgpu_sriov_vf(adev)) { 2908 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2909 2910 if (WARN_ON(!hive)) { 2911 r = -ENOENT; 2912 goto init_failed; 2913 } 2914 2915 if (!hive->reset_domain || 2916 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2917 r = -ENOENT; 2918 amdgpu_put_xgmi_hive(hive); 2919 goto init_failed; 2920 } 2921 2922 /* Drop the early temporary reset domain we created for device */ 2923 amdgpu_reset_put_reset_domain(adev->reset_domain); 2924 adev->reset_domain = hive->reset_domain; 2925 amdgpu_put_xgmi_hive(hive); 2926 } 2927 } 2928 } 2929 2930 r = amdgpu_device_init_schedulers(adev); 2931 if (r) 2932 goto init_failed; 2933 2934 if (adev->mman.buffer_funcs_ring->sched.ready) 2935 amdgpu_ttm_set_buffer_funcs_status(adev, true); 2936 2937 /* Don't init kfd if whole hive need to be reset during init */ 2938 if (!adev->gmc.xgmi.pending_reset) { 2939 kgd2kfd_init_zone_device(adev); 2940 amdgpu_amdkfd_device_init(adev); 2941 } 2942 2943 amdgpu_fru_get_product_info(adev); 2944 2945 init_failed: 2946 2947 return r; 2948 } 2949 2950 /** 2951 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2952 * 2953 * @adev: amdgpu_device pointer 2954 * 2955 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2956 * this function before a GPU reset. If the value is retained after a 2957 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2958 */ 2959 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2960 { 2961 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2962 } 2963 2964 /** 2965 * amdgpu_device_check_vram_lost - check if vram is valid 2966 * 2967 * @adev: amdgpu_device pointer 2968 * 2969 * Checks the reset magic value written to the gart pointer in VRAM. 2970 * The driver calls this after a GPU reset to see if the contents of 2971 * VRAM is lost or now. 2972 * returns true if vram is lost, false if not. 2973 */ 2974 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2975 { 2976 if (memcmp(adev->gart.ptr, adev->reset_magic, 2977 AMDGPU_RESET_MAGIC_NUM)) 2978 return true; 2979 2980 if (!amdgpu_in_reset(adev)) 2981 return false; 2982 2983 /* 2984 * For all ASICs with baco/mode1 reset, the VRAM is 2985 * always assumed to be lost. 2986 */ 2987 switch (amdgpu_asic_reset_method(adev)) { 2988 case AMD_RESET_METHOD_BACO: 2989 case AMD_RESET_METHOD_MODE1: 2990 return true; 2991 default: 2992 return false; 2993 } 2994 } 2995 2996 /** 2997 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2998 * 2999 * @adev: amdgpu_device pointer 3000 * @state: clockgating state (gate or ungate) 3001 * 3002 * The list of all the hardware IPs that make up the asic is walked and the 3003 * set_clockgating_state callbacks are run. 3004 * Late initialization pass enabling clockgating for hardware IPs. 3005 * Fini or suspend, pass disabling clockgating for hardware IPs. 3006 * Returns 0 on success, negative error code on failure. 
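 *
 * Typical call sites in this file, shown for orientation:
 *
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);    (late init)
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);  (fini/suspend)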
3007 */ 3008 3009 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3010 enum amd_clockgating_state state) 3011 { 3012 int i, j, r; 3013 3014 if (amdgpu_emu_mode == 1) 3015 return 0; 3016 3017 for (j = 0; j < adev->num_ip_blocks; j++) { 3018 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3019 if (!adev->ip_blocks[i].status.late_initialized) 3020 continue; 3021 /* skip CG for GFX, SDMA on S0ix */ 3022 if (adev->in_s0ix && 3023 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3024 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3025 continue; 3026 /* skip CG for VCE/UVD, it's handled specially */ 3027 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3028 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3029 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3030 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3031 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3032 /* enable clockgating to save power */ 3033 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 3034 state); 3035 if (r) { 3036 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3037 adev->ip_blocks[i].version->funcs->name, r); 3038 return r; 3039 } 3040 } 3041 } 3042 3043 return 0; 3044 } 3045 3046 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3047 enum amd_powergating_state state) 3048 { 3049 int i, j, r; 3050 3051 if (amdgpu_emu_mode == 1) 3052 return 0; 3053 3054 for (j = 0; j < adev->num_ip_blocks; j++) { 3055 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3056 if (!adev->ip_blocks[i].status.late_initialized) 3057 continue; 3058 /* skip PG for GFX, SDMA on S0ix */ 3059 if (adev->in_s0ix && 3060 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3061 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3062 continue; 3063 /* skip CG for VCE/UVD, it's handled specially */ 3064 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3065 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3066 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3067 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3068 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3069 /* enable powergating to save power */ 3070 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 3071 state); 3072 if (r) { 3073 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 3074 adev->ip_blocks[i].version->funcs->name, r); 3075 return r; 3076 } 3077 } 3078 } 3079 return 0; 3080 } 3081 3082 static int amdgpu_device_enable_mgpu_fan_boost(void) 3083 { 3084 struct amdgpu_gpu_instance *gpu_ins; 3085 struct amdgpu_device *adev; 3086 int i, ret = 0; 3087 3088 mutex_lock(&mgpu_info.mutex); 3089 3090 /* 3091 * MGPU fan boost feature should be enabled 3092 * only when there are two or more dGPUs in 3093 * the system 3094 */ 3095 if (mgpu_info.num_dgpu < 2) 3096 goto out; 3097 3098 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3099 gpu_ins = &(mgpu_info.gpu_ins[i]); 3100 adev = gpu_ins->adev; 3101 if (!(adev->flags & AMD_IS_APU) && 3102 !gpu_ins->mgpu_fan_enabled) { 3103 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3104 if (ret) 3105 break; 3106 3107 gpu_ins->mgpu_fan_enabled = 1; 3108 } 3109 } 3110 3111 out: 3112 mutex_unlock(&mgpu_info.mutex); 3113 3114 return ret; 3115 } 3116 3117 /** 3118 * amdgpu_device_ip_late_init - run late init for hardware IPs 3119 * 3120 * @adev: 
amdgpu_device pointer 3121 * 3122 * Late initialization pass for hardware IPs. The list of all the hardware 3123 * IPs that make up the asic is walked and the late_init callbacks are run. 3124 * late_init covers any special initialization that an IP requires 3125 * after all of them have been initialized or something that needs to happen 3126 * late in the init process. 3127 * Returns 0 on success, negative error code on failure. 3128 */ 3129 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3130 { 3131 struct amdgpu_gpu_instance *gpu_instance; 3132 int i = 0, r; 3133 3134 for (i = 0; i < adev->num_ip_blocks; i++) { 3135 if (!adev->ip_blocks[i].status.hw) 3136 continue; 3137 if (adev->ip_blocks[i].version->funcs->late_init) { 3138 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 3139 if (r) { 3140 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3141 adev->ip_blocks[i].version->funcs->name, r); 3142 return r; 3143 } 3144 } 3145 adev->ip_blocks[i].status.late_initialized = true; 3146 } 3147 3148 r = amdgpu_ras_late_init(adev); 3149 if (r) { 3150 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3151 return r; 3152 } 3153 3154 if (!amdgpu_in_reset(adev)) 3155 amdgpu_ras_set_error_query_ready(adev, true); 3156 3157 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3158 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3159 3160 amdgpu_device_fill_reset_magic(adev); 3161 3162 r = amdgpu_device_enable_mgpu_fan_boost(); 3163 if (r) 3164 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3165 3166 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ 3167 if (amdgpu_passthrough(adev) && 3168 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3169 adev->asic_type == CHIP_ALDEBARAN)) 3170 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3171 3172 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3173 mutex_lock(&mgpu_info.mutex); 3174 3175 /* 3176 * Reset device p-state to low as this was booted with high. 3177 * 3178 * This should be performed only after all devices from the same 3179 * hive get initialized. 3180 * 3181 * However, the number of devices in the hive is not known in advance, 3182 * as it is counted one by one during device initialization. 3183 * 3184 * So, we wait for all XGMI interlinked devices to be initialized. 3185 * This may bring some delays as those devices may come from 3186 * different hives. But that should be OK.
3187 */ 3188 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3189 for (i = 0; i < mgpu_info.num_gpu; i++) { 3190 gpu_instance = &(mgpu_info.gpu_ins[i]); 3191 if (gpu_instance->adev->flags & AMD_IS_APU) 3192 continue; 3193 3194 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3195 AMDGPU_XGMI_PSTATE_MIN); 3196 if (r) { 3197 DRM_ERROR("pstate setting failed (%d).\n", r); 3198 break; 3199 } 3200 } 3201 } 3202 3203 mutex_unlock(&mgpu_info.mutex); 3204 } 3205 3206 return 0; 3207 } 3208 3209 /** 3210 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3211 * 3212 * @adev: amdgpu_device pointer 3213 * 3214 * For ASICs need to disable SMC first 3215 */ 3216 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3217 { 3218 int i, r; 3219 3220 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3221 return; 3222 3223 for (i = 0; i < adev->num_ip_blocks; i++) { 3224 if (!adev->ip_blocks[i].status.hw) 3225 continue; 3226 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3227 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3228 /* XXX handle errors */ 3229 if (r) { 3230 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3231 adev->ip_blocks[i].version->funcs->name, r); 3232 } 3233 adev->ip_blocks[i].status.hw = false; 3234 break; 3235 } 3236 } 3237 } 3238 3239 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3240 { 3241 int i, r; 3242 3243 for (i = 0; i < adev->num_ip_blocks; i++) { 3244 if (!adev->ip_blocks[i].version->funcs->early_fini) 3245 continue; 3246 3247 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 3248 if (r) { 3249 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3250 adev->ip_blocks[i].version->funcs->name, r); 3251 } 3252 } 3253 3254 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3255 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3256 3257 amdgpu_amdkfd_suspend(adev, false); 3258 3259 /* Workaroud for ASICs need to disable SMC first */ 3260 amdgpu_device_smu_fini_early(adev); 3261 3262 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3263 if (!adev->ip_blocks[i].status.hw) 3264 continue; 3265 3266 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3267 /* XXX handle errors */ 3268 if (r) { 3269 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3270 adev->ip_blocks[i].version->funcs->name, r); 3271 } 3272 3273 adev->ip_blocks[i].status.hw = false; 3274 } 3275 3276 if (amdgpu_sriov_vf(adev)) { 3277 if (amdgpu_virt_release_full_gpu(adev, false)) 3278 DRM_ERROR("failed to release exclusive mode on fini\n"); 3279 } 3280 3281 return 0; 3282 } 3283 3284 /** 3285 * amdgpu_device_ip_fini - run fini for hardware IPs 3286 * 3287 * @adev: amdgpu_device pointer 3288 * 3289 * Main teardown pass for hardware IPs. The list of all the hardware 3290 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3291 * are run. hw_fini tears down the hardware associated with each IP 3292 * and sw_fini tears down any software state associated with each IP. 3293 * Returns 0 on success, negative error code on failure. 
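 *
 * Sketch of the overall teardown ordering (an assumption about the callers,
 * which live outside this section; amdgpu_device_ip_fini_early() above is
 * expected to run first from the hardware teardown path):
 *
 *   amdgpu_device_ip_fini_early(adev);   // early_fini + hw_fini, reverse order
 *   amdgpu_device_ip_fini(adev);         // sw_fini + late_fini, reverse order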
3294 */ 3295 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3296 { 3297 int i, r; 3298 3299 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3300 amdgpu_virt_release_ras_err_handler_data(adev); 3301 3302 if (adev->gmc.xgmi.num_physical_nodes > 1) 3303 amdgpu_xgmi_remove_device(adev); 3304 3305 amdgpu_amdkfd_device_fini_sw(adev); 3306 3307 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3308 if (!adev->ip_blocks[i].status.sw) 3309 continue; 3310 3311 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3312 amdgpu_ucode_free_bo(adev); 3313 amdgpu_free_static_csa(&adev->virt.csa_obj); 3314 amdgpu_device_wb_fini(adev); 3315 amdgpu_device_mem_scratch_fini(adev); 3316 amdgpu_ib_pool_fini(adev); 3317 amdgpu_seq64_fini(adev); 3318 } 3319 3320 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 3321 /* XXX handle errors */ 3322 if (r) { 3323 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3324 adev->ip_blocks[i].version->funcs->name, r); 3325 } 3326 adev->ip_blocks[i].status.sw = false; 3327 adev->ip_blocks[i].status.valid = false; 3328 } 3329 3330 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3331 if (!adev->ip_blocks[i].status.late_initialized) 3332 continue; 3333 if (adev->ip_blocks[i].version->funcs->late_fini) 3334 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 3335 adev->ip_blocks[i].status.late_initialized = false; 3336 } 3337 3338 amdgpu_ras_fini(adev); 3339 3340 return 0; 3341 } 3342 3343 /** 3344 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3345 * 3346 * @work: work_struct. 3347 */ 3348 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3349 { 3350 struct amdgpu_device *adev = 3351 container_of(work, struct amdgpu_device, delayed_init_work.work); 3352 int r; 3353 3354 r = amdgpu_ib_ring_tests(adev); 3355 if (r) 3356 DRM_ERROR("ib ring test failed (%d).\n", r); 3357 } 3358 3359 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3360 { 3361 struct amdgpu_device *adev = 3362 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3363 3364 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3365 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3366 3367 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3368 adev->gfx.gfx_off_state = true; 3369 } 3370 3371 /** 3372 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3373 * 3374 * @adev: amdgpu_device pointer 3375 * 3376 * Main suspend function for hardware IPs. The list of all the hardware 3377 * IPs that make up the asic is walked, clockgating is disabled and the 3378 * suspend callbacks are run. suspend puts the hardware and software state 3379 * in each IP into a state suitable for suspend. 3380 * Returns 0 on success, negative error code on failure. 3381 */ 3382 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3383 { 3384 int i, r; 3385 3386 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3387 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3388 3389 /* 3390 * Per PMFW team's suggestion, driver needs to handle gfxoff 3391 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3392 * scenario. Add the missing df cstate disablement here. 
3393 */ 3394 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3395 dev_warn(adev->dev, "Failed to disallow df cstate"); 3396 3397 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3398 if (!adev->ip_blocks[i].status.valid) 3399 continue; 3400 3401 /* displays are handled separately */ 3402 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3403 continue; 3404 3405 /* XXX handle errors */ 3406 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3407 /* XXX handle errors */ 3408 if (r) { 3409 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3410 adev->ip_blocks[i].version->funcs->name, r); 3411 return r; 3412 } 3413 3414 adev->ip_blocks[i].status.hw = false; 3415 } 3416 3417 return 0; 3418 } 3419 3420 /** 3421 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3422 * 3423 * @adev: amdgpu_device pointer 3424 * 3425 * Main suspend function for hardware IPs. The list of all the hardware 3426 * IPs that make up the asic is walked, clockgating is disabled and the 3427 * suspend callbacks are run. suspend puts the hardware and software state 3428 * in each IP into a state suitable for suspend. 3429 * Returns 0 on success, negative error code on failure. 3430 */ 3431 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3432 { 3433 int i, r; 3434 3435 if (adev->in_s0ix) 3436 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3437 3438 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3439 if (!adev->ip_blocks[i].status.valid) 3440 continue; 3441 /* displays are handled in phase1 */ 3442 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3443 continue; 3444 /* PSP lost connection when err_event_athub occurs */ 3445 if (amdgpu_ras_intr_triggered() && 3446 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3447 adev->ip_blocks[i].status.hw = false; 3448 continue; 3449 } 3450 3451 /* skip unnecessary suspend if we do not initialize them yet */ 3452 if (adev->gmc.xgmi.pending_reset && 3453 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3454 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3455 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3456 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3457 adev->ip_blocks[i].status.hw = false; 3458 continue; 3459 } 3460 3461 /* skip suspend of gfx/mes and psp for S0ix 3462 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3463 * like at runtime. PSP is also part of the always on hardware 3464 * so no need to suspend it. 3465 */ 3466 if (adev->in_s0ix && 3467 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3468 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3469 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3470 continue; 3471 3472 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3473 if (adev->in_s0ix && 3474 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3475 IP_VERSION(5, 0, 0)) && 3476 (adev->ip_blocks[i].version->type == 3477 AMD_IP_BLOCK_TYPE_SDMA)) 3478 continue; 3479 3480 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3481 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3482 * from this location and RLC Autoload automatically also gets loaded 3483 * from here based on PMFW -> PSP message during re-init sequence. 3484 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3485 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3486 */ 3487 if (amdgpu_in_reset(adev) && 3488 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3489 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3490 continue; 3491 3492 /* XXX handle errors */ 3493 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3494 /* XXX handle errors */ 3495 if (r) { 3496 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3497 adev->ip_blocks[i].version->funcs->name, r); 3498 } 3499 adev->ip_blocks[i].status.hw = false; 3500 /* handle putting the SMC in the appropriate state */ 3501 if (!amdgpu_sriov_vf(adev)) { 3502 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3503 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3504 if (r) { 3505 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3506 adev->mp1_state, r); 3507 return r; 3508 } 3509 } 3510 } 3511 } 3512 3513 return 0; 3514 } 3515 3516 /** 3517 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3518 * 3519 * @adev: amdgpu_device pointer 3520 * 3521 * Main suspend function for hardware IPs. The list of all the hardware 3522 * IPs that make up the asic is walked, clockgating is disabled and the 3523 * suspend callbacks are run. suspend puts the hardware and software state 3524 * in each IP into a state suitable for suspend. 3525 * Returns 0 on success, negative error code on failure. 3526 */ 3527 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3528 { 3529 int r; 3530 3531 if (amdgpu_sriov_vf(adev)) { 3532 amdgpu_virt_fini_data_exchange(adev); 3533 amdgpu_virt_request_full_gpu(adev, false); 3534 } 3535 3536 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3537 3538 r = amdgpu_device_ip_suspend_phase1(adev); 3539 if (r) 3540 return r; 3541 r = amdgpu_device_ip_suspend_phase2(adev); 3542 3543 if (amdgpu_sriov_vf(adev)) 3544 amdgpu_virt_release_full_gpu(adev, false); 3545 3546 return r; 3547 } 3548 3549 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3550 { 3551 int i, r; 3552 3553 static enum amd_ip_block_type ip_order[] = { 3554 AMD_IP_BLOCK_TYPE_COMMON, 3555 AMD_IP_BLOCK_TYPE_GMC, 3556 AMD_IP_BLOCK_TYPE_PSP, 3557 AMD_IP_BLOCK_TYPE_IH, 3558 }; 3559 3560 for (i = 0; i < adev->num_ip_blocks; i++) { 3561 int j; 3562 struct amdgpu_ip_block *block; 3563 3564 block = &adev->ip_blocks[i]; 3565 block->status.hw = false; 3566 3567 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3568 3569 if (block->version->type != ip_order[j] || 3570 !block->status.valid) 3571 continue; 3572 3573 r = block->version->funcs->hw_init(adev); 3574 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3575 if (r) 3576 return r; 3577 block->status.hw = true; 3578 } 3579 } 3580 3581 return 0; 3582 } 3583 3584 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3585 { 3586 int i, r; 3587 3588 static enum amd_ip_block_type ip_order[] = { 3589 AMD_IP_BLOCK_TYPE_SMC, 3590 AMD_IP_BLOCK_TYPE_DCE, 3591 AMD_IP_BLOCK_TYPE_GFX, 3592 AMD_IP_BLOCK_TYPE_SDMA, 3593 AMD_IP_BLOCK_TYPE_MES, 3594 AMD_IP_BLOCK_TYPE_UVD, 3595 AMD_IP_BLOCK_TYPE_VCE, 3596 AMD_IP_BLOCK_TYPE_VCN, 3597 AMD_IP_BLOCK_TYPE_JPEG 3598 }; 3599 3600 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3601 int j; 3602 struct amdgpu_ip_block *block; 3603 3604 for (j = 0; j < adev->num_ip_blocks; j++) { 3605 block = &adev->ip_blocks[j]; 3606 3607 if (block->version->type != ip_order[i] || 3608 !block->status.valid || 3609 block->status.hw) 3610 continue; 3611 3612 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3613 r = block->version->funcs->resume(adev); 3614 else 
3615 r = block->version->funcs->hw_init(adev); 3616 3617 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3618 if (r) 3619 return r; 3620 block->status.hw = true; 3621 } 3622 } 3623 3624 return 0; 3625 } 3626 3627 /** 3628 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3629 * 3630 * @adev: amdgpu_device pointer 3631 * 3632 * First resume function for hardware IPs. The list of all the hardware 3633 * IPs that make up the asic is walked and the resume callbacks are run for 3634 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3635 * after a suspend and updates the software state as necessary. This 3636 * function is also used for restoring the GPU after a GPU reset. 3637 * Returns 0 on success, negative error code on failure. 3638 */ 3639 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3640 { 3641 int i, r; 3642 3643 for (i = 0; i < adev->num_ip_blocks; i++) { 3644 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3645 continue; 3646 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3647 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3648 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3649 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3650 3651 r = adev->ip_blocks[i].version->funcs->resume(adev); 3652 if (r) { 3653 DRM_ERROR("resume of IP block <%s> failed %d\n", 3654 adev->ip_blocks[i].version->funcs->name, r); 3655 return r; 3656 } 3657 adev->ip_blocks[i].status.hw = true; 3658 } 3659 } 3660 3661 return 0; 3662 } 3663 3664 /** 3665 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3666 * 3667 * @adev: amdgpu_device pointer 3668 * 3669 * First resume function for hardware IPs. The list of all the hardware 3670 * IPs that make up the asic is walked and the resume callbacks are run for 3671 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3672 * functional state after a suspend and updates the software state as 3673 * necessary. This function is also used for restoring the GPU after a GPU 3674 * reset. 3675 * Returns 0 on success, negative error code on failure. 3676 */ 3677 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3678 { 3679 int i, r; 3680 3681 for (i = 0; i < adev->num_ip_blocks; i++) { 3682 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3683 continue; 3684 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3685 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3686 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3687 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3688 continue; 3689 r = adev->ip_blocks[i].version->funcs->resume(adev); 3690 if (r) { 3691 DRM_ERROR("resume of IP block <%s> failed %d\n", 3692 adev->ip_blocks[i].version->funcs->name, r); 3693 return r; 3694 } 3695 adev->ip_blocks[i].status.hw = true; 3696 } 3697 3698 return 0; 3699 } 3700 3701 /** 3702 * amdgpu_device_ip_resume - run resume for hardware IPs 3703 * 3704 * @adev: amdgpu_device pointer 3705 * 3706 * Main resume function for hardware IPs. The hardware IPs 3707 * are split into two resume functions because they are 3708 * also used in recovering from a GPU reset and some additional 3709 * steps need to be take between them. In this case (S3/S4) they are 3710 * run sequentially. 3711 * Returns 0 on success, negative error code on failure. 
3712 */ 3713 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3714 { 3715 int r; 3716 3717 r = amdgpu_device_ip_resume_phase1(adev); 3718 if (r) 3719 return r; 3720 3721 r = amdgpu_device_fw_loading(adev); 3722 if (r) 3723 return r; 3724 3725 r = amdgpu_device_ip_resume_phase2(adev); 3726 3727 if (adev->mman.buffer_funcs_ring->sched.ready) 3728 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3729 3730 return r; 3731 } 3732 3733 /** 3734 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3735 * 3736 * @adev: amdgpu_device pointer 3737 * 3738 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3739 */ 3740 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3741 { 3742 if (amdgpu_sriov_vf(adev)) { 3743 if (adev->is_atom_fw) { 3744 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3745 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3746 } else { 3747 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3748 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3749 } 3750 3751 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3752 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3753 } 3754 } 3755 3756 /** 3757 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3758 * 3759 * @asic_type: AMD asic type 3760 * 3761 * Check if there is DC (new modesetting infrastructre) support for an asic. 3762 * returns true if DC has support, false if not. 3763 */ 3764 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3765 { 3766 switch (asic_type) { 3767 #ifdef CONFIG_DRM_AMDGPU_SI 3768 case CHIP_HAINAN: 3769 #endif 3770 case CHIP_TOPAZ: 3771 /* chips with no display hardware */ 3772 return false; 3773 #if defined(CONFIG_DRM_AMD_DC) 3774 case CHIP_TAHITI: 3775 case CHIP_PITCAIRN: 3776 case CHIP_VERDE: 3777 case CHIP_OLAND: 3778 /* 3779 * We have systems in the wild with these ASICs that require 3780 * LVDS and VGA support which is not supported with DC. 3781 * 3782 * Fallback to the non-DC driver here by default so as not to 3783 * cause regressions. 3784 */ 3785 #if defined(CONFIG_DRM_AMD_DC_SI) 3786 return amdgpu_dc > 0; 3787 #else 3788 return false; 3789 #endif 3790 case CHIP_BONAIRE: 3791 case CHIP_KAVERI: 3792 case CHIP_KABINI: 3793 case CHIP_MULLINS: 3794 /* 3795 * We have systems in the wild with these ASICs that require 3796 * VGA support which is not supported with DC. 3797 * 3798 * Fallback to the non-DC driver here by default so as not to 3799 * cause regressions. 
3800 */ 3801 return amdgpu_dc > 0; 3802 default: 3803 return amdgpu_dc != 0; 3804 #else 3805 default: 3806 if (amdgpu_dc > 0) 3807 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3808 return false; 3809 #endif 3810 } 3811 } 3812 3813 /** 3814 * amdgpu_device_has_dc_support - check if dc is supported 3815 * 3816 * @adev: amdgpu_device pointer 3817 * 3818 * Returns true for supported, false for not supported 3819 */ 3820 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3821 { 3822 if (adev->enable_virtual_display || 3823 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3824 return false; 3825 3826 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3827 } 3828 3829 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3830 { 3831 struct amdgpu_device *adev = 3832 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3833 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3834 3835 /* It's a bug to not have a hive within this function */ 3836 if (WARN_ON(!hive)) 3837 return; 3838 3839 /* 3840 * Use task barrier to synchronize all xgmi reset works across the 3841 * hive. task_barrier_enter and task_barrier_exit will block 3842 * until all the threads running the xgmi reset works reach 3843 * those points. task_barrier_full will do both blocks. 3844 */ 3845 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3846 3847 task_barrier_enter(&hive->tb); 3848 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3849 3850 if (adev->asic_reset_res) 3851 goto fail; 3852 3853 task_barrier_exit(&hive->tb); 3854 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3855 3856 if (adev->asic_reset_res) 3857 goto fail; 3858 3859 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3860 } else { 3861 3862 task_barrier_full(&hive->tb); 3863 adev->asic_reset_res = amdgpu_asic_reset(adev); 3864 } 3865 3866 fail: 3867 if (adev->asic_reset_res) 3868 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3869 adev->asic_reset_res, adev_to_drm(adev)->unique); 3870 amdgpu_put_xgmi_hive(hive); 3871 } 3872 3873 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3874 { 3875 char *input = amdgpu_lockup_timeout; 3876 char *timeout_setting = NULL; 3877 int index = 0; 3878 long timeout; 3879 int ret = 0; 3880 3881 /* 3882 * By default timeout for non compute jobs is 10000 3883 * and 60000 for compute jobs. 3884 * In SR-IOV or passthrough mode, timeout for compute 3885 * jobs are 60000 by default. 3886 */ 3887 adev->gfx_timeout = msecs_to_jiffies(10000); 3888 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3889 if (amdgpu_sriov_vf(adev)) 3890 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3891 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3892 else 3893 adev->compute_timeout = msecs_to_jiffies(60000); 3894 3895 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3896 while ((timeout_setting = strsep(&input, ",")) && 3897 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3898 ret = kstrtol(timeout_setting, 0, &timeout); 3899 if (ret) 3900 return ret; 3901 3902 if (timeout == 0) { 3903 index++; 3904 continue; 3905 } else if (timeout < 0) { 3906 timeout = MAX_SCHEDULE_TIMEOUT; 3907 dev_warn(adev->dev, "lockup timeout disabled"); 3908 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3909 } else { 3910 timeout = msecs_to_jiffies(timeout); 3911 } 3912 3913 switch (index++) { 3914 case 0: 3915 adev->gfx_timeout = timeout; 3916 break; 3917 case 1: 3918 adev->compute_timeout = timeout; 3919 break; 3920 case 2: 3921 adev->sdma_timeout = timeout; 3922 break; 3923 case 3: 3924 adev->video_timeout = timeout; 3925 break; 3926 default: 3927 break; 3928 } 3929 } 3930 /* 3931 * There is only one value specified and 3932 * it should apply to all non-compute jobs. 3933 */ 3934 if (index == 1) { 3935 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3936 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3937 adev->compute_timeout = adev->gfx_timeout; 3938 } 3939 } 3940 3941 return ret; 3942 } 3943 3944 /** 3945 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3946 * 3947 * @adev: amdgpu_device pointer 3948 * 3949 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3950 */ 3951 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3952 { 3953 struct iommu_domain *domain; 3954 3955 domain = iommu_get_domain_for_dev(adev->dev); 3956 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3957 adev->ram_is_direct_mapped = true; 3958 } 3959 3960 #if defined(CONFIG_HSA_AMD_P2P) 3961 /** 3962 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 3963 * 3964 * @adev: amdgpu_device pointer 3965 * 3966 * return if IOMMU remapping bar address 3967 */ 3968 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 3969 { 3970 struct iommu_domain *domain; 3971 3972 domain = iommu_get_domain_for_dev(adev->dev); 3973 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 3974 domain->type == IOMMU_DOMAIN_DMA_FQ)) 3975 return true; 3976 3977 return false; 3978 } 3979 #endif 3980 3981 static const struct attribute *amdgpu_dev_attributes[] = { 3982 &dev_attr_pcie_replay_count.attr, 3983 NULL 3984 }; 3985 3986 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3987 { 3988 if (amdgpu_mcbp == 1) 3989 adev->gfx.mcbp = true; 3990 else if (amdgpu_mcbp == 0) 3991 adev->gfx.mcbp = false; 3992 3993 if (amdgpu_sriov_vf(adev)) 3994 adev->gfx.mcbp = true; 3995 3996 if (adev->gfx.mcbp) 3997 DRM_INFO("MCBP is enabled\n"); 3998 } 3999 4000 /** 4001 * amdgpu_device_init - initialize the driver 4002 * 4003 * @adev: amdgpu_device pointer 4004 * @flags: driver flags 4005 * 4006 * Initializes the driver info and hw (all asics). 4007 * Returns 0 for success or an error on failure. 4008 * Called at driver startup. 
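* Roughly: software state, locks and work items are set up first, MMIO is
* mapped and a per-device reset domain is created, the IP blocks go through
* early init, the ASIC is reset/posted if required, clocks are read from the
* VBIOS, and then the fence driver, mode config, IP init/late init and the
* various sysfs interfaces are brought up.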
4009 */ 4010 int amdgpu_device_init(struct amdgpu_device *adev, 4011 uint32_t flags) 4012 { 4013 struct drm_device *ddev = adev_to_drm(adev); 4014 struct pci_dev *pdev = adev->pdev; 4015 int r, i; 4016 bool px = false; 4017 u32 max_MBps; 4018 int tmp; 4019 4020 adev->shutdown = false; 4021 adev->flags = flags; 4022 4023 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4024 adev->asic_type = amdgpu_force_asic_type; 4025 else 4026 adev->asic_type = flags & AMD_ASIC_MASK; 4027 4028 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4029 if (amdgpu_emu_mode == 1) 4030 adev->usec_timeout *= 10; 4031 adev->gmc.gart_size = 512 * 1024 * 1024; 4032 adev->accel_working = false; 4033 adev->num_rings = 0; 4034 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4035 adev->mman.buffer_funcs = NULL; 4036 adev->mman.buffer_funcs_ring = NULL; 4037 adev->vm_manager.vm_pte_funcs = NULL; 4038 adev->vm_manager.vm_pte_num_scheds = 0; 4039 adev->gmc.gmc_funcs = NULL; 4040 adev->harvest_ip_mask = 0x0; 4041 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4042 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4043 4044 adev->smc_rreg = &amdgpu_invalid_rreg; 4045 adev->smc_wreg = &amdgpu_invalid_wreg; 4046 adev->pcie_rreg = &amdgpu_invalid_rreg; 4047 adev->pcie_wreg = &amdgpu_invalid_wreg; 4048 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4049 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4050 adev->pciep_rreg = &amdgpu_invalid_rreg; 4051 adev->pciep_wreg = &amdgpu_invalid_wreg; 4052 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4053 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4054 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4055 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4056 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4057 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4058 adev->didt_rreg = &amdgpu_invalid_rreg; 4059 adev->didt_wreg = &amdgpu_invalid_wreg; 4060 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4061 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4062 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4063 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4064 4065 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4066 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4067 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4068 4069 /* mutex initialization are all done here so we 4070 * can recall function without having locking issues 4071 */ 4072 mutex_init(&adev->firmware.mutex); 4073 mutex_init(&adev->pm.mutex); 4074 mutex_init(&adev->gfx.gpu_clock_mutex); 4075 mutex_init(&adev->srbm_mutex); 4076 mutex_init(&adev->gfx.pipe_reserve_mutex); 4077 mutex_init(&adev->gfx.gfx_off_mutex); 4078 mutex_init(&adev->gfx.partition_mutex); 4079 mutex_init(&adev->grbm_idx_mutex); 4080 mutex_init(&adev->mn_lock); 4081 mutex_init(&adev->virt.vf_errors.lock); 4082 mutex_init(&adev->virt.rlcg_reg_lock); 4083 hash_init(adev->mn_hash); 4084 mutex_init(&adev->psp.mutex); 4085 mutex_init(&adev->notifier_lock); 4086 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4087 mutex_init(&adev->benchmark_mutex); 4088 mutex_init(&adev->gfx.reset_sem_mutex); 4089 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4090 mutex_init(&adev->enforce_isolation_mutex); 4091 mutex_init(&adev->gfx.kfd_sch_mutex); 4092 4093 amdgpu_device_init_apu_flags(adev); 4094 4095 r = amdgpu_device_check_arguments(adev); 4096 if (r) 4097 return r; 4098 4099 
spin_lock_init(&adev->mmio_idx_lock); 4100 spin_lock_init(&adev->smc_idx_lock); 4101 spin_lock_init(&adev->pcie_idx_lock); 4102 spin_lock_init(&adev->uvd_ctx_idx_lock); 4103 spin_lock_init(&adev->didt_idx_lock); 4104 spin_lock_init(&adev->gc_cac_idx_lock); 4105 spin_lock_init(&adev->se_cac_idx_lock); 4106 spin_lock_init(&adev->audio_endpt_idx_lock); 4107 spin_lock_init(&adev->mm_stats.lock); 4108 spin_lock_init(&adev->wb.lock); 4109 4110 INIT_LIST_HEAD(&adev->shadow_list); 4111 mutex_init(&adev->shadow_list_lock); 4112 4113 INIT_LIST_HEAD(&adev->reset_list); 4114 4115 INIT_LIST_HEAD(&adev->ras_list); 4116 4117 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4118 4119 INIT_DELAYED_WORK(&adev->delayed_init_work, 4120 amdgpu_device_delayed_init_work_handler); 4121 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4122 amdgpu_device_delay_enable_gfx_off); 4123 /* 4124 * Initialize the enforce_isolation work structures for each XCP 4125 * partition. This work handler is responsible for enforcing shader 4126 * isolation on AMD GPUs. It counts the number of emitted fences for 4127 * each GFX and compute ring. If there are any fences, it schedules 4128 * the `enforce_isolation_work` to be run after a delay. If there are 4129 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4130 * runqueue. 4131 */ 4132 for (i = 0; i < MAX_XCP; i++) { 4133 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4134 amdgpu_gfx_enforce_isolation_handler); 4135 adev->gfx.enforce_isolation[i].adev = adev; 4136 adev->gfx.enforce_isolation[i].xcp_id = i; 4137 } 4138 4139 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4140 4141 adev->gfx.gfx_off_req_count = 1; 4142 adev->gfx.gfx_off_residency = 0; 4143 adev->gfx.gfx_off_entrycount = 0; 4144 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4145 4146 atomic_set(&adev->throttling_logging_enabled, 1); 4147 /* 4148 * If throttling continues, logging will be performed every minute 4149 * to avoid log flooding. "-1" is subtracted since the thermal 4150 * throttling interrupt comes every second. Thus, the total logging 4151 * interval is 59 seconds(retelimited printk interval) + 1(waiting 4152 * for throttling interrupt) = 60 seconds. 4153 */ 4154 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4155 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4156 4157 /* Registers mapping */ 4158 /* TODO: block userspace mapping of io register */ 4159 if (adev->asic_type >= CHIP_BONAIRE) { 4160 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4161 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4162 } else { 4163 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4164 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4165 } 4166 4167 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4168 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4169 4170 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4171 if (!adev->rmmio) 4172 return -ENOMEM; 4173 4174 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 4175 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 4176 4177 /* 4178 * Reset domain needs to be present early, before XGMI hive discovered 4179 * (if any) and intitialized to use reset sem and in_gpu reset flag 4180 * early on during init and before calling to RREG32. 
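* Register accessors can take the reset domain's semaphore, which is why the
* domain must exist before the first RREG32. For XGMI configurations this
* per-device domain is typically replaced by a hive-wide domain once the
* hive is set up.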
4181 */
4182 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
4183 if (!adev->reset_domain)
4184 return -ENOMEM;
4185
4186 /* detect hw virtualization here */
4187 amdgpu_detect_virtualization(adev);
4188
4189 amdgpu_device_get_pcie_info(adev);
4190
4191 r = amdgpu_device_get_job_timeout_settings(adev);
4192 if (r) {
4193 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4194 return r;
4195 }
4196
4197 amdgpu_device_set_mcbp(adev);
4198
4199 /* early init functions */
4200 r = amdgpu_device_ip_early_init(adev);
4201 if (r)
4202 return r;
4203
4204 /* Get rid of things like offb */
4205 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
4206 if (r)
4207 return r;
4208
4209 /* Enable TMZ based on IP_VERSION */
4210 amdgpu_gmc_tmz_set(adev);
4211
4212 if (amdgpu_sriov_vf(adev) &&
4213 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
4214 /* VF MMIO access (except mailbox range) from CPU
4215 * will be blocked during sriov runtime
4216 */
4217 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
4218
4219 amdgpu_gmc_noretry_set(adev);
4220 /* Need to get xgmi info early to decide the reset behavior */
4221 if (adev->gmc.xgmi.supported) {
4222 r = adev->gfxhub.funcs->get_xgmi_info(adev);
4223 if (r)
4224 return r;
4225 }
4226
4227 /* enable PCIE atomic ops */
4228 if (amdgpu_sriov_vf(adev)) {
4229 if (adev->virt.fw_reserve.p_pf2vf)
4230 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
4231 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
4232 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4233 /* APUs with GFX9 and newer don't rely on PCIe atomics; their internal
4234 * path natively supports atomics, so set have_atomics_support to true.
4235 */
4236 } else if ((adev->flags & AMD_IS_APU) &&
4237 (amdgpu_ip_version(adev, GC_HWIP, 0) >
4238 IP_VERSION(9, 0, 0))) {
4239 adev->have_atomics_support = true;
4240 } else {
4241 adev->have_atomics_support =
4242 !pci_enable_atomic_ops_to_root(adev->pdev,
4243 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
4244 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4245 }
4246
4247 if (!adev->have_atomics_support)
4248 dev_info(adev->dev, "PCIE atomic ops are not supported\n");
4249
4250 /* doorbell bar mapping and doorbell index init */
4251 amdgpu_doorbell_init(adev);
4252
4253 if (amdgpu_emu_mode == 1) {
4254 /* post the asic on emulation mode */
4255 emu_soc_asic_init(adev);
4256 goto fence_driver_init;
4257 }
4258
4259 amdgpu_reset_init(adev);
4260
4261 /* detect if we are running with an SR-IOV vbios */
4262 if (adev->bios)
4263 amdgpu_device_detect_sriov_bios(adev);
4264
4265 /* check if we need to reset the asic
4266 * E.g., driver was not cleanly unloaded previously, etc.
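* Three cases are handled below: XGMI hives defer to a coordinated hive reset
* (pending_reset), leaving only the blocks the SMU needs (COMMON, GMC, IH,
* SMC) to be brought up; MP1 13.0.10 parts without display hardware use a PSP
* reset; everything else performs a default ASIC reset with the reset_method
* module override temporarily ignored.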
4267 */ 4268 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4269 if (adev->gmc.xgmi.num_physical_nodes) { 4270 dev_info(adev->dev, "Pending hive reset.\n"); 4271 adev->gmc.xgmi.pending_reset = true; 4272 /* Only need to init necessary block for SMU to handle the reset */ 4273 for (i = 0; i < adev->num_ip_blocks; i++) { 4274 if (!adev->ip_blocks[i].status.valid) 4275 continue; 4276 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4277 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4278 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4279 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 4280 DRM_DEBUG("IP %s disabled for hw_init.\n", 4281 adev->ip_blocks[i].version->funcs->name); 4282 adev->ip_blocks[i].status.hw = true; 4283 } 4284 } 4285 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4286 !amdgpu_device_has_display_hardware(adev)) { 4287 r = psp_gpu_reset(adev); 4288 } else { 4289 tmp = amdgpu_reset_method; 4290 /* It should do a default reset when loading or reloading the driver, 4291 * regardless of the module parameter reset_method. 4292 */ 4293 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4294 r = amdgpu_asic_reset(adev); 4295 amdgpu_reset_method = tmp; 4296 } 4297 4298 if (r) { 4299 dev_err(adev->dev, "asic reset on init failed\n"); 4300 goto failed; 4301 } 4302 } 4303 4304 /* Post card if necessary */ 4305 if (amdgpu_device_need_post(adev)) { 4306 if (!adev->bios) { 4307 dev_err(adev->dev, "no vBIOS found\n"); 4308 r = -EINVAL; 4309 goto failed; 4310 } 4311 DRM_INFO("GPU posting now...\n"); 4312 r = amdgpu_device_asic_init(adev); 4313 if (r) { 4314 dev_err(adev->dev, "gpu post error!\n"); 4315 goto failed; 4316 } 4317 } 4318 4319 if (adev->bios) { 4320 if (adev->is_atom_fw) { 4321 /* Initialize clocks */ 4322 r = amdgpu_atomfirmware_get_clock_info(adev); 4323 if (r) { 4324 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4325 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4326 goto failed; 4327 } 4328 } else { 4329 /* Initialize clocks */ 4330 r = amdgpu_atombios_get_clock_info(adev); 4331 if (r) { 4332 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4333 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4334 goto failed; 4335 } 4336 /* init i2c buses */ 4337 if (!amdgpu_device_has_dc_support(adev)) 4338 amdgpu_atombios_i2c_init(adev); 4339 } 4340 } 4341 4342 fence_driver_init: 4343 /* Fence driver */ 4344 r = amdgpu_fence_driver_sw_init(adev); 4345 if (r) { 4346 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4347 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4348 goto failed; 4349 } 4350 4351 /* init the mode config */ 4352 drm_mode_config_init(adev_to_drm(adev)); 4353 4354 r = amdgpu_device_ip_init(adev); 4355 if (r) { 4356 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4357 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4358 goto release_ras_con; 4359 } 4360 4361 amdgpu_fence_driver_hw_init(adev); 4362 4363 dev_info(adev->dev, 4364 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4365 adev->gfx.config.max_shader_engines, 4366 adev->gfx.config.max_sh_per_se, 4367 adev->gfx.config.max_cu_per_sh, 4368 adev->gfx.cu_info.number); 4369 4370 adev->accel_working = true; 4371 4372 amdgpu_vm_check_compute_bug(adev); 4373 4374 /* Initialize the buffer migration limit. 
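* The limit is stored as a log2 so later throttling math can be done with
* shifts; e.g. with the default of 8 MB/s, log2_max_MBps = ilog2(8) = 3.
* A non-negative amdgpu.moverate module parameter overrides the default.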
*/ 4375 if (amdgpu_moverate >= 0) 4376 max_MBps = amdgpu_moverate; 4377 else 4378 max_MBps = 8; /* Allow 8 MB/s. */ 4379 /* Get a log2 for easy divisions. */ 4380 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4381 4382 /* 4383 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4384 * Otherwise the mgpu fan boost feature will be skipped due to the 4385 * gpu instance is counted less. 4386 */ 4387 amdgpu_register_gpu_instance(adev); 4388 4389 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4390 * explicit gating rather than handling it automatically. 4391 */ 4392 if (!adev->gmc.xgmi.pending_reset) { 4393 r = amdgpu_device_ip_late_init(adev); 4394 if (r) { 4395 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4396 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4397 goto release_ras_con; 4398 } 4399 /* must succeed. */ 4400 amdgpu_ras_resume(adev); 4401 queue_delayed_work(system_wq, &adev->delayed_init_work, 4402 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4403 } 4404 4405 if (amdgpu_sriov_vf(adev)) { 4406 amdgpu_virt_release_full_gpu(adev, true); 4407 flush_delayed_work(&adev->delayed_init_work); 4408 } 4409 4410 /* 4411 * Place those sysfs registering after `late_init`. As some of those 4412 * operations performed in `late_init` might affect the sysfs 4413 * interfaces creating. 4414 */ 4415 r = amdgpu_atombios_sysfs_init(adev); 4416 if (r) 4417 drm_err(&adev->ddev, 4418 "registering atombios sysfs failed (%d).\n", r); 4419 4420 r = amdgpu_pm_sysfs_init(adev); 4421 if (r) 4422 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4423 4424 r = amdgpu_ucode_sysfs_init(adev); 4425 if (r) { 4426 adev->ucode_sysfs_en = false; 4427 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4428 } else 4429 adev->ucode_sysfs_en = true; 4430 4431 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4432 if (r) 4433 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4434 4435 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4436 if (r) 4437 dev_err(adev->dev, 4438 "Could not create amdgpu board attributes\n"); 4439 4440 amdgpu_fru_sysfs_init(adev); 4441 amdgpu_reg_state_sysfs_init(adev); 4442 4443 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4444 r = amdgpu_pmu_init(adev); 4445 if (r) 4446 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4447 4448 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4449 if (amdgpu_device_cache_pci_state(adev->pdev)) 4450 pci_restore_state(pdev); 4451 4452 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4453 /* this will fail for cards that aren't VGA class devices, just 4454 * ignore it 4455 */ 4456 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4457 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4458 4459 px = amdgpu_device_supports_px(ddev); 4460 4461 if (px || (!dev_is_removable(&adev->pdev->dev) && 4462 apple_gmux_detect(NULL, NULL))) 4463 vga_switcheroo_register_client(adev->pdev, 4464 &amdgpu_switcheroo_ops, px); 4465 4466 if (px) 4467 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4468 4469 if (adev->gmc.xgmi.pending_reset) 4470 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4471 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4472 4473 amdgpu_device_check_iommu_direct_map(adev); 4474 4475 return 0; 4476 4477 release_ras_con: 4478 if (amdgpu_sriov_vf(adev)) 4479 amdgpu_virt_release_full_gpu(adev, true); 4480 4481 /* failed in exclusive mode due to timeout */ 4482 if 
(amdgpu_sriov_vf(adev) && 4483 !amdgpu_sriov_runtime(adev) && 4484 amdgpu_virt_mmio_blocked(adev) && 4485 !amdgpu_virt_wait_reset(adev)) { 4486 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4487 /* Don't send request since VF is inactive. */ 4488 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4489 adev->virt.ops = NULL; 4490 r = -EAGAIN; 4491 } 4492 amdgpu_release_ras_context(adev); 4493 4494 failed: 4495 amdgpu_vf_error_trans_all(adev); 4496 4497 return r; 4498 } 4499 4500 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4501 { 4502 4503 /* Clear all CPU mappings pointing to this device */ 4504 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4505 4506 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4507 amdgpu_doorbell_fini(adev); 4508 4509 iounmap(adev->rmmio); 4510 adev->rmmio = NULL; 4511 if (adev->mman.aper_base_kaddr) 4512 iounmap(adev->mman.aper_base_kaddr); 4513 adev->mman.aper_base_kaddr = NULL; 4514 4515 /* Memory manager related */ 4516 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4517 arch_phys_wc_del(adev->gmc.vram_mtrr); 4518 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4519 } 4520 } 4521 4522 /** 4523 * amdgpu_device_fini_hw - tear down the driver 4524 * 4525 * @adev: amdgpu_device pointer 4526 * 4527 * Tear down the driver info (all asics). 4528 * Called at driver shutdown. 4529 */ 4530 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4531 { 4532 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4533 flush_delayed_work(&adev->delayed_init_work); 4534 adev->shutdown = true; 4535 4536 /* make sure IB test finished before entering exclusive mode 4537 * to avoid preemption on IB test 4538 */ 4539 if (amdgpu_sriov_vf(adev)) { 4540 amdgpu_virt_request_full_gpu(adev, false); 4541 amdgpu_virt_fini_data_exchange(adev); 4542 } 4543 4544 /* disable all interrupts */ 4545 amdgpu_irq_disable_all(adev); 4546 if (adev->mode_info.mode_config_initialized) { 4547 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4548 drm_helper_force_disable_all(adev_to_drm(adev)); 4549 else 4550 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4551 } 4552 amdgpu_fence_driver_hw_fini(adev); 4553 4554 if (adev->mman.initialized) 4555 drain_workqueue(adev->mman.bdev.wq); 4556 4557 if (adev->pm.sysfs_initialized) 4558 amdgpu_pm_sysfs_fini(adev); 4559 if (adev->ucode_sysfs_en) 4560 amdgpu_ucode_sysfs_fini(adev); 4561 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4562 amdgpu_fru_sysfs_fini(adev); 4563 4564 amdgpu_reg_state_sysfs_fini(adev); 4565 4566 /* disable ras feature must before hw fini */ 4567 amdgpu_ras_pre_fini(adev); 4568 4569 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4570 4571 amdgpu_device_ip_fini_early(adev); 4572 4573 amdgpu_irq_fini_hw(adev); 4574 4575 if (adev->mman.initialized) 4576 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4577 4578 amdgpu_gart_dummy_page_fini(adev); 4579 4580 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4581 amdgpu_device_unmap_mmio(adev); 4582 4583 } 4584 4585 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4586 { 4587 int idx; 4588 bool px; 4589 4590 amdgpu_fence_driver_sw_fini(adev); 4591 amdgpu_device_ip_fini(adev); 4592 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4593 adev->accel_working = false; 4594 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4595 4596 amdgpu_reset_fini(adev); 4597 4598 /* free i2c buses */ 4599 if (!amdgpu_device_has_dc_support(adev)) 4600 amdgpu_i2c_fini(adev); 4601 4602 if (amdgpu_emu_mode 
!= 1) 4603 amdgpu_atombios_fini(adev); 4604 4605 kfree(adev->bios); 4606 adev->bios = NULL; 4607 4608 kfree(adev->fru_info); 4609 adev->fru_info = NULL; 4610 4611 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4612 4613 if (px || (!dev_is_removable(&adev->pdev->dev) && 4614 apple_gmux_detect(NULL, NULL))) 4615 vga_switcheroo_unregister_client(adev->pdev); 4616 4617 if (px) 4618 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4619 4620 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4621 vga_client_unregister(adev->pdev); 4622 4623 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4624 4625 iounmap(adev->rmmio); 4626 adev->rmmio = NULL; 4627 amdgpu_doorbell_fini(adev); 4628 drm_dev_exit(idx); 4629 } 4630 4631 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4632 amdgpu_pmu_fini(adev); 4633 if (adev->mman.discovery_bin) 4634 amdgpu_discovery_fini(adev); 4635 4636 amdgpu_reset_put_reset_domain(adev->reset_domain); 4637 adev->reset_domain = NULL; 4638 4639 kfree(adev->pci_state); 4640 4641 } 4642 4643 /** 4644 * amdgpu_device_evict_resources - evict device resources 4645 * @adev: amdgpu device object 4646 * 4647 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4648 * of the vram memory type. Mainly used for evicting device resources 4649 * at suspend time. 4650 * 4651 */ 4652 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4653 { 4654 int ret; 4655 4656 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4657 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4658 return 0; 4659 4660 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4661 if (ret) 4662 DRM_WARN("evicting device resources failed\n"); 4663 return ret; 4664 } 4665 4666 /* 4667 * Suspend & resume. 4668 */ 4669 /** 4670 * amdgpu_device_prepare - prepare for device suspend 4671 * 4672 * @dev: drm dev pointer 4673 * 4674 * Prepare to put the hw in the suspend state (all asics). 4675 * Returns 0 for success or an error on failure. 4676 * Called at driver suspend. 4677 */ 4678 int amdgpu_device_prepare(struct drm_device *dev) 4679 { 4680 struct amdgpu_device *adev = drm_to_adev(dev); 4681 int i, r; 4682 4683 amdgpu_choose_low_power_state(adev); 4684 4685 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4686 return 0; 4687 4688 /* Evict the majority of BOs before starting suspend sequence */ 4689 r = amdgpu_device_evict_resources(adev); 4690 if (r) 4691 goto unprepare; 4692 4693 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4694 4695 for (i = 0; i < adev->num_ip_blocks; i++) { 4696 if (!adev->ip_blocks[i].status.valid) 4697 continue; 4698 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4699 continue; 4700 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev); 4701 if (r) 4702 goto unprepare; 4703 } 4704 4705 return 0; 4706 4707 unprepare: 4708 adev->in_s0ix = adev->in_s3 = false; 4709 4710 return r; 4711 } 4712 4713 /** 4714 * amdgpu_device_suspend - initiate device suspend 4715 * 4716 * @dev: drm dev pointer 4717 * @fbcon : notify the fbdev of suspend 4718 * 4719 * Puts the hw in the suspend state (all asics). 4720 * Returns 0 for success or an error on failure. 4721 * Called at driver suspend. 
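* The ordering used below is: phase1 IP suspend, KFD suspend (skipped for
* S0ix), VRAM eviction, disabling the buffer functions and tearing down the
* fence driver hardware state, then phase2 IP suspend. SR-IOV VFs request
* and release full GPU access around the sequence.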
4722 */ 4723 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4724 { 4725 struct amdgpu_device *adev = drm_to_adev(dev); 4726 int r = 0; 4727 4728 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4729 return 0; 4730 4731 adev->in_suspend = true; 4732 4733 if (amdgpu_sriov_vf(adev)) { 4734 amdgpu_virt_fini_data_exchange(adev); 4735 r = amdgpu_virt_request_full_gpu(adev, false); 4736 if (r) 4737 return r; 4738 } 4739 4740 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4741 DRM_WARN("smart shift update failed\n"); 4742 4743 if (fbcon) 4744 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4745 4746 cancel_delayed_work_sync(&adev->delayed_init_work); 4747 4748 amdgpu_ras_suspend(adev); 4749 4750 amdgpu_device_ip_suspend_phase1(adev); 4751 4752 if (!adev->in_s0ix) 4753 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4754 4755 r = amdgpu_device_evict_resources(adev); 4756 if (r) 4757 return r; 4758 4759 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4760 4761 amdgpu_fence_driver_hw_fini(adev); 4762 4763 amdgpu_device_ip_suspend_phase2(adev); 4764 4765 if (amdgpu_sriov_vf(adev)) 4766 amdgpu_virt_release_full_gpu(adev, false); 4767 4768 r = amdgpu_dpm_notify_rlc_state(adev, false); 4769 if (r) 4770 return r; 4771 4772 return 0; 4773 } 4774 4775 /** 4776 * amdgpu_device_resume - initiate device resume 4777 * 4778 * @dev: drm dev pointer 4779 * @fbcon : notify the fbdev of resume 4780 * 4781 * Bring the hw back to operating state (all asics). 4782 * Returns 0 for success or an error on failure. 4783 * Called at driver resume. 4784 */ 4785 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4786 { 4787 struct amdgpu_device *adev = drm_to_adev(dev); 4788 int r = 0; 4789 4790 if (amdgpu_sriov_vf(adev)) { 4791 r = amdgpu_virt_request_full_gpu(adev, true); 4792 if (r) 4793 return r; 4794 } 4795 4796 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4797 return 0; 4798 4799 if (adev->in_s0ix) 4800 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4801 4802 /* post card */ 4803 if (amdgpu_device_need_post(adev)) { 4804 r = amdgpu_device_asic_init(adev); 4805 if (r) 4806 dev_err(adev->dev, "amdgpu asic init failed\n"); 4807 } 4808 4809 r = amdgpu_device_ip_resume(adev); 4810 4811 if (r) { 4812 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4813 goto exit; 4814 } 4815 amdgpu_fence_driver_hw_init(adev); 4816 4817 if (!adev->in_s0ix) { 4818 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4819 if (r) 4820 goto exit; 4821 } 4822 4823 r = amdgpu_device_ip_late_init(adev); 4824 if (r) 4825 goto exit; 4826 4827 queue_delayed_work(system_wq, &adev->delayed_init_work, 4828 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4829 exit: 4830 if (amdgpu_sriov_vf(adev)) { 4831 amdgpu_virt_init_data_exchange(adev); 4832 amdgpu_virt_release_full_gpu(adev, true); 4833 } 4834 4835 if (r) 4836 return r; 4837 4838 /* Make sure IB tests flushed */ 4839 flush_delayed_work(&adev->delayed_init_work); 4840 4841 if (fbcon) 4842 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4843 4844 amdgpu_ras_resume(adev); 4845 4846 if (adev->mode_info.num_crtc) { 4847 /* 4848 * Most of the connector probing functions try to acquire runtime pm 4849 * refs to ensure that the GPU is powered on when connector polling is 4850 * performed. Since we're calling this from a runtime PM callback, 4851 * trying to acquire rpm refs will cause us to deadlock. 
4852 * 4853 * Since we're guaranteed to be holding the rpm lock, it's safe to 4854 * temporarily disable the rpm helpers so this doesn't deadlock us. 4855 */ 4856 #ifdef CONFIG_PM 4857 dev->dev->power.disable_depth++; 4858 #endif 4859 if (!adev->dc_enabled) 4860 drm_helper_hpd_irq_event(dev); 4861 else 4862 drm_kms_helper_hotplug_event(dev); 4863 #ifdef CONFIG_PM 4864 dev->dev->power.disable_depth--; 4865 #endif 4866 } 4867 adev->in_suspend = false; 4868 4869 if (adev->enable_mes) 4870 amdgpu_mes_self_test(adev); 4871 4872 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4873 DRM_WARN("smart shift update failed\n"); 4874 4875 return 0; 4876 } 4877 4878 /** 4879 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4880 * 4881 * @adev: amdgpu_device pointer 4882 * 4883 * The list of all the hardware IPs that make up the asic is walked and 4884 * the check_soft_reset callbacks are run. check_soft_reset determines 4885 * if the asic is still hung or not. 4886 * Returns true if any of the IPs are still in a hung state, false if not. 4887 */ 4888 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4889 { 4890 int i; 4891 bool asic_hang = false; 4892 4893 if (amdgpu_sriov_vf(adev)) 4894 return true; 4895 4896 if (amdgpu_asic_need_full_reset(adev)) 4897 return true; 4898 4899 for (i = 0; i < adev->num_ip_blocks; i++) { 4900 if (!adev->ip_blocks[i].status.valid) 4901 continue; 4902 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4903 adev->ip_blocks[i].status.hang = 4904 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4905 if (adev->ip_blocks[i].status.hang) { 4906 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4907 asic_hang = true; 4908 } 4909 } 4910 return asic_hang; 4911 } 4912 4913 /** 4914 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4915 * 4916 * @adev: amdgpu_device pointer 4917 * 4918 * The list of all the hardware IPs that make up the asic is walked and the 4919 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4920 * handles any IP specific hardware or software state changes that are 4921 * necessary for a soft reset to succeed. 4922 * Returns 0 on success, negative error code on failure. 4923 */ 4924 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4925 { 4926 int i, r = 0; 4927 4928 for (i = 0; i < adev->num_ip_blocks; i++) { 4929 if (!adev->ip_blocks[i].status.valid) 4930 continue; 4931 if (adev->ip_blocks[i].status.hang && 4932 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4933 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4934 if (r) 4935 return r; 4936 } 4937 } 4938 4939 return 0; 4940 } 4941 4942 /** 4943 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4944 * 4945 * @adev: amdgpu_device pointer 4946 * 4947 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4948 * reset is necessary to recover. 4949 * Returns true if a full asic reset is required, false if not. 
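* A hung GMC, SMC, ACP, DCE or PSP block is treated as requiring a full
* reset; hangs in other blocks can be handled by the soft reset path.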
4950 */ 4951 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4952 { 4953 int i; 4954 4955 if (amdgpu_asic_need_full_reset(adev)) 4956 return true; 4957 4958 for (i = 0; i < adev->num_ip_blocks; i++) { 4959 if (!adev->ip_blocks[i].status.valid) 4960 continue; 4961 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4962 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4963 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4964 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4965 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4966 if (adev->ip_blocks[i].status.hang) { 4967 dev_info(adev->dev, "Some block need full reset!\n"); 4968 return true; 4969 } 4970 } 4971 } 4972 return false; 4973 } 4974 4975 /** 4976 * amdgpu_device_ip_soft_reset - do a soft reset 4977 * 4978 * @adev: amdgpu_device pointer 4979 * 4980 * The list of all the hardware IPs that make up the asic is walked and the 4981 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4982 * IP specific hardware or software state changes that are necessary to soft 4983 * reset the IP. 4984 * Returns 0 on success, negative error code on failure. 4985 */ 4986 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4987 { 4988 int i, r = 0; 4989 4990 for (i = 0; i < adev->num_ip_blocks; i++) { 4991 if (!adev->ip_blocks[i].status.valid) 4992 continue; 4993 if (adev->ip_blocks[i].status.hang && 4994 adev->ip_blocks[i].version->funcs->soft_reset) { 4995 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4996 if (r) 4997 return r; 4998 } 4999 } 5000 5001 return 0; 5002 } 5003 5004 /** 5005 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5006 * 5007 * @adev: amdgpu_device pointer 5008 * 5009 * The list of all the hardware IPs that make up the asic is walked and the 5010 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5011 * handles any IP specific hardware or software state changes that are 5012 * necessary after the IP has been soft reset. 5013 * Returns 0 on success, negative error code on failure. 5014 */ 5015 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5016 { 5017 int i, r = 0; 5018 5019 for (i = 0; i < adev->num_ip_blocks; i++) { 5020 if (!adev->ip_blocks[i].status.valid) 5021 continue; 5022 if (adev->ip_blocks[i].status.hang && 5023 adev->ip_blocks[i].version->funcs->post_soft_reset) 5024 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 5025 if (r) 5026 return r; 5027 } 5028 5029 return 0; 5030 } 5031 5032 /** 5033 * amdgpu_device_recover_vram - Recover some VRAM contents 5034 * 5035 * @adev: amdgpu_device pointer 5036 * 5037 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 5038 * restore things like GPUVM page tables after a GPU reset where 5039 * the contents of VRAM might be lost. 5040 * 5041 * Returns: 5042 * 0 on success, negative error code on failure. 
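* Only buffers that still have a shadow resident in GTT can be restored;
* compute-context VMs and APUs have no shadow and are skipped, as are
* shadows whose backing store was evicted.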
5043 */ 5044 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 5045 { 5046 struct dma_fence *fence = NULL, *next = NULL; 5047 struct amdgpu_bo *shadow; 5048 struct amdgpu_bo_vm *vmbo; 5049 long r = 1, tmo; 5050 5051 if (amdgpu_sriov_runtime(adev)) 5052 tmo = msecs_to_jiffies(8000); 5053 else 5054 tmo = msecs_to_jiffies(100); 5055 5056 dev_info(adev->dev, "recover vram bo from shadow start\n"); 5057 mutex_lock(&adev->shadow_list_lock); 5058 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 5059 /* If vm is compute context or adev is APU, shadow will be NULL */ 5060 if (!vmbo->shadow) 5061 continue; 5062 shadow = vmbo->shadow; 5063 5064 /* No need to recover an evicted BO */ 5065 if (!shadow->tbo.resource || 5066 shadow->tbo.resource->mem_type != TTM_PL_TT || 5067 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 5068 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 5069 continue; 5070 5071 r = amdgpu_bo_restore_shadow(shadow, &next); 5072 if (r) 5073 break; 5074 5075 if (fence) { 5076 tmo = dma_fence_wait_timeout(fence, false, tmo); 5077 dma_fence_put(fence); 5078 fence = next; 5079 if (tmo == 0) { 5080 r = -ETIMEDOUT; 5081 break; 5082 } else if (tmo < 0) { 5083 r = tmo; 5084 break; 5085 } 5086 } else { 5087 fence = next; 5088 } 5089 } 5090 mutex_unlock(&adev->shadow_list_lock); 5091 5092 if (fence) 5093 tmo = dma_fence_wait_timeout(fence, false, tmo); 5094 dma_fence_put(fence); 5095 5096 if (r < 0 || tmo <= 0) { 5097 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 5098 return -EIO; 5099 } 5100 5101 dev_info(adev->dev, "recover vram bo from shadow done\n"); 5102 return 0; 5103 } 5104 5105 5106 /** 5107 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5108 * 5109 * @adev: amdgpu_device pointer 5110 * @reset_context: amdgpu reset context pointer 5111 * 5112 * do VF FLR and reinitialize Asic 5113 * return 0 means succeeded otherwise failed 5114 */ 5115 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5116 struct amdgpu_reset_context *reset_context) 5117 { 5118 int r; 5119 struct amdgpu_hive_info *hive = NULL; 5120 5121 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5122 if (!amdgpu_ras_get_fed_status(adev)) 5123 amdgpu_virt_ready_to_reset(adev); 5124 amdgpu_virt_wait_reset(adev); 5125 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5126 r = amdgpu_virt_request_full_gpu(adev, true); 5127 } else { 5128 r = amdgpu_virt_reset_gpu(adev); 5129 } 5130 if (r) 5131 return r; 5132 5133 amdgpu_ras_set_fed(adev, false); 5134 amdgpu_irq_gpu_reset_resume_helper(adev); 5135 5136 /* some sw clean up VF needs to do before recover */ 5137 amdgpu_virt_post_reset(adev); 5138 5139 /* Resume IP prior to SMC */ 5140 r = amdgpu_device_ip_reinit_early_sriov(adev); 5141 if (r) 5142 return r; 5143 5144 amdgpu_virt_init_data_exchange(adev); 5145 5146 r = amdgpu_device_fw_loading(adev); 5147 if (r) 5148 return r; 5149 5150 /* now we are okay to resume SMC/CP/SDMA */ 5151 r = amdgpu_device_ip_reinit_late_sriov(adev); 5152 if (r) 5153 return r; 5154 5155 hive = amdgpu_get_xgmi_hive(adev); 5156 /* Update PSP FW topology after reset */ 5157 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5158 r = amdgpu_xgmi_update_topology(hive, adev); 5159 if (hive) 5160 amdgpu_put_xgmi_hive(hive); 5161 if (r) 5162 return r; 5163 5164 r = amdgpu_ib_ring_tests(adev); 5165 if (r) 5166 return r; 5167 5168 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 5169 amdgpu_inc_vram_lost(adev); 5170 r = 
amdgpu_device_recover_vram(adev); 5171 } 5172 if (r) 5173 return r; 5174 5175 /* need to be called during full access so we can't do it later like 5176 * bare-metal does. 5177 */ 5178 amdgpu_amdkfd_post_reset(adev); 5179 amdgpu_virt_release_full_gpu(adev, true); 5180 5181 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5182 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5183 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5184 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5185 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5186 amdgpu_ras_resume(adev); 5187 return 0; 5188 } 5189 5190 /** 5191 * amdgpu_device_has_job_running - check if there is any job in mirror list 5192 * 5193 * @adev: amdgpu_device pointer 5194 * 5195 * check if there is any job in mirror list 5196 */ 5197 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5198 { 5199 int i; 5200 struct drm_sched_job *job; 5201 5202 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5203 struct amdgpu_ring *ring = adev->rings[i]; 5204 5205 if (!amdgpu_ring_sched_ready(ring)) 5206 continue; 5207 5208 spin_lock(&ring->sched.job_list_lock); 5209 job = list_first_entry_or_null(&ring->sched.pending_list, 5210 struct drm_sched_job, list); 5211 spin_unlock(&ring->sched.job_list_lock); 5212 if (job) 5213 return true; 5214 } 5215 return false; 5216 } 5217 5218 /** 5219 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5220 * 5221 * @adev: amdgpu_device pointer 5222 * 5223 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5224 * a hung GPU. 5225 */ 5226 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5227 { 5228 5229 if (amdgpu_gpu_recovery == 0) 5230 goto disabled; 5231 5232 /* Skip soft reset check in fatal error mode */ 5233 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5234 return true; 5235 5236 if (amdgpu_sriov_vf(adev)) 5237 return true; 5238 5239 if (amdgpu_gpu_recovery == -1) { 5240 switch (adev->asic_type) { 5241 #ifdef CONFIG_DRM_AMDGPU_SI 5242 case CHIP_VERDE: 5243 case CHIP_TAHITI: 5244 case CHIP_PITCAIRN: 5245 case CHIP_OLAND: 5246 case CHIP_HAINAN: 5247 #endif 5248 #ifdef CONFIG_DRM_AMDGPU_CIK 5249 case CHIP_KAVERI: 5250 case CHIP_KABINI: 5251 case CHIP_MULLINS: 5252 #endif 5253 case CHIP_CARRIZO: 5254 case CHIP_STONEY: 5255 case CHIP_CYAN_SKILLFISH: 5256 goto disabled; 5257 default: 5258 break; 5259 } 5260 } 5261 5262 return true; 5263 5264 disabled: 5265 dev_info(adev->dev, "GPU recovery disabled.\n"); 5266 return false; 5267 } 5268 5269 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5270 { 5271 u32 i; 5272 int ret = 0; 5273 5274 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5275 5276 dev_info(adev->dev, "GPU mode1 reset\n"); 5277 5278 /* Cache the state before bus master disable. The saved config space 5279 * values are used in other cases like restore after mode-2 reset. 
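* The sequence below is: cache the PCI config space, clear bus mastering,
* issue an SMU (preferred) or PSP mode1 reset, restore the cached config
* space, wait for the PSP bootloader, and poll the NBIO memsize register
* until it reads something other than 0xffffffff to confirm the ASIC is back.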
5280 */ 5281 amdgpu_device_cache_pci_state(adev->pdev); 5282 5283 /* disable BM */ 5284 pci_clear_master(adev->pdev); 5285 5286 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5287 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5288 ret = amdgpu_dpm_mode1_reset(adev); 5289 } else { 5290 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5291 ret = psp_gpu_reset(adev); 5292 } 5293 5294 if (ret) 5295 goto mode1_reset_failed; 5296 5297 amdgpu_device_load_pci_state(adev->pdev); 5298 ret = amdgpu_psp_wait_for_bootloader(adev); 5299 if (ret) 5300 goto mode1_reset_failed; 5301 5302 /* wait for asic to come out of reset */ 5303 for (i = 0; i < adev->usec_timeout; i++) { 5304 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5305 5306 if (memsize != 0xffffffff) 5307 break; 5308 udelay(1); 5309 } 5310 5311 if (i >= adev->usec_timeout) { 5312 ret = -ETIMEDOUT; 5313 goto mode1_reset_failed; 5314 } 5315 5316 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5317 5318 return 0; 5319 5320 mode1_reset_failed: 5321 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5322 return ret; 5323 } 5324 5325 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5326 struct amdgpu_reset_context *reset_context) 5327 { 5328 int i, r = 0; 5329 struct amdgpu_job *job = NULL; 5330 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5331 bool need_full_reset = 5332 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5333 5334 if (reset_context->reset_req_dev == adev) 5335 job = reset_context->job; 5336 5337 if (amdgpu_sriov_vf(adev)) 5338 amdgpu_virt_pre_reset(adev); 5339 5340 amdgpu_fence_driver_isr_toggle(adev, true); 5341 5342 /* block all schedulers and reset given job's ring */ 5343 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5344 struct amdgpu_ring *ring = adev->rings[i]; 5345 5346 if (!amdgpu_ring_sched_ready(ring)) 5347 continue; 5348 5349 /* Clear job fence from fence drv to avoid force_completion 5350 * leave NULL and vm flush fence in fence drv 5351 */ 5352 amdgpu_fence_driver_clear_job_fences(ring); 5353 5354 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5355 amdgpu_fence_driver_force_completion(ring); 5356 } 5357 5358 amdgpu_fence_driver_isr_toggle(adev, false); 5359 5360 if (job && job->vm) 5361 drm_sched_increase_karma(&job->base); 5362 5363 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5364 /* If reset handler not implemented, continue; otherwise return */ 5365 if (r == -EOPNOTSUPP) 5366 r = 0; 5367 else 5368 return r; 5369 5370 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5371 if (!amdgpu_sriov_vf(adev)) { 5372 5373 if (!need_full_reset) 5374 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5375 5376 if (!need_full_reset && amdgpu_gpu_recovery && 5377 amdgpu_device_ip_check_soft_reset(adev)) { 5378 amdgpu_device_ip_pre_soft_reset(adev); 5379 r = amdgpu_device_ip_soft_reset(adev); 5380 amdgpu_device_ip_post_soft_reset(adev); 5381 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5382 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5383 need_full_reset = true; 5384 } 5385 } 5386 5387 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5388 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5389 /* Trigger ip dump before we reset the asic */ 5390 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5391 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5392 tmp_adev->ip_blocks[i].version->funcs 5393 ->dump_ip_state((void *)tmp_adev); 5394 dev_info(tmp_adev->dev, "Dumping IP 
State Completed\n"); 5395 } 5396 5397 if (need_full_reset) 5398 r = amdgpu_device_ip_suspend(adev); 5399 if (need_full_reset) 5400 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5401 else 5402 clear_bit(AMDGPU_NEED_FULL_RESET, 5403 &reset_context->flags); 5404 } 5405 5406 return r; 5407 } 5408 5409 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5410 struct amdgpu_reset_context *reset_context) 5411 { 5412 struct amdgpu_device *tmp_adev = NULL; 5413 bool need_full_reset, skip_hw_reset, vram_lost = false; 5414 int r = 0; 5415 5416 /* Try reset handler method first */ 5417 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5418 reset_list); 5419 5420 reset_context->reset_device_list = device_list_handle; 5421 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5422 /* If reset handler not implemented, continue; otherwise return */ 5423 if (r == -EOPNOTSUPP) 5424 r = 0; 5425 else 5426 return r; 5427 5428 /* Reset handler not implemented, use the default method */ 5429 need_full_reset = 5430 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5431 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5432 5433 /* 5434 * ASIC reset has to be done on all XGMI hive nodes ASAP 5435 * to allow proper links negotiation in FW (within 1 sec) 5436 */ 5437 if (!skip_hw_reset && need_full_reset) { 5438 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5439 /* For XGMI run all resets in parallel to speed up the process */ 5440 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5441 tmp_adev->gmc.xgmi.pending_reset = false; 5442 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5443 r = -EALREADY; 5444 } else 5445 r = amdgpu_asic_reset(tmp_adev); 5446 5447 if (r) { 5448 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5449 r, adev_to_drm(tmp_adev)->unique); 5450 goto out; 5451 } 5452 } 5453 5454 /* For XGMI wait for all resets to complete before proceed */ 5455 if (!r) { 5456 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5457 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5458 flush_work(&tmp_adev->xgmi_reset_work); 5459 r = tmp_adev->asic_reset_res; 5460 if (r) 5461 break; 5462 } 5463 } 5464 } 5465 } 5466 5467 if (!r && amdgpu_ras_intr_triggered()) { 5468 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5469 amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB); 5470 } 5471 5472 amdgpu_ras_intr_cleared(); 5473 } 5474 5475 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5476 if (need_full_reset) { 5477 /* post card */ 5478 amdgpu_ras_set_fed(tmp_adev, false); 5479 r = amdgpu_device_asic_init(tmp_adev); 5480 if (r) { 5481 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5482 } else { 5483 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5484 5485 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5486 if (r) 5487 goto out; 5488 5489 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5490 5491 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5492 amdgpu_coredump(tmp_adev, vram_lost, reset_context); 5493 5494 if (vram_lost) { 5495 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5496 amdgpu_inc_vram_lost(tmp_adev); 5497 } 5498 5499 r = amdgpu_device_fw_loading(tmp_adev); 5500 if (r) 5501 return r; 5502 5503 r = amdgpu_xcp_restore_partition_mode( 5504 tmp_adev->xcp_mgr); 5505 if (r) 5506 goto out; 5507 5508 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5509 if (r) 5510 goto out; 5511 5512 if 
(tmp_adev->mman.buffer_funcs_ring->sched.ready) 5513 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5514 5515 if (vram_lost) 5516 amdgpu_device_fill_reset_magic(tmp_adev); 5517 5518 /* 5519 * Add this ASIC as tracked as reset was already 5520 * complete successfully. 5521 */ 5522 amdgpu_register_gpu_instance(tmp_adev); 5523 5524 if (!reset_context->hive && 5525 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5526 amdgpu_xgmi_add_device(tmp_adev); 5527 5528 r = amdgpu_device_ip_late_init(tmp_adev); 5529 if (r) 5530 goto out; 5531 5532 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5533 5534 /* 5535 * The GPU enters bad state once faulty pages 5536 * by ECC has reached the threshold, and ras 5537 * recovery is scheduled next. So add one check 5538 * here to break recovery if it indeed exceeds 5539 * bad page threshold, and remind user to 5540 * retire this GPU or setting one bigger 5541 * bad_page_threshold value to fix this once 5542 * probing driver again. 5543 */ 5544 if (!amdgpu_ras_is_rma(tmp_adev)) { 5545 /* must succeed. */ 5546 amdgpu_ras_resume(tmp_adev); 5547 } else { 5548 r = -EINVAL; 5549 goto out; 5550 } 5551 5552 /* Update PSP FW topology after reset */ 5553 if (reset_context->hive && 5554 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5555 r = amdgpu_xgmi_update_topology( 5556 reset_context->hive, tmp_adev); 5557 } 5558 } 5559 5560 out: 5561 if (!r) { 5562 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5563 r = amdgpu_ib_ring_tests(tmp_adev); 5564 if (r) { 5565 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5566 need_full_reset = true; 5567 r = -EAGAIN; 5568 goto end; 5569 } 5570 } 5571 5572 if (!r) 5573 r = amdgpu_device_recover_vram(tmp_adev); 5574 else 5575 tmp_adev->asic_reset_res = r; 5576 } 5577 5578 end: 5579 if (need_full_reset) 5580 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5581 else 5582 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5583 return r; 5584 } 5585 5586 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5587 { 5588 5589 switch (amdgpu_asic_reset_method(adev)) { 5590 case AMD_RESET_METHOD_MODE1: 5591 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5592 break; 5593 case AMD_RESET_METHOD_MODE2: 5594 adev->mp1_state = PP_MP1_STATE_RESET; 5595 break; 5596 default: 5597 adev->mp1_state = PP_MP1_STATE_NONE; 5598 break; 5599 } 5600 } 5601 5602 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5603 { 5604 amdgpu_vf_error_trans_all(adev); 5605 adev->mp1_state = PP_MP1_STATE_NONE; 5606 } 5607 5608 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5609 { 5610 struct pci_dev *p = NULL; 5611 5612 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5613 adev->pdev->bus->number, 1); 5614 if (p) { 5615 pm_runtime_enable(&(p->dev)); 5616 pm_runtime_resume(&(p->dev)); 5617 } 5618 5619 pci_dev_put(p); 5620 } 5621 5622 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5623 { 5624 enum amd_reset_method reset_method; 5625 struct pci_dev *p = NULL; 5626 u64 expires; 5627 5628 /* 5629 * For now, only BACO and mode1 reset are confirmed 5630 * to suffer the audio issue without proper suspended. 
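* The helper therefore polls pm_runtime_suspend() on the audio function
* (function 1 of the same device) until it reports suspended, bounded by the
* device's autosuspend expiration or a ~4 second fallback, and then disables
* runtime PM so the codec cannot resume in the middle of the reset.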
5631 */ 5632 reset_method = amdgpu_asic_reset_method(adev); 5633 if ((reset_method != AMD_RESET_METHOD_BACO) && 5634 (reset_method != AMD_RESET_METHOD_MODE1)) 5635 return -EINVAL; 5636 5637 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5638 adev->pdev->bus->number, 1); 5639 if (!p) 5640 return -ENODEV; 5641 5642 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5643 if (!expires) 5644 /* 5645 * If we cannot get the audio device autosuspend delay, 5646 * a fixed 4S interval will be used. Considering 3S is 5647 * the audio controller default autosuspend delay setting. 5648 * 4S used here is guaranteed to cover that. 5649 */ 5650 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5651 5652 while (!pm_runtime_status_suspended(&(p->dev))) { 5653 if (!pm_runtime_suspend(&(p->dev))) 5654 break; 5655 5656 if (expires < ktime_get_mono_fast_ns()) { 5657 dev_warn(adev->dev, "failed to suspend display audio\n"); 5658 pci_dev_put(p); 5659 /* TODO: abort the succeeding gpu reset? */ 5660 return -ETIMEDOUT; 5661 } 5662 } 5663 5664 pm_runtime_disable(&(p->dev)); 5665 5666 pci_dev_put(p); 5667 return 0; 5668 } 5669 5670 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5671 { 5672 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5673 5674 #if defined(CONFIG_DEBUG_FS) 5675 if (!amdgpu_sriov_vf(adev)) 5676 cancel_work(&adev->reset_work); 5677 #endif 5678 5679 if (adev->kfd.dev) 5680 cancel_work(&adev->kfd.reset_work); 5681 5682 if (amdgpu_sriov_vf(adev)) 5683 cancel_work(&adev->virt.flr_work); 5684 5685 if (con && adev->ras_enabled) 5686 cancel_work(&con->recovery_work); 5687 5688 } 5689 5690 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5691 { 5692 struct amdgpu_device *tmp_adev; 5693 int ret = 0; 5694 u32 status; 5695 5696 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5697 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 5698 if (PCI_POSSIBLE_ERROR(status)) { 5699 dev_err(tmp_adev->dev, "device lost from bus!"); 5700 ret = -ENODEV; 5701 } 5702 } 5703 5704 return ret; 5705 } 5706 5707 /** 5708 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5709 * 5710 * @adev: amdgpu_device pointer 5711 * @job: which job trigger hang 5712 * @reset_context: amdgpu reset context pointer 5713 * 5714 * Attempt to reset the GPU if it has hung (all asics). 5715 * Attempt to do soft-reset or full-reset and reinitialize Asic 5716 * Returns 0 for success or an error on failure. 5717 */ 5718 5719 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5720 struct amdgpu_job *job, 5721 struct amdgpu_reset_context *reset_context) 5722 { 5723 struct list_head device_list, *device_list_handle = NULL; 5724 bool job_signaled = false; 5725 struct amdgpu_hive_info *hive = NULL; 5726 struct amdgpu_device *tmp_adev = NULL; 5727 int i, r = 0; 5728 bool need_emergency_restart = false; 5729 bool audio_suspended = false; 5730 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 5731 5732 /* 5733 * Special case: RAS triggered and full reset isn't supported 5734 */ 5735 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5736 5737 /* 5738 * Flush RAM to disk so that after reboot 5739 * the user can read log and see why the system rebooted. 
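* ksys_sync_helper() flushes dirty data before emergency_restart(); this
* path is only taken when RAS requires an emergency restart and the RAS
* context has requested a reboot.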
static int amdgpu_device_health_check(struct list_head *device_list_handle)
{
	struct amdgpu_device *tmp_adev;
	int ret = 0;
	u32 status;

	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
		if (PCI_POSSIBLE_ERROR(status)) {
			dev_err(tmp_adev->dev, "device lost from bus!");
			ret = -ENODEV;
		}
	}

	return ret;
}

/**
 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
 * @adev: amdgpu_device pointer
 * @job: which job triggered the hang
 * @reset_context: amdgpu reset context pointer
 *
 * Attempt to reset the GPU if it has hung (all asics).
 * Attempt to do soft-reset or full-reset and reinitialize the ASIC.
 * Returns 0 for success or an error on failure.
 */

int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			      struct amdgpu_job *job,
			      struct amdgpu_reset_context *reset_context)
{
	struct list_head device_list, *device_list_handle = NULL;
	bool job_signaled = false;
	struct amdgpu_hive_info *hive = NULL;
	struct amdgpu_device *tmp_adev = NULL;
	int i, r = 0;
	bool need_emergency_restart = false;
	bool audio_suspended = false;
	int retry_limit = AMDGPU_MAX_RETRY_LIMIT;

	/*
	 * Special case: RAS triggered and full reset isn't supported
	 */
	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);

	/*
	 * Flush RAM to disk so that after reboot
	 * the user can read the log and see why the system rebooted.
	 */
	if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
	    amdgpu_ras_get_context(adev)->reboot) {
		DRM_WARN("Emergency reboot.");

		ksys_sync_helper();
		emergency_restart();
	}

	dev_info(adev->dev, "GPU %s begin!\n",
		need_emergency_restart ? "jobs stop":"reset");

	if (!amdgpu_sriov_vf(adev))
		hive = amdgpu_get_xgmi_hive(adev);
	if (hive)
		mutex_lock(&hive->hive_lock);

	reset_context->job = job;
	reset_context->hive = hive;
	/*
	 * Build list of devices to reset.
	 * In case we are in XGMI hive mode, resort the device list
	 * to put adev in the 1st position.
	 */
	INIT_LIST_HEAD(&device_list);
	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			list_add_tail(&tmp_adev->reset_list, &device_list);
			if (adev->shutdown)
				tmp_adev->shutdown = true;
		}
		if (!list_is_first(&adev->reset_list, &device_list))
			list_rotate_to_front(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	} else {
		list_add_tail(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	}

	if (!amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_health_check(device_list_handle);
		if (r)
			goto end_reset;
	}

	/* We need to lock reset domain only once both for XGMI and single device */
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

		amdgpu_device_set_mp1_state(tmp_adev);

		/*
		 * Try to put the audio codec into suspend state
		 * before gpu reset started.
		 *
		 * Because the power domain of the graphics device is
		 * shared with the AZ (audio) power domain, skipping this
		 * could change the audio hardware from behind the audio
		 * driver's back and trigger audio codec errors.
		 */
		if (!amdgpu_device_suspend_display_audio(tmp_adev))
			audio_suspended = true;

		amdgpu_ras_set_error_query_ready(tmp_adev, false);

		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

		amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);

		/*
		 * Mark these ASICs to be reset as untracked first,
		 * and add them back after the reset completes.
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);

		/* disable ras on ALL IPs */
		if (!need_emergency_restart &&
		    amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!amdgpu_ring_sched_ready(ring))
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			if (need_emergency_restart)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
		atomic_inc(&tmp_adev->gpu_reset_counter);
	}

	if (need_emergency_restart)
		goto skip_sched_resume;

	/*
	 * Must check guilty signal here since after this point all old
	 * HW fences are force signaled.
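	 * If the guilty job's hardware fence has already signaled on its
	 * own, the hang has resolved itself and the HW reset can be skipped.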
5847 * 5848 * job->base holds a reference to parent fence 5849 */ 5850 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5851 job_signaled = true; 5852 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5853 goto skip_hw_reset; 5854 } 5855 5856 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5857 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5858 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5859 /*TODO Should we stop ?*/ 5860 if (r) { 5861 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5862 r, adev_to_drm(tmp_adev)->unique); 5863 tmp_adev->asic_reset_res = r; 5864 } 5865 } 5866 5867 /* Actual ASIC resets if needed.*/ 5868 /* Host driver will handle XGMI hive reset for SRIOV */ 5869 if (amdgpu_sriov_vf(adev)) { 5870 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 5871 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 5872 amdgpu_ras_set_fed(adev, true); 5873 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5874 } 5875 5876 r = amdgpu_device_reset_sriov(adev, reset_context); 5877 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 5878 amdgpu_virt_release_full_gpu(adev, true); 5879 goto retry; 5880 } 5881 if (r) 5882 adev->asic_reset_res = r; 5883 } else { 5884 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5885 if (r && r == -EAGAIN) 5886 goto retry; 5887 } 5888 5889 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5890 /* 5891 * Drop any pending non scheduler resets queued before reset is done. 5892 * Any reset scheduled after this point would be valid. Scheduler resets 5893 * were already dropped during drm_sched_stop and no new ones can come 5894 * in before drm_sched_start. 5895 */ 5896 amdgpu_device_stop_pending_resets(tmp_adev); 5897 } 5898 5899 skip_hw_reset: 5900 5901 /* Post ASIC reset for all devs .*/ 5902 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5903 5904 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5905 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5906 5907 if (!amdgpu_ring_sched_ready(ring)) 5908 continue; 5909 5910 drm_sched_start(&ring->sched); 5911 } 5912 5913 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5914 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5915 5916 if (tmp_adev->asic_reset_res) 5917 r = tmp_adev->asic_reset_res; 5918 5919 tmp_adev->asic_reset_res = 0; 5920 5921 if (r) { 5922 /* bad news, how to tell it to userspace ? 
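			 * (today the failure is only surfaced via the dev_info() and
			 * amdgpu_vf_error_put() calls below, plus the reset_res value
			 * stored at the end of recovery).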
			 * for ras error, we should report GPU bad status instead of
			 * reset failure
			 */
			if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
			    !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
				dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
					 atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
				DRM_WARN("smart shift update failed\n");
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/* kfd_post_reset will do nothing if the kfd device is not
		 * initialized, so bring up kfd here if it wasn't initialized
		 * before.
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);

		amdgpu_device_unset_mp1_state(tmp_adev);

		amdgpu_ras_set_error_query_ready(tmp_adev, true);
	}

	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

end_reset:
	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);

	atomic_set(&adev->reset_domain->reset_res, r);
	return r;
}

/**
 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * first physical partner to an AMD dGPU.
 * This will exclude any virtual switches and links.
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
		while ((parent = pci_upstream_bridge(parent))) {
			/* skip upstream/downstream switches internal to dGPU*/
			if (parent->vendor == PCI_VENDOR_ID_ATI)
				continue;
			*speed = pcie_get_speed_cap(parent);
			*width = pcie_get_width_cap(parent);
			break;
		}
	} else {
		/* use the current speeds rather than max if switching is not supported */
		pcie_bandwidth_available(adev->pdev, NULL, speed, width);
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIe capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIe config space may not be available.
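 *
 * The results are cached in adev->pm.pcie_gen_mask and adev->pm.pcie_mlw_mask,
 * and can be overridden via amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap
 * (checked at the top of the function).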
6021 */ 6022 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6023 { 6024 struct pci_dev *pdev; 6025 enum pci_bus_speed speed_cap, platform_speed_cap; 6026 enum pcie_link_width platform_link_width; 6027 6028 if (amdgpu_pcie_gen_cap) 6029 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6030 6031 if (amdgpu_pcie_lane_cap) 6032 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6033 6034 /* covers APUs as well */ 6035 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6036 if (adev->pm.pcie_gen_mask == 0) 6037 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6038 if (adev->pm.pcie_mlw_mask == 0) 6039 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6040 return; 6041 } 6042 6043 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6044 return; 6045 6046 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6047 &platform_link_width); 6048 6049 if (adev->pm.pcie_gen_mask == 0) { 6050 /* asic caps */ 6051 pdev = adev->pdev; 6052 speed_cap = pcie_get_speed_cap(pdev); 6053 if (speed_cap == PCI_SPEED_UNKNOWN) { 6054 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6055 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6056 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6057 } else { 6058 if (speed_cap == PCIE_SPEED_32_0GT) 6059 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6060 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6061 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6062 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6063 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6064 else if (speed_cap == PCIE_SPEED_16_0GT) 6065 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6066 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6067 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6068 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6069 else if (speed_cap == PCIE_SPEED_8_0GT) 6070 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6071 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6072 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6073 else if (speed_cap == PCIE_SPEED_5_0GT) 6074 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6075 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6076 else 6077 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6078 } 6079 /* platform caps */ 6080 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6081 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6082 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6083 } else { 6084 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6085 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6086 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6087 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6088 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6089 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6090 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6091 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6092 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6093 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6094 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6095 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6096 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6097 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6098 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6099 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6100 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6101 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6102 else 6103 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6104 6105 } 6106 } 6107 if (adev->pm.pcie_mlw_mask == 0) { 6108 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 
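			/* no platform link width information; fall back to the default width mask */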
6109 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6110 } else { 6111 switch (platform_link_width) { 6112 case PCIE_LNK_X32: 6113 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6114 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6115 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6116 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6117 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6118 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6119 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6120 break; 6121 case PCIE_LNK_X16: 6122 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6123 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6124 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6125 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6126 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6127 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6128 break; 6129 case PCIE_LNK_X12: 6130 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6131 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6132 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6133 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6134 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6135 break; 6136 case PCIE_LNK_X8: 6137 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6138 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6139 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6140 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6141 break; 6142 case PCIE_LNK_X4: 6143 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6144 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6145 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6146 break; 6147 case PCIE_LNK_X2: 6148 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6149 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6150 break; 6151 case PCIE_LNK_X1: 6152 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6153 break; 6154 default: 6155 break; 6156 } 6157 } 6158 } 6159 } 6160 6161 /** 6162 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6163 * 6164 * @adev: amdgpu_device pointer 6165 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6166 * 6167 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6168 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6169 * @peer_adev. 6170 */ 6171 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6172 struct amdgpu_device *peer_adev) 6173 { 6174 #ifdef CONFIG_HSA_AMD_P2P 6175 bool p2p_access = 6176 !adev->gmc.xgmi.connected_to_cpu && 6177 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6178 6179 bool is_large_bar = adev->gmc.visible_vram_size && 6180 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6181 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6182 6183 if (!p2p_addressable) { 6184 uint64_t address_mask = peer_adev->dev->dma_mask ? 
6185 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6186 resource_size_t aper_limit = 6187 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6188 6189 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6190 aper_limit & address_mask); 6191 } 6192 return is_large_bar && p2p_access && p2p_addressable; 6193 #else 6194 return false; 6195 #endif 6196 } 6197 6198 int amdgpu_device_baco_enter(struct drm_device *dev) 6199 { 6200 struct amdgpu_device *adev = drm_to_adev(dev); 6201 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6202 6203 if (!amdgpu_device_supports_baco(dev)) 6204 return -ENOTSUPP; 6205 6206 if (ras && adev->ras_enabled && 6207 adev->nbio.funcs->enable_doorbell_interrupt) 6208 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6209 6210 return amdgpu_dpm_baco_enter(adev); 6211 } 6212 6213 int amdgpu_device_baco_exit(struct drm_device *dev) 6214 { 6215 struct amdgpu_device *adev = drm_to_adev(dev); 6216 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6217 int ret = 0; 6218 6219 if (!amdgpu_device_supports_baco(dev)) 6220 return -ENOTSUPP; 6221 6222 ret = amdgpu_dpm_baco_exit(adev); 6223 if (ret) 6224 return ret; 6225 6226 if (ras && adev->ras_enabled && 6227 adev->nbio.funcs->enable_doorbell_interrupt) 6228 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6229 6230 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6231 adev->nbio.funcs->clear_doorbell_interrupt) 6232 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6233 6234 return 0; 6235 } 6236 6237 /** 6238 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6239 * @pdev: PCI device struct 6240 * @state: PCI channel state 6241 * 6242 * Description: Called when a PCI error is detected. 6243 * 6244 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
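 *
 * For a frozen channel (pci_channel_io_frozen) this also locks the reset
 * domain, sets the MP1 state and stops all ring schedulers so nothing
 * touches the hardware until the slot has been reset.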
6245 */ 6246 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6247 { 6248 struct drm_device *dev = pci_get_drvdata(pdev); 6249 struct amdgpu_device *adev = drm_to_adev(dev); 6250 int i; 6251 6252 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 6253 6254 if (adev->gmc.xgmi.num_physical_nodes > 1) { 6255 DRM_WARN("No support for XGMI hive yet..."); 6256 return PCI_ERS_RESULT_DISCONNECT; 6257 } 6258 6259 adev->pci_channel_state = state; 6260 6261 switch (state) { 6262 case pci_channel_io_normal: 6263 return PCI_ERS_RESULT_CAN_RECOVER; 6264 /* Fatal error, prepare for slot reset */ 6265 case pci_channel_io_frozen: 6266 /* 6267 * Locking adev->reset_domain->sem will prevent any external access 6268 * to GPU during PCI error recovery 6269 */ 6270 amdgpu_device_lock_reset_domain(adev->reset_domain); 6271 amdgpu_device_set_mp1_state(adev); 6272 6273 /* 6274 * Block any work scheduling as we do for regular GPU reset 6275 * for the duration of the recovery 6276 */ 6277 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6278 struct amdgpu_ring *ring = adev->rings[i]; 6279 6280 if (!amdgpu_ring_sched_ready(ring)) 6281 continue; 6282 6283 drm_sched_stop(&ring->sched, NULL); 6284 } 6285 atomic_inc(&adev->gpu_reset_counter); 6286 return PCI_ERS_RESULT_NEED_RESET; 6287 case pci_channel_io_perm_failure: 6288 /* Permanent error, prepare for device removal */ 6289 return PCI_ERS_RESULT_DISCONNECT; 6290 } 6291 6292 return PCI_ERS_RESULT_NEED_RESET; 6293 } 6294 6295 /** 6296 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6297 * @pdev: pointer to PCI device 6298 */ 6299 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6300 { 6301 6302 DRM_INFO("PCI error: mmio enabled callback!!\n"); 6303 6304 /* TODO - dump whatever for debugging purposes */ 6305 6306 /* This called only if amdgpu_pci_error_detected returns 6307 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6308 * works, no need to reset slot. 6309 */ 6310 6311 return PCI_ERS_RESULT_RECOVERED; 6312 } 6313 6314 /** 6315 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6316 * @pdev: PCI device struct 6317 * 6318 * Description: This routine is called by the pci error recovery 6319 * code after the PCI slot has been reset, just before we 6320 * should resume normal operations. 
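 *
 * Restores the saved PCI config space, waits for the ASIC to become
 * accessible again (the config-space memsize no longer reads back as all
 * ones) and then performs a full ASIC reset and re-initialization.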
6321 */ 6322 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6323 { 6324 struct drm_device *dev = pci_get_drvdata(pdev); 6325 struct amdgpu_device *adev = drm_to_adev(dev); 6326 int r, i; 6327 struct amdgpu_reset_context reset_context; 6328 u32 memsize; 6329 struct list_head device_list; 6330 6331 /* PCI error slot reset should be skipped During RAS recovery */ 6332 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6333 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6334 amdgpu_ras_in_recovery(adev)) 6335 return PCI_ERS_RESULT_RECOVERED; 6336 6337 DRM_INFO("PCI error: slot reset callback!!\n"); 6338 6339 memset(&reset_context, 0, sizeof(reset_context)); 6340 6341 INIT_LIST_HEAD(&device_list); 6342 list_add_tail(&adev->reset_list, &device_list); 6343 6344 /* wait for asic to come out of reset */ 6345 msleep(500); 6346 6347 /* Restore PCI confspace */ 6348 amdgpu_device_load_pci_state(pdev); 6349 6350 /* confirm ASIC came out of reset */ 6351 for (i = 0; i < adev->usec_timeout; i++) { 6352 memsize = amdgpu_asic_get_config_memsize(adev); 6353 6354 if (memsize != 0xffffffff) 6355 break; 6356 udelay(1); 6357 } 6358 if (memsize == 0xffffffff) { 6359 r = -ETIME; 6360 goto out; 6361 } 6362 6363 reset_context.method = AMD_RESET_METHOD_NONE; 6364 reset_context.reset_req_dev = adev; 6365 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6366 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6367 6368 adev->no_hw_access = true; 6369 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 6370 adev->no_hw_access = false; 6371 if (r) 6372 goto out; 6373 6374 r = amdgpu_do_asic_reset(&device_list, &reset_context); 6375 6376 out: 6377 if (!r) { 6378 if (amdgpu_device_cache_pci_state(adev->pdev)) 6379 pci_restore_state(adev->pdev); 6380 6381 DRM_INFO("PCIe error recovery succeeded\n"); 6382 } else { 6383 DRM_ERROR("PCIe error recovery failed, err:%d", r); 6384 amdgpu_device_unset_mp1_state(adev); 6385 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6386 } 6387 6388 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6389 } 6390 6391 /** 6392 * amdgpu_pci_resume() - resume normal ops after PCI reset 6393 * @pdev: pointer to PCI device 6394 * 6395 * Called when the error recovery driver tells us that its 6396 * OK to resume normal operation. 
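 *
 * Only does real work when the channel state recorded earlier was
 * pci_channel_io_frozen: it restarts the ring schedulers stopped in
 * amdgpu_pci_error_detected() and unlocks the reset domain again.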
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!amdgpu_ring_sched_ready(ring))
			continue;

		drm_sched_start(&ring->sched);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring the hardware to some kind of halt state so that no one can touch it
 * any more. This helps preserve the error context when an error occurs.
 * Compared to a simple hang, the system will stay stable at least enough for
 * SSH access. It should then be trivial to inspect the hardware state and
 * see what is going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 *    clears all CPU mappings to the device and disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5.
amdgpu_device_unmap_mmio() clears all MMIO mappings 6520 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 6521 * flush any in flight DMA operations 6522 */ 6523 void amdgpu_device_halt(struct amdgpu_device *adev) 6524 { 6525 struct pci_dev *pdev = adev->pdev; 6526 struct drm_device *ddev = adev_to_drm(adev); 6527 6528 amdgpu_xcp_dev_unplug(adev); 6529 drm_dev_unplug(ddev); 6530 6531 amdgpu_irq_disable_all(adev); 6532 6533 amdgpu_fence_driver_hw_fini(adev); 6534 6535 adev->no_hw_access = true; 6536 6537 amdgpu_device_unmap_mmio(adev); 6538 6539 pci_disable_device(pdev); 6540 pci_wait_for_pending_transaction(pdev); 6541 } 6542 6543 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 6544 u32 reg) 6545 { 6546 unsigned long flags, address, data; 6547 u32 r; 6548 6549 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6550 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6551 6552 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6553 WREG32(address, reg * 4); 6554 (void)RREG32(address); 6555 r = RREG32(data); 6556 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6557 return r; 6558 } 6559 6560 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 6561 u32 reg, u32 v) 6562 { 6563 unsigned long flags, address, data; 6564 6565 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6566 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6567 6568 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6569 WREG32(address, reg * 4); 6570 (void)RREG32(address); 6571 WREG32(data, v); 6572 (void)RREG32(data); 6573 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6574 } 6575 6576 /** 6577 * amdgpu_device_get_gang - return a reference to the current gang 6578 * @adev: amdgpu_device pointer 6579 * 6580 * Returns: A new reference to the current gang leader. 6581 */ 6582 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 6583 { 6584 struct dma_fence *fence; 6585 6586 rcu_read_lock(); 6587 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 6588 rcu_read_unlock(); 6589 return fence; 6590 } 6591 6592 /** 6593 * amdgpu_device_switch_gang - switch to a new gang 6594 * @adev: amdgpu_device pointer 6595 * @gang: the gang to switch to 6596 * 6597 * Try to switch to a new gang. 6598 * Returns: NULL if we switched to the new gang or a reference to the current 6599 * gang leader. 
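 *
 * A minimal usage sketch (illustrative only, not taken from this file):
 *
 *	fence = amdgpu_device_switch_gang(adev, gang);
 *	if (fence) {
 *		... wait for or depend on "fence" before trying again ...
 *		dma_fence_put(fence);
 *	}
 *
 * i.e. a non-NULL return means the previous gang is still running and the
 * caller now owns a reference to it that must be dropped.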
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		old = amdgpu_device_get_gang(adev);
		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
		uint32_t inst, uint32_t reg_addr, char reg_name[],
		uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
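
/*
 * Illustrative example only (regFOO_STATUS and FOO_STATUS__IDLE_MASK are
 * hypothetical names, not definitions from this driver): polling a status
 * register with amdgpu_device_wait_on_rreg() would look roughly like
 *
 *	r = amdgpu_device_wait_on_rreg(adev, 0, regFOO_STATUS, "FOO_STATUS",
 *				       FOO_STATUS__IDLE_MASK,
 *				       FOO_STATUS__IDLE_MASK);
 *	if (r)
 *		dev_err(adev->dev, "FOO block did not reach idle\n");
 *
 * i.e. spin until (RREG32(regFOO_STATUS) & mask) == expected_value or the
 * adev->usec_timeout based loop above gives up with -ETIMEDOUT.
 */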