/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

static const
struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		   amdgpu_device_get_pcie_replay_count, NULL);

static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
	 AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for giving board related information.
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);


/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported)
 * otherwise return 0.
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, fallback to use BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry)
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be > @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be > @size
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
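 *
 * Note: only the CPU-visible part of VRAM (up to adev->gmc.visible_vram_size)
 * can be reached through the aperture; for anything beyond that the function
 * returns 0 bytes transferred and the caller is expected to fall back to
 * amdgpu_device_mm_access().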
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be > @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
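 *
 * Registers inside the MMIO BAR are read directly, or through the KIQ ring
 * when running as an SR-IOV VF at runtime (unless AMDGPU_REGS_NO_KIQ is set);
 * offsets beyond the BAR go through the PCIe index/data pair.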
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: bytes offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, xcc_id);
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: bytes offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * this function is invoked only for the debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
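 *
 * On an SR-IOV VF with RLCG register access support (and not in runtime
 * mode), whitelisted GC registers are written through the RLCG interface
 * instead of plain MMIO.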
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, xcc_id);
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
			pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
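	/* the dummy readl() above flushes the posted index write before the
	 * data register is accessed
	 */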
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	int ret;

	amdgpu_asic_pre_asic_init(adev);

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
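 * Returns 0 on success, negative error code on failure.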
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND/OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
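 *
 * Each writeback slot is 256 bits (8 dwords) wide, which is why the slot
 * index handed out by amdgpu_device_wb_get() is converted to a dword offset
 * into adev->wb.wb.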
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
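 *
 * Returns 0 on success, including when resizing is skipped or the resize
 * itself fails but the BARs remain usable; -ENODEV only when the doorbell or
 * FB BAR ends up unusable.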
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		DRM_WARN("System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return false;

	return true;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still needs the driver to do vPost, otherwise the gpu hangs,
		 * while smc fw versions above 22.15 don't have this flaw, so we force
		 * vPost to be executed for smc versions below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
			  amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPUs change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
		return false;
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
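 * Always returns 0; invalid values are clamped or reset to their defaults
 * rather than causing an error.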
1907 */ 1908 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1909 { 1910 if (amdgpu_sched_jobs < 4) { 1911 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1912 amdgpu_sched_jobs); 1913 amdgpu_sched_jobs = 4; 1914 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1915 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1916 amdgpu_sched_jobs); 1917 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1918 } 1919 1920 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1921 /* gart size must be greater or equal to 32M */ 1922 dev_warn(adev->dev, "gart size (%d) too small\n", 1923 amdgpu_gart_size); 1924 amdgpu_gart_size = -1; 1925 } 1926 1927 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1928 /* gtt size must be greater or equal to 32M */ 1929 dev_warn(adev->dev, "gtt size (%d) too small\n", 1930 amdgpu_gtt_size); 1931 amdgpu_gtt_size = -1; 1932 } 1933 1934 /* valid range is between 4 and 9 inclusive */ 1935 if (amdgpu_vm_fragment_size != -1 && 1936 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1937 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1938 amdgpu_vm_fragment_size = -1; 1939 } 1940 1941 if (amdgpu_sched_hw_submission < 2) { 1942 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1943 amdgpu_sched_hw_submission); 1944 amdgpu_sched_hw_submission = 2; 1945 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1946 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1947 amdgpu_sched_hw_submission); 1948 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1949 } 1950 1951 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1952 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1953 amdgpu_reset_method = -1; 1954 } 1955 1956 amdgpu_device_check_smu_prv_buffer_size(adev); 1957 1958 amdgpu_device_check_vm_size(adev); 1959 1960 amdgpu_device_check_block_size(adev); 1961 1962 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1963 1964 return 0; 1965 } 1966 1967 /** 1968 * amdgpu_switcheroo_set_state - set switcheroo state 1969 * 1970 * @pdev: pci dev pointer 1971 * @state: vga_switcheroo state 1972 * 1973 * Callback for the switcheroo driver. Suspends or resumes 1974 * the asics before or after it is powered up using ACPI methods. 
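 *
 * These callbacks are only reached once the driver has registered itself
 * as a vga_switcheroo client via &amdgpu_switcheroo_ops (declared below).
 * A minimal sketch of that registration, assuming it happens during device
 * init as is typical, with px indicating hybrid-graphics (PX) support:
 *
 *	vga_switcheroo_register_client(adev->pdev,
 *				       &amdgpu_switcheroo_ops, px);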
1975 */ 1976 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1977 enum vga_switcheroo_state state) 1978 { 1979 struct drm_device *dev = pci_get_drvdata(pdev); 1980 int r; 1981 1982 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1983 return; 1984 1985 if (state == VGA_SWITCHEROO_ON) { 1986 pr_info("switched on\n"); 1987 /* don't suspend or resume card normally */ 1988 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1989 1990 pci_set_power_state(pdev, PCI_D0); 1991 amdgpu_device_load_pci_state(pdev); 1992 r = pci_enable_device(pdev); 1993 if (r) 1994 DRM_WARN("pci_enable_device failed (%d)\n", r); 1995 amdgpu_device_resume(dev, true); 1996 1997 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1998 } else { 1999 pr_info("switched off\n"); 2000 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2001 amdgpu_device_prepare(dev); 2002 amdgpu_device_suspend(dev, true); 2003 amdgpu_device_cache_pci_state(pdev); 2004 /* Shut down the device */ 2005 pci_disable_device(pdev); 2006 pci_set_power_state(pdev, PCI_D3cold); 2007 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2008 } 2009 } 2010 2011 /** 2012 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2013 * 2014 * @pdev: pci dev pointer 2015 * 2016 * Callback for the switcheroo driver. Check of the switcheroo 2017 * state can be changed. 2018 * Returns true if the state can be changed, false if not. 2019 */ 2020 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2021 { 2022 struct drm_device *dev = pci_get_drvdata(pdev); 2023 2024 /* 2025 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2026 * locking inversion with the driver load path. And the access here is 2027 * completely racy anyway. So don't bother with locking for now. 2028 */ 2029 return atomic_read(&dev->open_count) == 0; 2030 } 2031 2032 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2033 .set_gpu_state = amdgpu_switcheroo_set_state, 2034 .reprobe = NULL, 2035 .can_switch = amdgpu_switcheroo_can_switch, 2036 }; 2037 2038 /** 2039 * amdgpu_device_ip_set_clockgating_state - set the CG state 2040 * 2041 * @dev: amdgpu_device pointer 2042 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2043 * @state: clockgating state (gate or ungate) 2044 * 2045 * Sets the requested clockgating state for all instances of 2046 * the hardware IP specified. 2047 * Returns the error code from the last instance. 2048 */ 2049 int amdgpu_device_ip_set_clockgating_state(void *dev, 2050 enum amd_ip_block_type block_type, 2051 enum amd_clockgating_state state) 2052 { 2053 struct amdgpu_device *adev = dev; 2054 int i, r = 0; 2055 2056 for (i = 0; i < adev->num_ip_blocks; i++) { 2057 if (!adev->ip_blocks[i].status.valid) 2058 continue; 2059 if (adev->ip_blocks[i].version->type != block_type) 2060 continue; 2061 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2062 continue; 2063 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2064 (void *)adev, state); 2065 if (r) 2066 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2067 adev->ip_blocks[i].version->funcs->name, r); 2068 } 2069 return r; 2070 } 2071 2072 /** 2073 * amdgpu_device_ip_set_powergating_state - set the PG state 2074 * 2075 * @dev: amdgpu_device pointer 2076 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2077 * @state: powergating state (gate or ungate) 2078 * 2079 * Sets the requested powergating state for all instances of 2080 * the hardware IP specified. 
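 *
 * Typical use (illustrative sketch): an IP driver gating its own block,
 * e.g. requesting powergating for VCN:
 *
 *	amdgpu_device_ip_set_powergating_state(adev,
 *					       AMD_IP_BLOCK_TYPE_VCN,
 *					       AMD_PG_STATE_GATE);
 *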
2081 * Returns the error code from the last instance. 2082 */ 2083 int amdgpu_device_ip_set_powergating_state(void *dev, 2084 enum amd_ip_block_type block_type, 2085 enum amd_powergating_state state) 2086 { 2087 struct amdgpu_device *adev = dev; 2088 int i, r = 0; 2089 2090 for (i = 0; i < adev->num_ip_blocks; i++) { 2091 if (!adev->ip_blocks[i].status.valid) 2092 continue; 2093 if (adev->ip_blocks[i].version->type != block_type) 2094 continue; 2095 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2096 continue; 2097 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2098 (void *)adev, state); 2099 if (r) 2100 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2101 adev->ip_blocks[i].version->funcs->name, r); 2102 } 2103 return r; 2104 } 2105 2106 /** 2107 * amdgpu_device_ip_get_clockgating_state - get the CG state 2108 * 2109 * @adev: amdgpu_device pointer 2110 * @flags: clockgating feature flags 2111 * 2112 * Walks the list of IPs on the device and updates the clockgating 2113 * flags for each IP. 2114 * Updates @flags with the feature flags for each hardware IP where 2115 * clockgating is enabled. 2116 */ 2117 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2118 u64 *flags) 2119 { 2120 int i; 2121 2122 for (i = 0; i < adev->num_ip_blocks; i++) { 2123 if (!adev->ip_blocks[i].status.valid) 2124 continue; 2125 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2126 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 2127 } 2128 } 2129 2130 /** 2131 * amdgpu_device_ip_wait_for_idle - wait for idle 2132 * 2133 * @adev: amdgpu_device pointer 2134 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2135 * 2136 * Waits for the request hardware IP to be idle. 2137 * Returns 0 for success or a negative error code on failure. 2138 */ 2139 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2140 enum amd_ip_block_type block_type) 2141 { 2142 int i, r; 2143 2144 for (i = 0; i < adev->num_ip_blocks; i++) { 2145 if (!adev->ip_blocks[i].status.valid) 2146 continue; 2147 if (adev->ip_blocks[i].version->type == block_type) { 2148 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 2149 if (r) 2150 return r; 2151 break; 2152 } 2153 } 2154 return 0; 2155 2156 } 2157 2158 /** 2159 * amdgpu_device_ip_is_idle - is the hardware IP idle 2160 * 2161 * @adev: amdgpu_device pointer 2162 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2163 * 2164 * Check if the hardware IP is idle or not. 2165 * Returns true if it the IP is idle, false if not. 2166 */ 2167 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 2168 enum amd_ip_block_type block_type) 2169 { 2170 int i; 2171 2172 for (i = 0; i < adev->num_ip_blocks; i++) { 2173 if (!adev->ip_blocks[i].status.valid) 2174 continue; 2175 if (adev->ip_blocks[i].version->type == block_type) 2176 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 2177 } 2178 return true; 2179 2180 } 2181 2182 /** 2183 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2184 * 2185 * @adev: amdgpu_device pointer 2186 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2187 * 2188 * Returns a pointer to the hardware IP block structure 2189 * if it exists for the asic, otherwise NULL. 
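 *
 * Example (sketch): looking up the GFX block to inspect its version:
 *
 *	struct amdgpu_ip_block *ip =
 *		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 *	if (ip)
 *		DRM_INFO("GFX IP v%u.%u\n", ip->version->major,
 *			 ip->version->minor);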
2190 */ 2191 struct amdgpu_ip_block * 2192 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2193 enum amd_ip_block_type type) 2194 { 2195 int i; 2196 2197 for (i = 0; i < adev->num_ip_blocks; i++) 2198 if (adev->ip_blocks[i].version->type == type) 2199 return &adev->ip_blocks[i]; 2200 2201 return NULL; 2202 } 2203 2204 /** 2205 * amdgpu_device_ip_block_version_cmp 2206 * 2207 * @adev: amdgpu_device pointer 2208 * @type: enum amd_ip_block_type 2209 * @major: major version 2210 * @minor: minor version 2211 * 2212 * return 0 if equal or greater 2213 * return 1 if smaller or the ip_block doesn't exist 2214 */ 2215 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2216 enum amd_ip_block_type type, 2217 u32 major, u32 minor) 2218 { 2219 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2220 2221 if (ip_block && ((ip_block->version->major > major) || 2222 ((ip_block->version->major == major) && 2223 (ip_block->version->minor >= minor)))) 2224 return 0; 2225 2226 return 1; 2227 } 2228 2229 /** 2230 * amdgpu_device_ip_block_add 2231 * 2232 * @adev: amdgpu_device pointer 2233 * @ip_block_version: pointer to the IP to add 2234 * 2235 * Adds the IP block driver information to the collection of IPs 2236 * on the asic. 2237 */ 2238 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2239 const struct amdgpu_ip_block_version *ip_block_version) 2240 { 2241 if (!ip_block_version) 2242 return -EINVAL; 2243 2244 switch (ip_block_version->type) { 2245 case AMD_IP_BLOCK_TYPE_VCN: 2246 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2247 return 0; 2248 break; 2249 case AMD_IP_BLOCK_TYPE_JPEG: 2250 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2251 return 0; 2252 break; 2253 default: 2254 break; 2255 } 2256 2257 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 2258 ip_block_version->funcs->name); 2259 2260 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2261 2262 return 0; 2263 } 2264 2265 /** 2266 * amdgpu_device_enable_virtual_display - enable virtual display feature 2267 * 2268 * @adev: amdgpu_device pointer 2269 * 2270 * Enabled the virtual display feature if the user has enabled it via 2271 * the module parameter virtual_display. This feature provides a virtual 2272 * display hardware on headless boards or in virtualized environments. 2273 * This function parses and validates the configuration string specified by 2274 * the user and configues the virtual display configuration (number of 2275 * virtual connectors, crtcs, etc.) specified. 
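 *
 * Example (illustrative, assuming the usual "virtual_display" parameter
 * name): amdgpu.virtual_display=0000:03:00.0,2 enables two virtual CRTCs
 * on that PCI device only, while "all,1" enables a single virtual CRTC on
 * every amdgpu device. Per the parsing below, the CRTC count is clamped to
 * the range 1-6.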
2276 */ 2277 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2278 { 2279 adev->enable_virtual_display = false; 2280 2281 if (amdgpu_virtual_display) { 2282 const char *pci_address_name = pci_name(adev->pdev); 2283 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2284 2285 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2286 pciaddstr_tmp = pciaddstr; 2287 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2288 pciaddname = strsep(&pciaddname_tmp, ","); 2289 if (!strcmp("all", pciaddname) 2290 || !strcmp(pci_address_name, pciaddname)) { 2291 long num_crtc; 2292 int res = -1; 2293 2294 adev->enable_virtual_display = true; 2295 2296 if (pciaddname_tmp) 2297 res = kstrtol(pciaddname_tmp, 10, 2298 &num_crtc); 2299 2300 if (!res) { 2301 if (num_crtc < 1) 2302 num_crtc = 1; 2303 if (num_crtc > 6) 2304 num_crtc = 6; 2305 adev->mode_info.num_crtc = num_crtc; 2306 } else { 2307 adev->mode_info.num_crtc = 1; 2308 } 2309 break; 2310 } 2311 } 2312 2313 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2314 amdgpu_virtual_display, pci_address_name, 2315 adev->enable_virtual_display, adev->mode_info.num_crtc); 2316 2317 kfree(pciaddstr); 2318 } 2319 } 2320 2321 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2322 { 2323 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2324 adev->mode_info.num_crtc = 1; 2325 adev->enable_virtual_display = true; 2326 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2327 adev->enable_virtual_display, adev->mode_info.num_crtc); 2328 } 2329 } 2330 2331 /** 2332 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2333 * 2334 * @adev: amdgpu_device pointer 2335 * 2336 * Parses the asic configuration parameters specified in the gpu info 2337 * firmware and makes them availale to the driver for use in configuring 2338 * the asic. 2339 * Returns 0 on success, -EINVAL on failure. 
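 *
 * For example (per the table below), a Navi12 board requests
 * "amdgpu/navi12_gpu_info.bin", while ASICs that already carry an IP
 * discovery blob (adev->mman.discovery_bin) skip this path entirely.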
2340 */ 2341 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2342 { 2343 const char *chip_name; 2344 char fw_name[40]; 2345 int err; 2346 const struct gpu_info_firmware_header_v1_0 *hdr; 2347 2348 adev->firmware.gpu_info_fw = NULL; 2349 2350 if (adev->mman.discovery_bin) 2351 return 0; 2352 2353 switch (adev->asic_type) { 2354 default: 2355 return 0; 2356 case CHIP_VEGA10: 2357 chip_name = "vega10"; 2358 break; 2359 case CHIP_VEGA12: 2360 chip_name = "vega12"; 2361 break; 2362 case CHIP_RAVEN: 2363 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2364 chip_name = "raven2"; 2365 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2366 chip_name = "picasso"; 2367 else 2368 chip_name = "raven"; 2369 break; 2370 case CHIP_ARCTURUS: 2371 chip_name = "arcturus"; 2372 break; 2373 case CHIP_NAVI12: 2374 chip_name = "navi12"; 2375 break; 2376 } 2377 2378 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2379 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 2380 if (err) { 2381 dev_err(adev->dev, 2382 "Failed to get gpu_info firmware \"%s\"\n", 2383 fw_name); 2384 goto out; 2385 } 2386 2387 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2388 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2389 2390 switch (hdr->version_major) { 2391 case 1: 2392 { 2393 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2394 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2395 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2396 2397 /* 2398 * Should be droped when DAL no longer needs it. 2399 */ 2400 if (adev->asic_type == CHIP_NAVI12) 2401 goto parse_soc_bounding_box; 2402 2403 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2404 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2405 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2406 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2407 adev->gfx.config.max_texture_channel_caches = 2408 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2409 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2410 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2411 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2412 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2413 adev->gfx.config.double_offchip_lds_buf = 2414 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2415 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2416 adev->gfx.cu_info.max_waves_per_simd = 2417 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2418 adev->gfx.cu_info.max_scratch_slots_per_cu = 2419 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2420 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2421 if (hdr->version_minor >= 1) { 2422 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2423 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2424 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2425 adev->gfx.config.num_sc_per_sh = 2426 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2427 adev->gfx.config.num_packer_per_sc = 2428 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2429 } 2430 2431 parse_soc_bounding_box: 2432 /* 2433 * soc bounding box info is not integrated in disocovery table, 2434 * we always need to parse it from gpu info firmware if needed. 
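 * (Navi12, for instance, jumps straight to this label from the v1_0
 * parsing above, so only the bounding box is consumed from its gpu_info
 * firmware.)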
2435 */ 2436 if (hdr->version_minor == 2) { 2437 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2438 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2439 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2440 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2441 } 2442 break; 2443 } 2444 default: 2445 dev_err(adev->dev, 2446 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2447 err = -EINVAL; 2448 goto out; 2449 } 2450 out: 2451 return err; 2452 } 2453 2454 /** 2455 * amdgpu_device_ip_early_init - run early init for hardware IPs 2456 * 2457 * @adev: amdgpu_device pointer 2458 * 2459 * Early initialization pass for hardware IPs. The hardware IPs that make 2460 * up each asic are discovered each IP's early_init callback is run. This 2461 * is the first stage in initializing the asic. 2462 * Returns 0 on success, negative error code on failure. 2463 */ 2464 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2465 { 2466 struct pci_dev *parent; 2467 int i, r; 2468 bool total; 2469 2470 amdgpu_device_enable_virtual_display(adev); 2471 2472 if (amdgpu_sriov_vf(adev)) { 2473 r = amdgpu_virt_request_full_gpu(adev, true); 2474 if (r) 2475 return r; 2476 } 2477 2478 switch (adev->asic_type) { 2479 #ifdef CONFIG_DRM_AMDGPU_SI 2480 case CHIP_VERDE: 2481 case CHIP_TAHITI: 2482 case CHIP_PITCAIRN: 2483 case CHIP_OLAND: 2484 case CHIP_HAINAN: 2485 adev->family = AMDGPU_FAMILY_SI; 2486 r = si_set_ip_blocks(adev); 2487 if (r) 2488 return r; 2489 break; 2490 #endif 2491 #ifdef CONFIG_DRM_AMDGPU_CIK 2492 case CHIP_BONAIRE: 2493 case CHIP_HAWAII: 2494 case CHIP_KAVERI: 2495 case CHIP_KABINI: 2496 case CHIP_MULLINS: 2497 if (adev->flags & AMD_IS_APU) 2498 adev->family = AMDGPU_FAMILY_KV; 2499 else 2500 adev->family = AMDGPU_FAMILY_CI; 2501 2502 r = cik_set_ip_blocks(adev); 2503 if (r) 2504 return r; 2505 break; 2506 #endif 2507 case CHIP_TOPAZ: 2508 case CHIP_TONGA: 2509 case CHIP_FIJI: 2510 case CHIP_POLARIS10: 2511 case CHIP_POLARIS11: 2512 case CHIP_POLARIS12: 2513 case CHIP_VEGAM: 2514 case CHIP_CARRIZO: 2515 case CHIP_STONEY: 2516 if (adev->flags & AMD_IS_APU) 2517 adev->family = AMDGPU_FAMILY_CZ; 2518 else 2519 adev->family = AMDGPU_FAMILY_VI; 2520 2521 r = vi_set_ip_blocks(adev); 2522 if (r) 2523 return r; 2524 break; 2525 default: 2526 r = amdgpu_discovery_set_ip_blocks(adev); 2527 if (r) 2528 return r; 2529 break; 2530 } 2531 2532 if (amdgpu_has_atpx() && 2533 (amdgpu_is_atpx_hybrid() || 2534 amdgpu_has_atpx_dgpu_power_cntl()) && 2535 ((adev->flags & AMD_IS_APU) == 0) && 2536 !dev_is_removable(&adev->pdev->dev)) 2537 adev->flags |= AMD_IS_PX; 2538 2539 if (!(adev->flags & AMD_IS_APU)) { 2540 parent = pcie_find_root_port(adev->pdev); 2541 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2542 } 2543 2544 2545 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2546 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2547 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2548 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2549 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2550 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2551 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2552 2553 total = true; 2554 for (i = 0; i < adev->num_ip_blocks; i++) { 2555 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2556 DRM_WARN("disabled ip block: %d <%s>\n", 2557 i, adev->ip_blocks[i].version->funcs->name); 2558 adev->ip_blocks[i].status.valid = false; 2559 } else { 2560 if (adev->ip_blocks[i].version->funcs->early_init) { 2561 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2562 if (r == -ENOENT) { 2563 adev->ip_blocks[i].status.valid = false; 2564 } else if (r) { 2565 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2566 adev->ip_blocks[i].version->funcs->name, r); 2567 total = false; 2568 } else { 2569 adev->ip_blocks[i].status.valid = true; 2570 } 2571 } else { 2572 adev->ip_blocks[i].status.valid = true; 2573 } 2574 } 2575 /* get the vbios after the asic_funcs are set up */ 2576 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2577 r = amdgpu_device_parse_gpu_info_fw(adev); 2578 if (r) 2579 return r; 2580 2581 /* Read BIOS */ 2582 if (amdgpu_device_read_bios(adev)) { 2583 if (!amdgpu_get_bios(adev)) 2584 return -EINVAL; 2585 2586 r = amdgpu_atombios_init(adev); 2587 if (r) { 2588 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2589 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2590 return r; 2591 } 2592 } 2593 2594 /*get pf2vf msg info at it's earliest time*/ 2595 if (amdgpu_sriov_vf(adev)) 2596 amdgpu_virt_init_data_exchange(adev); 2597 2598 } 2599 } 2600 if (!total) 2601 return -ENODEV; 2602 2603 amdgpu_amdkfd_device_probe(adev); 2604 adev->cg_flags &= amdgpu_cg_mask; 2605 adev->pg_flags &= amdgpu_pg_mask; 2606 2607 return 0; 2608 } 2609 2610 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2611 { 2612 int i, r; 2613 2614 for (i = 0; i < adev->num_ip_blocks; i++) { 2615 if (!adev->ip_blocks[i].status.sw) 2616 continue; 2617 if (adev->ip_blocks[i].status.hw) 2618 continue; 2619 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2620 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2621 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2622 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2623 if (r) { 2624 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2625 adev->ip_blocks[i].version->funcs->name, r); 2626 return r; 2627 } 2628 adev->ip_blocks[i].status.hw = true; 2629 } 2630 } 2631 2632 return 0; 2633 } 2634 2635 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2636 { 2637 int i, r; 2638 2639 for (i = 0; i < adev->num_ip_blocks; i++) { 2640 if (!adev->ip_blocks[i].status.sw) 2641 continue; 2642 if (adev->ip_blocks[i].status.hw) 2643 continue; 2644 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2645 if (r) { 2646 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2647 adev->ip_blocks[i].version->funcs->name, r); 2648 return r; 2649 } 2650 adev->ip_blocks[i].status.hw = true; 2651 } 2652 2653 return 0; 2654 } 2655 2656 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2657 { 2658 int r = 0; 2659 int i; 2660 uint32_t 
smu_version; 2661 2662 if (adev->asic_type >= CHIP_VEGA10) { 2663 for (i = 0; i < adev->num_ip_blocks; i++) { 2664 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2665 continue; 2666 2667 if (!adev->ip_blocks[i].status.sw) 2668 continue; 2669 2670 /* no need to do the fw loading again if already done*/ 2671 if (adev->ip_blocks[i].status.hw == true) 2672 break; 2673 2674 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2675 r = adev->ip_blocks[i].version->funcs->resume(adev); 2676 if (r) { 2677 DRM_ERROR("resume of IP block <%s> failed %d\n", 2678 adev->ip_blocks[i].version->funcs->name, r); 2679 return r; 2680 } 2681 } else { 2682 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2683 if (r) { 2684 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2685 adev->ip_blocks[i].version->funcs->name, r); 2686 return r; 2687 } 2688 } 2689 2690 adev->ip_blocks[i].status.hw = true; 2691 break; 2692 } 2693 } 2694 2695 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2696 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2697 2698 return r; 2699 } 2700 2701 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2702 { 2703 long timeout; 2704 int r, i; 2705 2706 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2707 struct amdgpu_ring *ring = adev->rings[i]; 2708 2709 /* No need to setup the GPU scheduler for rings that don't need it */ 2710 if (!ring || ring->no_scheduler) 2711 continue; 2712 2713 switch (ring->funcs->type) { 2714 case AMDGPU_RING_TYPE_GFX: 2715 timeout = adev->gfx_timeout; 2716 break; 2717 case AMDGPU_RING_TYPE_COMPUTE: 2718 timeout = adev->compute_timeout; 2719 break; 2720 case AMDGPU_RING_TYPE_SDMA: 2721 timeout = adev->sdma_timeout; 2722 break; 2723 default: 2724 timeout = adev->video_timeout; 2725 break; 2726 } 2727 2728 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, 2729 DRM_SCHED_PRIORITY_COUNT, 2730 ring->num_hw_submission, 0, 2731 timeout, adev->reset_domain->wq, 2732 ring->sched_score, ring->name, 2733 adev->dev); 2734 if (r) { 2735 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2736 ring->name); 2737 return r; 2738 } 2739 r = amdgpu_uvd_entity_init(adev, ring); 2740 if (r) { 2741 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2742 ring->name); 2743 return r; 2744 } 2745 r = amdgpu_vce_entity_init(adev, ring); 2746 if (r) { 2747 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2748 ring->name); 2749 return r; 2750 } 2751 } 2752 2753 amdgpu_xcp_update_partition_sched_list(adev); 2754 2755 return 0; 2756 } 2757 2758 2759 /** 2760 * amdgpu_device_ip_init - run init for hardware IPs 2761 * 2762 * @adev: amdgpu_device pointer 2763 * 2764 * Main initialization pass for hardware IPs. The list of all the hardware 2765 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2766 * are run. sw_init initializes the software state associated with each IP 2767 * and hw_init initializes the hardware associated with each IP. 2768 * Returns 0 on success, negative error code on failure. 
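 *
 * Rough ordering (sketch of the flow below):
 *
 *	sw_init for every valid IP          // COMMON and GMC also get hw_init
 *	                                    // here so registers/VRAM are usable
 *	amdgpu_ib_pool_init(adev);
 *	amdgpu_ucode_create_bo(adev);
 *	amdgpu_device_ip_hw_init_phase1(adev);  // COMMON, IH (and PSP on SR-IOV)
 *	amdgpu_device_fw_loading(adev);         // PSP/SMU firmware
 *	amdgpu_device_ip_hw_init_phase2(adev);  // everything else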
2769 */ 2770 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2771 { 2772 int i, r; 2773 2774 r = amdgpu_ras_init(adev); 2775 if (r) 2776 return r; 2777 2778 for (i = 0; i < adev->num_ip_blocks; i++) { 2779 if (!adev->ip_blocks[i].status.valid) 2780 continue; 2781 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2782 if (r) { 2783 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2784 adev->ip_blocks[i].version->funcs->name, r); 2785 goto init_failed; 2786 } 2787 adev->ip_blocks[i].status.sw = true; 2788 2789 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2790 /* need to do common hw init early so everything is set up for gmc */ 2791 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2792 if (r) { 2793 DRM_ERROR("hw_init %d failed %d\n", i, r); 2794 goto init_failed; 2795 } 2796 adev->ip_blocks[i].status.hw = true; 2797 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2798 /* need to do gmc hw init early so we can allocate gpu mem */ 2799 /* Try to reserve bad pages early */ 2800 if (amdgpu_sriov_vf(adev)) 2801 amdgpu_virt_exchange_data(adev); 2802 2803 r = amdgpu_device_mem_scratch_init(adev); 2804 if (r) { 2805 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2806 goto init_failed; 2807 } 2808 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2809 if (r) { 2810 DRM_ERROR("hw_init %d failed %d\n", i, r); 2811 goto init_failed; 2812 } 2813 r = amdgpu_device_wb_init(adev); 2814 if (r) { 2815 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2816 goto init_failed; 2817 } 2818 adev->ip_blocks[i].status.hw = true; 2819 2820 /* right after GMC hw init, we create CSA */ 2821 if (adev->gfx.mcbp) { 2822 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2823 AMDGPU_GEM_DOMAIN_VRAM | 2824 AMDGPU_GEM_DOMAIN_GTT, 2825 AMDGPU_CSA_SIZE); 2826 if (r) { 2827 DRM_ERROR("allocate CSA failed %d\n", r); 2828 goto init_failed; 2829 } 2830 } 2831 2832 r = amdgpu_seq64_init(adev); 2833 if (r) { 2834 DRM_ERROR("allocate seq64 failed %d\n", r); 2835 goto init_failed; 2836 } 2837 } 2838 } 2839 2840 if (amdgpu_sriov_vf(adev)) 2841 amdgpu_virt_init_data_exchange(adev); 2842 2843 r = amdgpu_ib_pool_init(adev); 2844 if (r) { 2845 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2846 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2847 goto init_failed; 2848 } 2849 2850 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2851 if (r) 2852 goto init_failed; 2853 2854 r = amdgpu_device_ip_hw_init_phase1(adev); 2855 if (r) 2856 goto init_failed; 2857 2858 r = amdgpu_device_fw_loading(adev); 2859 if (r) 2860 goto init_failed; 2861 2862 r = amdgpu_device_ip_hw_init_phase2(adev); 2863 if (r) 2864 goto init_failed; 2865 2866 /* 2867 * retired pages will be loaded from eeprom and reserved here, 2868 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2869 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2870 * for I2C communication which only true at this point. 2871 * 2872 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2873 * failure from bad gpu situation and stop amdgpu init process 2874 * accordingly. For other failed cases, it will still release all 2875 * the resource and print error message, rather than returning one 2876 * negative value to upper level. 
2877 * 2878 * Note: theoretically, this should be called before all vram allocations 2879 * to protect retired page from abusing 2880 */ 2881 r = amdgpu_ras_recovery_init(adev); 2882 if (r) 2883 goto init_failed; 2884 2885 /** 2886 * In case of XGMI grab extra reference for reset domain for this device 2887 */ 2888 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2889 if (amdgpu_xgmi_add_device(adev) == 0) { 2890 if (!amdgpu_sriov_vf(adev)) { 2891 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2892 2893 if (WARN_ON(!hive)) { 2894 r = -ENOENT; 2895 goto init_failed; 2896 } 2897 2898 if (!hive->reset_domain || 2899 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2900 r = -ENOENT; 2901 amdgpu_put_xgmi_hive(hive); 2902 goto init_failed; 2903 } 2904 2905 /* Drop the early temporary reset domain we created for device */ 2906 amdgpu_reset_put_reset_domain(adev->reset_domain); 2907 adev->reset_domain = hive->reset_domain; 2908 amdgpu_put_xgmi_hive(hive); 2909 } 2910 } 2911 } 2912 2913 r = amdgpu_device_init_schedulers(adev); 2914 if (r) 2915 goto init_failed; 2916 2917 if (adev->mman.buffer_funcs_ring->sched.ready) 2918 amdgpu_ttm_set_buffer_funcs_status(adev, true); 2919 2920 /* Don't init kfd if whole hive need to be reset during init */ 2921 if (!adev->gmc.xgmi.pending_reset) { 2922 kgd2kfd_init_zone_device(adev); 2923 amdgpu_amdkfd_device_init(adev); 2924 } 2925 2926 amdgpu_fru_get_product_info(adev); 2927 2928 init_failed: 2929 2930 return r; 2931 } 2932 2933 /** 2934 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2935 * 2936 * @adev: amdgpu_device pointer 2937 * 2938 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2939 * this function before a GPU reset. If the value is retained after a 2940 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2941 */ 2942 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2943 { 2944 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2945 } 2946 2947 /** 2948 * amdgpu_device_check_vram_lost - check if vram is valid 2949 * 2950 * @adev: amdgpu_device pointer 2951 * 2952 * Checks the reset magic value written to the gart pointer in VRAM. 2953 * The driver calls this after a GPU reset to see if the contents of 2954 * VRAM is lost or now. 2955 * returns true if vram is lost, false if not. 2956 */ 2957 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2958 { 2959 if (memcmp(adev->gart.ptr, adev->reset_magic, 2960 AMDGPU_RESET_MAGIC_NUM)) 2961 return true; 2962 2963 if (!amdgpu_in_reset(adev)) 2964 return false; 2965 2966 /* 2967 * For all ASICs with baco/mode1 reset, the VRAM is 2968 * always assumed to be lost. 2969 */ 2970 switch (amdgpu_asic_reset_method(adev)) { 2971 case AMD_RESET_METHOD_BACO: 2972 case AMD_RESET_METHOD_MODE1: 2973 return true; 2974 default: 2975 return false; 2976 } 2977 } 2978 2979 /** 2980 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2981 * 2982 * @adev: amdgpu_device pointer 2983 * @state: clockgating state (gate or ungate) 2984 * 2985 * The list of all the hardware IPs that make up the asic is walked and the 2986 * set_clockgating_state callbacks are run. 2987 * Late initialization pass enabling clockgating for hardware IPs. 2988 * Fini or suspend, pass disabling clockgating for hardware IPs. 2989 * Returns 0 on success, negative error code on failure. 
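 *
 * Usage in this file (sketch): clockgating is enabled from late init and
 * disabled again on the suspend/fini paths:
 *
 *	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
 *	...
 *	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);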
2990 */ 2991 2992 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2993 enum amd_clockgating_state state) 2994 { 2995 int i, j, r; 2996 2997 if (amdgpu_emu_mode == 1) 2998 return 0; 2999 3000 for (j = 0; j < adev->num_ip_blocks; j++) { 3001 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3002 if (!adev->ip_blocks[i].status.late_initialized) 3003 continue; 3004 /* skip CG for GFX, SDMA on S0ix */ 3005 if (adev->in_s0ix && 3006 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3007 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3008 continue; 3009 /* skip CG for VCE/UVD, it's handled specially */ 3010 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3011 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3012 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3013 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3014 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3015 /* enable clockgating to save power */ 3016 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 3017 state); 3018 if (r) { 3019 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3020 adev->ip_blocks[i].version->funcs->name, r); 3021 return r; 3022 } 3023 } 3024 } 3025 3026 return 0; 3027 } 3028 3029 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3030 enum amd_powergating_state state) 3031 { 3032 int i, j, r; 3033 3034 if (amdgpu_emu_mode == 1) 3035 return 0; 3036 3037 for (j = 0; j < adev->num_ip_blocks; j++) { 3038 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3039 if (!adev->ip_blocks[i].status.late_initialized) 3040 continue; 3041 /* skip PG for GFX, SDMA on S0ix */ 3042 if (adev->in_s0ix && 3043 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3044 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3045 continue; 3046 /* skip CG for VCE/UVD, it's handled specially */ 3047 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3048 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3049 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3050 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3051 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3052 /* enable powergating to save power */ 3053 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 3054 state); 3055 if (r) { 3056 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 3057 adev->ip_blocks[i].version->funcs->name, r); 3058 return r; 3059 } 3060 } 3061 } 3062 return 0; 3063 } 3064 3065 static int amdgpu_device_enable_mgpu_fan_boost(void) 3066 { 3067 struct amdgpu_gpu_instance *gpu_ins; 3068 struct amdgpu_device *adev; 3069 int i, ret = 0; 3070 3071 mutex_lock(&mgpu_info.mutex); 3072 3073 /* 3074 * MGPU fan boost feature should be enabled 3075 * only when there are two or more dGPUs in 3076 * the system 3077 */ 3078 if (mgpu_info.num_dgpu < 2) 3079 goto out; 3080 3081 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3082 gpu_ins = &(mgpu_info.gpu_ins[i]); 3083 adev = gpu_ins->adev; 3084 if (!(adev->flags & AMD_IS_APU) && 3085 !gpu_ins->mgpu_fan_enabled) { 3086 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3087 if (ret) 3088 break; 3089 3090 gpu_ins->mgpu_fan_enabled = 1; 3091 } 3092 } 3093 3094 out: 3095 mutex_unlock(&mgpu_info.mutex); 3096 3097 return ret; 3098 } 3099 3100 /** 3101 * amdgpu_device_ip_late_init - run late init for hardware IPs 3102 * 3103 * @adev: 
amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of them have been initialized or something that needs to happen
 * late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	r = amdgpu_ras_late_init(adev);
	if (r) {
		DRM_ERROR("amdgpu_ras_late_init failed %d", r);
		return r;
	}

	amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* For passthrough configurations on Arcturus and Aldebaran, enable special SBR handling */
	if (amdgpu_passthrough(adev) &&
	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
	     adev->asic_type == CHIP_ALDEBARAN))
		amdgpu_dpm_handle_passthrough_sbr(adev, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset the device p-state to low, as it was booted in the
		 * high state.
		 *
		 * This should be performed only after all devices from the
		 * same hive get initialized.
		 *
		 * However, the number of devices in the hive is not known in
		 * advance; it is counted one by one as the devices initialize.
		 *
		 * So we wait until all XGMI-interlinked devices have
		 * initialized. This may bring some delays as those devices
		 * may come from different hives. But that should be OK.
3169 */ 3170 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3171 for (i = 0; i < mgpu_info.num_gpu; i++) { 3172 gpu_instance = &(mgpu_info.gpu_ins[i]); 3173 if (gpu_instance->adev->flags & AMD_IS_APU) 3174 continue; 3175 3176 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3177 AMDGPU_XGMI_PSTATE_MIN); 3178 if (r) { 3179 DRM_ERROR("pstate setting failed (%d).\n", r); 3180 break; 3181 } 3182 } 3183 } 3184 3185 mutex_unlock(&mgpu_info.mutex); 3186 } 3187 3188 return 0; 3189 } 3190 3191 /** 3192 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3193 * 3194 * @adev: amdgpu_device pointer 3195 * 3196 * For ASICs need to disable SMC first 3197 */ 3198 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3199 { 3200 int i, r; 3201 3202 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3203 return; 3204 3205 for (i = 0; i < adev->num_ip_blocks; i++) { 3206 if (!adev->ip_blocks[i].status.hw) 3207 continue; 3208 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3209 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3210 /* XXX handle errors */ 3211 if (r) { 3212 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3213 adev->ip_blocks[i].version->funcs->name, r); 3214 } 3215 adev->ip_blocks[i].status.hw = false; 3216 break; 3217 } 3218 } 3219 } 3220 3221 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3222 { 3223 int i, r; 3224 3225 for (i = 0; i < adev->num_ip_blocks; i++) { 3226 if (!adev->ip_blocks[i].version->funcs->early_fini) 3227 continue; 3228 3229 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 3230 if (r) { 3231 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3232 adev->ip_blocks[i].version->funcs->name, r); 3233 } 3234 } 3235 3236 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3237 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3238 3239 amdgpu_amdkfd_suspend(adev, false); 3240 3241 /* Workaroud for ASICs need to disable SMC first */ 3242 amdgpu_device_smu_fini_early(adev); 3243 3244 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3245 if (!adev->ip_blocks[i].status.hw) 3246 continue; 3247 3248 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3249 /* XXX handle errors */ 3250 if (r) { 3251 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3252 adev->ip_blocks[i].version->funcs->name, r); 3253 } 3254 3255 adev->ip_blocks[i].status.hw = false; 3256 } 3257 3258 if (amdgpu_sriov_vf(adev)) { 3259 if (amdgpu_virt_release_full_gpu(adev, false)) 3260 DRM_ERROR("failed to release exclusive mode on fini\n"); 3261 } 3262 3263 return 0; 3264 } 3265 3266 /** 3267 * amdgpu_device_ip_fini - run fini for hardware IPs 3268 * 3269 * @adev: amdgpu_device pointer 3270 * 3271 * Main teardown pass for hardware IPs. The list of all the hardware 3272 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3273 * are run. hw_fini tears down the hardware associated with each IP 3274 * and sw_fini tears down any software state associated with each IP. 3275 * Returns 0 on success, negative error code on failure. 
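 *
 * Teardown walks the IP list in reverse (sketch of the loops below):
 *
 *	for (i = adev->num_ip_blocks - 1; i >= 0; i--)
 *		...sw_fini...
 *
 * with the GMC-owned buffers (ucode BO, CSA, writeback, scratch memory,
 * IB pool, seq64) released when the GMC block's turn comes.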
3276 */ 3277 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3278 { 3279 int i, r; 3280 3281 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3282 amdgpu_virt_release_ras_err_handler_data(adev); 3283 3284 if (adev->gmc.xgmi.num_physical_nodes > 1) 3285 amdgpu_xgmi_remove_device(adev); 3286 3287 amdgpu_amdkfd_device_fini_sw(adev); 3288 3289 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3290 if (!adev->ip_blocks[i].status.sw) 3291 continue; 3292 3293 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3294 amdgpu_ucode_free_bo(adev); 3295 amdgpu_free_static_csa(&adev->virt.csa_obj); 3296 amdgpu_device_wb_fini(adev); 3297 amdgpu_device_mem_scratch_fini(adev); 3298 amdgpu_ib_pool_fini(adev); 3299 amdgpu_seq64_fini(adev); 3300 } 3301 3302 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 3303 /* XXX handle errors */ 3304 if (r) { 3305 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3306 adev->ip_blocks[i].version->funcs->name, r); 3307 } 3308 adev->ip_blocks[i].status.sw = false; 3309 adev->ip_blocks[i].status.valid = false; 3310 } 3311 3312 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3313 if (!adev->ip_blocks[i].status.late_initialized) 3314 continue; 3315 if (adev->ip_blocks[i].version->funcs->late_fini) 3316 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 3317 adev->ip_blocks[i].status.late_initialized = false; 3318 } 3319 3320 amdgpu_ras_fini(adev); 3321 3322 return 0; 3323 } 3324 3325 /** 3326 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3327 * 3328 * @work: work_struct. 3329 */ 3330 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3331 { 3332 struct amdgpu_device *adev = 3333 container_of(work, struct amdgpu_device, delayed_init_work.work); 3334 int r; 3335 3336 r = amdgpu_ib_ring_tests(adev); 3337 if (r) 3338 DRM_ERROR("ib ring test failed (%d).\n", r); 3339 } 3340 3341 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3342 { 3343 struct amdgpu_device *adev = 3344 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3345 3346 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3347 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3348 3349 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3350 adev->gfx.gfx_off_state = true; 3351 } 3352 3353 /** 3354 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3355 * 3356 * @adev: amdgpu_device pointer 3357 * 3358 * Main suspend function for hardware IPs. The list of all the hardware 3359 * IPs that make up the asic is walked, clockgating is disabled and the 3360 * suspend callbacks are run. suspend puts the hardware and software state 3361 * in each IP into a state suitable for suspend. 3362 * Returns 0 on success, negative error code on failure. 3363 */ 3364 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3365 { 3366 int i, r; 3367 3368 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3369 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3370 3371 /* 3372 * Per PMFW team's suggestion, driver needs to handle gfxoff 3373 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3374 * scenario. Add the missing df cstate disablement here. 
3375 */ 3376 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3377 dev_warn(adev->dev, "Failed to disallow df cstate"); 3378 3379 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3380 if (!adev->ip_blocks[i].status.valid) 3381 continue; 3382 3383 /* displays are handled separately */ 3384 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3385 continue; 3386 3387 /* XXX handle errors */ 3388 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3389 /* XXX handle errors */ 3390 if (r) { 3391 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3392 adev->ip_blocks[i].version->funcs->name, r); 3393 return r; 3394 } 3395 3396 adev->ip_blocks[i].status.hw = false; 3397 } 3398 3399 return 0; 3400 } 3401 3402 /** 3403 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3404 * 3405 * @adev: amdgpu_device pointer 3406 * 3407 * Main suspend function for hardware IPs. The list of all the hardware 3408 * IPs that make up the asic is walked, clockgating is disabled and the 3409 * suspend callbacks are run. suspend puts the hardware and software state 3410 * in each IP into a state suitable for suspend. 3411 * Returns 0 on success, negative error code on failure. 3412 */ 3413 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3414 { 3415 int i, r; 3416 3417 if (adev->in_s0ix) 3418 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3419 3420 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3421 if (!adev->ip_blocks[i].status.valid) 3422 continue; 3423 /* displays are handled in phase1 */ 3424 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3425 continue; 3426 /* PSP lost connection when err_event_athub occurs */ 3427 if (amdgpu_ras_intr_triggered() && 3428 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3429 adev->ip_blocks[i].status.hw = false; 3430 continue; 3431 } 3432 3433 /* skip unnecessary suspend if we do not initialize them yet */ 3434 if (adev->gmc.xgmi.pending_reset && 3435 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3436 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3437 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3438 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3439 adev->ip_blocks[i].status.hw = false; 3440 continue; 3441 } 3442 3443 /* skip suspend of gfx/mes and psp for S0ix 3444 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3445 * like at runtime. PSP is also part of the always on hardware 3446 * so no need to suspend it. 3447 */ 3448 if (adev->in_s0ix && 3449 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3450 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3451 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3452 continue; 3453 3454 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3455 if (adev->in_s0ix && 3456 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3457 IP_VERSION(5, 0, 0)) && 3458 (adev->ip_blocks[i].version->type == 3459 AMD_IP_BLOCK_TYPE_SDMA)) 3460 continue; 3461 3462 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3463 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3464 * from this location and RLC Autoload automatically also gets loaded 3465 * from here based on PMFW -> PSP message during re-init sequence. 3466 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3467 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3468 */ 3469 if (amdgpu_in_reset(adev) && 3470 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3471 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3472 continue; 3473 3474 /* XXX handle errors */ 3475 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3476 /* XXX handle errors */ 3477 if (r) { 3478 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3479 adev->ip_blocks[i].version->funcs->name, r); 3480 } 3481 adev->ip_blocks[i].status.hw = false; 3482 /* handle putting the SMC in the appropriate state */ 3483 if (!amdgpu_sriov_vf(adev)) { 3484 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3485 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3486 if (r) { 3487 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3488 adev->mp1_state, r); 3489 return r; 3490 } 3491 } 3492 } 3493 } 3494 3495 return 0; 3496 } 3497 3498 /** 3499 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3500 * 3501 * @adev: amdgpu_device pointer 3502 * 3503 * Main suspend function for hardware IPs. The list of all the hardware 3504 * IPs that make up the asic is walked, clockgating is disabled and the 3505 * suspend callbacks are run. suspend puts the hardware and software state 3506 * in each IP into a state suitable for suspend. 3507 * Returns 0 on success, negative error code on failure. 3508 */ 3509 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3510 { 3511 int r; 3512 3513 if (amdgpu_sriov_vf(adev)) { 3514 amdgpu_virt_fini_data_exchange(adev); 3515 amdgpu_virt_request_full_gpu(adev, false); 3516 } 3517 3518 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3519 3520 r = amdgpu_device_ip_suspend_phase1(adev); 3521 if (r) 3522 return r; 3523 r = amdgpu_device_ip_suspend_phase2(adev); 3524 3525 if (amdgpu_sriov_vf(adev)) 3526 amdgpu_virt_release_full_gpu(adev, false); 3527 3528 return r; 3529 } 3530 3531 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3532 { 3533 int i, r; 3534 3535 static enum amd_ip_block_type ip_order[] = { 3536 AMD_IP_BLOCK_TYPE_COMMON, 3537 AMD_IP_BLOCK_TYPE_GMC, 3538 AMD_IP_BLOCK_TYPE_PSP, 3539 AMD_IP_BLOCK_TYPE_IH, 3540 }; 3541 3542 for (i = 0; i < adev->num_ip_blocks; i++) { 3543 int j; 3544 struct amdgpu_ip_block *block; 3545 3546 block = &adev->ip_blocks[i]; 3547 block->status.hw = false; 3548 3549 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3550 3551 if (block->version->type != ip_order[j] || 3552 !block->status.valid) 3553 continue; 3554 3555 r = block->version->funcs->hw_init(adev); 3556 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3557 if (r) 3558 return r; 3559 block->status.hw = true; 3560 } 3561 } 3562 3563 return 0; 3564 } 3565 3566 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3567 { 3568 int i, r; 3569 3570 static enum amd_ip_block_type ip_order[] = { 3571 AMD_IP_BLOCK_TYPE_SMC, 3572 AMD_IP_BLOCK_TYPE_DCE, 3573 AMD_IP_BLOCK_TYPE_GFX, 3574 AMD_IP_BLOCK_TYPE_SDMA, 3575 AMD_IP_BLOCK_TYPE_MES, 3576 AMD_IP_BLOCK_TYPE_UVD, 3577 AMD_IP_BLOCK_TYPE_VCE, 3578 AMD_IP_BLOCK_TYPE_VCN, 3579 AMD_IP_BLOCK_TYPE_JPEG 3580 }; 3581 3582 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3583 int j; 3584 struct amdgpu_ip_block *block; 3585 3586 for (j = 0; j < adev->num_ip_blocks; j++) { 3587 block = &adev->ip_blocks[j]; 3588 3589 if (block->version->type != ip_order[i] || 3590 !block->status.valid || 3591 block->status.hw) 3592 continue; 3593 3594 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3595 r = block->version->funcs->resume(adev); 3596 else 
				r = block->version->funcs->hw_init(adev);

			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
			if (r)
				return r;
			block->status.hw = true;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * First resume function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the resume callbacks are run for
 * COMMON, GMC, and IH. resume puts the hardware into a functional state
 * after a suspend and updates the software state as necessary. This
 * function is also used for restoring the GPU after a GPU reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {

			r = adev->ip_blocks[i].version->funcs->resume(adev);
			if (r) {
				DRM_ERROR("resume of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
			adev->ip_blocks[i].status.hw = true;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Second resume function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the resume callbacks are run for
 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
 * functional state after a suspend and updates the software state as
 * necessary. This function is also used for restoring the GPU after a GPU
 * reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
			continue;
		r = adev->ip_blocks[i].version->funcs->resume(adev);
		if (r) {
			DRM_ERROR("resume of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
			return r;
		}
		adev->ip_blocks[i].status.hw = true;
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs. The hardware IPs
 * are split into two resume functions because they are
 * also used in recovering from a GPU reset and some additional
 * steps need to be taken between them. In this case (S3/S4) they are
 * run sequentially.
 * Returns 0 on success, negative error code on failure.
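 *
 * Sketch of the sequence below:
 *
 *	amdgpu_device_ip_resume_phase1(adev); // COMMON, GMC, IH (+ PSP on SR-IOV)
 *	amdgpu_device_fw_loading(adev);       // reload PSP/SMU firmware
 *	amdgpu_device_ip_resume_phase2(adev); // all remaining IP blocks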
3694 */ 3695 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3696 { 3697 int r; 3698 3699 r = amdgpu_device_ip_resume_phase1(adev); 3700 if (r) 3701 return r; 3702 3703 r = amdgpu_device_fw_loading(adev); 3704 if (r) 3705 return r; 3706 3707 r = amdgpu_device_ip_resume_phase2(adev); 3708 3709 if (adev->mman.buffer_funcs_ring->sched.ready) 3710 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3711 3712 return r; 3713 } 3714 3715 /** 3716 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3717 * 3718 * @adev: amdgpu_device pointer 3719 * 3720 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3721 */ 3722 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3723 { 3724 if (amdgpu_sriov_vf(adev)) { 3725 if (adev->is_atom_fw) { 3726 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3727 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3728 } else { 3729 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3730 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3731 } 3732 3733 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3734 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3735 } 3736 } 3737 3738 /** 3739 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3740 * 3741 * @asic_type: AMD asic type 3742 * 3743 * Check if there is DC (new modesetting infrastructre) support for an asic. 3744 * returns true if DC has support, false if not. 3745 */ 3746 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3747 { 3748 switch (asic_type) { 3749 #ifdef CONFIG_DRM_AMDGPU_SI 3750 case CHIP_HAINAN: 3751 #endif 3752 case CHIP_TOPAZ: 3753 /* chips with no display hardware */ 3754 return false; 3755 #if defined(CONFIG_DRM_AMD_DC) 3756 case CHIP_TAHITI: 3757 case CHIP_PITCAIRN: 3758 case CHIP_VERDE: 3759 case CHIP_OLAND: 3760 /* 3761 * We have systems in the wild with these ASICs that require 3762 * LVDS and VGA support which is not supported with DC. 3763 * 3764 * Fallback to the non-DC driver here by default so as not to 3765 * cause regressions. 3766 */ 3767 #if defined(CONFIG_DRM_AMD_DC_SI) 3768 return amdgpu_dc > 0; 3769 #else 3770 return false; 3771 #endif 3772 case CHIP_BONAIRE: 3773 case CHIP_KAVERI: 3774 case CHIP_KABINI: 3775 case CHIP_MULLINS: 3776 /* 3777 * We have systems in the wild with these ASICs that require 3778 * VGA support which is not supported with DC. 3779 * 3780 * Fallback to the non-DC driver here by default so as not to 3781 * cause regressions. 
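 * (This is why the check below is "amdgpu_dc > 0", i.e. DC is used on
 * these chips only when explicitly requested with amdgpu.dc=1, unlike the
 * default case further down which uses "amdgpu_dc != 0".)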
3782 */ 3783 return amdgpu_dc > 0; 3784 default: 3785 return amdgpu_dc != 0; 3786 #else 3787 default: 3788 if (amdgpu_dc > 0) 3789 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3790 return false; 3791 #endif 3792 } 3793 } 3794 3795 /** 3796 * amdgpu_device_has_dc_support - check if dc is supported 3797 * 3798 * @adev: amdgpu_device pointer 3799 * 3800 * Returns true for supported, false for not supported 3801 */ 3802 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3803 { 3804 if (adev->enable_virtual_display || 3805 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3806 return false; 3807 3808 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3809 } 3810 3811 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3812 { 3813 struct amdgpu_device *adev = 3814 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3815 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3816 3817 /* It's a bug to not have a hive within this function */ 3818 if (WARN_ON(!hive)) 3819 return; 3820 3821 /* 3822 * Use task barrier to synchronize all xgmi reset works across the 3823 * hive. task_barrier_enter and task_barrier_exit will block 3824 * until all the threads running the xgmi reset works reach 3825 * those points. task_barrier_full will do both blocks. 3826 */ 3827 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3828 3829 task_barrier_enter(&hive->tb); 3830 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3831 3832 if (adev->asic_reset_res) 3833 goto fail; 3834 3835 task_barrier_exit(&hive->tb); 3836 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3837 3838 if (adev->asic_reset_res) 3839 goto fail; 3840 3841 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3842 } else { 3843 3844 task_barrier_full(&hive->tb); 3845 adev->asic_reset_res = amdgpu_asic_reset(adev); 3846 } 3847 3848 fail: 3849 if (adev->asic_reset_res) 3850 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3851 adev->asic_reset_res, adev_to_drm(adev)->unique); 3852 amdgpu_put_xgmi_hive(hive); 3853 } 3854 3855 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3856 { 3857 char *input = amdgpu_lockup_timeout; 3858 char *timeout_setting = NULL; 3859 int index = 0; 3860 long timeout; 3861 int ret = 0; 3862 3863 /* 3864 * By default timeout for non compute jobs is 10000 3865 * and 60000 for compute jobs. 3866 * In SR-IOV or passthrough mode, timeout for compute 3867 * jobs are 60000 by default. 3868 */ 3869 adev->gfx_timeout = msecs_to_jiffies(10000); 3870 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3871 if (amdgpu_sriov_vf(adev)) 3872 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3873 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3874 else 3875 adev->compute_timeout = msecs_to_jiffies(60000); 3876 3877 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3878 while ((timeout_setting = strsep(&input, ",")) && 3879 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3880 ret = kstrtol(timeout_setting, 0, &timeout); 3881 if (ret) 3882 return ret; 3883 3884 if (timeout == 0) { 3885 index++; 3886 continue; 3887 } else if (timeout < 0) { 3888 timeout = MAX_SCHEDULE_TIMEOUT; 3889 dev_warn(adev->dev, "lockup timeout disabled"); 3890 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3891 } else { 3892 timeout = msecs_to_jiffies(timeout); 3893 } 3894 3895 switch (index++) { 3896 case 0: 3897 adev->gfx_timeout = timeout; 3898 break; 3899 case 1: 3900 adev->compute_timeout = timeout; 3901 break; 3902 case 2: 3903 adev->sdma_timeout = timeout; 3904 break; 3905 case 3: 3906 adev->video_timeout = timeout; 3907 break; 3908 default: 3909 break; 3910 } 3911 } 3912 /* 3913 * There is only one value specified and 3914 * it should apply to all non-compute jobs. 3915 */ 3916 if (index == 1) { 3917 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3918 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3919 adev->compute_timeout = adev->gfx_timeout; 3920 } 3921 } 3922 3923 return ret; 3924 } 3925 3926 /** 3927 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3928 * 3929 * @adev: amdgpu_device pointer 3930 * 3931 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3932 */ 3933 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3934 { 3935 struct iommu_domain *domain; 3936 3937 domain = iommu_get_domain_for_dev(adev->dev); 3938 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3939 adev->ram_is_direct_mapped = true; 3940 } 3941 3942 static const struct attribute *amdgpu_dev_attributes[] = { 3943 &dev_attr_pcie_replay_count.attr, 3944 NULL 3945 }; 3946 3947 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3948 { 3949 if (amdgpu_mcbp == 1) 3950 adev->gfx.mcbp = true; 3951 else if (amdgpu_mcbp == 0) 3952 adev->gfx.mcbp = false; 3953 3954 if (amdgpu_sriov_vf(adev)) 3955 adev->gfx.mcbp = true; 3956 3957 if (adev->gfx.mcbp) 3958 DRM_INFO("MCBP is enabled\n"); 3959 } 3960 3961 /** 3962 * amdgpu_device_init - initialize the driver 3963 * 3964 * @adev: amdgpu_device pointer 3965 * @flags: driver flags 3966 * 3967 * Initializes the driver info and hw (all asics). 3968 * Returns 0 for success or an error on failure. 3969 * Called at driver startup. 
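 *
 * The @flags value is the driver_data taken from the PCI ID table (the
 * ASIC type in AMD_ASIC_MASK plus flags such as AMD_IS_APU). A hedged
 * usage sketch of the call made from the KMS load path, with only
 * illustrative error handling:
 *
 *	r = amdgpu_device_init(adev, flags);
 *	if (r)
 *		return r;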
3970 */ 3971 int amdgpu_device_init(struct amdgpu_device *adev, 3972 uint32_t flags) 3973 { 3974 struct drm_device *ddev = adev_to_drm(adev); 3975 struct pci_dev *pdev = adev->pdev; 3976 int r, i; 3977 bool px = false; 3978 u32 max_MBps; 3979 int tmp; 3980 3981 adev->shutdown = false; 3982 adev->flags = flags; 3983 3984 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3985 adev->asic_type = amdgpu_force_asic_type; 3986 else 3987 adev->asic_type = flags & AMD_ASIC_MASK; 3988 3989 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3990 if (amdgpu_emu_mode == 1) 3991 adev->usec_timeout *= 10; 3992 adev->gmc.gart_size = 512 * 1024 * 1024; 3993 adev->accel_working = false; 3994 adev->num_rings = 0; 3995 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3996 adev->mman.buffer_funcs = NULL; 3997 adev->mman.buffer_funcs_ring = NULL; 3998 adev->vm_manager.vm_pte_funcs = NULL; 3999 adev->vm_manager.vm_pte_num_scheds = 0; 4000 adev->gmc.gmc_funcs = NULL; 4001 adev->harvest_ip_mask = 0x0; 4002 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4003 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4004 4005 adev->smc_rreg = &amdgpu_invalid_rreg; 4006 adev->smc_wreg = &amdgpu_invalid_wreg; 4007 adev->pcie_rreg = &amdgpu_invalid_rreg; 4008 adev->pcie_wreg = &amdgpu_invalid_wreg; 4009 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4010 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4011 adev->pciep_rreg = &amdgpu_invalid_rreg; 4012 adev->pciep_wreg = &amdgpu_invalid_wreg; 4013 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4014 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4015 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4016 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4017 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4018 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4019 adev->didt_rreg = &amdgpu_invalid_rreg; 4020 adev->didt_wreg = &amdgpu_invalid_wreg; 4021 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4022 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4023 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4024 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4025 4026 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4027 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4028 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4029 4030 /* mutex initialization are all done here so we 4031 * can recall function without having locking issues 4032 */ 4033 mutex_init(&adev->firmware.mutex); 4034 mutex_init(&adev->pm.mutex); 4035 mutex_init(&adev->gfx.gpu_clock_mutex); 4036 mutex_init(&adev->srbm_mutex); 4037 mutex_init(&adev->gfx.pipe_reserve_mutex); 4038 mutex_init(&adev->gfx.gfx_off_mutex); 4039 mutex_init(&adev->gfx.partition_mutex); 4040 mutex_init(&adev->grbm_idx_mutex); 4041 mutex_init(&adev->mn_lock); 4042 mutex_init(&adev->virt.vf_errors.lock); 4043 hash_init(adev->mn_hash); 4044 mutex_init(&adev->psp.mutex); 4045 mutex_init(&adev->notifier_lock); 4046 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4047 mutex_init(&adev->benchmark_mutex); 4048 4049 amdgpu_device_init_apu_flags(adev); 4050 4051 r = amdgpu_device_check_arguments(adev); 4052 if (r) 4053 return r; 4054 4055 spin_lock_init(&adev->mmio_idx_lock); 4056 spin_lock_init(&adev->smc_idx_lock); 4057 spin_lock_init(&adev->pcie_idx_lock); 4058 spin_lock_init(&adev->uvd_ctx_idx_lock); 4059 spin_lock_init(&adev->didt_idx_lock); 4060 spin_lock_init(&adev->gc_cac_idx_lock); 4061 spin_lock_init(&adev->se_cac_idx_lock); 
4062 spin_lock_init(&adev->audio_endpt_idx_lock);
4063 spin_lock_init(&adev->mm_stats.lock);
4064
4065 INIT_LIST_HEAD(&adev->shadow_list);
4066 mutex_init(&adev->shadow_list_lock);
4067
4068 INIT_LIST_HEAD(&adev->reset_list);
4069
4070 INIT_LIST_HEAD(&adev->ras_list);
4071
4072 INIT_LIST_HEAD(&adev->pm.od_kobj_list);
4073
4074 INIT_DELAYED_WORK(&adev->delayed_init_work,
4075 amdgpu_device_delayed_init_work_handler);
4076 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
4077 amdgpu_device_delay_enable_gfx_off);
4078
4079 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
4080
4081 adev->gfx.gfx_off_req_count = 1;
4082 adev->gfx.gfx_off_residency = 0;
4083 adev->gfx.gfx_off_entrycount = 0;
4084 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
4085
4086 atomic_set(&adev->throttling_logging_enabled, 1);
4087 /*
4088 * If throttling continues, logging will be performed every minute
4089 * to avoid log flooding. "-1" is subtracted since the thermal
4090 * throttling interrupt comes every second. Thus, the total logging
4091 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
4092 * for the throttling interrupt) = 60 seconds.
4093 */
4094 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
4095 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
4096
4097 /* Register mapping */
4098 /* TODO: block userspace mapping of io register */
4099 if (adev->asic_type >= CHIP_BONAIRE) {
4100 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
4101 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
4102 } else {
4103 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
4104 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
4105 }
4106
4107 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
4108 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
4109
4110 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
4111 if (!adev->rmmio)
4112 return -ENOMEM;
4113
4114 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
4115 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
4116
4117 /*
4118 * The reset domain needs to be present early, before any XGMI hive is
4119 * discovered and initialized, so that the reset semaphore and in-GPU-reset
4120 * flag can be used early during init and before any call to RREG32.
4121 */
4122 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
4123 if (!adev->reset_domain)
4124 return -ENOMEM;
4125
4126 /* detect hw virtualization here */
4127 amdgpu_detect_virtualization(adev);
4128
4129 amdgpu_device_get_pcie_info(adev);
4130
4131 r = amdgpu_device_get_job_timeout_settings(adev);
4132 if (r) {
4133 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4134 return r;
4135 }
4136
4137 amdgpu_device_set_mcbp(adev);
4138
4139 /* early init functions */
4140 r = amdgpu_device_ip_early_init(adev);
4141 if (r)
4142 return r;
4143
4144 /* Get rid of things like offb */
4145 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
4146 if (r)
4147 return r;
4148
4149 /* Enable TMZ based on IP_VERSION */
4150 amdgpu_gmc_tmz_set(adev);
4151
4152 if (amdgpu_sriov_vf(adev) &&
4153 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
4154 /* VF MMIO access (except for the mailbox range) from the CPU
4155 * will be blocked during SR-IOV runtime
4156 */
4157 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
4158
4159 amdgpu_gmc_noretry_set(adev);
4160 /* Need to get xgmi info early to decide the reset behavior */
4161 if (adev->gmc.xgmi.supported) {
4162 r = adev->gfxhub.funcs->get_xgmi_info(adev);
4163 if (r)
4164 return r;
4165 }
4166
4167 /* enable PCIe atomic ops */
4168 if (amdgpu_sriov_vf(adev)) {
4169 if (adev->virt.fw_reserve.p_pf2vf)
4170 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
4171 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
4172 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4173 /* APUs with gfx9 and newer don't rely on PCIe atomics; rather, the
4174 * internal path natively supports atomics, so set have_atomics_support to true.
4175 */
4176 } else if ((adev->flags & AMD_IS_APU) &&
4177 (amdgpu_ip_version(adev, GC_HWIP, 0) >
4178 IP_VERSION(9, 0, 0))) {
4179 adev->have_atomics_support = true;
4180 } else {
4181 adev->have_atomics_support =
4182 !pci_enable_atomic_ops_to_root(adev->pdev,
4183 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
4184 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4185 }
4186
4187 if (!adev->have_atomics_support)
4188 dev_info(adev->dev, "PCIe atomic ops are not supported\n");
4189
4190 /* doorbell bar mapping and doorbell index init */
4191 amdgpu_doorbell_init(adev);
4192
4193 if (amdgpu_emu_mode == 1) {
4194 /* post the asic on emulation mode */
4195 emu_soc_asic_init(adev);
4196 goto fence_driver_init;
4197 }
4198
4199 amdgpu_reset_init(adev);
4200
4201 /* detect whether we are running with an SR-IOV vbios */
4202 if (adev->bios)
4203 amdgpu_device_detect_sriov_bios(adev);
4204
4205 /* check if we need to reset the asic
4206 * E.g., the driver was not cleanly unloaded previously, etc.
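 * This can happen after a driver reload or when a kexec/kdump kernel takes
 * over while the SMU/PSP still hold state from the previous instance; the
 * reset below returns the ASIC to a known state before it is posted again.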
4207 */ 4208 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4209 if (adev->gmc.xgmi.num_physical_nodes) { 4210 dev_info(adev->dev, "Pending hive reset.\n"); 4211 adev->gmc.xgmi.pending_reset = true; 4212 /* Only need to init necessary block for SMU to handle the reset */ 4213 for (i = 0; i < adev->num_ip_blocks; i++) { 4214 if (!adev->ip_blocks[i].status.valid) 4215 continue; 4216 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4217 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4218 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4219 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 4220 DRM_DEBUG("IP %s disabled for hw_init.\n", 4221 adev->ip_blocks[i].version->funcs->name); 4222 adev->ip_blocks[i].status.hw = true; 4223 } 4224 } 4225 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4226 !amdgpu_device_has_display_hardware(adev)) { 4227 r = psp_gpu_reset(adev); 4228 } else { 4229 tmp = amdgpu_reset_method; 4230 /* It should do a default reset when loading or reloading the driver, 4231 * regardless of the module parameter reset_method. 4232 */ 4233 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4234 r = amdgpu_asic_reset(adev); 4235 amdgpu_reset_method = tmp; 4236 } 4237 4238 if (r) { 4239 dev_err(adev->dev, "asic reset on init failed\n"); 4240 goto failed; 4241 } 4242 } 4243 4244 /* Post card if necessary */ 4245 if (amdgpu_device_need_post(adev)) { 4246 if (!adev->bios) { 4247 dev_err(adev->dev, "no vBIOS found\n"); 4248 r = -EINVAL; 4249 goto failed; 4250 } 4251 DRM_INFO("GPU posting now...\n"); 4252 r = amdgpu_device_asic_init(adev); 4253 if (r) { 4254 dev_err(adev->dev, "gpu post error!\n"); 4255 goto failed; 4256 } 4257 } 4258 4259 if (adev->bios) { 4260 if (adev->is_atom_fw) { 4261 /* Initialize clocks */ 4262 r = amdgpu_atomfirmware_get_clock_info(adev); 4263 if (r) { 4264 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4265 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4266 goto failed; 4267 } 4268 } else { 4269 /* Initialize clocks */ 4270 r = amdgpu_atombios_get_clock_info(adev); 4271 if (r) { 4272 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4273 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4274 goto failed; 4275 } 4276 /* init i2c buses */ 4277 if (!amdgpu_device_has_dc_support(adev)) 4278 amdgpu_atombios_i2c_init(adev); 4279 } 4280 } 4281 4282 fence_driver_init: 4283 /* Fence driver */ 4284 r = amdgpu_fence_driver_sw_init(adev); 4285 if (r) { 4286 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4287 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4288 goto failed; 4289 } 4290 4291 /* init the mode config */ 4292 drm_mode_config_init(adev_to_drm(adev)); 4293 4294 r = amdgpu_device_ip_init(adev); 4295 if (r) { 4296 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4297 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4298 goto release_ras_con; 4299 } 4300 4301 amdgpu_fence_driver_hw_init(adev); 4302 4303 dev_info(adev->dev, 4304 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4305 adev->gfx.config.max_shader_engines, 4306 adev->gfx.config.max_sh_per_se, 4307 adev->gfx.config.max_cu_per_sh, 4308 adev->gfx.cu_info.number); 4309 4310 adev->accel_working = true; 4311 4312 amdgpu_vm_check_compute_bug(adev); 4313 4314 /* Initialize the buffer migration limit. 
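 * Worked example of the math below (plain arithmetic): amdgpu_moverate=32
 * gives max_MBps = 32 and log2_max_MBps = ilog2(32) = 5; the default of
 * 8 MB/s gives ilog2(8) = 3. Storing the log2 lets users divide by the
 * rate with a cheap shift.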
*/ 4315 if (amdgpu_moverate >= 0) 4316 max_MBps = amdgpu_moverate; 4317 else 4318 max_MBps = 8; /* Allow 8 MB/s. */ 4319 /* Get a log2 for easy divisions. */ 4320 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4321 4322 /* 4323 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4324 * Otherwise the mgpu fan boost feature will be skipped due to the 4325 * gpu instance is counted less. 4326 */ 4327 amdgpu_register_gpu_instance(adev); 4328 4329 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4330 * explicit gating rather than handling it automatically. 4331 */ 4332 if (!adev->gmc.xgmi.pending_reset) { 4333 r = amdgpu_device_ip_late_init(adev); 4334 if (r) { 4335 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4336 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4337 goto release_ras_con; 4338 } 4339 /* must succeed. */ 4340 amdgpu_ras_resume(adev); 4341 queue_delayed_work(system_wq, &adev->delayed_init_work, 4342 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4343 } 4344 4345 if (amdgpu_sriov_vf(adev)) { 4346 amdgpu_virt_release_full_gpu(adev, true); 4347 flush_delayed_work(&adev->delayed_init_work); 4348 } 4349 4350 /* 4351 * Place those sysfs registering after `late_init`. As some of those 4352 * operations performed in `late_init` might affect the sysfs 4353 * interfaces creating. 4354 */ 4355 r = amdgpu_atombios_sysfs_init(adev); 4356 if (r) 4357 drm_err(&adev->ddev, 4358 "registering atombios sysfs failed (%d).\n", r); 4359 4360 r = amdgpu_pm_sysfs_init(adev); 4361 if (r) 4362 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4363 4364 r = amdgpu_ucode_sysfs_init(adev); 4365 if (r) { 4366 adev->ucode_sysfs_en = false; 4367 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4368 } else 4369 adev->ucode_sysfs_en = true; 4370 4371 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4372 if (r) 4373 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4374 4375 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4376 if (r) 4377 dev_err(adev->dev, 4378 "Could not create amdgpu board attributes\n"); 4379 4380 amdgpu_fru_sysfs_init(adev); 4381 amdgpu_reg_state_sysfs_init(adev); 4382 4383 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4384 r = amdgpu_pmu_init(adev); 4385 if (r) 4386 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4387 4388 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4389 if (amdgpu_device_cache_pci_state(adev->pdev)) 4390 pci_restore_state(pdev); 4391 4392 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4393 /* this will fail for cards that aren't VGA class devices, just 4394 * ignore it 4395 */ 4396 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4397 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4398 4399 px = amdgpu_device_supports_px(ddev); 4400 4401 if (px || (!dev_is_removable(&adev->pdev->dev) && 4402 apple_gmux_detect(NULL, NULL))) 4403 vga_switcheroo_register_client(adev->pdev, 4404 &amdgpu_switcheroo_ops, px); 4405 4406 if (px) 4407 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4408 4409 if (adev->gmc.xgmi.pending_reset) 4410 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4411 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4412 4413 amdgpu_device_check_iommu_direct_map(adev); 4414 4415 return 0; 4416 4417 release_ras_con: 4418 if (amdgpu_sriov_vf(adev)) 4419 amdgpu_virt_release_full_gpu(adev, true); 4420 4421 /* failed in exclusive mode due to timeout */ 4422 if 
(amdgpu_sriov_vf(adev) && 4423 !amdgpu_sriov_runtime(adev) && 4424 amdgpu_virt_mmio_blocked(adev) && 4425 !amdgpu_virt_wait_reset(adev)) { 4426 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4427 /* Don't send request since VF is inactive. */ 4428 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4429 adev->virt.ops = NULL; 4430 r = -EAGAIN; 4431 } 4432 amdgpu_release_ras_context(adev); 4433 4434 failed: 4435 amdgpu_vf_error_trans_all(adev); 4436 4437 return r; 4438 } 4439 4440 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4441 { 4442 4443 /* Clear all CPU mappings pointing to this device */ 4444 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4445 4446 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4447 amdgpu_doorbell_fini(adev); 4448 4449 iounmap(adev->rmmio); 4450 adev->rmmio = NULL; 4451 if (adev->mman.aper_base_kaddr) 4452 iounmap(adev->mman.aper_base_kaddr); 4453 adev->mman.aper_base_kaddr = NULL; 4454 4455 /* Memory manager related */ 4456 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4457 arch_phys_wc_del(adev->gmc.vram_mtrr); 4458 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4459 } 4460 } 4461 4462 /** 4463 * amdgpu_device_fini_hw - tear down the driver 4464 * 4465 * @adev: amdgpu_device pointer 4466 * 4467 * Tear down the driver info (all asics). 4468 * Called at driver shutdown. 4469 */ 4470 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4471 { 4472 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4473 flush_delayed_work(&adev->delayed_init_work); 4474 adev->shutdown = true; 4475 4476 /* make sure IB test finished before entering exclusive mode 4477 * to avoid preemption on IB test 4478 */ 4479 if (amdgpu_sriov_vf(adev)) { 4480 amdgpu_virt_request_full_gpu(adev, false); 4481 amdgpu_virt_fini_data_exchange(adev); 4482 } 4483 4484 /* disable all interrupts */ 4485 amdgpu_irq_disable_all(adev); 4486 if (adev->mode_info.mode_config_initialized) { 4487 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4488 drm_helper_force_disable_all(adev_to_drm(adev)); 4489 else 4490 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4491 } 4492 amdgpu_fence_driver_hw_fini(adev); 4493 4494 if (adev->mman.initialized) 4495 drain_workqueue(adev->mman.bdev.wq); 4496 4497 if (adev->pm.sysfs_initialized) 4498 amdgpu_pm_sysfs_fini(adev); 4499 if (adev->ucode_sysfs_en) 4500 amdgpu_ucode_sysfs_fini(adev); 4501 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4502 amdgpu_fru_sysfs_fini(adev); 4503 4504 amdgpu_reg_state_sysfs_fini(adev); 4505 4506 /* disable ras feature must before hw fini */ 4507 amdgpu_ras_pre_fini(adev); 4508 4509 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4510 4511 amdgpu_device_ip_fini_early(adev); 4512 4513 amdgpu_irq_fini_hw(adev); 4514 4515 if (adev->mman.initialized) 4516 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4517 4518 amdgpu_gart_dummy_page_fini(adev); 4519 4520 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4521 amdgpu_device_unmap_mmio(adev); 4522 4523 } 4524 4525 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4526 { 4527 int idx; 4528 bool px; 4529 4530 amdgpu_fence_driver_sw_fini(adev); 4531 amdgpu_device_ip_fini(adev); 4532 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4533 adev->accel_working = false; 4534 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4535 4536 amdgpu_reset_fini(adev); 4537 4538 /* free i2c buses */ 4539 if (!amdgpu_device_has_dc_support(adev)) 4540 amdgpu_i2c_fini(adev); 4541 4542 if (amdgpu_emu_mode 
!= 1) 4543 amdgpu_atombios_fini(adev); 4544 4545 kfree(adev->bios); 4546 adev->bios = NULL; 4547 4548 kfree(adev->fru_info); 4549 adev->fru_info = NULL; 4550 4551 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4552 4553 if (px || (!dev_is_removable(&adev->pdev->dev) && 4554 apple_gmux_detect(NULL, NULL))) 4555 vga_switcheroo_unregister_client(adev->pdev); 4556 4557 if (px) 4558 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4559 4560 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4561 vga_client_unregister(adev->pdev); 4562 4563 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4564 4565 iounmap(adev->rmmio); 4566 adev->rmmio = NULL; 4567 amdgpu_doorbell_fini(adev); 4568 drm_dev_exit(idx); 4569 } 4570 4571 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4572 amdgpu_pmu_fini(adev); 4573 if (adev->mman.discovery_bin) 4574 amdgpu_discovery_fini(adev); 4575 4576 amdgpu_reset_put_reset_domain(adev->reset_domain); 4577 adev->reset_domain = NULL; 4578 4579 kfree(adev->pci_state); 4580 4581 } 4582 4583 /** 4584 * amdgpu_device_evict_resources - evict device resources 4585 * @adev: amdgpu device object 4586 * 4587 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4588 * of the vram memory type. Mainly used for evicting device resources 4589 * at suspend time. 4590 * 4591 */ 4592 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4593 { 4594 int ret; 4595 4596 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4597 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4598 return 0; 4599 4600 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4601 if (ret) 4602 DRM_WARN("evicting device resources failed\n"); 4603 return ret; 4604 } 4605 4606 /* 4607 * Suspend & resume. 4608 */ 4609 /** 4610 * amdgpu_device_prepare - prepare for device suspend 4611 * 4612 * @dev: drm dev pointer 4613 * 4614 * Prepare to put the hw in the suspend state (all asics). 4615 * Returns 0 for success or an error on failure. 4616 * Called at driver suspend. 4617 */ 4618 int amdgpu_device_prepare(struct drm_device *dev) 4619 { 4620 struct amdgpu_device *adev = drm_to_adev(dev); 4621 int i, r; 4622 4623 amdgpu_choose_low_power_state(adev); 4624 4625 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4626 return 0; 4627 4628 /* Evict the majority of BOs before starting suspend sequence */ 4629 r = amdgpu_device_evict_resources(adev); 4630 if (r) 4631 goto unprepare; 4632 4633 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4634 4635 for (i = 0; i < adev->num_ip_blocks; i++) { 4636 if (!adev->ip_blocks[i].status.valid) 4637 continue; 4638 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4639 continue; 4640 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev); 4641 if (r) 4642 goto unprepare; 4643 } 4644 4645 return 0; 4646 4647 unprepare: 4648 adev->in_s0ix = adev->in_s3 = false; 4649 4650 return r; 4651 } 4652 4653 /** 4654 * amdgpu_device_suspend - initiate device suspend 4655 * 4656 * @dev: drm dev pointer 4657 * @fbcon : notify the fbdev of suspend 4658 * 4659 * Puts the hw in the suspend state (all asics). 4660 * Returns 0 for success or an error on failure. 4661 * Called at driver suspend. 
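 *
 * Hedged sketch of how the system sleep path is expected to chain the
 * helpers in this file (the real dev_pm_ops callbacks live in amdgpu_drv.c):
 *
 *	r = amdgpu_device_prepare(drm_dev);	// PM .prepare hook
 *	...
 *	r = amdgpu_device_suspend(drm_dev, true);	// notify fbcon of suspend
 *	...
 *	r = amdgpu_device_resume(drm_dev, true);	// on wakeup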
4662 */ 4663 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4664 { 4665 struct amdgpu_device *adev = drm_to_adev(dev); 4666 int r = 0; 4667 4668 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4669 return 0; 4670 4671 adev->in_suspend = true; 4672 4673 if (amdgpu_sriov_vf(adev)) { 4674 amdgpu_virt_fini_data_exchange(adev); 4675 r = amdgpu_virt_request_full_gpu(adev, false); 4676 if (r) 4677 return r; 4678 } 4679 4680 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4681 DRM_WARN("smart shift update failed\n"); 4682 4683 if (fbcon) 4684 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4685 4686 cancel_delayed_work_sync(&adev->delayed_init_work); 4687 4688 amdgpu_ras_suspend(adev); 4689 4690 amdgpu_device_ip_suspend_phase1(adev); 4691 4692 if (!adev->in_s0ix) 4693 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4694 4695 r = amdgpu_device_evict_resources(adev); 4696 if (r) 4697 return r; 4698 4699 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4700 4701 amdgpu_fence_driver_hw_fini(adev); 4702 4703 amdgpu_device_ip_suspend_phase2(adev); 4704 4705 if (amdgpu_sriov_vf(adev)) 4706 amdgpu_virt_release_full_gpu(adev, false); 4707 4708 r = amdgpu_dpm_notify_rlc_state(adev, false); 4709 if (r) 4710 return r; 4711 4712 return 0; 4713 } 4714 4715 /** 4716 * amdgpu_device_resume - initiate device resume 4717 * 4718 * @dev: drm dev pointer 4719 * @fbcon : notify the fbdev of resume 4720 * 4721 * Bring the hw back to operating state (all asics). 4722 * Returns 0 for success or an error on failure. 4723 * Called at driver resume. 4724 */ 4725 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4726 { 4727 struct amdgpu_device *adev = drm_to_adev(dev); 4728 int r = 0; 4729 4730 if (amdgpu_sriov_vf(adev)) { 4731 r = amdgpu_virt_request_full_gpu(adev, true); 4732 if (r) 4733 return r; 4734 } 4735 4736 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4737 return 0; 4738 4739 if (adev->in_s0ix) 4740 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4741 4742 /* post card */ 4743 if (amdgpu_device_need_post(adev)) { 4744 r = amdgpu_device_asic_init(adev); 4745 if (r) 4746 dev_err(adev->dev, "amdgpu asic init failed\n"); 4747 } 4748 4749 r = amdgpu_device_ip_resume(adev); 4750 4751 if (r) { 4752 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4753 goto exit; 4754 } 4755 amdgpu_fence_driver_hw_init(adev); 4756 4757 if (!adev->in_s0ix) { 4758 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4759 if (r) 4760 goto exit; 4761 } 4762 4763 r = amdgpu_device_ip_late_init(adev); 4764 if (r) 4765 goto exit; 4766 4767 queue_delayed_work(system_wq, &adev->delayed_init_work, 4768 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4769 exit: 4770 if (amdgpu_sriov_vf(adev)) { 4771 amdgpu_virt_init_data_exchange(adev); 4772 amdgpu_virt_release_full_gpu(adev, true); 4773 } 4774 4775 if (r) 4776 return r; 4777 4778 /* Make sure IB tests flushed */ 4779 flush_delayed_work(&adev->delayed_init_work); 4780 4781 if (fbcon) 4782 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4783 4784 amdgpu_ras_resume(adev); 4785 4786 if (adev->mode_info.num_crtc) { 4787 /* 4788 * Most of the connector probing functions try to acquire runtime pm 4789 * refs to ensure that the GPU is powered on when connector polling is 4790 * performed. Since we're calling this from a runtime PM callback, 4791 * trying to acquire rpm refs will cause us to deadlock. 
4792 * 4793 * Since we're guaranteed to be holding the rpm lock, it's safe to 4794 * temporarily disable the rpm helpers so this doesn't deadlock us. 4795 */ 4796 #ifdef CONFIG_PM 4797 dev->dev->power.disable_depth++; 4798 #endif 4799 if (!adev->dc_enabled) 4800 drm_helper_hpd_irq_event(dev); 4801 else 4802 drm_kms_helper_hotplug_event(dev); 4803 #ifdef CONFIG_PM 4804 dev->dev->power.disable_depth--; 4805 #endif 4806 } 4807 adev->in_suspend = false; 4808 4809 if (adev->enable_mes) 4810 amdgpu_mes_self_test(adev); 4811 4812 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4813 DRM_WARN("smart shift update failed\n"); 4814 4815 return 0; 4816 } 4817 4818 /** 4819 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4820 * 4821 * @adev: amdgpu_device pointer 4822 * 4823 * The list of all the hardware IPs that make up the asic is walked and 4824 * the check_soft_reset callbacks are run. check_soft_reset determines 4825 * if the asic is still hung or not. 4826 * Returns true if any of the IPs are still in a hung state, false if not. 4827 */ 4828 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4829 { 4830 int i; 4831 bool asic_hang = false; 4832 4833 if (amdgpu_sriov_vf(adev)) 4834 return true; 4835 4836 if (amdgpu_asic_need_full_reset(adev)) 4837 return true; 4838 4839 for (i = 0; i < adev->num_ip_blocks; i++) { 4840 if (!adev->ip_blocks[i].status.valid) 4841 continue; 4842 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4843 adev->ip_blocks[i].status.hang = 4844 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4845 if (adev->ip_blocks[i].status.hang) { 4846 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4847 asic_hang = true; 4848 } 4849 } 4850 return asic_hang; 4851 } 4852 4853 /** 4854 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4855 * 4856 * @adev: amdgpu_device pointer 4857 * 4858 * The list of all the hardware IPs that make up the asic is walked and the 4859 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4860 * handles any IP specific hardware or software state changes that are 4861 * necessary for a soft reset to succeed. 4862 * Returns 0 on success, negative error code on failure. 4863 */ 4864 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4865 { 4866 int i, r = 0; 4867 4868 for (i = 0; i < adev->num_ip_blocks; i++) { 4869 if (!adev->ip_blocks[i].status.valid) 4870 continue; 4871 if (adev->ip_blocks[i].status.hang && 4872 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4873 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4874 if (r) 4875 return r; 4876 } 4877 } 4878 4879 return 0; 4880 } 4881 4882 /** 4883 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4884 * 4885 * @adev: amdgpu_device pointer 4886 * 4887 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4888 * reset is necessary to recover. 4889 * Returns true if a full asic reset is required, false if not. 
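 *
 * A hedged sketch of how this check combines with the soft-reset helpers
 * in this file (see amdgpu_device_pre_asic_reset() for the real logic):
 *
 *	if (!need_full_reset)
 *		need_full_reset = amdgpu_device_ip_need_full_reset(adev);
 *	if (!need_full_reset && amdgpu_gpu_recovery &&
 *	    amdgpu_device_ip_check_soft_reset(adev)) {
 *		amdgpu_device_ip_pre_soft_reset(adev);
 *		r = amdgpu_device_ip_soft_reset(adev);
 *		amdgpu_device_ip_post_soft_reset(adev);
 *		if (r || amdgpu_device_ip_check_soft_reset(adev))
 *			need_full_reset = true;	// fall back to a full reset
 *	}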
4890 */ 4891 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4892 { 4893 int i; 4894 4895 if (amdgpu_asic_need_full_reset(adev)) 4896 return true; 4897 4898 for (i = 0; i < adev->num_ip_blocks; i++) { 4899 if (!adev->ip_blocks[i].status.valid) 4900 continue; 4901 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4902 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4903 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4904 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4905 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4906 if (adev->ip_blocks[i].status.hang) { 4907 dev_info(adev->dev, "Some block need full reset!\n"); 4908 return true; 4909 } 4910 } 4911 } 4912 return false; 4913 } 4914 4915 /** 4916 * amdgpu_device_ip_soft_reset - do a soft reset 4917 * 4918 * @adev: amdgpu_device pointer 4919 * 4920 * The list of all the hardware IPs that make up the asic is walked and the 4921 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4922 * IP specific hardware or software state changes that are necessary to soft 4923 * reset the IP. 4924 * Returns 0 on success, negative error code on failure. 4925 */ 4926 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4927 { 4928 int i, r = 0; 4929 4930 for (i = 0; i < adev->num_ip_blocks; i++) { 4931 if (!adev->ip_blocks[i].status.valid) 4932 continue; 4933 if (adev->ip_blocks[i].status.hang && 4934 adev->ip_blocks[i].version->funcs->soft_reset) { 4935 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4936 if (r) 4937 return r; 4938 } 4939 } 4940 4941 return 0; 4942 } 4943 4944 /** 4945 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4946 * 4947 * @adev: amdgpu_device pointer 4948 * 4949 * The list of all the hardware IPs that make up the asic is walked and the 4950 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4951 * handles any IP specific hardware or software state changes that are 4952 * necessary after the IP has been soft reset. 4953 * Returns 0 on success, negative error code on failure. 4954 */ 4955 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4956 { 4957 int i, r = 0; 4958 4959 for (i = 0; i < adev->num_ip_blocks; i++) { 4960 if (!adev->ip_blocks[i].status.valid) 4961 continue; 4962 if (adev->ip_blocks[i].status.hang && 4963 adev->ip_blocks[i].version->funcs->post_soft_reset) 4964 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4965 if (r) 4966 return r; 4967 } 4968 4969 return 0; 4970 } 4971 4972 /** 4973 * amdgpu_device_recover_vram - Recover some VRAM contents 4974 * 4975 * @adev: amdgpu_device pointer 4976 * 4977 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4978 * restore things like GPUVM page tables after a GPU reset where 4979 * the contents of VRAM might be lost. 4980 * 4981 * Returns: 4982 * 0 on success, negative error code on failure. 
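 *
 * The restore below is pipelined: each shadow BO queues a GTT-to-VRAM copy
 * via amdgpu_bo_restore_shadow(), and while the next copy is queued the
 * fence of the previous one is waited on, so only one wait is outstanding
 * at a time (descriptive note; the loop below is authoritative).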
4983 */ 4984 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4985 { 4986 struct dma_fence *fence = NULL, *next = NULL; 4987 struct amdgpu_bo *shadow; 4988 struct amdgpu_bo_vm *vmbo; 4989 long r = 1, tmo; 4990 4991 if (amdgpu_sriov_runtime(adev)) 4992 tmo = msecs_to_jiffies(8000); 4993 else 4994 tmo = msecs_to_jiffies(100); 4995 4996 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4997 mutex_lock(&adev->shadow_list_lock); 4998 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4999 /* If vm is compute context or adev is APU, shadow will be NULL */ 5000 if (!vmbo->shadow) 5001 continue; 5002 shadow = vmbo->shadow; 5003 5004 /* No need to recover an evicted BO */ 5005 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 5006 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 5007 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 5008 continue; 5009 5010 r = amdgpu_bo_restore_shadow(shadow, &next); 5011 if (r) 5012 break; 5013 5014 if (fence) { 5015 tmo = dma_fence_wait_timeout(fence, false, tmo); 5016 dma_fence_put(fence); 5017 fence = next; 5018 if (tmo == 0) { 5019 r = -ETIMEDOUT; 5020 break; 5021 } else if (tmo < 0) { 5022 r = tmo; 5023 break; 5024 } 5025 } else { 5026 fence = next; 5027 } 5028 } 5029 mutex_unlock(&adev->shadow_list_lock); 5030 5031 if (fence) 5032 tmo = dma_fence_wait_timeout(fence, false, tmo); 5033 dma_fence_put(fence); 5034 5035 if (r < 0 || tmo <= 0) { 5036 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 5037 return -EIO; 5038 } 5039 5040 dev_info(adev->dev, "recover vram bo from shadow done\n"); 5041 return 0; 5042 } 5043 5044 5045 /** 5046 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5047 * 5048 * @adev: amdgpu_device pointer 5049 * @from_hypervisor: request from hypervisor 5050 * 5051 * do VF FLR and reinitialize Asic 5052 * return 0 means succeeded otherwise failed 5053 */ 5054 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5055 bool from_hypervisor) 5056 { 5057 int r; 5058 struct amdgpu_hive_info *hive = NULL; 5059 int retry_limit = 0; 5060 5061 retry: 5062 amdgpu_amdkfd_pre_reset(adev); 5063 5064 amdgpu_device_stop_pending_resets(adev); 5065 5066 if (from_hypervisor) 5067 r = amdgpu_virt_request_full_gpu(adev, true); 5068 else 5069 r = amdgpu_virt_reset_gpu(adev); 5070 if (r) 5071 return r; 5072 amdgpu_ras_set_fed(adev, false); 5073 amdgpu_irq_gpu_reset_resume_helper(adev); 5074 5075 /* some sw clean up VF needs to do before recover */ 5076 amdgpu_virt_post_reset(adev); 5077 5078 /* Resume IP prior to SMC */ 5079 r = amdgpu_device_ip_reinit_early_sriov(adev); 5080 if (r) 5081 goto error; 5082 5083 amdgpu_virt_init_data_exchange(adev); 5084 5085 r = amdgpu_device_fw_loading(adev); 5086 if (r) 5087 return r; 5088 5089 /* now we are okay to resume SMC/CP/SDMA */ 5090 r = amdgpu_device_ip_reinit_late_sriov(adev); 5091 if (r) 5092 goto error; 5093 5094 hive = amdgpu_get_xgmi_hive(adev); 5095 /* Update PSP FW topology after reset */ 5096 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5097 r = amdgpu_xgmi_update_topology(hive, adev); 5098 5099 if (hive) 5100 amdgpu_put_xgmi_hive(hive); 5101 5102 if (!r) { 5103 r = amdgpu_ib_ring_tests(adev); 5104 5105 amdgpu_amdkfd_post_reset(adev); 5106 } 5107 5108 error: 5109 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 5110 amdgpu_inc_vram_lost(adev); 5111 r = amdgpu_device_recover_vram(adev); 5112 } 5113 amdgpu_virt_release_full_gpu(adev, true); 5114 5115 if 
(AMDGPU_RETRY_SRIOV_RESET(r)) { 5116 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 5117 retry_limit++; 5118 goto retry; 5119 } else 5120 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 5121 } 5122 5123 return r; 5124 } 5125 5126 /** 5127 * amdgpu_device_has_job_running - check if there is any job in mirror list 5128 * 5129 * @adev: amdgpu_device pointer 5130 * 5131 * check if there is any job in mirror list 5132 */ 5133 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5134 { 5135 int i; 5136 struct drm_sched_job *job; 5137 5138 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5139 struct amdgpu_ring *ring = adev->rings[i]; 5140 5141 if (!amdgpu_ring_sched_ready(ring)) 5142 continue; 5143 5144 spin_lock(&ring->sched.job_list_lock); 5145 job = list_first_entry_or_null(&ring->sched.pending_list, 5146 struct drm_sched_job, list); 5147 spin_unlock(&ring->sched.job_list_lock); 5148 if (job) 5149 return true; 5150 } 5151 return false; 5152 } 5153 5154 /** 5155 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5156 * 5157 * @adev: amdgpu_device pointer 5158 * 5159 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5160 * a hung GPU. 5161 */ 5162 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5163 { 5164 5165 if (amdgpu_gpu_recovery == 0) 5166 goto disabled; 5167 5168 /* Skip soft reset check in fatal error mode */ 5169 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5170 return true; 5171 5172 if (amdgpu_sriov_vf(adev)) 5173 return true; 5174 5175 if (amdgpu_gpu_recovery == -1) { 5176 switch (adev->asic_type) { 5177 #ifdef CONFIG_DRM_AMDGPU_SI 5178 case CHIP_VERDE: 5179 case CHIP_TAHITI: 5180 case CHIP_PITCAIRN: 5181 case CHIP_OLAND: 5182 case CHIP_HAINAN: 5183 #endif 5184 #ifdef CONFIG_DRM_AMDGPU_CIK 5185 case CHIP_KAVERI: 5186 case CHIP_KABINI: 5187 case CHIP_MULLINS: 5188 #endif 5189 case CHIP_CARRIZO: 5190 case CHIP_STONEY: 5191 case CHIP_CYAN_SKILLFISH: 5192 goto disabled; 5193 default: 5194 break; 5195 } 5196 } 5197 5198 return true; 5199 5200 disabled: 5201 dev_info(adev->dev, "GPU recovery disabled.\n"); 5202 return false; 5203 } 5204 5205 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5206 { 5207 u32 i; 5208 int ret = 0; 5209 5210 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5211 5212 dev_info(adev->dev, "GPU mode1 reset\n"); 5213 5214 /* disable BM */ 5215 pci_clear_master(adev->pdev); 5216 5217 amdgpu_device_cache_pci_state(adev->pdev); 5218 5219 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5220 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5221 ret = amdgpu_dpm_mode1_reset(adev); 5222 } else { 5223 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5224 ret = psp_gpu_reset(adev); 5225 } 5226 5227 if (ret) 5228 goto mode1_reset_failed; 5229 5230 amdgpu_device_load_pci_state(adev->pdev); 5231 ret = amdgpu_psp_wait_for_bootloader(adev); 5232 if (ret) 5233 goto mode1_reset_failed; 5234 5235 /* wait for asic to come out of reset */ 5236 for (i = 0; i < adev->usec_timeout; i++) { 5237 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5238 5239 if (memsize != 0xffffffff) 5240 break; 5241 udelay(1); 5242 } 5243 5244 if (i >= adev->usec_timeout) { 5245 ret = -ETIMEDOUT; 5246 goto mode1_reset_failed; 5247 } 5248 5249 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5250 5251 return 0; 5252 5253 mode1_reset_failed: 5254 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5255 return ret; 5256 } 5257 5258 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5259 struct 
amdgpu_reset_context *reset_context) 5260 { 5261 int i, r = 0; 5262 struct amdgpu_job *job = NULL; 5263 bool need_full_reset = 5264 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5265 5266 if (reset_context->reset_req_dev == adev) 5267 job = reset_context->job; 5268 5269 if (amdgpu_sriov_vf(adev)) { 5270 /* stop the data exchange thread */ 5271 amdgpu_virt_fini_data_exchange(adev); 5272 } 5273 5274 amdgpu_fence_driver_isr_toggle(adev, true); 5275 5276 /* block all schedulers and reset given job's ring */ 5277 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5278 struct amdgpu_ring *ring = adev->rings[i]; 5279 5280 if (!amdgpu_ring_sched_ready(ring)) 5281 continue; 5282 5283 /* Clear job fence from fence drv to avoid force_completion 5284 * leave NULL and vm flush fence in fence drv 5285 */ 5286 amdgpu_fence_driver_clear_job_fences(ring); 5287 5288 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5289 amdgpu_fence_driver_force_completion(ring); 5290 } 5291 5292 amdgpu_fence_driver_isr_toggle(adev, false); 5293 5294 if (job && job->vm) 5295 drm_sched_increase_karma(&job->base); 5296 5297 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5298 /* If reset handler not implemented, continue; otherwise return */ 5299 if (r == -EOPNOTSUPP) 5300 r = 0; 5301 else 5302 return r; 5303 5304 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5305 if (!amdgpu_sriov_vf(adev)) { 5306 5307 if (!need_full_reset) 5308 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5309 5310 if (!need_full_reset && amdgpu_gpu_recovery && 5311 amdgpu_device_ip_check_soft_reset(adev)) { 5312 amdgpu_device_ip_pre_soft_reset(adev); 5313 r = amdgpu_device_ip_soft_reset(adev); 5314 amdgpu_device_ip_post_soft_reset(adev); 5315 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5316 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5317 need_full_reset = true; 5318 } 5319 } 5320 5321 if (need_full_reset) 5322 r = amdgpu_device_ip_suspend(adev); 5323 if (need_full_reset) 5324 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5325 else 5326 clear_bit(AMDGPU_NEED_FULL_RESET, 5327 &reset_context->flags); 5328 } 5329 5330 return r; 5331 } 5332 5333 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 5334 { 5335 int i; 5336 5337 lockdep_assert_held(&adev->reset_domain->sem); 5338 5339 for (i = 0; i < adev->reset_info.num_regs; i++) { 5340 adev->reset_info.reset_dump_reg_value[i] = 5341 RREG32(adev->reset_info.reset_dump_reg_list[i]); 5342 5343 trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i], 5344 adev->reset_info.reset_dump_reg_value[i]); 5345 } 5346 5347 return 0; 5348 } 5349 5350 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5351 struct amdgpu_reset_context *reset_context) 5352 { 5353 struct amdgpu_device *tmp_adev = NULL; 5354 bool need_full_reset, skip_hw_reset, vram_lost = false; 5355 int r = 0; 5356 5357 /* Try reset handler method first */ 5358 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5359 reset_list); 5360 5361 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5362 amdgpu_reset_reg_dumps(tmp_adev); 5363 5364 reset_context->reset_device_list = device_list_handle; 5365 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5366 /* If reset handler not implemented, continue; otherwise return */ 5367 if (r == -EOPNOTSUPP) 5368 r = 0; 5369 else 5370 return r; 5371 5372 /* Reset handler not implemented, use the default method */ 5373 
need_full_reset = 5374 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5375 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5376 5377 /* 5378 * ASIC reset has to be done on all XGMI hive nodes ASAP 5379 * to allow proper links negotiation in FW (within 1 sec) 5380 */ 5381 if (!skip_hw_reset && need_full_reset) { 5382 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5383 /* For XGMI run all resets in parallel to speed up the process */ 5384 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5385 tmp_adev->gmc.xgmi.pending_reset = false; 5386 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5387 r = -EALREADY; 5388 } else 5389 r = amdgpu_asic_reset(tmp_adev); 5390 5391 if (r) { 5392 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5393 r, adev_to_drm(tmp_adev)->unique); 5394 goto out; 5395 } 5396 } 5397 5398 /* For XGMI wait for all resets to complete before proceed */ 5399 if (!r) { 5400 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5401 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5402 flush_work(&tmp_adev->xgmi_reset_work); 5403 r = tmp_adev->asic_reset_res; 5404 if (r) 5405 break; 5406 } 5407 } 5408 } 5409 } 5410 5411 if (!r && amdgpu_ras_intr_triggered()) { 5412 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5413 amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB); 5414 } 5415 5416 amdgpu_ras_intr_cleared(); 5417 } 5418 5419 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5420 if (need_full_reset) { 5421 /* post card */ 5422 amdgpu_ras_set_fed(tmp_adev, false); 5423 r = amdgpu_device_asic_init(tmp_adev); 5424 if (r) { 5425 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5426 } else { 5427 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5428 5429 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5430 if (r) 5431 goto out; 5432 5433 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5434 5435 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5436 amdgpu_coredump(tmp_adev, vram_lost, reset_context); 5437 5438 if (vram_lost) { 5439 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5440 amdgpu_inc_vram_lost(tmp_adev); 5441 } 5442 5443 r = amdgpu_device_fw_loading(tmp_adev); 5444 if (r) 5445 return r; 5446 5447 r = amdgpu_xcp_restore_partition_mode( 5448 tmp_adev->xcp_mgr); 5449 if (r) 5450 goto out; 5451 5452 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5453 if (r) 5454 goto out; 5455 5456 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5457 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5458 5459 if (vram_lost) 5460 amdgpu_device_fill_reset_magic(tmp_adev); 5461 5462 /* 5463 * Add this ASIC as tracked as reset was already 5464 * complete successfully. 5465 */ 5466 amdgpu_register_gpu_instance(tmp_adev); 5467 5468 if (!reset_context->hive && 5469 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5470 amdgpu_xgmi_add_device(tmp_adev); 5471 5472 r = amdgpu_device_ip_late_init(tmp_adev); 5473 if (r) 5474 goto out; 5475 5476 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5477 5478 /* 5479 * The GPU enters bad state once faulty pages 5480 * by ECC has reached the threshold, and ras 5481 * recovery is scheduled next. So add one check 5482 * here to break recovery if it indeed exceeds 5483 * bad page threshold, and remind user to 5484 * retire this GPU or setting one bigger 5485 * bad_page_threshold value to fix this once 5486 * probing driver again. 
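 * (The threshold mentioned here is the one configured through the
 * amdgpu.bad_page_threshold module parameter.)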
5487 */ 5488 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5489 /* must succeed. */ 5490 amdgpu_ras_resume(tmp_adev); 5491 } else { 5492 r = -EINVAL; 5493 goto out; 5494 } 5495 5496 /* Update PSP FW topology after reset */ 5497 if (reset_context->hive && 5498 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5499 r = amdgpu_xgmi_update_topology( 5500 reset_context->hive, tmp_adev); 5501 } 5502 } 5503 5504 out: 5505 if (!r) { 5506 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5507 r = amdgpu_ib_ring_tests(tmp_adev); 5508 if (r) { 5509 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5510 need_full_reset = true; 5511 r = -EAGAIN; 5512 goto end; 5513 } 5514 } 5515 5516 if (!r) 5517 r = amdgpu_device_recover_vram(tmp_adev); 5518 else 5519 tmp_adev->asic_reset_res = r; 5520 } 5521 5522 end: 5523 if (need_full_reset) 5524 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5525 else 5526 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5527 return r; 5528 } 5529 5530 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5531 { 5532 5533 switch (amdgpu_asic_reset_method(adev)) { 5534 case AMD_RESET_METHOD_MODE1: 5535 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5536 break; 5537 case AMD_RESET_METHOD_MODE2: 5538 adev->mp1_state = PP_MP1_STATE_RESET; 5539 break; 5540 default: 5541 adev->mp1_state = PP_MP1_STATE_NONE; 5542 break; 5543 } 5544 } 5545 5546 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5547 { 5548 amdgpu_vf_error_trans_all(adev); 5549 adev->mp1_state = PP_MP1_STATE_NONE; 5550 } 5551 5552 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5553 { 5554 struct pci_dev *p = NULL; 5555 5556 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5557 adev->pdev->bus->number, 1); 5558 if (p) { 5559 pm_runtime_enable(&(p->dev)); 5560 pm_runtime_resume(&(p->dev)); 5561 } 5562 5563 pci_dev_put(p); 5564 } 5565 5566 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5567 { 5568 enum amd_reset_method reset_method; 5569 struct pci_dev *p = NULL; 5570 u64 expires; 5571 5572 /* 5573 * For now, only BACO and mode1 reset are confirmed 5574 * to suffer the audio issue without proper suspended. 5575 */ 5576 reset_method = amdgpu_asic_reset_method(adev); 5577 if ((reset_method != AMD_RESET_METHOD_BACO) && 5578 (reset_method != AMD_RESET_METHOD_MODE1)) 5579 return -EINVAL; 5580 5581 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5582 adev->pdev->bus->number, 1); 5583 if (!p) 5584 return -ENODEV; 5585 5586 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5587 if (!expires) 5588 /* 5589 * If we cannot get the audio device autosuspend delay, 5590 * a fixed 4S interval will be used. Considering 3S is 5591 * the audio controller default autosuspend delay setting. 5592 * 4S used here is guaranteed to cover that. 5593 */ 5594 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5595 5596 while (!pm_runtime_status_suspended(&(p->dev))) { 5597 if (!pm_runtime_suspend(&(p->dev))) 5598 break; 5599 5600 if (expires < ktime_get_mono_fast_ns()) { 5601 dev_warn(adev->dev, "failed to suspend display audio\n"); 5602 pci_dev_put(p); 5603 /* TODO: abort the succeeding gpu reset? 
*/ 5604 return -ETIMEDOUT; 5605 } 5606 } 5607 5608 pm_runtime_disable(&(p->dev)); 5609 5610 pci_dev_put(p); 5611 return 0; 5612 } 5613 5614 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5615 { 5616 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5617 5618 #if defined(CONFIG_DEBUG_FS) 5619 if (!amdgpu_sriov_vf(adev)) 5620 cancel_work(&adev->reset_work); 5621 #endif 5622 5623 if (adev->kfd.dev) 5624 cancel_work(&adev->kfd.reset_work); 5625 5626 if (amdgpu_sriov_vf(adev)) 5627 cancel_work(&adev->virt.flr_work); 5628 5629 if (con && adev->ras_enabled) 5630 cancel_work(&con->recovery_work); 5631 5632 } 5633 5634 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5635 { 5636 struct amdgpu_device *tmp_adev; 5637 int ret = 0; 5638 u32 status; 5639 5640 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5641 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 5642 if (PCI_POSSIBLE_ERROR(status)) { 5643 dev_err(tmp_adev->dev, "device lost from bus!"); 5644 ret = -ENODEV; 5645 } 5646 } 5647 5648 return ret; 5649 } 5650 5651 /** 5652 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5653 * 5654 * @adev: amdgpu_device pointer 5655 * @job: which job trigger hang 5656 * @reset_context: amdgpu reset context pointer 5657 * 5658 * Attempt to reset the GPU if it has hung (all asics). 5659 * Attempt to do soft-reset or full-reset and reinitialize Asic 5660 * Returns 0 for success or an error on failure. 5661 */ 5662 5663 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5664 struct amdgpu_job *job, 5665 struct amdgpu_reset_context *reset_context) 5666 { 5667 struct list_head device_list, *device_list_handle = NULL; 5668 bool job_signaled = false; 5669 struct amdgpu_hive_info *hive = NULL; 5670 struct amdgpu_device *tmp_adev = NULL; 5671 int i, r = 0; 5672 bool need_emergency_restart = false; 5673 bool audio_suspended = false; 5674 5675 /* 5676 * Special case: RAS triggered and full reset isn't supported 5677 */ 5678 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5679 5680 /* 5681 * Flush RAM to disk so that after reboot 5682 * the user can read log and see why the system rebooted. 5683 */ 5684 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5685 amdgpu_ras_get_context(adev)->reboot) { 5686 DRM_WARN("Emergency reboot."); 5687 5688 ksys_sync_helper(); 5689 emergency_restart(); 5690 } 5691 5692 dev_info(adev->dev, "GPU %s begin!\n", 5693 need_emergency_restart ? "jobs stop":"reset"); 5694 5695 if (!amdgpu_sriov_vf(adev)) 5696 hive = amdgpu_get_xgmi_hive(adev); 5697 if (hive) 5698 mutex_lock(&hive->hive_lock); 5699 5700 reset_context->job = job; 5701 reset_context->hive = hive; 5702 /* 5703 * Build list of devices to reset. 5704 * In case we are in XGMI hive mode, resort the device list 5705 * to put adev in the 1st position. 
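 * The first entry is then used as the representative device for locking
 * the reset domain and for amdgpu_do_asic_reset(), which is why the device
 * that requested the recovery is rotated to the front.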
 */
	INIT_LIST_HEAD(&device_list);
	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			list_add_tail(&tmp_adev->reset_list, &device_list);
			if (adev->shutdown)
				tmp_adev->shutdown = true;
		}
		if (!list_is_first(&adev->reset_list, &device_list))
			list_rotate_to_front(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	} else {
		list_add_tail(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	}

	if (!amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_health_check(device_list_handle);
		if (r)
			goto end_reset;
	}

	/* We need to lock reset domain only once both for XGMI and single device */
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

		amdgpu_device_set_mp1_state(tmp_adev);

		/*
		 * Try to put the audio codec into suspend state
		 * before the GPU reset starts.
		 *
		 * Because the power domain of the graphics device is
		 * shared with the AZ power domain, skipping this would
		 * let us change the audio hardware behind the audio
		 * driver's back and trigger audio codec errors.
		 */
		if (!amdgpu_device_suspend_display_audio(tmp_adev))
			audio_suspended = true;

		amdgpu_ras_set_error_query_ready(tmp_adev, false);

		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

		if (!amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_pre_reset(tmp_adev);

		/*
		 * Mark these ASICs to be reset as untracked first,
		 * and add them back after the reset completes.
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);

		/* disable ras on ALL IPs */
		if (!need_emergency_restart &&
		    amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!amdgpu_ring_sched_ready(ring))
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			if (need_emergency_restart)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
		atomic_inc(&tmp_adev->gpu_reset_counter);
	}

	if (need_emergency_restart)
		goto skip_sched_resume;

	/*
	 * Must check guilty signal here since after this point all old
	 * HW fences are force signaled.
	 *
	 * job->base holds a reference to parent fence
	 */
	if (job && dma_fence_is_signaled(&job->hw_fence)) {
		job_signaled = true;
		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
		goto skip_hw_reset;
	}

retry:	/* Rest of adevs pre asic reset from XGMI hive. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
		/*TODO Should we stop ?*/
		if (r) {
			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
				r, adev_to_drm(tmp_adev)->unique);
			tmp_adev->asic_reset_res = r;
		}

		if (!amdgpu_sriov_vf(tmp_adev))
			/*
			 * Drop all pending non-scheduler resets.
Scheduler resets 5813 * were already dropped during drm_sched_stop 5814 */ 5815 amdgpu_device_stop_pending_resets(tmp_adev); 5816 } 5817 5818 /* Actual ASIC resets if needed.*/ 5819 /* Host driver will handle XGMI hive reset for SRIOV */ 5820 if (amdgpu_sriov_vf(adev)) { 5821 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5822 if (r) 5823 adev->asic_reset_res = r; 5824 5825 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5826 if (amdgpu_ip_version(adev, GC_HWIP, 0) == 5827 IP_VERSION(9, 4, 2) || 5828 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5829 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5830 amdgpu_ras_resume(adev); 5831 } else { 5832 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5833 if (r && r == -EAGAIN) 5834 goto retry; 5835 } 5836 5837 skip_hw_reset: 5838 5839 /* Post ASIC reset for all devs .*/ 5840 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5841 5842 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5843 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5844 5845 if (!amdgpu_ring_sched_ready(ring)) 5846 continue; 5847 5848 drm_sched_start(&ring->sched, true); 5849 } 5850 5851 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5852 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5853 5854 if (tmp_adev->asic_reset_res) 5855 r = tmp_adev->asic_reset_res; 5856 5857 tmp_adev->asic_reset_res = 0; 5858 5859 if (r) { 5860 /* bad news, how to tell it to userspace ? */ 5861 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5862 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5863 } else { 5864 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5865 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5866 DRM_WARN("smart shift update failed\n"); 5867 } 5868 } 5869 5870 skip_sched_resume: 5871 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5872 /* unlock kfd: SRIOV would do it separately */ 5873 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5874 amdgpu_amdkfd_post_reset(tmp_adev); 5875 5876 /* kfd_post_reset will do nothing if kfd device is not initialized, 5877 * need to bring up kfd here if it's not be initialized before 5878 */ 5879 if (!adev->kfd.init_complete) 5880 amdgpu_amdkfd_device_init(adev); 5881 5882 if (audio_suspended) 5883 amdgpu_device_resume_display_audio(tmp_adev); 5884 5885 amdgpu_device_unset_mp1_state(tmp_adev); 5886 5887 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5888 } 5889 5890 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5891 reset_list); 5892 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5893 5894 end_reset: 5895 if (hive) { 5896 mutex_unlock(&hive->hive_lock); 5897 amdgpu_put_xgmi_hive(hive); 5898 } 5899 5900 if (r) 5901 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5902 5903 atomic_set(&adev->reset_domain->reset_res, r); 5904 return r; 5905 } 5906 5907 /** 5908 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 5909 * 5910 * @adev: amdgpu_device pointer 5911 * @speed: pointer to the speed of the link 5912 * @width: pointer to the width of the link 5913 * 5914 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 5915 * first physical partner to an AMD dGPU. 5916 * This will exclude any virtual switches and links. 
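 *
 * For example, when the dGPU sits behind a bridge that is internal to the
 * GPU package (PCI vendor ID 0x1002), that bridge is skipped and the caps
 * of the first non-AMD upstream port are reported instead.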
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	while ((parent = pci_upstream_bridge(parent))) {
		/* skip upstream/downstream switches internal to the dGPU */
		if (parent->vendor == PCI_VENDOR_ID_ATI)
			continue;
		*speed = pcie_get_speed_cap(parent);
		*width = pcie_get_width_cap(parent);
		break;
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIe capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIe config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
					&platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6014 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6015 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6016 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6017 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6018 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6019 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6020 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6021 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6022 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6023 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6024 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6025 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6026 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6027 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6028 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6029 else 6030 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6031 6032 } 6033 } 6034 if (adev->pm.pcie_mlw_mask == 0) { 6035 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6036 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6037 } else { 6038 switch (platform_link_width) { 6039 case PCIE_LNK_X32: 6040 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6041 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6042 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6043 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6044 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6045 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6046 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6047 break; 6048 case PCIE_LNK_X16: 6049 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6050 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6051 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6052 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6053 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6054 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6055 break; 6056 case PCIE_LNK_X12: 6057 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6058 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6059 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6060 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6061 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6062 break; 6063 case PCIE_LNK_X8: 6064 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6065 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6066 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6067 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6068 break; 6069 case PCIE_LNK_X4: 6070 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6071 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6072 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6073 break; 6074 case PCIE_LNK_X2: 6075 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6076 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6077 break; 6078 case PCIE_LNK_X1: 6079 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6080 break; 6081 default: 6082 break; 6083 } 6084 } 6085 } 6086 } 6087 6088 /** 6089 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6090 * 6091 * @adev: amdgpu_device pointer 6092 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6093 * 6094 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6095 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6096 * @peer_adev. 6097 */ 6098 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6099 struct amdgpu_device *peer_adev) 6100 { 6101 #ifdef CONFIG_HSA_AMD_P2P 6102 uint64_t address_mask = peer_adev->dev->dma_mask ? 
6103 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6104 resource_size_t aper_limit = 6105 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6106 bool p2p_access = 6107 !adev->gmc.xgmi.connected_to_cpu && 6108 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6109 6110 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 6111 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 6112 !(adev->gmc.aper_base & address_mask || 6113 aper_limit & address_mask)); 6114 #else 6115 return false; 6116 #endif 6117 } 6118 6119 int amdgpu_device_baco_enter(struct drm_device *dev) 6120 { 6121 struct amdgpu_device *adev = drm_to_adev(dev); 6122 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6123 6124 if (!amdgpu_device_supports_baco(dev)) 6125 return -ENOTSUPP; 6126 6127 if (ras && adev->ras_enabled && 6128 adev->nbio.funcs->enable_doorbell_interrupt) 6129 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6130 6131 return amdgpu_dpm_baco_enter(adev); 6132 } 6133 6134 int amdgpu_device_baco_exit(struct drm_device *dev) 6135 { 6136 struct amdgpu_device *adev = drm_to_adev(dev); 6137 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6138 int ret = 0; 6139 6140 if (!amdgpu_device_supports_baco(dev)) 6141 return -ENOTSUPP; 6142 6143 ret = amdgpu_dpm_baco_exit(adev); 6144 if (ret) 6145 return ret; 6146 6147 if (ras && adev->ras_enabled && 6148 adev->nbio.funcs->enable_doorbell_interrupt) 6149 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6150 6151 if (amdgpu_passthrough(adev) && 6152 adev->nbio.funcs->clear_doorbell_interrupt) 6153 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6154 6155 return 0; 6156 } 6157 6158 /** 6159 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6160 * @pdev: PCI device struct 6161 * @state: PCI channel state 6162 * 6163 * Description: Called when a PCI error is detected. 6164 * 6165 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
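 *
 * These error callbacks are normally handed to the PCI core through a
 * struct pci_error_handlers instance referenced from the pci_driver. A
 * minimal sketch of such a hookup (illustrative only; the actual
 * registration lives elsewhere in the driver) could look like:
 *
 *   static const struct pci_error_handlers example_pci_err_handler = {
 *       .error_detected = amdgpu_pci_error_detected,
 *       .mmio_enabled   = amdgpu_pci_mmio_enabled,
 *       .slot_reset     = amdgpu_pci_slot_reset,
 *       .resume         = amdgpu_pci_resume,
 *   };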
6166 */ 6167 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6168 { 6169 struct drm_device *dev = pci_get_drvdata(pdev); 6170 struct amdgpu_device *adev = drm_to_adev(dev); 6171 int i; 6172 6173 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 6174 6175 if (adev->gmc.xgmi.num_physical_nodes > 1) { 6176 DRM_WARN("No support for XGMI hive yet..."); 6177 return PCI_ERS_RESULT_DISCONNECT; 6178 } 6179 6180 adev->pci_channel_state = state; 6181 6182 switch (state) { 6183 case pci_channel_io_normal: 6184 return PCI_ERS_RESULT_CAN_RECOVER; 6185 /* Fatal error, prepare for slot reset */ 6186 case pci_channel_io_frozen: 6187 /* 6188 * Locking adev->reset_domain->sem will prevent any external access 6189 * to GPU during PCI error recovery 6190 */ 6191 amdgpu_device_lock_reset_domain(adev->reset_domain); 6192 amdgpu_device_set_mp1_state(adev); 6193 6194 /* 6195 * Block any work scheduling as we do for regular GPU reset 6196 * for the duration of the recovery 6197 */ 6198 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6199 struct amdgpu_ring *ring = adev->rings[i]; 6200 6201 if (!amdgpu_ring_sched_ready(ring)) 6202 continue; 6203 6204 drm_sched_stop(&ring->sched, NULL); 6205 } 6206 atomic_inc(&adev->gpu_reset_counter); 6207 return PCI_ERS_RESULT_NEED_RESET; 6208 case pci_channel_io_perm_failure: 6209 /* Permanent error, prepare for device removal */ 6210 return PCI_ERS_RESULT_DISCONNECT; 6211 } 6212 6213 return PCI_ERS_RESULT_NEED_RESET; 6214 } 6215 6216 /** 6217 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6218 * @pdev: pointer to PCI device 6219 */ 6220 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6221 { 6222 6223 DRM_INFO("PCI error: mmio enabled callback!!\n"); 6224 6225 /* TODO - dump whatever for debugging purposes */ 6226 6227 /* This called only if amdgpu_pci_error_detected returns 6228 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6229 * works, no need to reset slot. 6230 */ 6231 6232 return PCI_ERS_RESULT_RECOVERED; 6233 } 6234 6235 /** 6236 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6237 * @pdev: PCI device struct 6238 * 6239 * Description: This routine is called by the pci error recovery 6240 * code after the PCI slot has been reset, just before we 6241 * should resume normal operations. 
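 *
 * The handler restores the saved PCI config space, waits for the ASIC to
 * respond again (by polling the config-space memory size), and then runs
 * amdgpu_device_pre_asic_reset()/amdgpu_do_asic_reset() with
 * AMDGPU_SKIP_HW_RESET set, since the slot reset itself already reset the
 * hardware.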
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;
	struct amdgpu_hive_info *hive;
	int hive_ras_recovery = 0;
	struct amdgpu_ras *ras;

	/* PCI error slot reset should be skipped during RAS recovery */
	hive = amdgpu_get_xgmi_hive(adev);
	if (hive) {
		hive_ras_recovery = atomic_read(&hive->ras_recovery);
		amdgpu_put_xgmi_hive(hive);
	}
	ras = amdgpu_ras_get_context(adev);
	if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) &&
	    ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery))
		return PCI_ERS_RESULT_RECOVERED;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for the ASIC to come out of reset */
	msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm the ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
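 *
 * Only acts when the channel was previously reported as frozen: it
 * restarts the schedulers that were stopped in amdgpu_pci_error_detected()
 * and drops the MP1 state and reset-domain lock taken there.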
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!amdgpu_ring_sched_ready(ring))
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps maintain error context when an error occurs.
 * Compared to a simple hang, the system stays stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
 * clears all CPU mappings to the device, and disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5.
amdgpu_device_unmap_mmio() clears all MMIO mappings 6449 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 6450 * flush any in flight DMA operations 6451 */ 6452 void amdgpu_device_halt(struct amdgpu_device *adev) 6453 { 6454 struct pci_dev *pdev = adev->pdev; 6455 struct drm_device *ddev = adev_to_drm(adev); 6456 6457 amdgpu_xcp_dev_unplug(adev); 6458 drm_dev_unplug(ddev); 6459 6460 amdgpu_irq_disable_all(adev); 6461 6462 amdgpu_fence_driver_hw_fini(adev); 6463 6464 adev->no_hw_access = true; 6465 6466 amdgpu_device_unmap_mmio(adev); 6467 6468 pci_disable_device(pdev); 6469 pci_wait_for_pending_transaction(pdev); 6470 } 6471 6472 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 6473 u32 reg) 6474 { 6475 unsigned long flags, address, data; 6476 u32 r; 6477 6478 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6479 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6480 6481 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6482 WREG32(address, reg * 4); 6483 (void)RREG32(address); 6484 r = RREG32(data); 6485 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6486 return r; 6487 } 6488 6489 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 6490 u32 reg, u32 v) 6491 { 6492 unsigned long flags, address, data; 6493 6494 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6495 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6496 6497 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6498 WREG32(address, reg * 4); 6499 (void)RREG32(address); 6500 WREG32(data, v); 6501 (void)RREG32(data); 6502 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6503 } 6504 6505 /** 6506 * amdgpu_device_switch_gang - switch to a new gang 6507 * @adev: amdgpu_device pointer 6508 * @gang: the gang to switch to 6509 * 6510 * Try to switch to a new gang. 6511 * Returns: NULL if we switched to the new gang or a reference to the current 6512 * gang leader. 
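 *
 * A non-NULL return value means the current gang leader has not signaled
 * yet and the switch did not happen; the caller is expected to treat the
 * returned fence as a dependency (and drop its reference) before trying
 * again. A rough usage sketch (illustrative only, error handling omitted):
 *
 *   struct dma_fence *old = amdgpu_device_switch_gang(adev, new_gang);
 *
 *   if (old) {
 *       dma_fence_wait(old, false);
 *       dma_fence_put(old);
 *   }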
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
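
/*
 * Usage sketch for amdgpu_device_wait_on_rreg() (illustrative only; the
 * register offset and name below are placeholders rather than real amdgpu
 * registers). The helper spins, with udelay(1) between reads, until
 * (RREG32(reg_addr) & mask) == expected_value, restarting its timeout
 * whenever the register value changes, and returns 0 on success or
 * -ETIMEDOUT (as a uint32_t) when adev->usec_timeout iterations pass
 * without progress:
 *
 *	uint32_t r;
 *
 *	r = amdgpu_device_wait_on_rreg(adev, 0, example_reg_offset,
 *				       "EXAMPLE_STATUS", 0x1, 0x1);
 *	if (r)
 *		dev_warn(adev->dev, "EXAMPLE_STATUS bit0 never set\n");
 */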