1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/pci-p2pdma.h> 36 #include <linux/apple-gmux.h> 37 38 #include <drm/drm_aperture.h> 39 #include <drm/drm_atomic_helper.h> 40 #include <drm/drm_crtc_helper.h> 41 #include <drm/drm_fb_helper.h> 42 #include <drm/drm_probe_helper.h> 43 #include <drm/amdgpu_drm.h> 44 #include <linux/device.h> 45 #include <linux/vgaarb.h> 46 #include <linux/vga_switcheroo.h> 47 #include <linux/efi.h> 48 #include "amdgpu.h" 49 #include "amdgpu_trace.h" 50 #include "amdgpu_i2c.h" 51 #include "atom.h" 52 #include "amdgpu_atombios.h" 53 #include "amdgpu_atomfirmware.h" 54 #include "amd_pcie.h" 55 #ifdef CONFIG_DRM_AMDGPU_SI 56 #include "si.h" 57 #endif 58 #ifdef CONFIG_DRM_AMDGPU_CIK 59 #include "cik.h" 60 #endif 61 #include "vi.h" 62 #include "soc15.h" 63 #include "nv.h" 64 #include "bif/bif_4_1_d.h" 65 #include <linux/firmware.h> 66 #include "amdgpu_vf_error.h" 67 68 #include "amdgpu_amdkfd.h" 69 #include "amdgpu_pm.h" 70 71 #include "amdgpu_xgmi.h" 72 #include "amdgpu_ras.h" 73 #include "amdgpu_pmu.h" 74 #include "amdgpu_fru_eeprom.h" 75 #include "amdgpu_reset.h" 76 #include "amdgpu_virt.h" 77 #include "amdgpu_dev_coredump.h" 78 79 #include <linux/suspend.h> 80 #include <drm/task_barrier.h> 81 #include <linux/pm_runtime.h> 82 83 #include <drm/drm_drv.h> 84 85 #if IS_ENABLED(CONFIG_X86) 86 #include <asm/intel-family.h> 87 #endif 88 89 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 96 97 #define AMDGPU_RESUME_MS 2000 98 #define AMDGPU_MAX_RETRY_LIMIT 2 99 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 100 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 101 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 102 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 103 104 static const 
struct drm_driver amdgpu_kms_driver; 105 106 const char *amdgpu_asic_name[] = { 107 "TAHITI", 108 "PITCAIRN", 109 "VERDE", 110 "OLAND", 111 "HAINAN", 112 "BONAIRE", 113 "KAVERI", 114 "KABINI", 115 "HAWAII", 116 "MULLINS", 117 "TOPAZ", 118 "TONGA", 119 "FIJI", 120 "CARRIZO", 121 "STONEY", 122 "POLARIS10", 123 "POLARIS11", 124 "POLARIS12", 125 "VEGAM", 126 "VEGA10", 127 "VEGA12", 128 "VEGA20", 129 "RAVEN", 130 "ARCTURUS", 131 "RENOIR", 132 "ALDEBARAN", 133 "NAVI10", 134 "CYAN_SKILLFISH", 135 "NAVI14", 136 "NAVI12", 137 "SIENNA_CICHLID", 138 "NAVY_FLOUNDER", 139 "VANGOGH", 140 "DIMGREY_CAVEFISH", 141 "BEIGE_GOBY", 142 "YELLOW_CARP", 143 "IP DISCOVERY", 144 "LAST", 145 }; 146 147 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 148 149 /** 150 * DOC: pcie_replay_count 151 * 152 * The amdgpu driver provides a sysfs API for reporting the total number 153 * of PCIe replays (NAKs) 154 * The file pcie_replay_count is used for this and returns the total 155 * number of replays as a sum of the NAKs generated and NAKs received 156 */ 157 158 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 159 struct device_attribute *attr, char *buf) 160 { 161 struct drm_device *ddev = dev_get_drvdata(dev); 162 struct amdgpu_device *adev = drm_to_adev(ddev); 163 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 164 165 return sysfs_emit(buf, "%llu\n", cnt); 166 } 167 168 static DEVICE_ATTR(pcie_replay_count, 0444, 169 amdgpu_device_get_pcie_replay_count, NULL); 170 171 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 172 struct bin_attribute *attr, char *buf, 173 loff_t ppos, size_t count) 174 { 175 struct device *dev = kobj_to_dev(kobj); 176 struct drm_device *ddev = dev_get_drvdata(dev); 177 struct amdgpu_device *adev = drm_to_adev(ddev); 178 ssize_t bytes_read; 179 180 switch (ppos) { 181 case AMDGPU_SYS_REG_STATE_XGMI: 182 bytes_read = amdgpu_asic_get_reg_state( 183 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 184 break; 185 case AMDGPU_SYS_REG_STATE_WAFL: 186 bytes_read = amdgpu_asic_get_reg_state( 187 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 188 break; 189 case AMDGPU_SYS_REG_STATE_PCIE: 190 bytes_read = amdgpu_asic_get_reg_state( 191 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 192 break; 193 case AMDGPU_SYS_REG_STATE_USR: 194 bytes_read = amdgpu_asic_get_reg_state( 195 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 196 break; 197 case AMDGPU_SYS_REG_STATE_USR_1: 198 bytes_read = amdgpu_asic_get_reg_state( 199 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 200 break; 201 default: 202 return -EINVAL; 203 } 204 205 return bytes_read; 206 } 207 208 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 209 AMDGPU_SYS_REG_STATE_END); 210 211 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 212 { 213 int ret; 214 215 if (!amdgpu_asic_get_reg_state_supported(adev)) 216 return 0; 217 218 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 219 220 return ret; 221 } 222 223 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 224 { 225 if (!amdgpu_asic_get_reg_state_supported(adev)) 226 return; 227 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 228 } 229 230 /** 231 * DOC: board_info 232 * 233 * The amdgpu driver provides a sysfs API for giving board related information. 
234 * It provides the form factor information in the format 235 * 236 * type : form factor 237 * 238 * Possible form factor values 239 * 240 * - "cem" - PCIE CEM card 241 * - "oam" - Open Compute Accelerator Module 242 * - "unknown" - Not known 243 * 244 */ 245 246 static ssize_t amdgpu_device_get_board_info(struct device *dev, 247 struct device_attribute *attr, 248 char *buf) 249 { 250 struct drm_device *ddev = dev_get_drvdata(dev); 251 struct amdgpu_device *adev = drm_to_adev(ddev); 252 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 253 const char *pkg; 254 255 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 256 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 257 258 switch (pkg_type) { 259 case AMDGPU_PKG_TYPE_CEM: 260 pkg = "cem"; 261 break; 262 case AMDGPU_PKG_TYPE_OAM: 263 pkg = "oam"; 264 break; 265 default: 266 pkg = "unknown"; 267 break; 268 } 269 270 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 271 } 272 273 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 274 275 static struct attribute *amdgpu_board_attrs[] = { 276 &dev_attr_board_info.attr, 277 NULL, 278 }; 279 280 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 281 struct attribute *attr, int n) 282 { 283 struct device *dev = kobj_to_dev(kobj); 284 struct drm_device *ddev = dev_get_drvdata(dev); 285 struct amdgpu_device *adev = drm_to_adev(ddev); 286 287 if (adev->flags & AMD_IS_APU) 288 return 0; 289 290 return attr->mode; 291 } 292 293 static const struct attribute_group amdgpu_board_attrs_group = { 294 .attrs = amdgpu_board_attrs, 295 .is_visible = amdgpu_board_attrs_is_visible 296 }; 297 298 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 299 300 301 /** 302 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 303 * 304 * @dev: drm_device pointer 305 * 306 * Returns true if the device is a dGPU with ATPX power control, 307 * otherwise return false. 308 */ 309 bool amdgpu_device_supports_px(struct drm_device *dev) 310 { 311 struct amdgpu_device *adev = drm_to_adev(dev); 312 313 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 314 return true; 315 return false; 316 } 317 318 /** 319 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 320 * 321 * @dev: drm_device pointer 322 * 323 * Returns true if the device is a dGPU with ACPI power control, 324 * otherwise return false. 325 */ 326 bool amdgpu_device_supports_boco(struct drm_device *dev) 327 { 328 struct amdgpu_device *adev = drm_to_adev(dev); 329 330 if (adev->has_pr3 || 331 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 332 return true; 333 return false; 334 } 335 336 /** 337 * amdgpu_device_supports_baco - Does the device support BACO 338 * 339 * @dev: drm_device pointer 340 * 341 * Return: 342 * 1 if the device supporte BACO; 343 * 3 if the device support MACO (only works if BACO is supported) 344 * otherwise return 0. 
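 *
 * A minimal sketch of how callers interpret the returned flags (this mirrors
 * the checks done in amdgpu_device_detect_runtime_pm_mode() below):
 *
 *   int bamaco_support = amdgpu_device_supports_baco(dev);
 *
 *   if (bamaco_support & MACO_SUPPORT)
 *           adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
 *   else if (bamaco_support & BACO_SUPPORT)
 *           adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;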
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry)
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that were actually transferred.
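 *
 * A sketch of how this is combined with the MM_INDEX/MM_DATA fallback for the
 * CPU-invisible part of VRAM (this is what amdgpu_device_vram_access() does):
 *
 *   size_t done = amdgpu_device_aper_access(adev, pos, buf, size, false);
 *
 *   if (done < size)
 *           amdgpu_device_mm_access(adev, pos + done, buf + done,
 *                                   size - done, false);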
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore; if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
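 *
 * Most code uses the RREG32()/WREG32() style macros rather than calling this
 * directly; an illustrative direct use (the register offset is hypothetical):
 *
 *   u32 val = amdgpu_device_rreg(adev, reg, 0);
 *   u32 raw = amdgpu_device_rreg(adev, reg, AMDGPU_REGS_NO_KIQ);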
605 */ 606 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 607 uint32_t reg, uint32_t acc_flags) 608 { 609 uint32_t ret; 610 611 if (amdgpu_device_skip_hw_access(adev)) 612 return 0; 613 614 if ((reg * 4) < adev->rmmio_size) { 615 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 616 amdgpu_sriov_runtime(adev) && 617 down_read_trylock(&adev->reset_domain->sem)) { 618 ret = amdgpu_kiq_rreg(adev, reg, 0); 619 up_read(&adev->reset_domain->sem); 620 } else { 621 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 622 } 623 } else { 624 ret = adev->pcie_rreg(adev, reg * 4); 625 } 626 627 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 628 629 return ret; 630 } 631 632 /* 633 * MMIO register read with bytes helper functions 634 * @offset:bytes offset from MMIO start 635 */ 636 637 /** 638 * amdgpu_mm_rreg8 - read a memory mapped IO register 639 * 640 * @adev: amdgpu_device pointer 641 * @offset: byte aligned register offset 642 * 643 * Returns the 8 bit value from the offset specified. 644 */ 645 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 646 { 647 if (amdgpu_device_skip_hw_access(adev)) 648 return 0; 649 650 if (offset < adev->rmmio_size) 651 return (readb(adev->rmmio + offset)); 652 BUG(); 653 } 654 655 656 /** 657 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC 658 * 659 * @adev: amdgpu_device pointer 660 * @reg: dword aligned register offset 661 * @acc_flags: access flags which require special behavior 662 * @xcc_id: xcc accelerated compute core id 663 * 664 * Returns the 32 bit value from the offset specified. 665 */ 666 uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev, 667 uint32_t reg, uint32_t acc_flags, 668 uint32_t xcc_id) 669 { 670 uint32_t ret, rlcg_flag; 671 672 if (amdgpu_device_skip_hw_access(adev)) 673 return 0; 674 675 if ((reg * 4) < adev->rmmio_size) { 676 if (amdgpu_sriov_vf(adev) && 677 !amdgpu_sriov_runtime(adev) && 678 adev->gfx.rlc.rlcg_reg_access_supported && 679 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 680 GC_HWIP, false, 681 &rlcg_flag)) { 682 ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id)); 683 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 684 amdgpu_sriov_runtime(adev) && 685 down_read_trylock(&adev->reset_domain->sem)) { 686 ret = amdgpu_kiq_rreg(adev, reg, xcc_id); 687 up_read(&adev->reset_domain->sem); 688 } else { 689 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 690 } 691 } else { 692 ret = adev->pcie_rreg(adev, reg * 4); 693 } 694 695 return ret; 696 } 697 698 /* 699 * MMIO register write with bytes helper functions 700 * @offset:bytes offset from MMIO start 701 * @value: the value want to be written to the register 702 */ 703 704 /** 705 * amdgpu_mm_wreg8 - read a memory mapped IO register 706 * 707 * @adev: amdgpu_device pointer 708 * @offset: byte aligned register offset 709 * @value: 8 bit value to write 710 * 711 * Writes the value specified to the offset specified. 
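 *
 * Illustrative read-modify-write of a single byte register (the byte offset
 * is hypothetical), pairing this with amdgpu_mm_rreg8() above:
 *
 *   u8 v = amdgpu_mm_rreg8(adev, byte_offset);
 *   amdgpu_mm_wreg8(adev, byte_offset, v | 0x1);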
712 */ 713 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 714 { 715 if (amdgpu_device_skip_hw_access(adev)) 716 return; 717 718 if (offset < adev->rmmio_size) 719 writeb(value, adev->rmmio + offset); 720 else 721 BUG(); 722 } 723 724 /** 725 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 726 * 727 * @adev: amdgpu_device pointer 728 * @reg: dword aligned register offset 729 * @v: 32 bit value to write to the register 730 * @acc_flags: access flags which require special behavior 731 * 732 * Writes the value specified to the offset specified. 733 */ 734 void amdgpu_device_wreg(struct amdgpu_device *adev, 735 uint32_t reg, uint32_t v, 736 uint32_t acc_flags) 737 { 738 if (amdgpu_device_skip_hw_access(adev)) 739 return; 740 741 if ((reg * 4) < adev->rmmio_size) { 742 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 743 amdgpu_sriov_runtime(adev) && 744 down_read_trylock(&adev->reset_domain->sem)) { 745 amdgpu_kiq_wreg(adev, reg, v, 0); 746 up_read(&adev->reset_domain->sem); 747 } else { 748 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 749 } 750 } else { 751 adev->pcie_wreg(adev, reg * 4, v); 752 } 753 754 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 755 } 756 757 /** 758 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 759 * 760 * @adev: amdgpu_device pointer 761 * @reg: mmio/rlc register 762 * @v: value to write 763 * @xcc_id: xcc accelerated compute core id 764 * 765 * this function is invoked only for the debugfs register access 766 */ 767 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 768 uint32_t reg, uint32_t v, 769 uint32_t xcc_id) 770 { 771 if (amdgpu_device_skip_hw_access(adev)) 772 return; 773 774 if (amdgpu_sriov_fullaccess(adev) && 775 adev->gfx.rlc.funcs && 776 adev->gfx.rlc.funcs->is_rlcg_access_range) { 777 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 778 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 779 } else if ((reg * 4) >= adev->rmmio_size) { 780 adev->pcie_wreg(adev, reg * 4, v); 781 } else { 782 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 783 } 784 } 785 786 /** 787 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 788 * 789 * @adev: amdgpu_device pointer 790 * @reg: dword aligned register offset 791 * @v: 32 bit value to write to the register 792 * @acc_flags: access flags which require special behavior 793 * @xcc_id: xcc accelerated compute core id 794 * 795 * Writes the value specified to the offset specified. 
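 *
 * Usage sketch (register offset, value and XCC instance are illustrative),
 * paired with the matching read helper amdgpu_device_xcc_rreg():
 *
 *   amdgpu_device_xcc_wreg(adev, reg, val, 0, xcc_id);
 *   val = amdgpu_device_xcc_rreg(adev, reg, 0, xcc_id);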
796 */ 797 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 798 uint32_t reg, uint32_t v, 799 uint32_t acc_flags, uint32_t xcc_id) 800 { 801 uint32_t rlcg_flag; 802 803 if (amdgpu_device_skip_hw_access(adev)) 804 return; 805 806 if ((reg * 4) < adev->rmmio_size) { 807 if (amdgpu_sriov_vf(adev) && 808 !amdgpu_sriov_runtime(adev) && 809 adev->gfx.rlc.rlcg_reg_access_supported && 810 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 811 GC_HWIP, true, 812 &rlcg_flag)) { 813 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 814 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 815 amdgpu_sriov_runtime(adev) && 816 down_read_trylock(&adev->reset_domain->sem)) { 817 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 818 up_read(&adev->reset_domain->sem); 819 } else { 820 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 821 } 822 } else { 823 adev->pcie_wreg(adev, reg * 4, v); 824 } 825 } 826 827 /** 828 * amdgpu_device_indirect_rreg - read an indirect register 829 * 830 * @adev: amdgpu_device pointer 831 * @reg_addr: indirect register address to read from 832 * 833 * Returns the value of indirect register @reg_addr 834 */ 835 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 836 u32 reg_addr) 837 { 838 unsigned long flags, pcie_index, pcie_data; 839 void __iomem *pcie_index_offset; 840 void __iomem *pcie_data_offset; 841 u32 r; 842 843 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 844 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 845 846 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 847 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 848 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 849 850 writel(reg_addr, pcie_index_offset); 851 readl(pcie_index_offset); 852 r = readl(pcie_data_offset); 853 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 854 855 return r; 856 } 857 858 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 859 u64 reg_addr) 860 { 861 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 862 u32 r; 863 void __iomem *pcie_index_offset; 864 void __iomem *pcie_index_hi_offset; 865 void __iomem *pcie_data_offset; 866 867 if (unlikely(!adev->nbio.funcs)) { 868 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 869 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 870 } else { 871 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 872 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 873 } 874 875 if (reg_addr >> 32) { 876 if (unlikely(!adev->nbio.funcs)) 877 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 878 else 879 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 880 } else { 881 pcie_index_hi = 0; 882 } 883 884 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 885 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 886 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 887 if (pcie_index_hi != 0) 888 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 889 pcie_index_hi * 4; 890 891 writel(reg_addr, pcie_index_offset); 892 readl(pcie_index_offset); 893 if (pcie_index_hi != 0) { 894 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 895 readl(pcie_index_hi_offset); 896 } 897 r = readl(pcie_data_offset); 898 899 /* clear the high bits */ 900 if (pcie_index_hi != 0) { 901 writel(0, pcie_index_hi_offset); 902 readl(pcie_index_hi_offset); 903 } 904 905 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 906 907 return r; 908 } 909 910 /** 911 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 912 * 913 * @adev: 
amdgpu_device pointer 914 * @reg_addr: indirect register address to read from 915 * 916 * Returns the value of indirect register @reg_addr 917 */ 918 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 919 u32 reg_addr) 920 { 921 unsigned long flags, pcie_index, pcie_data; 922 void __iomem *pcie_index_offset; 923 void __iomem *pcie_data_offset; 924 u64 r; 925 926 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 927 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 928 929 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 930 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 931 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 932 933 /* read low 32 bits */ 934 writel(reg_addr, pcie_index_offset); 935 readl(pcie_index_offset); 936 r = readl(pcie_data_offset); 937 /* read high 32 bits */ 938 writel(reg_addr + 4, pcie_index_offset); 939 readl(pcie_index_offset); 940 r |= ((u64)readl(pcie_data_offset) << 32); 941 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 942 943 return r; 944 } 945 946 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 947 u64 reg_addr) 948 { 949 unsigned long flags, pcie_index, pcie_data; 950 unsigned long pcie_index_hi = 0; 951 void __iomem *pcie_index_offset; 952 void __iomem *pcie_index_hi_offset; 953 void __iomem *pcie_data_offset; 954 u64 r; 955 956 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 957 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 958 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 959 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 960 961 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 962 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 963 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 964 if (pcie_index_hi != 0) 965 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 966 pcie_index_hi * 4; 967 968 /* read low 32 bits */ 969 writel(reg_addr, pcie_index_offset); 970 readl(pcie_index_offset); 971 if (pcie_index_hi != 0) { 972 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 973 readl(pcie_index_hi_offset); 974 } 975 r = readl(pcie_data_offset); 976 /* read high 32 bits */ 977 writel(reg_addr + 4, pcie_index_offset); 978 readl(pcie_index_offset); 979 if (pcie_index_hi != 0) { 980 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 981 readl(pcie_index_hi_offset); 982 } 983 r |= ((u64)readl(pcie_data_offset) << 32); 984 985 /* clear the high bits */ 986 if (pcie_index_hi != 0) { 987 writel(0, pcie_index_hi_offset); 988 readl(pcie_index_hi_offset); 989 } 990 991 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 992 993 return r; 994 } 995 996 /** 997 * amdgpu_device_indirect_wreg - write an indirect register address 998 * 999 * @adev: amdgpu_device pointer 1000 * @reg_addr: indirect register offset 1001 * @reg_data: indirect register data 1002 * 1003 */ 1004 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 1005 u32 reg_addr, u32 reg_data) 1006 { 1007 unsigned long flags, pcie_index, pcie_data; 1008 void __iomem *pcie_index_offset; 1009 void __iomem *pcie_data_offset; 1010 1011 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1012 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1013 1014 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1015 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1016 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1017 1018 writel(reg_addr, pcie_index_offset); 1019 
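	/* read back the index register to flush the posted write before the
	 * data register below is touched
	 */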
readl(pcie_index_offset); 1020 writel(reg_data, pcie_data_offset); 1021 readl(pcie_data_offset); 1022 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1023 } 1024 1025 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 1026 u64 reg_addr, u32 reg_data) 1027 { 1028 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1029 void __iomem *pcie_index_offset; 1030 void __iomem *pcie_index_hi_offset; 1031 void __iomem *pcie_data_offset; 1032 1033 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1034 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1035 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1036 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1037 else 1038 pcie_index_hi = 0; 1039 1040 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1041 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1042 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1043 if (pcie_index_hi != 0) 1044 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1045 pcie_index_hi * 4; 1046 1047 writel(reg_addr, pcie_index_offset); 1048 readl(pcie_index_offset); 1049 if (pcie_index_hi != 0) { 1050 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1051 readl(pcie_index_hi_offset); 1052 } 1053 writel(reg_data, pcie_data_offset); 1054 readl(pcie_data_offset); 1055 1056 /* clear the high bits */ 1057 if (pcie_index_hi != 0) { 1058 writel(0, pcie_index_hi_offset); 1059 readl(pcie_index_hi_offset); 1060 } 1061 1062 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1063 } 1064 1065 /** 1066 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 1067 * 1068 * @adev: amdgpu_device pointer 1069 * @reg_addr: indirect register offset 1070 * @reg_data: indirect register data 1071 * 1072 */ 1073 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 1074 u32 reg_addr, u64 reg_data) 1075 { 1076 unsigned long flags, pcie_index, pcie_data; 1077 void __iomem *pcie_index_offset; 1078 void __iomem *pcie_data_offset; 1079 1080 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1081 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1082 1083 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1084 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1085 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1086 1087 /* write low 32 bits */ 1088 writel(reg_addr, pcie_index_offset); 1089 readl(pcie_index_offset); 1090 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1091 readl(pcie_data_offset); 1092 /* write high 32 bits */ 1093 writel(reg_addr + 4, pcie_index_offset); 1094 readl(pcie_index_offset); 1095 writel((u32)(reg_data >> 32), pcie_data_offset); 1096 readl(pcie_data_offset); 1097 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1098 } 1099 1100 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1101 u64 reg_addr, u64 reg_data) 1102 { 1103 unsigned long flags, pcie_index, pcie_data; 1104 unsigned long pcie_index_hi = 0; 1105 void __iomem *pcie_index_offset; 1106 void __iomem *pcie_index_hi_offset; 1107 void __iomem *pcie_data_offset; 1108 1109 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1110 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1111 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1112 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1113 1114 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1115 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index 
* 4; 1116 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1117 if (pcie_index_hi != 0) 1118 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1119 pcie_index_hi * 4; 1120 1121 /* write low 32 bits */ 1122 writel(reg_addr, pcie_index_offset); 1123 readl(pcie_index_offset); 1124 if (pcie_index_hi != 0) { 1125 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1126 readl(pcie_index_hi_offset); 1127 } 1128 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1129 readl(pcie_data_offset); 1130 /* write high 32 bits */ 1131 writel(reg_addr + 4, pcie_index_offset); 1132 readl(pcie_index_offset); 1133 if (pcie_index_hi != 0) { 1134 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1135 readl(pcie_index_hi_offset); 1136 } 1137 writel((u32)(reg_data >> 32), pcie_data_offset); 1138 readl(pcie_data_offset); 1139 1140 /* clear the high bits */ 1141 if (pcie_index_hi != 0) { 1142 writel(0, pcie_index_hi_offset); 1143 readl(pcie_index_hi_offset); 1144 } 1145 1146 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1147 } 1148 1149 /** 1150 * amdgpu_device_get_rev_id - query device rev_id 1151 * 1152 * @adev: amdgpu_device pointer 1153 * 1154 * Return device rev_id 1155 */ 1156 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1157 { 1158 return adev->nbio.funcs->get_rev_id(adev); 1159 } 1160 1161 /** 1162 * amdgpu_invalid_rreg - dummy reg read function 1163 * 1164 * @adev: amdgpu_device pointer 1165 * @reg: offset of register 1166 * 1167 * Dummy register read function. Used for register blocks 1168 * that certain asics don't have (all asics). 1169 * Returns the value in the register. 1170 */ 1171 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1172 { 1173 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 1174 BUG(); 1175 return 0; 1176 } 1177 1178 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1179 { 1180 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1181 BUG(); 1182 return 0; 1183 } 1184 1185 /** 1186 * amdgpu_invalid_wreg - dummy reg write function 1187 * 1188 * @adev: amdgpu_device pointer 1189 * @reg: offset of register 1190 * @v: value to write to the register 1191 * 1192 * Dummy register read function. Used for register blocks 1193 * that certain asics don't have (all asics). 1194 */ 1195 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1196 { 1197 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 1198 reg, v); 1199 BUG(); 1200 } 1201 1202 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1203 { 1204 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 1205 reg, v); 1206 BUG(); 1207 } 1208 1209 /** 1210 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1211 * 1212 * @adev: amdgpu_device pointer 1213 * @reg: offset of register 1214 * 1215 * Dummy register read function. Used for register blocks 1216 * that certain asics don't have (all asics). 1217 * Returns the value in the register. 
1218 */ 1219 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 1220 { 1221 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 1222 BUG(); 1223 return 0; 1224 } 1225 1226 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) 1227 { 1228 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1229 BUG(); 1230 return 0; 1231 } 1232 1233 /** 1234 * amdgpu_invalid_wreg64 - dummy reg write function 1235 * 1236 * @adev: amdgpu_device pointer 1237 * @reg: offset of register 1238 * @v: value to write to the register 1239 * 1240 * Dummy register read function. Used for register blocks 1241 * that certain asics don't have (all asics). 1242 */ 1243 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 1244 { 1245 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 1246 reg, v); 1247 BUG(); 1248 } 1249 1250 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) 1251 { 1252 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", 1253 reg, v); 1254 BUG(); 1255 } 1256 1257 /** 1258 * amdgpu_block_invalid_rreg - dummy reg read function 1259 * 1260 * @adev: amdgpu_device pointer 1261 * @block: offset of instance 1262 * @reg: offset of register 1263 * 1264 * Dummy register read function. Used for register blocks 1265 * that certain asics don't have (all asics). 1266 * Returns the value in the register. 1267 */ 1268 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1269 uint32_t block, uint32_t reg) 1270 { 1271 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 1272 reg, block); 1273 BUG(); 1274 return 0; 1275 } 1276 1277 /** 1278 * amdgpu_block_invalid_wreg - dummy reg write function 1279 * 1280 * @adev: amdgpu_device pointer 1281 * @block: offset of instance 1282 * @reg: offset of register 1283 * @v: value to write to the register 1284 * 1285 * Dummy register read function. Used for register blocks 1286 * that certain asics don't have (all asics). 1287 */ 1288 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1289 uint32_t block, 1290 uint32_t reg, uint32_t v) 1291 { 1292 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1293 reg, block, v); 1294 BUG(); 1295 } 1296 1297 /** 1298 * amdgpu_device_asic_init - Wrapper for atom asic_init 1299 * 1300 * @adev: amdgpu_device pointer 1301 * 1302 * Does any asic specific work and then calls atom asic init. 1303 */ 1304 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1305 { 1306 int ret; 1307 1308 amdgpu_asic_pre_asic_init(adev); 1309 1310 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1311 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 1312 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1313 amdgpu_psp_wait_for_bootloader(adev); 1314 ret = amdgpu_atomfirmware_asic_init(adev, true); 1315 return ret; 1316 } else { 1317 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1318 } 1319 1320 return 0; 1321 } 1322 1323 /** 1324 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1325 * 1326 * @adev: amdgpu_device pointer 1327 * 1328 * Allocates a scratch page of VRAM for use by various things in the 1329 * driver. 
1330 */ 1331 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 1332 { 1333 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 1334 AMDGPU_GEM_DOMAIN_VRAM | 1335 AMDGPU_GEM_DOMAIN_GTT, 1336 &adev->mem_scratch.robj, 1337 &adev->mem_scratch.gpu_addr, 1338 (void **)&adev->mem_scratch.ptr); 1339 } 1340 1341 /** 1342 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 1343 * 1344 * @adev: amdgpu_device pointer 1345 * 1346 * Frees the VRAM scratch page. 1347 */ 1348 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 1349 { 1350 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 1351 } 1352 1353 /** 1354 * amdgpu_device_program_register_sequence - program an array of registers. 1355 * 1356 * @adev: amdgpu_device pointer 1357 * @registers: pointer to the register array 1358 * @array_size: size of the register array 1359 * 1360 * Programs an array or registers with and or masks. 1361 * This is a helper for setting golden registers. 1362 */ 1363 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 1364 const u32 *registers, 1365 const u32 array_size) 1366 { 1367 u32 tmp, reg, and_mask, or_mask; 1368 int i; 1369 1370 if (array_size % 3) 1371 return; 1372 1373 for (i = 0; i < array_size; i += 3) { 1374 reg = registers[i + 0]; 1375 and_mask = registers[i + 1]; 1376 or_mask = registers[i + 2]; 1377 1378 if (and_mask == 0xffffffff) { 1379 tmp = or_mask; 1380 } else { 1381 tmp = RREG32(reg); 1382 tmp &= ~and_mask; 1383 if (adev->family >= AMDGPU_FAMILY_AI) 1384 tmp |= (or_mask & and_mask); 1385 else 1386 tmp |= or_mask; 1387 } 1388 WREG32(reg, tmp); 1389 } 1390 } 1391 1392 /** 1393 * amdgpu_device_pci_config_reset - reset the GPU 1394 * 1395 * @adev: amdgpu_device pointer 1396 * 1397 * Resets the GPU using the pci config reset sequence. 1398 * Only applicable to asics prior to vega10. 1399 */ 1400 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1401 { 1402 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1403 } 1404 1405 /** 1406 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1407 * 1408 * @adev: amdgpu_device pointer 1409 * 1410 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1411 */ 1412 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1413 { 1414 return pci_reset_function(adev->pdev); 1415 } 1416 1417 /* 1418 * amdgpu_device_wb_*() 1419 * Writeback is the method by which the GPU updates special pages in memory 1420 * with the status of certain GPU events (fences, ring pointers,etc.). 1421 */ 1422 1423 /** 1424 * amdgpu_device_wb_fini - Disable Writeback and free memory 1425 * 1426 * @adev: amdgpu_device pointer 1427 * 1428 * Disables Writeback and frees the Writeback memory (all asics). 1429 * Used at driver shutdown. 1430 */ 1431 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1432 { 1433 if (adev->wb.wb_obj) { 1434 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1435 &adev->wb.gpu_addr, 1436 (void **)&adev->wb.wb); 1437 adev->wb.wb_obj = NULL; 1438 } 1439 } 1440 1441 /** 1442 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1443 * 1444 * @adev: amdgpu_device pointer 1445 * 1446 * Initializes writeback and allocates writeback memory (all asics). 1447 * Used at driver startup. 1448 * Returns 0 on success or an -error on failure. 
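 *
 * Each writeback slot is 256 bits (8 dwords); amdgpu_device_wb_get() below
 * hands a slot out as a dword offset into adev->wb.wb[].  Minimal usage
 * sketch:
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           u64 gpu_addr = adev->wb.gpu_addr + wb * 4;
 *           u32 cpu_val  = adev->wb.wb[wb];
 *
 *           amdgpu_device_wb_free(adev, wb);
 *   }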
1449 */ 1450 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1451 { 1452 int r; 1453 1454 if (adev->wb.wb_obj == NULL) { 1455 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1456 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1457 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1458 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1459 (void **)&adev->wb.wb); 1460 if (r) { 1461 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1462 return r; 1463 } 1464 1465 adev->wb.num_wb = AMDGPU_MAX_WB; 1466 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1467 1468 /* clear wb memory */ 1469 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1470 } 1471 1472 return 0; 1473 } 1474 1475 /** 1476 * amdgpu_device_wb_get - Allocate a wb entry 1477 * 1478 * @adev: amdgpu_device pointer 1479 * @wb: wb index 1480 * 1481 * Allocate a wb slot for use by the driver (all asics). 1482 * Returns 0 on success or -EINVAL on failure. 1483 */ 1484 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1485 { 1486 unsigned long flags, offset; 1487 1488 spin_lock_irqsave(&adev->wb.lock, flags); 1489 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1490 if (offset < adev->wb.num_wb) { 1491 __set_bit(offset, adev->wb.used); 1492 spin_unlock_irqrestore(&adev->wb.lock, flags); 1493 *wb = offset << 3; /* convert to dw offset */ 1494 return 0; 1495 } else { 1496 spin_unlock_irqrestore(&adev->wb.lock, flags); 1497 return -EINVAL; 1498 } 1499 } 1500 1501 /** 1502 * amdgpu_device_wb_free - Free a wb entry 1503 * 1504 * @adev: amdgpu_device pointer 1505 * @wb: wb index 1506 * 1507 * Free a wb slot allocated for use by the driver (all asics) 1508 */ 1509 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1510 { 1511 unsigned long flags; 1512 1513 wb >>= 3; 1514 spin_lock_irqsave(&adev->wb.lock, flags); 1515 if (wb < adev->wb.num_wb) 1516 __clear_bit(wb, adev->wb.used); 1517 spin_unlock_irqrestore(&adev->wb.lock, flags); 1518 } 1519 1520 /** 1521 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1522 * 1523 * @adev: amdgpu_device pointer 1524 * 1525 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1526 * to fail, but if any of the BARs is not accessible after the size we abort 1527 * driver loading by returning -ENODEV. 
1528 */ 1529 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1530 { 1531 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1532 struct pci_bus *root; 1533 struct resource *res; 1534 unsigned int i; 1535 u16 cmd; 1536 int r; 1537 1538 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1539 return 0; 1540 1541 /* Bypass for VF */ 1542 if (amdgpu_sriov_vf(adev)) 1543 return 0; 1544 1545 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1546 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1547 DRM_WARN("System can't access extended configuration space, please check!!\n"); 1548 1549 /* skip if the bios has already enabled large BAR */ 1550 if (adev->gmc.real_vram_size && 1551 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1552 return 0; 1553 1554 /* Check if the root BUS has 64bit memory resources */ 1555 root = adev->pdev->bus; 1556 while (root->parent) 1557 root = root->parent; 1558 1559 pci_bus_for_each_resource(root, res, i) { 1560 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1561 res->start > 0x100000000ull) 1562 break; 1563 } 1564 1565 /* Trying to resize is pointless without a root hub window above 4GB */ 1566 if (!res) 1567 return 0; 1568 1569 /* Limit the BAR size to what is available */ 1570 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1571 rbar_size); 1572 1573 /* Disable memory decoding while we change the BAR addresses and size */ 1574 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1575 pci_write_config_word(adev->pdev, PCI_COMMAND, 1576 cmd & ~PCI_COMMAND_MEMORY); 1577 1578 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1579 amdgpu_doorbell_fini(adev); 1580 if (adev->asic_type >= CHIP_BONAIRE) 1581 pci_release_resource(adev->pdev, 2); 1582 1583 pci_release_resource(adev->pdev, 0); 1584 1585 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1586 if (r == -ENOSPC) 1587 DRM_INFO("Not enough PCI address space for a large BAR."); 1588 else if (r && r != -ENOTSUPP) 1589 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1590 1591 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1592 1593 /* When the doorbell or fb BAR isn't available we have no chance of 1594 * using the device. 1595 */ 1596 r = amdgpu_doorbell_init(adev); 1597 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1598 return -ENODEV; 1599 1600 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1601 1602 return 0; 1603 } 1604 1605 static bool amdgpu_device_read_bios(struct amdgpu_device *adev) 1606 { 1607 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1608 return false; 1609 1610 return true; 1611 } 1612 1613 /* 1614 * GPU helpers function. 1615 */ 1616 /** 1617 * amdgpu_device_need_post - check if the hw need post or not 1618 * 1619 * @adev: amdgpu_device pointer 1620 * 1621 * Check if the asic has been initialized (all asics) at driver startup 1622 * or post is needed if hw reset is performed. 1623 * Returns true if need or false if not. 
1624 */ 1625 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1626 { 1627 uint32_t reg; 1628 1629 if (amdgpu_sriov_vf(adev)) 1630 return false; 1631 1632 if (!amdgpu_device_read_bios(adev)) 1633 return false; 1634 1635 if (amdgpu_passthrough(adev)) { 1636 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1637 * some old smc fw still need driver do vPost otherwise gpu hang, while 1638 * those smc fw version above 22.15 doesn't have this flaw, so we force 1639 * vpost executed for smc version below 22.15 1640 */ 1641 if (adev->asic_type == CHIP_FIJI) { 1642 int err; 1643 uint32_t fw_ver; 1644 1645 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1646 /* force vPost if error occured */ 1647 if (err) 1648 return true; 1649 1650 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1651 release_firmware(adev->pm.fw); 1652 if (fw_ver < 0x00160e00) 1653 return true; 1654 } 1655 } 1656 1657 /* Don't post if we need to reset whole hive on init */ 1658 if (adev->gmc.xgmi.pending_reset) 1659 return false; 1660 1661 if (adev->has_hw_reset) { 1662 adev->has_hw_reset = false; 1663 return true; 1664 } 1665 1666 /* bios scratch used on CIK+ */ 1667 if (adev->asic_type >= CHIP_BONAIRE) 1668 return amdgpu_atombios_scratch_need_asic_init(adev); 1669 1670 /* check MEM_SIZE for older asics */ 1671 reg = amdgpu_asic_get_config_memsize(adev); 1672 1673 if ((reg != 0) && (reg != 0xffffffff)) 1674 return false; 1675 1676 return true; 1677 } 1678 1679 /* 1680 * Check whether seamless boot is supported. 1681 * 1682 * So far we only support seamless boot on DCE 3.0 or later. 1683 * If users report that it works on older ASICS as well, we may 1684 * loosen this. 1685 */ 1686 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1687 { 1688 switch (amdgpu_seamless) { 1689 case -1: 1690 break; 1691 case 1: 1692 return true; 1693 case 0: 1694 return false; 1695 default: 1696 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n", 1697 amdgpu_seamless); 1698 return false; 1699 } 1700 1701 if (!(adev->flags & AMD_IS_APU)) 1702 return false; 1703 1704 if (adev->mman.keep_stolen_vga_memory) 1705 return false; 1706 1707 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1708 } 1709 1710 /* 1711 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1712 * don't support dynamic speed switching. Until we have confirmation from Intel 1713 * that a specific host supports it, it's safer that we keep it disabled for all. 1714 * 1715 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1716 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1717 */ 1718 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1719 { 1720 #if IS_ENABLED(CONFIG_X86) 1721 struct cpuinfo_x86 *c = &cpu_data(0); 1722 1723 /* eGPU change speeds based on USB4 fabric conditions */ 1724 if (dev_is_removable(adev->dev)) 1725 return true; 1726 1727 if (c->x86_vendor == X86_VENDOR_INTEL) 1728 return false; 1729 #endif 1730 return true; 1731 } 1732 1733 /** 1734 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1735 * 1736 * @adev: amdgpu_device pointer 1737 * 1738 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1739 * be set for this device. 1740 * 1741 * Returns true if it should be used or false if not. 
1742 */ 1743 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1744 { 1745 switch (amdgpu_aspm) { 1746 case -1: 1747 break; 1748 case 0: 1749 return false; 1750 case 1: 1751 return true; 1752 default: 1753 return false; 1754 } 1755 if (adev->flags & AMD_IS_APU) 1756 return false; 1757 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) 1758 return false; 1759 return pcie_aspm_enabled(adev->pdev); 1760 } 1761 1762 /* if we get transitioned to only one device, take VGA back */ 1763 /** 1764 * amdgpu_device_vga_set_decode - enable/disable vga decode 1765 * 1766 * @pdev: PCI device pointer 1767 * @state: enable/disable vga decode 1768 * 1769 * Enable/disable vga decode (all asics). 1770 * Returns VGA resource flags. 1771 */ 1772 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1773 bool state) 1774 { 1775 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1776 1777 amdgpu_asic_set_vga_state(adev, state); 1778 if (state) 1779 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1780 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1781 else 1782 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1783 } 1784 1785 /** 1786 * amdgpu_device_check_block_size - validate the vm block size 1787 * 1788 * @adev: amdgpu_device pointer 1789 * 1790 * Validates the vm block size specified via module parameter. 1791 * The vm block size defines number of bits in page table versus page directory, 1792 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1793 * page table and the remaining bits are in the page directory. 1794 */ 1795 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1796 { 1797 /* defines number of bits in page table versus page directory, 1798 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1799 * page table and the remaining bits are in the page directory 1800 */ 1801 if (amdgpu_vm_block_size == -1) 1802 return; 1803 1804 if (amdgpu_vm_block_size < 9) { 1805 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1806 amdgpu_vm_block_size); 1807 amdgpu_vm_block_size = -1; 1808 } 1809 } 1810 1811 /** 1812 * amdgpu_device_check_vm_size - validate the vm size 1813 * 1814 * @adev: amdgpu_device pointer 1815 * 1816 * Validates the vm size in GB specified via module parameter. 1817 * The VM size is the size of the GPU virtual memory space in GB. 
1818 */ 1819 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1820 { 1821 /* no need to check the default value */ 1822 if (amdgpu_vm_size == -1) 1823 return; 1824 1825 if (amdgpu_vm_size < 1) { 1826 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1827 amdgpu_vm_size); 1828 amdgpu_vm_size = -1; 1829 } 1830 } 1831 1832 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1833 { 1834 struct sysinfo si; 1835 bool is_os_64 = (sizeof(void *) == 8); 1836 uint64_t total_memory; 1837 uint64_t dram_size_seven_GB = 0x1B8000000; 1838 uint64_t dram_size_three_GB = 0xB8000000; 1839 1840 if (amdgpu_smu_memory_pool_size == 0) 1841 return; 1842 1843 if (!is_os_64) { 1844 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1845 goto def_value; 1846 } 1847 si_meminfo(&si); 1848 total_memory = (uint64_t)si.totalram * si.mem_unit; 1849 1850 if ((amdgpu_smu_memory_pool_size == 1) || 1851 (amdgpu_smu_memory_pool_size == 2)) { 1852 if (total_memory < dram_size_three_GB) 1853 goto def_value1; 1854 } else if ((amdgpu_smu_memory_pool_size == 4) || 1855 (amdgpu_smu_memory_pool_size == 8)) { 1856 if (total_memory < dram_size_seven_GB) 1857 goto def_value1; 1858 } else { 1859 DRM_WARN("Smu memory pool size not supported\n"); 1860 goto def_value; 1861 } 1862 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1863 1864 return; 1865 1866 def_value1: 1867 DRM_WARN("No enough system memory\n"); 1868 def_value: 1869 adev->pm.smu_prv_buffer_size = 0; 1870 } 1871 1872 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1873 { 1874 if (!(adev->flags & AMD_IS_APU) || 1875 adev->asic_type < CHIP_RAVEN) 1876 return 0; 1877 1878 switch (adev->asic_type) { 1879 case CHIP_RAVEN: 1880 if (adev->pdev->device == 0x15dd) 1881 adev->apu_flags |= AMD_APU_IS_RAVEN; 1882 if (adev->pdev->device == 0x15d8) 1883 adev->apu_flags |= AMD_APU_IS_PICASSO; 1884 break; 1885 case CHIP_RENOIR: 1886 if ((adev->pdev->device == 0x1636) || 1887 (adev->pdev->device == 0x164c)) 1888 adev->apu_flags |= AMD_APU_IS_RENOIR; 1889 else 1890 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1891 break; 1892 case CHIP_VANGOGH: 1893 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1894 break; 1895 case CHIP_YELLOW_CARP: 1896 break; 1897 case CHIP_CYAN_SKILLFISH: 1898 if ((adev->pdev->device == 0x13FE) || 1899 (adev->pdev->device == 0x143F)) 1900 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1901 break; 1902 default: 1903 break; 1904 } 1905 1906 return 0; 1907 } 1908 1909 /** 1910 * amdgpu_device_check_arguments - validate module params 1911 * 1912 * @adev: amdgpu_device pointer 1913 * 1914 * Validates certain module parameters and updates 1915 * the associated values used by the driver (all asics). 
1916 */ 1917 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1918 { 1919 if (amdgpu_sched_jobs < 4) { 1920 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1921 amdgpu_sched_jobs); 1922 amdgpu_sched_jobs = 4; 1923 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1924 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1925 amdgpu_sched_jobs); 1926 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1927 } 1928 1929 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1930 /* gart size must be greater or equal to 32M */ 1931 dev_warn(adev->dev, "gart size (%d) too small\n", 1932 amdgpu_gart_size); 1933 amdgpu_gart_size = -1; 1934 } 1935 1936 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1937 /* gtt size must be greater or equal to 32M */ 1938 dev_warn(adev->dev, "gtt size (%d) too small\n", 1939 amdgpu_gtt_size); 1940 amdgpu_gtt_size = -1; 1941 } 1942 1943 /* valid range is between 4 and 9 inclusive */ 1944 if (amdgpu_vm_fragment_size != -1 && 1945 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1946 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1947 amdgpu_vm_fragment_size = -1; 1948 } 1949 1950 if (amdgpu_sched_hw_submission < 2) { 1951 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1952 amdgpu_sched_hw_submission); 1953 amdgpu_sched_hw_submission = 2; 1954 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1955 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1956 amdgpu_sched_hw_submission); 1957 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1958 } 1959 1960 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1961 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1962 amdgpu_reset_method = -1; 1963 } 1964 1965 amdgpu_device_check_smu_prv_buffer_size(adev); 1966 1967 amdgpu_device_check_vm_size(adev); 1968 1969 amdgpu_device_check_block_size(adev); 1970 1971 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1972 1973 return 0; 1974 } 1975 1976 /** 1977 * amdgpu_switcheroo_set_state - set switcheroo state 1978 * 1979 * @pdev: pci dev pointer 1980 * @state: vga_switcheroo state 1981 * 1982 * Callback for the switcheroo driver. Suspends or resumes 1983 * the asics before or after it is powered up using ACPI methods. 
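 *
 * In outline (see the handler below): switching the GPU on moves it to PCI
 * D0, restores the cached PCI state, re-enables the device and resumes it;
 * switching it off suspends the device, caches the PCI state and drops the
 * device into D3cold.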
1984 */ 1985 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1986 enum vga_switcheroo_state state) 1987 { 1988 struct drm_device *dev = pci_get_drvdata(pdev); 1989 int r; 1990 1991 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1992 return; 1993 1994 if (state == VGA_SWITCHEROO_ON) { 1995 pr_info("switched on\n"); 1996 /* don't suspend or resume card normally */ 1997 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1998 1999 pci_set_power_state(pdev, PCI_D0); 2000 amdgpu_device_load_pci_state(pdev); 2001 r = pci_enable_device(pdev); 2002 if (r) 2003 DRM_WARN("pci_enable_device failed (%d)\n", r); 2004 amdgpu_device_resume(dev, true); 2005 2006 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2007 } else { 2008 pr_info("switched off\n"); 2009 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2010 amdgpu_device_prepare(dev); 2011 amdgpu_device_suspend(dev, true); 2012 amdgpu_device_cache_pci_state(pdev); 2013 /* Shut down the device */ 2014 pci_disable_device(pdev); 2015 pci_set_power_state(pdev, PCI_D3cold); 2016 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2017 } 2018 } 2019 2020 /** 2021 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2022 * 2023 * @pdev: pci dev pointer 2024 * 2025 * Callback for the switcheroo driver. Check of the switcheroo 2026 * state can be changed. 2027 * Returns true if the state can be changed, false if not. 2028 */ 2029 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2030 { 2031 struct drm_device *dev = pci_get_drvdata(pdev); 2032 2033 /* 2034 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2035 * locking inversion with the driver load path. And the access here is 2036 * completely racy anyway. So don't bother with locking for now. 2037 */ 2038 return atomic_read(&dev->open_count) == 0; 2039 } 2040 2041 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2042 .set_gpu_state = amdgpu_switcheroo_set_state, 2043 .reprobe = NULL, 2044 .can_switch = amdgpu_switcheroo_can_switch, 2045 }; 2046 2047 /** 2048 * amdgpu_device_ip_set_clockgating_state - set the CG state 2049 * 2050 * @dev: amdgpu_device pointer 2051 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2052 * @state: clockgating state (gate or ungate) 2053 * 2054 * Sets the requested clockgating state for all instances of 2055 * the hardware IP specified. 2056 * Returns the error code from the last instance. 2057 */ 2058 int amdgpu_device_ip_set_clockgating_state(void *dev, 2059 enum amd_ip_block_type block_type, 2060 enum amd_clockgating_state state) 2061 { 2062 struct amdgpu_device *adev = dev; 2063 int i, r = 0; 2064 2065 for (i = 0; i < adev->num_ip_blocks; i++) { 2066 if (!adev->ip_blocks[i].status.valid) 2067 continue; 2068 if (adev->ip_blocks[i].version->type != block_type) 2069 continue; 2070 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2071 continue; 2072 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2073 (void *)adev, state); 2074 if (r) 2075 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2076 adev->ip_blocks[i].version->funcs->name, r); 2077 } 2078 return r; 2079 } 2080 2081 /** 2082 * amdgpu_device_ip_set_powergating_state - set the PG state 2083 * 2084 * @dev: amdgpu_device pointer 2085 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2086 * @state: powergating state (gate or ungate) 2087 * 2088 * Sets the requested powergating state for all instances of 2089 * the hardware IP specified. 
2090 * Returns the error code from the last instance. 2091 */ 2092 int amdgpu_device_ip_set_powergating_state(void *dev, 2093 enum amd_ip_block_type block_type, 2094 enum amd_powergating_state state) 2095 { 2096 struct amdgpu_device *adev = dev; 2097 int i, r = 0; 2098 2099 for (i = 0; i < adev->num_ip_blocks; i++) { 2100 if (!adev->ip_blocks[i].status.valid) 2101 continue; 2102 if (adev->ip_blocks[i].version->type != block_type) 2103 continue; 2104 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2105 continue; 2106 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2107 (void *)adev, state); 2108 if (r) 2109 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2110 adev->ip_blocks[i].version->funcs->name, r); 2111 } 2112 return r; 2113 } 2114 2115 /** 2116 * amdgpu_device_ip_get_clockgating_state - get the CG state 2117 * 2118 * @adev: amdgpu_device pointer 2119 * @flags: clockgating feature flags 2120 * 2121 * Walks the list of IPs on the device and updates the clockgating 2122 * flags for each IP. 2123 * Updates @flags with the feature flags for each hardware IP where 2124 * clockgating is enabled. 2125 */ 2126 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2127 u64 *flags) 2128 { 2129 int i; 2130 2131 for (i = 0; i < adev->num_ip_blocks; i++) { 2132 if (!adev->ip_blocks[i].status.valid) 2133 continue; 2134 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2135 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 2136 } 2137 } 2138 2139 /** 2140 * amdgpu_device_ip_wait_for_idle - wait for idle 2141 * 2142 * @adev: amdgpu_device pointer 2143 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2144 * 2145 * Waits for the request hardware IP to be idle. 2146 * Returns 0 for success or a negative error code on failure. 2147 */ 2148 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2149 enum amd_ip_block_type block_type) 2150 { 2151 int i, r; 2152 2153 for (i = 0; i < adev->num_ip_blocks; i++) { 2154 if (!adev->ip_blocks[i].status.valid) 2155 continue; 2156 if (adev->ip_blocks[i].version->type == block_type) { 2157 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 2158 if (r) 2159 return r; 2160 break; 2161 } 2162 } 2163 return 0; 2164 2165 } 2166 2167 /** 2168 * amdgpu_device_ip_is_idle - is the hardware IP idle 2169 * 2170 * @adev: amdgpu_device pointer 2171 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2172 * 2173 * Check if the hardware IP is idle or not. 2174 * Returns true if it the IP is idle, false if not. 2175 */ 2176 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 2177 enum amd_ip_block_type block_type) 2178 { 2179 int i; 2180 2181 for (i = 0; i < adev->num_ip_blocks; i++) { 2182 if (!adev->ip_blocks[i].status.valid) 2183 continue; 2184 if (adev->ip_blocks[i].version->type == block_type) 2185 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 2186 } 2187 return true; 2188 2189 } 2190 2191 /** 2192 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2193 * 2194 * @adev: amdgpu_device pointer 2195 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2196 * 2197 * Returns a pointer to the hardware IP block structure 2198 * if it exists for the asic, otherwise NULL. 
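 *
 * A minimal usage sketch (hypothetical caller, shown only to illustrate the
 * lookup; the gfx_block name is made up):
 *
 *   struct amdgpu_ip_block *gfx_block =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *   if (gfx_block)
 *           DRM_INFO("GFX IP v%u.%u\n", gfx_block->version->major,
 *                    gfx_block->version->minor);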
2199 */ 2200 struct amdgpu_ip_block * 2201 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2202 enum amd_ip_block_type type) 2203 { 2204 int i; 2205 2206 for (i = 0; i < adev->num_ip_blocks; i++) 2207 if (adev->ip_blocks[i].version->type == type) 2208 return &adev->ip_blocks[i]; 2209 2210 return NULL; 2211 } 2212 2213 /** 2214 * amdgpu_device_ip_block_version_cmp 2215 * 2216 * @adev: amdgpu_device pointer 2217 * @type: enum amd_ip_block_type 2218 * @major: major version 2219 * @minor: minor version 2220 * 2221 * return 0 if equal or greater 2222 * return 1 if smaller or the ip_block doesn't exist 2223 */ 2224 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2225 enum amd_ip_block_type type, 2226 u32 major, u32 minor) 2227 { 2228 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2229 2230 if (ip_block && ((ip_block->version->major > major) || 2231 ((ip_block->version->major == major) && 2232 (ip_block->version->minor >= minor)))) 2233 return 0; 2234 2235 return 1; 2236 } 2237 2238 /** 2239 * amdgpu_device_ip_block_add 2240 * 2241 * @adev: amdgpu_device pointer 2242 * @ip_block_version: pointer to the IP to add 2243 * 2244 * Adds the IP block driver information to the collection of IPs 2245 * on the asic. 2246 */ 2247 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2248 const struct amdgpu_ip_block_version *ip_block_version) 2249 { 2250 if (!ip_block_version) 2251 return -EINVAL; 2252 2253 switch (ip_block_version->type) { 2254 case AMD_IP_BLOCK_TYPE_VCN: 2255 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2256 return 0; 2257 break; 2258 case AMD_IP_BLOCK_TYPE_JPEG: 2259 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2260 return 0; 2261 break; 2262 default: 2263 break; 2264 } 2265 2266 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 2267 ip_block_version->funcs->name); 2268 2269 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2270 2271 return 0; 2272 } 2273 2274 /** 2275 * amdgpu_device_enable_virtual_display - enable virtual display feature 2276 * 2277 * @adev: amdgpu_device pointer 2278 * 2279 * Enabled the virtual display feature if the user has enabled it via 2280 * the module parameter virtual_display. This feature provides a virtual 2281 * display hardware on headless boards or in virtualized environments. 2282 * This function parses and validates the configuration string specified by 2283 * the user and configues the virtual display configuration (number of 2284 * virtual connectors, crtcs, etc.) specified. 
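 *
 * The expected format, as inferred from the parsing below, is a list of
 * "<pci-bus-id>[,<num_crtc>]" entries separated by ';', for example
 * (hypothetical addresses):
 *
 *   amdgpu.virtual_display=0000:04:00.0,2;0000:05:00.0,1
 *
 * The literal "all" enables it on every device, and num_crtc is clamped to
 * the 1..6 range (defaulting to 1 when omitted or malformed).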
2285 */ 2286 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2287 { 2288 adev->enable_virtual_display = false; 2289 2290 if (amdgpu_virtual_display) { 2291 const char *pci_address_name = pci_name(adev->pdev); 2292 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2293 2294 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2295 pciaddstr_tmp = pciaddstr; 2296 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2297 pciaddname = strsep(&pciaddname_tmp, ","); 2298 if (!strcmp("all", pciaddname) 2299 || !strcmp(pci_address_name, pciaddname)) { 2300 long num_crtc; 2301 int res = -1; 2302 2303 adev->enable_virtual_display = true; 2304 2305 if (pciaddname_tmp) 2306 res = kstrtol(pciaddname_tmp, 10, 2307 &num_crtc); 2308 2309 if (!res) { 2310 if (num_crtc < 1) 2311 num_crtc = 1; 2312 if (num_crtc > 6) 2313 num_crtc = 6; 2314 adev->mode_info.num_crtc = num_crtc; 2315 } else { 2316 adev->mode_info.num_crtc = 1; 2317 } 2318 break; 2319 } 2320 } 2321 2322 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2323 amdgpu_virtual_display, pci_address_name, 2324 adev->enable_virtual_display, adev->mode_info.num_crtc); 2325 2326 kfree(pciaddstr); 2327 } 2328 } 2329 2330 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2331 { 2332 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2333 adev->mode_info.num_crtc = 1; 2334 adev->enable_virtual_display = true; 2335 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2336 adev->enable_virtual_display, adev->mode_info.num_crtc); 2337 } 2338 } 2339 2340 /** 2341 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2342 * 2343 * @adev: amdgpu_device pointer 2344 * 2345 * Parses the asic configuration parameters specified in the gpu info 2346 * firmware and makes them availale to the driver for use in configuring 2347 * the asic. 2348 * Returns 0 on success, -EINVAL on failure. 2349 */ 2350 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2351 { 2352 const char *chip_name; 2353 int err; 2354 const struct gpu_info_firmware_header_v1_0 *hdr; 2355 2356 adev->firmware.gpu_info_fw = NULL; 2357 2358 if (adev->mman.discovery_bin) 2359 return 0; 2360 2361 switch (adev->asic_type) { 2362 default: 2363 return 0; 2364 case CHIP_VEGA10: 2365 chip_name = "vega10"; 2366 break; 2367 case CHIP_VEGA12: 2368 chip_name = "vega12"; 2369 break; 2370 case CHIP_RAVEN: 2371 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2372 chip_name = "raven2"; 2373 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2374 chip_name = "picasso"; 2375 else 2376 chip_name = "raven"; 2377 break; 2378 case CHIP_ARCTURUS: 2379 chip_name = "arcturus"; 2380 break; 2381 case CHIP_NAVI12: 2382 chip_name = "navi12"; 2383 break; 2384 } 2385 2386 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2387 "amdgpu/%s_gpu_info.bin", chip_name); 2388 if (err) { 2389 dev_err(adev->dev, 2390 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2391 chip_name); 2392 goto out; 2393 } 2394 2395 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2396 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2397 2398 switch (hdr->version_major) { 2399 case 1: 2400 { 2401 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2402 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2403 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2404 2405 /* 2406 * Should be droped when DAL no longer needs it. 
 */
		if (adev->asic_type == CHIP_NAVI12)
			goto parse_soc_bounding_box;

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
					le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * The SOC bounding box info is not integrated into the discovery
		 * table, so we always need to parse it from the gpu info firmware
		 * when needed.
		 */
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
					le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}

/**
 * amdgpu_device_ip_early_init - run early init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered and each IP's early_init callback is run. This
 * is the first stage in initializing the asic.
 * Returns 0 on success, negative error code on failure.
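 *
 * As a rough outline of the steps below: the ASIC family and its list of IP
 * blocks are selected first, each block's early_init callback is run, and the
 * VBIOS/ATOM tables are read once the COMMON block has set up the asic_funcs.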
2471 */ 2472 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2473 { 2474 struct pci_dev *parent; 2475 int i, r; 2476 bool total; 2477 2478 amdgpu_device_enable_virtual_display(adev); 2479 2480 if (amdgpu_sriov_vf(adev)) { 2481 r = amdgpu_virt_request_full_gpu(adev, true); 2482 if (r) 2483 return r; 2484 } 2485 2486 switch (adev->asic_type) { 2487 #ifdef CONFIG_DRM_AMDGPU_SI 2488 case CHIP_VERDE: 2489 case CHIP_TAHITI: 2490 case CHIP_PITCAIRN: 2491 case CHIP_OLAND: 2492 case CHIP_HAINAN: 2493 adev->family = AMDGPU_FAMILY_SI; 2494 r = si_set_ip_blocks(adev); 2495 if (r) 2496 return r; 2497 break; 2498 #endif 2499 #ifdef CONFIG_DRM_AMDGPU_CIK 2500 case CHIP_BONAIRE: 2501 case CHIP_HAWAII: 2502 case CHIP_KAVERI: 2503 case CHIP_KABINI: 2504 case CHIP_MULLINS: 2505 if (adev->flags & AMD_IS_APU) 2506 adev->family = AMDGPU_FAMILY_KV; 2507 else 2508 adev->family = AMDGPU_FAMILY_CI; 2509 2510 r = cik_set_ip_blocks(adev); 2511 if (r) 2512 return r; 2513 break; 2514 #endif 2515 case CHIP_TOPAZ: 2516 case CHIP_TONGA: 2517 case CHIP_FIJI: 2518 case CHIP_POLARIS10: 2519 case CHIP_POLARIS11: 2520 case CHIP_POLARIS12: 2521 case CHIP_VEGAM: 2522 case CHIP_CARRIZO: 2523 case CHIP_STONEY: 2524 if (adev->flags & AMD_IS_APU) 2525 adev->family = AMDGPU_FAMILY_CZ; 2526 else 2527 adev->family = AMDGPU_FAMILY_VI; 2528 2529 r = vi_set_ip_blocks(adev); 2530 if (r) 2531 return r; 2532 break; 2533 default: 2534 r = amdgpu_discovery_set_ip_blocks(adev); 2535 if (r) 2536 return r; 2537 break; 2538 } 2539 2540 if (amdgpu_has_atpx() && 2541 (amdgpu_is_atpx_hybrid() || 2542 amdgpu_has_atpx_dgpu_power_cntl()) && 2543 ((adev->flags & AMD_IS_APU) == 0) && 2544 !dev_is_removable(&adev->pdev->dev)) 2545 adev->flags |= AMD_IS_PX; 2546 2547 if (!(adev->flags & AMD_IS_APU)) { 2548 parent = pcie_find_root_port(adev->pdev); 2549 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2550 } 2551 2552 2553 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2554 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2555 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2556 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2557 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2558 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2559 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2560 2561 total = true; 2562 for (i = 0; i < adev->num_ip_blocks; i++) { 2563 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2564 DRM_WARN("disabled ip block: %d <%s>\n", 2565 i, adev->ip_blocks[i].version->funcs->name); 2566 adev->ip_blocks[i].status.valid = false; 2567 } else { 2568 if (adev->ip_blocks[i].version->funcs->early_init) { 2569 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2570 if (r == -ENOENT) { 2571 adev->ip_blocks[i].status.valid = false; 2572 } else if (r) { 2573 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2574 adev->ip_blocks[i].version->funcs->name, r); 2575 total = false; 2576 } else { 2577 adev->ip_blocks[i].status.valid = true; 2578 } 2579 } else { 2580 adev->ip_blocks[i].status.valid = true; 2581 } 2582 } 2583 /* get the vbios after the asic_funcs are set up */ 2584 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2585 r = amdgpu_device_parse_gpu_info_fw(adev); 2586 if (r) 2587 return r; 2588 2589 /* Read BIOS */ 2590 if (amdgpu_device_read_bios(adev)) { 2591 if (!amdgpu_get_bios(adev)) 2592 return -EINVAL; 2593 2594 r = amdgpu_atombios_init(adev); 2595 if (r) { 2596 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2597 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2598 return r; 2599 } 2600 } 2601 2602 /*get pf2vf msg info at it's earliest time*/ 2603 if (amdgpu_sriov_vf(adev)) 2604 amdgpu_virt_init_data_exchange(adev); 2605 2606 } 2607 } 2608 if (!total) 2609 return -ENODEV; 2610 2611 amdgpu_amdkfd_device_probe(adev); 2612 adev->cg_flags &= amdgpu_cg_mask; 2613 adev->pg_flags &= amdgpu_pg_mask; 2614 2615 return 0; 2616 } 2617 2618 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2619 { 2620 int i, r; 2621 2622 for (i = 0; i < adev->num_ip_blocks; i++) { 2623 if (!adev->ip_blocks[i].status.sw) 2624 continue; 2625 if (adev->ip_blocks[i].status.hw) 2626 continue; 2627 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2628 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2629 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2630 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2631 if (r) { 2632 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2633 adev->ip_blocks[i].version->funcs->name, r); 2634 return r; 2635 } 2636 adev->ip_blocks[i].status.hw = true; 2637 } 2638 } 2639 2640 return 0; 2641 } 2642 2643 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2644 { 2645 int i, r; 2646 2647 for (i = 0; i < adev->num_ip_blocks; i++) { 2648 if (!adev->ip_blocks[i].status.sw) 2649 continue; 2650 if (adev->ip_blocks[i].status.hw) 2651 continue; 2652 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2653 if (r) { 2654 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2655 adev->ip_blocks[i].version->funcs->name, r); 2656 return r; 2657 } 2658 adev->ip_blocks[i].status.hw = true; 2659 } 2660 2661 return 0; 2662 } 2663 2664 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2665 { 2666 int r = 0; 2667 int i; 2668 uint32_t 
smu_version; 2669 2670 if (adev->asic_type >= CHIP_VEGA10) { 2671 for (i = 0; i < adev->num_ip_blocks; i++) { 2672 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2673 continue; 2674 2675 if (!adev->ip_blocks[i].status.sw) 2676 continue; 2677 2678 /* no need to do the fw loading again if already done*/ 2679 if (adev->ip_blocks[i].status.hw == true) 2680 break; 2681 2682 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2683 r = adev->ip_blocks[i].version->funcs->resume(adev); 2684 if (r) { 2685 DRM_ERROR("resume of IP block <%s> failed %d\n", 2686 adev->ip_blocks[i].version->funcs->name, r); 2687 return r; 2688 } 2689 } else { 2690 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2691 if (r) { 2692 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2693 adev->ip_blocks[i].version->funcs->name, r); 2694 return r; 2695 } 2696 } 2697 2698 adev->ip_blocks[i].status.hw = true; 2699 break; 2700 } 2701 } 2702 2703 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2704 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2705 2706 return r; 2707 } 2708 2709 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2710 { 2711 long timeout; 2712 int r, i; 2713 2714 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2715 struct amdgpu_ring *ring = adev->rings[i]; 2716 2717 /* No need to setup the GPU scheduler for rings that don't need it */ 2718 if (!ring || ring->no_scheduler) 2719 continue; 2720 2721 switch (ring->funcs->type) { 2722 case AMDGPU_RING_TYPE_GFX: 2723 timeout = adev->gfx_timeout; 2724 break; 2725 case AMDGPU_RING_TYPE_COMPUTE: 2726 timeout = adev->compute_timeout; 2727 break; 2728 case AMDGPU_RING_TYPE_SDMA: 2729 timeout = adev->sdma_timeout; 2730 break; 2731 default: 2732 timeout = adev->video_timeout; 2733 break; 2734 } 2735 2736 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, 2737 DRM_SCHED_PRIORITY_COUNT, 2738 ring->num_hw_submission, 0, 2739 timeout, adev->reset_domain->wq, 2740 ring->sched_score, ring->name, 2741 adev->dev); 2742 if (r) { 2743 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2744 ring->name); 2745 return r; 2746 } 2747 r = amdgpu_uvd_entity_init(adev, ring); 2748 if (r) { 2749 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2750 ring->name); 2751 return r; 2752 } 2753 r = amdgpu_vce_entity_init(adev, ring); 2754 if (r) { 2755 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2756 ring->name); 2757 return r; 2758 } 2759 } 2760 2761 amdgpu_xcp_update_partition_sched_list(adev); 2762 2763 return 0; 2764 } 2765 2766 2767 /** 2768 * amdgpu_device_ip_init - run init for hardware IPs 2769 * 2770 * @adev: amdgpu_device pointer 2771 * 2772 * Main initialization pass for hardware IPs. The list of all the hardware 2773 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2774 * are run. sw_init initializes the software state associated with each IP 2775 * and hw_init initializes the hardware associated with each IP. 2776 * Returns 0 on success, negative error code on failure. 
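 *
 * In practice the ordering below is roughly: sw_init for every block, early
 * hw_init for COMMON and GMC (so GPU memory can be allocated), IB pool and
 * ucode buffer creation, phase 1 hw_init (COMMON/IH, plus PSP for SR-IOV),
 * firmware loading, and finally phase 2 hw_init for the remaining blocks.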
2777 */ 2778 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2779 { 2780 int i, r; 2781 2782 r = amdgpu_ras_init(adev); 2783 if (r) 2784 return r; 2785 2786 for (i = 0; i < adev->num_ip_blocks; i++) { 2787 if (!adev->ip_blocks[i].status.valid) 2788 continue; 2789 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2790 if (r) { 2791 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2792 adev->ip_blocks[i].version->funcs->name, r); 2793 goto init_failed; 2794 } 2795 adev->ip_blocks[i].status.sw = true; 2796 2797 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2798 /* need to do common hw init early so everything is set up for gmc */ 2799 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2800 if (r) { 2801 DRM_ERROR("hw_init %d failed %d\n", i, r); 2802 goto init_failed; 2803 } 2804 adev->ip_blocks[i].status.hw = true; 2805 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2806 /* need to do gmc hw init early so we can allocate gpu mem */ 2807 /* Try to reserve bad pages early */ 2808 if (amdgpu_sriov_vf(adev)) 2809 amdgpu_virt_exchange_data(adev); 2810 2811 r = amdgpu_device_mem_scratch_init(adev); 2812 if (r) { 2813 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2814 goto init_failed; 2815 } 2816 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2817 if (r) { 2818 DRM_ERROR("hw_init %d failed %d\n", i, r); 2819 goto init_failed; 2820 } 2821 r = amdgpu_device_wb_init(adev); 2822 if (r) { 2823 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2824 goto init_failed; 2825 } 2826 adev->ip_blocks[i].status.hw = true; 2827 2828 /* right after GMC hw init, we create CSA */ 2829 if (adev->gfx.mcbp) { 2830 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2831 AMDGPU_GEM_DOMAIN_VRAM | 2832 AMDGPU_GEM_DOMAIN_GTT, 2833 AMDGPU_CSA_SIZE); 2834 if (r) { 2835 DRM_ERROR("allocate CSA failed %d\n", r); 2836 goto init_failed; 2837 } 2838 } 2839 2840 r = amdgpu_seq64_init(adev); 2841 if (r) { 2842 DRM_ERROR("allocate seq64 failed %d\n", r); 2843 goto init_failed; 2844 } 2845 } 2846 } 2847 2848 if (amdgpu_sriov_vf(adev)) 2849 amdgpu_virt_init_data_exchange(adev); 2850 2851 r = amdgpu_ib_pool_init(adev); 2852 if (r) { 2853 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2854 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2855 goto init_failed; 2856 } 2857 2858 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2859 if (r) 2860 goto init_failed; 2861 2862 r = amdgpu_device_ip_hw_init_phase1(adev); 2863 if (r) 2864 goto init_failed; 2865 2866 r = amdgpu_device_fw_loading(adev); 2867 if (r) 2868 goto init_failed; 2869 2870 r = amdgpu_device_ip_hw_init_phase2(adev); 2871 if (r) 2872 goto init_failed; 2873 2874 /* 2875 * retired pages will be loaded from eeprom and reserved here, 2876 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2877 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2878 * for I2C communication which only true at this point. 2879 * 2880 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2881 * failure from bad gpu situation and stop amdgpu init process 2882 * accordingly. For other failed cases, it will still release all 2883 * the resource and print error message, rather than returning one 2884 * negative value to upper level. 
2885 * 2886 * Note: theoretically, this should be called before all vram allocations 2887 * to protect retired page from abusing 2888 */ 2889 r = amdgpu_ras_recovery_init(adev); 2890 if (r) 2891 goto init_failed; 2892 2893 /** 2894 * In case of XGMI grab extra reference for reset domain for this device 2895 */ 2896 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2897 if (amdgpu_xgmi_add_device(adev) == 0) { 2898 if (!amdgpu_sriov_vf(adev)) { 2899 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2900 2901 if (WARN_ON(!hive)) { 2902 r = -ENOENT; 2903 goto init_failed; 2904 } 2905 2906 if (!hive->reset_domain || 2907 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2908 r = -ENOENT; 2909 amdgpu_put_xgmi_hive(hive); 2910 goto init_failed; 2911 } 2912 2913 /* Drop the early temporary reset domain we created for device */ 2914 amdgpu_reset_put_reset_domain(adev->reset_domain); 2915 adev->reset_domain = hive->reset_domain; 2916 amdgpu_put_xgmi_hive(hive); 2917 } 2918 } 2919 } 2920 2921 r = amdgpu_device_init_schedulers(adev); 2922 if (r) 2923 goto init_failed; 2924 2925 if (adev->mman.buffer_funcs_ring->sched.ready) 2926 amdgpu_ttm_set_buffer_funcs_status(adev, true); 2927 2928 /* Don't init kfd if whole hive need to be reset during init */ 2929 if (!adev->gmc.xgmi.pending_reset) { 2930 kgd2kfd_init_zone_device(adev); 2931 amdgpu_amdkfd_device_init(adev); 2932 } 2933 2934 amdgpu_fru_get_product_info(adev); 2935 2936 init_failed: 2937 2938 return r; 2939 } 2940 2941 /** 2942 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2943 * 2944 * @adev: amdgpu_device pointer 2945 * 2946 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2947 * this function before a GPU reset. If the value is retained after a 2948 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2949 */ 2950 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2951 { 2952 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2953 } 2954 2955 /** 2956 * amdgpu_device_check_vram_lost - check if vram is valid 2957 * 2958 * @adev: amdgpu_device pointer 2959 * 2960 * Checks the reset magic value written to the gart pointer in VRAM. 2961 * The driver calls this after a GPU reset to see if the contents of 2962 * VRAM is lost or now. 2963 * returns true if vram is lost, false if not. 2964 */ 2965 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2966 { 2967 if (memcmp(adev->gart.ptr, adev->reset_magic, 2968 AMDGPU_RESET_MAGIC_NUM)) 2969 return true; 2970 2971 if (!amdgpu_in_reset(adev)) 2972 return false; 2973 2974 /* 2975 * For all ASICs with baco/mode1 reset, the VRAM is 2976 * always assumed to be lost. 2977 */ 2978 switch (amdgpu_asic_reset_method(adev)) { 2979 case AMD_RESET_METHOD_BACO: 2980 case AMD_RESET_METHOD_MODE1: 2981 return true; 2982 default: 2983 return false; 2984 } 2985 } 2986 2987 /** 2988 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2989 * 2990 * @adev: amdgpu_device pointer 2991 * @state: clockgating state (gate or ungate) 2992 * 2993 * The list of all the hardware IPs that make up the asic is walked and the 2994 * set_clockgating_state callbacks are run. 2995 * Late initialization pass enabling clockgating for hardware IPs. 2996 * Fini or suspend, pass disabling clockgating for hardware IPs. 2997 * Returns 0 on success, negative error code on failure. 
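 *
 * For reference, within this file the gate direction follows the lifecycle:
 * late init calls this with AMD_CG_STATE_GATE, while the early-fini and
 * suspend paths call it with AMD_CG_STATE_UNGATE before touching the
 * hardware.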
2998 */ 2999 3000 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3001 enum amd_clockgating_state state) 3002 { 3003 int i, j, r; 3004 3005 if (amdgpu_emu_mode == 1) 3006 return 0; 3007 3008 for (j = 0; j < adev->num_ip_blocks; j++) { 3009 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3010 if (!adev->ip_blocks[i].status.late_initialized) 3011 continue; 3012 /* skip CG for GFX, SDMA on S0ix */ 3013 if (adev->in_s0ix && 3014 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3015 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3016 continue; 3017 /* skip CG for VCE/UVD, it's handled specially */ 3018 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3019 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3020 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3021 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3022 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3023 /* enable clockgating to save power */ 3024 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 3025 state); 3026 if (r) { 3027 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3028 adev->ip_blocks[i].version->funcs->name, r); 3029 return r; 3030 } 3031 } 3032 } 3033 3034 return 0; 3035 } 3036 3037 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3038 enum amd_powergating_state state) 3039 { 3040 int i, j, r; 3041 3042 if (amdgpu_emu_mode == 1) 3043 return 0; 3044 3045 for (j = 0; j < adev->num_ip_blocks; j++) { 3046 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3047 if (!adev->ip_blocks[i].status.late_initialized) 3048 continue; 3049 /* skip PG for GFX, SDMA on S0ix */ 3050 if (adev->in_s0ix && 3051 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3052 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3053 continue; 3054 /* skip CG for VCE/UVD, it's handled specially */ 3055 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3056 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3057 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3058 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3059 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3060 /* enable powergating to save power */ 3061 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 3062 state); 3063 if (r) { 3064 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 3065 adev->ip_blocks[i].version->funcs->name, r); 3066 return r; 3067 } 3068 } 3069 } 3070 return 0; 3071 } 3072 3073 static int amdgpu_device_enable_mgpu_fan_boost(void) 3074 { 3075 struct amdgpu_gpu_instance *gpu_ins; 3076 struct amdgpu_device *adev; 3077 int i, ret = 0; 3078 3079 mutex_lock(&mgpu_info.mutex); 3080 3081 /* 3082 * MGPU fan boost feature should be enabled 3083 * only when there are two or more dGPUs in 3084 * the system 3085 */ 3086 if (mgpu_info.num_dgpu < 2) 3087 goto out; 3088 3089 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3090 gpu_ins = &(mgpu_info.gpu_ins[i]); 3091 adev = gpu_ins->adev; 3092 if (!(adev->flags & AMD_IS_APU) && 3093 !gpu_ins->mgpu_fan_enabled) { 3094 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3095 if (ret) 3096 break; 3097 3098 gpu_ins->mgpu_fan_enabled = 1; 3099 } 3100 } 3101 3102 out: 3103 mutex_unlock(&mgpu_info.mutex); 3104 3105 return ret; 3106 } 3107 3108 /** 3109 * amdgpu_device_ip_late_init - run late init for hardware IPs 3110 * 3111 * @adev: 
amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of them have been initialized, or anything that needs to happen
 * late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	r = amdgpu_ras_late_init(adev);
	if (r) {
		DRM_ERROR("amdgpu_ras_late_init failed %d", r);
		return r;
	}

	amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* For passthrough configuration on arcturus and aldebaran, enable special handling of SBR */
	if (amdgpu_passthrough(adev) &&
	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
	     adev->asic_type == CHIP_ALDEBARAN))
		amdgpu_dpm_handle_passthrough_sbr(adev, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset device p-state to low as this was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, the number of devices in a hive is not known in
		 * advance; it is counted one by one as the devices initialize.
		 *
		 * So we wait until all XGMI-interlinked devices have initialized.
		 * This may add some delay, as those devices may come from
		 * different hives. But that should be OK.
3177 */ 3178 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3179 for (i = 0; i < mgpu_info.num_gpu; i++) { 3180 gpu_instance = &(mgpu_info.gpu_ins[i]); 3181 if (gpu_instance->adev->flags & AMD_IS_APU) 3182 continue; 3183 3184 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3185 AMDGPU_XGMI_PSTATE_MIN); 3186 if (r) { 3187 DRM_ERROR("pstate setting failed (%d).\n", r); 3188 break; 3189 } 3190 } 3191 } 3192 3193 mutex_unlock(&mgpu_info.mutex); 3194 } 3195 3196 return 0; 3197 } 3198 3199 /** 3200 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3201 * 3202 * @adev: amdgpu_device pointer 3203 * 3204 * For ASICs need to disable SMC first 3205 */ 3206 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3207 { 3208 int i, r; 3209 3210 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3211 return; 3212 3213 for (i = 0; i < adev->num_ip_blocks; i++) { 3214 if (!adev->ip_blocks[i].status.hw) 3215 continue; 3216 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3217 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3218 /* XXX handle errors */ 3219 if (r) { 3220 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3221 adev->ip_blocks[i].version->funcs->name, r); 3222 } 3223 adev->ip_blocks[i].status.hw = false; 3224 break; 3225 } 3226 } 3227 } 3228 3229 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3230 { 3231 int i, r; 3232 3233 for (i = 0; i < adev->num_ip_blocks; i++) { 3234 if (!adev->ip_blocks[i].version->funcs->early_fini) 3235 continue; 3236 3237 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 3238 if (r) { 3239 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3240 adev->ip_blocks[i].version->funcs->name, r); 3241 } 3242 } 3243 3244 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3245 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3246 3247 amdgpu_amdkfd_suspend(adev, false); 3248 3249 /* Workaroud for ASICs need to disable SMC first */ 3250 amdgpu_device_smu_fini_early(adev); 3251 3252 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3253 if (!adev->ip_blocks[i].status.hw) 3254 continue; 3255 3256 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3257 /* XXX handle errors */ 3258 if (r) { 3259 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3260 adev->ip_blocks[i].version->funcs->name, r); 3261 } 3262 3263 adev->ip_blocks[i].status.hw = false; 3264 } 3265 3266 if (amdgpu_sriov_vf(adev)) { 3267 if (amdgpu_virt_release_full_gpu(adev, false)) 3268 DRM_ERROR("failed to release exclusive mode on fini\n"); 3269 } 3270 3271 return 0; 3272 } 3273 3274 /** 3275 * amdgpu_device_ip_fini - run fini for hardware IPs 3276 * 3277 * @adev: amdgpu_device pointer 3278 * 3279 * Main teardown pass for hardware IPs. The list of all the hardware 3280 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3281 * are run. hw_fini tears down the hardware associated with each IP 3282 * and sw_fini tears down any software state associated with each IP. 3283 * Returns 0 on success, negative error code on failure. 
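 *
 * As in the loops below, teardown walks the IP list in reverse of the
 * initialization order, so blocks that depend on others are shut down first.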
3284 */ 3285 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3286 { 3287 int i, r; 3288 3289 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3290 amdgpu_virt_release_ras_err_handler_data(adev); 3291 3292 if (adev->gmc.xgmi.num_physical_nodes > 1) 3293 amdgpu_xgmi_remove_device(adev); 3294 3295 amdgpu_amdkfd_device_fini_sw(adev); 3296 3297 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3298 if (!adev->ip_blocks[i].status.sw) 3299 continue; 3300 3301 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3302 amdgpu_ucode_free_bo(adev); 3303 amdgpu_free_static_csa(&adev->virt.csa_obj); 3304 amdgpu_device_wb_fini(adev); 3305 amdgpu_device_mem_scratch_fini(adev); 3306 amdgpu_ib_pool_fini(adev); 3307 amdgpu_seq64_fini(adev); 3308 } 3309 3310 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 3311 /* XXX handle errors */ 3312 if (r) { 3313 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3314 adev->ip_blocks[i].version->funcs->name, r); 3315 } 3316 adev->ip_blocks[i].status.sw = false; 3317 adev->ip_blocks[i].status.valid = false; 3318 } 3319 3320 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3321 if (!adev->ip_blocks[i].status.late_initialized) 3322 continue; 3323 if (adev->ip_blocks[i].version->funcs->late_fini) 3324 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 3325 adev->ip_blocks[i].status.late_initialized = false; 3326 } 3327 3328 amdgpu_ras_fini(adev); 3329 3330 return 0; 3331 } 3332 3333 /** 3334 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3335 * 3336 * @work: work_struct. 3337 */ 3338 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3339 { 3340 struct amdgpu_device *adev = 3341 container_of(work, struct amdgpu_device, delayed_init_work.work); 3342 int r; 3343 3344 r = amdgpu_ib_ring_tests(adev); 3345 if (r) 3346 DRM_ERROR("ib ring test failed (%d).\n", r); 3347 } 3348 3349 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3350 { 3351 struct amdgpu_device *adev = 3352 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3353 3354 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3355 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3356 3357 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3358 adev->gfx.gfx_off_state = true; 3359 } 3360 3361 /** 3362 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3363 * 3364 * @adev: amdgpu_device pointer 3365 * 3366 * Main suspend function for hardware IPs. The list of all the hardware 3367 * IPs that make up the asic is walked, clockgating is disabled and the 3368 * suspend callbacks are run. suspend puts the hardware and software state 3369 * in each IP into a state suitable for suspend. 3370 * Returns 0 on success, negative error code on failure. 3371 */ 3372 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3373 { 3374 int i, r; 3375 3376 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3377 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3378 3379 /* 3380 * Per PMFW team's suggestion, driver needs to handle gfxoff 3381 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3382 * scenario. Add the missing df cstate disablement here. 
3383 */ 3384 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3385 dev_warn(adev->dev, "Failed to disallow df cstate"); 3386 3387 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3388 if (!adev->ip_blocks[i].status.valid) 3389 continue; 3390 3391 /* displays are handled separately */ 3392 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3393 continue; 3394 3395 /* XXX handle errors */ 3396 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3397 /* XXX handle errors */ 3398 if (r) { 3399 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3400 adev->ip_blocks[i].version->funcs->name, r); 3401 return r; 3402 } 3403 3404 adev->ip_blocks[i].status.hw = false; 3405 } 3406 3407 return 0; 3408 } 3409 3410 /** 3411 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3412 * 3413 * @adev: amdgpu_device pointer 3414 * 3415 * Main suspend function for hardware IPs. The list of all the hardware 3416 * IPs that make up the asic is walked, clockgating is disabled and the 3417 * suspend callbacks are run. suspend puts the hardware and software state 3418 * in each IP into a state suitable for suspend. 3419 * Returns 0 on success, negative error code on failure. 3420 */ 3421 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3422 { 3423 int i, r; 3424 3425 if (adev->in_s0ix) 3426 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3427 3428 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3429 if (!adev->ip_blocks[i].status.valid) 3430 continue; 3431 /* displays are handled in phase1 */ 3432 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3433 continue; 3434 /* PSP lost connection when err_event_athub occurs */ 3435 if (amdgpu_ras_intr_triggered() && 3436 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3437 adev->ip_blocks[i].status.hw = false; 3438 continue; 3439 } 3440 3441 /* skip unnecessary suspend if we do not initialize them yet */ 3442 if (adev->gmc.xgmi.pending_reset && 3443 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3444 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3445 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3446 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3447 adev->ip_blocks[i].status.hw = false; 3448 continue; 3449 } 3450 3451 /* skip suspend of gfx/mes and psp for S0ix 3452 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3453 * like at runtime. PSP is also part of the always on hardware 3454 * so no need to suspend it. 3455 */ 3456 if (adev->in_s0ix && 3457 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3458 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3459 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3460 continue; 3461 3462 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3463 if (adev->in_s0ix && 3464 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3465 IP_VERSION(5, 0, 0)) && 3466 (adev->ip_blocks[i].version->type == 3467 AMD_IP_BLOCK_TYPE_SDMA)) 3468 continue; 3469 3470 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3471 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3472 * from this location and RLC Autoload automatically also gets loaded 3473 * from here based on PMFW -> PSP message during re-init sequence. 3474 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3475 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3476 */ 3477 if (amdgpu_in_reset(adev) && 3478 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3479 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3480 continue; 3481 3482 /* XXX handle errors */ 3483 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3484 /* XXX handle errors */ 3485 if (r) { 3486 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3487 adev->ip_blocks[i].version->funcs->name, r); 3488 } 3489 adev->ip_blocks[i].status.hw = false; 3490 /* handle putting the SMC in the appropriate state */ 3491 if (!amdgpu_sriov_vf(adev)) { 3492 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3493 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3494 if (r) { 3495 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3496 adev->mp1_state, r); 3497 return r; 3498 } 3499 } 3500 } 3501 } 3502 3503 return 0; 3504 } 3505 3506 /** 3507 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3508 * 3509 * @adev: amdgpu_device pointer 3510 * 3511 * Main suspend function for hardware IPs. The list of all the hardware 3512 * IPs that make up the asic is walked, clockgating is disabled and the 3513 * suspend callbacks are run. suspend puts the hardware and software state 3514 * in each IP into a state suitable for suspend. 3515 * Returns 0 on success, negative error code on failure. 3516 */ 3517 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3518 { 3519 int r; 3520 3521 if (amdgpu_sriov_vf(adev)) { 3522 amdgpu_virt_fini_data_exchange(adev); 3523 amdgpu_virt_request_full_gpu(adev, false); 3524 } 3525 3526 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3527 3528 r = amdgpu_device_ip_suspend_phase1(adev); 3529 if (r) 3530 return r; 3531 r = amdgpu_device_ip_suspend_phase2(adev); 3532 3533 if (amdgpu_sriov_vf(adev)) 3534 amdgpu_virt_release_full_gpu(adev, false); 3535 3536 return r; 3537 } 3538 3539 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3540 { 3541 int i, r; 3542 3543 static enum amd_ip_block_type ip_order[] = { 3544 AMD_IP_BLOCK_TYPE_COMMON, 3545 AMD_IP_BLOCK_TYPE_GMC, 3546 AMD_IP_BLOCK_TYPE_PSP, 3547 AMD_IP_BLOCK_TYPE_IH, 3548 }; 3549 3550 for (i = 0; i < adev->num_ip_blocks; i++) { 3551 int j; 3552 struct amdgpu_ip_block *block; 3553 3554 block = &adev->ip_blocks[i]; 3555 block->status.hw = false; 3556 3557 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3558 3559 if (block->version->type != ip_order[j] || 3560 !block->status.valid) 3561 continue; 3562 3563 r = block->version->funcs->hw_init(adev); 3564 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3565 if (r) 3566 return r; 3567 block->status.hw = true; 3568 } 3569 } 3570 3571 return 0; 3572 } 3573 3574 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3575 { 3576 int i, r; 3577 3578 static enum amd_ip_block_type ip_order[] = { 3579 AMD_IP_BLOCK_TYPE_SMC, 3580 AMD_IP_BLOCK_TYPE_DCE, 3581 AMD_IP_BLOCK_TYPE_GFX, 3582 AMD_IP_BLOCK_TYPE_SDMA, 3583 AMD_IP_BLOCK_TYPE_MES, 3584 AMD_IP_BLOCK_TYPE_UVD, 3585 AMD_IP_BLOCK_TYPE_VCE, 3586 AMD_IP_BLOCK_TYPE_VCN, 3587 AMD_IP_BLOCK_TYPE_JPEG 3588 }; 3589 3590 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3591 int j; 3592 struct amdgpu_ip_block *block; 3593 3594 for (j = 0; j < adev->num_ip_blocks; j++) { 3595 block = &adev->ip_blocks[j]; 3596 3597 if (block->version->type != ip_order[i] || 3598 !block->status.valid || 3599 block->status.hw) 3600 continue; 3601 3602 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3603 r = block->version->funcs->resume(adev); 3604 else 
3605 r = block->version->funcs->hw_init(adev); 3606 3607 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3608 if (r) 3609 return r; 3610 block->status.hw = true; 3611 } 3612 } 3613 3614 return 0; 3615 } 3616 3617 /** 3618 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3619 * 3620 * @adev: amdgpu_device pointer 3621 * 3622 * First resume function for hardware IPs. The list of all the hardware 3623 * IPs that make up the asic is walked and the resume callbacks are run for 3624 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3625 * after a suspend and updates the software state as necessary. This 3626 * function is also used for restoring the GPU after a GPU reset. 3627 * Returns 0 on success, negative error code on failure. 3628 */ 3629 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3630 { 3631 int i, r; 3632 3633 for (i = 0; i < adev->num_ip_blocks; i++) { 3634 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3635 continue; 3636 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3637 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3638 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3639 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3640 3641 r = adev->ip_blocks[i].version->funcs->resume(adev); 3642 if (r) { 3643 DRM_ERROR("resume of IP block <%s> failed %d\n", 3644 adev->ip_blocks[i].version->funcs->name, r); 3645 return r; 3646 } 3647 adev->ip_blocks[i].status.hw = true; 3648 } 3649 } 3650 3651 return 0; 3652 } 3653 3654 /** 3655 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3656 * 3657 * @adev: amdgpu_device pointer 3658 * 3659 * First resume function for hardware IPs. The list of all the hardware 3660 * IPs that make up the asic is walked and the resume callbacks are run for 3661 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3662 * functional state after a suspend and updates the software state as 3663 * necessary. This function is also used for restoring the GPU after a GPU 3664 * reset. 3665 * Returns 0 on success, negative error code on failure. 3666 */ 3667 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3668 { 3669 int i, r; 3670 3671 for (i = 0; i < adev->num_ip_blocks; i++) { 3672 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3673 continue; 3674 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3675 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3676 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3677 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3678 continue; 3679 r = adev->ip_blocks[i].version->funcs->resume(adev); 3680 if (r) { 3681 DRM_ERROR("resume of IP block <%s> failed %d\n", 3682 adev->ip_blocks[i].version->funcs->name, r); 3683 return r; 3684 } 3685 adev->ip_blocks[i].status.hw = true; 3686 } 3687 3688 return 0; 3689 } 3690 3691 /** 3692 * amdgpu_device_ip_resume - run resume for hardware IPs 3693 * 3694 * @adev: amdgpu_device pointer 3695 * 3696 * Main resume function for hardware IPs. The hardware IPs 3697 * are split into two resume functions because they are 3698 * also used in recovering from a GPU reset and some additional 3699 * steps need to be take between them. In this case (S3/S4) they are 3700 * run sequentially. 3701 * Returns 0 on success, negative error code on failure. 
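 *
 * Concretely, the sequence below is: phase 1 resumes COMMON, GMC and IH (plus
 * PSP when running as an SR-IOV VF), the required firmware is then reloaded,
 * and phase 2 resumes the remaining blocks before the TTM buffer functions
 * are re-enabled.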
3702 */ 3703 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3704 { 3705 int r; 3706 3707 r = amdgpu_device_ip_resume_phase1(adev); 3708 if (r) 3709 return r; 3710 3711 r = amdgpu_device_fw_loading(adev); 3712 if (r) 3713 return r; 3714 3715 r = amdgpu_device_ip_resume_phase2(adev); 3716 3717 if (adev->mman.buffer_funcs_ring->sched.ready) 3718 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3719 3720 return r; 3721 } 3722 3723 /** 3724 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3725 * 3726 * @adev: amdgpu_device pointer 3727 * 3728 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3729 */ 3730 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3731 { 3732 if (amdgpu_sriov_vf(adev)) { 3733 if (adev->is_atom_fw) { 3734 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3735 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3736 } else { 3737 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3738 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3739 } 3740 3741 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3742 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3743 } 3744 } 3745 3746 /** 3747 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3748 * 3749 * @asic_type: AMD asic type 3750 * 3751 * Check if there is DC (new modesetting infrastructre) support for an asic. 3752 * returns true if DC has support, false if not. 3753 */ 3754 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3755 { 3756 switch (asic_type) { 3757 #ifdef CONFIG_DRM_AMDGPU_SI 3758 case CHIP_HAINAN: 3759 #endif 3760 case CHIP_TOPAZ: 3761 /* chips with no display hardware */ 3762 return false; 3763 #if defined(CONFIG_DRM_AMD_DC) 3764 case CHIP_TAHITI: 3765 case CHIP_PITCAIRN: 3766 case CHIP_VERDE: 3767 case CHIP_OLAND: 3768 /* 3769 * We have systems in the wild with these ASICs that require 3770 * LVDS and VGA support which is not supported with DC. 3771 * 3772 * Fallback to the non-DC driver here by default so as not to 3773 * cause regressions. 3774 */ 3775 #if defined(CONFIG_DRM_AMD_DC_SI) 3776 return amdgpu_dc > 0; 3777 #else 3778 return false; 3779 #endif 3780 case CHIP_BONAIRE: 3781 case CHIP_KAVERI: 3782 case CHIP_KABINI: 3783 case CHIP_MULLINS: 3784 /* 3785 * We have systems in the wild with these ASICs that require 3786 * VGA support which is not supported with DC. 3787 * 3788 * Fallback to the non-DC driver here by default so as not to 3789 * cause regressions. 
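 *
 * (In effect, DC is only used on these chips when it is explicitly requested,
 * e.g. by booting with amdgpu.dc=1.)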
3790 */ 3791 return amdgpu_dc > 0; 3792 default: 3793 return amdgpu_dc != 0; 3794 #else 3795 default: 3796 if (amdgpu_dc > 0) 3797 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3798 return false; 3799 #endif 3800 } 3801 } 3802 3803 /** 3804 * amdgpu_device_has_dc_support - check if dc is supported 3805 * 3806 * @adev: amdgpu_device pointer 3807 * 3808 * Returns true for supported, false for not supported 3809 */ 3810 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3811 { 3812 if (adev->enable_virtual_display || 3813 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3814 return false; 3815 3816 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3817 } 3818 3819 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3820 { 3821 struct amdgpu_device *adev = 3822 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3823 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3824 3825 /* It's a bug to not have a hive within this function */ 3826 if (WARN_ON(!hive)) 3827 return; 3828 3829 /* 3830 * Use task barrier to synchronize all xgmi reset works across the 3831 * hive. task_barrier_enter and task_barrier_exit will block 3832 * until all the threads running the xgmi reset works reach 3833 * those points. task_barrier_full will do both blocks. 3834 */ 3835 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3836 3837 task_barrier_enter(&hive->tb); 3838 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3839 3840 if (adev->asic_reset_res) 3841 goto fail; 3842 3843 task_barrier_exit(&hive->tb); 3844 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3845 3846 if (adev->asic_reset_res) 3847 goto fail; 3848 3849 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3850 } else { 3851 3852 task_barrier_full(&hive->tb); 3853 adev->asic_reset_res = amdgpu_asic_reset(adev); 3854 } 3855 3856 fail: 3857 if (adev->asic_reset_res) 3858 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3859 adev->asic_reset_res, adev_to_drm(adev)->unique); 3860 amdgpu_put_xgmi_hive(hive); 3861 } 3862 3863 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3864 { 3865 char *input = amdgpu_lockup_timeout; 3866 char *timeout_setting = NULL; 3867 int index = 0; 3868 long timeout; 3869 int ret = 0; 3870 3871 /* 3872 * By default timeout for non compute jobs is 10000 3873 * and 60000 for compute jobs. 3874 * In SR-IOV or passthrough mode, timeout for compute 3875 * jobs are 60000 by default. 3876 */ 3877 adev->gfx_timeout = msecs_to_jiffies(10000); 3878 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3879 if (amdgpu_sriov_vf(adev)) 3880 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3881 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3882 else 3883 adev->compute_timeout = msecs_to_jiffies(60000); 3884 3885 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3886 while ((timeout_setting = strsep(&input, ",")) && 3887 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3888 ret = kstrtol(timeout_setting, 0, &timeout); 3889 if (ret) 3890 return ret; 3891 3892 if (timeout == 0) { 3893 index++; 3894 continue; 3895 } else if (timeout < 0) { 3896 timeout = MAX_SCHEDULE_TIMEOUT; 3897 dev_warn(adev->dev, "lockup timeout disabled"); 3898 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3899 } else { 3900 timeout = msecs_to_jiffies(timeout); 3901 } 3902 3903 switch (index++) { 3904 case 0: 3905 adev->gfx_timeout = timeout; 3906 break; 3907 case 1: 3908 adev->compute_timeout = timeout; 3909 break; 3910 case 2: 3911 adev->sdma_timeout = timeout; 3912 break; 3913 case 3: 3914 adev->video_timeout = timeout; 3915 break; 3916 default: 3917 break; 3918 } 3919 } 3920 /* 3921 * There is only one value specified and 3922 * it should apply to all non-compute jobs. 3923 */ 3924 if (index == 1) { 3925 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3926 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3927 adev->compute_timeout = adev->gfx_timeout; 3928 } 3929 } 3930 3931 return ret; 3932 } 3933 3934 /** 3935 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3936 * 3937 * @adev: amdgpu_device pointer 3938 * 3939 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3940 */ 3941 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3942 { 3943 struct iommu_domain *domain; 3944 3945 domain = iommu_get_domain_for_dev(adev->dev); 3946 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3947 adev->ram_is_direct_mapped = true; 3948 } 3949 3950 static const struct attribute *amdgpu_dev_attributes[] = { 3951 &dev_attr_pcie_replay_count.attr, 3952 NULL 3953 }; 3954 3955 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3956 { 3957 if (amdgpu_mcbp == 1) 3958 adev->gfx.mcbp = true; 3959 else if (amdgpu_mcbp == 0) 3960 adev->gfx.mcbp = false; 3961 3962 if (amdgpu_sriov_vf(adev)) 3963 adev->gfx.mcbp = true; 3964 3965 if (adev->gfx.mcbp) 3966 DRM_INFO("MCBP is enabled\n"); 3967 } 3968 3969 /** 3970 * amdgpu_device_init - initialize the driver 3971 * 3972 * @adev: amdgpu_device pointer 3973 * @flags: driver flags 3974 * 3975 * Initializes the driver info and hw (all asics). 3976 * Returns 0 for success or an error on failure. 3977 * Called at driver startup. 
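 *
 * @flags carries the driver data attached to the matching PCI ID table
 * entry: the ASIC type in the bits covered by AMD_ASIC_MASK plus chip
 * flags such as AMD_IS_APU or AMD_IS_PX.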
3978 */ 3979 int amdgpu_device_init(struct amdgpu_device *adev, 3980 uint32_t flags) 3981 { 3982 struct drm_device *ddev = adev_to_drm(adev); 3983 struct pci_dev *pdev = adev->pdev; 3984 int r, i; 3985 bool px = false; 3986 u32 max_MBps; 3987 int tmp; 3988 3989 adev->shutdown = false; 3990 adev->flags = flags; 3991 3992 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3993 adev->asic_type = amdgpu_force_asic_type; 3994 else 3995 adev->asic_type = flags & AMD_ASIC_MASK; 3996 3997 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3998 if (amdgpu_emu_mode == 1) 3999 adev->usec_timeout *= 10; 4000 adev->gmc.gart_size = 512 * 1024 * 1024; 4001 adev->accel_working = false; 4002 adev->num_rings = 0; 4003 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4004 adev->mman.buffer_funcs = NULL; 4005 adev->mman.buffer_funcs_ring = NULL; 4006 adev->vm_manager.vm_pte_funcs = NULL; 4007 adev->vm_manager.vm_pte_num_scheds = 0; 4008 adev->gmc.gmc_funcs = NULL; 4009 adev->harvest_ip_mask = 0x0; 4010 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4011 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4012 4013 adev->smc_rreg = &amdgpu_invalid_rreg; 4014 adev->smc_wreg = &amdgpu_invalid_wreg; 4015 adev->pcie_rreg = &amdgpu_invalid_rreg; 4016 adev->pcie_wreg = &amdgpu_invalid_wreg; 4017 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4018 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4019 adev->pciep_rreg = &amdgpu_invalid_rreg; 4020 adev->pciep_wreg = &amdgpu_invalid_wreg; 4021 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4022 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4023 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4024 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4025 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4026 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4027 adev->didt_rreg = &amdgpu_invalid_rreg; 4028 adev->didt_wreg = &amdgpu_invalid_wreg; 4029 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4030 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4031 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4032 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4033 4034 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4035 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4036 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4037 4038 /* mutex initialization are all done here so we 4039 * can recall function without having locking issues 4040 */ 4041 mutex_init(&adev->firmware.mutex); 4042 mutex_init(&adev->pm.mutex); 4043 mutex_init(&adev->gfx.gpu_clock_mutex); 4044 mutex_init(&adev->srbm_mutex); 4045 mutex_init(&adev->gfx.pipe_reserve_mutex); 4046 mutex_init(&adev->gfx.gfx_off_mutex); 4047 mutex_init(&adev->gfx.partition_mutex); 4048 mutex_init(&adev->grbm_idx_mutex); 4049 mutex_init(&adev->mn_lock); 4050 mutex_init(&adev->virt.vf_errors.lock); 4051 mutex_init(&adev->virt.rlcg_reg_lock); 4052 hash_init(adev->mn_hash); 4053 mutex_init(&adev->psp.mutex); 4054 mutex_init(&adev->notifier_lock); 4055 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4056 mutex_init(&adev->benchmark_mutex); 4057 4058 amdgpu_device_init_apu_flags(adev); 4059 4060 r = amdgpu_device_check_arguments(adev); 4061 if (r) 4062 return r; 4063 4064 spin_lock_init(&adev->mmio_idx_lock); 4065 spin_lock_init(&adev->smc_idx_lock); 4066 spin_lock_init(&adev->pcie_idx_lock); 4067 spin_lock_init(&adev->uvd_ctx_idx_lock); 4068 spin_lock_init(&adev->didt_idx_lock); 4069 spin_lock_init(&adev->gc_cac_idx_lock); 
        spin_lock_init(&adev->se_cac_idx_lock);
        spin_lock_init(&adev->audio_endpt_idx_lock);
        spin_lock_init(&adev->mm_stats.lock);
        spin_lock_init(&adev->wb.lock);

        INIT_LIST_HEAD(&adev->shadow_list);
        mutex_init(&adev->shadow_list_lock);

        INIT_LIST_HEAD(&adev->reset_list);

        INIT_LIST_HEAD(&adev->ras_list);

        INIT_LIST_HEAD(&adev->pm.od_kobj_list);

        INIT_DELAYED_WORK(&adev->delayed_init_work,
                          amdgpu_device_delayed_init_work_handler);
        INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
                          amdgpu_device_delay_enable_gfx_off);

        INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

        adev->gfx.gfx_off_req_count = 1;
        adev->gfx.gfx_off_residency = 0;
        adev->gfx.gfx_off_entrycount = 0;
        adev->pm.ac_power = power_supply_is_system_supplied() > 0;

        atomic_set(&adev->throttling_logging_enabled, 1);
        /*
         * If throttling continues, logging will be performed every minute
         * to avoid log flooding. "-1" is subtracted since the thermal
         * throttling interrupt comes every second. Thus, the total logging
         * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
         * for the throttling interrupt) = 60 seconds.
         */
        ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
        ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
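        /*
         * Usage sketch (illustrative -- the actual call sites live in the
         * SMU/power-management code, not in this file): consumers of this
         * state gate their throttling warnings on it so that at most one
         * message per interval is emitted, roughly:
         *
         *	if (__ratelimit(&adev->throttling_logging_rs))
         *		dev_warn(adev->dev, "GPU thermally throttled\n");
         */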
        /* Registers mapping */
        /* TODO: block userspace mapping of io register */
        if (adev->asic_type >= CHIP_BONAIRE) {
                adev->rmmio_base = pci_resource_start(adev->pdev, 5);
                adev->rmmio_size = pci_resource_len(adev->pdev, 5);
        } else {
                adev->rmmio_base = pci_resource_start(adev->pdev, 2);
                adev->rmmio_size = pci_resource_len(adev->pdev, 2);
        }

        for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
                atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

        adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
        if (!adev->rmmio)
                return -ENOMEM;

        DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
        DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);

        /*
         * The reset domain needs to be present early, before the XGMI hive is
         * discovered (if any) and initialized, so that the reset semaphore and
         * the in-GPU-reset flag can be used early during init and before any
         * call to RREG32.
         */
        adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
        if (!adev->reset_domain)
                return -ENOMEM;

        /* detect hw virtualization here */
        amdgpu_detect_virtualization(adev);

        amdgpu_device_get_pcie_info(adev);

        r = amdgpu_device_get_job_timeout_settings(adev);
        if (r) {
                dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
                return r;
        }

        amdgpu_device_set_mcbp(adev);

        /* early init functions */
        r = amdgpu_device_ip_early_init(adev);
        if (r)
                return r;

        /* Get rid of things like offb */
        r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
        if (r)
                return r;

        /* Enable TMZ based on IP_VERSION */
        amdgpu_gmc_tmz_set(adev);

        if (amdgpu_sriov_vf(adev) &&
            amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
                /* VF MMIO access (except the mailbox range) from the CPU
                 * will be blocked during SR-IOV runtime
                 */
                adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;

        amdgpu_gmc_noretry_set(adev);
        /* Need to get xgmi info early to decide the reset behavior */
        if (adev->gmc.xgmi.supported) {
                r = adev->gfxhub.funcs->get_xgmi_info(adev);
                if (r)
                        return r;
        }

        /* enable PCIE atomic ops */
        if (amdgpu_sriov_vf(adev)) {
                if (adev->virt.fw_reserve.p_pf2vf)
                        adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
                                adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
                                (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
        /* APUs from gfx9 onwards don't rely on PCIe atomics; their internal
         * path natively supports atomics, so set have_atomics_support to true.
         */
        } else if ((adev->flags & AMD_IS_APU) &&
                   (amdgpu_ip_version(adev, GC_HWIP, 0) >
                    IP_VERSION(9, 0, 0))) {
                adev->have_atomics_support = true;
        } else {
                adev->have_atomics_support =
                        !pci_enable_atomic_ops_to_root(adev->pdev,
                                PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
                                PCI_EXP_DEVCAP2_ATOMIC_COMP64);
        }

        if (!adev->have_atomics_support)
                dev_info(adev->dev, "PCIE atomic ops are not supported\n");

        /* doorbell bar mapping and doorbell index init */
        amdgpu_doorbell_init(adev);

        if (amdgpu_emu_mode == 1) {
                /* post the asic on emulation mode */
                emu_soc_asic_init(adev);
                goto fence_driver_init;
        }

        amdgpu_reset_init(adev);

        /* detect if we are with an SRIOV vbios */
        if (adev->bios)
                amdgpu_device_detect_sriov_bios(adev);

        /* check if we need to reset the asic
         * E.g., driver was not cleanly unloaded previously, etc.
4217 */ 4218 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4219 if (adev->gmc.xgmi.num_physical_nodes) { 4220 dev_info(adev->dev, "Pending hive reset.\n"); 4221 adev->gmc.xgmi.pending_reset = true; 4222 /* Only need to init necessary block for SMU to handle the reset */ 4223 for (i = 0; i < adev->num_ip_blocks; i++) { 4224 if (!adev->ip_blocks[i].status.valid) 4225 continue; 4226 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4227 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4228 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4229 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 4230 DRM_DEBUG("IP %s disabled for hw_init.\n", 4231 adev->ip_blocks[i].version->funcs->name); 4232 adev->ip_blocks[i].status.hw = true; 4233 } 4234 } 4235 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4236 !amdgpu_device_has_display_hardware(adev)) { 4237 r = psp_gpu_reset(adev); 4238 } else { 4239 tmp = amdgpu_reset_method; 4240 /* It should do a default reset when loading or reloading the driver, 4241 * regardless of the module parameter reset_method. 4242 */ 4243 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4244 r = amdgpu_asic_reset(adev); 4245 amdgpu_reset_method = tmp; 4246 } 4247 4248 if (r) { 4249 dev_err(adev->dev, "asic reset on init failed\n"); 4250 goto failed; 4251 } 4252 } 4253 4254 /* Post card if necessary */ 4255 if (amdgpu_device_need_post(adev)) { 4256 if (!adev->bios) { 4257 dev_err(adev->dev, "no vBIOS found\n"); 4258 r = -EINVAL; 4259 goto failed; 4260 } 4261 DRM_INFO("GPU posting now...\n"); 4262 r = amdgpu_device_asic_init(adev); 4263 if (r) { 4264 dev_err(adev->dev, "gpu post error!\n"); 4265 goto failed; 4266 } 4267 } 4268 4269 if (adev->bios) { 4270 if (adev->is_atom_fw) { 4271 /* Initialize clocks */ 4272 r = amdgpu_atomfirmware_get_clock_info(adev); 4273 if (r) { 4274 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4275 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4276 goto failed; 4277 } 4278 } else { 4279 /* Initialize clocks */ 4280 r = amdgpu_atombios_get_clock_info(adev); 4281 if (r) { 4282 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4283 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4284 goto failed; 4285 } 4286 /* init i2c buses */ 4287 if (!amdgpu_device_has_dc_support(adev)) 4288 amdgpu_atombios_i2c_init(adev); 4289 } 4290 } 4291 4292 fence_driver_init: 4293 /* Fence driver */ 4294 r = amdgpu_fence_driver_sw_init(adev); 4295 if (r) { 4296 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4297 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4298 goto failed; 4299 } 4300 4301 /* init the mode config */ 4302 drm_mode_config_init(adev_to_drm(adev)); 4303 4304 r = amdgpu_device_ip_init(adev); 4305 if (r) { 4306 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4307 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4308 goto release_ras_con; 4309 } 4310 4311 amdgpu_fence_driver_hw_init(adev); 4312 4313 dev_info(adev->dev, 4314 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4315 adev->gfx.config.max_shader_engines, 4316 adev->gfx.config.max_sh_per_se, 4317 adev->gfx.config.max_cu_per_sh, 4318 adev->gfx.cu_info.number); 4319 4320 adev->accel_working = true; 4321 4322 amdgpu_vm_check_compute_bug(adev); 4323 4324 /* Initialize the buffer migration limit. 
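         * A non-negative amdgpu_moverate (the "moverate" module parameter, in
         * MB/s) overrides the conservative 8 MB/s default below; the chosen
         * rate is stored as a log2 so later code can divide by shifting.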
*/ 4325 if (amdgpu_moverate >= 0) 4326 max_MBps = amdgpu_moverate; 4327 else 4328 max_MBps = 8; /* Allow 8 MB/s. */ 4329 /* Get a log2 for easy divisions. */ 4330 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4331 4332 /* 4333 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4334 * Otherwise the mgpu fan boost feature will be skipped due to the 4335 * gpu instance is counted less. 4336 */ 4337 amdgpu_register_gpu_instance(adev); 4338 4339 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4340 * explicit gating rather than handling it automatically. 4341 */ 4342 if (!adev->gmc.xgmi.pending_reset) { 4343 r = amdgpu_device_ip_late_init(adev); 4344 if (r) { 4345 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4346 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4347 goto release_ras_con; 4348 } 4349 /* must succeed. */ 4350 amdgpu_ras_resume(adev); 4351 queue_delayed_work(system_wq, &adev->delayed_init_work, 4352 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4353 } 4354 4355 if (amdgpu_sriov_vf(adev)) { 4356 amdgpu_virt_release_full_gpu(adev, true); 4357 flush_delayed_work(&adev->delayed_init_work); 4358 } 4359 4360 /* 4361 * Place those sysfs registering after `late_init`. As some of those 4362 * operations performed in `late_init` might affect the sysfs 4363 * interfaces creating. 4364 */ 4365 r = amdgpu_atombios_sysfs_init(adev); 4366 if (r) 4367 drm_err(&adev->ddev, 4368 "registering atombios sysfs failed (%d).\n", r); 4369 4370 r = amdgpu_pm_sysfs_init(adev); 4371 if (r) 4372 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4373 4374 r = amdgpu_ucode_sysfs_init(adev); 4375 if (r) { 4376 adev->ucode_sysfs_en = false; 4377 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4378 } else 4379 adev->ucode_sysfs_en = true; 4380 4381 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4382 if (r) 4383 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4384 4385 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4386 if (r) 4387 dev_err(adev->dev, 4388 "Could not create amdgpu board attributes\n"); 4389 4390 amdgpu_fru_sysfs_init(adev); 4391 amdgpu_reg_state_sysfs_init(adev); 4392 4393 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4394 r = amdgpu_pmu_init(adev); 4395 if (r) 4396 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4397 4398 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4399 if (amdgpu_device_cache_pci_state(adev->pdev)) 4400 pci_restore_state(pdev); 4401 4402 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4403 /* this will fail for cards that aren't VGA class devices, just 4404 * ignore it 4405 */ 4406 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4407 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4408 4409 px = amdgpu_device_supports_px(ddev); 4410 4411 if (px || (!dev_is_removable(&adev->pdev->dev) && 4412 apple_gmux_detect(NULL, NULL))) 4413 vga_switcheroo_register_client(adev->pdev, 4414 &amdgpu_switcheroo_ops, px); 4415 4416 if (px) 4417 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4418 4419 if (adev->gmc.xgmi.pending_reset) 4420 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4421 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4422 4423 amdgpu_device_check_iommu_direct_map(adev); 4424 4425 return 0; 4426 4427 release_ras_con: 4428 if (amdgpu_sriov_vf(adev)) 4429 amdgpu_virt_release_full_gpu(adev, true); 4430 4431 /* failed in exclusive mode due to timeout */ 4432 if 
(amdgpu_sriov_vf(adev) && 4433 !amdgpu_sriov_runtime(adev) && 4434 amdgpu_virt_mmio_blocked(adev) && 4435 !amdgpu_virt_wait_reset(adev)) { 4436 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4437 /* Don't send request since VF is inactive. */ 4438 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4439 adev->virt.ops = NULL; 4440 r = -EAGAIN; 4441 } 4442 amdgpu_release_ras_context(adev); 4443 4444 failed: 4445 amdgpu_vf_error_trans_all(adev); 4446 4447 return r; 4448 } 4449 4450 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4451 { 4452 4453 /* Clear all CPU mappings pointing to this device */ 4454 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4455 4456 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4457 amdgpu_doorbell_fini(adev); 4458 4459 iounmap(adev->rmmio); 4460 adev->rmmio = NULL; 4461 if (adev->mman.aper_base_kaddr) 4462 iounmap(adev->mman.aper_base_kaddr); 4463 adev->mman.aper_base_kaddr = NULL; 4464 4465 /* Memory manager related */ 4466 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4467 arch_phys_wc_del(adev->gmc.vram_mtrr); 4468 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4469 } 4470 } 4471 4472 /** 4473 * amdgpu_device_fini_hw - tear down the driver 4474 * 4475 * @adev: amdgpu_device pointer 4476 * 4477 * Tear down the driver info (all asics). 4478 * Called at driver shutdown. 4479 */ 4480 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4481 { 4482 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4483 flush_delayed_work(&adev->delayed_init_work); 4484 adev->shutdown = true; 4485 4486 /* make sure IB test finished before entering exclusive mode 4487 * to avoid preemption on IB test 4488 */ 4489 if (amdgpu_sriov_vf(adev)) { 4490 amdgpu_virt_request_full_gpu(adev, false); 4491 amdgpu_virt_fini_data_exchange(adev); 4492 } 4493 4494 /* disable all interrupts */ 4495 amdgpu_irq_disable_all(adev); 4496 if (adev->mode_info.mode_config_initialized) { 4497 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4498 drm_helper_force_disable_all(adev_to_drm(adev)); 4499 else 4500 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4501 } 4502 amdgpu_fence_driver_hw_fini(adev); 4503 4504 if (adev->mman.initialized) 4505 drain_workqueue(adev->mman.bdev.wq); 4506 4507 if (adev->pm.sysfs_initialized) 4508 amdgpu_pm_sysfs_fini(adev); 4509 if (adev->ucode_sysfs_en) 4510 amdgpu_ucode_sysfs_fini(adev); 4511 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4512 amdgpu_fru_sysfs_fini(adev); 4513 4514 amdgpu_reg_state_sysfs_fini(adev); 4515 4516 /* disable ras feature must before hw fini */ 4517 amdgpu_ras_pre_fini(adev); 4518 4519 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4520 4521 amdgpu_device_ip_fini_early(adev); 4522 4523 amdgpu_irq_fini_hw(adev); 4524 4525 if (adev->mman.initialized) 4526 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4527 4528 amdgpu_gart_dummy_page_fini(adev); 4529 4530 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4531 amdgpu_device_unmap_mmio(adev); 4532 4533 } 4534 4535 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4536 { 4537 int idx; 4538 bool px; 4539 4540 amdgpu_fence_driver_sw_fini(adev); 4541 amdgpu_device_ip_fini(adev); 4542 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4543 adev->accel_working = false; 4544 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4545 4546 amdgpu_reset_fini(adev); 4547 4548 /* free i2c buses */ 4549 if (!amdgpu_device_has_dc_support(adev)) 4550 amdgpu_i2c_fini(adev); 4551 4552 if (amdgpu_emu_mode 
!= 1) 4553 amdgpu_atombios_fini(adev); 4554 4555 kfree(adev->bios); 4556 adev->bios = NULL; 4557 4558 kfree(adev->fru_info); 4559 adev->fru_info = NULL; 4560 4561 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4562 4563 if (px || (!dev_is_removable(&adev->pdev->dev) && 4564 apple_gmux_detect(NULL, NULL))) 4565 vga_switcheroo_unregister_client(adev->pdev); 4566 4567 if (px) 4568 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4569 4570 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4571 vga_client_unregister(adev->pdev); 4572 4573 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4574 4575 iounmap(adev->rmmio); 4576 adev->rmmio = NULL; 4577 amdgpu_doorbell_fini(adev); 4578 drm_dev_exit(idx); 4579 } 4580 4581 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4582 amdgpu_pmu_fini(adev); 4583 if (adev->mman.discovery_bin) 4584 amdgpu_discovery_fini(adev); 4585 4586 amdgpu_reset_put_reset_domain(adev->reset_domain); 4587 adev->reset_domain = NULL; 4588 4589 kfree(adev->pci_state); 4590 4591 } 4592 4593 /** 4594 * amdgpu_device_evict_resources - evict device resources 4595 * @adev: amdgpu device object 4596 * 4597 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4598 * of the vram memory type. Mainly used for evicting device resources 4599 * at suspend time. 4600 * 4601 */ 4602 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4603 { 4604 int ret; 4605 4606 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4607 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4608 return 0; 4609 4610 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4611 if (ret) 4612 DRM_WARN("evicting device resources failed\n"); 4613 return ret; 4614 } 4615 4616 /* 4617 * Suspend & resume. 4618 */ 4619 /** 4620 * amdgpu_device_prepare - prepare for device suspend 4621 * 4622 * @dev: drm dev pointer 4623 * 4624 * Prepare to put the hw in the suspend state (all asics). 4625 * Returns 0 for success or an error on failure. 4626 * Called at driver suspend. 4627 */ 4628 int amdgpu_device_prepare(struct drm_device *dev) 4629 { 4630 struct amdgpu_device *adev = drm_to_adev(dev); 4631 int i, r; 4632 4633 amdgpu_choose_low_power_state(adev); 4634 4635 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4636 return 0; 4637 4638 /* Evict the majority of BOs before starting suspend sequence */ 4639 r = amdgpu_device_evict_resources(adev); 4640 if (r) 4641 goto unprepare; 4642 4643 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4644 4645 for (i = 0; i < adev->num_ip_blocks; i++) { 4646 if (!adev->ip_blocks[i].status.valid) 4647 continue; 4648 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4649 continue; 4650 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev); 4651 if (r) 4652 goto unprepare; 4653 } 4654 4655 return 0; 4656 4657 unprepare: 4658 adev->in_s0ix = adev->in_s3 = false; 4659 4660 return r; 4661 } 4662 4663 /** 4664 * amdgpu_device_suspend - initiate device suspend 4665 * 4666 * @dev: drm dev pointer 4667 * @fbcon : notify the fbdev of suspend 4668 * 4669 * Puts the hw in the suspend state (all asics). 4670 * Returns 0 for success or an error on failure. 4671 * Called at driver suspend. 
4672 */ 4673 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4674 { 4675 struct amdgpu_device *adev = drm_to_adev(dev); 4676 int r = 0; 4677 4678 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4679 return 0; 4680 4681 adev->in_suspend = true; 4682 4683 if (amdgpu_sriov_vf(adev)) { 4684 amdgpu_virt_fini_data_exchange(adev); 4685 r = amdgpu_virt_request_full_gpu(adev, false); 4686 if (r) 4687 return r; 4688 } 4689 4690 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4691 DRM_WARN("smart shift update failed\n"); 4692 4693 if (fbcon) 4694 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4695 4696 cancel_delayed_work_sync(&adev->delayed_init_work); 4697 4698 amdgpu_ras_suspend(adev); 4699 4700 amdgpu_device_ip_suspend_phase1(adev); 4701 4702 if (!adev->in_s0ix) 4703 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4704 4705 r = amdgpu_device_evict_resources(adev); 4706 if (r) 4707 return r; 4708 4709 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4710 4711 amdgpu_fence_driver_hw_fini(adev); 4712 4713 amdgpu_device_ip_suspend_phase2(adev); 4714 4715 if (amdgpu_sriov_vf(adev)) 4716 amdgpu_virt_release_full_gpu(adev, false); 4717 4718 r = amdgpu_dpm_notify_rlc_state(adev, false); 4719 if (r) 4720 return r; 4721 4722 return 0; 4723 } 4724 4725 /** 4726 * amdgpu_device_resume - initiate device resume 4727 * 4728 * @dev: drm dev pointer 4729 * @fbcon : notify the fbdev of resume 4730 * 4731 * Bring the hw back to operating state (all asics). 4732 * Returns 0 for success or an error on failure. 4733 * Called at driver resume. 4734 */ 4735 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4736 { 4737 struct amdgpu_device *adev = drm_to_adev(dev); 4738 int r = 0; 4739 4740 if (amdgpu_sriov_vf(adev)) { 4741 r = amdgpu_virt_request_full_gpu(adev, true); 4742 if (r) 4743 return r; 4744 } 4745 4746 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4747 return 0; 4748 4749 if (adev->in_s0ix) 4750 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4751 4752 /* post card */ 4753 if (amdgpu_device_need_post(adev)) { 4754 r = amdgpu_device_asic_init(adev); 4755 if (r) 4756 dev_err(adev->dev, "amdgpu asic init failed\n"); 4757 } 4758 4759 r = amdgpu_device_ip_resume(adev); 4760 4761 if (r) { 4762 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4763 goto exit; 4764 } 4765 amdgpu_fence_driver_hw_init(adev); 4766 4767 if (!adev->in_s0ix) { 4768 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4769 if (r) 4770 goto exit; 4771 } 4772 4773 r = amdgpu_device_ip_late_init(adev); 4774 if (r) 4775 goto exit; 4776 4777 queue_delayed_work(system_wq, &adev->delayed_init_work, 4778 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4779 exit: 4780 if (amdgpu_sriov_vf(adev)) { 4781 amdgpu_virt_init_data_exchange(adev); 4782 amdgpu_virt_release_full_gpu(adev, true); 4783 } 4784 4785 if (r) 4786 return r; 4787 4788 /* Make sure IB tests flushed */ 4789 flush_delayed_work(&adev->delayed_init_work); 4790 4791 if (fbcon) 4792 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4793 4794 amdgpu_ras_resume(adev); 4795 4796 if (adev->mode_info.num_crtc) { 4797 /* 4798 * Most of the connector probing functions try to acquire runtime pm 4799 * refs to ensure that the GPU is powered on when connector polling is 4800 * performed. Since we're calling this from a runtime PM callback, 4801 * trying to acquire rpm refs will cause us to deadlock. 
4802 * 4803 * Since we're guaranteed to be holding the rpm lock, it's safe to 4804 * temporarily disable the rpm helpers so this doesn't deadlock us. 4805 */ 4806 #ifdef CONFIG_PM 4807 dev->dev->power.disable_depth++; 4808 #endif 4809 if (!adev->dc_enabled) 4810 drm_helper_hpd_irq_event(dev); 4811 else 4812 drm_kms_helper_hotplug_event(dev); 4813 #ifdef CONFIG_PM 4814 dev->dev->power.disable_depth--; 4815 #endif 4816 } 4817 adev->in_suspend = false; 4818 4819 if (adev->enable_mes) 4820 amdgpu_mes_self_test(adev); 4821 4822 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4823 DRM_WARN("smart shift update failed\n"); 4824 4825 return 0; 4826 } 4827 4828 /** 4829 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4830 * 4831 * @adev: amdgpu_device pointer 4832 * 4833 * The list of all the hardware IPs that make up the asic is walked and 4834 * the check_soft_reset callbacks are run. check_soft_reset determines 4835 * if the asic is still hung or not. 4836 * Returns true if any of the IPs are still in a hung state, false if not. 4837 */ 4838 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4839 { 4840 int i; 4841 bool asic_hang = false; 4842 4843 if (amdgpu_sriov_vf(adev)) 4844 return true; 4845 4846 if (amdgpu_asic_need_full_reset(adev)) 4847 return true; 4848 4849 for (i = 0; i < adev->num_ip_blocks; i++) { 4850 if (!adev->ip_blocks[i].status.valid) 4851 continue; 4852 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4853 adev->ip_blocks[i].status.hang = 4854 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4855 if (adev->ip_blocks[i].status.hang) { 4856 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4857 asic_hang = true; 4858 } 4859 } 4860 return asic_hang; 4861 } 4862 4863 /** 4864 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4865 * 4866 * @adev: amdgpu_device pointer 4867 * 4868 * The list of all the hardware IPs that make up the asic is walked and the 4869 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4870 * handles any IP specific hardware or software state changes that are 4871 * necessary for a soft reset to succeed. 4872 * Returns 0 on success, negative error code on failure. 4873 */ 4874 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4875 { 4876 int i, r = 0; 4877 4878 for (i = 0; i < adev->num_ip_blocks; i++) { 4879 if (!adev->ip_blocks[i].status.valid) 4880 continue; 4881 if (adev->ip_blocks[i].status.hang && 4882 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4883 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4884 if (r) 4885 return r; 4886 } 4887 } 4888 4889 return 0; 4890 } 4891 4892 /** 4893 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4894 * 4895 * @adev: amdgpu_device pointer 4896 * 4897 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4898 * reset is necessary to recover. 4899 * Returns true if a full asic reset is required, false if not. 
4900 */ 4901 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4902 { 4903 int i; 4904 4905 if (amdgpu_asic_need_full_reset(adev)) 4906 return true; 4907 4908 for (i = 0; i < adev->num_ip_blocks; i++) { 4909 if (!adev->ip_blocks[i].status.valid) 4910 continue; 4911 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4912 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4913 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4914 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4915 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4916 if (adev->ip_blocks[i].status.hang) { 4917 dev_info(adev->dev, "Some block need full reset!\n"); 4918 return true; 4919 } 4920 } 4921 } 4922 return false; 4923 } 4924 4925 /** 4926 * amdgpu_device_ip_soft_reset - do a soft reset 4927 * 4928 * @adev: amdgpu_device pointer 4929 * 4930 * The list of all the hardware IPs that make up the asic is walked and the 4931 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4932 * IP specific hardware or software state changes that are necessary to soft 4933 * reset the IP. 4934 * Returns 0 on success, negative error code on failure. 4935 */ 4936 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4937 { 4938 int i, r = 0; 4939 4940 for (i = 0; i < adev->num_ip_blocks; i++) { 4941 if (!adev->ip_blocks[i].status.valid) 4942 continue; 4943 if (adev->ip_blocks[i].status.hang && 4944 adev->ip_blocks[i].version->funcs->soft_reset) { 4945 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4946 if (r) 4947 return r; 4948 } 4949 } 4950 4951 return 0; 4952 } 4953 4954 /** 4955 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4956 * 4957 * @adev: amdgpu_device pointer 4958 * 4959 * The list of all the hardware IPs that make up the asic is walked and the 4960 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4961 * handles any IP specific hardware or software state changes that are 4962 * necessary after the IP has been soft reset. 4963 * Returns 0 on success, negative error code on failure. 4964 */ 4965 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4966 { 4967 int i, r = 0; 4968 4969 for (i = 0; i < adev->num_ip_blocks; i++) { 4970 if (!adev->ip_blocks[i].status.valid) 4971 continue; 4972 if (adev->ip_blocks[i].status.hang && 4973 adev->ip_blocks[i].version->funcs->post_soft_reset) 4974 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4975 if (r) 4976 return r; 4977 } 4978 4979 return 0; 4980 } 4981 4982 /** 4983 * amdgpu_device_recover_vram - Recover some VRAM contents 4984 * 4985 * @adev: amdgpu_device pointer 4986 * 4987 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4988 * restore things like GPUVM page tables after a GPU reset where 4989 * the contents of VRAM might be lost. 4990 * 4991 * Returns: 4992 * 0 on success, negative error code on failure. 
4993 */ 4994 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4995 { 4996 struct dma_fence *fence = NULL, *next = NULL; 4997 struct amdgpu_bo *shadow; 4998 struct amdgpu_bo_vm *vmbo; 4999 long r = 1, tmo; 5000 5001 if (amdgpu_sriov_runtime(adev)) 5002 tmo = msecs_to_jiffies(8000); 5003 else 5004 tmo = msecs_to_jiffies(100); 5005 5006 dev_info(adev->dev, "recover vram bo from shadow start\n"); 5007 mutex_lock(&adev->shadow_list_lock); 5008 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 5009 /* If vm is compute context or adev is APU, shadow will be NULL */ 5010 if (!vmbo->shadow) 5011 continue; 5012 shadow = vmbo->shadow; 5013 5014 /* No need to recover an evicted BO */ 5015 if (!shadow->tbo.resource || 5016 shadow->tbo.resource->mem_type != TTM_PL_TT || 5017 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 5018 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 5019 continue; 5020 5021 r = amdgpu_bo_restore_shadow(shadow, &next); 5022 if (r) 5023 break; 5024 5025 if (fence) { 5026 tmo = dma_fence_wait_timeout(fence, false, tmo); 5027 dma_fence_put(fence); 5028 fence = next; 5029 if (tmo == 0) { 5030 r = -ETIMEDOUT; 5031 break; 5032 } else if (tmo < 0) { 5033 r = tmo; 5034 break; 5035 } 5036 } else { 5037 fence = next; 5038 } 5039 } 5040 mutex_unlock(&adev->shadow_list_lock); 5041 5042 if (fence) 5043 tmo = dma_fence_wait_timeout(fence, false, tmo); 5044 dma_fence_put(fence); 5045 5046 if (r < 0 || tmo <= 0) { 5047 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 5048 return -EIO; 5049 } 5050 5051 dev_info(adev->dev, "recover vram bo from shadow done\n"); 5052 return 0; 5053 } 5054 5055 5056 /** 5057 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5058 * 5059 * @adev: amdgpu_device pointer 5060 * @reset_context: amdgpu reset context pointer 5061 * 5062 * do VF FLR and reinitialize Asic 5063 * return 0 means succeeded otherwise failed 5064 */ 5065 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5066 struct amdgpu_reset_context *reset_context) 5067 { 5068 int r; 5069 struct amdgpu_hive_info *hive = NULL; 5070 5071 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5072 if (!amdgpu_ras_get_fed_status(adev)) 5073 amdgpu_virt_ready_to_reset(adev); 5074 amdgpu_virt_wait_reset(adev); 5075 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5076 r = amdgpu_virt_request_full_gpu(adev, true); 5077 } else { 5078 r = amdgpu_virt_reset_gpu(adev); 5079 } 5080 if (r) 5081 return r; 5082 5083 amdgpu_ras_set_fed(adev, false); 5084 amdgpu_irq_gpu_reset_resume_helper(adev); 5085 5086 /* some sw clean up VF needs to do before recover */ 5087 amdgpu_virt_post_reset(adev); 5088 5089 /* Resume IP prior to SMC */ 5090 r = amdgpu_device_ip_reinit_early_sriov(adev); 5091 if (r) 5092 return r; 5093 5094 amdgpu_virt_init_data_exchange(adev); 5095 5096 r = amdgpu_device_fw_loading(adev); 5097 if (r) 5098 return r; 5099 5100 /* now we are okay to resume SMC/CP/SDMA */ 5101 r = amdgpu_device_ip_reinit_late_sriov(adev); 5102 if (r) 5103 return r; 5104 5105 hive = amdgpu_get_xgmi_hive(adev); 5106 /* Update PSP FW topology after reset */ 5107 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5108 r = amdgpu_xgmi_update_topology(hive, adev); 5109 if (hive) 5110 amdgpu_put_xgmi_hive(hive); 5111 if (r) 5112 return r; 5113 5114 r = amdgpu_ib_ring_tests(adev); 5115 if (r) 5116 return r; 5117 5118 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 5119 amdgpu_inc_vram_lost(adev); 5120 r = 
amdgpu_device_recover_vram(adev); 5121 } 5122 if (r) 5123 return r; 5124 5125 /* need to be called during full access so we can't do it later like 5126 * bare-metal does. 5127 */ 5128 amdgpu_amdkfd_post_reset(adev); 5129 amdgpu_virt_release_full_gpu(adev, true); 5130 5131 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5132 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5133 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5134 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5135 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5136 amdgpu_ras_resume(adev); 5137 return 0; 5138 } 5139 5140 /** 5141 * amdgpu_device_has_job_running - check if there is any job in mirror list 5142 * 5143 * @adev: amdgpu_device pointer 5144 * 5145 * check if there is any job in mirror list 5146 */ 5147 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5148 { 5149 int i; 5150 struct drm_sched_job *job; 5151 5152 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5153 struct amdgpu_ring *ring = adev->rings[i]; 5154 5155 if (!amdgpu_ring_sched_ready(ring)) 5156 continue; 5157 5158 spin_lock(&ring->sched.job_list_lock); 5159 job = list_first_entry_or_null(&ring->sched.pending_list, 5160 struct drm_sched_job, list); 5161 spin_unlock(&ring->sched.job_list_lock); 5162 if (job) 5163 return true; 5164 } 5165 return false; 5166 } 5167 5168 /** 5169 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5170 * 5171 * @adev: amdgpu_device pointer 5172 * 5173 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5174 * a hung GPU. 5175 */ 5176 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5177 { 5178 5179 if (amdgpu_gpu_recovery == 0) 5180 goto disabled; 5181 5182 /* Skip soft reset check in fatal error mode */ 5183 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5184 return true; 5185 5186 if (amdgpu_sriov_vf(adev)) 5187 return true; 5188 5189 if (amdgpu_gpu_recovery == -1) { 5190 switch (adev->asic_type) { 5191 #ifdef CONFIG_DRM_AMDGPU_SI 5192 case CHIP_VERDE: 5193 case CHIP_TAHITI: 5194 case CHIP_PITCAIRN: 5195 case CHIP_OLAND: 5196 case CHIP_HAINAN: 5197 #endif 5198 #ifdef CONFIG_DRM_AMDGPU_CIK 5199 case CHIP_KAVERI: 5200 case CHIP_KABINI: 5201 case CHIP_MULLINS: 5202 #endif 5203 case CHIP_CARRIZO: 5204 case CHIP_STONEY: 5205 case CHIP_CYAN_SKILLFISH: 5206 goto disabled; 5207 default: 5208 break; 5209 } 5210 } 5211 5212 return true; 5213 5214 disabled: 5215 dev_info(adev->dev, "GPU recovery disabled.\n"); 5216 return false; 5217 } 5218 5219 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5220 { 5221 u32 i; 5222 int ret = 0; 5223 5224 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5225 5226 dev_info(adev->dev, "GPU mode1 reset\n"); 5227 5228 /* Cache the state before bus master disable. The saved config space 5229 * values are used in other cases like restore after mode-2 reset. 
5230 */ 5231 amdgpu_device_cache_pci_state(adev->pdev); 5232 5233 /* disable BM */ 5234 pci_clear_master(adev->pdev); 5235 5236 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5237 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5238 ret = amdgpu_dpm_mode1_reset(adev); 5239 } else { 5240 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5241 ret = psp_gpu_reset(adev); 5242 } 5243 5244 if (ret) 5245 goto mode1_reset_failed; 5246 5247 amdgpu_device_load_pci_state(adev->pdev); 5248 ret = amdgpu_psp_wait_for_bootloader(adev); 5249 if (ret) 5250 goto mode1_reset_failed; 5251 5252 /* wait for asic to come out of reset */ 5253 for (i = 0; i < adev->usec_timeout; i++) { 5254 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5255 5256 if (memsize != 0xffffffff) 5257 break; 5258 udelay(1); 5259 } 5260 5261 if (i >= adev->usec_timeout) { 5262 ret = -ETIMEDOUT; 5263 goto mode1_reset_failed; 5264 } 5265 5266 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5267 5268 return 0; 5269 5270 mode1_reset_failed: 5271 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5272 return ret; 5273 } 5274 5275 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5276 struct amdgpu_reset_context *reset_context) 5277 { 5278 int i, r = 0; 5279 struct amdgpu_job *job = NULL; 5280 bool need_full_reset = 5281 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5282 5283 if (reset_context->reset_req_dev == adev) 5284 job = reset_context->job; 5285 5286 if (amdgpu_sriov_vf(adev)) { 5287 /* stop the data exchange thread */ 5288 amdgpu_virt_fini_data_exchange(adev); 5289 } 5290 5291 amdgpu_fence_driver_isr_toggle(adev, true); 5292 5293 /* block all schedulers and reset given job's ring */ 5294 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5295 struct amdgpu_ring *ring = adev->rings[i]; 5296 5297 if (!amdgpu_ring_sched_ready(ring)) 5298 continue; 5299 5300 /* Clear job fence from fence drv to avoid force_completion 5301 * leave NULL and vm flush fence in fence drv 5302 */ 5303 amdgpu_fence_driver_clear_job_fences(ring); 5304 5305 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5306 amdgpu_fence_driver_force_completion(ring); 5307 } 5308 5309 amdgpu_fence_driver_isr_toggle(adev, false); 5310 5311 if (job && job->vm) 5312 drm_sched_increase_karma(&job->base); 5313 5314 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5315 /* If reset handler not implemented, continue; otherwise return */ 5316 if (r == -EOPNOTSUPP) 5317 r = 0; 5318 else 5319 return r; 5320 5321 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5322 if (!amdgpu_sriov_vf(adev)) { 5323 5324 if (!need_full_reset) 5325 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5326 5327 if (!need_full_reset && amdgpu_gpu_recovery && 5328 amdgpu_device_ip_check_soft_reset(adev)) { 5329 amdgpu_device_ip_pre_soft_reset(adev); 5330 r = amdgpu_device_ip_soft_reset(adev); 5331 amdgpu_device_ip_post_soft_reset(adev); 5332 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5333 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5334 need_full_reset = true; 5335 } 5336 } 5337 5338 if (need_full_reset) 5339 r = amdgpu_device_ip_suspend(adev); 5340 if (need_full_reset) 5341 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5342 else 5343 clear_bit(AMDGPU_NEED_FULL_RESET, 5344 &reset_context->flags); 5345 } 5346 5347 return r; 5348 } 5349 5350 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 5351 { 5352 int i; 5353 5354 lockdep_assert_held(&adev->reset_domain->sem); 5355 
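        /*
         * Snapshot each register selected for reset-time dumping
         * (adev->reset_info.reset_dump_reg_list) and feed every value through
         * the amdgpu_reset_reg_dumps tracepoint so it is preserved for
         * post-mortem analysis of the hang.
         */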
5356 for (i = 0; i < adev->reset_info.num_regs; i++) { 5357 adev->reset_info.reset_dump_reg_value[i] = 5358 RREG32(adev->reset_info.reset_dump_reg_list[i]); 5359 5360 trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i], 5361 adev->reset_info.reset_dump_reg_value[i]); 5362 } 5363 5364 return 0; 5365 } 5366 5367 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5368 struct amdgpu_reset_context *reset_context) 5369 { 5370 struct amdgpu_device *tmp_adev = NULL; 5371 bool need_full_reset, skip_hw_reset, vram_lost = false; 5372 int r = 0; 5373 uint32_t i; 5374 5375 /* Try reset handler method first */ 5376 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5377 reset_list); 5378 5379 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5380 amdgpu_reset_reg_dumps(tmp_adev); 5381 5382 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5383 /* Trigger ip dump before we reset the asic */ 5384 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5385 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5386 tmp_adev->ip_blocks[i].version->funcs 5387 ->dump_ip_state((void *)tmp_adev); 5388 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5389 } 5390 5391 reset_context->reset_device_list = device_list_handle; 5392 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5393 /* If reset handler not implemented, continue; otherwise return */ 5394 if (r == -EOPNOTSUPP) 5395 r = 0; 5396 else 5397 return r; 5398 5399 /* Reset handler not implemented, use the default method */ 5400 need_full_reset = 5401 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5402 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5403 5404 /* 5405 * ASIC reset has to be done on all XGMI hive nodes ASAP 5406 * to allow proper links negotiation in FW (within 1 sec) 5407 */ 5408 if (!skip_hw_reset && need_full_reset) { 5409 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5410 /* For XGMI run all resets in parallel to speed up the process */ 5411 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5412 tmp_adev->gmc.xgmi.pending_reset = false; 5413 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5414 r = -EALREADY; 5415 } else 5416 r = amdgpu_asic_reset(tmp_adev); 5417 5418 if (r) { 5419 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5420 r, adev_to_drm(tmp_adev)->unique); 5421 goto out; 5422 } 5423 } 5424 5425 /* For XGMI wait for all resets to complete before proceed */ 5426 if (!r) { 5427 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5428 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5429 flush_work(&tmp_adev->xgmi_reset_work); 5430 r = tmp_adev->asic_reset_res; 5431 if (r) 5432 break; 5433 } 5434 } 5435 } 5436 } 5437 5438 if (!r && amdgpu_ras_intr_triggered()) { 5439 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5440 amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB); 5441 } 5442 5443 amdgpu_ras_intr_cleared(); 5444 } 5445 5446 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5447 if (need_full_reset) { 5448 /* post card */ 5449 amdgpu_ras_set_fed(tmp_adev, false); 5450 r = amdgpu_device_asic_init(tmp_adev); 5451 if (r) { 5452 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5453 } else { 5454 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5455 5456 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5457 if (r) 5458 goto out; 5459 5460 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5461 5462 if 
(!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5463 amdgpu_coredump(tmp_adev, vram_lost, reset_context); 5464 5465 if (vram_lost) { 5466 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5467 amdgpu_inc_vram_lost(tmp_adev); 5468 } 5469 5470 r = amdgpu_device_fw_loading(tmp_adev); 5471 if (r) 5472 return r; 5473 5474 r = amdgpu_xcp_restore_partition_mode( 5475 tmp_adev->xcp_mgr); 5476 if (r) 5477 goto out; 5478 5479 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5480 if (r) 5481 goto out; 5482 5483 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5484 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5485 5486 if (vram_lost) 5487 amdgpu_device_fill_reset_magic(tmp_adev); 5488 5489 /* 5490 * Add this ASIC as tracked as reset was already 5491 * complete successfully. 5492 */ 5493 amdgpu_register_gpu_instance(tmp_adev); 5494 5495 if (!reset_context->hive && 5496 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5497 amdgpu_xgmi_add_device(tmp_adev); 5498 5499 r = amdgpu_device_ip_late_init(tmp_adev); 5500 if (r) 5501 goto out; 5502 5503 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5504 5505 /* 5506 * The GPU enters bad state once faulty pages 5507 * by ECC has reached the threshold, and ras 5508 * recovery is scheduled next. So add one check 5509 * here to break recovery if it indeed exceeds 5510 * bad page threshold, and remind user to 5511 * retire this GPU or setting one bigger 5512 * bad_page_threshold value to fix this once 5513 * probing driver again. 5514 */ 5515 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5516 /* must succeed. */ 5517 amdgpu_ras_resume(tmp_adev); 5518 } else { 5519 r = -EINVAL; 5520 goto out; 5521 } 5522 5523 /* Update PSP FW topology after reset */ 5524 if (reset_context->hive && 5525 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5526 r = amdgpu_xgmi_update_topology( 5527 reset_context->hive, tmp_adev); 5528 } 5529 } 5530 5531 out: 5532 if (!r) { 5533 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5534 r = amdgpu_ib_ring_tests(tmp_adev); 5535 if (r) { 5536 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5537 need_full_reset = true; 5538 r = -EAGAIN; 5539 goto end; 5540 } 5541 } 5542 5543 if (!r) 5544 r = amdgpu_device_recover_vram(tmp_adev); 5545 else 5546 tmp_adev->asic_reset_res = r; 5547 } 5548 5549 end: 5550 if (need_full_reset) 5551 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5552 else 5553 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5554 return r; 5555 } 5556 5557 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5558 { 5559 5560 switch (amdgpu_asic_reset_method(adev)) { 5561 case AMD_RESET_METHOD_MODE1: 5562 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5563 break; 5564 case AMD_RESET_METHOD_MODE2: 5565 adev->mp1_state = PP_MP1_STATE_RESET; 5566 break; 5567 default: 5568 adev->mp1_state = PP_MP1_STATE_NONE; 5569 break; 5570 } 5571 } 5572 5573 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5574 { 5575 amdgpu_vf_error_trans_all(adev); 5576 adev->mp1_state = PP_MP1_STATE_NONE; 5577 } 5578 5579 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5580 { 5581 struct pci_dev *p = NULL; 5582 5583 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5584 adev->pdev->bus->number, 1); 5585 if (p) { 5586 pm_runtime_enable(&(p->dev)); 5587 pm_runtime_resume(&(p->dev)); 5588 } 5589 5590 pci_dev_put(p); 5591 } 5592 5593 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5594 { 5595 enum amd_reset_method 
reset_method; 5596 struct pci_dev *p = NULL; 5597 u64 expires; 5598 5599 /* 5600 * For now, only BACO and mode1 reset are confirmed 5601 * to suffer from the audio issue when the audio device is not properly suspended. 5602 */ 5603 reset_method = amdgpu_asic_reset_method(adev); 5604 if ((reset_method != AMD_RESET_METHOD_BACO) && 5605 (reset_method != AMD_RESET_METHOD_MODE1)) 5606 return -EINVAL; 5607 5608 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5609 adev->pdev->bus->number, 1); 5610 if (!p) 5611 return -ENODEV; 5612 5613 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5614 if (!expires) 5615 /* 5616 * If we cannot get the audio device autosuspend delay, 5617 * a fixed 4S interval will be used. Since 3S is 5618 * the audio controller's default autosuspend delay setting, 5619 * the 4S used here is guaranteed to cover it. 5620 */ 5621 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5622 5623 while (!pm_runtime_status_suspended(&(p->dev))) { 5624 if (!pm_runtime_suspend(&(p->dev))) 5625 break; 5626 5627 if (expires < ktime_get_mono_fast_ns()) { 5628 dev_warn(adev->dev, "failed to suspend display audio\n"); 5629 pci_dev_put(p); 5630 /* TODO: abort the succeeding gpu reset? */ 5631 return -ETIMEDOUT; 5632 } 5633 } 5634 5635 pm_runtime_disable(&(p->dev)); 5636 5637 pci_dev_put(p); 5638 return 0; 5639 } 5640 5641 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5642 { 5643 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5644 5645 #if defined(CONFIG_DEBUG_FS) 5646 if (!amdgpu_sriov_vf(adev)) 5647 cancel_work(&adev->reset_work); 5648 #endif 5649 5650 if (adev->kfd.dev) 5651 cancel_work(&adev->kfd.reset_work); 5652 5653 if (amdgpu_sriov_vf(adev)) 5654 cancel_work(&adev->virt.flr_work); 5655 5656 if (con && adev->ras_enabled) 5657 cancel_work(&con->recovery_work); 5658 5659 } 5660 5661 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5662 { 5663 struct amdgpu_device *tmp_adev; 5664 int ret = 0; 5665 u32 status; 5666 5667 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5668 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 5669 if (PCI_POSSIBLE_ERROR(status)) { 5670 dev_err(tmp_adev->dev, "device lost from bus!"); 5671 ret = -ENODEV; 5672 } 5673 } 5674 5675 return ret; 5676 } 5677 5678 /** 5679 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5680 * 5681 * @adev: amdgpu_device pointer 5682 * @job: the job which triggered the hang 5683 * @reset_context: amdgpu reset context pointer 5684 * 5685 * Attempt to reset the GPU if it has hung (all asics). 5686 * Attempt to do a soft reset or full reset and reinitialize the ASIC. 5687 * Returns 0 for success or an error on failure. 5688 */ 5689 5690 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5691 struct amdgpu_job *job, 5692 struct amdgpu_reset_context *reset_context) 5693 { 5694 struct list_head device_list, *device_list_handle = NULL; 5695 bool job_signaled = false; 5696 struct amdgpu_hive_info *hive = NULL; 5697 struct amdgpu_device *tmp_adev = NULL; 5698 int i, r = 0; 5699 bool need_emergency_restart = false; 5700 bool audio_suspended = false; 5701 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 5702 5703 /* 5704 * Special case: RAS triggered and full reset isn't supported 5705 */ 5706 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5707 5708 /* 5709 * Flush RAM to disk so that after reboot 5710 * the user can read the log and see why the system rebooted. 5711 */ 5712 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5713 amdgpu_ras_get_context(adev)->reboot) { 5714 DRM_WARN("Emergency reboot."); 5715 5716 ksys_sync_helper(); 5717 emergency_restart(); 5718 } 5719 5720 dev_info(adev->dev, "GPU %s begin!\n", 5721 need_emergency_restart ? "jobs stop":"reset"); 5722 5723 if (!amdgpu_sriov_vf(adev)) 5724 hive = amdgpu_get_xgmi_hive(adev); 5725 if (hive) 5726 mutex_lock(&hive->hive_lock); 5727 5728 reset_context->job = job; 5729 reset_context->hive = hive; 5730 /* 5731 * Build list of devices to reset. 5732 * In case we are in XGMI hive mode, resort the device list 5733 * to put adev in the 1st position. 5734 */ 5735 INIT_LIST_HEAD(&device_list); 5736 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 5737 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5738 list_add_tail(&tmp_adev->reset_list, &device_list); 5739 if (adev->shutdown) 5740 tmp_adev->shutdown = true; 5741 } 5742 if (!list_is_first(&adev->reset_list, &device_list)) 5743 list_rotate_to_front(&adev->reset_list, &device_list); 5744 device_list_handle = &device_list; 5745 } else { 5746 list_add_tail(&adev->reset_list, &device_list); 5747 device_list_handle = &device_list; 5748 } 5749 5750 if (!amdgpu_sriov_vf(adev)) { 5751 r = amdgpu_device_health_check(device_list_handle); 5752 if (r) 5753 goto end_reset; 5754 } 5755 5756 /* We need to lock reset domain only once both for XGMI and single device */ 5757 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5758 reset_list); 5759 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5760 5761 /* block all schedulers and reset given job's ring */ 5762 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5763 5764 amdgpu_device_set_mp1_state(tmp_adev); 5765 5766 /* 5767 * Try to put the audio codec into the suspend state 5768 * before the gpu reset starts. 5769 * 5770 * The power domain of the graphics device is shared 5771 * with the AZ (audio) power domain. Without this, 5772 * we may change the audio hardware from behind 5773 * the audio driver's back, which will trigger 5774 * audio codec errors. 5775 */ 5776 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5777 audio_suspended = true; 5778 5779 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5780 5781 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5782 5783 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 5784 5785 /* 5786 * Mark the ASICs to be reset as untracked first, 5787 * and add them back after the reset completes 5788 */ 5789 amdgpu_unregister_gpu_instance(tmp_adev); 5790 5791 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5792 5793 /* disable ras on ALL IPs */ 5794 if (!need_emergency_restart && 5795 amdgpu_device_ip_need_full_reset(tmp_adev)) 5796 amdgpu_ras_suspend(tmp_adev); 5797 5798 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5799 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5800 5801 if (!amdgpu_ring_sched_ready(ring)) 5802 continue; 5803 5804 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5805 5806 if (need_emergency_restart) 5807 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5808 } 5809 atomic_inc(&tmp_adev->gpu_reset_counter); 5810 } 5811 5812 if (need_emergency_restart) 5813 goto skip_sched_resume; 5814 5815 /* 5816 * Must check guilty signal here since after this point all old 5817 * HW fences are force signaled.
5818 * 5819 * job->base holds a reference to parent fence 5820 */ 5821 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5822 job_signaled = true; 5823 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5824 goto skip_hw_reset; 5825 } 5826 5827 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5828 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5829 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5830 /*TODO Should we stop ?*/ 5831 if (r) { 5832 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5833 r, adev_to_drm(tmp_adev)->unique); 5834 tmp_adev->asic_reset_res = r; 5835 } 5836 } 5837 5838 /* Actual ASIC resets if needed.*/ 5839 /* Host driver will handle XGMI hive reset for SRIOV */ 5840 if (amdgpu_sriov_vf(adev)) { 5841 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 5842 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 5843 amdgpu_ras_set_fed(adev, true); 5844 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5845 } 5846 5847 r = amdgpu_device_reset_sriov(adev, reset_context); 5848 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 5849 amdgpu_virt_release_full_gpu(adev, true); 5850 goto retry; 5851 } 5852 if (r) 5853 adev->asic_reset_res = r; 5854 } else { 5855 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5856 if (r && r == -EAGAIN) 5857 goto retry; 5858 } 5859 5860 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5861 /* 5862 * Drop any pending non scheduler resets queued before reset is done. 5863 * Any reset scheduled after this point would be valid. Scheduler resets 5864 * were already dropped during drm_sched_stop and no new ones can come 5865 * in before drm_sched_start. 5866 */ 5867 amdgpu_device_stop_pending_resets(tmp_adev); 5868 } 5869 5870 skip_hw_reset: 5871 5872 /* Post ASIC reset for all devs .*/ 5873 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5874 5875 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5876 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5877 5878 if (!amdgpu_ring_sched_ready(ring)) 5879 continue; 5880 5881 drm_sched_start(&ring->sched, true); 5882 } 5883 5884 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5885 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5886 5887 if (tmp_adev->asic_reset_res) 5888 r = tmp_adev->asic_reset_res; 5889 5890 tmp_adev->asic_reset_res = 0; 5891 5892 if (r) { 5893 /* bad news, how to tell it to userspace ? 
*/ 5894 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5895 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5896 } else { 5897 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5898 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5899 DRM_WARN("smart shift update failed\n"); 5900 } 5901 } 5902 5903 skip_sched_resume: 5904 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5905 /* unlock kfd: SRIOV would do it separately */ 5906 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5907 amdgpu_amdkfd_post_reset(tmp_adev); 5908 5909 /* kfd_post_reset will do nothing if the kfd device is not initialized, 5910 * so bring up kfd here if it was not initialized before 5911 */ 5912 if (!adev->kfd.init_complete) 5913 amdgpu_amdkfd_device_init(adev); 5914 5915 if (audio_suspended) 5916 amdgpu_device_resume_display_audio(tmp_adev); 5917 5918 amdgpu_device_unset_mp1_state(tmp_adev); 5919 5920 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5921 } 5922 5923 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5924 reset_list); 5925 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5926 5927 end_reset: 5928 if (hive) { 5929 mutex_unlock(&hive->hive_lock); 5930 amdgpu_put_xgmi_hive(hive); 5931 } 5932 5933 if (r) 5934 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5935 5936 atomic_set(&adev->reset_domain->reset_res, r); 5937 return r; 5938 } 5939 5940 /** 5941 * amdgpu_device_partner_bandwidth - find the bandwidth of the appropriate partner 5942 * 5943 * @adev: amdgpu_device pointer 5944 * @speed: pointer to the speed of the link 5945 * @width: pointer to the width of the link 5946 * 5947 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 5948 * first physical partner to an AMD dGPU. 5949 * This will exclude any virtual switches and links. 5950 */ 5951 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 5952 enum pci_bus_speed *speed, 5953 enum pcie_link_width *width) 5954 { 5955 struct pci_dev *parent = adev->pdev; 5956 5957 if (!speed || !width) 5958 return; 5959 5960 *speed = PCI_SPEED_UNKNOWN; 5961 *width = PCIE_LNK_WIDTH_UNKNOWN; 5962 5963 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 5964 while ((parent = pci_upstream_bridge(parent))) { 5965 /* skip upstream/downstream switches internal to the dGPU */ 5966 if (parent->vendor == PCI_VENDOR_ID_ATI) 5967 continue; 5968 *speed = pcie_get_speed_cap(parent); 5969 *width = pcie_get_width_cap(parent); 5970 break; 5971 } 5972 } else { 5973 /* use the current speeds rather than max if switching is not supported */ 5974 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 5975 } 5976 } 5977 5978 /** 5979 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot 5980 * 5981 * @adev: amdgpu_device pointer 5982 * 5983 * Fetches and stores in the driver the PCIE capabilities (gen speed 5984 * and lanes) of the slot the device is in. Handles APUs and 5985 * virtualized environments where PCIE config space may not be available.
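 * The result is stored in adev->pm.pcie_gen_mask and adev->pm.pcie_mlw_mask;
 * any non-zero amdgpu_pcie_gen_cap / amdgpu_pcie_lane_cap override is applied
 * first and takes precedence over the probed values.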
5986 */ 5987 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5988 { 5989 struct pci_dev *pdev; 5990 enum pci_bus_speed speed_cap, platform_speed_cap; 5991 enum pcie_link_width platform_link_width; 5992 5993 if (amdgpu_pcie_gen_cap) 5994 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5995 5996 if (amdgpu_pcie_lane_cap) 5997 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5998 5999 /* covers APUs as well */ 6000 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6001 if (adev->pm.pcie_gen_mask == 0) 6002 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6003 if (adev->pm.pcie_mlw_mask == 0) 6004 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6005 return; 6006 } 6007 6008 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6009 return; 6010 6011 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6012 &platform_link_width); 6013 6014 if (adev->pm.pcie_gen_mask == 0) { 6015 /* asic caps */ 6016 pdev = adev->pdev; 6017 speed_cap = pcie_get_speed_cap(pdev); 6018 if (speed_cap == PCI_SPEED_UNKNOWN) { 6019 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6020 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6021 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6022 } else { 6023 if (speed_cap == PCIE_SPEED_32_0GT) 6024 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6025 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6026 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6027 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6028 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6029 else if (speed_cap == PCIE_SPEED_16_0GT) 6030 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6031 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6032 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6033 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6034 else if (speed_cap == PCIE_SPEED_8_0GT) 6035 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6036 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6037 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6038 else if (speed_cap == PCIE_SPEED_5_0GT) 6039 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6040 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6041 else 6042 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6043 } 6044 /* platform caps */ 6045 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6046 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6047 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6048 } else { 6049 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6050 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6051 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6052 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6053 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6054 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6055 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6056 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6057 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6058 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6059 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6060 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6061 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6062 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6063 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6064 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6065 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6066 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6067 else 6068 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6069 6070 } 6071 } 6072 if (adev->pm.pcie_mlw_mask == 0) { 6073 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 
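/* no platform lane-width cap is available, fall back to the default mask */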
6074 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6075 } else { 6076 switch (platform_link_width) { 6077 case PCIE_LNK_X32: 6078 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6079 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6080 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6081 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6082 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6083 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6084 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6085 break; 6086 case PCIE_LNK_X16: 6087 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6088 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6089 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6090 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6091 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6092 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6093 break; 6094 case PCIE_LNK_X12: 6095 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6096 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6097 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6098 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6099 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6100 break; 6101 case PCIE_LNK_X8: 6102 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6103 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6104 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6105 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6106 break; 6107 case PCIE_LNK_X4: 6108 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6109 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6110 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6111 break; 6112 case PCIE_LNK_X2: 6113 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6114 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6115 break; 6116 case PCIE_LNK_X1: 6117 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6118 break; 6119 default: 6120 break; 6121 } 6122 } 6123 } 6124 } 6125 6126 /** 6127 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6128 * 6129 * @adev: amdgpu_device pointer 6130 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6131 * 6132 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6133 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6134 * @peer_adev. 6135 */ 6136 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6137 struct amdgpu_device *peer_adev) 6138 { 6139 #ifdef CONFIG_HSA_AMD_P2P 6140 uint64_t address_mask = peer_adev->dev->dma_mask ? 
6141 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6142 resource_size_t aper_limit = 6143 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6144 bool p2p_access = 6145 !adev->gmc.xgmi.connected_to_cpu && 6146 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6147 6148 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 6149 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 6150 !(adev->gmc.aper_base & address_mask || 6151 aper_limit & address_mask)); 6152 #else 6153 return false; 6154 #endif 6155 } 6156 6157 int amdgpu_device_baco_enter(struct drm_device *dev) 6158 { 6159 struct amdgpu_device *adev = drm_to_adev(dev); 6160 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6161 6162 if (!amdgpu_device_supports_baco(dev)) 6163 return -ENOTSUPP; 6164 6165 if (ras && adev->ras_enabled && 6166 adev->nbio.funcs->enable_doorbell_interrupt) 6167 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6168 6169 return amdgpu_dpm_baco_enter(adev); 6170 } 6171 6172 int amdgpu_device_baco_exit(struct drm_device *dev) 6173 { 6174 struct amdgpu_device *adev = drm_to_adev(dev); 6175 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6176 int ret = 0; 6177 6178 if (!amdgpu_device_supports_baco(dev)) 6179 return -ENOTSUPP; 6180 6181 ret = amdgpu_dpm_baco_exit(adev); 6182 if (ret) 6183 return ret; 6184 6185 if (ras && adev->ras_enabled && 6186 adev->nbio.funcs->enable_doorbell_interrupt) 6187 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6188 6189 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6190 adev->nbio.funcs->clear_doorbell_interrupt) 6191 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6192 6193 return 0; 6194 } 6195 6196 /** 6197 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6198 * @pdev: PCI device struct 6199 * @state: PCI channel state 6200 * 6201 * Description: Called when a PCI error is detected. 6202 * 6203 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
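 * PCI_ERS_RESULT_CAN_RECOVER is returned when the channel state is still
 * pci_channel_io_normal.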
6204 */ 6205 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6206 { 6207 struct drm_device *dev = pci_get_drvdata(pdev); 6208 struct amdgpu_device *adev = drm_to_adev(dev); 6209 int i; 6210 6211 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 6212 6213 if (adev->gmc.xgmi.num_physical_nodes > 1) { 6214 DRM_WARN("No support for XGMI hive yet..."); 6215 return PCI_ERS_RESULT_DISCONNECT; 6216 } 6217 6218 adev->pci_channel_state = state; 6219 6220 switch (state) { 6221 case pci_channel_io_normal: 6222 return PCI_ERS_RESULT_CAN_RECOVER; 6223 /* Fatal error, prepare for slot reset */ 6224 case pci_channel_io_frozen: 6225 /* 6226 * Locking adev->reset_domain->sem will prevent any external access 6227 * to GPU during PCI error recovery 6228 */ 6229 amdgpu_device_lock_reset_domain(adev->reset_domain); 6230 amdgpu_device_set_mp1_state(adev); 6231 6232 /* 6233 * Block any work scheduling as we do for regular GPU reset 6234 * for the duration of the recovery 6235 */ 6236 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6237 struct amdgpu_ring *ring = adev->rings[i]; 6238 6239 if (!amdgpu_ring_sched_ready(ring)) 6240 continue; 6241 6242 drm_sched_stop(&ring->sched, NULL); 6243 } 6244 atomic_inc(&adev->gpu_reset_counter); 6245 return PCI_ERS_RESULT_NEED_RESET; 6246 case pci_channel_io_perm_failure: 6247 /* Permanent error, prepare for device removal */ 6248 return PCI_ERS_RESULT_DISCONNECT; 6249 } 6250 6251 return PCI_ERS_RESULT_NEED_RESET; 6252 } 6253 6254 /** 6255 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6256 * @pdev: pointer to PCI device 6257 */ 6258 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6259 { 6260 6261 DRM_INFO("PCI error: mmio enabled callback!!\n"); 6262 6263 /* TODO - dump whatever for debugging purposes */ 6264 6265 /* This called only if amdgpu_pci_error_detected returns 6266 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6267 * works, no need to reset slot. 6268 */ 6269 6270 return PCI_ERS_RESULT_RECOVERED; 6271 } 6272 6273 /** 6274 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6275 * @pdev: PCI device struct 6276 * 6277 * Description: This routine is called by the pci error recovery 6278 * code after the PCI slot has been reset, just before we 6279 * should resume normal operations. 
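 *
 * Return: PCI_ERS_RESULT_RECOVERED if the device recovered, otherwise
 * PCI_ERS_RESULT_DISCONNECT.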
6280 */ 6281 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6282 { 6283 struct drm_device *dev = pci_get_drvdata(pdev); 6284 struct amdgpu_device *adev = drm_to_adev(dev); 6285 int r, i; 6286 struct amdgpu_reset_context reset_context; 6287 u32 memsize; 6288 struct list_head device_list; 6289 6290 /* PCI error slot reset should be skipped during RAS recovery */ 6291 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6292 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6293 amdgpu_ras_in_recovery(adev)) 6294 return PCI_ERS_RESULT_RECOVERED; 6295 6296 DRM_INFO("PCI error: slot reset callback!!\n"); 6297 6298 memset(&reset_context, 0, sizeof(reset_context)); 6299 6300 INIT_LIST_HEAD(&device_list); 6301 list_add_tail(&adev->reset_list, &device_list); 6302 6303 /* wait for asic to come out of reset */ 6304 msleep(500); 6305 6306 /* Restore PCI config space */ 6307 amdgpu_device_load_pci_state(pdev); 6308 6309 /* confirm ASIC came out of reset */ 6310 for (i = 0; i < adev->usec_timeout; i++) { 6311 memsize = amdgpu_asic_get_config_memsize(adev); 6312 6313 if (memsize != 0xffffffff) 6314 break; 6315 udelay(1); 6316 } 6317 if (memsize == 0xffffffff) { 6318 r = -ETIME; 6319 goto out; 6320 } 6321 6322 reset_context.method = AMD_RESET_METHOD_NONE; 6323 reset_context.reset_req_dev = adev; 6324 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6325 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6326 6327 adev->no_hw_access = true; 6328 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 6329 adev->no_hw_access = false; 6330 if (r) 6331 goto out; 6332 6333 r = amdgpu_do_asic_reset(&device_list, &reset_context); 6334 6335 out: 6336 if (!r) { 6337 if (amdgpu_device_cache_pci_state(adev->pdev)) 6338 pci_restore_state(adev->pdev); 6339 6340 DRM_INFO("PCIe error recovery succeeded\n"); 6341 } else { 6342 DRM_ERROR("PCIe error recovery failed, err:%d", r); 6343 amdgpu_device_unset_mp1_state(adev); 6344 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6345 } 6346 6347 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6348 } 6349 6350 /** 6351 * amdgpu_pci_resume() - resume normal ops after PCI reset 6352 * @pdev: pointer to PCI device 6353 * 6354 * Called when the error recovery driver tells us that it's 6355 * OK to resume normal operation.
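 * Restarts the ring schedulers that were stopped in amdgpu_pci_error_detected()
 * and unlocks the reset domain, but only when the recorded channel state was
 * pci_channel_io_frozen.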
6356 */ 6357 void amdgpu_pci_resume(struct pci_dev *pdev) 6358 { 6359 struct drm_device *dev = pci_get_drvdata(pdev); 6360 struct amdgpu_device *adev = drm_to_adev(dev); 6361 int i; 6362 6363 6364 DRM_INFO("PCI error: resume callback!!\n"); 6365 6366 /* Only continue execution for the case of pci_channel_io_frozen */ 6367 if (adev->pci_channel_state != pci_channel_io_frozen) 6368 return; 6369 6370 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6371 struct amdgpu_ring *ring = adev->rings[i]; 6372 6373 if (!amdgpu_ring_sched_ready(ring)) 6374 continue; 6375 6376 drm_sched_start(&ring->sched, true); 6377 } 6378 6379 amdgpu_device_unset_mp1_state(adev); 6380 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6381 } 6382 6383 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 6384 { 6385 struct drm_device *dev = pci_get_drvdata(pdev); 6386 struct amdgpu_device *adev = drm_to_adev(dev); 6387 int r; 6388 6389 r = pci_save_state(pdev); 6390 if (!r) { 6391 kfree(adev->pci_state); 6392 6393 adev->pci_state = pci_store_saved_state(pdev); 6394 6395 if (!adev->pci_state) { 6396 DRM_ERROR("Failed to store PCI saved state"); 6397 return false; 6398 } 6399 } else { 6400 DRM_WARN("Failed to save PCI state, err:%d\n", r); 6401 return false; 6402 } 6403 6404 return true; 6405 } 6406 6407 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 6408 { 6409 struct drm_device *dev = pci_get_drvdata(pdev); 6410 struct amdgpu_device *adev = drm_to_adev(dev); 6411 int r; 6412 6413 if (!adev->pci_state) 6414 return false; 6415 6416 r = pci_load_saved_state(pdev, adev->pci_state); 6417 6418 if (!r) { 6419 pci_restore_state(pdev); 6420 } else { 6421 DRM_WARN("Failed to load PCI state, err:%d\n", r); 6422 return false; 6423 } 6424 6425 return true; 6426 } 6427 6428 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 6429 struct amdgpu_ring *ring) 6430 { 6431 #ifdef CONFIG_X86_64 6432 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6433 return; 6434 #endif 6435 if (adev->gmc.xgmi.connected_to_cpu) 6436 return; 6437 6438 if (ring && ring->funcs->emit_hdp_flush) 6439 amdgpu_ring_emit_hdp_flush(ring); 6440 else 6441 amdgpu_asic_flush_hdp(adev, ring); 6442 } 6443 6444 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 6445 struct amdgpu_ring *ring) 6446 { 6447 #ifdef CONFIG_X86_64 6448 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6449 return; 6450 #endif 6451 if (adev->gmc.xgmi.connected_to_cpu) 6452 return; 6453 6454 amdgpu_asic_invalidate_hdp(adev, ring); 6455 } 6456 6457 int amdgpu_in_reset(struct amdgpu_device *adev) 6458 { 6459 return atomic_read(&adev->reset_domain->in_gpu_reset); 6460 } 6461 6462 /** 6463 * amdgpu_device_halt() - bring hardware to some kind of halt state 6464 * 6465 * @adev: amdgpu_device pointer 6466 * 6467 * Bring hardware to some kind of halt state so that no one can touch it 6468 * any more. It helps to maintain the error context when an error occurs. 6469 * Compared to a simple hang, the system will stay stable at least for SSH 6470 * access. Then it should be trivial to inspect the hardware state and 6471 * see what's going on. Implemented as follows: 6472 * 6473 * 1. drm_dev_unplug() makes device inaccessible to user space (IOCTLs, etc.), 6474 * clears all CPU mappings to device, disallows remappings through page faults 6475 * 2. amdgpu_irq_disable_all() disables all interrupts 6476 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 6477 * 4. set adev->no_hw_access to avoid potential crashes after step 5 6478 * 5. 
amdgpu_device_unmap_mmio() clears all MMIO mappings 6479 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 6480 * flush any in flight DMA operations 6481 */ 6482 void amdgpu_device_halt(struct amdgpu_device *adev) 6483 { 6484 struct pci_dev *pdev = adev->pdev; 6485 struct drm_device *ddev = adev_to_drm(adev); 6486 6487 amdgpu_xcp_dev_unplug(adev); 6488 drm_dev_unplug(ddev); 6489 6490 amdgpu_irq_disable_all(adev); 6491 6492 amdgpu_fence_driver_hw_fini(adev); 6493 6494 adev->no_hw_access = true; 6495 6496 amdgpu_device_unmap_mmio(adev); 6497 6498 pci_disable_device(pdev); 6499 pci_wait_for_pending_transaction(pdev); 6500 } 6501 6502 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 6503 u32 reg) 6504 { 6505 unsigned long flags, address, data; 6506 u32 r; 6507 6508 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6509 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6510 6511 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6512 WREG32(address, reg * 4); 6513 (void)RREG32(address); 6514 r = RREG32(data); 6515 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6516 return r; 6517 } 6518 6519 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 6520 u32 reg, u32 v) 6521 { 6522 unsigned long flags, address, data; 6523 6524 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6525 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6526 6527 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6528 WREG32(address, reg * 4); 6529 (void)RREG32(address); 6530 WREG32(data, v); 6531 (void)RREG32(data); 6532 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6533 } 6534 6535 /** 6536 * amdgpu_device_get_gang - return a reference to the current gang 6537 * @adev: amdgpu_device pointer 6538 * 6539 * Returns: A new reference to the current gang leader. 6540 */ 6541 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 6542 { 6543 struct dma_fence *fence; 6544 6545 rcu_read_lock(); 6546 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 6547 rcu_read_unlock(); 6548 return fence; 6549 } 6550 6551 /** 6552 * amdgpu_device_switch_gang - switch to a new gang 6553 * @adev: amdgpu_device pointer 6554 * @gang: the gang to switch to 6555 * 6556 * Try to switch to a new gang. 6557 * Returns: NULL if we switched to the new gang or a reference to the current 6558 * gang leader. 
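 * If a non-NULL fence is returned, the gang was not switched because the
 * current gang leader has not signaled yet; callers typically wait on (or
 * schedule after) that fence before retrying.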
6559 */ 6560 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 6561 struct dma_fence *gang) 6562 { 6563 struct dma_fence *old = NULL; 6564 6565 do { 6566 dma_fence_put(old); 6567 old = amdgpu_device_get_gang(adev); 6568 if (old == gang) 6569 break; 6570 6571 if (!dma_fence_is_signaled(old)) 6572 return old; 6573 6574 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 6575 old, gang) != old); 6576 6577 dma_fence_put(old); 6578 return NULL; 6579 } 6580 6581 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) 6582 { 6583 switch (adev->asic_type) { 6584 #ifdef CONFIG_DRM_AMDGPU_SI 6585 case CHIP_HAINAN: 6586 #endif 6587 case CHIP_TOPAZ: 6588 /* chips with no display hardware */ 6589 return false; 6590 #ifdef CONFIG_DRM_AMDGPU_SI 6591 case CHIP_TAHITI: 6592 case CHIP_PITCAIRN: 6593 case CHIP_VERDE: 6594 case CHIP_OLAND: 6595 #endif 6596 #ifdef CONFIG_DRM_AMDGPU_CIK 6597 case CHIP_BONAIRE: 6598 case CHIP_HAWAII: 6599 case CHIP_KAVERI: 6600 case CHIP_KABINI: 6601 case CHIP_MULLINS: 6602 #endif 6603 case CHIP_TONGA: 6604 case CHIP_FIJI: 6605 case CHIP_POLARIS10: 6606 case CHIP_POLARIS11: 6607 case CHIP_POLARIS12: 6608 case CHIP_VEGAM: 6609 case CHIP_CARRIZO: 6610 case CHIP_STONEY: 6611 /* chips with display hardware */ 6612 return true; 6613 default: 6614 /* IP discovery */ 6615 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) || 6616 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 6617 return false; 6618 return true; 6619 } 6620 } 6621 6622 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, 6623 uint32_t inst, uint32_t reg_addr, char reg_name[], 6624 uint32_t expected_value, uint32_t mask) 6625 { 6626 uint32_t ret = 0; 6627 uint32_t old_ = 0; 6628 uint32_t tmp_ = RREG32(reg_addr); 6629 uint32_t loop = adev->usec_timeout; 6630 6631 while ((tmp_ & (mask)) != (expected_value)) { 6632 if (old_ != tmp_) { 6633 loop = adev->usec_timeout; 6634 old_ = tmp_; 6635 } else 6636 udelay(1); 6637 tmp_ = RREG32(reg_addr); 6638 loop--; 6639 if (!loop) { 6640 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n", 6641 inst, reg_name, (uint32_t)expected_value, 6642 (uint32_t)(tmp_ & (mask))); 6643 ret = -ETIMEDOUT; 6644 break; 6645 } 6646 } 6647 return ret; 6648 } 6649
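/*
 * Illustrative usage sketch (not part of the driver): a hypothetical caller of
 * amdgpu_device_wait_on_rreg() polling a status register until a READY bit is
 * set. mmSOME_STATUS and SOME_STATUS__READY_MASK are placeholder names.
 *
 *	uint32_t r;
 *
 *	r = amdgpu_device_wait_on_rreg(adev, 0, mmSOME_STATUS, "SOME_STATUS",
 *				       SOME_STATUS__READY_MASK,
 *				       SOME_STATUS__READY_MASK);
 *	if (r)
 *		dev_err(adev->dev, "block never reported ready\n");
 *
 * The helper re-arms its timeout whenever the register value changes and only
 * gives up after roughly adev->usec_timeout microseconds without any change.
 */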