1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/pci-p2pdma.h> 36 #include <linux/apple-gmux.h> 37 38 #include <drm/drm_aperture.h> 39 #include <drm/drm_atomic_helper.h> 40 #include <drm/drm_crtc_helper.h> 41 #include <drm/drm_fb_helper.h> 42 #include <drm/drm_probe_helper.h> 43 #include <drm/amdgpu_drm.h> 44 #include <linux/device.h> 45 #include <linux/vgaarb.h> 46 #include <linux/vga_switcheroo.h> 47 #include <linux/efi.h> 48 #include "amdgpu.h" 49 #include "amdgpu_trace.h" 50 #include "amdgpu_i2c.h" 51 #include "atom.h" 52 #include "amdgpu_atombios.h" 53 #include "amdgpu_atomfirmware.h" 54 #include "amd_pcie.h" 55 #ifdef CONFIG_DRM_AMDGPU_SI 56 #include "si.h" 57 #endif 58 #ifdef CONFIG_DRM_AMDGPU_CIK 59 #include "cik.h" 60 #endif 61 #include "vi.h" 62 #include "soc15.h" 63 #include "nv.h" 64 #include "bif/bif_4_1_d.h" 65 #include <linux/firmware.h> 66 #include "amdgpu_vf_error.h" 67 68 #include "amdgpu_amdkfd.h" 69 #include "amdgpu_pm.h" 70 71 #include "amdgpu_xgmi.h" 72 #include "amdgpu_ras.h" 73 #include "amdgpu_pmu.h" 74 #include "amdgpu_fru_eeprom.h" 75 #include "amdgpu_reset.h" 76 #include "amdgpu_virt.h" 77 #include "amdgpu_dev_coredump.h" 78 79 #include <linux/suspend.h> 80 #include <drm/task_barrier.h> 81 #include <linux/pm_runtime.h> 82 83 #include <drm/drm_drv.h> 84 85 #if IS_ENABLED(CONFIG_X86) 86 #include <asm/intel-family.h> 87 #endif 88 89 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 96 97 #define AMDGPU_RESUME_MS 2000 98 #define AMDGPU_MAX_RETRY_LIMIT 2 99 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 100 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 101 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 102 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 103 104 static const 
struct drm_driver amdgpu_kms_driver; 105 106 const char *amdgpu_asic_name[] = { 107 "TAHITI", 108 "PITCAIRN", 109 "VERDE", 110 "OLAND", 111 "HAINAN", 112 "BONAIRE", 113 "KAVERI", 114 "KABINI", 115 "HAWAII", 116 "MULLINS", 117 "TOPAZ", 118 "TONGA", 119 "FIJI", 120 "CARRIZO", 121 "STONEY", 122 "POLARIS10", 123 "POLARIS11", 124 "POLARIS12", 125 "VEGAM", 126 "VEGA10", 127 "VEGA12", 128 "VEGA20", 129 "RAVEN", 130 "ARCTURUS", 131 "RENOIR", 132 "ALDEBARAN", 133 "NAVI10", 134 "CYAN_SKILLFISH", 135 "NAVI14", 136 "NAVI12", 137 "SIENNA_CICHLID", 138 "NAVY_FLOUNDER", 139 "VANGOGH", 140 "DIMGREY_CAVEFISH", 141 "BEIGE_GOBY", 142 "YELLOW_CARP", 143 "IP DISCOVERY", 144 "LAST", 145 }; 146 147 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 148 149 /** 150 * DOC: pcie_replay_count 151 * 152 * The amdgpu driver provides a sysfs API for reporting the total number 153 * of PCIe replays (NAKs) 154 * The file pcie_replay_count is used for this and returns the total 155 * number of replays as a sum of the NAKs generated and NAKs received 156 */ 157 158 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 159 struct device_attribute *attr, char *buf) 160 { 161 struct drm_device *ddev = dev_get_drvdata(dev); 162 struct amdgpu_device *adev = drm_to_adev(ddev); 163 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 164 165 return sysfs_emit(buf, "%llu\n", cnt); 166 } 167 168 static DEVICE_ATTR(pcie_replay_count, 0444, 169 amdgpu_device_get_pcie_replay_count, NULL); 170 171 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 172 struct bin_attribute *attr, char *buf, 173 loff_t ppos, size_t count) 174 { 175 struct device *dev = kobj_to_dev(kobj); 176 struct drm_device *ddev = dev_get_drvdata(dev); 177 struct amdgpu_device *adev = drm_to_adev(ddev); 178 ssize_t bytes_read; 179 180 switch (ppos) { 181 case AMDGPU_SYS_REG_STATE_XGMI: 182 bytes_read = amdgpu_asic_get_reg_state( 183 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 184 break; 185 case AMDGPU_SYS_REG_STATE_WAFL: 186 bytes_read = amdgpu_asic_get_reg_state( 187 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 188 break; 189 case AMDGPU_SYS_REG_STATE_PCIE: 190 bytes_read = amdgpu_asic_get_reg_state( 191 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 192 break; 193 case AMDGPU_SYS_REG_STATE_USR: 194 bytes_read = amdgpu_asic_get_reg_state( 195 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 196 break; 197 case AMDGPU_SYS_REG_STATE_USR_1: 198 bytes_read = amdgpu_asic_get_reg_state( 199 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 200 break; 201 default: 202 return -EINVAL; 203 } 204 205 return bytes_read; 206 } 207 208 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 209 AMDGPU_SYS_REG_STATE_END); 210 211 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 212 { 213 int ret; 214 215 if (!amdgpu_asic_get_reg_state_supported(adev)) 216 return 0; 217 218 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 219 220 return ret; 221 } 222 223 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 224 { 225 if (!amdgpu_asic_get_reg_state_supported(adev)) 226 return; 227 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 228 } 229 230 /** 231 * DOC: board_info 232 * 233 * The amdgpu driver provides a sysfs API for giving board related information. 
234 * It provides the form factor information in the format 235 * 236 * type : form factor 237 * 238 * Possible form factor values 239 * 240 * - "cem" - PCIE CEM card 241 * - "oam" - Open Compute Accelerator Module 242 * - "unknown" - Not known 243 * 244 */ 245 246 static ssize_t amdgpu_device_get_board_info(struct device *dev, 247 struct device_attribute *attr, 248 char *buf) 249 { 250 struct drm_device *ddev = dev_get_drvdata(dev); 251 struct amdgpu_device *adev = drm_to_adev(ddev); 252 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 253 const char *pkg; 254 255 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 256 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 257 258 switch (pkg_type) { 259 case AMDGPU_PKG_TYPE_CEM: 260 pkg = "cem"; 261 break; 262 case AMDGPU_PKG_TYPE_OAM: 263 pkg = "oam"; 264 break; 265 default: 266 pkg = "unknown"; 267 break; 268 } 269 270 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 271 } 272 273 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 274 275 static struct attribute *amdgpu_board_attrs[] = { 276 &dev_attr_board_info.attr, 277 NULL, 278 }; 279 280 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 281 struct attribute *attr, int n) 282 { 283 struct device *dev = kobj_to_dev(kobj); 284 struct drm_device *ddev = dev_get_drvdata(dev); 285 struct amdgpu_device *adev = drm_to_adev(ddev); 286 287 if (adev->flags & AMD_IS_APU) 288 return 0; 289 290 return attr->mode; 291 } 292 293 static const struct attribute_group amdgpu_board_attrs_group = { 294 .attrs = amdgpu_board_attrs, 295 .is_visible = amdgpu_board_attrs_is_visible 296 }; 297 298 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 299 300 301 /** 302 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 303 * 304 * @dev: drm_device pointer 305 * 306 * Returns true if the device is a dGPU with ATPX power control, 307 * otherwise return false. 308 */ 309 bool amdgpu_device_supports_px(struct drm_device *dev) 310 { 311 struct amdgpu_device *adev = drm_to_adev(dev); 312 313 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 314 return true; 315 return false; 316 } 317 318 /** 319 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 320 * 321 * @dev: drm_device pointer 322 * 323 * Returns true if the device is a dGPU with ACPI power control, 324 * otherwise return false. 325 */ 326 bool amdgpu_device_supports_boco(struct drm_device *dev) 327 { 328 struct amdgpu_device *adev = drm_to_adev(dev); 329 330 if (adev->has_pr3 || 331 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 332 return true; 333 return false; 334 } 335 336 /** 337 * amdgpu_device_supports_baco - Does the device support BACO 338 * 339 * @dev: drm_device pointer 340 * 341 * Return: 342 * 1 if the device supporte BACO; 343 * 3 if the device support MACO (only works if BACO is supported) 344 * otherwise return 0. 
345 */ 346 int amdgpu_device_supports_baco(struct drm_device *dev) 347 { 348 struct amdgpu_device *adev = drm_to_adev(dev); 349 350 return amdgpu_asic_supports_baco(adev); 351 } 352 353 void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev) 354 { 355 struct drm_device *dev; 356 int bamaco_support; 357 358 dev = adev_to_drm(adev); 359 360 adev->pm.rpm_mode = AMDGPU_RUNPM_NONE; 361 bamaco_support = amdgpu_device_supports_baco(dev); 362 363 switch (amdgpu_runtime_pm) { 364 case 2: 365 if (bamaco_support & MACO_SUPPORT) { 366 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO; 367 dev_info(adev->dev, "Forcing BAMACO for runtime pm\n"); 368 } else if (bamaco_support == BACO_SUPPORT) { 369 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 370 dev_info(adev->dev, "Requested mode BAMACO not available,fallback to use BACO\n"); 371 } 372 break; 373 case 1: 374 if (bamaco_support & BACO_SUPPORT) { 375 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 376 dev_info(adev->dev, "Forcing BACO for runtime pm\n"); 377 } 378 break; 379 case -1: 380 case -2: 381 if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */ 382 adev->pm.rpm_mode = AMDGPU_RUNPM_PX; 383 dev_info(adev->dev, "Using ATPX for runtime pm\n"); 384 } else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */ 385 adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO; 386 dev_info(adev->dev, "Using BOCO for runtime pm\n"); 387 } else { 388 if (!bamaco_support) 389 goto no_runtime_pm; 390 391 switch (adev->asic_type) { 392 case CHIP_VEGA20: 393 case CHIP_ARCTURUS: 394 /* BACO are not supported on vega20 and arctrus */ 395 break; 396 case CHIP_VEGA10: 397 /* enable BACO as runpm mode if noretry=0 */ 398 if (!adev->gmc.noretry) 399 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 400 break; 401 default: 402 /* enable BACO as runpm mode on CI+ */ 403 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 404 break; 405 } 406 407 if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) { 408 if (bamaco_support & MACO_SUPPORT) { 409 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO; 410 dev_info(adev->dev, "Using BAMACO for runtime pm\n"); 411 } else { 412 dev_info(adev->dev, "Using BACO for runtime pm\n"); 413 } 414 } 415 } 416 break; 417 case 0: 418 dev_info(adev->dev, "runtime pm is manually disabled\n"); 419 break; 420 default: 421 break; 422 } 423 424 no_runtime_pm: 425 if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE) 426 dev_info(adev->dev, "Runtime PM not available\n"); 427 } 428 /** 429 * amdgpu_device_supports_smart_shift - Is the device dGPU with 430 * smart shift support 431 * 432 * @dev: drm_device pointer 433 * 434 * Returns true if the device is a dGPU with Smart Shift support, 435 * otherwise returns false. 
436 */ 437 bool amdgpu_device_supports_smart_shift(struct drm_device *dev) 438 { 439 return (amdgpu_device_supports_boco(dev) && 440 amdgpu_acpi_is_power_shift_control_supported()); 441 } 442 443 /* 444 * VRAM access helper functions 445 */ 446 447 /** 448 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA 449 * 450 * @adev: amdgpu_device pointer 451 * @pos: offset of the buffer in vram 452 * @buf: virtual address of the buffer in system memory 453 * @size: read/write size, sizeof(@buf) must > @size 454 * @write: true - write to vram, otherwise - read from vram 455 */ 456 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, 457 void *buf, size_t size, bool write) 458 { 459 unsigned long flags; 460 uint32_t hi = ~0, tmp = 0; 461 uint32_t *data = buf; 462 uint64_t last; 463 int idx; 464 465 if (!drm_dev_enter(adev_to_drm(adev), &idx)) 466 return; 467 468 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)); 469 470 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 471 for (last = pos + size; pos < last; pos += 4) { 472 tmp = pos >> 31; 473 474 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 475 if (tmp != hi) { 476 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 477 hi = tmp; 478 } 479 if (write) 480 WREG32_NO_KIQ(mmMM_DATA, *data++); 481 else 482 *data++ = RREG32_NO_KIQ(mmMM_DATA); 483 } 484 485 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 486 drm_dev_exit(idx); 487 } 488 489 /** 490 * amdgpu_device_aper_access - access vram by vram aperature 491 * 492 * @adev: amdgpu_device pointer 493 * @pos: offset of the buffer in vram 494 * @buf: virtual address of the buffer in system memory 495 * @size: read/write size, sizeof(@buf) must > @size 496 * @write: true - write to vram, otherwise - read from vram 497 * 498 * The return value means how many bytes have been transferred. 
499 */ 500 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos, 501 void *buf, size_t size, bool write) 502 { 503 #ifdef CONFIG_64BIT 504 void __iomem *addr; 505 size_t count = 0; 506 uint64_t last; 507 508 if (!adev->mman.aper_base_kaddr) 509 return 0; 510 511 last = min(pos + size, adev->gmc.visible_vram_size); 512 if (last > pos) { 513 addr = adev->mman.aper_base_kaddr + pos; 514 count = last - pos; 515 516 if (write) { 517 memcpy_toio(addr, buf, count); 518 /* Make sure HDP write cache flush happens without any reordering 519 * after the system memory contents are sent over PCIe device 520 */ 521 mb(); 522 amdgpu_device_flush_hdp(adev, NULL); 523 } else { 524 amdgpu_device_invalidate_hdp(adev, NULL); 525 /* Make sure HDP read cache is invalidated before issuing a read 526 * to the PCIe device 527 */ 528 mb(); 529 memcpy_fromio(buf, addr, count); 530 } 531 532 } 533 534 return count; 535 #else 536 return 0; 537 #endif 538 } 539 540 /** 541 * amdgpu_device_vram_access - read/write a buffer in vram 542 * 543 * @adev: amdgpu_device pointer 544 * @pos: offset of the buffer in vram 545 * @buf: virtual address of the buffer in system memory 546 * @size: read/write size, sizeof(@buf) must > @size 547 * @write: true - write to vram, otherwise - read from vram 548 */ 549 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 550 void *buf, size_t size, bool write) 551 { 552 size_t count; 553 554 /* try to using vram apreature to access vram first */ 555 count = amdgpu_device_aper_access(adev, pos, buf, size, write); 556 size -= count; 557 if (size) { 558 /* using MM to access rest vram */ 559 pos += count; 560 buf += count; 561 amdgpu_device_mm_access(adev, pos, buf, size, write); 562 } 563 } 564 565 /* 566 * register access helper functions. 567 */ 568 569 /* Check if hw access should be skipped because of hotplug or device error */ 570 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) 571 { 572 if (adev->no_hw_access) 573 return true; 574 575 #ifdef CONFIG_LOCKDEP 576 /* 577 * This is a bit complicated to understand, so worth a comment. What we assert 578 * here is that the GPU reset is not running on another thread in parallel. 579 * 580 * For this we trylock the read side of the reset semaphore, if that succeeds 581 * we know that the reset is not running in paralell. 582 * 583 * If the trylock fails we assert that we are either already holding the read 584 * side of the lock or are the reset thread itself and hold the write side of 585 * the lock. 586 */ 587 if (in_task()) { 588 if (down_read_trylock(&adev->reset_domain->sem)) 589 up_read(&adev->reset_domain->sem); 590 else 591 lockdep_assert_held(&adev->reset_domain->sem); 592 } 593 #endif 594 return false; 595 } 596 597 /** 598 * amdgpu_device_rreg - read a memory mapped IO or indirect register 599 * 600 * @adev: amdgpu_device pointer 601 * @reg: dword aligned register offset 602 * @acc_flags: access flags which require special behavior 603 * 604 * Returns the 32 bit value from the offset specified. 
605 */ 606 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 607 uint32_t reg, uint32_t acc_flags) 608 { 609 uint32_t ret; 610 611 if (amdgpu_device_skip_hw_access(adev)) 612 return 0; 613 614 if ((reg * 4) < adev->rmmio_size) { 615 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 616 amdgpu_sriov_runtime(adev) && 617 down_read_trylock(&adev->reset_domain->sem)) { 618 ret = amdgpu_kiq_rreg(adev, reg, 0); 619 up_read(&adev->reset_domain->sem); 620 } else { 621 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 622 } 623 } else { 624 ret = adev->pcie_rreg(adev, reg * 4); 625 } 626 627 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 628 629 return ret; 630 } 631 632 /* 633 * MMIO register read with bytes helper functions 634 * @offset:bytes offset from MMIO start 635 */ 636 637 /** 638 * amdgpu_mm_rreg8 - read a memory mapped IO register 639 * 640 * @adev: amdgpu_device pointer 641 * @offset: byte aligned register offset 642 * 643 * Returns the 8 bit value from the offset specified. 644 */ 645 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 646 { 647 if (amdgpu_device_skip_hw_access(adev)) 648 return 0; 649 650 if (offset < adev->rmmio_size) 651 return (readb(adev->rmmio + offset)); 652 BUG(); 653 } 654 655 656 /** 657 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC 658 * 659 * @adev: amdgpu_device pointer 660 * @reg: dword aligned register offset 661 * @acc_flags: access flags which require special behavior 662 * @xcc_id: xcc accelerated compute core id 663 * 664 * Returns the 32 bit value from the offset specified. 665 */ 666 uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev, 667 uint32_t reg, uint32_t acc_flags, 668 uint32_t xcc_id) 669 { 670 uint32_t ret, rlcg_flag; 671 672 if (amdgpu_device_skip_hw_access(adev)) 673 return 0; 674 675 if ((reg * 4) < adev->rmmio_size) { 676 if (amdgpu_sriov_vf(adev) && 677 !amdgpu_sriov_runtime(adev) && 678 adev->gfx.rlc.rlcg_reg_access_supported && 679 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 680 GC_HWIP, false, 681 &rlcg_flag)) { 682 ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, xcc_id); 683 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 684 amdgpu_sriov_runtime(adev) && 685 down_read_trylock(&adev->reset_domain->sem)) { 686 ret = amdgpu_kiq_rreg(adev, reg, xcc_id); 687 up_read(&adev->reset_domain->sem); 688 } else { 689 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 690 } 691 } else { 692 ret = adev->pcie_rreg(adev, reg * 4); 693 } 694 695 return ret; 696 } 697 698 /* 699 * MMIO register write with bytes helper functions 700 * @offset:bytes offset from MMIO start 701 * @value: the value want to be written to the register 702 */ 703 704 /** 705 * amdgpu_mm_wreg8 - read a memory mapped IO register 706 * 707 * @adev: amdgpu_device pointer 708 * @offset: byte aligned register offset 709 * @value: 8 bit value to write 710 * 711 * Writes the value specified to the offset specified. 
712 */ 713 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 714 { 715 if (amdgpu_device_skip_hw_access(adev)) 716 return; 717 718 if (offset < adev->rmmio_size) 719 writeb(value, adev->rmmio + offset); 720 else 721 BUG(); 722 } 723 724 /** 725 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 726 * 727 * @adev: amdgpu_device pointer 728 * @reg: dword aligned register offset 729 * @v: 32 bit value to write to the register 730 * @acc_flags: access flags which require special behavior 731 * 732 * Writes the value specified to the offset specified. 733 */ 734 void amdgpu_device_wreg(struct amdgpu_device *adev, 735 uint32_t reg, uint32_t v, 736 uint32_t acc_flags) 737 { 738 if (amdgpu_device_skip_hw_access(adev)) 739 return; 740 741 if ((reg * 4) < adev->rmmio_size) { 742 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 743 amdgpu_sriov_runtime(adev) && 744 down_read_trylock(&adev->reset_domain->sem)) { 745 amdgpu_kiq_wreg(adev, reg, v, 0); 746 up_read(&adev->reset_domain->sem); 747 } else { 748 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 749 } 750 } else { 751 adev->pcie_wreg(adev, reg * 4, v); 752 } 753 754 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 755 } 756 757 /** 758 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 759 * 760 * @adev: amdgpu_device pointer 761 * @reg: mmio/rlc register 762 * @v: value to write 763 * @xcc_id: xcc accelerated compute core id 764 * 765 * this function is invoked only for the debugfs register access 766 */ 767 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 768 uint32_t reg, uint32_t v, 769 uint32_t xcc_id) 770 { 771 if (amdgpu_device_skip_hw_access(adev)) 772 return; 773 774 if (amdgpu_sriov_fullaccess(adev) && 775 adev->gfx.rlc.funcs && 776 adev->gfx.rlc.funcs->is_rlcg_access_range) { 777 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 778 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 779 } else if ((reg * 4) >= adev->rmmio_size) { 780 adev->pcie_wreg(adev, reg * 4, v); 781 } else { 782 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 783 } 784 } 785 786 /** 787 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 788 * 789 * @adev: amdgpu_device pointer 790 * @reg: dword aligned register offset 791 * @v: 32 bit value to write to the register 792 * @acc_flags: access flags which require special behavior 793 * @xcc_id: xcc accelerated compute core id 794 * 795 * Writes the value specified to the offset specified. 
796 */ 797 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 798 uint32_t reg, uint32_t v, 799 uint32_t acc_flags, uint32_t xcc_id) 800 { 801 uint32_t rlcg_flag; 802 803 if (amdgpu_device_skip_hw_access(adev)) 804 return; 805 806 if ((reg * 4) < adev->rmmio_size) { 807 if (amdgpu_sriov_vf(adev) && 808 !amdgpu_sriov_runtime(adev) && 809 adev->gfx.rlc.rlcg_reg_access_supported && 810 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 811 GC_HWIP, true, 812 &rlcg_flag)) { 813 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, xcc_id); 814 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 815 amdgpu_sriov_runtime(adev) && 816 down_read_trylock(&adev->reset_domain->sem)) { 817 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 818 up_read(&adev->reset_domain->sem); 819 } else { 820 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 821 } 822 } else { 823 adev->pcie_wreg(adev, reg * 4, v); 824 } 825 } 826 827 /** 828 * amdgpu_device_indirect_rreg - read an indirect register 829 * 830 * @adev: amdgpu_device pointer 831 * @reg_addr: indirect register address to read from 832 * 833 * Returns the value of indirect register @reg_addr 834 */ 835 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 836 u32 reg_addr) 837 { 838 unsigned long flags, pcie_index, pcie_data; 839 void __iomem *pcie_index_offset; 840 void __iomem *pcie_data_offset; 841 u32 r; 842 843 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 844 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 845 846 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 847 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 848 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 849 850 writel(reg_addr, pcie_index_offset); 851 readl(pcie_index_offset); 852 r = readl(pcie_data_offset); 853 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 854 855 return r; 856 } 857 858 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 859 u64 reg_addr) 860 { 861 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 862 u32 r; 863 void __iomem *pcie_index_offset; 864 void __iomem *pcie_index_hi_offset; 865 void __iomem *pcie_data_offset; 866 867 if (unlikely(!adev->nbio.funcs)) { 868 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 869 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 870 } else { 871 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 872 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 873 } 874 875 if (reg_addr >> 32) { 876 if (unlikely(!adev->nbio.funcs)) 877 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 878 else 879 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 880 } else { 881 pcie_index_hi = 0; 882 } 883 884 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 885 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 886 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 887 if (pcie_index_hi != 0) 888 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 889 pcie_index_hi * 4; 890 891 writel(reg_addr, pcie_index_offset); 892 readl(pcie_index_offset); 893 if (pcie_index_hi != 0) { 894 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 895 readl(pcie_index_hi_offset); 896 } 897 r = readl(pcie_data_offset); 898 899 /* clear the high bits */ 900 if (pcie_index_hi != 0) { 901 writel(0, pcie_index_hi_offset); 902 readl(pcie_index_hi_offset); 903 } 904 905 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 906 907 return r; 908 } 909 910 /** 911 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 912 * 913 * @adev: amdgpu_device 
pointer 914 * @reg_addr: indirect register address to read from 915 * 916 * Returns the value of indirect register @reg_addr 917 */ 918 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 919 u32 reg_addr) 920 { 921 unsigned long flags, pcie_index, pcie_data; 922 void __iomem *pcie_index_offset; 923 void __iomem *pcie_data_offset; 924 u64 r; 925 926 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 927 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 928 929 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 930 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 931 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 932 933 /* read low 32 bits */ 934 writel(reg_addr, pcie_index_offset); 935 readl(pcie_index_offset); 936 r = readl(pcie_data_offset); 937 /* read high 32 bits */ 938 writel(reg_addr + 4, pcie_index_offset); 939 readl(pcie_index_offset); 940 r |= ((u64)readl(pcie_data_offset) << 32); 941 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 942 943 return r; 944 } 945 946 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 947 u64 reg_addr) 948 { 949 unsigned long flags, pcie_index, pcie_data; 950 unsigned long pcie_index_hi = 0; 951 void __iomem *pcie_index_offset; 952 void __iomem *pcie_index_hi_offset; 953 void __iomem *pcie_data_offset; 954 u64 r; 955 956 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 957 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 958 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 959 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 960 961 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 962 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 963 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 964 if (pcie_index_hi != 0) 965 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 966 pcie_index_hi * 4; 967 968 /* read low 32 bits */ 969 writel(reg_addr, pcie_index_offset); 970 readl(pcie_index_offset); 971 if (pcie_index_hi != 0) { 972 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 973 readl(pcie_index_hi_offset); 974 } 975 r = readl(pcie_data_offset); 976 /* read high 32 bits */ 977 writel(reg_addr + 4, pcie_index_offset); 978 readl(pcie_index_offset); 979 if (pcie_index_hi != 0) { 980 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 981 readl(pcie_index_hi_offset); 982 } 983 r |= ((u64)readl(pcie_data_offset) << 32); 984 985 /* clear the high bits */ 986 if (pcie_index_hi != 0) { 987 writel(0, pcie_index_hi_offset); 988 readl(pcie_index_hi_offset); 989 } 990 991 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 992 993 return r; 994 } 995 996 /** 997 * amdgpu_device_indirect_wreg - write an indirect register address 998 * 999 * @adev: amdgpu_device pointer 1000 * @reg_addr: indirect register offset 1001 * @reg_data: indirect register data 1002 * 1003 */ 1004 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 1005 u32 reg_addr, u32 reg_data) 1006 { 1007 unsigned long flags, pcie_index, pcie_data; 1008 void __iomem *pcie_index_offset; 1009 void __iomem *pcie_data_offset; 1010 1011 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1012 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1013 1014 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1015 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1016 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1017 1018 writel(reg_addr, pcie_index_offset); 1019 readl(pcie_index_offset); 
1020 writel(reg_data, pcie_data_offset); 1021 readl(pcie_data_offset); 1022 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1023 } 1024 1025 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 1026 u64 reg_addr, u32 reg_data) 1027 { 1028 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1029 void __iomem *pcie_index_offset; 1030 void __iomem *pcie_index_hi_offset; 1031 void __iomem *pcie_data_offset; 1032 1033 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1034 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1035 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1036 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1037 else 1038 pcie_index_hi = 0; 1039 1040 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1041 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1042 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1043 if (pcie_index_hi != 0) 1044 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1045 pcie_index_hi * 4; 1046 1047 writel(reg_addr, pcie_index_offset); 1048 readl(pcie_index_offset); 1049 if (pcie_index_hi != 0) { 1050 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1051 readl(pcie_index_hi_offset); 1052 } 1053 writel(reg_data, pcie_data_offset); 1054 readl(pcie_data_offset); 1055 1056 /* clear the high bits */ 1057 if (pcie_index_hi != 0) { 1058 writel(0, pcie_index_hi_offset); 1059 readl(pcie_index_hi_offset); 1060 } 1061 1062 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1063 } 1064 1065 /** 1066 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 1067 * 1068 * @adev: amdgpu_device pointer 1069 * @reg_addr: indirect register offset 1070 * @reg_data: indirect register data 1071 * 1072 */ 1073 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 1074 u32 reg_addr, u64 reg_data) 1075 { 1076 unsigned long flags, pcie_index, pcie_data; 1077 void __iomem *pcie_index_offset; 1078 void __iomem *pcie_data_offset; 1079 1080 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1081 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1082 1083 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1084 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1085 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1086 1087 /* write low 32 bits */ 1088 writel(reg_addr, pcie_index_offset); 1089 readl(pcie_index_offset); 1090 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1091 readl(pcie_data_offset); 1092 /* write high 32 bits */ 1093 writel(reg_addr + 4, pcie_index_offset); 1094 readl(pcie_index_offset); 1095 writel((u32)(reg_data >> 32), pcie_data_offset); 1096 readl(pcie_data_offset); 1097 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1098 } 1099 1100 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1101 u64 reg_addr, u64 reg_data) 1102 { 1103 unsigned long flags, pcie_index, pcie_data; 1104 unsigned long pcie_index_hi = 0; 1105 void __iomem *pcie_index_offset; 1106 void __iomem *pcie_index_hi_offset; 1107 void __iomem *pcie_data_offset; 1108 1109 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1110 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1111 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1112 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1113 1114 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1115 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1116 
pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1117 if (pcie_index_hi != 0) 1118 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1119 pcie_index_hi * 4; 1120 1121 /* write low 32 bits */ 1122 writel(reg_addr, pcie_index_offset); 1123 readl(pcie_index_offset); 1124 if (pcie_index_hi != 0) { 1125 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1126 readl(pcie_index_hi_offset); 1127 } 1128 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1129 readl(pcie_data_offset); 1130 /* write high 32 bits */ 1131 writel(reg_addr + 4, pcie_index_offset); 1132 readl(pcie_index_offset); 1133 if (pcie_index_hi != 0) { 1134 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1135 readl(pcie_index_hi_offset); 1136 } 1137 writel((u32)(reg_data >> 32), pcie_data_offset); 1138 readl(pcie_data_offset); 1139 1140 /* clear the high bits */ 1141 if (pcie_index_hi != 0) { 1142 writel(0, pcie_index_hi_offset); 1143 readl(pcie_index_hi_offset); 1144 } 1145 1146 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1147 } 1148 1149 /** 1150 * amdgpu_device_get_rev_id - query device rev_id 1151 * 1152 * @adev: amdgpu_device pointer 1153 * 1154 * Return device rev_id 1155 */ 1156 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1157 { 1158 return adev->nbio.funcs->get_rev_id(adev); 1159 } 1160 1161 /** 1162 * amdgpu_invalid_rreg - dummy reg read function 1163 * 1164 * @adev: amdgpu_device pointer 1165 * @reg: offset of register 1166 * 1167 * Dummy register read function. Used for register blocks 1168 * that certain asics don't have (all asics). 1169 * Returns the value in the register. 1170 */ 1171 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1172 { 1173 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 1174 BUG(); 1175 return 0; 1176 } 1177 1178 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1179 { 1180 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1181 BUG(); 1182 return 0; 1183 } 1184 1185 /** 1186 * amdgpu_invalid_wreg - dummy reg write function 1187 * 1188 * @adev: amdgpu_device pointer 1189 * @reg: offset of register 1190 * @v: value to write to the register 1191 * 1192 * Dummy register read function. Used for register blocks 1193 * that certain asics don't have (all asics). 1194 */ 1195 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1196 { 1197 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 1198 reg, v); 1199 BUG(); 1200 } 1201 1202 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1203 { 1204 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 1205 reg, v); 1206 BUG(); 1207 } 1208 1209 /** 1210 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1211 * 1212 * @adev: amdgpu_device pointer 1213 * @reg: offset of register 1214 * 1215 * Dummy register read function. Used for register blocks 1216 * that certain asics don't have (all asics). 1217 * Returns the value in the register. 
1218 */ 1219 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 1220 { 1221 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 1222 BUG(); 1223 return 0; 1224 } 1225 1226 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) 1227 { 1228 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1229 BUG(); 1230 return 0; 1231 } 1232 1233 /** 1234 * amdgpu_invalid_wreg64 - dummy reg write function 1235 * 1236 * @adev: amdgpu_device pointer 1237 * @reg: offset of register 1238 * @v: value to write to the register 1239 * 1240 * Dummy register read function. Used for register blocks 1241 * that certain asics don't have (all asics). 1242 */ 1243 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 1244 { 1245 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 1246 reg, v); 1247 BUG(); 1248 } 1249 1250 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) 1251 { 1252 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", 1253 reg, v); 1254 BUG(); 1255 } 1256 1257 /** 1258 * amdgpu_block_invalid_rreg - dummy reg read function 1259 * 1260 * @adev: amdgpu_device pointer 1261 * @block: offset of instance 1262 * @reg: offset of register 1263 * 1264 * Dummy register read function. Used for register blocks 1265 * that certain asics don't have (all asics). 1266 * Returns the value in the register. 1267 */ 1268 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1269 uint32_t block, uint32_t reg) 1270 { 1271 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 1272 reg, block); 1273 BUG(); 1274 return 0; 1275 } 1276 1277 /** 1278 * amdgpu_block_invalid_wreg - dummy reg write function 1279 * 1280 * @adev: amdgpu_device pointer 1281 * @block: offset of instance 1282 * @reg: offset of register 1283 * @v: value to write to the register 1284 * 1285 * Dummy register read function. Used for register blocks 1286 * that certain asics don't have (all asics). 1287 */ 1288 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1289 uint32_t block, 1290 uint32_t reg, uint32_t v) 1291 { 1292 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1293 reg, block, v); 1294 BUG(); 1295 } 1296 1297 /** 1298 * amdgpu_device_asic_init - Wrapper for atom asic_init 1299 * 1300 * @adev: amdgpu_device pointer 1301 * 1302 * Does any asic specific work and then calls atom asic init. 1303 */ 1304 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1305 { 1306 int ret; 1307 1308 amdgpu_asic_pre_asic_init(adev); 1309 1310 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1311 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1312 amdgpu_psp_wait_for_bootloader(adev); 1313 ret = amdgpu_atomfirmware_asic_init(adev, true); 1314 return ret; 1315 } else { 1316 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1317 } 1318 1319 return 0; 1320 } 1321 1322 /** 1323 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1324 * 1325 * @adev: amdgpu_device pointer 1326 * 1327 * Allocates a scratch page of VRAM for use by various things in the 1328 * driver. 
1329 */ 1330 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 1331 { 1332 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 1333 AMDGPU_GEM_DOMAIN_VRAM | 1334 AMDGPU_GEM_DOMAIN_GTT, 1335 &adev->mem_scratch.robj, 1336 &adev->mem_scratch.gpu_addr, 1337 (void **)&adev->mem_scratch.ptr); 1338 } 1339 1340 /** 1341 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 1342 * 1343 * @adev: amdgpu_device pointer 1344 * 1345 * Frees the VRAM scratch page. 1346 */ 1347 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 1348 { 1349 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 1350 } 1351 1352 /** 1353 * amdgpu_device_program_register_sequence - program an array of registers. 1354 * 1355 * @adev: amdgpu_device pointer 1356 * @registers: pointer to the register array 1357 * @array_size: size of the register array 1358 * 1359 * Programs an array or registers with and or masks. 1360 * This is a helper for setting golden registers. 1361 */ 1362 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 1363 const u32 *registers, 1364 const u32 array_size) 1365 { 1366 u32 tmp, reg, and_mask, or_mask; 1367 int i; 1368 1369 if (array_size % 3) 1370 return; 1371 1372 for (i = 0; i < array_size; i += 3) { 1373 reg = registers[i + 0]; 1374 and_mask = registers[i + 1]; 1375 or_mask = registers[i + 2]; 1376 1377 if (and_mask == 0xffffffff) { 1378 tmp = or_mask; 1379 } else { 1380 tmp = RREG32(reg); 1381 tmp &= ~and_mask; 1382 if (adev->family >= AMDGPU_FAMILY_AI) 1383 tmp |= (or_mask & and_mask); 1384 else 1385 tmp |= or_mask; 1386 } 1387 WREG32(reg, tmp); 1388 } 1389 } 1390 1391 /** 1392 * amdgpu_device_pci_config_reset - reset the GPU 1393 * 1394 * @adev: amdgpu_device pointer 1395 * 1396 * Resets the GPU using the pci config reset sequence. 1397 * Only applicable to asics prior to vega10. 1398 */ 1399 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1400 { 1401 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1402 } 1403 1404 /** 1405 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1406 * 1407 * @adev: amdgpu_device pointer 1408 * 1409 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1410 */ 1411 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1412 { 1413 return pci_reset_function(adev->pdev); 1414 } 1415 1416 /* 1417 * amdgpu_device_wb_*() 1418 * Writeback is the method by which the GPU updates special pages in memory 1419 * with the status of certain GPU events (fences, ring pointers,etc.). 1420 */ 1421 1422 /** 1423 * amdgpu_device_wb_fini - Disable Writeback and free memory 1424 * 1425 * @adev: amdgpu_device pointer 1426 * 1427 * Disables Writeback and frees the Writeback memory (all asics). 1428 * Used at driver shutdown. 1429 */ 1430 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1431 { 1432 if (adev->wb.wb_obj) { 1433 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1434 &adev->wb.gpu_addr, 1435 (void **)&adev->wb.wb); 1436 adev->wb.wb_obj = NULL; 1437 } 1438 } 1439 1440 /** 1441 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1442 * 1443 * @adev: amdgpu_device pointer 1444 * 1445 * Initializes writeback and allocates writeback memory (all asics). 1446 * Used at driver startup. 1447 * Returns 0 on success or an -error on failure. 
1448 */ 1449 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1450 { 1451 int r; 1452 1453 if (adev->wb.wb_obj == NULL) { 1454 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1455 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1456 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1457 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1458 (void **)&adev->wb.wb); 1459 if (r) { 1460 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1461 return r; 1462 } 1463 1464 adev->wb.num_wb = AMDGPU_MAX_WB; 1465 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1466 1467 /* clear wb memory */ 1468 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1469 } 1470 1471 return 0; 1472 } 1473 1474 /** 1475 * amdgpu_device_wb_get - Allocate a wb entry 1476 * 1477 * @adev: amdgpu_device pointer 1478 * @wb: wb index 1479 * 1480 * Allocate a wb slot for use by the driver (all asics). 1481 * Returns 0 on success or -EINVAL on failure. 1482 */ 1483 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1484 { 1485 unsigned long flags, offset; 1486 1487 spin_lock_irqsave(&adev->wb.lock, flags); 1488 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1489 if (offset < adev->wb.num_wb) { 1490 __set_bit(offset, adev->wb.used); 1491 spin_unlock_irqrestore(&adev->wb.lock, flags); 1492 *wb = offset << 3; /* convert to dw offset */ 1493 return 0; 1494 } else { 1495 spin_unlock_irqrestore(&adev->wb.lock, flags); 1496 return -EINVAL; 1497 } 1498 } 1499 1500 /** 1501 * amdgpu_device_wb_free - Free a wb entry 1502 * 1503 * @adev: amdgpu_device pointer 1504 * @wb: wb index 1505 * 1506 * Free a wb slot allocated for use by the driver (all asics) 1507 */ 1508 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1509 { 1510 unsigned long flags; 1511 1512 wb >>= 3; 1513 spin_lock_irqsave(&adev->wb.lock, flags); 1514 if (wb < adev->wb.num_wb) 1515 __clear_bit(wb, adev->wb.used); 1516 spin_unlock_irqrestore(&adev->wb.lock, flags); 1517 } 1518 1519 /** 1520 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1521 * 1522 * @adev: amdgpu_device pointer 1523 * 1524 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1525 * to fail, but if any of the BARs is not accessible after the size we abort 1526 * driver loading by returning -ENODEV. 
1527 */ 1528 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1529 { 1530 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1531 struct pci_bus *root; 1532 struct resource *res; 1533 unsigned int i; 1534 u16 cmd; 1535 int r; 1536 1537 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1538 return 0; 1539 1540 /* Bypass for VF */ 1541 if (amdgpu_sriov_vf(adev)) 1542 return 0; 1543 1544 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1545 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1546 DRM_WARN("System can't access extended configuration space, please check!!\n"); 1547 1548 /* skip if the bios has already enabled large BAR */ 1549 if (adev->gmc.real_vram_size && 1550 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1551 return 0; 1552 1553 /* Check if the root BUS has 64bit memory resources */ 1554 root = adev->pdev->bus; 1555 while (root->parent) 1556 root = root->parent; 1557 1558 pci_bus_for_each_resource(root, res, i) { 1559 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1560 res->start > 0x100000000ull) 1561 break; 1562 } 1563 1564 /* Trying to resize is pointless without a root hub window above 4GB */ 1565 if (!res) 1566 return 0; 1567 1568 /* Limit the BAR size to what is available */ 1569 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1570 rbar_size); 1571 1572 /* Disable memory decoding while we change the BAR addresses and size */ 1573 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1574 pci_write_config_word(adev->pdev, PCI_COMMAND, 1575 cmd & ~PCI_COMMAND_MEMORY); 1576 1577 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1578 amdgpu_doorbell_fini(adev); 1579 if (adev->asic_type >= CHIP_BONAIRE) 1580 pci_release_resource(adev->pdev, 2); 1581 1582 pci_release_resource(adev->pdev, 0); 1583 1584 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1585 if (r == -ENOSPC) 1586 DRM_INFO("Not enough PCI address space for a large BAR."); 1587 else if (r && r != -ENOTSUPP) 1588 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1589 1590 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1591 1592 /* When the doorbell or fb BAR isn't available we have no chance of 1593 * using the device. 1594 */ 1595 r = amdgpu_doorbell_init(adev); 1596 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1597 return -ENODEV; 1598 1599 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1600 1601 return 0; 1602 } 1603 1604 static bool amdgpu_device_read_bios(struct amdgpu_device *adev) 1605 { 1606 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1607 return false; 1608 1609 return true; 1610 } 1611 1612 /* 1613 * GPU helpers function. 1614 */ 1615 /** 1616 * amdgpu_device_need_post - check if the hw need post or not 1617 * 1618 * @adev: amdgpu_device pointer 1619 * 1620 * Check if the asic has been initialized (all asics) at driver startup 1621 * or post is needed if hw reset is performed. 1622 * Returns true if need or false if not. 
1623 */ 1624 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1625 { 1626 uint32_t reg; 1627 1628 if (amdgpu_sriov_vf(adev)) 1629 return false; 1630 1631 if (!amdgpu_device_read_bios(adev)) 1632 return false; 1633 1634 if (amdgpu_passthrough(adev)) { 1635 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1636 * some old smc fw still need driver do vPost otherwise gpu hang, while 1637 * those smc fw version above 22.15 doesn't have this flaw, so we force 1638 * vpost executed for smc version below 22.15 1639 */ 1640 if (adev->asic_type == CHIP_FIJI) { 1641 int err; 1642 uint32_t fw_ver; 1643 1644 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1645 /* force vPost if error occured */ 1646 if (err) 1647 return true; 1648 1649 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1650 release_firmware(adev->pm.fw); 1651 if (fw_ver < 0x00160e00) 1652 return true; 1653 } 1654 } 1655 1656 /* Don't post if we need to reset whole hive on init */ 1657 if (adev->gmc.xgmi.pending_reset) 1658 return false; 1659 1660 if (adev->has_hw_reset) { 1661 adev->has_hw_reset = false; 1662 return true; 1663 } 1664 1665 /* bios scratch used on CIK+ */ 1666 if (adev->asic_type >= CHIP_BONAIRE) 1667 return amdgpu_atombios_scratch_need_asic_init(adev); 1668 1669 /* check MEM_SIZE for older asics */ 1670 reg = amdgpu_asic_get_config_memsize(adev); 1671 1672 if ((reg != 0) && (reg != 0xffffffff)) 1673 return false; 1674 1675 return true; 1676 } 1677 1678 /* 1679 * Check whether seamless boot is supported. 1680 * 1681 * So far we only support seamless boot on DCE 3.0 or later. 1682 * If users report that it works on older ASICS as well, we may 1683 * loosen this. 1684 */ 1685 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1686 { 1687 switch (amdgpu_seamless) { 1688 case -1: 1689 break; 1690 case 1: 1691 return true; 1692 case 0: 1693 return false; 1694 default: 1695 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n", 1696 amdgpu_seamless); 1697 return false; 1698 } 1699 1700 if (!(adev->flags & AMD_IS_APU)) 1701 return false; 1702 1703 if (adev->mman.keep_stolen_vga_memory) 1704 return false; 1705 1706 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1707 } 1708 1709 /* 1710 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1711 * don't support dynamic speed switching. Until we have confirmation from Intel 1712 * that a specific host supports it, it's safer that we keep it disabled for all. 1713 * 1714 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1715 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1716 */ 1717 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1718 { 1719 #if IS_ENABLED(CONFIG_X86) 1720 struct cpuinfo_x86 *c = &cpu_data(0); 1721 1722 /* eGPU change speeds based on USB4 fabric conditions */ 1723 if (dev_is_removable(adev->dev)) 1724 return true; 1725 1726 if (c->x86_vendor == X86_VENDOR_INTEL) 1727 return false; 1728 #endif 1729 return true; 1730 } 1731 1732 /** 1733 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1734 * 1735 * @adev: amdgpu_device pointer 1736 * 1737 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1738 * be set for this device. 1739 * 1740 * Returns true if it should be used or false if not. 
1741 */ 1742 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1743 { 1744 switch (amdgpu_aspm) { 1745 case -1: 1746 break; 1747 case 0: 1748 return false; 1749 case 1: 1750 return true; 1751 default: 1752 return false; 1753 } 1754 if (adev->flags & AMD_IS_APU) 1755 return false; 1756 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) 1757 return false; 1758 return pcie_aspm_enabled(adev->pdev); 1759 } 1760 1761 /* if we get transitioned to only one device, take VGA back */ 1762 /** 1763 * amdgpu_device_vga_set_decode - enable/disable vga decode 1764 * 1765 * @pdev: PCI device pointer 1766 * @state: enable/disable vga decode 1767 * 1768 * Enable/disable vga decode (all asics). 1769 * Returns VGA resource flags. 1770 */ 1771 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1772 bool state) 1773 { 1774 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1775 1776 amdgpu_asic_set_vga_state(adev, state); 1777 if (state) 1778 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1779 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1780 else 1781 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1782 } 1783 1784 /** 1785 * amdgpu_device_check_block_size - validate the vm block size 1786 * 1787 * @adev: amdgpu_device pointer 1788 * 1789 * Validates the vm block size specified via module parameter. 1790 * The vm block size defines number of bits in page table versus page directory, 1791 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1792 * page table and the remaining bits are in the page directory. 1793 */ 1794 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1795 { 1796 /* defines number of bits in page table versus page directory, 1797 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1798 * page table and the remaining bits are in the page directory 1799 */ 1800 if (amdgpu_vm_block_size == -1) 1801 return; 1802 1803 if (amdgpu_vm_block_size < 9) { 1804 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1805 amdgpu_vm_block_size); 1806 amdgpu_vm_block_size = -1; 1807 } 1808 } 1809 1810 /** 1811 * amdgpu_device_check_vm_size - validate the vm size 1812 * 1813 * @adev: amdgpu_device pointer 1814 * 1815 * Validates the vm size in GB specified via module parameter. 1816 * The VM size is the size of the GPU virtual memory space in GB. 
1817 */ 1818 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1819 { 1820 /* no need to check the default value */ 1821 if (amdgpu_vm_size == -1) 1822 return; 1823 1824 if (amdgpu_vm_size < 1) { 1825 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1826 amdgpu_vm_size); 1827 amdgpu_vm_size = -1; 1828 } 1829 } 1830 1831 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1832 { 1833 struct sysinfo si; 1834 bool is_os_64 = (sizeof(void *) == 8); 1835 uint64_t total_memory; 1836 uint64_t dram_size_seven_GB = 0x1B8000000; 1837 uint64_t dram_size_three_GB = 0xB8000000; 1838 1839 if (amdgpu_smu_memory_pool_size == 0) 1840 return; 1841 1842 if (!is_os_64) { 1843 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1844 goto def_value; 1845 } 1846 si_meminfo(&si); 1847 total_memory = (uint64_t)si.totalram * si.mem_unit; 1848 1849 if ((amdgpu_smu_memory_pool_size == 1) || 1850 (amdgpu_smu_memory_pool_size == 2)) { 1851 if (total_memory < dram_size_three_GB) 1852 goto def_value1; 1853 } else if ((amdgpu_smu_memory_pool_size == 4) || 1854 (amdgpu_smu_memory_pool_size == 8)) { 1855 if (total_memory < dram_size_seven_GB) 1856 goto def_value1; 1857 } else { 1858 DRM_WARN("Smu memory pool size not supported\n"); 1859 goto def_value; 1860 } 1861 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1862 1863 return; 1864 1865 def_value1: 1866 DRM_WARN("No enough system memory\n"); 1867 def_value: 1868 adev->pm.smu_prv_buffer_size = 0; 1869 } 1870 1871 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1872 { 1873 if (!(adev->flags & AMD_IS_APU) || 1874 adev->asic_type < CHIP_RAVEN) 1875 return 0; 1876 1877 switch (adev->asic_type) { 1878 case CHIP_RAVEN: 1879 if (adev->pdev->device == 0x15dd) 1880 adev->apu_flags |= AMD_APU_IS_RAVEN; 1881 if (adev->pdev->device == 0x15d8) 1882 adev->apu_flags |= AMD_APU_IS_PICASSO; 1883 break; 1884 case CHIP_RENOIR: 1885 if ((adev->pdev->device == 0x1636) || 1886 (adev->pdev->device == 0x164c)) 1887 adev->apu_flags |= AMD_APU_IS_RENOIR; 1888 else 1889 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1890 break; 1891 case CHIP_VANGOGH: 1892 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1893 break; 1894 case CHIP_YELLOW_CARP: 1895 break; 1896 case CHIP_CYAN_SKILLFISH: 1897 if ((adev->pdev->device == 0x13FE) || 1898 (adev->pdev->device == 0x143F)) 1899 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1900 break; 1901 default: 1902 break; 1903 } 1904 1905 return 0; 1906 } 1907 1908 /** 1909 * amdgpu_device_check_arguments - validate module params 1910 * 1911 * @adev: amdgpu_device pointer 1912 * 1913 * Validates certain module parameters and updates 1914 * the associated values used by the driver (all asics). 
1915 */ 1916 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1917 { 1918 if (amdgpu_sched_jobs < 4) { 1919 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1920 amdgpu_sched_jobs); 1921 amdgpu_sched_jobs = 4; 1922 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1923 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1924 amdgpu_sched_jobs); 1925 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1926 } 1927 1928 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1929 /* gart size must be greater or equal to 32M */ 1930 dev_warn(adev->dev, "gart size (%d) too small\n", 1931 amdgpu_gart_size); 1932 amdgpu_gart_size = -1; 1933 } 1934 1935 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1936 /* gtt size must be greater or equal to 32M */ 1937 dev_warn(adev->dev, "gtt size (%d) too small\n", 1938 amdgpu_gtt_size); 1939 amdgpu_gtt_size = -1; 1940 } 1941 1942 /* valid range is between 4 and 9 inclusive */ 1943 if (amdgpu_vm_fragment_size != -1 && 1944 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1945 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1946 amdgpu_vm_fragment_size = -1; 1947 } 1948 1949 if (amdgpu_sched_hw_submission < 2) { 1950 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1951 amdgpu_sched_hw_submission); 1952 amdgpu_sched_hw_submission = 2; 1953 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1954 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1955 amdgpu_sched_hw_submission); 1956 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1957 } 1958 1959 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1960 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1961 amdgpu_reset_method = -1; 1962 } 1963 1964 amdgpu_device_check_smu_prv_buffer_size(adev); 1965 1966 amdgpu_device_check_vm_size(adev); 1967 1968 amdgpu_device_check_block_size(adev); 1969 1970 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1971 1972 return 0; 1973 } 1974 1975 /** 1976 * amdgpu_switcheroo_set_state - set switcheroo state 1977 * 1978 * @pdev: pci dev pointer 1979 * @state: vga_switcheroo state 1980 * 1981 * Callback for the switcheroo driver. Suspends or resumes 1982 * the asics before or after it is powered up using ACPI methods. 
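 *
 * In short (summarizing the handler below): on VGA_SWITCHEROO_ON the device
 * is put back into D0, its cached PCI state is restored and
 * amdgpu_device_resume() runs; on VGA_SWITCHEROO_OFF the device is prepared
 * and suspended, its PCI state is cached and it is placed into D3cold.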
1983 */ 1984 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1985 enum vga_switcheroo_state state) 1986 { 1987 struct drm_device *dev = pci_get_drvdata(pdev); 1988 int r; 1989 1990 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1991 return; 1992 1993 if (state == VGA_SWITCHEROO_ON) { 1994 pr_info("switched on\n"); 1995 /* don't suspend or resume card normally */ 1996 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1997 1998 pci_set_power_state(pdev, PCI_D0); 1999 amdgpu_device_load_pci_state(pdev); 2000 r = pci_enable_device(pdev); 2001 if (r) 2002 DRM_WARN("pci_enable_device failed (%d)\n", r); 2003 amdgpu_device_resume(dev, true); 2004 2005 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2006 } else { 2007 pr_info("switched off\n"); 2008 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2009 amdgpu_device_prepare(dev); 2010 amdgpu_device_suspend(dev, true); 2011 amdgpu_device_cache_pci_state(pdev); 2012 /* Shut down the device */ 2013 pci_disable_device(pdev); 2014 pci_set_power_state(pdev, PCI_D3cold); 2015 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2016 } 2017 } 2018 2019 /** 2020 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2021 * 2022 * @pdev: pci dev pointer 2023 * 2024 * Callback for the switcheroo driver. Check of the switcheroo 2025 * state can be changed. 2026 * Returns true if the state can be changed, false if not. 2027 */ 2028 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2029 { 2030 struct drm_device *dev = pci_get_drvdata(pdev); 2031 2032 /* 2033 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2034 * locking inversion with the driver load path. And the access here is 2035 * completely racy anyway. So don't bother with locking for now. 2036 */ 2037 return atomic_read(&dev->open_count) == 0; 2038 } 2039 2040 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2041 .set_gpu_state = amdgpu_switcheroo_set_state, 2042 .reprobe = NULL, 2043 .can_switch = amdgpu_switcheroo_can_switch, 2044 }; 2045 2046 /** 2047 * amdgpu_device_ip_set_clockgating_state - set the CG state 2048 * 2049 * @dev: amdgpu_device pointer 2050 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2051 * @state: clockgating state (gate or ungate) 2052 * 2053 * Sets the requested clockgating state for all instances of 2054 * the hardware IP specified. 2055 * Returns the error code from the last instance. 2056 */ 2057 int amdgpu_device_ip_set_clockgating_state(void *dev, 2058 enum amd_ip_block_type block_type, 2059 enum amd_clockgating_state state) 2060 { 2061 struct amdgpu_device *adev = dev; 2062 int i, r = 0; 2063 2064 for (i = 0; i < adev->num_ip_blocks; i++) { 2065 if (!adev->ip_blocks[i].status.valid) 2066 continue; 2067 if (adev->ip_blocks[i].version->type != block_type) 2068 continue; 2069 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2070 continue; 2071 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2072 (void *)adev, state); 2073 if (r) 2074 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2075 adev->ip_blocks[i].version->funcs->name, r); 2076 } 2077 return r; 2078 } 2079 2080 /** 2081 * amdgpu_device_ip_set_powergating_state - set the PG state 2082 * 2083 * @dev: amdgpu_device pointer 2084 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2085 * @state: powergating state (gate or ungate) 2086 * 2087 * Sets the requested powergating state for all instances of 2088 * the hardware IP specified. 
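 *
 * An illustrative call (not tied to a specific caller) that gates VCN power:
 *
 *     amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *                                            AMD_PG_STATE_GATE);
 *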
2089 * Returns the error code from the last instance. 2090 */ 2091 int amdgpu_device_ip_set_powergating_state(void *dev, 2092 enum amd_ip_block_type block_type, 2093 enum amd_powergating_state state) 2094 { 2095 struct amdgpu_device *adev = dev; 2096 int i, r = 0; 2097 2098 for (i = 0; i < adev->num_ip_blocks; i++) { 2099 if (!adev->ip_blocks[i].status.valid) 2100 continue; 2101 if (adev->ip_blocks[i].version->type != block_type) 2102 continue; 2103 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2104 continue; 2105 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2106 (void *)adev, state); 2107 if (r) 2108 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2109 adev->ip_blocks[i].version->funcs->name, r); 2110 } 2111 return r; 2112 } 2113 2114 /** 2115 * amdgpu_device_ip_get_clockgating_state - get the CG state 2116 * 2117 * @adev: amdgpu_device pointer 2118 * @flags: clockgating feature flags 2119 * 2120 * Walks the list of IPs on the device and updates the clockgating 2121 * flags for each IP. 2122 * Updates @flags with the feature flags for each hardware IP where 2123 * clockgating is enabled. 2124 */ 2125 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2126 u64 *flags) 2127 { 2128 int i; 2129 2130 for (i = 0; i < adev->num_ip_blocks; i++) { 2131 if (!adev->ip_blocks[i].status.valid) 2132 continue; 2133 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2134 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 2135 } 2136 } 2137 2138 /** 2139 * amdgpu_device_ip_wait_for_idle - wait for idle 2140 * 2141 * @adev: amdgpu_device pointer 2142 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2143 * 2144 * Waits for the request hardware IP to be idle. 2145 * Returns 0 for success or a negative error code on failure. 2146 */ 2147 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2148 enum amd_ip_block_type block_type) 2149 { 2150 int i, r; 2151 2152 for (i = 0; i < adev->num_ip_blocks; i++) { 2153 if (!adev->ip_blocks[i].status.valid) 2154 continue; 2155 if (adev->ip_blocks[i].version->type == block_type) { 2156 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 2157 if (r) 2158 return r; 2159 break; 2160 } 2161 } 2162 return 0; 2163 2164 } 2165 2166 /** 2167 * amdgpu_device_ip_is_idle - is the hardware IP idle 2168 * 2169 * @adev: amdgpu_device pointer 2170 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2171 * 2172 * Check if the hardware IP is idle or not. 2173 * Returns true if it the IP is idle, false if not. 2174 */ 2175 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 2176 enum amd_ip_block_type block_type) 2177 { 2178 int i; 2179 2180 for (i = 0; i < adev->num_ip_blocks; i++) { 2181 if (!adev->ip_blocks[i].status.valid) 2182 continue; 2183 if (adev->ip_blocks[i].version->type == block_type) 2184 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 2185 } 2186 return true; 2187 2188 } 2189 2190 /** 2191 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2192 * 2193 * @adev: amdgpu_device pointer 2194 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2195 * 2196 * Returns a pointer to the hardware IP block structure 2197 * if it exists for the asic, otherwise NULL. 
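 *
 * Illustrative usage (hypothetical caller):
 *
 *     struct amdgpu_ip_block *gfx =
 *             amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 *     if (gfx)
 *             DRM_INFO("GFX IP v%d.%d\n", gfx->version->major,
 *                      gfx->version->minor);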
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * Return 0 if the IP block version is equal or greater than the requested
 * version, 1 if it is smaller or the IP block doesn't exist.
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
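 *
 * The string format is <pci address>,<crtc count>[;<pci address>,<crtc
 * count>...]; the literal "all" matches every device and the crtc count is
 * clamped to 1..6. Illustrative example (hypothetical PCI address):
 * amdgpu.virtual_display=0000:04:00.0,2 enables two virtual CRTCs on that
 * device only.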
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
		adev->mode_info.num_crtc = 1;
		adev->enable_virtual_display = true;
		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
			 adev->enable_virtual_display, adev->mode_info.num_crtc);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
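 *
 * The firmware is requested as amdgpu/<chip>_gpu_info.bin (for example
 * amdgpu/raven2_gpu_info.bin) and is only used on ASICs that do not expose
 * an IP discovery table.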
2348 */ 2349 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2350 { 2351 const char *chip_name; 2352 char fw_name[40]; 2353 int err; 2354 const struct gpu_info_firmware_header_v1_0 *hdr; 2355 2356 adev->firmware.gpu_info_fw = NULL; 2357 2358 if (adev->mman.discovery_bin) 2359 return 0; 2360 2361 switch (adev->asic_type) { 2362 default: 2363 return 0; 2364 case CHIP_VEGA10: 2365 chip_name = "vega10"; 2366 break; 2367 case CHIP_VEGA12: 2368 chip_name = "vega12"; 2369 break; 2370 case CHIP_RAVEN: 2371 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2372 chip_name = "raven2"; 2373 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2374 chip_name = "picasso"; 2375 else 2376 chip_name = "raven"; 2377 break; 2378 case CHIP_ARCTURUS: 2379 chip_name = "arcturus"; 2380 break; 2381 case CHIP_NAVI12: 2382 chip_name = "navi12"; 2383 break; 2384 } 2385 2386 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2387 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 2388 if (err) { 2389 dev_err(adev->dev, 2390 "Failed to get gpu_info firmware \"%s\"\n", 2391 fw_name); 2392 goto out; 2393 } 2394 2395 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2396 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2397 2398 switch (hdr->version_major) { 2399 case 1: 2400 { 2401 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2402 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2403 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2404 2405 /* 2406 * Should be droped when DAL no longer needs it. 2407 */ 2408 if (adev->asic_type == CHIP_NAVI12) 2409 goto parse_soc_bounding_box; 2410 2411 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2412 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2413 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2414 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2415 adev->gfx.config.max_texture_channel_caches = 2416 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2417 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2418 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2419 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2420 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2421 adev->gfx.config.double_offchip_lds_buf = 2422 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2423 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2424 adev->gfx.cu_info.max_waves_per_simd = 2425 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2426 adev->gfx.cu_info.max_scratch_slots_per_cu = 2427 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2428 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2429 if (hdr->version_minor >= 1) { 2430 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2431 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2432 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2433 adev->gfx.config.num_sc_per_sh = 2434 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2435 adev->gfx.config.num_packer_per_sc = 2436 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2437 } 2438 2439 parse_soc_bounding_box: 2440 /* 2441 * soc bounding box info is not integrated in disocovery table, 2442 * we always need to parse it from gpu info firmware if needed. 
2443 */ 2444 if (hdr->version_minor == 2) { 2445 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2446 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2447 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2448 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2449 } 2450 break; 2451 } 2452 default: 2453 dev_err(adev->dev, 2454 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2455 err = -EINVAL; 2456 goto out; 2457 } 2458 out: 2459 return err; 2460 } 2461 2462 /** 2463 * amdgpu_device_ip_early_init - run early init for hardware IPs 2464 * 2465 * @adev: amdgpu_device pointer 2466 * 2467 * Early initialization pass for hardware IPs. The hardware IPs that make 2468 * up each asic are discovered each IP's early_init callback is run. This 2469 * is the first stage in initializing the asic. 2470 * Returns 0 on success, negative error code on failure. 2471 */ 2472 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2473 { 2474 struct pci_dev *parent; 2475 int i, r; 2476 bool total; 2477 2478 amdgpu_device_enable_virtual_display(adev); 2479 2480 if (amdgpu_sriov_vf(adev)) { 2481 r = amdgpu_virt_request_full_gpu(adev, true); 2482 if (r) 2483 return r; 2484 } 2485 2486 switch (adev->asic_type) { 2487 #ifdef CONFIG_DRM_AMDGPU_SI 2488 case CHIP_VERDE: 2489 case CHIP_TAHITI: 2490 case CHIP_PITCAIRN: 2491 case CHIP_OLAND: 2492 case CHIP_HAINAN: 2493 adev->family = AMDGPU_FAMILY_SI; 2494 r = si_set_ip_blocks(adev); 2495 if (r) 2496 return r; 2497 break; 2498 #endif 2499 #ifdef CONFIG_DRM_AMDGPU_CIK 2500 case CHIP_BONAIRE: 2501 case CHIP_HAWAII: 2502 case CHIP_KAVERI: 2503 case CHIP_KABINI: 2504 case CHIP_MULLINS: 2505 if (adev->flags & AMD_IS_APU) 2506 adev->family = AMDGPU_FAMILY_KV; 2507 else 2508 adev->family = AMDGPU_FAMILY_CI; 2509 2510 r = cik_set_ip_blocks(adev); 2511 if (r) 2512 return r; 2513 break; 2514 #endif 2515 case CHIP_TOPAZ: 2516 case CHIP_TONGA: 2517 case CHIP_FIJI: 2518 case CHIP_POLARIS10: 2519 case CHIP_POLARIS11: 2520 case CHIP_POLARIS12: 2521 case CHIP_VEGAM: 2522 case CHIP_CARRIZO: 2523 case CHIP_STONEY: 2524 if (adev->flags & AMD_IS_APU) 2525 adev->family = AMDGPU_FAMILY_CZ; 2526 else 2527 adev->family = AMDGPU_FAMILY_VI; 2528 2529 r = vi_set_ip_blocks(adev); 2530 if (r) 2531 return r; 2532 break; 2533 default: 2534 r = amdgpu_discovery_set_ip_blocks(adev); 2535 if (r) 2536 return r; 2537 break; 2538 } 2539 2540 if (amdgpu_has_atpx() && 2541 (amdgpu_is_atpx_hybrid() || 2542 amdgpu_has_atpx_dgpu_power_cntl()) && 2543 ((adev->flags & AMD_IS_APU) == 0) && 2544 !dev_is_removable(&adev->pdev->dev)) 2545 adev->flags |= AMD_IS_PX; 2546 2547 if (!(adev->flags & AMD_IS_APU)) { 2548 parent = pcie_find_root_port(adev->pdev); 2549 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2550 } 2551 2552 2553 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2554 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2555 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2556 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2557 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2558 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2559 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2560 2561 total = true; 2562 for (i = 0; i < adev->num_ip_blocks; i++) { 2563 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2564 DRM_WARN("disabled ip block: %d <%s>\n", 2565 i, adev->ip_blocks[i].version->funcs->name); 2566 adev->ip_blocks[i].status.valid = false; 2567 } else { 2568 if (adev->ip_blocks[i].version->funcs->early_init) { 2569 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2570 if (r == -ENOENT) { 2571 adev->ip_blocks[i].status.valid = false; 2572 } else if (r) { 2573 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2574 adev->ip_blocks[i].version->funcs->name, r); 2575 total = false; 2576 } else { 2577 adev->ip_blocks[i].status.valid = true; 2578 } 2579 } else { 2580 adev->ip_blocks[i].status.valid = true; 2581 } 2582 } 2583 /* get the vbios after the asic_funcs are set up */ 2584 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2585 r = amdgpu_device_parse_gpu_info_fw(adev); 2586 if (r) 2587 return r; 2588 2589 /* Read BIOS */ 2590 if (amdgpu_device_read_bios(adev)) { 2591 if (!amdgpu_get_bios(adev)) 2592 return -EINVAL; 2593 2594 r = amdgpu_atombios_init(adev); 2595 if (r) { 2596 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2597 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2598 return r; 2599 } 2600 } 2601 2602 /*get pf2vf msg info at it's earliest time*/ 2603 if (amdgpu_sriov_vf(adev)) 2604 amdgpu_virt_init_data_exchange(adev); 2605 2606 } 2607 } 2608 if (!total) 2609 return -ENODEV; 2610 2611 amdgpu_amdkfd_device_probe(adev); 2612 adev->cg_flags &= amdgpu_cg_mask; 2613 adev->pg_flags &= amdgpu_pg_mask; 2614 2615 return 0; 2616 } 2617 2618 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2619 { 2620 int i, r; 2621 2622 for (i = 0; i < adev->num_ip_blocks; i++) { 2623 if (!adev->ip_blocks[i].status.sw) 2624 continue; 2625 if (adev->ip_blocks[i].status.hw) 2626 continue; 2627 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2628 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2629 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2630 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2631 if (r) { 2632 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2633 adev->ip_blocks[i].version->funcs->name, r); 2634 return r; 2635 } 2636 adev->ip_blocks[i].status.hw = true; 2637 } 2638 } 2639 2640 return 0; 2641 } 2642 2643 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2644 { 2645 int i, r; 2646 2647 for (i = 0; i < adev->num_ip_blocks; i++) { 2648 if (!adev->ip_blocks[i].status.sw) 2649 continue; 2650 if (adev->ip_blocks[i].status.hw) 2651 continue; 2652 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2653 if (r) { 2654 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2655 adev->ip_blocks[i].version->funcs->name, r); 2656 return r; 2657 } 2658 adev->ip_blocks[i].status.hw = true; 2659 } 2660 2661 return 0; 2662 } 2663 2664 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2665 { 2666 int r = 0; 2667 int i; 2668 uint32_t 
smu_version; 2669 2670 if (adev->asic_type >= CHIP_VEGA10) { 2671 for (i = 0; i < adev->num_ip_blocks; i++) { 2672 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2673 continue; 2674 2675 if (!adev->ip_blocks[i].status.sw) 2676 continue; 2677 2678 /* no need to do the fw loading again if already done*/ 2679 if (adev->ip_blocks[i].status.hw == true) 2680 break; 2681 2682 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2683 r = adev->ip_blocks[i].version->funcs->resume(adev); 2684 if (r) { 2685 DRM_ERROR("resume of IP block <%s> failed %d\n", 2686 adev->ip_blocks[i].version->funcs->name, r); 2687 return r; 2688 } 2689 } else { 2690 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2691 if (r) { 2692 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2693 adev->ip_blocks[i].version->funcs->name, r); 2694 return r; 2695 } 2696 } 2697 2698 adev->ip_blocks[i].status.hw = true; 2699 break; 2700 } 2701 } 2702 2703 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2704 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2705 2706 return r; 2707 } 2708 2709 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2710 { 2711 long timeout; 2712 int r, i; 2713 2714 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2715 struct amdgpu_ring *ring = adev->rings[i]; 2716 2717 /* No need to setup the GPU scheduler for rings that don't need it */ 2718 if (!ring || ring->no_scheduler) 2719 continue; 2720 2721 switch (ring->funcs->type) { 2722 case AMDGPU_RING_TYPE_GFX: 2723 timeout = adev->gfx_timeout; 2724 break; 2725 case AMDGPU_RING_TYPE_COMPUTE: 2726 timeout = adev->compute_timeout; 2727 break; 2728 case AMDGPU_RING_TYPE_SDMA: 2729 timeout = adev->sdma_timeout; 2730 break; 2731 default: 2732 timeout = adev->video_timeout; 2733 break; 2734 } 2735 2736 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, 2737 DRM_SCHED_PRIORITY_COUNT, 2738 ring->num_hw_submission, 0, 2739 timeout, adev->reset_domain->wq, 2740 ring->sched_score, ring->name, 2741 adev->dev); 2742 if (r) { 2743 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2744 ring->name); 2745 return r; 2746 } 2747 r = amdgpu_uvd_entity_init(adev, ring); 2748 if (r) { 2749 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2750 ring->name); 2751 return r; 2752 } 2753 r = amdgpu_vce_entity_init(adev, ring); 2754 if (r) { 2755 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2756 ring->name); 2757 return r; 2758 } 2759 } 2760 2761 amdgpu_xcp_update_partition_sched_list(adev); 2762 2763 return 0; 2764 } 2765 2766 2767 /** 2768 * amdgpu_device_ip_init - run init for hardware IPs 2769 * 2770 * @adev: amdgpu_device pointer 2771 * 2772 * Main initialization pass for hardware IPs. The list of all the hardware 2773 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2774 * are run. sw_init initializes the software state associated with each IP 2775 * and hw_init initializes the hardware associated with each IP. 2776 * Returns 0 on success, negative error code on failure. 
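 *
 * Rough order of operations, summarizing the body below: sw_init runs for
 * every valid block, with COMMON and GMC hw_init pulled forward so GPU
 * memory can be allocated early; amdgpu_device_ip_hw_init_phase1() then
 * brings up IH (and PSP under SR-IOV), firmware is loaded, and
 * amdgpu_device_ip_hw_init_phase2() initializes the remaining blocks.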
2777 */ 2778 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2779 { 2780 int i, r; 2781 2782 r = amdgpu_ras_init(adev); 2783 if (r) 2784 return r; 2785 2786 for (i = 0; i < adev->num_ip_blocks; i++) { 2787 if (!adev->ip_blocks[i].status.valid) 2788 continue; 2789 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2790 if (r) { 2791 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2792 adev->ip_blocks[i].version->funcs->name, r); 2793 goto init_failed; 2794 } 2795 adev->ip_blocks[i].status.sw = true; 2796 2797 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2798 /* need to do common hw init early so everything is set up for gmc */ 2799 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2800 if (r) { 2801 DRM_ERROR("hw_init %d failed %d\n", i, r); 2802 goto init_failed; 2803 } 2804 adev->ip_blocks[i].status.hw = true; 2805 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2806 /* need to do gmc hw init early so we can allocate gpu mem */ 2807 /* Try to reserve bad pages early */ 2808 if (amdgpu_sriov_vf(adev)) 2809 amdgpu_virt_exchange_data(adev); 2810 2811 r = amdgpu_device_mem_scratch_init(adev); 2812 if (r) { 2813 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2814 goto init_failed; 2815 } 2816 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2817 if (r) { 2818 DRM_ERROR("hw_init %d failed %d\n", i, r); 2819 goto init_failed; 2820 } 2821 r = amdgpu_device_wb_init(adev); 2822 if (r) { 2823 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2824 goto init_failed; 2825 } 2826 adev->ip_blocks[i].status.hw = true; 2827 2828 /* right after GMC hw init, we create CSA */ 2829 if (adev->gfx.mcbp) { 2830 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2831 AMDGPU_GEM_DOMAIN_VRAM | 2832 AMDGPU_GEM_DOMAIN_GTT, 2833 AMDGPU_CSA_SIZE); 2834 if (r) { 2835 DRM_ERROR("allocate CSA failed %d\n", r); 2836 goto init_failed; 2837 } 2838 } 2839 2840 r = amdgpu_seq64_init(adev); 2841 if (r) { 2842 DRM_ERROR("allocate seq64 failed %d\n", r); 2843 goto init_failed; 2844 } 2845 } 2846 } 2847 2848 if (amdgpu_sriov_vf(adev)) 2849 amdgpu_virt_init_data_exchange(adev); 2850 2851 r = amdgpu_ib_pool_init(adev); 2852 if (r) { 2853 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2854 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2855 goto init_failed; 2856 } 2857 2858 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2859 if (r) 2860 goto init_failed; 2861 2862 r = amdgpu_device_ip_hw_init_phase1(adev); 2863 if (r) 2864 goto init_failed; 2865 2866 r = amdgpu_device_fw_loading(adev); 2867 if (r) 2868 goto init_failed; 2869 2870 r = amdgpu_device_ip_hw_init_phase2(adev); 2871 if (r) 2872 goto init_failed; 2873 2874 /* 2875 * retired pages will be loaded from eeprom and reserved here, 2876 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2877 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2878 * for I2C communication which only true at this point. 2879 * 2880 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2881 * failure from bad gpu situation and stop amdgpu init process 2882 * accordingly. For other failed cases, it will still release all 2883 * the resource and print error message, rather than returning one 2884 * negative value to upper level. 
	 *
	 * Note: theoretically, this should be called before any VRAM
	 * allocations to protect the retired pages from being reused.
	 */
	r = amdgpu_ras_recovery_init(adev);
	if (r)
		goto init_failed;

	/**
	 * In case of XGMI grab extra reference for reset domain for this device
	 */
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (amdgpu_xgmi_add_device(adev) == 0) {
			if (!amdgpu_sriov_vf(adev)) {
				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

				if (WARN_ON(!hive)) {
					r = -ENOENT;
					goto init_failed;
				}

				if (!hive->reset_domain ||
				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
					r = -ENOENT;
					amdgpu_put_xgmi_hive(hive);
					goto init_failed;
				}

				/* Drop the early temporary reset domain we created for device */
				amdgpu_reset_put_reset_domain(adev->reset_domain);
				adev->reset_domain = hive->reset_domain;
				amdgpu_put_xgmi_hive(hive);
			}
		}
	}

	r = amdgpu_device_init_schedulers(adev);
	if (r)
		goto init_failed;

	if (adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);

	/* Don't init kfd if whole hive need to be reset during init */
	if (!adev->gmc.xgmi.pending_reset) {
		kgd2kfd_init_zone_device(adev);
		amdgpu_amdkfd_device_init(adev);
	}

	amdgpu_fru_get_product_info(adev);

init_failed:

	return r;
}

/**
 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
 *
 * @adev: amdgpu_device pointer
 *
 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
 * this function before a GPU reset. If the value is retained after a
 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
 */
static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
{
	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
}

/**
 * amdgpu_device_check_vram_lost - check if vram is valid
 *
 * @adev: amdgpu_device pointer
 *
 * Checks the reset magic value written to the gart pointer in VRAM.
 * The driver calls this after a GPU reset to see if the contents of
 * VRAM are lost or not.
 * returns true if vram is lost, false if not.
 */
static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
{
	if (memcmp(adev->gart.ptr, adev->reset_magic,
		   AMDGPU_RESET_MAGIC_NUM))
		return true;

	if (!amdgpu_in_reset(adev))
		return false;

	/*
	 * For all ASICs with baco/mode1 reset, the VRAM is
	 * always assumed to be lost.
	 */
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_BACO:
	case AMD_RESET_METHOD_MODE1:
		return true;
	default:
		return false;
	}
}

/**
 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 * @state: clockgating state (gate or ungate)
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * set_clockgating_state callbacks are run. During the late init pass this
 * enables clockgating for the hardware IPs; during fini or suspend it
 * disables clockgating again.
 * Returns 0 on success, negative error code on failure.
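 *
 * For reference, in this file late init ends with
 * amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE), while the early fini
 * and suspend paths call it with AMD_CG_STATE_UNGATE before tearing the
 * blocks down.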
2998 */ 2999 3000 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3001 enum amd_clockgating_state state) 3002 { 3003 int i, j, r; 3004 3005 if (amdgpu_emu_mode == 1) 3006 return 0; 3007 3008 for (j = 0; j < adev->num_ip_blocks; j++) { 3009 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3010 if (!adev->ip_blocks[i].status.late_initialized) 3011 continue; 3012 /* skip CG for GFX, SDMA on S0ix */ 3013 if (adev->in_s0ix && 3014 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3015 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3016 continue; 3017 /* skip CG for VCE/UVD, it's handled specially */ 3018 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3019 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3020 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3021 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3022 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3023 /* enable clockgating to save power */ 3024 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 3025 state); 3026 if (r) { 3027 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3028 adev->ip_blocks[i].version->funcs->name, r); 3029 return r; 3030 } 3031 } 3032 } 3033 3034 return 0; 3035 } 3036 3037 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3038 enum amd_powergating_state state) 3039 { 3040 int i, j, r; 3041 3042 if (amdgpu_emu_mode == 1) 3043 return 0; 3044 3045 for (j = 0; j < adev->num_ip_blocks; j++) { 3046 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3047 if (!adev->ip_blocks[i].status.late_initialized) 3048 continue; 3049 /* skip PG for GFX, SDMA on S0ix */ 3050 if (adev->in_s0ix && 3051 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3052 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3053 continue; 3054 /* skip CG for VCE/UVD, it's handled specially */ 3055 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3056 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3057 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3058 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3059 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3060 /* enable powergating to save power */ 3061 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 3062 state); 3063 if (r) { 3064 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 3065 adev->ip_blocks[i].version->funcs->name, r); 3066 return r; 3067 } 3068 } 3069 } 3070 return 0; 3071 } 3072 3073 static int amdgpu_device_enable_mgpu_fan_boost(void) 3074 { 3075 struct amdgpu_gpu_instance *gpu_ins; 3076 struct amdgpu_device *adev; 3077 int i, ret = 0; 3078 3079 mutex_lock(&mgpu_info.mutex); 3080 3081 /* 3082 * MGPU fan boost feature should be enabled 3083 * only when there are two or more dGPUs in 3084 * the system 3085 */ 3086 if (mgpu_info.num_dgpu < 2) 3087 goto out; 3088 3089 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3090 gpu_ins = &(mgpu_info.gpu_ins[i]); 3091 adev = gpu_ins->adev; 3092 if (!(adev->flags & AMD_IS_APU) && 3093 !gpu_ins->mgpu_fan_enabled) { 3094 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3095 if (ret) 3096 break; 3097 3098 gpu_ins->mgpu_fan_enabled = 1; 3099 } 3100 } 3101 3102 out: 3103 mutex_unlock(&mgpu_info.mutex); 3104 3105 return ret; 3106 } 3107 3108 /** 3109 * amdgpu_device_ip_late_init - run late init for hardware IPs 3110 * 3111 * @adev: 
amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of the IP blocks have been initialized, or something that needs
 * to happen late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	r = amdgpu_ras_late_init(adev);
	if (r) {
		DRM_ERROR("amdgpu_ras_late_init failed %d", r);
		return r;
	}

	amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* For passthrough configurations on Arcturus and Aldebaran, enable special SBR handling */
	if (amdgpu_passthrough(adev) &&
	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
	     adev->asic_type == CHIP_ALDEBARAN))
		amdgpu_dpm_handle_passthrough_sbr(adev, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset device p-state to low as this was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, the number of devices in a hive is not known in
		 * advance; it is counted up one by one as each device in the
		 * hive initializes.
		 *
		 * So we wait until all XGMI interlinked devices are initialized.
		 * This may bring some delays as those devices may come from
		 * different hives. But that should be OK.
3177 */ 3178 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3179 for (i = 0; i < mgpu_info.num_gpu; i++) { 3180 gpu_instance = &(mgpu_info.gpu_ins[i]); 3181 if (gpu_instance->adev->flags & AMD_IS_APU) 3182 continue; 3183 3184 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3185 AMDGPU_XGMI_PSTATE_MIN); 3186 if (r) { 3187 DRM_ERROR("pstate setting failed (%d).\n", r); 3188 break; 3189 } 3190 } 3191 } 3192 3193 mutex_unlock(&mgpu_info.mutex); 3194 } 3195 3196 return 0; 3197 } 3198 3199 /** 3200 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3201 * 3202 * @adev: amdgpu_device pointer 3203 * 3204 * For ASICs need to disable SMC first 3205 */ 3206 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3207 { 3208 int i, r; 3209 3210 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3211 return; 3212 3213 for (i = 0; i < adev->num_ip_blocks; i++) { 3214 if (!adev->ip_blocks[i].status.hw) 3215 continue; 3216 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3217 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3218 /* XXX handle errors */ 3219 if (r) { 3220 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3221 adev->ip_blocks[i].version->funcs->name, r); 3222 } 3223 adev->ip_blocks[i].status.hw = false; 3224 break; 3225 } 3226 } 3227 } 3228 3229 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3230 { 3231 int i, r; 3232 3233 for (i = 0; i < adev->num_ip_blocks; i++) { 3234 if (!adev->ip_blocks[i].version->funcs->early_fini) 3235 continue; 3236 3237 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 3238 if (r) { 3239 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3240 adev->ip_blocks[i].version->funcs->name, r); 3241 } 3242 } 3243 3244 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3245 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3246 3247 amdgpu_amdkfd_suspend(adev, false); 3248 3249 /* Workaroud for ASICs need to disable SMC first */ 3250 amdgpu_device_smu_fini_early(adev); 3251 3252 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3253 if (!adev->ip_blocks[i].status.hw) 3254 continue; 3255 3256 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3257 /* XXX handle errors */ 3258 if (r) { 3259 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3260 adev->ip_blocks[i].version->funcs->name, r); 3261 } 3262 3263 adev->ip_blocks[i].status.hw = false; 3264 } 3265 3266 if (amdgpu_sriov_vf(adev)) { 3267 if (amdgpu_virt_release_full_gpu(adev, false)) 3268 DRM_ERROR("failed to release exclusive mode on fini\n"); 3269 } 3270 3271 return 0; 3272 } 3273 3274 /** 3275 * amdgpu_device_ip_fini - run fini for hardware IPs 3276 * 3277 * @adev: amdgpu_device pointer 3278 * 3279 * Main teardown pass for hardware IPs. The list of all the hardware 3280 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3281 * are run. hw_fini tears down the hardware associated with each IP 3282 * and sw_fini tears down any software state associated with each IP. 3283 * Returns 0 on success, negative error code on failure. 
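 *
 * Blocks are torn down in the reverse order of initialization; when the GMC
 * block is reached, shared resources created during init (ucode BO, static
 * CSA, writeback, scratch memory, IB pool and seq64) are released as well.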
3284 */ 3285 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3286 { 3287 int i, r; 3288 3289 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3290 amdgpu_virt_release_ras_err_handler_data(adev); 3291 3292 if (adev->gmc.xgmi.num_physical_nodes > 1) 3293 amdgpu_xgmi_remove_device(adev); 3294 3295 amdgpu_amdkfd_device_fini_sw(adev); 3296 3297 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3298 if (!adev->ip_blocks[i].status.sw) 3299 continue; 3300 3301 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3302 amdgpu_ucode_free_bo(adev); 3303 amdgpu_free_static_csa(&adev->virt.csa_obj); 3304 amdgpu_device_wb_fini(adev); 3305 amdgpu_device_mem_scratch_fini(adev); 3306 amdgpu_ib_pool_fini(adev); 3307 amdgpu_seq64_fini(adev); 3308 } 3309 3310 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 3311 /* XXX handle errors */ 3312 if (r) { 3313 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3314 adev->ip_blocks[i].version->funcs->name, r); 3315 } 3316 adev->ip_blocks[i].status.sw = false; 3317 adev->ip_blocks[i].status.valid = false; 3318 } 3319 3320 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3321 if (!adev->ip_blocks[i].status.late_initialized) 3322 continue; 3323 if (adev->ip_blocks[i].version->funcs->late_fini) 3324 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 3325 adev->ip_blocks[i].status.late_initialized = false; 3326 } 3327 3328 amdgpu_ras_fini(adev); 3329 3330 return 0; 3331 } 3332 3333 /** 3334 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3335 * 3336 * @work: work_struct. 3337 */ 3338 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3339 { 3340 struct amdgpu_device *adev = 3341 container_of(work, struct amdgpu_device, delayed_init_work.work); 3342 int r; 3343 3344 r = amdgpu_ib_ring_tests(adev); 3345 if (r) 3346 DRM_ERROR("ib ring test failed (%d).\n", r); 3347 } 3348 3349 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3350 { 3351 struct amdgpu_device *adev = 3352 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3353 3354 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3355 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3356 3357 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3358 adev->gfx.gfx_off_state = true; 3359 } 3360 3361 /** 3362 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3363 * 3364 * @adev: amdgpu_device pointer 3365 * 3366 * Main suspend function for hardware IPs. The list of all the hardware 3367 * IPs that make up the asic is walked, clockgating is disabled and the 3368 * suspend callbacks are run. suspend puts the hardware and software state 3369 * in each IP into a state suitable for suspend. 3370 * Returns 0 on success, negative error code on failure. 3371 */ 3372 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3373 { 3374 int i, r; 3375 3376 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3377 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3378 3379 /* 3380 * Per PMFW team's suggestion, driver needs to handle gfxoff 3381 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3382 * scenario. Add the missing df cstate disablement here. 
3383 */ 3384 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3385 dev_warn(adev->dev, "Failed to disallow df cstate"); 3386 3387 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3388 if (!adev->ip_blocks[i].status.valid) 3389 continue; 3390 3391 /* displays are handled separately */ 3392 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3393 continue; 3394 3395 /* XXX handle errors */ 3396 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3397 /* XXX handle errors */ 3398 if (r) { 3399 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3400 adev->ip_blocks[i].version->funcs->name, r); 3401 return r; 3402 } 3403 3404 adev->ip_blocks[i].status.hw = false; 3405 } 3406 3407 return 0; 3408 } 3409 3410 /** 3411 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3412 * 3413 * @adev: amdgpu_device pointer 3414 * 3415 * Main suspend function for hardware IPs. The list of all the hardware 3416 * IPs that make up the asic is walked, clockgating is disabled and the 3417 * suspend callbacks are run. suspend puts the hardware and software state 3418 * in each IP into a state suitable for suspend. 3419 * Returns 0 on success, negative error code on failure. 3420 */ 3421 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3422 { 3423 int i, r; 3424 3425 if (adev->in_s0ix) 3426 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3427 3428 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3429 if (!adev->ip_blocks[i].status.valid) 3430 continue; 3431 /* displays are handled in phase1 */ 3432 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3433 continue; 3434 /* PSP lost connection when err_event_athub occurs */ 3435 if (amdgpu_ras_intr_triggered() && 3436 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3437 adev->ip_blocks[i].status.hw = false; 3438 continue; 3439 } 3440 3441 /* skip unnecessary suspend if we do not initialize them yet */ 3442 if (adev->gmc.xgmi.pending_reset && 3443 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3444 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3445 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3446 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3447 adev->ip_blocks[i].status.hw = false; 3448 continue; 3449 } 3450 3451 /* skip suspend of gfx/mes and psp for S0ix 3452 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3453 * like at runtime. PSP is also part of the always on hardware 3454 * so no need to suspend it. 3455 */ 3456 if (adev->in_s0ix && 3457 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3458 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3459 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3460 continue; 3461 3462 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3463 if (adev->in_s0ix && 3464 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3465 IP_VERSION(5, 0, 0)) && 3466 (adev->ip_blocks[i].version->type == 3467 AMD_IP_BLOCK_TYPE_SDMA)) 3468 continue; 3469 3470 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3471 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3472 * from this location and RLC Autoload automatically also gets loaded 3473 * from here based on PMFW -> PSP message during re-init sequence. 3474 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3475 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3476 */ 3477 if (amdgpu_in_reset(adev) && 3478 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3479 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3480 continue; 3481 3482 /* XXX handle errors */ 3483 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3484 /* XXX handle errors */ 3485 if (r) { 3486 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3487 adev->ip_blocks[i].version->funcs->name, r); 3488 } 3489 adev->ip_blocks[i].status.hw = false; 3490 /* handle putting the SMC in the appropriate state */ 3491 if (!amdgpu_sriov_vf(adev)) { 3492 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3493 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3494 if (r) { 3495 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3496 adev->mp1_state, r); 3497 return r; 3498 } 3499 } 3500 } 3501 } 3502 3503 return 0; 3504 } 3505 3506 /** 3507 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3508 * 3509 * @adev: amdgpu_device pointer 3510 * 3511 * Main suspend function for hardware IPs. The list of all the hardware 3512 * IPs that make up the asic is walked, clockgating is disabled and the 3513 * suspend callbacks are run. suspend puts the hardware and software state 3514 * in each IP into a state suitable for suspend. 3515 * Returns 0 on success, negative error code on failure. 3516 */ 3517 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3518 { 3519 int r; 3520 3521 if (amdgpu_sriov_vf(adev)) { 3522 amdgpu_virt_fini_data_exchange(adev); 3523 amdgpu_virt_request_full_gpu(adev, false); 3524 } 3525 3526 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3527 3528 r = amdgpu_device_ip_suspend_phase1(adev); 3529 if (r) 3530 return r; 3531 r = amdgpu_device_ip_suspend_phase2(adev); 3532 3533 if (amdgpu_sriov_vf(adev)) 3534 amdgpu_virt_release_full_gpu(adev, false); 3535 3536 return r; 3537 } 3538 3539 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3540 { 3541 int i, r; 3542 3543 static enum amd_ip_block_type ip_order[] = { 3544 AMD_IP_BLOCK_TYPE_COMMON, 3545 AMD_IP_BLOCK_TYPE_GMC, 3546 AMD_IP_BLOCK_TYPE_PSP, 3547 AMD_IP_BLOCK_TYPE_IH, 3548 }; 3549 3550 for (i = 0; i < adev->num_ip_blocks; i++) { 3551 int j; 3552 struct amdgpu_ip_block *block; 3553 3554 block = &adev->ip_blocks[i]; 3555 block->status.hw = false; 3556 3557 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3558 3559 if (block->version->type != ip_order[j] || 3560 !block->status.valid) 3561 continue; 3562 3563 r = block->version->funcs->hw_init(adev); 3564 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3565 if (r) 3566 return r; 3567 block->status.hw = true; 3568 } 3569 } 3570 3571 return 0; 3572 } 3573 3574 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3575 { 3576 int i, r; 3577 3578 static enum amd_ip_block_type ip_order[] = { 3579 AMD_IP_BLOCK_TYPE_SMC, 3580 AMD_IP_BLOCK_TYPE_DCE, 3581 AMD_IP_BLOCK_TYPE_GFX, 3582 AMD_IP_BLOCK_TYPE_SDMA, 3583 AMD_IP_BLOCK_TYPE_MES, 3584 AMD_IP_BLOCK_TYPE_UVD, 3585 AMD_IP_BLOCK_TYPE_VCE, 3586 AMD_IP_BLOCK_TYPE_VCN, 3587 AMD_IP_BLOCK_TYPE_JPEG 3588 }; 3589 3590 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3591 int j; 3592 struct amdgpu_ip_block *block; 3593 3594 for (j = 0; j < adev->num_ip_blocks; j++) { 3595 block = &adev->ip_blocks[j]; 3596 3597 if (block->version->type != ip_order[i] || 3598 !block->status.valid || 3599 block->status.hw) 3600 continue; 3601 3602 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3603 r = block->version->funcs->resume(adev); 3604 else 
3605 r = block->version->funcs->hw_init(adev); 3606 3607 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3608 if (r) 3609 return r; 3610 block->status.hw = true; 3611 } 3612 } 3613 3614 return 0; 3615 } 3616 3617 /** 3618 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3619 * 3620 * @adev: amdgpu_device pointer 3621 * 3622 * First resume function for hardware IPs. The list of all the hardware 3623 * IPs that make up the asic is walked and the resume callbacks are run for 3624 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3625 * after a suspend and updates the software state as necessary. This 3626 * function is also used for restoring the GPU after a GPU reset. 3627 * Returns 0 on success, negative error code on failure. 3628 */ 3629 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3630 { 3631 int i, r; 3632 3633 for (i = 0; i < adev->num_ip_blocks; i++) { 3634 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3635 continue; 3636 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3637 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3638 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3639 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3640 3641 r = adev->ip_blocks[i].version->funcs->resume(adev); 3642 if (r) { 3643 DRM_ERROR("resume of IP block <%s> failed %d\n", 3644 adev->ip_blocks[i].version->funcs->name, r); 3645 return r; 3646 } 3647 adev->ip_blocks[i].status.hw = true; 3648 } 3649 } 3650 3651 return 0; 3652 } 3653 3654 /** 3655 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3656 * 3657 * @adev: amdgpu_device pointer 3658 * 3659 * First resume function for hardware IPs. The list of all the hardware 3660 * IPs that make up the asic is walked and the resume callbacks are run for 3661 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3662 * functional state after a suspend and updates the software state as 3663 * necessary. This function is also used for restoring the GPU after a GPU 3664 * reset. 3665 * Returns 0 on success, negative error code on failure. 3666 */ 3667 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3668 { 3669 int i, r; 3670 3671 for (i = 0; i < adev->num_ip_blocks; i++) { 3672 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3673 continue; 3674 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3675 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3676 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3677 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3678 continue; 3679 r = adev->ip_blocks[i].version->funcs->resume(adev); 3680 if (r) { 3681 DRM_ERROR("resume of IP block <%s> failed %d\n", 3682 adev->ip_blocks[i].version->funcs->name, r); 3683 return r; 3684 } 3685 adev->ip_blocks[i].status.hw = true; 3686 } 3687 3688 return 0; 3689 } 3690 3691 /** 3692 * amdgpu_device_ip_resume - run resume for hardware IPs 3693 * 3694 * @adev: amdgpu_device pointer 3695 * 3696 * Main resume function for hardware IPs. The hardware IPs 3697 * are split into two resume functions because they are 3698 * also used in recovering from a GPU reset and some additional 3699 * steps need to be take between them. In this case (S3/S4) they are 3700 * run sequentially. 3701 * Returns 0 on success, negative error code on failure. 
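 *
 * The sequence below is amdgpu_device_ip_resume_phase1() (COMMON, GMC, IH,
 * plus PSP under SR-IOV), firmware loading, then
 * amdgpu_device_ip_resume_phase2() for the remaining blocks; TTM buffer
 * functions are re-enabled once the DMA ring's scheduler is ready.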
3702 */ 3703 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3704 { 3705 int r; 3706 3707 r = amdgpu_device_ip_resume_phase1(adev); 3708 if (r) 3709 return r; 3710 3711 r = amdgpu_device_fw_loading(adev); 3712 if (r) 3713 return r; 3714 3715 r = amdgpu_device_ip_resume_phase2(adev); 3716 3717 if (adev->mman.buffer_funcs_ring->sched.ready) 3718 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3719 3720 return r; 3721 } 3722 3723 /** 3724 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3725 * 3726 * @adev: amdgpu_device pointer 3727 * 3728 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3729 */ 3730 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3731 { 3732 if (amdgpu_sriov_vf(adev)) { 3733 if (adev->is_atom_fw) { 3734 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3735 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3736 } else { 3737 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3738 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3739 } 3740 3741 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3742 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3743 } 3744 } 3745 3746 /** 3747 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3748 * 3749 * @asic_type: AMD asic type 3750 * 3751 * Check if there is DC (new modesetting infrastructre) support for an asic. 3752 * returns true if DC has support, false if not. 3753 */ 3754 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3755 { 3756 switch (asic_type) { 3757 #ifdef CONFIG_DRM_AMDGPU_SI 3758 case CHIP_HAINAN: 3759 #endif 3760 case CHIP_TOPAZ: 3761 /* chips with no display hardware */ 3762 return false; 3763 #if defined(CONFIG_DRM_AMD_DC) 3764 case CHIP_TAHITI: 3765 case CHIP_PITCAIRN: 3766 case CHIP_VERDE: 3767 case CHIP_OLAND: 3768 /* 3769 * We have systems in the wild with these ASICs that require 3770 * LVDS and VGA support which is not supported with DC. 3771 * 3772 * Fallback to the non-DC driver here by default so as not to 3773 * cause regressions. 3774 */ 3775 #if defined(CONFIG_DRM_AMD_DC_SI) 3776 return amdgpu_dc > 0; 3777 #else 3778 return false; 3779 #endif 3780 case CHIP_BONAIRE: 3781 case CHIP_KAVERI: 3782 case CHIP_KABINI: 3783 case CHIP_MULLINS: 3784 /* 3785 * We have systems in the wild with these ASICs that require 3786 * VGA support which is not supported with DC. 3787 * 3788 * Fallback to the non-DC driver here by default so as not to 3789 * cause regressions. 
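	 *
	 * Passing amdgpu.dc=1 explicitly opts these ASICs into DC; any other
	 * value keeps the legacy display path (hence the "> 0" check below).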
3790 */ 3791 return amdgpu_dc > 0; 3792 default: 3793 return amdgpu_dc != 0; 3794 #else 3795 default: 3796 if (amdgpu_dc > 0) 3797 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3798 return false; 3799 #endif 3800 } 3801 } 3802 3803 /** 3804 * amdgpu_device_has_dc_support - check if dc is supported 3805 * 3806 * @adev: amdgpu_device pointer 3807 * 3808 * Returns true for supported, false for not supported 3809 */ 3810 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3811 { 3812 if (adev->enable_virtual_display || 3813 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3814 return false; 3815 3816 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3817 } 3818 3819 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3820 { 3821 struct amdgpu_device *adev = 3822 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3823 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3824 3825 /* It's a bug to not have a hive within this function */ 3826 if (WARN_ON(!hive)) 3827 return; 3828 3829 /* 3830 * Use task barrier to synchronize all xgmi reset works across the 3831 * hive. task_barrier_enter and task_barrier_exit will block 3832 * until all the threads running the xgmi reset works reach 3833 * those points. task_barrier_full will do both blocks. 3834 */ 3835 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3836 3837 task_barrier_enter(&hive->tb); 3838 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3839 3840 if (adev->asic_reset_res) 3841 goto fail; 3842 3843 task_barrier_exit(&hive->tb); 3844 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3845 3846 if (adev->asic_reset_res) 3847 goto fail; 3848 3849 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3850 } else { 3851 3852 task_barrier_full(&hive->tb); 3853 adev->asic_reset_res = amdgpu_asic_reset(adev); 3854 } 3855 3856 fail: 3857 if (adev->asic_reset_res) 3858 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3859 adev->asic_reset_res, adev_to_drm(adev)->unique); 3860 amdgpu_put_xgmi_hive(hive); 3861 } 3862 3863 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3864 { 3865 char *input = amdgpu_lockup_timeout; 3866 char *timeout_setting = NULL; 3867 int index = 0; 3868 long timeout; 3869 int ret = 0; 3870 3871 /* 3872 * By default timeout for non compute jobs is 10000 3873 * and 60000 for compute jobs. 3874 * In SR-IOV or passthrough mode, timeout for compute 3875 * jobs are 60000 by default. 3876 */ 3877 adev->gfx_timeout = msecs_to_jiffies(10000); 3878 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3879 if (amdgpu_sriov_vf(adev)) 3880 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3881 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3882 else 3883 adev->compute_timeout = msecs_to_jiffies(60000); 3884 3885 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3886 while ((timeout_setting = strsep(&input, ",")) && 3887 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3888 ret = kstrtol(timeout_setting, 0, &timeout); 3889 if (ret) 3890 return ret; 3891 3892 if (timeout == 0) { 3893 index++; 3894 continue; 3895 } else if (timeout < 0) { 3896 timeout = MAX_SCHEDULE_TIMEOUT; 3897 dev_warn(adev->dev, "lockup timeout disabled"); 3898 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3899 } else { 3900 timeout = msecs_to_jiffies(timeout); 3901 } 3902 3903 switch (index++) { 3904 case 0: 3905 adev->gfx_timeout = timeout; 3906 break; 3907 case 1: 3908 adev->compute_timeout = timeout; 3909 break; 3910 case 2: 3911 adev->sdma_timeout = timeout; 3912 break; 3913 case 3: 3914 adev->video_timeout = timeout; 3915 break; 3916 default: 3917 break; 3918 } 3919 } 3920 /* 3921 * There is only one value specified and 3922 * it should apply to all non-compute jobs. 3923 */ 3924 if (index == 1) { 3925 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3926 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3927 adev->compute_timeout = adev->gfx_timeout; 3928 } 3929 } 3930 3931 return ret; 3932 } 3933 3934 /** 3935 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3936 * 3937 * @adev: amdgpu_device pointer 3938 * 3939 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3940 */ 3941 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3942 { 3943 struct iommu_domain *domain; 3944 3945 domain = iommu_get_domain_for_dev(adev->dev); 3946 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3947 adev->ram_is_direct_mapped = true; 3948 } 3949 3950 static const struct attribute *amdgpu_dev_attributes[] = { 3951 &dev_attr_pcie_replay_count.attr, 3952 NULL 3953 }; 3954 3955 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3956 { 3957 if (amdgpu_mcbp == 1) 3958 adev->gfx.mcbp = true; 3959 else if (amdgpu_mcbp == 0) 3960 adev->gfx.mcbp = false; 3961 3962 if (amdgpu_sriov_vf(adev)) 3963 adev->gfx.mcbp = true; 3964 3965 if (adev->gfx.mcbp) 3966 DRM_INFO("MCBP is enabled\n"); 3967 } 3968 3969 /** 3970 * amdgpu_device_init - initialize the driver 3971 * 3972 * @adev: amdgpu_device pointer 3973 * @flags: driver flags 3974 * 3975 * Initializes the driver info and hw (all asics). 3976 * Returns 0 for success or an error on failure. 3977 * Called at driver startup. 
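 *
 * Among other things this applies the job timeouts parsed by
 * amdgpu_device_get_job_timeout_settings() above. As an illustrative example
 * (parameter name as defined in amdgpu_drv.c), booting with
 *
 *   amdgpu.lockup_timeout=10000,60000,10000,10000
 *
 * sets the gfx, compute, sdma and video job timeouts (in ms) in that order;
 * a single value applies to all non-compute queues, 0 keeps the default and
 * a negative value disables the timeout.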
3978 */ 3979 int amdgpu_device_init(struct amdgpu_device *adev, 3980 uint32_t flags) 3981 { 3982 struct drm_device *ddev = adev_to_drm(adev); 3983 struct pci_dev *pdev = adev->pdev; 3984 int r, i; 3985 bool px = false; 3986 u32 max_MBps; 3987 int tmp; 3988 3989 adev->shutdown = false; 3990 adev->flags = flags; 3991 3992 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3993 adev->asic_type = amdgpu_force_asic_type; 3994 else 3995 adev->asic_type = flags & AMD_ASIC_MASK; 3996 3997 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3998 if (amdgpu_emu_mode == 1) 3999 adev->usec_timeout *= 10; 4000 adev->gmc.gart_size = 512 * 1024 * 1024; 4001 adev->accel_working = false; 4002 adev->num_rings = 0; 4003 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4004 adev->mman.buffer_funcs = NULL; 4005 adev->mman.buffer_funcs_ring = NULL; 4006 adev->vm_manager.vm_pte_funcs = NULL; 4007 adev->vm_manager.vm_pte_num_scheds = 0; 4008 adev->gmc.gmc_funcs = NULL; 4009 adev->harvest_ip_mask = 0x0; 4010 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4011 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4012 4013 adev->smc_rreg = &amdgpu_invalid_rreg; 4014 adev->smc_wreg = &amdgpu_invalid_wreg; 4015 adev->pcie_rreg = &amdgpu_invalid_rreg; 4016 adev->pcie_wreg = &amdgpu_invalid_wreg; 4017 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4018 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4019 adev->pciep_rreg = &amdgpu_invalid_rreg; 4020 adev->pciep_wreg = &amdgpu_invalid_wreg; 4021 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4022 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4023 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4024 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4025 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4026 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4027 adev->didt_rreg = &amdgpu_invalid_rreg; 4028 adev->didt_wreg = &amdgpu_invalid_wreg; 4029 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4030 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4031 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4032 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4033 4034 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4035 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4036 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4037 4038 /* mutex initialization are all done here so we 4039 * can recall function without having locking issues 4040 */ 4041 mutex_init(&adev->firmware.mutex); 4042 mutex_init(&adev->pm.mutex); 4043 mutex_init(&adev->gfx.gpu_clock_mutex); 4044 mutex_init(&adev->srbm_mutex); 4045 mutex_init(&adev->gfx.pipe_reserve_mutex); 4046 mutex_init(&adev->gfx.gfx_off_mutex); 4047 mutex_init(&adev->gfx.partition_mutex); 4048 mutex_init(&adev->grbm_idx_mutex); 4049 mutex_init(&adev->mn_lock); 4050 mutex_init(&adev->virt.vf_errors.lock); 4051 hash_init(adev->mn_hash); 4052 mutex_init(&adev->psp.mutex); 4053 mutex_init(&adev->notifier_lock); 4054 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4055 mutex_init(&adev->benchmark_mutex); 4056 4057 amdgpu_device_init_apu_flags(adev); 4058 4059 r = amdgpu_device_check_arguments(adev); 4060 if (r) 4061 return r; 4062 4063 spin_lock_init(&adev->mmio_idx_lock); 4064 spin_lock_init(&adev->smc_idx_lock); 4065 spin_lock_init(&adev->pcie_idx_lock); 4066 spin_lock_init(&adev->uvd_ctx_idx_lock); 4067 spin_lock_init(&adev->didt_idx_lock); 4068 spin_lock_init(&adev->gc_cac_idx_lock); 4069 spin_lock_init(&adev->se_cac_idx_lock); 
4070 spin_lock_init(&adev->audio_endpt_idx_lock); 4071 spin_lock_init(&adev->mm_stats.lock); 4072 spin_lock_init(&adev->wb.lock); 4073 4074 INIT_LIST_HEAD(&adev->shadow_list); 4075 mutex_init(&adev->shadow_list_lock); 4076 4077 INIT_LIST_HEAD(&adev->reset_list); 4078 4079 INIT_LIST_HEAD(&adev->ras_list); 4080 4081 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4082 4083 INIT_DELAYED_WORK(&adev->delayed_init_work, 4084 amdgpu_device_delayed_init_work_handler); 4085 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4086 amdgpu_device_delay_enable_gfx_off); 4087 4088 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4089 4090 adev->gfx.gfx_off_req_count = 1; 4091 adev->gfx.gfx_off_residency = 0; 4092 adev->gfx.gfx_off_entrycount = 0; 4093 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4094 4095 atomic_set(&adev->throttling_logging_enabled, 1); 4096 /* 4097 * If throttling continues, logging will be performed every minute 4098 * to avoid log flooding. "-1" is subtracted since the thermal 4099 * throttling interrupt comes every second. Thus, the total logging 4100 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 4101 * for throttling interrupt) = 60 seconds. 4102 */ 4103 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4104 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4105 4106 /* Register mapping */ 4107 /* TODO: block userspace mapping of io register */ 4108 if (adev->asic_type >= CHIP_BONAIRE) { 4109 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4110 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4111 } else { 4112 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4113 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4114 } 4115 4116 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4117 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4118 4119 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4120 if (!adev->rmmio) 4121 return -ENOMEM; 4122 4123 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 4124 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 4125 4126 /* 4127 * The reset domain needs to be present early, before the XGMI hive (if 4128 * any) is discovered and initialized, so the reset sem and in_gpu reset 4129 * flag can be used early on during init and before calling RREG32.
4130 */ 4131 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4132 if (!adev->reset_domain) 4133 return -ENOMEM; 4134 4135 /* detect hw virtualization here */ 4136 amdgpu_detect_virtualization(adev); 4137 4138 amdgpu_device_get_pcie_info(adev); 4139 4140 r = amdgpu_device_get_job_timeout_settings(adev); 4141 if (r) { 4142 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4143 return r; 4144 } 4145 4146 amdgpu_device_set_mcbp(adev); 4147 4148 /* early init functions */ 4149 r = amdgpu_device_ip_early_init(adev); 4150 if (r) 4151 return r; 4152 4153 /* Get rid of things like offb */ 4154 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 4155 if (r) 4156 return r; 4157 4158 /* Enable TMZ based on IP_VERSION */ 4159 amdgpu_gmc_tmz_set(adev); 4160 4161 if (amdgpu_sriov_vf(adev) && 4162 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4163 /* VF MMIO access (except mailbox range) from CPU 4164 * will be blocked during sriov runtime 4165 */ 4166 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4167 4168 amdgpu_gmc_noretry_set(adev); 4169 /* Need to get xgmi info early to decide the reset behavior */ 4170 if (adev->gmc.xgmi.supported) { 4171 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4172 if (r) 4173 return r; 4174 } 4175 4176 /* enable PCIE atomic ops */ 4177 if (amdgpu_sriov_vf(adev)) { 4178 if (adev->virt.fw_reserve.p_pf2vf) 4179 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4180 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4181 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4182 /* APUs with gfx9 and onward don't rely on PCIe atomics; their 4183 * internal path natively supports atomics, so set have_atomics_support to true. 4184 */ 4185 } else if ((adev->flags & AMD_IS_APU) && 4186 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4187 IP_VERSION(9, 0, 0))) { 4188 adev->have_atomics_support = true; 4189 } else { 4190 adev->have_atomics_support = 4191 !pci_enable_atomic_ops_to_root(adev->pdev, 4192 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4193 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4194 } 4195 4196 if (!adev->have_atomics_support) 4197 dev_info(adev->dev, "PCIE atomic ops are not supported\n"); 4198 4199 /* doorbell bar mapping and doorbell index init */ 4200 amdgpu_doorbell_init(adev); 4201 4202 if (amdgpu_emu_mode == 1) { 4203 /* post the asic on emulation mode */ 4204 emu_soc_asic_init(adev); 4205 goto fence_driver_init; 4206 } 4207 4208 amdgpu_reset_init(adev); 4209 4210 /* detect if we are running with an SR-IOV vBIOS */ 4211 if (adev->bios) 4212 amdgpu_device_detect_sriov_bios(adev); 4213 4214 /* check if we need to reset the asic 4215 * E.g., driver was not cleanly unloaded previously, etc.
4216 */ 4217 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4218 if (adev->gmc.xgmi.num_physical_nodes) { 4219 dev_info(adev->dev, "Pending hive reset.\n"); 4220 adev->gmc.xgmi.pending_reset = true; 4221 /* Only need to init necessary block for SMU to handle the reset */ 4222 for (i = 0; i < adev->num_ip_blocks; i++) { 4223 if (!adev->ip_blocks[i].status.valid) 4224 continue; 4225 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4226 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4227 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4228 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 4229 DRM_DEBUG("IP %s disabled for hw_init.\n", 4230 adev->ip_blocks[i].version->funcs->name); 4231 adev->ip_blocks[i].status.hw = true; 4232 } 4233 } 4234 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4235 !amdgpu_device_has_display_hardware(adev)) { 4236 r = psp_gpu_reset(adev); 4237 } else { 4238 tmp = amdgpu_reset_method; 4239 /* It should do a default reset when loading or reloading the driver, 4240 * regardless of the module parameter reset_method. 4241 */ 4242 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4243 r = amdgpu_asic_reset(adev); 4244 amdgpu_reset_method = tmp; 4245 } 4246 4247 if (r) { 4248 dev_err(adev->dev, "asic reset on init failed\n"); 4249 goto failed; 4250 } 4251 } 4252 4253 /* Post card if necessary */ 4254 if (amdgpu_device_need_post(adev)) { 4255 if (!adev->bios) { 4256 dev_err(adev->dev, "no vBIOS found\n"); 4257 r = -EINVAL; 4258 goto failed; 4259 } 4260 DRM_INFO("GPU posting now...\n"); 4261 r = amdgpu_device_asic_init(adev); 4262 if (r) { 4263 dev_err(adev->dev, "gpu post error!\n"); 4264 goto failed; 4265 } 4266 } 4267 4268 if (adev->bios) { 4269 if (adev->is_atom_fw) { 4270 /* Initialize clocks */ 4271 r = amdgpu_atomfirmware_get_clock_info(adev); 4272 if (r) { 4273 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4274 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4275 goto failed; 4276 } 4277 } else { 4278 /* Initialize clocks */ 4279 r = amdgpu_atombios_get_clock_info(adev); 4280 if (r) { 4281 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4282 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4283 goto failed; 4284 } 4285 /* init i2c buses */ 4286 if (!amdgpu_device_has_dc_support(adev)) 4287 amdgpu_atombios_i2c_init(adev); 4288 } 4289 } 4290 4291 fence_driver_init: 4292 /* Fence driver */ 4293 r = amdgpu_fence_driver_sw_init(adev); 4294 if (r) { 4295 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4296 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4297 goto failed; 4298 } 4299 4300 /* init the mode config */ 4301 drm_mode_config_init(adev_to_drm(adev)); 4302 4303 r = amdgpu_device_ip_init(adev); 4304 if (r) { 4305 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4306 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4307 goto release_ras_con; 4308 } 4309 4310 amdgpu_fence_driver_hw_init(adev); 4311 4312 dev_info(adev->dev, 4313 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4314 adev->gfx.config.max_shader_engines, 4315 adev->gfx.config.max_sh_per_se, 4316 adev->gfx.config.max_cu_per_sh, 4317 adev->gfx.cu_info.number); 4318 4319 adev->accel_working = true; 4320 4321 amdgpu_vm_check_compute_bug(adev); 4322 4323 /* Initialize the buffer migration limit. 
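 * For example, with the default of 8 MB/s the value stored in
 * mm_stats.log2_max_MBps below is ilog2(8) = 3, so later throttling math can
 * use a shift instead of a division.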
*/ 4324 if (amdgpu_moverate >= 0) 4325 max_MBps = amdgpu_moverate; 4326 else 4327 max_MBps = 8; /* Allow 8 MB/s. */ 4328 /* Get a log2 for easy divisions. */ 4329 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4330 4331 /* 4332 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4333 * Otherwise the mgpu fan boost feature will be skipped due to the 4334 * gpu instance is counted less. 4335 */ 4336 amdgpu_register_gpu_instance(adev); 4337 4338 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4339 * explicit gating rather than handling it automatically. 4340 */ 4341 if (!adev->gmc.xgmi.pending_reset) { 4342 r = amdgpu_device_ip_late_init(adev); 4343 if (r) { 4344 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4345 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4346 goto release_ras_con; 4347 } 4348 /* must succeed. */ 4349 amdgpu_ras_resume(adev); 4350 queue_delayed_work(system_wq, &adev->delayed_init_work, 4351 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4352 } 4353 4354 if (amdgpu_sriov_vf(adev)) { 4355 amdgpu_virt_release_full_gpu(adev, true); 4356 flush_delayed_work(&adev->delayed_init_work); 4357 } 4358 4359 /* 4360 * Place those sysfs registering after `late_init`. As some of those 4361 * operations performed in `late_init` might affect the sysfs 4362 * interfaces creating. 4363 */ 4364 r = amdgpu_atombios_sysfs_init(adev); 4365 if (r) 4366 drm_err(&adev->ddev, 4367 "registering atombios sysfs failed (%d).\n", r); 4368 4369 r = amdgpu_pm_sysfs_init(adev); 4370 if (r) 4371 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4372 4373 r = amdgpu_ucode_sysfs_init(adev); 4374 if (r) { 4375 adev->ucode_sysfs_en = false; 4376 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4377 } else 4378 adev->ucode_sysfs_en = true; 4379 4380 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4381 if (r) 4382 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4383 4384 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4385 if (r) 4386 dev_err(adev->dev, 4387 "Could not create amdgpu board attributes\n"); 4388 4389 amdgpu_fru_sysfs_init(adev); 4390 amdgpu_reg_state_sysfs_init(adev); 4391 4392 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4393 r = amdgpu_pmu_init(adev); 4394 if (r) 4395 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4396 4397 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4398 if (amdgpu_device_cache_pci_state(adev->pdev)) 4399 pci_restore_state(pdev); 4400 4401 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4402 /* this will fail for cards that aren't VGA class devices, just 4403 * ignore it 4404 */ 4405 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4406 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4407 4408 px = amdgpu_device_supports_px(ddev); 4409 4410 if (px || (!dev_is_removable(&adev->pdev->dev) && 4411 apple_gmux_detect(NULL, NULL))) 4412 vga_switcheroo_register_client(adev->pdev, 4413 &amdgpu_switcheroo_ops, px); 4414 4415 if (px) 4416 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4417 4418 if (adev->gmc.xgmi.pending_reset) 4419 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4420 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4421 4422 amdgpu_device_check_iommu_direct_map(adev); 4423 4424 return 0; 4425 4426 release_ras_con: 4427 if (amdgpu_sriov_vf(adev)) 4428 amdgpu_virt_release_full_gpu(adev, true); 4429 4430 /* failed in exclusive mode due to timeout */ 4431 if 
(amdgpu_sriov_vf(adev) && 4432 !amdgpu_sriov_runtime(adev) && 4433 amdgpu_virt_mmio_blocked(adev) && 4434 !amdgpu_virt_wait_reset(adev)) { 4435 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4436 /* Don't send request since VF is inactive. */ 4437 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4438 adev->virt.ops = NULL; 4439 r = -EAGAIN; 4440 } 4441 amdgpu_release_ras_context(adev); 4442 4443 failed: 4444 amdgpu_vf_error_trans_all(adev); 4445 4446 return r; 4447 } 4448 4449 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4450 { 4451 4452 /* Clear all CPU mappings pointing to this device */ 4453 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4454 4455 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4456 amdgpu_doorbell_fini(adev); 4457 4458 iounmap(adev->rmmio); 4459 adev->rmmio = NULL; 4460 if (adev->mman.aper_base_kaddr) 4461 iounmap(adev->mman.aper_base_kaddr); 4462 adev->mman.aper_base_kaddr = NULL; 4463 4464 /* Memory manager related */ 4465 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4466 arch_phys_wc_del(adev->gmc.vram_mtrr); 4467 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4468 } 4469 } 4470 4471 /** 4472 * amdgpu_device_fini_hw - tear down the driver 4473 * 4474 * @adev: amdgpu_device pointer 4475 * 4476 * Tear down the driver info (all asics). 4477 * Called at driver shutdown. 4478 */ 4479 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4480 { 4481 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4482 flush_delayed_work(&adev->delayed_init_work); 4483 adev->shutdown = true; 4484 4485 /* make sure IB test finished before entering exclusive mode 4486 * to avoid preemption on IB test 4487 */ 4488 if (amdgpu_sriov_vf(adev)) { 4489 amdgpu_virt_request_full_gpu(adev, false); 4490 amdgpu_virt_fini_data_exchange(adev); 4491 } 4492 4493 /* disable all interrupts */ 4494 amdgpu_irq_disable_all(adev); 4495 if (adev->mode_info.mode_config_initialized) { 4496 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4497 drm_helper_force_disable_all(adev_to_drm(adev)); 4498 else 4499 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4500 } 4501 amdgpu_fence_driver_hw_fini(adev); 4502 4503 if (adev->mman.initialized) 4504 drain_workqueue(adev->mman.bdev.wq); 4505 4506 if (adev->pm.sysfs_initialized) 4507 amdgpu_pm_sysfs_fini(adev); 4508 if (adev->ucode_sysfs_en) 4509 amdgpu_ucode_sysfs_fini(adev); 4510 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4511 amdgpu_fru_sysfs_fini(adev); 4512 4513 amdgpu_reg_state_sysfs_fini(adev); 4514 4515 /* disable ras feature must before hw fini */ 4516 amdgpu_ras_pre_fini(adev); 4517 4518 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4519 4520 amdgpu_device_ip_fini_early(adev); 4521 4522 amdgpu_irq_fini_hw(adev); 4523 4524 if (adev->mman.initialized) 4525 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4526 4527 amdgpu_gart_dummy_page_fini(adev); 4528 4529 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4530 amdgpu_device_unmap_mmio(adev); 4531 4532 } 4533 4534 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4535 { 4536 int idx; 4537 bool px; 4538 4539 amdgpu_fence_driver_sw_fini(adev); 4540 amdgpu_device_ip_fini(adev); 4541 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4542 adev->accel_working = false; 4543 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4544 4545 amdgpu_reset_fini(adev); 4546 4547 /* free i2c buses */ 4548 if (!amdgpu_device_has_dc_support(adev)) 4549 amdgpu_i2c_fini(adev); 4550 4551 if (amdgpu_emu_mode 
!= 1) 4552 amdgpu_atombios_fini(adev); 4553 4554 kfree(adev->bios); 4555 adev->bios = NULL; 4556 4557 kfree(adev->fru_info); 4558 adev->fru_info = NULL; 4559 4560 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4561 4562 if (px || (!dev_is_removable(&adev->pdev->dev) && 4563 apple_gmux_detect(NULL, NULL))) 4564 vga_switcheroo_unregister_client(adev->pdev); 4565 4566 if (px) 4567 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4568 4569 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4570 vga_client_unregister(adev->pdev); 4571 4572 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4573 4574 iounmap(adev->rmmio); 4575 adev->rmmio = NULL; 4576 amdgpu_doorbell_fini(adev); 4577 drm_dev_exit(idx); 4578 } 4579 4580 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4581 amdgpu_pmu_fini(adev); 4582 if (adev->mman.discovery_bin) 4583 amdgpu_discovery_fini(adev); 4584 4585 amdgpu_reset_put_reset_domain(adev->reset_domain); 4586 adev->reset_domain = NULL; 4587 4588 kfree(adev->pci_state); 4589 4590 } 4591 4592 /** 4593 * amdgpu_device_evict_resources - evict device resources 4594 * @adev: amdgpu device object 4595 * 4596 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4597 * of the vram memory type. Mainly used for evicting device resources 4598 * at suspend time. 4599 * 4600 */ 4601 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4602 { 4603 int ret; 4604 4605 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4606 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4607 return 0; 4608 4609 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4610 if (ret) 4611 DRM_WARN("evicting device resources failed\n"); 4612 return ret; 4613 } 4614 4615 /* 4616 * Suspend & resume. 4617 */ 4618 /** 4619 * amdgpu_device_prepare - prepare for device suspend 4620 * 4621 * @dev: drm dev pointer 4622 * 4623 * Prepare to put the hw in the suspend state (all asics). 4624 * Returns 0 for success or an error on failure. 4625 * Called at driver suspend. 4626 */ 4627 int amdgpu_device_prepare(struct drm_device *dev) 4628 { 4629 struct amdgpu_device *adev = drm_to_adev(dev); 4630 int i, r; 4631 4632 amdgpu_choose_low_power_state(adev); 4633 4634 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4635 return 0; 4636 4637 /* Evict the majority of BOs before starting suspend sequence */ 4638 r = amdgpu_device_evict_resources(adev); 4639 if (r) 4640 goto unprepare; 4641 4642 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4643 4644 for (i = 0; i < adev->num_ip_blocks; i++) { 4645 if (!adev->ip_blocks[i].status.valid) 4646 continue; 4647 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4648 continue; 4649 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev); 4650 if (r) 4651 goto unprepare; 4652 } 4653 4654 return 0; 4655 4656 unprepare: 4657 adev->in_s0ix = adev->in_s3 = false; 4658 4659 return r; 4660 } 4661 4662 /** 4663 * amdgpu_device_suspend - initiate device suspend 4664 * 4665 * @dev: drm dev pointer 4666 * @fbcon : notify the fbdev of suspend 4667 * 4668 * Puts the hw in the suspend state (all asics). 4669 * Returns 0 for success or an error on failure. 4670 * Called at driver suspend. 
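 *
 * A minimal usage sketch, assuming the PM callbacks in amdgpu_drv.c and a
 * struct drm_device *ddev:
 *
 *   amdgpu_device_prepare(ddev);
 *   amdgpu_device_suspend(ddev, true);   // system enters S3/S4
 *   ...
 *   amdgpu_device_resume(ddev, true);    // on wake-up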
4671 */ 4672 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4673 { 4674 struct amdgpu_device *adev = drm_to_adev(dev); 4675 int r = 0; 4676 4677 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4678 return 0; 4679 4680 adev->in_suspend = true; 4681 4682 if (amdgpu_sriov_vf(adev)) { 4683 amdgpu_virt_fini_data_exchange(adev); 4684 r = amdgpu_virt_request_full_gpu(adev, false); 4685 if (r) 4686 return r; 4687 } 4688 4689 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4690 DRM_WARN("smart shift update failed\n"); 4691 4692 if (fbcon) 4693 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4694 4695 cancel_delayed_work_sync(&adev->delayed_init_work); 4696 4697 amdgpu_ras_suspend(adev); 4698 4699 amdgpu_device_ip_suspend_phase1(adev); 4700 4701 if (!adev->in_s0ix) 4702 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4703 4704 r = amdgpu_device_evict_resources(adev); 4705 if (r) 4706 return r; 4707 4708 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4709 4710 amdgpu_fence_driver_hw_fini(adev); 4711 4712 amdgpu_device_ip_suspend_phase2(adev); 4713 4714 if (amdgpu_sriov_vf(adev)) 4715 amdgpu_virt_release_full_gpu(adev, false); 4716 4717 r = amdgpu_dpm_notify_rlc_state(adev, false); 4718 if (r) 4719 return r; 4720 4721 return 0; 4722 } 4723 4724 /** 4725 * amdgpu_device_resume - initiate device resume 4726 * 4727 * @dev: drm dev pointer 4728 * @fbcon : notify the fbdev of resume 4729 * 4730 * Bring the hw back to operating state (all asics). 4731 * Returns 0 for success or an error on failure. 4732 * Called at driver resume. 4733 */ 4734 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4735 { 4736 struct amdgpu_device *adev = drm_to_adev(dev); 4737 int r = 0; 4738 4739 if (amdgpu_sriov_vf(adev)) { 4740 r = amdgpu_virt_request_full_gpu(adev, true); 4741 if (r) 4742 return r; 4743 } 4744 4745 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4746 return 0; 4747 4748 if (adev->in_s0ix) 4749 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4750 4751 /* post card */ 4752 if (amdgpu_device_need_post(adev)) { 4753 r = amdgpu_device_asic_init(adev); 4754 if (r) 4755 dev_err(adev->dev, "amdgpu asic init failed\n"); 4756 } 4757 4758 r = amdgpu_device_ip_resume(adev); 4759 4760 if (r) { 4761 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4762 goto exit; 4763 } 4764 amdgpu_fence_driver_hw_init(adev); 4765 4766 if (!adev->in_s0ix) { 4767 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4768 if (r) 4769 goto exit; 4770 } 4771 4772 r = amdgpu_device_ip_late_init(adev); 4773 if (r) 4774 goto exit; 4775 4776 queue_delayed_work(system_wq, &adev->delayed_init_work, 4777 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4778 exit: 4779 if (amdgpu_sriov_vf(adev)) { 4780 amdgpu_virt_init_data_exchange(adev); 4781 amdgpu_virt_release_full_gpu(adev, true); 4782 } 4783 4784 if (r) 4785 return r; 4786 4787 /* Make sure IB tests flushed */ 4788 flush_delayed_work(&adev->delayed_init_work); 4789 4790 if (fbcon) 4791 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4792 4793 amdgpu_ras_resume(adev); 4794 4795 if (adev->mode_info.num_crtc) { 4796 /* 4797 * Most of the connector probing functions try to acquire runtime pm 4798 * refs to ensure that the GPU is powered on when connector polling is 4799 * performed. Since we're calling this from a runtime PM callback, 4800 * trying to acquire rpm refs will cause us to deadlock. 
4801 * 4802 * Since we're guaranteed to be holding the rpm lock, it's safe to 4803 * temporarily disable the rpm helpers so this doesn't deadlock us. 4804 */ 4805 #ifdef CONFIG_PM 4806 dev->dev->power.disable_depth++; 4807 #endif 4808 if (!adev->dc_enabled) 4809 drm_helper_hpd_irq_event(dev); 4810 else 4811 drm_kms_helper_hotplug_event(dev); 4812 #ifdef CONFIG_PM 4813 dev->dev->power.disable_depth--; 4814 #endif 4815 } 4816 adev->in_suspend = false; 4817 4818 if (adev->enable_mes) 4819 amdgpu_mes_self_test(adev); 4820 4821 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4822 DRM_WARN("smart shift update failed\n"); 4823 4824 return 0; 4825 } 4826 4827 /** 4828 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4829 * 4830 * @adev: amdgpu_device pointer 4831 * 4832 * The list of all the hardware IPs that make up the asic is walked and 4833 * the check_soft_reset callbacks are run. check_soft_reset determines 4834 * if the asic is still hung or not. 4835 * Returns true if any of the IPs are still in a hung state, false if not. 4836 */ 4837 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4838 { 4839 int i; 4840 bool asic_hang = false; 4841 4842 if (amdgpu_sriov_vf(adev)) 4843 return true; 4844 4845 if (amdgpu_asic_need_full_reset(adev)) 4846 return true; 4847 4848 for (i = 0; i < adev->num_ip_blocks; i++) { 4849 if (!adev->ip_blocks[i].status.valid) 4850 continue; 4851 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4852 adev->ip_blocks[i].status.hang = 4853 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4854 if (adev->ip_blocks[i].status.hang) { 4855 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4856 asic_hang = true; 4857 } 4858 } 4859 return asic_hang; 4860 } 4861 4862 /** 4863 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4864 * 4865 * @adev: amdgpu_device pointer 4866 * 4867 * The list of all the hardware IPs that make up the asic is walked and the 4868 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4869 * handles any IP specific hardware or software state changes that are 4870 * necessary for a soft reset to succeed. 4871 * Returns 0 on success, negative error code on failure. 4872 */ 4873 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4874 { 4875 int i, r = 0; 4876 4877 for (i = 0; i < adev->num_ip_blocks; i++) { 4878 if (!adev->ip_blocks[i].status.valid) 4879 continue; 4880 if (adev->ip_blocks[i].status.hang && 4881 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4882 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4883 if (r) 4884 return r; 4885 } 4886 } 4887 4888 return 0; 4889 } 4890 4891 /** 4892 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4893 * 4894 * @adev: amdgpu_device pointer 4895 * 4896 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4897 * reset is necessary to recover. 4898 * Returns true if a full asic reset is required, false if not. 
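 * In the walk below, a hang reported by a GMC, SMC, ACP, DCE or PSP block is
 * what forces the full reset path.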
4899 */ 4900 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4901 { 4902 int i; 4903 4904 if (amdgpu_asic_need_full_reset(adev)) 4905 return true; 4906 4907 for (i = 0; i < adev->num_ip_blocks; i++) { 4908 if (!adev->ip_blocks[i].status.valid) 4909 continue; 4910 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4911 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4912 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4913 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4914 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4915 if (adev->ip_blocks[i].status.hang) { 4916 dev_info(adev->dev, "Some block need full reset!\n"); 4917 return true; 4918 } 4919 } 4920 } 4921 return false; 4922 } 4923 4924 /** 4925 * amdgpu_device_ip_soft_reset - do a soft reset 4926 * 4927 * @adev: amdgpu_device pointer 4928 * 4929 * The list of all the hardware IPs that make up the asic is walked and the 4930 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4931 * IP specific hardware or software state changes that are necessary to soft 4932 * reset the IP. 4933 * Returns 0 on success, negative error code on failure. 4934 */ 4935 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4936 { 4937 int i, r = 0; 4938 4939 for (i = 0; i < adev->num_ip_blocks; i++) { 4940 if (!adev->ip_blocks[i].status.valid) 4941 continue; 4942 if (adev->ip_blocks[i].status.hang && 4943 adev->ip_blocks[i].version->funcs->soft_reset) { 4944 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4945 if (r) 4946 return r; 4947 } 4948 } 4949 4950 return 0; 4951 } 4952 4953 /** 4954 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4955 * 4956 * @adev: amdgpu_device pointer 4957 * 4958 * The list of all the hardware IPs that make up the asic is walked and the 4959 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4960 * handles any IP specific hardware or software state changes that are 4961 * necessary after the IP has been soft reset. 4962 * Returns 0 on success, negative error code on failure. 4963 */ 4964 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4965 { 4966 int i, r = 0; 4967 4968 for (i = 0; i < adev->num_ip_blocks; i++) { 4969 if (!adev->ip_blocks[i].status.valid) 4970 continue; 4971 if (adev->ip_blocks[i].status.hang && 4972 adev->ip_blocks[i].version->funcs->post_soft_reset) 4973 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4974 if (r) 4975 return r; 4976 } 4977 4978 return 0; 4979 } 4980 4981 /** 4982 * amdgpu_device_recover_vram - Recover some VRAM contents 4983 * 4984 * @adev: amdgpu_device pointer 4985 * 4986 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4987 * restore things like GPUVM page tables after a GPU reset where 4988 * the contents of VRAM might be lost. 4989 * 4990 * Returns: 4991 * 0 on success, negative error code on failure. 
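 *
 * In outline: each amdgpu_bo_vm on adev->shadow_list that still has a GTT
 * shadow is restored with amdgpu_bo_restore_shadow(), and the returned
 * fences are waited on with a bounded timeout (8s under SR-IOV runtime,
 * 100ms otherwise).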
4992 */ 4993 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4994 { 4995 struct dma_fence *fence = NULL, *next = NULL; 4996 struct amdgpu_bo *shadow; 4997 struct amdgpu_bo_vm *vmbo; 4998 long r = 1, tmo; 4999 5000 if (amdgpu_sriov_runtime(adev)) 5001 tmo = msecs_to_jiffies(8000); 5002 else 5003 tmo = msecs_to_jiffies(100); 5004 5005 dev_info(adev->dev, "recover vram bo from shadow start\n"); 5006 mutex_lock(&adev->shadow_list_lock); 5007 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 5008 /* If vm is compute context or adev is APU, shadow will be NULL */ 5009 if (!vmbo->shadow) 5010 continue; 5011 shadow = vmbo->shadow; 5012 5013 /* No need to recover an evicted BO */ 5014 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 5015 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 5016 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 5017 continue; 5018 5019 r = amdgpu_bo_restore_shadow(shadow, &next); 5020 if (r) 5021 break; 5022 5023 if (fence) { 5024 tmo = dma_fence_wait_timeout(fence, false, tmo); 5025 dma_fence_put(fence); 5026 fence = next; 5027 if (tmo == 0) { 5028 r = -ETIMEDOUT; 5029 break; 5030 } else if (tmo < 0) { 5031 r = tmo; 5032 break; 5033 } 5034 } else { 5035 fence = next; 5036 } 5037 } 5038 mutex_unlock(&adev->shadow_list_lock); 5039 5040 if (fence) 5041 tmo = dma_fence_wait_timeout(fence, false, tmo); 5042 dma_fence_put(fence); 5043 5044 if (r < 0 || tmo <= 0) { 5045 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 5046 return -EIO; 5047 } 5048 5049 dev_info(adev->dev, "recover vram bo from shadow done\n"); 5050 return 0; 5051 } 5052 5053 5054 /** 5055 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5056 * 5057 * @adev: amdgpu_device pointer 5058 * @from_hypervisor: request from hypervisor 5059 * 5060 * do VF FLR and reinitialize Asic 5061 * return 0 means succeeded otherwise failed 5062 */ 5063 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5064 bool from_hypervisor) 5065 { 5066 int r; 5067 struct amdgpu_hive_info *hive = NULL; 5068 int retry_limit = 0; 5069 5070 retry: 5071 amdgpu_amdkfd_pre_reset(adev); 5072 5073 amdgpu_device_stop_pending_resets(adev); 5074 5075 if (from_hypervisor) 5076 r = amdgpu_virt_request_full_gpu(adev, true); 5077 else 5078 r = amdgpu_virt_reset_gpu(adev); 5079 if (r) 5080 return r; 5081 amdgpu_ras_set_fed(adev, false); 5082 amdgpu_irq_gpu_reset_resume_helper(adev); 5083 5084 /* some sw clean up VF needs to do before recover */ 5085 amdgpu_virt_post_reset(adev); 5086 5087 /* Resume IP prior to SMC */ 5088 r = amdgpu_device_ip_reinit_early_sriov(adev); 5089 if (r) 5090 goto error; 5091 5092 amdgpu_virt_init_data_exchange(adev); 5093 5094 r = amdgpu_device_fw_loading(adev); 5095 if (r) 5096 return r; 5097 5098 /* now we are okay to resume SMC/CP/SDMA */ 5099 r = amdgpu_device_ip_reinit_late_sriov(adev); 5100 if (r) 5101 goto error; 5102 5103 hive = amdgpu_get_xgmi_hive(adev); 5104 /* Update PSP FW topology after reset */ 5105 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5106 r = amdgpu_xgmi_update_topology(hive, adev); 5107 5108 if (hive) 5109 amdgpu_put_xgmi_hive(hive); 5110 5111 if (!r) { 5112 r = amdgpu_ib_ring_tests(adev); 5113 5114 amdgpu_amdkfd_post_reset(adev); 5115 } 5116 5117 error: 5118 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 5119 amdgpu_inc_vram_lost(adev); 5120 r = amdgpu_device_recover_vram(adev); 5121 } 5122 amdgpu_virt_release_full_gpu(adev, true); 5123 5124 if 
(AMDGPU_RETRY_SRIOV_RESET(r)) { 5125 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 5126 retry_limit++; 5127 goto retry; 5128 } else 5129 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 5130 } 5131 5132 return r; 5133 } 5134 5135 /** 5136 * amdgpu_device_has_job_running - check if there is any job in mirror list 5137 * 5138 * @adev: amdgpu_device pointer 5139 * 5140 * check if there is any job in mirror list 5141 */ 5142 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5143 { 5144 int i; 5145 struct drm_sched_job *job; 5146 5147 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5148 struct amdgpu_ring *ring = adev->rings[i]; 5149 5150 if (!amdgpu_ring_sched_ready(ring)) 5151 continue; 5152 5153 spin_lock(&ring->sched.job_list_lock); 5154 job = list_first_entry_or_null(&ring->sched.pending_list, 5155 struct drm_sched_job, list); 5156 spin_unlock(&ring->sched.job_list_lock); 5157 if (job) 5158 return true; 5159 } 5160 return false; 5161 } 5162 5163 /** 5164 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5165 * 5166 * @adev: amdgpu_device pointer 5167 * 5168 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5169 * a hung GPU. 5170 */ 5171 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5172 { 5173 5174 if (amdgpu_gpu_recovery == 0) 5175 goto disabled; 5176 5177 /* Skip soft reset check in fatal error mode */ 5178 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5179 return true; 5180 5181 if (amdgpu_sriov_vf(adev)) 5182 return true; 5183 5184 if (amdgpu_gpu_recovery == -1) { 5185 switch (adev->asic_type) { 5186 #ifdef CONFIG_DRM_AMDGPU_SI 5187 case CHIP_VERDE: 5188 case CHIP_TAHITI: 5189 case CHIP_PITCAIRN: 5190 case CHIP_OLAND: 5191 case CHIP_HAINAN: 5192 #endif 5193 #ifdef CONFIG_DRM_AMDGPU_CIK 5194 case CHIP_KAVERI: 5195 case CHIP_KABINI: 5196 case CHIP_MULLINS: 5197 #endif 5198 case CHIP_CARRIZO: 5199 case CHIP_STONEY: 5200 case CHIP_CYAN_SKILLFISH: 5201 goto disabled; 5202 default: 5203 break; 5204 } 5205 } 5206 5207 return true; 5208 5209 disabled: 5210 dev_info(adev->dev, "GPU recovery disabled.\n"); 5211 return false; 5212 } 5213 5214 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5215 { 5216 u32 i; 5217 int ret = 0; 5218 5219 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5220 5221 dev_info(adev->dev, "GPU mode1 reset\n"); 5222 5223 /* disable BM */ 5224 pci_clear_master(adev->pdev); 5225 5226 amdgpu_device_cache_pci_state(adev->pdev); 5227 5228 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5229 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5230 ret = amdgpu_dpm_mode1_reset(adev); 5231 } else { 5232 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5233 ret = psp_gpu_reset(adev); 5234 } 5235 5236 if (ret) 5237 goto mode1_reset_failed; 5238 5239 amdgpu_device_load_pci_state(adev->pdev); 5240 ret = amdgpu_psp_wait_for_bootloader(adev); 5241 if (ret) 5242 goto mode1_reset_failed; 5243 5244 /* wait for asic to come out of reset */ 5245 for (i = 0; i < adev->usec_timeout; i++) { 5246 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5247 5248 if (memsize != 0xffffffff) 5249 break; 5250 udelay(1); 5251 } 5252 5253 if (i >= adev->usec_timeout) { 5254 ret = -ETIMEDOUT; 5255 goto mode1_reset_failed; 5256 } 5257 5258 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5259 5260 return 0; 5261 5262 mode1_reset_failed: 5263 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5264 return ret; 5265 } 5266 5267 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5268 struct 
amdgpu_reset_context *reset_context) 5269 { 5270 int i, r = 0; 5271 struct amdgpu_job *job = NULL; 5272 bool need_full_reset = 5273 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5274 5275 if (reset_context->reset_req_dev == adev) 5276 job = reset_context->job; 5277 5278 if (amdgpu_sriov_vf(adev)) { 5279 /* stop the data exchange thread */ 5280 amdgpu_virt_fini_data_exchange(adev); 5281 } 5282 5283 amdgpu_fence_driver_isr_toggle(adev, true); 5284 5285 /* block all schedulers and reset given job's ring */ 5286 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5287 struct amdgpu_ring *ring = adev->rings[i]; 5288 5289 if (!amdgpu_ring_sched_ready(ring)) 5290 continue; 5291 5292 /* Clear job fence from fence drv to avoid force_completion 5293 * leave NULL and vm flush fence in fence drv 5294 */ 5295 amdgpu_fence_driver_clear_job_fences(ring); 5296 5297 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5298 amdgpu_fence_driver_force_completion(ring); 5299 } 5300 5301 amdgpu_fence_driver_isr_toggle(adev, false); 5302 5303 if (job && job->vm) 5304 drm_sched_increase_karma(&job->base); 5305 5306 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5307 /* If reset handler not implemented, continue; otherwise return */ 5308 if (r == -EOPNOTSUPP) 5309 r = 0; 5310 else 5311 return r; 5312 5313 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5314 if (!amdgpu_sriov_vf(adev)) { 5315 5316 if (!need_full_reset) 5317 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5318 5319 if (!need_full_reset && amdgpu_gpu_recovery && 5320 amdgpu_device_ip_check_soft_reset(adev)) { 5321 amdgpu_device_ip_pre_soft_reset(adev); 5322 r = amdgpu_device_ip_soft_reset(adev); 5323 amdgpu_device_ip_post_soft_reset(adev); 5324 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5325 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5326 need_full_reset = true; 5327 } 5328 } 5329 5330 if (need_full_reset) 5331 r = amdgpu_device_ip_suspend(adev); 5332 if (need_full_reset) 5333 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5334 else 5335 clear_bit(AMDGPU_NEED_FULL_RESET, 5336 &reset_context->flags); 5337 } 5338 5339 return r; 5340 } 5341 5342 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 5343 { 5344 int i; 5345 5346 lockdep_assert_held(&adev->reset_domain->sem); 5347 5348 for (i = 0; i < adev->reset_info.num_regs; i++) { 5349 adev->reset_info.reset_dump_reg_value[i] = 5350 RREG32(adev->reset_info.reset_dump_reg_list[i]); 5351 5352 trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i], 5353 adev->reset_info.reset_dump_reg_value[i]); 5354 } 5355 5356 return 0; 5357 } 5358 5359 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5360 struct amdgpu_reset_context *reset_context) 5361 { 5362 struct amdgpu_device *tmp_adev = NULL; 5363 bool need_full_reset, skip_hw_reset, vram_lost = false; 5364 int r = 0; 5365 uint32_t i; 5366 5367 /* Try reset handler method first */ 5368 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5369 reset_list); 5370 5371 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5372 amdgpu_reset_reg_dumps(tmp_adev); 5373 5374 /* Trigger ip dump before we reset the asic */ 5375 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5376 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5377 tmp_adev->ip_blocks[i].version->funcs 5378 ->dump_ip_state((void *)tmp_adev); 5379 } 5380 5381 reset_context->reset_device_list = device_list_handle; 5382 r 
= amdgpu_reset_perform_reset(tmp_adev, reset_context); 5383 /* If reset handler not implemented, continue; otherwise return */ 5384 if (r == -EOPNOTSUPP) 5385 r = 0; 5386 else 5387 return r; 5388 5389 /* Reset handler not implemented, use the default method */ 5390 need_full_reset = 5391 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5392 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5393 5394 /* 5395 * ASIC reset has to be done on all XGMI hive nodes ASAP 5396 * to allow proper links negotiation in FW (within 1 sec) 5397 */ 5398 if (!skip_hw_reset && need_full_reset) { 5399 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5400 /* For XGMI run all resets in parallel to speed up the process */ 5401 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5402 tmp_adev->gmc.xgmi.pending_reset = false; 5403 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5404 r = -EALREADY; 5405 } else 5406 r = amdgpu_asic_reset(tmp_adev); 5407 5408 if (r) { 5409 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5410 r, adev_to_drm(tmp_adev)->unique); 5411 goto out; 5412 } 5413 } 5414 5415 /* For XGMI wait for all resets to complete before proceed */ 5416 if (!r) { 5417 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5418 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5419 flush_work(&tmp_adev->xgmi_reset_work); 5420 r = tmp_adev->asic_reset_res; 5421 if (r) 5422 break; 5423 } 5424 } 5425 } 5426 } 5427 5428 if (!r && amdgpu_ras_intr_triggered()) { 5429 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5430 amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB); 5431 } 5432 5433 amdgpu_ras_intr_cleared(); 5434 } 5435 5436 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5437 if (need_full_reset) { 5438 /* post card */ 5439 amdgpu_ras_set_fed(tmp_adev, false); 5440 r = amdgpu_device_asic_init(tmp_adev); 5441 if (r) { 5442 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5443 } else { 5444 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5445 5446 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5447 if (r) 5448 goto out; 5449 5450 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5451 5452 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5453 amdgpu_coredump(tmp_adev, vram_lost, reset_context); 5454 5455 if (vram_lost) { 5456 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5457 amdgpu_inc_vram_lost(tmp_adev); 5458 } 5459 5460 r = amdgpu_device_fw_loading(tmp_adev); 5461 if (r) 5462 return r; 5463 5464 r = amdgpu_xcp_restore_partition_mode( 5465 tmp_adev->xcp_mgr); 5466 if (r) 5467 goto out; 5468 5469 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5470 if (r) 5471 goto out; 5472 5473 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5474 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5475 5476 if (vram_lost) 5477 amdgpu_device_fill_reset_magic(tmp_adev); 5478 5479 /* 5480 * Add this ASIC as tracked as reset was already 5481 * complete successfully. 5482 */ 5483 amdgpu_register_gpu_instance(tmp_adev); 5484 5485 if (!reset_context->hive && 5486 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5487 amdgpu_xgmi_add_device(tmp_adev); 5488 5489 r = amdgpu_device_ip_late_init(tmp_adev); 5490 if (r) 5491 goto out; 5492 5493 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5494 5495 /* 5496 * The GPU enters bad state once faulty pages 5497 * by ECC has reached the threshold, and ras 5498 * recovery is scheduled next. 
So add one check 5499 * here to break recovery if it indeed exceeds 5500 * bad page threshold, and remind user to 5501 * retire this GPU or setting one bigger 5502 * bad_page_threshold value to fix this once 5503 * probing driver again. 5504 */ 5505 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5506 /* must succeed. */ 5507 amdgpu_ras_resume(tmp_adev); 5508 } else { 5509 r = -EINVAL; 5510 goto out; 5511 } 5512 5513 /* Update PSP FW topology after reset */ 5514 if (reset_context->hive && 5515 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5516 r = amdgpu_xgmi_update_topology( 5517 reset_context->hive, tmp_adev); 5518 } 5519 } 5520 5521 out: 5522 if (!r) { 5523 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5524 r = amdgpu_ib_ring_tests(tmp_adev); 5525 if (r) { 5526 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5527 need_full_reset = true; 5528 r = -EAGAIN; 5529 goto end; 5530 } 5531 } 5532 5533 if (!r) 5534 r = amdgpu_device_recover_vram(tmp_adev); 5535 else 5536 tmp_adev->asic_reset_res = r; 5537 } 5538 5539 end: 5540 if (need_full_reset) 5541 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5542 else 5543 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5544 return r; 5545 } 5546 5547 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5548 { 5549 5550 switch (amdgpu_asic_reset_method(adev)) { 5551 case AMD_RESET_METHOD_MODE1: 5552 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5553 break; 5554 case AMD_RESET_METHOD_MODE2: 5555 adev->mp1_state = PP_MP1_STATE_RESET; 5556 break; 5557 default: 5558 adev->mp1_state = PP_MP1_STATE_NONE; 5559 break; 5560 } 5561 } 5562 5563 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5564 { 5565 amdgpu_vf_error_trans_all(adev); 5566 adev->mp1_state = PP_MP1_STATE_NONE; 5567 } 5568 5569 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5570 { 5571 struct pci_dev *p = NULL; 5572 5573 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5574 adev->pdev->bus->number, 1); 5575 if (p) { 5576 pm_runtime_enable(&(p->dev)); 5577 pm_runtime_resume(&(p->dev)); 5578 } 5579 5580 pci_dev_put(p); 5581 } 5582 5583 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5584 { 5585 enum amd_reset_method reset_method; 5586 struct pci_dev *p = NULL; 5587 u64 expires; 5588 5589 /* 5590 * For now, only BACO and mode1 reset are confirmed 5591 * to suffer the audio issue without proper suspended. 5592 */ 5593 reset_method = amdgpu_asic_reset_method(adev); 5594 if ((reset_method != AMD_RESET_METHOD_BACO) && 5595 (reset_method != AMD_RESET_METHOD_MODE1)) 5596 return -EINVAL; 5597 5598 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5599 adev->pdev->bus->number, 1); 5600 if (!p) 5601 return -ENODEV; 5602 5603 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5604 if (!expires) 5605 /* 5606 * If we cannot get the audio device autosuspend delay, 5607 * a fixed 4S interval will be used. Considering 3S is 5608 * the audio controller default autosuspend delay setting. 5609 * 4S used here is guaranteed to cover that. 5610 */ 5611 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5612 5613 while (!pm_runtime_status_suspended(&(p->dev))) { 5614 if (!pm_runtime_suspend(&(p->dev))) 5615 break; 5616 5617 if (expires < ktime_get_mono_fast_ns()) { 5618 dev_warn(adev->dev, "failed to suspend display audio\n"); 5619 pci_dev_put(p); 5620 /* TODO: abort the succeeding gpu reset? 
*/ 5621 return -ETIMEDOUT; 5622 } 5623 } 5624 5625 pm_runtime_disable(&(p->dev)); 5626 5627 pci_dev_put(p); 5628 return 0; 5629 } 5630 5631 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5632 { 5633 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5634 5635 #if defined(CONFIG_DEBUG_FS) 5636 if (!amdgpu_sriov_vf(adev)) 5637 cancel_work(&adev->reset_work); 5638 #endif 5639 5640 if (adev->kfd.dev) 5641 cancel_work(&adev->kfd.reset_work); 5642 5643 if (amdgpu_sriov_vf(adev)) 5644 cancel_work(&adev->virt.flr_work); 5645 5646 if (con && adev->ras_enabled) 5647 cancel_work(&con->recovery_work); 5648 5649 } 5650 5651 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5652 { 5653 struct amdgpu_device *tmp_adev; 5654 int ret = 0; 5655 u32 status; 5656 5657 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5658 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 5659 if (PCI_POSSIBLE_ERROR(status)) { 5660 dev_err(tmp_adev->dev, "device lost from bus!"); 5661 ret = -ENODEV; 5662 } 5663 } 5664 5665 return ret; 5666 } 5667 5668 /** 5669 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5670 * 5671 * @adev: amdgpu_device pointer 5672 * @job: which job trigger hang 5673 * @reset_context: amdgpu reset context pointer 5674 * 5675 * Attempt to reset the GPU if it has hung (all asics). 5676 * Attempt to do soft-reset or full-reset and reinitialize Asic 5677 * Returns 0 for success or an error on failure. 5678 */ 5679 5680 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5681 struct amdgpu_job *job, 5682 struct amdgpu_reset_context *reset_context) 5683 { 5684 struct list_head device_list, *device_list_handle = NULL; 5685 bool job_signaled = false; 5686 struct amdgpu_hive_info *hive = NULL; 5687 struct amdgpu_device *tmp_adev = NULL; 5688 int i, r = 0; 5689 bool need_emergency_restart = false; 5690 bool audio_suspended = false; 5691 5692 /* 5693 * Special case: RAS triggered and full reset isn't supported 5694 */ 5695 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5696 5697 /* 5698 * Flush RAM to disk so that after reboot 5699 * the user can read log and see why the system rebooted. 5700 */ 5701 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5702 amdgpu_ras_get_context(adev)->reboot) { 5703 DRM_WARN("Emergency reboot."); 5704 5705 ksys_sync_helper(); 5706 emergency_restart(); 5707 } 5708 5709 dev_info(adev->dev, "GPU %s begin!\n", 5710 need_emergency_restart ? "jobs stop":"reset"); 5711 5712 if (!amdgpu_sriov_vf(adev)) 5713 hive = amdgpu_get_xgmi_hive(adev); 5714 if (hive) 5715 mutex_lock(&hive->hive_lock); 5716 5717 reset_context->job = job; 5718 reset_context->hive = hive; 5719 /* 5720 * Build list of devices to reset. 5721 * In case we are in XGMI hive mode, resort the device list 5722 * to put adev in the 1st position. 
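 * For example, with a four-node hive ordered {0, 1, 2, 3} and adev being
 * node 2, list_rotate_to_front() below yields {2, 3, 0, 1}.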
5723 */ 5724 INIT_LIST_HEAD(&device_list); 5725 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5726 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5727 list_add_tail(&tmp_adev->reset_list, &device_list); 5728 if (adev->shutdown) 5729 tmp_adev->shutdown = true; 5730 } 5731 if (!list_is_first(&adev->reset_list, &device_list)) 5732 list_rotate_to_front(&adev->reset_list, &device_list); 5733 device_list_handle = &device_list; 5734 } else { 5735 list_add_tail(&adev->reset_list, &device_list); 5736 device_list_handle = &device_list; 5737 } 5738 5739 if (!amdgpu_sriov_vf(adev)) { 5740 r = amdgpu_device_health_check(device_list_handle); 5741 if (r) 5742 goto end_reset; 5743 } 5744 5745 /* We need to lock reset domain only once both for XGMI and single device */ 5746 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5747 reset_list); 5748 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5749 5750 /* block all schedulers and reset given job's ring */ 5751 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5752 5753 amdgpu_device_set_mp1_state(tmp_adev); 5754 5755 /* 5756 * Try to put the audio codec into suspend state 5757 * before gpu reset started. 5758 * 5759 * Due to the power domain of the graphics device 5760 * is shared with AZ power domain. Without this, 5761 * we may change the audio hardware from behind 5762 * the audio driver's back. That will trigger 5763 * some audio codec errors. 5764 */ 5765 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5766 audio_suspended = true; 5767 5768 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5769 5770 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5771 5772 if (!amdgpu_sriov_vf(tmp_adev)) 5773 amdgpu_amdkfd_pre_reset(tmp_adev); 5774 5775 /* 5776 * Mark these ASICs to be reseted as untracked first 5777 * And add them back after reset completed 5778 */ 5779 amdgpu_unregister_gpu_instance(tmp_adev); 5780 5781 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5782 5783 /* disable ras on ALL IPs */ 5784 if (!need_emergency_restart && 5785 amdgpu_device_ip_need_full_reset(tmp_adev)) 5786 amdgpu_ras_suspend(tmp_adev); 5787 5788 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5789 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5790 5791 if (!amdgpu_ring_sched_ready(ring)) 5792 continue; 5793 5794 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5795 5796 if (need_emergency_restart) 5797 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5798 } 5799 atomic_inc(&tmp_adev->gpu_reset_counter); 5800 } 5801 5802 if (need_emergency_restart) 5803 goto skip_sched_resume; 5804 5805 /* 5806 * Must check guilty signal here since after this point all old 5807 * HW fences are force signaled. 5808 * 5809 * job->base holds a reference to parent fence 5810 */ 5811 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5812 job_signaled = true; 5813 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5814 goto skip_hw_reset; 5815 } 5816 5817 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5818 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5819 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5820 /*TODO Should we stop ?*/ 5821 if (r) { 5822 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5823 r, adev_to_drm(tmp_adev)->unique); 5824 tmp_adev->asic_reset_res = r; 5825 } 5826 5827 if (!amdgpu_sriov_vf(tmp_adev)) 5828 /* 5829 * Drop all pending non scheduler resets. 
Scheduler resets 5830 * were already dropped during drm_sched_stop 5831 */ 5832 amdgpu_device_stop_pending_resets(tmp_adev); 5833 } 5834 5835 /* Actual ASIC resets if needed.*/ 5836 /* Host driver will handle XGMI hive reset for SRIOV */ 5837 if (amdgpu_sriov_vf(adev)) { 5838 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5839 if (r) 5840 adev->asic_reset_res = r; 5841 5842 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5843 if (amdgpu_ip_version(adev, GC_HWIP, 0) == 5844 IP_VERSION(9, 4, 2) || 5845 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5846 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5847 amdgpu_ras_resume(adev); 5848 } else { 5849 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5850 if (r && r == -EAGAIN) 5851 goto retry; 5852 } 5853 5854 skip_hw_reset: 5855 5856 /* Post ASIC reset for all devs .*/ 5857 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5858 5859 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5860 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5861 5862 if (!amdgpu_ring_sched_ready(ring)) 5863 continue; 5864 5865 drm_sched_start(&ring->sched, true); 5866 } 5867 5868 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5869 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5870 5871 if (tmp_adev->asic_reset_res) 5872 r = tmp_adev->asic_reset_res; 5873 5874 tmp_adev->asic_reset_res = 0; 5875 5876 if (r) { 5877 /* bad news, how to tell it to userspace ? */ 5878 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5879 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5880 } else { 5881 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5882 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5883 DRM_WARN("smart shift update failed\n"); 5884 } 5885 } 5886 5887 skip_sched_resume: 5888 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5889 /* unlock kfd: SRIOV would do it separately */ 5890 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5891 amdgpu_amdkfd_post_reset(tmp_adev); 5892 5893 /* kfd_post_reset will do nothing if kfd device is not initialized, 5894 * need to bring up kfd here if it's not be initialized before 5895 */ 5896 if (!adev->kfd.init_complete) 5897 amdgpu_amdkfd_device_init(adev); 5898 5899 if (audio_suspended) 5900 amdgpu_device_resume_display_audio(tmp_adev); 5901 5902 amdgpu_device_unset_mp1_state(tmp_adev); 5903 5904 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5905 } 5906 5907 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5908 reset_list); 5909 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5910 5911 end_reset: 5912 if (hive) { 5913 mutex_unlock(&hive->hive_lock); 5914 amdgpu_put_xgmi_hive(hive); 5915 } 5916 5917 if (r) 5918 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5919 5920 atomic_set(&adev->reset_domain->reset_res, r); 5921 return r; 5922 } 5923 5924 /** 5925 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 5926 * 5927 * @adev: amdgpu_device pointer 5928 * @speed: pointer to the speed of the link 5929 * @width: pointer to the width of the link 5930 * 5931 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 5932 * first physical partner to an AMD dGPU. 5933 * This will exclude any virtual switches and links. 
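 *
 * Minimal usage sketch (hypothetical caller; mirrors how
 * amdgpu_device_get_pcie_info() below consumes the result):
 *
 *   enum pci_bus_speed speed;
 *   enum pcie_link_width width;
 *
 *   amdgpu_device_partner_bandwidth(adev, &speed, &width);
 *   if (speed == PCI_SPEED_UNKNOWN || width == PCIE_LNK_WIDTH_UNKNOWN)
 *           dev_dbg(adev->dev, "no physical PCIe partner found\n");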
5934 */ 5935 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 5936 enum pci_bus_speed *speed, 5937 enum pcie_link_width *width) 5938 { 5939 struct pci_dev *parent = adev->pdev; 5940 5941 if (!speed || !width) 5942 return; 5943 5944 *speed = PCI_SPEED_UNKNOWN; 5945 *width = PCIE_LNK_WIDTH_UNKNOWN; 5946 5947 while ((parent = pci_upstream_bridge(parent))) { 5948 /* skip upstream/downstream switches internal to dGPU*/ 5949 if (parent->vendor == PCI_VENDOR_ID_ATI) 5950 continue; 5951 *speed = pcie_get_speed_cap(parent); 5952 *width = pcie_get_width_cap(parent); 5953 break; 5954 } 5955 } 5956 5957 /** 5958 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 5959 * 5960 * @adev: amdgpu_device pointer 5961 * 5962 * Fetchs and stores in the driver the PCIE capabilities (gen speed 5963 * and lanes) of the slot the device is in. Handles APUs and 5964 * virtualized environments where PCIE config space may not be available. 5965 */ 5966 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5967 { 5968 struct pci_dev *pdev; 5969 enum pci_bus_speed speed_cap, platform_speed_cap; 5970 enum pcie_link_width platform_link_width; 5971 5972 if (amdgpu_pcie_gen_cap) 5973 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5974 5975 if (amdgpu_pcie_lane_cap) 5976 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5977 5978 /* covers APUs as well */ 5979 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5980 if (adev->pm.pcie_gen_mask == 0) 5981 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5982 if (adev->pm.pcie_mlw_mask == 0) 5983 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5984 return; 5985 } 5986 5987 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5988 return; 5989 5990 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 5991 &platform_link_width); 5992 5993 if (adev->pm.pcie_gen_mask == 0) { 5994 /* asic caps */ 5995 pdev = adev->pdev; 5996 speed_cap = pcie_get_speed_cap(pdev); 5997 if (speed_cap == PCI_SPEED_UNKNOWN) { 5998 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5999 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6000 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6001 } else { 6002 if (speed_cap == PCIE_SPEED_32_0GT) 6003 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6004 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6005 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6006 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6007 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6008 else if (speed_cap == PCIE_SPEED_16_0GT) 6009 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6010 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6011 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6012 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6013 else if (speed_cap == PCIE_SPEED_8_0GT) 6014 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6015 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6016 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6017 else if (speed_cap == PCIE_SPEED_5_0GT) 6018 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6019 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6020 else 6021 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6022 } 6023 /* platform caps */ 6024 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6025 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6026 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6027 } else { 6028 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6029 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6030 
CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6031 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6032 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6033 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6034 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6035 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6036 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6037 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6038 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6039 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6040 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6041 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6042 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6043 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6044 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6045 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6046 else 6047 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6048 6049 } 6050 } 6051 if (adev->pm.pcie_mlw_mask == 0) { 6052 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6053 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6054 } else { 6055 switch (platform_link_width) { 6056 case PCIE_LNK_X32: 6057 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6058 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6059 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6060 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6061 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6062 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6063 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6064 break; 6065 case PCIE_LNK_X16: 6066 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6067 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6068 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6069 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6070 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6071 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6072 break; 6073 case PCIE_LNK_X12: 6074 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6075 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6076 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6077 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6078 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6079 break; 6080 case PCIE_LNK_X8: 6081 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6082 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6083 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6084 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6085 break; 6086 case PCIE_LNK_X4: 6087 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6088 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6089 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6090 break; 6091 case PCIE_LNK_X2: 6092 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6093 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6094 break; 6095 case PCIE_LNK_X1: 6096 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6097 break; 6098 default: 6099 break; 6100 } 6101 } 6102 } 6103 } 6104 6105 /** 6106 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6107 * 6108 * @adev: amdgpu_device pointer 6109 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6110 * 6111 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6112 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6113 * @peer_adev. 6114 */ 6115 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6116 struct amdgpu_device *peer_adev) 6117 { 6118 #ifdef CONFIG_HSA_AMD_P2P 6119 uint64_t address_mask = peer_adev->dev->dma_mask ? 
6120 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6121 resource_size_t aper_limit = 6122 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6123 bool p2p_access = 6124 !adev->gmc.xgmi.connected_to_cpu && 6125 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6126 6127 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 6128 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 6129 !(adev->gmc.aper_base & address_mask || 6130 aper_limit & address_mask)); 6131 #else 6132 return false; 6133 #endif 6134 } 6135 6136 int amdgpu_device_baco_enter(struct drm_device *dev) 6137 { 6138 struct amdgpu_device *adev = drm_to_adev(dev); 6139 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6140 6141 if (!amdgpu_device_supports_baco(dev)) 6142 return -ENOTSUPP; 6143 6144 if (ras && adev->ras_enabled && 6145 adev->nbio.funcs->enable_doorbell_interrupt) 6146 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6147 6148 return amdgpu_dpm_baco_enter(adev); 6149 } 6150 6151 int amdgpu_device_baco_exit(struct drm_device *dev) 6152 { 6153 struct amdgpu_device *adev = drm_to_adev(dev); 6154 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6155 int ret = 0; 6156 6157 if (!amdgpu_device_supports_baco(dev)) 6158 return -ENOTSUPP; 6159 6160 ret = amdgpu_dpm_baco_exit(adev); 6161 if (ret) 6162 return ret; 6163 6164 if (ras && adev->ras_enabled && 6165 adev->nbio.funcs->enable_doorbell_interrupt) 6166 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6167 6168 if (amdgpu_passthrough(adev) && 6169 adev->nbio.funcs->clear_doorbell_interrupt) 6170 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6171 6172 return 0; 6173 } 6174 6175 /** 6176 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6177 * @pdev: PCI device struct 6178 * @state: PCI channel state 6179 * 6180 * Description: Called when a PCI error is detected. 6181 * 6182 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
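 *
 * This callback, together with amdgpu_pci_mmio_enabled(),
 * amdgpu_pci_slot_reset() and amdgpu_pci_resume() below, is registered
 * with the PCI core through a struct pci_error_handlers table. An
 * illustrative sketch (the table name here is only an example; the real
 * registration lives in the PCI driver code, amdgpu_drv.c):
 *
 *   static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *           .error_detected = amdgpu_pci_error_detected,
 *           .mmio_enabled   = amdgpu_pci_mmio_enabled,
 *           .slot_reset     = amdgpu_pci_slot_reset,
 *           .resume         = amdgpu_pci_resume,
 *   };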
6183 */ 6184 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6185 { 6186 struct drm_device *dev = pci_get_drvdata(pdev); 6187 struct amdgpu_device *adev = drm_to_adev(dev); 6188 int i; 6189 6190 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 6191 6192 if (adev->gmc.xgmi.num_physical_nodes > 1) { 6193 DRM_WARN("No support for XGMI hive yet..."); 6194 return PCI_ERS_RESULT_DISCONNECT; 6195 } 6196 6197 adev->pci_channel_state = state; 6198 6199 switch (state) { 6200 case pci_channel_io_normal: 6201 return PCI_ERS_RESULT_CAN_RECOVER; 6202 /* Fatal error, prepare for slot reset */ 6203 case pci_channel_io_frozen: 6204 /* 6205 * Locking adev->reset_domain->sem will prevent any external access 6206 * to GPU during PCI error recovery 6207 */ 6208 amdgpu_device_lock_reset_domain(adev->reset_domain); 6209 amdgpu_device_set_mp1_state(adev); 6210 6211 /* 6212 * Block any work scheduling as we do for regular GPU reset 6213 * for the duration of the recovery 6214 */ 6215 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6216 struct amdgpu_ring *ring = adev->rings[i]; 6217 6218 if (!amdgpu_ring_sched_ready(ring)) 6219 continue; 6220 6221 drm_sched_stop(&ring->sched, NULL); 6222 } 6223 atomic_inc(&adev->gpu_reset_counter); 6224 return PCI_ERS_RESULT_NEED_RESET; 6225 case pci_channel_io_perm_failure: 6226 /* Permanent error, prepare for device removal */ 6227 return PCI_ERS_RESULT_DISCONNECT; 6228 } 6229 6230 return PCI_ERS_RESULT_NEED_RESET; 6231 } 6232 6233 /** 6234 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6235 * @pdev: pointer to PCI device 6236 */ 6237 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6238 { 6239 6240 DRM_INFO("PCI error: mmio enabled callback!!\n"); 6241 6242 /* TODO - dump whatever for debugging purposes */ 6243 6244 /* This called only if amdgpu_pci_error_detected returns 6245 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6246 * works, no need to reset slot. 6247 */ 6248 6249 return PCI_ERS_RESULT_RECOVERED; 6250 } 6251 6252 /** 6253 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6254 * @pdev: PCI device struct 6255 * 6256 * Description: This routine is called by the pci error recovery 6257 * code after the PCI slot has been reset, just before we 6258 * should resume normal operations. 
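 *
 * Sketch of the ordering the PCI core follows for a frozen channel
 * (only the callbacks defined in this file are shown):
 *
 *   amdgpu_pci_error_detected(pdev, pci_channel_io_frozen)
 *           -> returns PCI_ERS_RESULT_NEED_RESET
 *   amdgpu_pci_slot_reset(pdev)
 *           -> returns PCI_ERS_RESULT_RECOVERED on success
 *   amdgpu_pci_resume(pdev)
 *           -> restarts the schedulers and unlocks the reset domain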
6259 */ 6260 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6261 { 6262 struct drm_device *dev = pci_get_drvdata(pdev); 6263 struct amdgpu_device *adev = drm_to_adev(dev); 6264 int r, i; 6265 struct amdgpu_reset_context reset_context; 6266 u32 memsize; 6267 struct list_head device_list; 6268 struct amdgpu_hive_info *hive; 6269 int hive_ras_recovery = 0; 6270 struct amdgpu_ras *ras; 6271 6272 /* PCI error slot reset should be skipped During RAS recovery */ 6273 hive = amdgpu_get_xgmi_hive(adev); 6274 if (hive) { 6275 hive_ras_recovery = atomic_read(&hive->ras_recovery); 6276 amdgpu_put_xgmi_hive(hive); 6277 } 6278 ras = amdgpu_ras_get_context(adev); 6279 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) && 6280 ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) 6281 return PCI_ERS_RESULT_RECOVERED; 6282 6283 DRM_INFO("PCI error: slot reset callback!!\n"); 6284 6285 memset(&reset_context, 0, sizeof(reset_context)); 6286 6287 INIT_LIST_HEAD(&device_list); 6288 list_add_tail(&adev->reset_list, &device_list); 6289 6290 /* wait for asic to come out of reset */ 6291 msleep(500); 6292 6293 /* Restore PCI confspace */ 6294 amdgpu_device_load_pci_state(pdev); 6295 6296 /* confirm ASIC came out of reset */ 6297 for (i = 0; i < adev->usec_timeout; i++) { 6298 memsize = amdgpu_asic_get_config_memsize(adev); 6299 6300 if (memsize != 0xffffffff) 6301 break; 6302 udelay(1); 6303 } 6304 if (memsize == 0xffffffff) { 6305 r = -ETIME; 6306 goto out; 6307 } 6308 6309 reset_context.method = AMD_RESET_METHOD_NONE; 6310 reset_context.reset_req_dev = adev; 6311 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6312 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6313 6314 adev->no_hw_access = true; 6315 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 6316 adev->no_hw_access = false; 6317 if (r) 6318 goto out; 6319 6320 r = amdgpu_do_asic_reset(&device_list, &reset_context); 6321 6322 out: 6323 if (!r) { 6324 if (amdgpu_device_cache_pci_state(adev->pdev)) 6325 pci_restore_state(adev->pdev); 6326 6327 DRM_INFO("PCIe error recovery succeeded\n"); 6328 } else { 6329 DRM_ERROR("PCIe error recovery failed, err:%d", r); 6330 amdgpu_device_unset_mp1_state(adev); 6331 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6332 } 6333 6334 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6335 } 6336 6337 /** 6338 * amdgpu_pci_resume() - resume normal ops after PCI reset 6339 * @pdev: pointer to PCI device 6340 * 6341 * Called when the error recovery driver tells us that its 6342 * OK to resume normal operation. 
6343 */ 6344 void amdgpu_pci_resume(struct pci_dev *pdev) 6345 { 6346 struct drm_device *dev = pci_get_drvdata(pdev); 6347 struct amdgpu_device *adev = drm_to_adev(dev); 6348 int i; 6349 6350 6351 DRM_INFO("PCI error: resume callback!!\n"); 6352 6353 /* Only continue execution for the case of pci_channel_io_frozen */ 6354 if (adev->pci_channel_state != pci_channel_io_frozen) 6355 return; 6356 6357 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6358 struct amdgpu_ring *ring = adev->rings[i]; 6359 6360 if (!amdgpu_ring_sched_ready(ring)) 6361 continue; 6362 6363 drm_sched_start(&ring->sched, true); 6364 } 6365 6366 amdgpu_device_unset_mp1_state(adev); 6367 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6368 } 6369 6370 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 6371 { 6372 struct drm_device *dev = pci_get_drvdata(pdev); 6373 struct amdgpu_device *adev = drm_to_adev(dev); 6374 int r; 6375 6376 r = pci_save_state(pdev); 6377 if (!r) { 6378 kfree(adev->pci_state); 6379 6380 adev->pci_state = pci_store_saved_state(pdev); 6381 6382 if (!adev->pci_state) { 6383 DRM_ERROR("Failed to store PCI saved state"); 6384 return false; 6385 } 6386 } else { 6387 DRM_WARN("Failed to save PCI state, err:%d\n", r); 6388 return false; 6389 } 6390 6391 return true; 6392 } 6393 6394 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 6395 { 6396 struct drm_device *dev = pci_get_drvdata(pdev); 6397 struct amdgpu_device *adev = drm_to_adev(dev); 6398 int r; 6399 6400 if (!adev->pci_state) 6401 return false; 6402 6403 r = pci_load_saved_state(pdev, adev->pci_state); 6404 6405 if (!r) { 6406 pci_restore_state(pdev); 6407 } else { 6408 DRM_WARN("Failed to load PCI state, err:%d\n", r); 6409 return false; 6410 } 6411 6412 return true; 6413 } 6414 6415 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 6416 struct amdgpu_ring *ring) 6417 { 6418 #ifdef CONFIG_X86_64 6419 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6420 return; 6421 #endif 6422 if (adev->gmc.xgmi.connected_to_cpu) 6423 return; 6424 6425 if (ring && ring->funcs->emit_hdp_flush) 6426 amdgpu_ring_emit_hdp_flush(ring); 6427 else 6428 amdgpu_asic_flush_hdp(adev, ring); 6429 } 6430 6431 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 6432 struct amdgpu_ring *ring) 6433 { 6434 #ifdef CONFIG_X86_64 6435 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6436 return; 6437 #endif 6438 if (adev->gmc.xgmi.connected_to_cpu) 6439 return; 6440 6441 amdgpu_asic_invalidate_hdp(adev, ring); 6442 } 6443 6444 int amdgpu_in_reset(struct amdgpu_device *adev) 6445 { 6446 return atomic_read(&adev->reset_domain->in_gpu_reset); 6447 } 6448 6449 /** 6450 * amdgpu_device_halt() - bring hardware to some kind of halt state 6451 * 6452 * @adev: amdgpu_device pointer 6453 * 6454 * Bring hardware to some kind of halt state so that no one can touch it 6455 * any more. It will help to maintain error context when error occurred. 6456 * Compare to a simple hang, the system will keep stable at least for SSH 6457 * access. Then it should be trivial to inspect the hardware state and 6458 * see what's going on. Implemented as following: 6459 * 6460 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 6461 * clears all CPU mappings to device, disallows remappings through page faults 6462 * 2. amdgpu_irq_disable_all() disables all interrupts 6463 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 6464 * 4. set adev->no_hw_access to avoid potential crashes after setp 5 6465 * 5. 
amdgpu_device_unmap_mmio() clears all MMIO mappings 6466 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 6467 * flush any in flight DMA operations 6468 */ 6469 void amdgpu_device_halt(struct amdgpu_device *adev) 6470 { 6471 struct pci_dev *pdev = adev->pdev; 6472 struct drm_device *ddev = adev_to_drm(adev); 6473 6474 amdgpu_xcp_dev_unplug(adev); 6475 drm_dev_unplug(ddev); 6476 6477 amdgpu_irq_disable_all(adev); 6478 6479 amdgpu_fence_driver_hw_fini(adev); 6480 6481 adev->no_hw_access = true; 6482 6483 amdgpu_device_unmap_mmio(adev); 6484 6485 pci_disable_device(pdev); 6486 pci_wait_for_pending_transaction(pdev); 6487 } 6488 6489 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 6490 u32 reg) 6491 { 6492 unsigned long flags, address, data; 6493 u32 r; 6494 6495 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6496 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6497 6498 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6499 WREG32(address, reg * 4); 6500 (void)RREG32(address); 6501 r = RREG32(data); 6502 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6503 return r; 6504 } 6505 6506 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 6507 u32 reg, u32 v) 6508 { 6509 unsigned long flags, address, data; 6510 6511 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6512 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6513 6514 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6515 WREG32(address, reg * 4); 6516 (void)RREG32(address); 6517 WREG32(data, v); 6518 (void)RREG32(data); 6519 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6520 } 6521 6522 /** 6523 * amdgpu_device_switch_gang - switch to a new gang 6524 * @adev: amdgpu_device pointer 6525 * @gang: the gang to switch to 6526 * 6527 * Try to switch to a new gang. 6528 * Returns: NULL if we switched to the new gang or a reference to the current 6529 * gang leader. 
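 *
 * Usage sketch for a hypothetical caller (new_gang_fence is a made-up
 * name; the real users live in the command submission path):
 *
 *   struct dma_fence *old;
 *
 *   old = amdgpu_device_switch_gang(adev, new_gang_fence);
 *   if (old) {
 *           // previous gang leader still running, wait and retry later
 *           dma_fence_wait(old, false);
 *           dma_fence_put(old);
 *   }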
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
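/*
 * Usage sketch for amdgpu_device_wait_on_rreg() above (illustrative only;
 * the register offset, name, expected value and mask are made up):
 *
 *   uint32_t r;
 *
 *   r = amdgpu_device_wait_on_rreg(adev, 0, some_status_reg_offset,
 *                                  "SOME_STATUS_REG", 0x1, 0x1);
 *   if (r)
 *           dev_err(adev->dev, "SOME_STATUS_REG never reached 0x1\n");
 */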