1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/pci-p2pdma.h> 36 #include <linux/apple-gmux.h> 37 38 #include <drm/drm_aperture.h> 39 #include <drm/drm_atomic_helper.h> 40 #include <drm/drm_crtc_helper.h> 41 #include <drm/drm_fb_helper.h> 42 #include <drm/drm_probe_helper.h> 43 #include <drm/amdgpu_drm.h> 44 #include <linux/device.h> 45 #include <linux/vgaarb.h> 46 #include <linux/vga_switcheroo.h> 47 #include <linux/efi.h> 48 #include "amdgpu.h" 49 #include "amdgpu_trace.h" 50 #include "amdgpu_i2c.h" 51 #include "atom.h" 52 #include "amdgpu_atombios.h" 53 #include "amdgpu_atomfirmware.h" 54 #include "amd_pcie.h" 55 #ifdef CONFIG_DRM_AMDGPU_SI 56 #include "si.h" 57 #endif 58 #ifdef CONFIG_DRM_AMDGPU_CIK 59 #include "cik.h" 60 #endif 61 #include "vi.h" 62 #include "soc15.h" 63 #include "nv.h" 64 #include "bif/bif_4_1_d.h" 65 #include <linux/firmware.h> 66 #include "amdgpu_vf_error.h" 67 68 #include "amdgpu_amdkfd.h" 69 #include "amdgpu_pm.h" 70 71 #include "amdgpu_xgmi.h" 72 #include "amdgpu_ras.h" 73 #include "amdgpu_pmu.h" 74 #include "amdgpu_fru_eeprom.h" 75 #include "amdgpu_reset.h" 76 #include "amdgpu_virt.h" 77 78 #include <linux/suspend.h> 79 #include <drm/task_barrier.h> 80 #include <linux/pm_runtime.h> 81 82 #include <drm/drm_drv.h> 83 84 #if IS_ENABLED(CONFIG_X86) 85 #include <asm/intel-family.h> 86 #endif 87 88 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 89 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 95 96 #define AMDGPU_RESUME_MS 2000 97 #define AMDGPU_MAX_RETRY_LIMIT 2 98 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 99 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 100 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 101 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 102 103 static const struct drm_driver amdgpu_kms_driver; 104 
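/*
 * Illustrative sketch only (not code from this file): AMDGPU_MAX_RETRY_LIMIT
 * and AMDGPU_RETRY_SRIOV_RESET() above are meant to be paired in a retry
 * loop roughly like the one below; do_sriov_reset() is a hypothetical
 * placeholder for the actual SR-IOV reset call.
 *
 *	int retry = AMDGPU_MAX_RETRY_LIMIT;
 *	int r;
 *
 *	do {
 *		r = do_sriov_reset(adev);
 *	} while (AMDGPU_RETRY_SRIOV_RESET(r) && retry--);
 */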
105 const char *amdgpu_asic_name[] = { 106 "TAHITI", 107 "PITCAIRN", 108 "VERDE", 109 "OLAND", 110 "HAINAN", 111 "BONAIRE", 112 "KAVERI", 113 "KABINI", 114 "HAWAII", 115 "MULLINS", 116 "TOPAZ", 117 "TONGA", 118 "FIJI", 119 "CARRIZO", 120 "STONEY", 121 "POLARIS10", 122 "POLARIS11", 123 "POLARIS12", 124 "VEGAM", 125 "VEGA10", 126 "VEGA12", 127 "VEGA20", 128 "RAVEN", 129 "ARCTURUS", 130 "RENOIR", 131 "ALDEBARAN", 132 "NAVI10", 133 "CYAN_SKILLFISH", 134 "NAVI14", 135 "NAVI12", 136 "SIENNA_CICHLID", 137 "NAVY_FLOUNDER", 138 "VANGOGH", 139 "DIMGREY_CAVEFISH", 140 "BEIGE_GOBY", 141 "YELLOW_CARP", 142 "IP DISCOVERY", 143 "LAST", 144 }; 145 146 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 147 148 /** 149 * DOC: pcie_replay_count 150 * 151 * The amdgpu driver provides a sysfs API for reporting the total number 152 * of PCIe replays (NAKs) 153 * The file pcie_replay_count is used for this and returns the total 154 * number of replays as a sum of the NAKs generated and NAKs received 155 */ 156 157 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 158 struct device_attribute *attr, char *buf) 159 { 160 struct drm_device *ddev = dev_get_drvdata(dev); 161 struct amdgpu_device *adev = drm_to_adev(ddev); 162 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 163 164 return sysfs_emit(buf, "%llu\n", cnt); 165 } 166 167 static DEVICE_ATTR(pcie_replay_count, 0444, 168 amdgpu_device_get_pcie_replay_count, NULL); 169 170 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 171 struct bin_attribute *attr, char *buf, 172 loff_t ppos, size_t count) 173 { 174 struct device *dev = kobj_to_dev(kobj); 175 struct drm_device *ddev = dev_get_drvdata(dev); 176 struct amdgpu_device *adev = drm_to_adev(ddev); 177 ssize_t bytes_read; 178 179 switch (ppos) { 180 case AMDGPU_SYS_REG_STATE_XGMI: 181 bytes_read = amdgpu_asic_get_reg_state( 182 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 183 break; 184 case AMDGPU_SYS_REG_STATE_WAFL: 185 bytes_read = amdgpu_asic_get_reg_state( 186 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 187 break; 188 case AMDGPU_SYS_REG_STATE_PCIE: 189 bytes_read = amdgpu_asic_get_reg_state( 190 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 191 break; 192 case AMDGPU_SYS_REG_STATE_USR: 193 bytes_read = amdgpu_asic_get_reg_state( 194 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 195 break; 196 case AMDGPU_SYS_REG_STATE_USR_1: 197 bytes_read = amdgpu_asic_get_reg_state( 198 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 199 break; 200 default: 201 return -EINVAL; 202 } 203 204 return bytes_read; 205 } 206 207 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 208 AMDGPU_SYS_REG_STATE_END); 209 210 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 211 { 212 int ret; 213 214 if (!amdgpu_asic_get_reg_state_supported(adev)) 215 return 0; 216 217 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 218 219 return ret; 220 } 221 222 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 223 { 224 if (!amdgpu_asic_get_reg_state_supported(adev)) 225 return; 226 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 227 } 228 229 /** 230 * DOC: board_info 231 * 232 * The amdgpu driver provides a sysfs API for giving board related information. 
233 * It provides the form factor information in the format 234 * 235 * type : form factor 236 * 237 * Possible form factor values 238 * 239 * - "cem" - PCIE CEM card 240 * - "oam" - Open Compute Accelerator Module 241 * - "unknown" - Not known 242 * 243 */ 244 245 static ssize_t amdgpu_device_get_board_info(struct device *dev, 246 struct device_attribute *attr, 247 char *buf) 248 { 249 struct drm_device *ddev = dev_get_drvdata(dev); 250 struct amdgpu_device *adev = drm_to_adev(ddev); 251 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 252 const char *pkg; 253 254 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 255 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 256 257 switch (pkg_type) { 258 case AMDGPU_PKG_TYPE_CEM: 259 pkg = "cem"; 260 break; 261 case AMDGPU_PKG_TYPE_OAM: 262 pkg = "oam"; 263 break; 264 default: 265 pkg = "unknown"; 266 break; 267 } 268 269 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 270 } 271 272 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 273 274 static struct attribute *amdgpu_board_attrs[] = { 275 &dev_attr_board_info.attr, 276 NULL, 277 }; 278 279 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 280 struct attribute *attr, int n) 281 { 282 struct device *dev = kobj_to_dev(kobj); 283 struct drm_device *ddev = dev_get_drvdata(dev); 284 struct amdgpu_device *adev = drm_to_adev(ddev); 285 286 if (adev->flags & AMD_IS_APU) 287 return 0; 288 289 return attr->mode; 290 } 291 292 static const struct attribute_group amdgpu_board_attrs_group = { 293 .attrs = amdgpu_board_attrs, 294 .is_visible = amdgpu_board_attrs_is_visible 295 }; 296 297 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 298 299 300 /** 301 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 302 * 303 * @dev: drm_device pointer 304 * 305 * Returns true if the device is a dGPU with ATPX power control, 306 * otherwise return false. 307 */ 308 bool amdgpu_device_supports_px(struct drm_device *dev) 309 { 310 struct amdgpu_device *adev = drm_to_adev(dev); 311 312 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 313 return true; 314 return false; 315 } 316 317 /** 318 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 319 * 320 * @dev: drm_device pointer 321 * 322 * Returns true if the device is a dGPU with ACPI power control, 323 * otherwise return false. 324 */ 325 bool amdgpu_device_supports_boco(struct drm_device *dev) 326 { 327 struct amdgpu_device *adev = drm_to_adev(dev); 328 329 if (adev->has_pr3 || 330 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 331 return true; 332 return false; 333 } 334 335 /** 336 * amdgpu_device_supports_baco - Does the device support BACO 337 * 338 * @dev: drm_device pointer 339 * 340 * Returns true if the device supporte BACO, 341 * otherwise return false. 342 */ 343 bool amdgpu_device_supports_baco(struct drm_device *dev) 344 { 345 struct amdgpu_device *adev = drm_to_adev(dev); 346 347 return amdgpu_asic_supports_baco(adev); 348 } 349 350 /** 351 * amdgpu_device_supports_smart_shift - Is the device dGPU with 352 * smart shift support 353 * 354 * @dev: drm_device pointer 355 * 356 * Returns true if the device is a dGPU with Smart Shift support, 357 * otherwise returns false. 
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram via the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns how many bytes have been transferred.
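 *
 * Illustrative caller sketch (this mirrors amdgpu_device_vram_access()
 * further below): a copy that falls back to MM_INDEX/MM_DATA for whatever
 * the aperture could not cover.
 *
 *	size_t n = amdgpu_device_aper_access(adev, pos, buf, size, false);
 *
 *	if (n < size)
 *		amdgpu_device_mm_access(adev, pos + n, (char *)buf + n,
 *					size - n, false);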
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure the HDP write cache flush happens without any
			 * reordering after the system memory contents are sent over
			 * the PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure the HDP read cache is invalidated before issuing
			 * a read to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * Register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore; if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
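 *
 * Illustrative usage (in practice the driver normally reaches this helper
 * through register access macros such as RREG32(); calling it directly as
 * below is a sketch, not the canonical pattern):
 *
 *	u32 val = amdgpu_device_rreg(adev, reg, 0);
 *	u32 raw = amdgpu_device_rreg(adev, reg, AMDGPU_REGS_NO_KIQ);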
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read helper with a byte offset
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, xcc_id);
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write helper with a byte offset
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
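 *
 * Illustrative read-modify-write sketch using the byte helpers:
 *
 *	uint8_t v = amdgpu_mm_rreg8(adev, offset);
 *
 *	amdgpu_mm_wreg8(adev, offset, v | 0x1);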
634 */ 635 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 636 { 637 if (amdgpu_device_skip_hw_access(adev)) 638 return; 639 640 if (offset < adev->rmmio_size) 641 writeb(value, adev->rmmio + offset); 642 else 643 BUG(); 644 } 645 646 /** 647 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 648 * 649 * @adev: amdgpu_device pointer 650 * @reg: dword aligned register offset 651 * @v: 32 bit value to write to the register 652 * @acc_flags: access flags which require special behavior 653 * 654 * Writes the value specified to the offset specified. 655 */ 656 void amdgpu_device_wreg(struct amdgpu_device *adev, 657 uint32_t reg, uint32_t v, 658 uint32_t acc_flags) 659 { 660 if (amdgpu_device_skip_hw_access(adev)) 661 return; 662 663 if ((reg * 4) < adev->rmmio_size) { 664 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 665 amdgpu_sriov_runtime(adev) && 666 down_read_trylock(&adev->reset_domain->sem)) { 667 amdgpu_kiq_wreg(adev, reg, v, 0); 668 up_read(&adev->reset_domain->sem); 669 } else { 670 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 671 } 672 } else { 673 adev->pcie_wreg(adev, reg * 4, v); 674 } 675 676 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 677 } 678 679 /** 680 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 681 * 682 * @adev: amdgpu_device pointer 683 * @reg: mmio/rlc register 684 * @v: value to write 685 * @xcc_id: xcc accelerated compute core id 686 * 687 * this function is invoked only for the debugfs register access 688 */ 689 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 690 uint32_t reg, uint32_t v, 691 uint32_t xcc_id) 692 { 693 if (amdgpu_device_skip_hw_access(adev)) 694 return; 695 696 if (amdgpu_sriov_fullaccess(adev) && 697 adev->gfx.rlc.funcs && 698 adev->gfx.rlc.funcs->is_rlcg_access_range) { 699 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 700 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 701 } else if ((reg * 4) >= adev->rmmio_size) { 702 adev->pcie_wreg(adev, reg * 4, v); 703 } else { 704 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 705 } 706 } 707 708 /** 709 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 710 * 711 * @adev: amdgpu_device pointer 712 * @reg: dword aligned register offset 713 * @v: 32 bit value to write to the register 714 * @acc_flags: access flags which require special behavior 715 * @xcc_id: xcc accelerated compute core id 716 * 717 * Writes the value specified to the offset specified. 
718 */ 719 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 720 uint32_t reg, uint32_t v, 721 uint32_t acc_flags, uint32_t xcc_id) 722 { 723 uint32_t rlcg_flag; 724 725 if (amdgpu_device_skip_hw_access(adev)) 726 return; 727 728 if ((reg * 4) < adev->rmmio_size) { 729 if (amdgpu_sriov_vf(adev) && 730 !amdgpu_sriov_runtime(adev) && 731 adev->gfx.rlc.rlcg_reg_access_supported && 732 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 733 GC_HWIP, true, 734 &rlcg_flag)) { 735 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, xcc_id); 736 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 737 amdgpu_sriov_runtime(adev) && 738 down_read_trylock(&adev->reset_domain->sem)) { 739 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 740 up_read(&adev->reset_domain->sem); 741 } else { 742 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 743 } 744 } else { 745 adev->pcie_wreg(adev, reg * 4, v); 746 } 747 } 748 749 /** 750 * amdgpu_device_indirect_rreg - read an indirect register 751 * 752 * @adev: amdgpu_device pointer 753 * @reg_addr: indirect register address to read from 754 * 755 * Returns the value of indirect register @reg_addr 756 */ 757 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 758 u32 reg_addr) 759 { 760 unsigned long flags, pcie_index, pcie_data; 761 void __iomem *pcie_index_offset; 762 void __iomem *pcie_data_offset; 763 u32 r; 764 765 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 766 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 767 768 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 769 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 770 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 771 772 writel(reg_addr, pcie_index_offset); 773 readl(pcie_index_offset); 774 r = readl(pcie_data_offset); 775 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 776 777 return r; 778 } 779 780 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 781 u64 reg_addr) 782 { 783 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 784 u32 r; 785 void __iomem *pcie_index_offset; 786 void __iomem *pcie_index_hi_offset; 787 void __iomem *pcie_data_offset; 788 789 if (unlikely(!adev->nbio.funcs)) { 790 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 791 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 792 } else { 793 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 794 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 795 } 796 797 if (reg_addr >> 32) { 798 if (unlikely(!adev->nbio.funcs)) 799 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 800 else 801 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 802 } else { 803 pcie_index_hi = 0; 804 } 805 806 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 807 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 808 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 809 if (pcie_index_hi != 0) 810 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 811 pcie_index_hi * 4; 812 813 writel(reg_addr, pcie_index_offset); 814 readl(pcie_index_offset); 815 if (pcie_index_hi != 0) { 816 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 817 readl(pcie_index_hi_offset); 818 } 819 r = readl(pcie_data_offset); 820 821 /* clear the high bits */ 822 if (pcie_index_hi != 0) { 823 writel(0, pcie_index_hi_offset); 824 readl(pcie_index_hi_offset); 825 } 826 827 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 828 829 return r; 830 } 831 832 /** 833 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 834 * 835 * @adev: amdgpu_device 
pointer 836 * @reg_addr: indirect register address to read from 837 * 838 * Returns the value of indirect register @reg_addr 839 */ 840 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 841 u32 reg_addr) 842 { 843 unsigned long flags, pcie_index, pcie_data; 844 void __iomem *pcie_index_offset; 845 void __iomem *pcie_data_offset; 846 u64 r; 847 848 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 849 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 850 851 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 852 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 853 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 854 855 /* read low 32 bits */ 856 writel(reg_addr, pcie_index_offset); 857 readl(pcie_index_offset); 858 r = readl(pcie_data_offset); 859 /* read high 32 bits */ 860 writel(reg_addr + 4, pcie_index_offset); 861 readl(pcie_index_offset); 862 r |= ((u64)readl(pcie_data_offset) << 32); 863 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 864 865 return r; 866 } 867 868 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 869 u64 reg_addr) 870 { 871 unsigned long flags, pcie_index, pcie_data; 872 unsigned long pcie_index_hi = 0; 873 void __iomem *pcie_index_offset; 874 void __iomem *pcie_index_hi_offset; 875 void __iomem *pcie_data_offset; 876 u64 r; 877 878 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 879 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 880 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 881 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 882 883 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 884 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 885 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 886 if (pcie_index_hi != 0) 887 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 888 pcie_index_hi * 4; 889 890 /* read low 32 bits */ 891 writel(reg_addr, pcie_index_offset); 892 readl(pcie_index_offset); 893 if (pcie_index_hi != 0) { 894 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 895 readl(pcie_index_hi_offset); 896 } 897 r = readl(pcie_data_offset); 898 /* read high 32 bits */ 899 writel(reg_addr + 4, pcie_index_offset); 900 readl(pcie_index_offset); 901 if (pcie_index_hi != 0) { 902 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 903 readl(pcie_index_hi_offset); 904 } 905 r |= ((u64)readl(pcie_data_offset) << 32); 906 907 /* clear the high bits */ 908 if (pcie_index_hi != 0) { 909 writel(0, pcie_index_hi_offset); 910 readl(pcie_index_hi_offset); 911 } 912 913 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 914 915 return r; 916 } 917 918 /** 919 * amdgpu_device_indirect_wreg - write an indirect register address 920 * 921 * @adev: amdgpu_device pointer 922 * @reg_addr: indirect register offset 923 * @reg_data: indirect register data 924 * 925 */ 926 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 927 u32 reg_addr, u32 reg_data) 928 { 929 unsigned long flags, pcie_index, pcie_data; 930 void __iomem *pcie_index_offset; 931 void __iomem *pcie_data_offset; 932 933 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 934 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 935 936 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 937 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 938 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 939 940 writel(reg_addr, pcie_index_offset); 941 readl(pcie_index_offset); 942 
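	/* reading the index register back flushes the posted write above, so
	 * the new index is guaranteed to have landed before the data port
	 * below is written
	 */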
writel(reg_data, pcie_data_offset); 943 readl(pcie_data_offset); 944 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 945 } 946 947 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 948 u64 reg_addr, u32 reg_data) 949 { 950 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 951 void __iomem *pcie_index_offset; 952 void __iomem *pcie_index_hi_offset; 953 void __iomem *pcie_data_offset; 954 955 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 956 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 957 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 958 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 959 else 960 pcie_index_hi = 0; 961 962 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 963 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 964 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 965 if (pcie_index_hi != 0) 966 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 967 pcie_index_hi * 4; 968 969 writel(reg_addr, pcie_index_offset); 970 readl(pcie_index_offset); 971 if (pcie_index_hi != 0) { 972 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 973 readl(pcie_index_hi_offset); 974 } 975 writel(reg_data, pcie_data_offset); 976 readl(pcie_data_offset); 977 978 /* clear the high bits */ 979 if (pcie_index_hi != 0) { 980 writel(0, pcie_index_hi_offset); 981 readl(pcie_index_hi_offset); 982 } 983 984 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 985 } 986 987 /** 988 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 989 * 990 * @adev: amdgpu_device pointer 991 * @reg_addr: indirect register offset 992 * @reg_data: indirect register data 993 * 994 */ 995 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 996 u32 reg_addr, u64 reg_data) 997 { 998 unsigned long flags, pcie_index, pcie_data; 999 void __iomem *pcie_index_offset; 1000 void __iomem *pcie_data_offset; 1001 1002 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1003 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1004 1005 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1006 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1007 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1008 1009 /* write low 32 bits */ 1010 writel(reg_addr, pcie_index_offset); 1011 readl(pcie_index_offset); 1012 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1013 readl(pcie_data_offset); 1014 /* write high 32 bits */ 1015 writel(reg_addr + 4, pcie_index_offset); 1016 readl(pcie_index_offset); 1017 writel((u32)(reg_data >> 32), pcie_data_offset); 1018 readl(pcie_data_offset); 1019 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1020 } 1021 1022 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1023 u64 reg_addr, u64 reg_data) 1024 { 1025 unsigned long flags, pcie_index, pcie_data; 1026 unsigned long pcie_index_hi = 0; 1027 void __iomem *pcie_index_offset; 1028 void __iomem *pcie_index_hi_offset; 1029 void __iomem *pcie_data_offset; 1030 1031 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1032 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1033 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1034 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1035 1036 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1037 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1038 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1039 if 
(pcie_index_hi != 0) 1040 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1041 pcie_index_hi * 4; 1042 1043 /* write low 32 bits */ 1044 writel(reg_addr, pcie_index_offset); 1045 readl(pcie_index_offset); 1046 if (pcie_index_hi != 0) { 1047 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1048 readl(pcie_index_hi_offset); 1049 } 1050 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1051 readl(pcie_data_offset); 1052 /* write high 32 bits */ 1053 writel(reg_addr + 4, pcie_index_offset); 1054 readl(pcie_index_offset); 1055 if (pcie_index_hi != 0) { 1056 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1057 readl(pcie_index_hi_offset); 1058 } 1059 writel((u32)(reg_data >> 32), pcie_data_offset); 1060 readl(pcie_data_offset); 1061 1062 /* clear the high bits */ 1063 if (pcie_index_hi != 0) { 1064 writel(0, pcie_index_hi_offset); 1065 readl(pcie_index_hi_offset); 1066 } 1067 1068 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1069 } 1070 1071 /** 1072 * amdgpu_device_get_rev_id - query device rev_id 1073 * 1074 * @adev: amdgpu_device pointer 1075 * 1076 * Return device rev_id 1077 */ 1078 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1079 { 1080 return adev->nbio.funcs->get_rev_id(adev); 1081 } 1082 1083 /** 1084 * amdgpu_invalid_rreg - dummy reg read function 1085 * 1086 * @adev: amdgpu_device pointer 1087 * @reg: offset of register 1088 * 1089 * Dummy register read function. Used for register blocks 1090 * that certain asics don't have (all asics). 1091 * Returns the value in the register. 1092 */ 1093 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1094 { 1095 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 1096 BUG(); 1097 return 0; 1098 } 1099 1100 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1101 { 1102 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1103 BUG(); 1104 return 0; 1105 } 1106 1107 /** 1108 * amdgpu_invalid_wreg - dummy reg write function 1109 * 1110 * @adev: amdgpu_device pointer 1111 * @reg: offset of register 1112 * @v: value to write to the register 1113 * 1114 * Dummy register read function. Used for register blocks 1115 * that certain asics don't have (all asics). 1116 */ 1117 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1118 { 1119 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 1120 reg, v); 1121 BUG(); 1122 } 1123 1124 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1125 { 1126 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 1127 reg, v); 1128 BUG(); 1129 } 1130 1131 /** 1132 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1133 * 1134 * @adev: amdgpu_device pointer 1135 * @reg: offset of register 1136 * 1137 * Dummy register read function. Used for register blocks 1138 * that certain asics don't have (all asics). 1139 * Returns the value in the register. 
1140 */ 1141 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 1142 { 1143 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 1144 BUG(); 1145 return 0; 1146 } 1147 1148 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) 1149 { 1150 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1151 BUG(); 1152 return 0; 1153 } 1154 1155 /** 1156 * amdgpu_invalid_wreg64 - dummy reg write function 1157 * 1158 * @adev: amdgpu_device pointer 1159 * @reg: offset of register 1160 * @v: value to write to the register 1161 * 1162 * Dummy register read function. Used for register blocks 1163 * that certain asics don't have (all asics). 1164 */ 1165 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 1166 { 1167 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 1168 reg, v); 1169 BUG(); 1170 } 1171 1172 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) 1173 { 1174 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", 1175 reg, v); 1176 BUG(); 1177 } 1178 1179 /** 1180 * amdgpu_block_invalid_rreg - dummy reg read function 1181 * 1182 * @adev: amdgpu_device pointer 1183 * @block: offset of instance 1184 * @reg: offset of register 1185 * 1186 * Dummy register read function. Used for register blocks 1187 * that certain asics don't have (all asics). 1188 * Returns the value in the register. 1189 */ 1190 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1191 uint32_t block, uint32_t reg) 1192 { 1193 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 1194 reg, block); 1195 BUG(); 1196 return 0; 1197 } 1198 1199 /** 1200 * amdgpu_block_invalid_wreg - dummy reg write function 1201 * 1202 * @adev: amdgpu_device pointer 1203 * @block: offset of instance 1204 * @reg: offset of register 1205 * @v: value to write to the register 1206 * 1207 * Dummy register read function. Used for register blocks 1208 * that certain asics don't have (all asics). 1209 */ 1210 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1211 uint32_t block, 1212 uint32_t reg, uint32_t v) 1213 { 1214 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1215 reg, block, v); 1216 BUG(); 1217 } 1218 1219 /** 1220 * amdgpu_device_asic_init - Wrapper for atom asic_init 1221 * 1222 * @adev: amdgpu_device pointer 1223 * 1224 * Does any asic specific work and then calls atom asic init. 1225 */ 1226 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1227 { 1228 int ret; 1229 1230 amdgpu_asic_pre_asic_init(adev); 1231 1232 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1233 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1234 amdgpu_psp_wait_for_bootloader(adev); 1235 ret = amdgpu_atomfirmware_asic_init(adev, true); 1236 return ret; 1237 } else { 1238 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1239 } 1240 1241 return 0; 1242 } 1243 1244 /** 1245 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1246 * 1247 * @adev: amdgpu_device pointer 1248 * 1249 * Allocates a scratch page of VRAM for use by various things in the 1250 * driver. 
1251 */ 1252 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 1253 { 1254 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 1255 AMDGPU_GEM_DOMAIN_VRAM | 1256 AMDGPU_GEM_DOMAIN_GTT, 1257 &adev->mem_scratch.robj, 1258 &adev->mem_scratch.gpu_addr, 1259 (void **)&adev->mem_scratch.ptr); 1260 } 1261 1262 /** 1263 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 1264 * 1265 * @adev: amdgpu_device pointer 1266 * 1267 * Frees the VRAM scratch page. 1268 */ 1269 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 1270 { 1271 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 1272 } 1273 1274 /** 1275 * amdgpu_device_program_register_sequence - program an array of registers. 1276 * 1277 * @adev: amdgpu_device pointer 1278 * @registers: pointer to the register array 1279 * @array_size: size of the register array 1280 * 1281 * Programs an array or registers with and or masks. 1282 * This is a helper for setting golden registers. 1283 */ 1284 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 1285 const u32 *registers, 1286 const u32 array_size) 1287 { 1288 u32 tmp, reg, and_mask, or_mask; 1289 int i; 1290 1291 if (array_size % 3) 1292 return; 1293 1294 for (i = 0; i < array_size; i += 3) { 1295 reg = registers[i + 0]; 1296 and_mask = registers[i + 1]; 1297 or_mask = registers[i + 2]; 1298 1299 if (and_mask == 0xffffffff) { 1300 tmp = or_mask; 1301 } else { 1302 tmp = RREG32(reg); 1303 tmp &= ~and_mask; 1304 if (adev->family >= AMDGPU_FAMILY_AI) 1305 tmp |= (or_mask & and_mask); 1306 else 1307 tmp |= or_mask; 1308 } 1309 WREG32(reg, tmp); 1310 } 1311 } 1312 1313 /** 1314 * amdgpu_device_pci_config_reset - reset the GPU 1315 * 1316 * @adev: amdgpu_device pointer 1317 * 1318 * Resets the GPU using the pci config reset sequence. 1319 * Only applicable to asics prior to vega10. 1320 */ 1321 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1322 { 1323 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1324 } 1325 1326 /** 1327 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1328 * 1329 * @adev: amdgpu_device pointer 1330 * 1331 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1332 */ 1333 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1334 { 1335 return pci_reset_function(adev->pdev); 1336 } 1337 1338 /* 1339 * amdgpu_device_wb_*() 1340 * Writeback is the method by which the GPU updates special pages in memory 1341 * with the status of certain GPU events (fences, ring pointers,etc.). 1342 */ 1343 1344 /** 1345 * amdgpu_device_wb_fini - Disable Writeback and free memory 1346 * 1347 * @adev: amdgpu_device pointer 1348 * 1349 * Disables Writeback and frees the Writeback memory (all asics). 1350 * Used at driver shutdown. 1351 */ 1352 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1353 { 1354 if (adev->wb.wb_obj) { 1355 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1356 &adev->wb.gpu_addr, 1357 (void **)&adev->wb.wb); 1358 adev->wb.wb_obj = NULL; 1359 } 1360 } 1361 1362 /** 1363 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1364 * 1365 * @adev: amdgpu_device pointer 1366 * 1367 * Initializes writeback and allocates writeback memory (all asics). 1368 * Used at driver startup. 1369 * Returns 0 on success or an -error on failure. 
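 *
 * Illustrative allocate/use/free sketch for the writeback helpers defined
 * below (the index returned by amdgpu_device_wb_get() is a dword offset
 * into the writeback page):
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		u64 wb_gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *		u32 cpu_value = adev->wb.wb[wb];
 *
 *		amdgpu_device_wb_free(adev, wb);
 *	}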
1370 */ 1371 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1372 { 1373 int r; 1374 1375 if (adev->wb.wb_obj == NULL) { 1376 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1377 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1378 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1379 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1380 (void **)&adev->wb.wb); 1381 if (r) { 1382 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1383 return r; 1384 } 1385 1386 adev->wb.num_wb = AMDGPU_MAX_WB; 1387 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1388 1389 /* clear wb memory */ 1390 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1391 } 1392 1393 return 0; 1394 } 1395 1396 /** 1397 * amdgpu_device_wb_get - Allocate a wb entry 1398 * 1399 * @adev: amdgpu_device pointer 1400 * @wb: wb index 1401 * 1402 * Allocate a wb slot for use by the driver (all asics). 1403 * Returns 0 on success or -EINVAL on failure. 1404 */ 1405 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1406 { 1407 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1408 1409 if (offset < adev->wb.num_wb) { 1410 __set_bit(offset, adev->wb.used); 1411 *wb = offset << 3; /* convert to dw offset */ 1412 return 0; 1413 } else { 1414 return -EINVAL; 1415 } 1416 } 1417 1418 /** 1419 * amdgpu_device_wb_free - Free a wb entry 1420 * 1421 * @adev: amdgpu_device pointer 1422 * @wb: wb index 1423 * 1424 * Free a wb slot allocated for use by the driver (all asics) 1425 */ 1426 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1427 { 1428 wb >>= 3; 1429 if (wb < adev->wb.num_wb) 1430 __clear_bit(wb, adev->wb.used); 1431 } 1432 1433 /** 1434 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1435 * 1436 * @adev: amdgpu_device pointer 1437 * 1438 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1439 * to fail, but if any of the BARs is not accessible after the size we abort 1440 * driver loading by returning -ENODEV. 
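 *
 * Worked example (illustrative): a board with 16 GB of VRAM yields
 * pci_rebar_bytes_to_size() == 14 (1 MB << 14 == 16 GB), which is then
 * limited to the largest size reported by pci_rebar_get_possible_sizes().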
1441 */ 1442 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1443 { 1444 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1445 struct pci_bus *root; 1446 struct resource *res; 1447 unsigned int i; 1448 u16 cmd; 1449 int r; 1450 1451 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1452 return 0; 1453 1454 /* Bypass for VF */ 1455 if (amdgpu_sriov_vf(adev)) 1456 return 0; 1457 1458 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1459 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1460 DRM_WARN("System can't access extended configuration space,please check!!\n"); 1461 1462 /* skip if the bios has already enabled large BAR */ 1463 if (adev->gmc.real_vram_size && 1464 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1465 return 0; 1466 1467 /* Check if the root BUS has 64bit memory resources */ 1468 root = adev->pdev->bus; 1469 while (root->parent) 1470 root = root->parent; 1471 1472 pci_bus_for_each_resource(root, res, i) { 1473 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1474 res->start > 0x100000000ull) 1475 break; 1476 } 1477 1478 /* Trying to resize is pointless without a root hub window above 4GB */ 1479 if (!res) 1480 return 0; 1481 1482 /* Limit the BAR size to what is available */ 1483 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1484 rbar_size); 1485 1486 /* Disable memory decoding while we change the BAR addresses and size */ 1487 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1488 pci_write_config_word(adev->pdev, PCI_COMMAND, 1489 cmd & ~PCI_COMMAND_MEMORY); 1490 1491 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1492 amdgpu_doorbell_fini(adev); 1493 if (adev->asic_type >= CHIP_BONAIRE) 1494 pci_release_resource(adev->pdev, 2); 1495 1496 pci_release_resource(adev->pdev, 0); 1497 1498 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1499 if (r == -ENOSPC) 1500 DRM_INFO("Not enough PCI address space for a large BAR."); 1501 else if (r && r != -ENOTSUPP) 1502 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1503 1504 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1505 1506 /* When the doorbell or fb BAR isn't available we have no chance of 1507 * using the device. 1508 */ 1509 r = amdgpu_doorbell_init(adev); 1510 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1511 return -ENODEV; 1512 1513 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1514 1515 return 0; 1516 } 1517 1518 static bool amdgpu_device_read_bios(struct amdgpu_device *adev) 1519 { 1520 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1521 return false; 1522 1523 return true; 1524 } 1525 1526 /* 1527 * GPU helpers function. 1528 */ 1529 /** 1530 * amdgpu_device_need_post - check if the hw need post or not 1531 * 1532 * @adev: amdgpu_device pointer 1533 * 1534 * Check if the asic has been initialized (all asics) at driver startup 1535 * or post is needed if hw reset is performed. 1536 * Returns true if need or false if not. 
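 *
 * Illustrative caller sketch (the real check sits in the init path of this
 * file):
 *
 *	if (amdgpu_device_need_post(adev))
 *		amdgpu_device_asic_init(adev);	// re-run atom asic_init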
1537 */ 1538 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1539 { 1540 uint32_t reg; 1541 1542 if (amdgpu_sriov_vf(adev)) 1543 return false; 1544 1545 if (!amdgpu_device_read_bios(adev)) 1546 return false; 1547 1548 if (amdgpu_passthrough(adev)) { 1549 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1550 * some old smc fw still need driver do vPost otherwise gpu hang, while 1551 * those smc fw version above 22.15 doesn't have this flaw, so we force 1552 * vpost executed for smc version below 22.15 1553 */ 1554 if (adev->asic_type == CHIP_FIJI) { 1555 int err; 1556 uint32_t fw_ver; 1557 1558 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1559 /* force vPost if error occured */ 1560 if (err) 1561 return true; 1562 1563 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1564 release_firmware(adev->pm.fw); 1565 if (fw_ver < 0x00160e00) 1566 return true; 1567 } 1568 } 1569 1570 /* Don't post if we need to reset whole hive on init */ 1571 if (adev->gmc.xgmi.pending_reset) 1572 return false; 1573 1574 if (adev->has_hw_reset) { 1575 adev->has_hw_reset = false; 1576 return true; 1577 } 1578 1579 /* bios scratch used on CIK+ */ 1580 if (adev->asic_type >= CHIP_BONAIRE) 1581 return amdgpu_atombios_scratch_need_asic_init(adev); 1582 1583 /* check MEM_SIZE for older asics */ 1584 reg = amdgpu_asic_get_config_memsize(adev); 1585 1586 if ((reg != 0) && (reg != 0xffffffff)) 1587 return false; 1588 1589 return true; 1590 } 1591 1592 /* 1593 * Check whether seamless boot is supported. 1594 * 1595 * So far we only support seamless boot on DCE 3.0 or later. 1596 * If users report that it works on older ASICS as well, we may 1597 * loosen this. 1598 */ 1599 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1600 { 1601 switch (amdgpu_seamless) { 1602 case -1: 1603 break; 1604 case 1: 1605 return true; 1606 case 0: 1607 return false; 1608 default: 1609 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n", 1610 amdgpu_seamless); 1611 return false; 1612 } 1613 1614 if (!(adev->flags & AMD_IS_APU)) 1615 return false; 1616 1617 if (adev->mman.keep_stolen_vga_memory) 1618 return false; 1619 1620 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1621 } 1622 1623 /* 1624 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1625 * don't support dynamic speed switching. Until we have confirmation from Intel 1626 * that a specific host supports it, it's safer that we keep it disabled for all. 1627 * 1628 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1629 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1630 */ 1631 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1632 { 1633 #if IS_ENABLED(CONFIG_X86) 1634 struct cpuinfo_x86 *c = &cpu_data(0); 1635 1636 /* eGPU change speeds based on USB4 fabric conditions */ 1637 if (dev_is_removable(adev->dev)) 1638 return true; 1639 1640 if (c->x86_vendor == X86_VENDOR_INTEL) 1641 return false; 1642 #endif 1643 return true; 1644 } 1645 1646 /** 1647 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1648 * 1649 * @adev: amdgpu_device pointer 1650 * 1651 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1652 * be set for this device. 1653 * 1654 * Returns true if it should be used or false if not. 
1655 */ 1656 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1657 { 1658 switch (amdgpu_aspm) { 1659 case -1: 1660 break; 1661 case 0: 1662 return false; 1663 case 1: 1664 return true; 1665 default: 1666 return false; 1667 } 1668 if (adev->flags & AMD_IS_APU) 1669 return false; 1670 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) 1671 return false; 1672 return pcie_aspm_enabled(adev->pdev); 1673 } 1674 1675 /* if we get transitioned to only one device, take VGA back */ 1676 /** 1677 * amdgpu_device_vga_set_decode - enable/disable vga decode 1678 * 1679 * @pdev: PCI device pointer 1680 * @state: enable/disable vga decode 1681 * 1682 * Enable/disable vga decode (all asics). 1683 * Returns VGA resource flags. 1684 */ 1685 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1686 bool state) 1687 { 1688 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1689 1690 amdgpu_asic_set_vga_state(adev, state); 1691 if (state) 1692 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1693 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1694 else 1695 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1696 } 1697 1698 /** 1699 * amdgpu_device_check_block_size - validate the vm block size 1700 * 1701 * @adev: amdgpu_device pointer 1702 * 1703 * Validates the vm block size specified via module parameter. 1704 * The vm block size defines number of bits in page table versus page directory, 1705 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1706 * page table and the remaining bits are in the page directory. 1707 */ 1708 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1709 { 1710 /* defines number of bits in page table versus page directory, 1711 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1712 * page table and the remaining bits are in the page directory 1713 */ 1714 if (amdgpu_vm_block_size == -1) 1715 return; 1716 1717 if (amdgpu_vm_block_size < 9) { 1718 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1719 amdgpu_vm_block_size); 1720 amdgpu_vm_block_size = -1; 1721 } 1722 } 1723 1724 /** 1725 * amdgpu_device_check_vm_size - validate the vm size 1726 * 1727 * @adev: amdgpu_device pointer 1728 * 1729 * Validates the vm size in GB specified via module parameter. 1730 * The VM size is the size of the GPU virtual memory space in GB. 
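 *
 * For example, amdgpu_vm_size=256 requests a 256 GB per-VM address space,
 * while any value below 1 is rejected below and reset to -1 (automatic).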
1731 */ 1732 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1733 { 1734 /* no need to check the default value */ 1735 if (amdgpu_vm_size == -1) 1736 return; 1737 1738 if (amdgpu_vm_size < 1) { 1739 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1740 amdgpu_vm_size); 1741 amdgpu_vm_size = -1; 1742 } 1743 } 1744 1745 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1746 { 1747 struct sysinfo si; 1748 bool is_os_64 = (sizeof(void *) == 8); 1749 uint64_t total_memory; 1750 uint64_t dram_size_seven_GB = 0x1B8000000; 1751 uint64_t dram_size_three_GB = 0xB8000000; 1752 1753 if (amdgpu_smu_memory_pool_size == 0) 1754 return; 1755 1756 if (!is_os_64) { 1757 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1758 goto def_value; 1759 } 1760 si_meminfo(&si); 1761 total_memory = (uint64_t)si.totalram * si.mem_unit; 1762 1763 if ((amdgpu_smu_memory_pool_size == 1) || 1764 (amdgpu_smu_memory_pool_size == 2)) { 1765 if (total_memory < dram_size_three_GB) 1766 goto def_value1; 1767 } else if ((amdgpu_smu_memory_pool_size == 4) || 1768 (amdgpu_smu_memory_pool_size == 8)) { 1769 if (total_memory < dram_size_seven_GB) 1770 goto def_value1; 1771 } else { 1772 DRM_WARN("Smu memory pool size not supported\n"); 1773 goto def_value; 1774 } 1775 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1776 1777 return; 1778 1779 def_value1: 1780 DRM_WARN("No enough system memory\n"); 1781 def_value: 1782 adev->pm.smu_prv_buffer_size = 0; 1783 } 1784 1785 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1786 { 1787 if (!(adev->flags & AMD_IS_APU) || 1788 adev->asic_type < CHIP_RAVEN) 1789 return 0; 1790 1791 switch (adev->asic_type) { 1792 case CHIP_RAVEN: 1793 if (adev->pdev->device == 0x15dd) 1794 adev->apu_flags |= AMD_APU_IS_RAVEN; 1795 if (adev->pdev->device == 0x15d8) 1796 adev->apu_flags |= AMD_APU_IS_PICASSO; 1797 break; 1798 case CHIP_RENOIR: 1799 if ((adev->pdev->device == 0x1636) || 1800 (adev->pdev->device == 0x164c)) 1801 adev->apu_flags |= AMD_APU_IS_RENOIR; 1802 else 1803 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1804 break; 1805 case CHIP_VANGOGH: 1806 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1807 break; 1808 case CHIP_YELLOW_CARP: 1809 break; 1810 case CHIP_CYAN_SKILLFISH: 1811 if ((adev->pdev->device == 0x13FE) || 1812 (adev->pdev->device == 0x143F)) 1813 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1814 break; 1815 default: 1816 break; 1817 } 1818 1819 return 0; 1820 } 1821 1822 /** 1823 * amdgpu_device_check_arguments - validate module params 1824 * 1825 * @adev: amdgpu_device pointer 1826 * 1827 * Validates certain module parameters and updates 1828 * the associated values used by the driver (all asics). 
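 *
 * Worked example: amdgpu_sched_jobs=6 is not a power of two and is rounded
 * up to 8, while amdgpu_sched_jobs=2 is below the minimum and is raised to 4.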
1829 */ 1830 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1831 { 1832 if (amdgpu_sched_jobs < 4) { 1833 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1834 amdgpu_sched_jobs); 1835 amdgpu_sched_jobs = 4; 1836 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1837 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1838 amdgpu_sched_jobs); 1839 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1840 } 1841 1842 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1843 /* gart size must be greater or equal to 32M */ 1844 dev_warn(adev->dev, "gart size (%d) too small\n", 1845 amdgpu_gart_size); 1846 amdgpu_gart_size = -1; 1847 } 1848 1849 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1850 /* gtt size must be greater or equal to 32M */ 1851 dev_warn(adev->dev, "gtt size (%d) too small\n", 1852 amdgpu_gtt_size); 1853 amdgpu_gtt_size = -1; 1854 } 1855 1856 /* valid range is between 4 and 9 inclusive */ 1857 if (amdgpu_vm_fragment_size != -1 && 1858 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1859 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1860 amdgpu_vm_fragment_size = -1; 1861 } 1862 1863 if (amdgpu_sched_hw_submission < 2) { 1864 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1865 amdgpu_sched_hw_submission); 1866 amdgpu_sched_hw_submission = 2; 1867 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1868 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1869 amdgpu_sched_hw_submission); 1870 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1871 } 1872 1873 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1874 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1875 amdgpu_reset_method = -1; 1876 } 1877 1878 amdgpu_device_check_smu_prv_buffer_size(adev); 1879 1880 amdgpu_device_check_vm_size(adev); 1881 1882 amdgpu_device_check_block_size(adev); 1883 1884 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1885 1886 return 0; 1887 } 1888 1889 /** 1890 * amdgpu_switcheroo_set_state - set switcheroo state 1891 * 1892 * @pdev: pci dev pointer 1893 * @state: vga_switcheroo state 1894 * 1895 * Callback for the switcheroo driver. Suspends or resumes 1896 * the asics before or after it is powered up using ACPI methods. 
1897 */ 1898 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1899 enum vga_switcheroo_state state) 1900 { 1901 struct drm_device *dev = pci_get_drvdata(pdev); 1902 int r; 1903 1904 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1905 return; 1906 1907 if (state == VGA_SWITCHEROO_ON) { 1908 pr_info("switched on\n"); 1909 /* don't suspend or resume card normally */ 1910 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1911 1912 pci_set_power_state(pdev, PCI_D0); 1913 amdgpu_device_load_pci_state(pdev); 1914 r = pci_enable_device(pdev); 1915 if (r) 1916 DRM_WARN("pci_enable_device failed (%d)\n", r); 1917 amdgpu_device_resume(dev, true); 1918 1919 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1920 } else { 1921 pr_info("switched off\n"); 1922 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1923 amdgpu_device_prepare(dev); 1924 amdgpu_device_suspend(dev, true); 1925 amdgpu_device_cache_pci_state(pdev); 1926 /* Shut down the device */ 1927 pci_disable_device(pdev); 1928 pci_set_power_state(pdev, PCI_D3cold); 1929 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1930 } 1931 } 1932 1933 /** 1934 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1935 * 1936 * @pdev: pci dev pointer 1937 * 1938 * Callback for the switcheroo driver. Check of the switcheroo 1939 * state can be changed. 1940 * Returns true if the state can be changed, false if not. 1941 */ 1942 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1943 { 1944 struct drm_device *dev = pci_get_drvdata(pdev); 1945 1946 /* 1947 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1948 * locking inversion with the driver load path. And the access here is 1949 * completely racy anyway. So don't bother with locking for now. 1950 */ 1951 return atomic_read(&dev->open_count) == 0; 1952 } 1953 1954 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1955 .set_gpu_state = amdgpu_switcheroo_set_state, 1956 .reprobe = NULL, 1957 .can_switch = amdgpu_switcheroo_can_switch, 1958 }; 1959 1960 /** 1961 * amdgpu_device_ip_set_clockgating_state - set the CG state 1962 * 1963 * @dev: amdgpu_device pointer 1964 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1965 * @state: clockgating state (gate or ungate) 1966 * 1967 * Sets the requested clockgating state for all instances of 1968 * the hardware IP specified. 1969 * Returns the error code from the last instance. 1970 */ 1971 int amdgpu_device_ip_set_clockgating_state(void *dev, 1972 enum amd_ip_block_type block_type, 1973 enum amd_clockgating_state state) 1974 { 1975 struct amdgpu_device *adev = dev; 1976 int i, r = 0; 1977 1978 for (i = 0; i < adev->num_ip_blocks; i++) { 1979 if (!adev->ip_blocks[i].status.valid) 1980 continue; 1981 if (adev->ip_blocks[i].version->type != block_type) 1982 continue; 1983 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1984 continue; 1985 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1986 (void *)adev, state); 1987 if (r) 1988 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1989 adev->ip_blocks[i].version->funcs->name, r); 1990 } 1991 return r; 1992 } 1993 1994 /** 1995 * amdgpu_device_ip_set_powergating_state - set the PG state 1996 * 1997 * @dev: amdgpu_device pointer 1998 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1999 * @state: powergating state (gate or ungate) 2000 * 2001 * Sets the requested powergating state for all instances of 2002 * the hardware IP specified. 
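 *
 * An illustrative call, assuming the asic has a VCN block:
 *   amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *                                          AMD_PG_STATE_GATE);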
2003 * Returns the error code from the last instance. 2004 */ 2005 int amdgpu_device_ip_set_powergating_state(void *dev, 2006 enum amd_ip_block_type block_type, 2007 enum amd_powergating_state state) 2008 { 2009 struct amdgpu_device *adev = dev; 2010 int i, r = 0; 2011 2012 for (i = 0; i < adev->num_ip_blocks; i++) { 2013 if (!adev->ip_blocks[i].status.valid) 2014 continue; 2015 if (adev->ip_blocks[i].version->type != block_type) 2016 continue; 2017 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2018 continue; 2019 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2020 (void *)adev, state); 2021 if (r) 2022 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2023 adev->ip_blocks[i].version->funcs->name, r); 2024 } 2025 return r; 2026 } 2027 2028 /** 2029 * amdgpu_device_ip_get_clockgating_state - get the CG state 2030 * 2031 * @adev: amdgpu_device pointer 2032 * @flags: clockgating feature flags 2033 * 2034 * Walks the list of IPs on the device and updates the clockgating 2035 * flags for each IP. 2036 * Updates @flags with the feature flags for each hardware IP where 2037 * clockgating is enabled. 2038 */ 2039 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2040 u64 *flags) 2041 { 2042 int i; 2043 2044 for (i = 0; i < adev->num_ip_blocks; i++) { 2045 if (!adev->ip_blocks[i].status.valid) 2046 continue; 2047 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2048 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 2049 } 2050 } 2051 2052 /** 2053 * amdgpu_device_ip_wait_for_idle - wait for idle 2054 * 2055 * @adev: amdgpu_device pointer 2056 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2057 * 2058 * Waits for the request hardware IP to be idle. 2059 * Returns 0 for success or a negative error code on failure. 2060 */ 2061 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2062 enum amd_ip_block_type block_type) 2063 { 2064 int i, r; 2065 2066 for (i = 0; i < adev->num_ip_blocks; i++) { 2067 if (!adev->ip_blocks[i].status.valid) 2068 continue; 2069 if (adev->ip_blocks[i].version->type == block_type) { 2070 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 2071 if (r) 2072 return r; 2073 break; 2074 } 2075 } 2076 return 0; 2077 2078 } 2079 2080 /** 2081 * amdgpu_device_ip_is_idle - is the hardware IP idle 2082 * 2083 * @adev: amdgpu_device pointer 2084 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2085 * 2086 * Check if the hardware IP is idle or not. 2087 * Returns true if it the IP is idle, false if not. 2088 */ 2089 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 2090 enum amd_ip_block_type block_type) 2091 { 2092 int i; 2093 2094 for (i = 0; i < adev->num_ip_blocks; i++) { 2095 if (!adev->ip_blocks[i].status.valid) 2096 continue; 2097 if (adev->ip_blocks[i].version->type == block_type) 2098 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 2099 } 2100 return true; 2101 2102 } 2103 2104 /** 2105 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2106 * 2107 * @adev: amdgpu_device pointer 2108 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2109 * 2110 * Returns a pointer to the hardware IP block structure 2111 * if it exists for the asic, otherwise NULL. 
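 *
 * Typical (illustrative) usage is a NULL-checked lookup, e.g.:
 *   struct amdgpu_ip_block *block =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *   if (!block)
 *           return -EINVAL;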
2112 */
2113 struct amdgpu_ip_block *
2114 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
2115 enum amd_ip_block_type type)
2116 {
2117 int i;
2118
2119 for (i = 0; i < adev->num_ip_blocks; i++)
2120 if (adev->ip_blocks[i].version->type == type)
2121 return &adev->ip_blocks[i];
2122
2123 return NULL;
2124 }
2125
2126 /**
2127 * amdgpu_device_ip_block_version_cmp
2128 *
2129 * @adev: amdgpu_device pointer
2130 * @type: enum amd_ip_block_type
2131 * @major: major version
2132 * @minor: minor version
2133 *
2134 * Returns 0 if the IP block version is equal to or greater than the requested one,
2135 * 1 if it is smaller or the ip_block doesn't exist.
2136 */
2137 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
2138 enum amd_ip_block_type type,
2139 u32 major, u32 minor)
2140 {
2141 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
2142
2143 if (ip_block && ((ip_block->version->major > major) ||
2144 ((ip_block->version->major == major) &&
2145 (ip_block->version->minor >= minor))))
2146 return 0;
2147
2148 return 1;
2149 }
2150
2151 /**
2152 * amdgpu_device_ip_block_add
2153 *
2154 * @adev: amdgpu_device pointer
2155 * @ip_block_version: pointer to the IP to add
2156 *
2157 * Adds the IP block driver information to the collection of IPs
2158 * on the asic.
2159 */
2160 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
2161 const struct amdgpu_ip_block_version *ip_block_version)
2162 {
2163 if (!ip_block_version)
2164 return -EINVAL;
2165
2166 switch (ip_block_version->type) {
2167 case AMD_IP_BLOCK_TYPE_VCN:
2168 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
2169 return 0;
2170 break;
2171 case AMD_IP_BLOCK_TYPE_JPEG:
2172 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
2173 return 0;
2174 break;
2175 default:
2176 break;
2177 }
2178
2179 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
2180 ip_block_version->funcs->name);
2181
2182 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
2183
2184 return 0;
2185 }
2186
2187 /**
2188 * amdgpu_device_enable_virtual_display - enable virtual display feature
2189 *
2190 * @adev: amdgpu_device pointer
2191 *
2192 * Enables the virtual display feature if the user has enabled it via
2193 * the module parameter virtual_display. This feature provides virtual
2194 * display hardware on headless boards or in virtualized environments.
2195 * This function parses and validates the configuration string specified by
2196 * the user and configures the virtual display configuration (number of
2197 * virtual connectors, crtcs, etc.) specified.
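 *
 * Illustrative configurations: amdgpu.virtual_display=0000:03:00.0,2 requests
 * two virtual crtcs on that specific device, while "all,1" enables a single
 * virtual crtc on every amdgpu device (the crtc count is clamped to the 1..6
 * range below). The PCI address here is only an example.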
2198 */
2199 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
2200 {
2201 adev->enable_virtual_display = false;
2202
2203 if (amdgpu_virtual_display) {
2204 const char *pci_address_name = pci_name(adev->pdev);
2205 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
2206
2207 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
2208 pciaddstr_tmp = pciaddstr;
2209 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
2210 pciaddname = strsep(&pciaddname_tmp, ",");
2211 if (!strcmp("all", pciaddname)
2212 || !strcmp(pci_address_name, pciaddname)) {
2213 long num_crtc;
2214 int res = -1;
2215
2216 adev->enable_virtual_display = true;
2217
2218 if (pciaddname_tmp)
2219 res = kstrtol(pciaddname_tmp, 10,
2220 &num_crtc);
2221
2222 if (!res) {
2223 if (num_crtc < 1)
2224 num_crtc = 1;
2225 if (num_crtc > 6)
2226 num_crtc = 6;
2227 adev->mode_info.num_crtc = num_crtc;
2228 } else {
2229 adev->mode_info.num_crtc = 1;
2230 }
2231 break;
2232 }
2233 }
2234
2235 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
2236 amdgpu_virtual_display, pci_address_name,
2237 adev->enable_virtual_display, adev->mode_info.num_crtc);
2238
2239 kfree(pciaddstr);
2240 }
2241 }
2242
2243 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
2244 {
2245 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
2246 adev->mode_info.num_crtc = 1;
2247 adev->enable_virtual_display = true;
2248 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
2249 adev->enable_virtual_display, adev->mode_info.num_crtc);
2250 }
2251 }
2252
2253 /**
2254 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
2255 *
2256 * @adev: amdgpu_device pointer
2257 *
2258 * Parses the asic configuration parameters specified in the gpu info
2259 * firmware and makes them available to the driver for use in configuring
2260 * the asic.
2261 * Returns 0 on success, -EINVAL on failure.
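 *
 * The firmware file name is derived from the chip name, e.g. (illustrative)
 * "amdgpu/vega10_gpu_info.bin" for VEGA10; ASICs that carry an IP discovery
 * binary skip this path entirely.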
2262 */ 2263 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2264 { 2265 const char *chip_name; 2266 char fw_name[40]; 2267 int err; 2268 const struct gpu_info_firmware_header_v1_0 *hdr; 2269 2270 adev->firmware.gpu_info_fw = NULL; 2271 2272 if (adev->mman.discovery_bin) 2273 return 0; 2274 2275 switch (adev->asic_type) { 2276 default: 2277 return 0; 2278 case CHIP_VEGA10: 2279 chip_name = "vega10"; 2280 break; 2281 case CHIP_VEGA12: 2282 chip_name = "vega12"; 2283 break; 2284 case CHIP_RAVEN: 2285 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2286 chip_name = "raven2"; 2287 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2288 chip_name = "picasso"; 2289 else 2290 chip_name = "raven"; 2291 break; 2292 case CHIP_ARCTURUS: 2293 chip_name = "arcturus"; 2294 break; 2295 case CHIP_NAVI12: 2296 chip_name = "navi12"; 2297 break; 2298 } 2299 2300 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2301 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 2302 if (err) { 2303 dev_err(adev->dev, 2304 "Failed to get gpu_info firmware \"%s\"\n", 2305 fw_name); 2306 goto out; 2307 } 2308 2309 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2310 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2311 2312 switch (hdr->version_major) { 2313 case 1: 2314 { 2315 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2316 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2317 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2318 2319 /* 2320 * Should be droped when DAL no longer needs it. 2321 */ 2322 if (adev->asic_type == CHIP_NAVI12) 2323 goto parse_soc_bounding_box; 2324 2325 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2326 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2327 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2328 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2329 adev->gfx.config.max_texture_channel_caches = 2330 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2331 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2332 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2333 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2334 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2335 adev->gfx.config.double_offchip_lds_buf = 2336 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2337 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2338 adev->gfx.cu_info.max_waves_per_simd = 2339 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2340 adev->gfx.cu_info.max_scratch_slots_per_cu = 2341 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2342 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2343 if (hdr->version_minor >= 1) { 2344 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2345 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2346 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2347 adev->gfx.config.num_sc_per_sh = 2348 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2349 adev->gfx.config.num_packer_per_sc = 2350 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2351 } 2352 2353 parse_soc_bounding_box: 2354 /* 2355 * soc bounding box info is not integrated in disocovery table, 2356 * we always need to parse it from gpu info firmware if needed. 
2357 */ 2358 if (hdr->version_minor == 2) { 2359 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2360 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2361 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2362 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2363 } 2364 break; 2365 } 2366 default: 2367 dev_err(adev->dev, 2368 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2369 err = -EINVAL; 2370 goto out; 2371 } 2372 out: 2373 return err; 2374 } 2375 2376 /** 2377 * amdgpu_device_ip_early_init - run early init for hardware IPs 2378 * 2379 * @adev: amdgpu_device pointer 2380 * 2381 * Early initialization pass for hardware IPs. The hardware IPs that make 2382 * up each asic are discovered each IP's early_init callback is run. This 2383 * is the first stage in initializing the asic. 2384 * Returns 0 on success, negative error code on failure. 2385 */ 2386 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2387 { 2388 struct pci_dev *parent; 2389 int i, r; 2390 bool total; 2391 2392 amdgpu_device_enable_virtual_display(adev); 2393 2394 if (amdgpu_sriov_vf(adev)) { 2395 r = amdgpu_virt_request_full_gpu(adev, true); 2396 if (r) 2397 return r; 2398 } 2399 2400 switch (adev->asic_type) { 2401 #ifdef CONFIG_DRM_AMDGPU_SI 2402 case CHIP_VERDE: 2403 case CHIP_TAHITI: 2404 case CHIP_PITCAIRN: 2405 case CHIP_OLAND: 2406 case CHIP_HAINAN: 2407 adev->family = AMDGPU_FAMILY_SI; 2408 r = si_set_ip_blocks(adev); 2409 if (r) 2410 return r; 2411 break; 2412 #endif 2413 #ifdef CONFIG_DRM_AMDGPU_CIK 2414 case CHIP_BONAIRE: 2415 case CHIP_HAWAII: 2416 case CHIP_KAVERI: 2417 case CHIP_KABINI: 2418 case CHIP_MULLINS: 2419 if (adev->flags & AMD_IS_APU) 2420 adev->family = AMDGPU_FAMILY_KV; 2421 else 2422 adev->family = AMDGPU_FAMILY_CI; 2423 2424 r = cik_set_ip_blocks(adev); 2425 if (r) 2426 return r; 2427 break; 2428 #endif 2429 case CHIP_TOPAZ: 2430 case CHIP_TONGA: 2431 case CHIP_FIJI: 2432 case CHIP_POLARIS10: 2433 case CHIP_POLARIS11: 2434 case CHIP_POLARIS12: 2435 case CHIP_VEGAM: 2436 case CHIP_CARRIZO: 2437 case CHIP_STONEY: 2438 if (adev->flags & AMD_IS_APU) 2439 adev->family = AMDGPU_FAMILY_CZ; 2440 else 2441 adev->family = AMDGPU_FAMILY_VI; 2442 2443 r = vi_set_ip_blocks(adev); 2444 if (r) 2445 return r; 2446 break; 2447 default: 2448 r = amdgpu_discovery_set_ip_blocks(adev); 2449 if (r) 2450 return r; 2451 break; 2452 } 2453 2454 if (amdgpu_has_atpx() && 2455 (amdgpu_is_atpx_hybrid() || 2456 amdgpu_has_atpx_dgpu_power_cntl()) && 2457 ((adev->flags & AMD_IS_APU) == 0) && 2458 !dev_is_removable(&adev->pdev->dev)) 2459 adev->flags |= AMD_IS_PX; 2460 2461 if (!(adev->flags & AMD_IS_APU)) { 2462 parent = pcie_find_root_port(adev->pdev); 2463 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2464 } 2465 2466 2467 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2468 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2469 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2470 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2471 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2472 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2473 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2474 2475 total = true; 2476 for (i = 0; i < adev->num_ip_blocks; i++) { 2477 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2478 DRM_WARN("disabled ip block: %d <%s>\n", 2479 i, adev->ip_blocks[i].version->funcs->name); 2480 adev->ip_blocks[i].status.valid = false; 2481 } else { 2482 if (adev->ip_blocks[i].version->funcs->early_init) { 2483 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2484 if (r == -ENOENT) { 2485 adev->ip_blocks[i].status.valid = false; 2486 } else if (r) { 2487 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2488 adev->ip_blocks[i].version->funcs->name, r); 2489 total = false; 2490 } else { 2491 adev->ip_blocks[i].status.valid = true; 2492 } 2493 } else { 2494 adev->ip_blocks[i].status.valid = true; 2495 } 2496 } 2497 /* get the vbios after the asic_funcs are set up */ 2498 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2499 r = amdgpu_device_parse_gpu_info_fw(adev); 2500 if (r) 2501 return r; 2502 2503 /* Read BIOS */ 2504 if (amdgpu_device_read_bios(adev)) { 2505 if (!amdgpu_get_bios(adev)) 2506 return -EINVAL; 2507 2508 r = amdgpu_atombios_init(adev); 2509 if (r) { 2510 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2511 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2512 return r; 2513 } 2514 } 2515 2516 /*get pf2vf msg info at it's earliest time*/ 2517 if (amdgpu_sriov_vf(adev)) 2518 amdgpu_virt_init_data_exchange(adev); 2519 2520 } 2521 } 2522 if (!total) 2523 return -ENODEV; 2524 2525 amdgpu_amdkfd_device_probe(adev); 2526 adev->cg_flags &= amdgpu_cg_mask; 2527 adev->pg_flags &= amdgpu_pg_mask; 2528 2529 return 0; 2530 } 2531 2532 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2533 { 2534 int i, r; 2535 2536 for (i = 0; i < adev->num_ip_blocks; i++) { 2537 if (!adev->ip_blocks[i].status.sw) 2538 continue; 2539 if (adev->ip_blocks[i].status.hw) 2540 continue; 2541 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2542 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2543 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2544 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2545 if (r) { 2546 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2547 adev->ip_blocks[i].version->funcs->name, r); 2548 return r; 2549 } 2550 adev->ip_blocks[i].status.hw = true; 2551 } 2552 } 2553 2554 return 0; 2555 } 2556 2557 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2558 { 2559 int i, r; 2560 2561 for (i = 0; i < adev->num_ip_blocks; i++) { 2562 if (!adev->ip_blocks[i].status.sw) 2563 continue; 2564 if (adev->ip_blocks[i].status.hw) 2565 continue; 2566 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2567 if (r) { 2568 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2569 adev->ip_blocks[i].version->funcs->name, r); 2570 return r; 2571 } 2572 adev->ip_blocks[i].status.hw = true; 2573 } 2574 2575 return 0; 2576 } 2577 2578 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2579 { 2580 int r = 0; 2581 int i; 2582 uint32_t 
smu_version; 2583 2584 if (adev->asic_type >= CHIP_VEGA10) { 2585 for (i = 0; i < adev->num_ip_blocks; i++) { 2586 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2587 continue; 2588 2589 if (!adev->ip_blocks[i].status.sw) 2590 continue; 2591 2592 /* no need to do the fw loading again if already done*/ 2593 if (adev->ip_blocks[i].status.hw == true) 2594 break; 2595 2596 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2597 r = adev->ip_blocks[i].version->funcs->resume(adev); 2598 if (r) { 2599 DRM_ERROR("resume of IP block <%s> failed %d\n", 2600 adev->ip_blocks[i].version->funcs->name, r); 2601 return r; 2602 } 2603 } else { 2604 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2605 if (r) { 2606 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2607 adev->ip_blocks[i].version->funcs->name, r); 2608 return r; 2609 } 2610 } 2611 2612 adev->ip_blocks[i].status.hw = true; 2613 break; 2614 } 2615 } 2616 2617 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2618 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2619 2620 return r; 2621 } 2622 2623 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2624 { 2625 long timeout; 2626 int r, i; 2627 2628 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2629 struct amdgpu_ring *ring = adev->rings[i]; 2630 2631 /* No need to setup the GPU scheduler for rings that don't need it */ 2632 if (!ring || ring->no_scheduler) 2633 continue; 2634 2635 switch (ring->funcs->type) { 2636 case AMDGPU_RING_TYPE_GFX: 2637 timeout = adev->gfx_timeout; 2638 break; 2639 case AMDGPU_RING_TYPE_COMPUTE: 2640 timeout = adev->compute_timeout; 2641 break; 2642 case AMDGPU_RING_TYPE_SDMA: 2643 timeout = adev->sdma_timeout; 2644 break; 2645 default: 2646 timeout = adev->video_timeout; 2647 break; 2648 } 2649 2650 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, 2651 DRM_SCHED_PRIORITY_COUNT, 2652 ring->num_hw_submission, 0, 2653 timeout, adev->reset_domain->wq, 2654 ring->sched_score, ring->name, 2655 adev->dev); 2656 if (r) { 2657 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2658 ring->name); 2659 return r; 2660 } 2661 r = amdgpu_uvd_entity_init(adev, ring); 2662 if (r) { 2663 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2664 ring->name); 2665 return r; 2666 } 2667 r = amdgpu_vce_entity_init(adev, ring); 2668 if (r) { 2669 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2670 ring->name); 2671 return r; 2672 } 2673 } 2674 2675 amdgpu_xcp_update_partition_sched_list(adev); 2676 2677 return 0; 2678 } 2679 2680 2681 /** 2682 * amdgpu_device_ip_init - run init for hardware IPs 2683 * 2684 * @adev: amdgpu_device pointer 2685 * 2686 * Main initialization pass for hardware IPs. The list of all the hardware 2687 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2688 * are run. sw_init initializes the software state associated with each IP 2689 * and hw_init initializes the hardware associated with each IP. 2690 * Returns 0 on success, negative error code on failure. 
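 *
 * Note on the ordering implemented below: the COMMON and GMC blocks get their
 * hw_init run early, right after their sw_init, so that register access and
 * GPU memory allocation are available to the remaining blocks.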
2691 */ 2692 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2693 { 2694 int i, r; 2695 2696 r = amdgpu_ras_init(adev); 2697 if (r) 2698 return r; 2699 2700 for (i = 0; i < adev->num_ip_blocks; i++) { 2701 if (!adev->ip_blocks[i].status.valid) 2702 continue; 2703 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2704 if (r) { 2705 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2706 adev->ip_blocks[i].version->funcs->name, r); 2707 goto init_failed; 2708 } 2709 adev->ip_blocks[i].status.sw = true; 2710 2711 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2712 /* need to do common hw init early so everything is set up for gmc */ 2713 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2714 if (r) { 2715 DRM_ERROR("hw_init %d failed %d\n", i, r); 2716 goto init_failed; 2717 } 2718 adev->ip_blocks[i].status.hw = true; 2719 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2720 /* need to do gmc hw init early so we can allocate gpu mem */ 2721 /* Try to reserve bad pages early */ 2722 if (amdgpu_sriov_vf(adev)) 2723 amdgpu_virt_exchange_data(adev); 2724 2725 r = amdgpu_device_mem_scratch_init(adev); 2726 if (r) { 2727 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2728 goto init_failed; 2729 } 2730 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2731 if (r) { 2732 DRM_ERROR("hw_init %d failed %d\n", i, r); 2733 goto init_failed; 2734 } 2735 r = amdgpu_device_wb_init(adev); 2736 if (r) { 2737 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2738 goto init_failed; 2739 } 2740 adev->ip_blocks[i].status.hw = true; 2741 2742 /* right after GMC hw init, we create CSA */ 2743 if (adev->gfx.mcbp) { 2744 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2745 AMDGPU_GEM_DOMAIN_VRAM | 2746 AMDGPU_GEM_DOMAIN_GTT, 2747 AMDGPU_CSA_SIZE); 2748 if (r) { 2749 DRM_ERROR("allocate CSA failed %d\n", r); 2750 goto init_failed; 2751 } 2752 } 2753 2754 r = amdgpu_seq64_init(adev); 2755 if (r) { 2756 DRM_ERROR("allocate seq64 failed %d\n", r); 2757 goto init_failed; 2758 } 2759 } 2760 } 2761 2762 if (amdgpu_sriov_vf(adev)) 2763 amdgpu_virt_init_data_exchange(adev); 2764 2765 r = amdgpu_ib_pool_init(adev); 2766 if (r) { 2767 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2768 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2769 goto init_failed; 2770 } 2771 2772 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2773 if (r) 2774 goto init_failed; 2775 2776 r = amdgpu_device_ip_hw_init_phase1(adev); 2777 if (r) 2778 goto init_failed; 2779 2780 r = amdgpu_device_fw_loading(adev); 2781 if (r) 2782 goto init_failed; 2783 2784 r = amdgpu_device_ip_hw_init_phase2(adev); 2785 if (r) 2786 goto init_failed; 2787 2788 /* 2789 * retired pages will be loaded from eeprom and reserved here, 2790 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2791 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2792 * for I2C communication which only true at this point. 2793 * 2794 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2795 * failure from bad gpu situation and stop amdgpu init process 2796 * accordingly. For other failed cases, it will still release all 2797 * the resource and print error message, rather than returning one 2798 * negative value to upper level. 
2799 *
2800 * Note: theoretically, this should be called before all vram allocations
2801 * to protect retired pages from being reused
2802 */
2803 r = amdgpu_ras_recovery_init(adev);
2804 if (r)
2805 goto init_failed;
2806
2807 /*
2808 * In case of XGMI, grab an extra reference on the reset domain for this device
2809 */
2810 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2811 if (amdgpu_xgmi_add_device(adev) == 0) {
2812 if (!amdgpu_sriov_vf(adev)) {
2813 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2814
2815 if (WARN_ON(!hive)) {
2816 r = -ENOENT;
2817 goto init_failed;
2818 }
2819
2820 if (!hive->reset_domain ||
2821 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2822 r = -ENOENT;
2823 amdgpu_put_xgmi_hive(hive);
2824 goto init_failed;
2825 }
2826
2827 /* Drop the early temporary reset domain we created for device */
2828 amdgpu_reset_put_reset_domain(adev->reset_domain);
2829 adev->reset_domain = hive->reset_domain;
2830 amdgpu_put_xgmi_hive(hive);
2831 }
2832 }
2833 }
2834
2835 r = amdgpu_device_init_schedulers(adev);
2836 if (r)
2837 goto init_failed;
2838
2839 if (adev->mman.buffer_funcs_ring->sched.ready)
2840 amdgpu_ttm_set_buffer_funcs_status(adev, true);
2841
2842 /* Don't init kfd if the whole hive needs to be reset during init */
2843 if (!adev->gmc.xgmi.pending_reset) {
2844 kgd2kfd_init_zone_device(adev);
2845 amdgpu_amdkfd_device_init(adev);
2846 }
2847
2848 amdgpu_fru_get_product_info(adev);
2849
2850 init_failed:
2851
2852 return r;
2853 }
2854
2855 /**
2856 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2857 *
2858 * @adev: amdgpu_device pointer
2859 *
2860 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2861 * this function before a GPU reset. If the value is retained after a
2862 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2863 */
2864 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2865 {
2866 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2867 }
2868
2869 /**
2870 * amdgpu_device_check_vram_lost - check if vram is valid
2871 *
2872 * @adev: amdgpu_device pointer
2873 *
2874 * Checks the reset magic value written to the gart pointer in VRAM.
2875 * The driver calls this after a GPU reset to see if the contents of
2876 * VRAM are lost or not.
2877 * Returns true if vram is lost, false if not.
2878 */
2879 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2880 {
2881 if (memcmp(adev->gart.ptr, adev->reset_magic,
2882 AMDGPU_RESET_MAGIC_NUM))
2883 return true;
2884
2885 if (!amdgpu_in_reset(adev))
2886 return false;
2887
2888 /*
2889 * For all ASICs with baco/mode1 reset, the VRAM is
2890 * always assumed to be lost.
2891 */
2892 switch (amdgpu_asic_reset_method(adev)) {
2893 case AMD_RESET_METHOD_BACO:
2894 case AMD_RESET_METHOD_MODE1:
2895 return true;
2896 default:
2897 return false;
2898 }
2899 }
2900
2901 /**
2902 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2903 *
2904 * @adev: amdgpu_device pointer
2905 * @state: clockgating state (gate or ungate)
2906 *
2907 * The list of all the hardware IPs that make up the asic is walked and the
2908 * set_clockgating_state callbacks are run.
2909 * The late initialization pass enables clockgating for the hardware IPs;
2910 * the fini or suspend pass disables it.
2911 * Returns 0 on success, negative error code on failure.
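 *
 * Illustrative callers elsewhere in this file:
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);    (late init)
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);  (fini/suspend)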
2912 */ 2913 2914 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2915 enum amd_clockgating_state state) 2916 { 2917 int i, j, r; 2918 2919 if (amdgpu_emu_mode == 1) 2920 return 0; 2921 2922 for (j = 0; j < adev->num_ip_blocks; j++) { 2923 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2924 if (!adev->ip_blocks[i].status.late_initialized) 2925 continue; 2926 /* skip CG for GFX, SDMA on S0ix */ 2927 if (adev->in_s0ix && 2928 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2929 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2930 continue; 2931 /* skip CG for VCE/UVD, it's handled specially */ 2932 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2933 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2934 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2935 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2936 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2937 /* enable clockgating to save power */ 2938 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2939 state); 2940 if (r) { 2941 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2942 adev->ip_blocks[i].version->funcs->name, r); 2943 return r; 2944 } 2945 } 2946 } 2947 2948 return 0; 2949 } 2950 2951 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2952 enum amd_powergating_state state) 2953 { 2954 int i, j, r; 2955 2956 if (amdgpu_emu_mode == 1) 2957 return 0; 2958 2959 for (j = 0; j < adev->num_ip_blocks; j++) { 2960 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2961 if (!adev->ip_blocks[i].status.late_initialized) 2962 continue; 2963 /* skip PG for GFX, SDMA on S0ix */ 2964 if (adev->in_s0ix && 2965 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2966 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2967 continue; 2968 /* skip CG for VCE/UVD, it's handled specially */ 2969 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2970 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2971 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2972 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2973 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2974 /* enable powergating to save power */ 2975 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2976 state); 2977 if (r) { 2978 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2979 adev->ip_blocks[i].version->funcs->name, r); 2980 return r; 2981 } 2982 } 2983 } 2984 return 0; 2985 } 2986 2987 static int amdgpu_device_enable_mgpu_fan_boost(void) 2988 { 2989 struct amdgpu_gpu_instance *gpu_ins; 2990 struct amdgpu_device *adev; 2991 int i, ret = 0; 2992 2993 mutex_lock(&mgpu_info.mutex); 2994 2995 /* 2996 * MGPU fan boost feature should be enabled 2997 * only when there are two or more dGPUs in 2998 * the system 2999 */ 3000 if (mgpu_info.num_dgpu < 2) 3001 goto out; 3002 3003 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3004 gpu_ins = &(mgpu_info.gpu_ins[i]); 3005 adev = gpu_ins->adev; 3006 if (!(adev->flags & AMD_IS_APU) && 3007 !gpu_ins->mgpu_fan_enabled) { 3008 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3009 if (ret) 3010 break; 3011 3012 gpu_ins->mgpu_fan_enabled = 1; 3013 } 3014 } 3015 3016 out: 3017 mutex_unlock(&mgpu_info.mutex); 3018 3019 return ret; 3020 } 3021 3022 /** 3023 * amdgpu_device_ip_late_init - run late init for hardware IPs 3024 * 3025 * @adev: 
amdgpu_device pointer
3026 *
3027 * Late initialization pass for hardware IPs. The list of all the hardware
3028 * IPs that make up the asic is walked and the late_init callbacks are run.
3029 * late_init covers any special initialization that an IP requires
3030 * after all of the IP blocks have been initialized or something that needs
3031 * to happen late in the init process.
3032 * Returns 0 on success, negative error code on failure.
3033 */
3034 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
3035 {
3036 struct amdgpu_gpu_instance *gpu_instance;
3037 int i = 0, r;
3038
3039 for (i = 0; i < adev->num_ip_blocks; i++) {
3040 if (!adev->ip_blocks[i].status.hw)
3041 continue;
3042 if (adev->ip_blocks[i].version->funcs->late_init) {
3043 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
3044 if (r) {
3045 DRM_ERROR("late_init of IP block <%s> failed %d\n",
3046 adev->ip_blocks[i].version->funcs->name, r);
3047 return r;
3048 }
3049 }
3050 adev->ip_blocks[i].status.late_initialized = true;
3051 }
3052
3053 r = amdgpu_ras_late_init(adev);
3054 if (r) {
3055 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
3056 return r;
3057 }
3058
3059 amdgpu_ras_set_error_query_ready(adev, true);
3060
3061 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
3062 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
3063
3064 amdgpu_device_fill_reset_magic(adev);
3065
3066 r = amdgpu_device_enable_mgpu_fan_boost();
3067 if (r)
3068 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
3069
3070 /* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */
3071 if (amdgpu_passthrough(adev) &&
3072 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
3073 adev->asic_type == CHIP_ALDEBARAN))
3074 amdgpu_dpm_handle_passthrough_sbr(adev, true);
3075
3076 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3077 mutex_lock(&mgpu_info.mutex);
3078
3079 /*
3080 * Reset device p-state to low as this was booted with high.
3081 *
3082 * This should be performed only after all devices from the same
3083 * hive get initialized.
3084 *
3085 * However, the number of devices in a hive is not known in advance;
3086 * it is counted one by one as the devices are initialized.
3087 *
3088 * So, we wait for all XGMI interlinked devices to be initialized.
3089 * This may bring some delays as those devices may come from
3090 * different hives. But that should be OK.
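 *
 * Concretely (as checked below), the p-state of every dGPU in the hive is
 * lowered only once mgpu_info.num_dgpu matches the number of XGMI physical
 * nodes reported for this device.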
3091 */ 3092 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3093 for (i = 0; i < mgpu_info.num_gpu; i++) { 3094 gpu_instance = &(mgpu_info.gpu_ins[i]); 3095 if (gpu_instance->adev->flags & AMD_IS_APU) 3096 continue; 3097 3098 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3099 AMDGPU_XGMI_PSTATE_MIN); 3100 if (r) { 3101 DRM_ERROR("pstate setting failed (%d).\n", r); 3102 break; 3103 } 3104 } 3105 } 3106 3107 mutex_unlock(&mgpu_info.mutex); 3108 } 3109 3110 return 0; 3111 } 3112 3113 /** 3114 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3115 * 3116 * @adev: amdgpu_device pointer 3117 * 3118 * For ASICs need to disable SMC first 3119 */ 3120 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3121 { 3122 int i, r; 3123 3124 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3125 return; 3126 3127 for (i = 0; i < adev->num_ip_blocks; i++) { 3128 if (!adev->ip_blocks[i].status.hw) 3129 continue; 3130 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3131 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3132 /* XXX handle errors */ 3133 if (r) { 3134 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3135 adev->ip_blocks[i].version->funcs->name, r); 3136 } 3137 adev->ip_blocks[i].status.hw = false; 3138 break; 3139 } 3140 } 3141 } 3142 3143 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3144 { 3145 int i, r; 3146 3147 for (i = 0; i < adev->num_ip_blocks; i++) { 3148 if (!adev->ip_blocks[i].version->funcs->early_fini) 3149 continue; 3150 3151 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 3152 if (r) { 3153 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3154 adev->ip_blocks[i].version->funcs->name, r); 3155 } 3156 } 3157 3158 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3159 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3160 3161 amdgpu_amdkfd_suspend(adev, false); 3162 3163 /* Workaroud for ASICs need to disable SMC first */ 3164 amdgpu_device_smu_fini_early(adev); 3165 3166 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3167 if (!adev->ip_blocks[i].status.hw) 3168 continue; 3169 3170 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3171 /* XXX handle errors */ 3172 if (r) { 3173 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3174 adev->ip_blocks[i].version->funcs->name, r); 3175 } 3176 3177 adev->ip_blocks[i].status.hw = false; 3178 } 3179 3180 if (amdgpu_sriov_vf(adev)) { 3181 if (amdgpu_virt_release_full_gpu(adev, false)) 3182 DRM_ERROR("failed to release exclusive mode on fini\n"); 3183 } 3184 3185 return 0; 3186 } 3187 3188 /** 3189 * amdgpu_device_ip_fini - run fini for hardware IPs 3190 * 3191 * @adev: amdgpu_device pointer 3192 * 3193 * Main teardown pass for hardware IPs. The list of all the hardware 3194 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3195 * are run. hw_fini tears down the hardware associated with each IP 3196 * and sw_fini tears down any software state associated with each IP. 3197 * Returns 0 on success, negative error code on failure. 
3198 */ 3199 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3200 { 3201 int i, r; 3202 3203 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3204 amdgpu_virt_release_ras_err_handler_data(adev); 3205 3206 if (adev->gmc.xgmi.num_physical_nodes > 1) 3207 amdgpu_xgmi_remove_device(adev); 3208 3209 amdgpu_amdkfd_device_fini_sw(adev); 3210 3211 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3212 if (!adev->ip_blocks[i].status.sw) 3213 continue; 3214 3215 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3216 amdgpu_ucode_free_bo(adev); 3217 amdgpu_free_static_csa(&adev->virt.csa_obj); 3218 amdgpu_device_wb_fini(adev); 3219 amdgpu_device_mem_scratch_fini(adev); 3220 amdgpu_ib_pool_fini(adev); 3221 amdgpu_seq64_fini(adev); 3222 } 3223 3224 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 3225 /* XXX handle errors */ 3226 if (r) { 3227 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3228 adev->ip_blocks[i].version->funcs->name, r); 3229 } 3230 adev->ip_blocks[i].status.sw = false; 3231 adev->ip_blocks[i].status.valid = false; 3232 } 3233 3234 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3235 if (!adev->ip_blocks[i].status.late_initialized) 3236 continue; 3237 if (adev->ip_blocks[i].version->funcs->late_fini) 3238 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 3239 adev->ip_blocks[i].status.late_initialized = false; 3240 } 3241 3242 amdgpu_ras_fini(adev); 3243 3244 return 0; 3245 } 3246 3247 /** 3248 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3249 * 3250 * @work: work_struct. 3251 */ 3252 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3253 { 3254 struct amdgpu_device *adev = 3255 container_of(work, struct amdgpu_device, delayed_init_work.work); 3256 int r; 3257 3258 r = amdgpu_ib_ring_tests(adev); 3259 if (r) 3260 DRM_ERROR("ib ring test failed (%d).\n", r); 3261 } 3262 3263 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3264 { 3265 struct amdgpu_device *adev = 3266 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3267 3268 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3269 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3270 3271 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3272 adev->gfx.gfx_off_state = true; 3273 } 3274 3275 /** 3276 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3277 * 3278 * @adev: amdgpu_device pointer 3279 * 3280 * Main suspend function for hardware IPs. The list of all the hardware 3281 * IPs that make up the asic is walked, clockgating is disabled and the 3282 * suspend callbacks are run. suspend puts the hardware and software state 3283 * in each IP into a state suitable for suspend. 3284 * Returns 0 on success, negative error code on failure. 3285 */ 3286 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3287 { 3288 int i, r; 3289 3290 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3291 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3292 3293 /* 3294 * Per PMFW team's suggestion, driver needs to handle gfxoff 3295 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3296 * scenario. Add the missing df cstate disablement here. 
3297 */ 3298 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3299 dev_warn(adev->dev, "Failed to disallow df cstate"); 3300 3301 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3302 if (!adev->ip_blocks[i].status.valid) 3303 continue; 3304 3305 /* displays are handled separately */ 3306 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3307 continue; 3308 3309 /* XXX handle errors */ 3310 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3311 /* XXX handle errors */ 3312 if (r) { 3313 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3314 adev->ip_blocks[i].version->funcs->name, r); 3315 return r; 3316 } 3317 3318 adev->ip_blocks[i].status.hw = false; 3319 } 3320 3321 return 0; 3322 } 3323 3324 /** 3325 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3326 * 3327 * @adev: amdgpu_device pointer 3328 * 3329 * Main suspend function for hardware IPs. The list of all the hardware 3330 * IPs that make up the asic is walked, clockgating is disabled and the 3331 * suspend callbacks are run. suspend puts the hardware and software state 3332 * in each IP into a state suitable for suspend. 3333 * Returns 0 on success, negative error code on failure. 3334 */ 3335 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3336 { 3337 int i, r; 3338 3339 if (adev->in_s0ix) 3340 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3341 3342 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3343 if (!adev->ip_blocks[i].status.valid) 3344 continue; 3345 /* displays are handled in phase1 */ 3346 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3347 continue; 3348 /* PSP lost connection when err_event_athub occurs */ 3349 if (amdgpu_ras_intr_triggered() && 3350 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3351 adev->ip_blocks[i].status.hw = false; 3352 continue; 3353 } 3354 3355 /* skip unnecessary suspend if we do not initialize them yet */ 3356 if (adev->gmc.xgmi.pending_reset && 3357 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3358 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3359 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3360 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3361 adev->ip_blocks[i].status.hw = false; 3362 continue; 3363 } 3364 3365 /* skip suspend of gfx/mes and psp for S0ix 3366 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3367 * like at runtime. PSP is also part of the always on hardware 3368 * so no need to suspend it. 3369 */ 3370 if (adev->in_s0ix && 3371 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3372 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3373 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3374 continue; 3375 3376 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3377 if (adev->in_s0ix && 3378 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3379 IP_VERSION(5, 0, 0)) && 3380 (adev->ip_blocks[i].version->type == 3381 AMD_IP_BLOCK_TYPE_SDMA)) 3382 continue; 3383 3384 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3385 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3386 * from this location and RLC Autoload automatically also gets loaded 3387 * from here based on PMFW -> PSP message during re-init sequence. 3388 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3389 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3390 */ 3391 if (amdgpu_in_reset(adev) && 3392 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3393 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3394 continue; 3395 3396 /* XXX handle errors */ 3397 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3398 /* XXX handle errors */ 3399 if (r) { 3400 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3401 adev->ip_blocks[i].version->funcs->name, r); 3402 } 3403 adev->ip_blocks[i].status.hw = false; 3404 /* handle putting the SMC in the appropriate state */ 3405 if (!amdgpu_sriov_vf(adev)) { 3406 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3407 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3408 if (r) { 3409 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3410 adev->mp1_state, r); 3411 return r; 3412 } 3413 } 3414 } 3415 } 3416 3417 return 0; 3418 } 3419 3420 /** 3421 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3422 * 3423 * @adev: amdgpu_device pointer 3424 * 3425 * Main suspend function for hardware IPs. The list of all the hardware 3426 * IPs that make up the asic is walked, clockgating is disabled and the 3427 * suspend callbacks are run. suspend puts the hardware and software state 3428 * in each IP into a state suitable for suspend. 3429 * Returns 0 on success, negative error code on failure. 3430 */ 3431 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3432 { 3433 int r; 3434 3435 if (amdgpu_sriov_vf(adev)) { 3436 amdgpu_virt_fini_data_exchange(adev); 3437 amdgpu_virt_request_full_gpu(adev, false); 3438 } 3439 3440 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3441 3442 r = amdgpu_device_ip_suspend_phase1(adev); 3443 if (r) 3444 return r; 3445 r = amdgpu_device_ip_suspend_phase2(adev); 3446 3447 if (amdgpu_sriov_vf(adev)) 3448 amdgpu_virt_release_full_gpu(adev, false); 3449 3450 return r; 3451 } 3452 3453 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3454 { 3455 int i, r; 3456 3457 static enum amd_ip_block_type ip_order[] = { 3458 AMD_IP_BLOCK_TYPE_COMMON, 3459 AMD_IP_BLOCK_TYPE_GMC, 3460 AMD_IP_BLOCK_TYPE_PSP, 3461 AMD_IP_BLOCK_TYPE_IH, 3462 }; 3463 3464 for (i = 0; i < adev->num_ip_blocks; i++) { 3465 int j; 3466 struct amdgpu_ip_block *block; 3467 3468 block = &adev->ip_blocks[i]; 3469 block->status.hw = false; 3470 3471 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3472 3473 if (block->version->type != ip_order[j] || 3474 !block->status.valid) 3475 continue; 3476 3477 r = block->version->funcs->hw_init(adev); 3478 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3479 if (r) 3480 return r; 3481 block->status.hw = true; 3482 } 3483 } 3484 3485 return 0; 3486 } 3487 3488 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3489 { 3490 int i, r; 3491 3492 static enum amd_ip_block_type ip_order[] = { 3493 AMD_IP_BLOCK_TYPE_SMC, 3494 AMD_IP_BLOCK_TYPE_DCE, 3495 AMD_IP_BLOCK_TYPE_GFX, 3496 AMD_IP_BLOCK_TYPE_SDMA, 3497 AMD_IP_BLOCK_TYPE_MES, 3498 AMD_IP_BLOCK_TYPE_UVD, 3499 AMD_IP_BLOCK_TYPE_VCE, 3500 AMD_IP_BLOCK_TYPE_VCN, 3501 AMD_IP_BLOCK_TYPE_JPEG 3502 }; 3503 3504 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3505 int j; 3506 struct amdgpu_ip_block *block; 3507 3508 for (j = 0; j < adev->num_ip_blocks; j++) { 3509 block = &adev->ip_blocks[j]; 3510 3511 if (block->version->type != ip_order[i] || 3512 !block->status.valid || 3513 block->status.hw) 3514 continue; 3515 3516 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3517 r = block->version->funcs->resume(adev); 3518 else 
3519 r = block->version->funcs->hw_init(adev);
3520
3521 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3522 if (r)
3523 return r;
3524 block->status.hw = true;
3525 }
3526 }
3527
3528 return 0;
3529 }
3530
3531 /**
3532 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3533 *
3534 * @adev: amdgpu_device pointer
3535 *
3536 * First resume function for hardware IPs. The list of all the hardware
3537 * IPs that make up the asic is walked and the resume callbacks are run for
3538 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3539 * after a suspend and updates the software state as necessary. This
3540 * function is also used for restoring the GPU after a GPU reset.
3541 * Returns 0 on success, negative error code on failure.
3542 */
3543 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3544 {
3545 int i, r;
3546
3547 for (i = 0; i < adev->num_ip_blocks; i++) {
3548 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3549 continue;
3550 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3551 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3552 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3553 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3554
3555 r = adev->ip_blocks[i].version->funcs->resume(adev);
3556 if (r) {
3557 DRM_ERROR("resume of IP block <%s> failed %d\n",
3558 adev->ip_blocks[i].version->funcs->name, r);
3559 return r;
3560 }
3561 adev->ip_blocks[i].status.hw = true;
3562 }
3563 }
3564
3565 return 0;
3566 }
3567
3568 /**
3569 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3570 *
3571 * @adev: amdgpu_device pointer
3572 *
3573 * Second resume function for hardware IPs. The list of all the hardware
3574 * IPs that make up the asic is walked and the resume callbacks are run for
3575 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3576 * functional state after a suspend and updates the software state as
3577 * necessary. This function is also used for restoring the GPU after a GPU
3578 * reset.
3579 * Returns 0 on success, negative error code on failure.
3580 */
3581 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3582 {
3583 int i, r;
3584
3585 for (i = 0; i < adev->num_ip_blocks; i++) {
3586 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3587 continue;
3588 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3589 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3590 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3591 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3592 continue;
3593 r = adev->ip_blocks[i].version->funcs->resume(adev);
3594 if (r) {
3595 DRM_ERROR("resume of IP block <%s> failed %d\n",
3596 adev->ip_blocks[i].version->funcs->name, r);
3597 return r;
3598 }
3599 adev->ip_blocks[i].status.hw = true;
3600 }
3601
3602 return 0;
3603 }
3604
3605 /**
3606 * amdgpu_device_ip_resume - run resume for hardware IPs
3607 *
3608 * @adev: amdgpu_device pointer
3609 *
3610 * Main resume function for hardware IPs. The hardware IPs
3611 * are split into two resume functions because they are
3612 * also used in recovering from a GPU reset and some additional
3613 * steps need to be taken between them. In this case (S3/S4) they are
3614 * run sequentially.
3615 * Returns 0 on success, negative error code on failure.
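 *
 * Sketch of the sequence run below: resume phase1 (COMMON, GMC, IH and, on
 * SR-IOV, PSP), then firmware loading, then resume phase2 for the remaining
 * blocks.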
3616 */ 3617 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3618 { 3619 int r; 3620 3621 r = amdgpu_device_ip_resume_phase1(adev); 3622 if (r) 3623 return r; 3624 3625 r = amdgpu_device_fw_loading(adev); 3626 if (r) 3627 return r; 3628 3629 r = amdgpu_device_ip_resume_phase2(adev); 3630 3631 if (adev->mman.buffer_funcs_ring->sched.ready) 3632 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3633 3634 return r; 3635 } 3636 3637 /** 3638 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3639 * 3640 * @adev: amdgpu_device pointer 3641 * 3642 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3643 */ 3644 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3645 { 3646 if (amdgpu_sriov_vf(adev)) { 3647 if (adev->is_atom_fw) { 3648 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3649 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3650 } else { 3651 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3652 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3653 } 3654 3655 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3656 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3657 } 3658 } 3659 3660 /** 3661 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3662 * 3663 * @asic_type: AMD asic type 3664 * 3665 * Check if there is DC (new modesetting infrastructre) support for an asic. 3666 * returns true if DC has support, false if not. 3667 */ 3668 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3669 { 3670 switch (asic_type) { 3671 #ifdef CONFIG_DRM_AMDGPU_SI 3672 case CHIP_HAINAN: 3673 #endif 3674 case CHIP_TOPAZ: 3675 /* chips with no display hardware */ 3676 return false; 3677 #if defined(CONFIG_DRM_AMD_DC) 3678 case CHIP_TAHITI: 3679 case CHIP_PITCAIRN: 3680 case CHIP_VERDE: 3681 case CHIP_OLAND: 3682 /* 3683 * We have systems in the wild with these ASICs that require 3684 * LVDS and VGA support which is not supported with DC. 3685 * 3686 * Fallback to the non-DC driver here by default so as not to 3687 * cause regressions. 3688 */ 3689 #if defined(CONFIG_DRM_AMD_DC_SI) 3690 return amdgpu_dc > 0; 3691 #else 3692 return false; 3693 #endif 3694 case CHIP_BONAIRE: 3695 case CHIP_KAVERI: 3696 case CHIP_KABINI: 3697 case CHIP_MULLINS: 3698 /* 3699 * We have systems in the wild with these ASICs that require 3700 * VGA support which is not supported with DC. 3701 * 3702 * Fallback to the non-DC driver here by default so as not to 3703 * cause regressions. 
3704 */ 3705 return amdgpu_dc > 0; 3706 default: 3707 return amdgpu_dc != 0; 3708 #else 3709 default: 3710 if (amdgpu_dc > 0) 3711 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3712 return false; 3713 #endif 3714 } 3715 } 3716 3717 /** 3718 * amdgpu_device_has_dc_support - check if dc is supported 3719 * 3720 * @adev: amdgpu_device pointer 3721 * 3722 * Returns true for supported, false for not supported 3723 */ 3724 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3725 { 3726 if (adev->enable_virtual_display || 3727 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3728 return false; 3729 3730 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3731 } 3732 3733 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3734 { 3735 struct amdgpu_device *adev = 3736 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3737 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3738 3739 /* It's a bug to not have a hive within this function */ 3740 if (WARN_ON(!hive)) 3741 return; 3742 3743 /* 3744 * Use task barrier to synchronize all xgmi reset works across the 3745 * hive. task_barrier_enter and task_barrier_exit will block 3746 * until all the threads running the xgmi reset works reach 3747 * those points. task_barrier_full will do both blocks. 3748 */ 3749 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3750 3751 task_barrier_enter(&hive->tb); 3752 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3753 3754 if (adev->asic_reset_res) 3755 goto fail; 3756 3757 task_barrier_exit(&hive->tb); 3758 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3759 3760 if (adev->asic_reset_res) 3761 goto fail; 3762 3763 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3764 } else { 3765 3766 task_barrier_full(&hive->tb); 3767 adev->asic_reset_res = amdgpu_asic_reset(adev); 3768 } 3769 3770 fail: 3771 if (adev->asic_reset_res) 3772 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3773 adev->asic_reset_res, adev_to_drm(adev)->unique); 3774 amdgpu_put_xgmi_hive(hive); 3775 } 3776 3777 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3778 { 3779 char *input = amdgpu_lockup_timeout; 3780 char *timeout_setting = NULL; 3781 int index = 0; 3782 long timeout; 3783 int ret = 0; 3784 3785 /* 3786 * By default timeout for non compute jobs is 10000 3787 * and 60000 for compute jobs. 3788 * In SR-IOV or passthrough mode, timeout for compute 3789 * jobs are 60000 by default. 3790 */ 3791 adev->gfx_timeout = msecs_to_jiffies(10000); 3792 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3793 if (amdgpu_sriov_vf(adev)) 3794 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3795 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3796 else 3797 adev->compute_timeout = msecs_to_jiffies(60000); 3798 3799 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3800 while ((timeout_setting = strsep(&input, ",")) && 3801 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3802 ret = kstrtol(timeout_setting, 0, &timeout); 3803 if (ret) 3804 return ret; 3805 3806 if (timeout == 0) { 3807 index++; 3808 continue; 3809 } else if (timeout < 0) { 3810 timeout = MAX_SCHEDULE_TIMEOUT; 3811 dev_warn(adev->dev, "lockup timeout disabled"); 3812 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3813 } else { 3814 timeout = msecs_to_jiffies(timeout); 3815 } 3816 3817 switch (index++) { 3818 case 0: 3819 adev->gfx_timeout = timeout; 3820 break; 3821 case 1: 3822 adev->compute_timeout = timeout; 3823 break; 3824 case 2: 3825 adev->sdma_timeout = timeout; 3826 break; 3827 case 3: 3828 adev->video_timeout = timeout; 3829 break; 3830 default: 3831 break; 3832 } 3833 } 3834 /* 3835 * There is only one value specified and 3836 * it should apply to all non-compute jobs. 3837 */ 3838 if (index == 1) { 3839 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3840 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3841 adev->compute_timeout = adev->gfx_timeout; 3842 } 3843 } 3844 3845 return ret; 3846 } 3847 3848 /** 3849 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3850 * 3851 * @adev: amdgpu_device pointer 3852 * 3853 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3854 */ 3855 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3856 { 3857 struct iommu_domain *domain; 3858 3859 domain = iommu_get_domain_for_dev(adev->dev); 3860 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3861 adev->ram_is_direct_mapped = true; 3862 } 3863 3864 static const struct attribute *amdgpu_dev_attributes[] = { 3865 &dev_attr_pcie_replay_count.attr, 3866 NULL 3867 }; 3868 3869 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3870 { 3871 if (amdgpu_mcbp == 1) 3872 adev->gfx.mcbp = true; 3873 else if (amdgpu_mcbp == 0) 3874 adev->gfx.mcbp = false; 3875 3876 if (amdgpu_sriov_vf(adev)) 3877 adev->gfx.mcbp = true; 3878 3879 if (adev->gfx.mcbp) 3880 DRM_INFO("MCBP is enabled\n"); 3881 } 3882 3883 /** 3884 * amdgpu_device_init - initialize the driver 3885 * 3886 * @adev: amdgpu_device pointer 3887 * @flags: driver flags 3888 * 3889 * Initializes the driver info and hw (all asics). 3890 * Returns 0 for success or an error on failure. 3891 * Called at driver startup. 
3892 */ 3893 int amdgpu_device_init(struct amdgpu_device *adev, 3894 uint32_t flags) 3895 { 3896 struct drm_device *ddev = adev_to_drm(adev); 3897 struct pci_dev *pdev = adev->pdev; 3898 int r, i; 3899 bool px = false; 3900 u32 max_MBps; 3901 int tmp; 3902 3903 adev->shutdown = false; 3904 adev->flags = flags; 3905 3906 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3907 adev->asic_type = amdgpu_force_asic_type; 3908 else 3909 adev->asic_type = flags & AMD_ASIC_MASK; 3910 3911 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3912 if (amdgpu_emu_mode == 1) 3913 adev->usec_timeout *= 10; 3914 adev->gmc.gart_size = 512 * 1024 * 1024; 3915 adev->accel_working = false; 3916 adev->num_rings = 0; 3917 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3918 adev->mman.buffer_funcs = NULL; 3919 adev->mman.buffer_funcs_ring = NULL; 3920 adev->vm_manager.vm_pte_funcs = NULL; 3921 adev->vm_manager.vm_pte_num_scheds = 0; 3922 adev->gmc.gmc_funcs = NULL; 3923 adev->harvest_ip_mask = 0x0; 3924 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3925 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3926 3927 adev->smc_rreg = &amdgpu_invalid_rreg; 3928 adev->smc_wreg = &amdgpu_invalid_wreg; 3929 adev->pcie_rreg = &amdgpu_invalid_rreg; 3930 adev->pcie_wreg = &amdgpu_invalid_wreg; 3931 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3932 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3933 adev->pciep_rreg = &amdgpu_invalid_rreg; 3934 adev->pciep_wreg = &amdgpu_invalid_wreg; 3935 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3936 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3937 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 3938 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 3939 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3940 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3941 adev->didt_rreg = &amdgpu_invalid_rreg; 3942 adev->didt_wreg = &amdgpu_invalid_wreg; 3943 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3944 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3945 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3946 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3947 3948 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3949 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3950 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3951 3952 /* mutex initialization are all done here so we 3953 * can recall function without having locking issues 3954 */ 3955 mutex_init(&adev->firmware.mutex); 3956 mutex_init(&adev->pm.mutex); 3957 mutex_init(&adev->gfx.gpu_clock_mutex); 3958 mutex_init(&adev->srbm_mutex); 3959 mutex_init(&adev->gfx.pipe_reserve_mutex); 3960 mutex_init(&adev->gfx.gfx_off_mutex); 3961 mutex_init(&adev->gfx.partition_mutex); 3962 mutex_init(&adev->grbm_idx_mutex); 3963 mutex_init(&adev->mn_lock); 3964 mutex_init(&adev->virt.vf_errors.lock); 3965 hash_init(adev->mn_hash); 3966 mutex_init(&adev->psp.mutex); 3967 mutex_init(&adev->notifier_lock); 3968 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3969 mutex_init(&adev->benchmark_mutex); 3970 3971 amdgpu_device_init_apu_flags(adev); 3972 3973 r = amdgpu_device_check_arguments(adev); 3974 if (r) 3975 return r; 3976 3977 spin_lock_init(&adev->mmio_idx_lock); 3978 spin_lock_init(&adev->smc_idx_lock); 3979 spin_lock_init(&adev->pcie_idx_lock); 3980 spin_lock_init(&adev->uvd_ctx_idx_lock); 3981 spin_lock_init(&adev->didt_idx_lock); 3982 spin_lock_init(&adev->gc_cac_idx_lock); 3983 spin_lock_init(&adev->se_cac_idx_lock); 
3984 spin_lock_init(&adev->audio_endpt_idx_lock); 3985 spin_lock_init(&adev->mm_stats.lock); 3986 3987 INIT_LIST_HEAD(&adev->shadow_list); 3988 mutex_init(&adev->shadow_list_lock); 3989 3990 INIT_LIST_HEAD(&adev->reset_list); 3991 3992 INIT_LIST_HEAD(&adev->ras_list); 3993 3994 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 3995 3996 INIT_DELAYED_WORK(&adev->delayed_init_work, 3997 amdgpu_device_delayed_init_work_handler); 3998 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3999 amdgpu_device_delay_enable_gfx_off); 4000 4001 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4002 4003 adev->gfx.gfx_off_req_count = 1; 4004 adev->gfx.gfx_off_residency = 0; 4005 adev->gfx.gfx_off_entrycount = 0; 4006 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4007 4008 atomic_set(&adev->throttling_logging_enabled, 1); 4009 /* 4010 * If throttling continues, logging will be performed every minute 4011 * to avoid log flooding. "-1" is subtracted since the thermal 4012 * throttling interrupt comes every second. Thus, the total logging 4013 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 4014 * for the throttling interrupt) = 60 seconds. 4015 */ 4016 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4017 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4018 4019 /* Registers mapping */ 4020 /* TODO: block userspace mapping of io register */ 4021 if (adev->asic_type >= CHIP_BONAIRE) { 4022 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4023 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4024 } else { 4025 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4026 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4027 } 4028 4029 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4030 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4031 4032 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4033 if (!adev->rmmio) 4034 return -ENOMEM; 4035 4036 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 4037 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 4038 4039 /* 4040 * The reset domain needs to be present early, before the XGMI hive is 4041 * discovered (if any) and initialized, so that the reset sem and in_gpu_reset 4042 * flag can be used early during init and before calling RREG32.
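* For boards that later join an XGMI hive, this single-device domain is typically dropped in favor of the hive-wide reset domain once the hive is discovered.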
4043 */ 4044 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4045 if (!adev->reset_domain) 4046 return -ENOMEM; 4047 4048 /* detect hw virtualization here */ 4049 amdgpu_detect_virtualization(adev); 4050 4051 amdgpu_device_get_pcie_info(adev); 4052 4053 r = amdgpu_device_get_job_timeout_settings(adev); 4054 if (r) { 4055 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4056 return r; 4057 } 4058 4059 amdgpu_device_set_mcbp(adev); 4060 4061 /* early init functions */ 4062 r = amdgpu_device_ip_early_init(adev); 4063 if (r) 4064 return r; 4065 4066 /* Get rid of things like offb */ 4067 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 4068 if (r) 4069 return r; 4070 4071 /* Enable TMZ based on IP_VERSION */ 4072 amdgpu_gmc_tmz_set(adev); 4073 4074 amdgpu_gmc_noretry_set(adev); 4075 /* Need to get xgmi info early to decide the reset behavior */ 4076 if (adev->gmc.xgmi.supported) { 4077 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4078 if (r) 4079 return r; 4080 } 4081 4082 /* enable PCIE atomic ops */ 4083 if (amdgpu_sriov_vf(adev)) { 4084 if (adev->virt.fw_reserve.p_pf2vf) 4085 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4086 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4087 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4088 /* APUs with gfx9 onwards don't rely on PCIe atomics; their internal 4089 * path natively supports atomics, so set have_atomics_support to true. 4090 */ 4091 } else if ((adev->flags & AMD_IS_APU) && 4092 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4093 IP_VERSION(9, 0, 0))) { 4094 adev->have_atomics_support = true; 4095 } else { 4096 adev->have_atomics_support = 4097 !pci_enable_atomic_ops_to_root(adev->pdev, 4098 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4099 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4100 } 4101 4102 if (!adev->have_atomics_support) 4103 dev_info(adev->dev, "PCIE atomic ops are not supported\n"); 4104 4105 /* doorbell bar mapping and doorbell index init */ 4106 amdgpu_doorbell_init(adev); 4107 4108 if (amdgpu_emu_mode == 1) { 4109 /* post the asic on emulation mode */ 4110 emu_soc_asic_init(adev); 4111 goto fence_driver_init; 4112 } 4113 4114 amdgpu_reset_init(adev); 4115 4116 /* detect if we are running with an SRIOV vbios */ 4117 if (adev->bios) 4118 amdgpu_device_detect_sriov_bios(adev); 4119 4120 /* check if we need to reset the asic 4121 * E.g., driver was not cleanly unloaded previously, etc. 4122 */ 4123 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4124 if (adev->gmc.xgmi.num_physical_nodes) { 4125 dev_info(adev->dev, "Pending hive reset.\n"); 4126 adev->gmc.xgmi.pending_reset = true; 4127 /* Only need to init the necessary blocks for SMU to handle the reset */ 4128 for (i = 0; i < adev->num_ip_blocks; i++) { 4129 if (!adev->ip_blocks[i].status.valid) 4130 continue; 4131 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4132 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4133 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4134 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 4135 DRM_DEBUG("IP %s disabled for hw_init.\n", 4136 adev->ip_blocks[i].version->funcs->name); 4137 adev->ip_blocks[i].status.hw = true; 4138 } 4139 } 4140 } else { 4141 tmp = amdgpu_reset_method; 4142 /* It should do a default reset when loading or reloading the driver, 4143 * regardless of the module parameter reset_method.
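* (amdgpu_reset_method is saved above and temporarily set to AMD_RESET_METHOD_NONE below, so the ASIC code falls back to its default method; the module parameter is restored right after the reset.)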
4144 */ 4145 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4146 r = amdgpu_asic_reset(adev); 4147 amdgpu_reset_method = tmp; 4148 if (r) { 4149 dev_err(adev->dev, "asic reset on init failed\n"); 4150 goto failed; 4151 } 4152 } 4153 } 4154 4155 /* Post card if necessary */ 4156 if (amdgpu_device_need_post(adev)) { 4157 if (!adev->bios) { 4158 dev_err(adev->dev, "no vBIOS found\n"); 4159 r = -EINVAL; 4160 goto failed; 4161 } 4162 DRM_INFO("GPU posting now...\n"); 4163 r = amdgpu_device_asic_init(adev); 4164 if (r) { 4165 dev_err(adev->dev, "gpu post error!\n"); 4166 goto failed; 4167 } 4168 } 4169 4170 if (adev->bios) { 4171 if (adev->is_atom_fw) { 4172 /* Initialize clocks */ 4173 r = amdgpu_atomfirmware_get_clock_info(adev); 4174 if (r) { 4175 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4176 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4177 goto failed; 4178 } 4179 } else { 4180 /* Initialize clocks */ 4181 r = amdgpu_atombios_get_clock_info(adev); 4182 if (r) { 4183 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4184 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4185 goto failed; 4186 } 4187 /* init i2c buses */ 4188 if (!amdgpu_device_has_dc_support(adev)) 4189 amdgpu_atombios_i2c_init(adev); 4190 } 4191 } 4192 4193 fence_driver_init: 4194 /* Fence driver */ 4195 r = amdgpu_fence_driver_sw_init(adev); 4196 if (r) { 4197 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4198 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4199 goto failed; 4200 } 4201 4202 /* init the mode config */ 4203 drm_mode_config_init(adev_to_drm(adev)); 4204 4205 r = amdgpu_device_ip_init(adev); 4206 if (r) { 4207 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4208 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4209 goto release_ras_con; 4210 } 4211 4212 amdgpu_fence_driver_hw_init(adev); 4213 4214 dev_info(adev->dev, 4215 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4216 adev->gfx.config.max_shader_engines, 4217 adev->gfx.config.max_sh_per_se, 4218 adev->gfx.config.max_cu_per_sh, 4219 adev->gfx.cu_info.number); 4220 4221 adev->accel_working = true; 4222 4223 amdgpu_vm_check_compute_bug(adev); 4224 4225 /* Initialize the buffer migration limit. */ 4226 if (amdgpu_moverate >= 0) 4227 max_MBps = amdgpu_moverate; 4228 else 4229 max_MBps = 8; /* Allow 8 MB/s. */ 4230 /* Get a log2 for easy divisions. */ 4231 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4232 4233 /* 4234 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4235 * Otherwise the mgpu fan boost feature will be skipped due to the 4236 * gpu instance is counted less. 4237 */ 4238 amdgpu_register_gpu_instance(adev); 4239 4240 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4241 * explicit gating rather than handling it automatically. 4242 */ 4243 if (!adev->gmc.xgmi.pending_reset) { 4244 r = amdgpu_device_ip_late_init(adev); 4245 if (r) { 4246 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4247 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4248 goto release_ras_con; 4249 } 4250 /* must succeed. 
*/ 4251 amdgpu_ras_resume(adev); 4252 queue_delayed_work(system_wq, &adev->delayed_init_work, 4253 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4254 } 4255 4256 if (amdgpu_sriov_vf(adev)) { 4257 amdgpu_virt_release_full_gpu(adev, true); 4258 flush_delayed_work(&adev->delayed_init_work); 4259 } 4260 4261 /* 4262 * Register these sysfs interfaces after `late_init`, since some of the 4263 * operations performed in `late_init` might affect how the sysfs 4264 * interfaces are created. 4265 */ 4266 r = amdgpu_atombios_sysfs_init(adev); 4267 if (r) 4268 drm_err(&adev->ddev, 4269 "registering atombios sysfs failed (%d).\n", r); 4270 4271 r = amdgpu_pm_sysfs_init(adev); 4272 if (r) 4273 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4274 4275 r = amdgpu_ucode_sysfs_init(adev); 4276 if (r) { 4277 adev->ucode_sysfs_en = false; 4278 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4279 } else 4280 adev->ucode_sysfs_en = true; 4281 4282 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4283 if (r) 4284 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4285 4286 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4287 if (r) 4288 dev_err(adev->dev, 4289 "Could not create amdgpu board attributes\n"); 4290 4291 amdgpu_fru_sysfs_init(adev); 4292 amdgpu_reg_state_sysfs_init(adev); 4293 4294 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4295 r = amdgpu_pmu_init(adev); 4296 if (r) 4297 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4298 4299 /* Keep the cached PCI config space at hand for restore after a sudden PCI error */ 4300 if (amdgpu_device_cache_pci_state(adev->pdev)) 4301 pci_restore_state(pdev); 4302 4303 /* if we have more than one VGA card, disable the amdgpu VGA resources */ 4304 /* this will fail for cards that aren't VGA class devices, just 4305 * ignore it 4306 */ 4307 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4308 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4309 4310 px = amdgpu_device_supports_px(ddev); 4311 4312 if (px || (!dev_is_removable(&adev->pdev->dev) && 4313 apple_gmux_detect(NULL, NULL))) 4314 vga_switcheroo_register_client(adev->pdev, 4315 &amdgpu_switcheroo_ops, px); 4316 4317 if (px) 4318 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4319 4320 if (adev->gmc.xgmi.pending_reset) 4321 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4322 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4323 4324 amdgpu_device_check_iommu_direct_map(adev); 4325 4326 return 0; 4327 4328 release_ras_con: 4329 if (amdgpu_sriov_vf(adev)) 4330 amdgpu_virt_release_full_gpu(adev, true); 4331 4332 /* failed in exclusive mode due to timeout */ 4333 if (amdgpu_sriov_vf(adev) && 4334 !amdgpu_sriov_runtime(adev) && 4335 amdgpu_virt_mmio_blocked(adev) && 4336 !amdgpu_virt_wait_reset(adev)) { 4337 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4338 /* Don't send request since VF is inactive.
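* Returning -EAGAIN below lets the PCI probe path retry the initialization later.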
*/ 4339 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4340 adev->virt.ops = NULL; 4341 r = -EAGAIN; 4342 } 4343 amdgpu_release_ras_context(adev); 4344 4345 failed: 4346 amdgpu_vf_error_trans_all(adev); 4347 4348 return r; 4349 } 4350 4351 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4352 { 4353 4354 /* Clear all CPU mappings pointing to this device */ 4355 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4356 4357 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4358 amdgpu_doorbell_fini(adev); 4359 4360 iounmap(adev->rmmio); 4361 adev->rmmio = NULL; 4362 if (adev->mman.aper_base_kaddr) 4363 iounmap(adev->mman.aper_base_kaddr); 4364 adev->mman.aper_base_kaddr = NULL; 4365 4366 /* Memory manager related */ 4367 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4368 arch_phys_wc_del(adev->gmc.vram_mtrr); 4369 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4370 } 4371 } 4372 4373 /** 4374 * amdgpu_device_fini_hw - tear down the driver 4375 * 4376 * @adev: amdgpu_device pointer 4377 * 4378 * Tear down the driver info (all asics). 4379 * Called at driver shutdown. 4380 */ 4381 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4382 { 4383 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4384 flush_delayed_work(&adev->delayed_init_work); 4385 adev->shutdown = true; 4386 4387 /* make sure IB test finished before entering exclusive mode 4388 * to avoid preemption on IB test 4389 */ 4390 if (amdgpu_sriov_vf(adev)) { 4391 amdgpu_virt_request_full_gpu(adev, false); 4392 amdgpu_virt_fini_data_exchange(adev); 4393 } 4394 4395 /* disable all interrupts */ 4396 amdgpu_irq_disable_all(adev); 4397 if (adev->mode_info.mode_config_initialized) { 4398 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4399 drm_helper_force_disable_all(adev_to_drm(adev)); 4400 else 4401 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4402 } 4403 amdgpu_fence_driver_hw_fini(adev); 4404 4405 if (adev->mman.initialized) 4406 drain_workqueue(adev->mman.bdev.wq); 4407 4408 if (adev->pm.sysfs_initialized) 4409 amdgpu_pm_sysfs_fini(adev); 4410 if (adev->ucode_sysfs_en) 4411 amdgpu_ucode_sysfs_fini(adev); 4412 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4413 amdgpu_fru_sysfs_fini(adev); 4414 4415 amdgpu_reg_state_sysfs_fini(adev); 4416 4417 /* disable ras feature must before hw fini */ 4418 amdgpu_ras_pre_fini(adev); 4419 4420 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4421 4422 amdgpu_device_ip_fini_early(adev); 4423 4424 amdgpu_irq_fini_hw(adev); 4425 4426 if (adev->mman.initialized) 4427 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4428 4429 amdgpu_gart_dummy_page_fini(adev); 4430 4431 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4432 amdgpu_device_unmap_mmio(adev); 4433 4434 } 4435 4436 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4437 { 4438 int idx; 4439 bool px; 4440 4441 amdgpu_fence_driver_sw_fini(adev); 4442 amdgpu_device_ip_fini(adev); 4443 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4444 adev->accel_working = false; 4445 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4446 4447 amdgpu_reset_fini(adev); 4448 4449 /* free i2c buses */ 4450 if (!amdgpu_device_has_dc_support(adev)) 4451 amdgpu_i2c_fini(adev); 4452 4453 if (amdgpu_emu_mode != 1) 4454 amdgpu_atombios_fini(adev); 4455 4456 kfree(adev->bios); 4457 adev->bios = NULL; 4458 4459 kfree(adev->fru_info); 4460 adev->fru_info = NULL; 4461 4462 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4463 4464 if (px || 
(!dev_is_removable(&adev->pdev->dev) && 4465 apple_gmux_detect(NULL, NULL))) 4466 vga_switcheroo_unregister_client(adev->pdev); 4467 4468 if (px) 4469 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4470 4471 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4472 vga_client_unregister(adev->pdev); 4473 4474 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4475 4476 iounmap(adev->rmmio); 4477 adev->rmmio = NULL; 4478 amdgpu_doorbell_fini(adev); 4479 drm_dev_exit(idx); 4480 } 4481 4482 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4483 amdgpu_pmu_fini(adev); 4484 if (adev->mman.discovery_bin) 4485 amdgpu_discovery_fini(adev); 4486 4487 amdgpu_reset_put_reset_domain(adev->reset_domain); 4488 adev->reset_domain = NULL; 4489 4490 kfree(adev->pci_state); 4491 4492 } 4493 4494 /** 4495 * amdgpu_device_evict_resources - evict device resources 4496 * @adev: amdgpu device object 4497 * 4498 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4499 * of the vram memory type. Mainly used for evicting device resources 4500 * at suspend time. 4501 * 4502 */ 4503 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4504 { 4505 int ret; 4506 4507 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4508 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4509 return 0; 4510 4511 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4512 if (ret) 4513 DRM_WARN("evicting device resources failed\n"); 4514 return ret; 4515 } 4516 4517 /* 4518 * Suspend & resume. 4519 */ 4520 /** 4521 * amdgpu_device_prepare - prepare for device suspend 4522 * 4523 * @dev: drm dev pointer 4524 * 4525 * Prepare to put the hw in the suspend state (all asics). 4526 * Returns 0 for success or an error on failure. 4527 * Called at driver suspend. 4528 */ 4529 int amdgpu_device_prepare(struct drm_device *dev) 4530 { 4531 struct amdgpu_device *adev = drm_to_adev(dev); 4532 int i, r; 4533 4534 amdgpu_choose_low_power_state(adev); 4535 4536 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4537 return 0; 4538 4539 /* Evict the majority of BOs before starting suspend sequence */ 4540 r = amdgpu_device_evict_resources(adev); 4541 if (r) 4542 goto unprepare; 4543 4544 for (i = 0; i < adev->num_ip_blocks; i++) { 4545 if (!adev->ip_blocks[i].status.valid) 4546 continue; 4547 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4548 continue; 4549 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev); 4550 if (r) 4551 goto unprepare; 4552 } 4553 4554 return 0; 4555 4556 unprepare: 4557 adev->in_s0ix = adev->in_s3 = false; 4558 4559 return r; 4560 } 4561 4562 /** 4563 * amdgpu_device_suspend - initiate device suspend 4564 * 4565 * @dev: drm dev pointer 4566 * @fbcon : notify the fbdev of suspend 4567 * 4568 * Puts the hw in the suspend state (all asics). 4569 * Returns 0 for success or an error on failure. 4570 * Called at driver suspend. 
4571 */ 4572 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4573 { 4574 struct amdgpu_device *adev = drm_to_adev(dev); 4575 int r = 0; 4576 4577 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4578 return 0; 4579 4580 adev->in_suspend = true; 4581 4582 if (amdgpu_sriov_vf(adev)) { 4583 amdgpu_virt_fini_data_exchange(adev); 4584 r = amdgpu_virt_request_full_gpu(adev, false); 4585 if (r) 4586 return r; 4587 } 4588 4589 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4590 DRM_WARN("smart shift update failed\n"); 4591 4592 if (fbcon) 4593 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4594 4595 cancel_delayed_work_sync(&adev->delayed_init_work); 4596 4597 amdgpu_ras_suspend(adev); 4598 4599 amdgpu_device_ip_suspend_phase1(adev); 4600 4601 if (!adev->in_s0ix) 4602 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4603 4604 r = amdgpu_device_evict_resources(adev); 4605 if (r) 4606 return r; 4607 4608 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4609 4610 amdgpu_fence_driver_hw_fini(adev); 4611 4612 amdgpu_device_ip_suspend_phase2(adev); 4613 4614 if (amdgpu_sriov_vf(adev)) 4615 amdgpu_virt_release_full_gpu(adev, false); 4616 4617 r = amdgpu_dpm_notify_rlc_state(adev, false); 4618 if (r) 4619 return r; 4620 4621 return 0; 4622 } 4623 4624 /** 4625 * amdgpu_device_resume - initiate device resume 4626 * 4627 * @dev: drm dev pointer 4628 * @fbcon : notify the fbdev of resume 4629 * 4630 * Bring the hw back to operating state (all asics). 4631 * Returns 0 for success or an error on failure. 4632 * Called at driver resume. 4633 */ 4634 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4635 { 4636 struct amdgpu_device *adev = drm_to_adev(dev); 4637 int r = 0; 4638 4639 if (amdgpu_sriov_vf(adev)) { 4640 r = amdgpu_virt_request_full_gpu(adev, true); 4641 if (r) 4642 return r; 4643 } 4644 4645 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4646 return 0; 4647 4648 if (adev->in_s0ix) 4649 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4650 4651 /* post card */ 4652 if (amdgpu_device_need_post(adev)) { 4653 r = amdgpu_device_asic_init(adev); 4654 if (r) 4655 dev_err(adev->dev, "amdgpu asic init failed\n"); 4656 } 4657 4658 r = amdgpu_device_ip_resume(adev); 4659 4660 if (r) { 4661 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4662 goto exit; 4663 } 4664 amdgpu_fence_driver_hw_init(adev); 4665 4666 if (!adev->in_s0ix) { 4667 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4668 if (r) 4669 goto exit; 4670 } 4671 4672 r = amdgpu_device_ip_late_init(adev); 4673 if (r) 4674 goto exit; 4675 4676 queue_delayed_work(system_wq, &adev->delayed_init_work, 4677 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4678 exit: 4679 if (amdgpu_sriov_vf(adev)) { 4680 amdgpu_virt_init_data_exchange(adev); 4681 amdgpu_virt_release_full_gpu(adev, true); 4682 } 4683 4684 if (r) 4685 return r; 4686 4687 /* Make sure IB tests flushed */ 4688 flush_delayed_work(&adev->delayed_init_work); 4689 4690 if (fbcon) 4691 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4692 4693 amdgpu_ras_resume(adev); 4694 4695 if (adev->mode_info.num_crtc) { 4696 /* 4697 * Most of the connector probing functions try to acquire runtime pm 4698 * refs to ensure that the GPU is powered on when connector polling is 4699 * performed. Since we're calling this from a runtime PM callback, 4700 * trying to acquire rpm refs will cause us to deadlock. 
4701 * 4702 * Since we're guaranteed to be holding the rpm lock, it's safe to 4703 * temporarily disable the rpm helpers so this doesn't deadlock us. 4704 */ 4705 #ifdef CONFIG_PM 4706 dev->dev->power.disable_depth++; 4707 #endif 4708 if (!adev->dc_enabled) 4709 drm_helper_hpd_irq_event(dev); 4710 else 4711 drm_kms_helper_hotplug_event(dev); 4712 #ifdef CONFIG_PM 4713 dev->dev->power.disable_depth--; 4714 #endif 4715 } 4716 adev->in_suspend = false; 4717 4718 if (adev->enable_mes) 4719 amdgpu_mes_self_test(adev); 4720 4721 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4722 DRM_WARN("smart shift update failed\n"); 4723 4724 return 0; 4725 } 4726 4727 /** 4728 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4729 * 4730 * @adev: amdgpu_device pointer 4731 * 4732 * The list of all the hardware IPs that make up the asic is walked and 4733 * the check_soft_reset callbacks are run. check_soft_reset determines 4734 * if the asic is still hung or not. 4735 * Returns true if any of the IPs are still in a hung state, false if not. 4736 */ 4737 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4738 { 4739 int i; 4740 bool asic_hang = false; 4741 4742 if (amdgpu_sriov_vf(adev)) 4743 return true; 4744 4745 if (amdgpu_asic_need_full_reset(adev)) 4746 return true; 4747 4748 for (i = 0; i < adev->num_ip_blocks; i++) { 4749 if (!adev->ip_blocks[i].status.valid) 4750 continue; 4751 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4752 adev->ip_blocks[i].status.hang = 4753 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4754 if (adev->ip_blocks[i].status.hang) { 4755 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4756 asic_hang = true; 4757 } 4758 } 4759 return asic_hang; 4760 } 4761 4762 /** 4763 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4764 * 4765 * @adev: amdgpu_device pointer 4766 * 4767 * The list of all the hardware IPs that make up the asic is walked and the 4768 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4769 * handles any IP specific hardware or software state changes that are 4770 * necessary for a soft reset to succeed. 4771 * Returns 0 on success, negative error code on failure. 4772 */ 4773 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4774 { 4775 int i, r = 0; 4776 4777 for (i = 0; i < adev->num_ip_blocks; i++) { 4778 if (!adev->ip_blocks[i].status.valid) 4779 continue; 4780 if (adev->ip_blocks[i].status.hang && 4781 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4782 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4783 if (r) 4784 return r; 4785 } 4786 } 4787 4788 return 0; 4789 } 4790 4791 /** 4792 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4793 * 4794 * @adev: amdgpu_device pointer 4795 * 4796 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4797 * reset is necessary to recover. 4798 * Returns true if a full asic reset is required, false if not. 
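* (In the check below, a hang in a GMC, SMC, ACP, DCE or PSP block is what triggers the full reset.)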
4799 */ 4800 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4801 { 4802 int i; 4803 4804 if (amdgpu_asic_need_full_reset(adev)) 4805 return true; 4806 4807 for (i = 0; i < adev->num_ip_blocks; i++) { 4808 if (!adev->ip_blocks[i].status.valid) 4809 continue; 4810 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4811 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4812 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4813 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4814 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4815 if (adev->ip_blocks[i].status.hang) { 4816 dev_info(adev->dev, "Some block need full reset!\n"); 4817 return true; 4818 } 4819 } 4820 } 4821 return false; 4822 } 4823 4824 /** 4825 * amdgpu_device_ip_soft_reset - do a soft reset 4826 * 4827 * @adev: amdgpu_device pointer 4828 * 4829 * The list of all the hardware IPs that make up the asic is walked and the 4830 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4831 * IP specific hardware or software state changes that are necessary to soft 4832 * reset the IP. 4833 * Returns 0 on success, negative error code on failure. 4834 */ 4835 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4836 { 4837 int i, r = 0; 4838 4839 for (i = 0; i < adev->num_ip_blocks; i++) { 4840 if (!adev->ip_blocks[i].status.valid) 4841 continue; 4842 if (adev->ip_blocks[i].status.hang && 4843 adev->ip_blocks[i].version->funcs->soft_reset) { 4844 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4845 if (r) 4846 return r; 4847 } 4848 } 4849 4850 return 0; 4851 } 4852 4853 /** 4854 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4855 * 4856 * @adev: amdgpu_device pointer 4857 * 4858 * The list of all the hardware IPs that make up the asic is walked and the 4859 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4860 * handles any IP specific hardware or software state changes that are 4861 * necessary after the IP has been soft reset. 4862 * Returns 0 on success, negative error code on failure. 4863 */ 4864 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4865 { 4866 int i, r = 0; 4867 4868 for (i = 0; i < adev->num_ip_blocks; i++) { 4869 if (!adev->ip_blocks[i].status.valid) 4870 continue; 4871 if (adev->ip_blocks[i].status.hang && 4872 adev->ip_blocks[i].version->funcs->post_soft_reset) 4873 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4874 if (r) 4875 return r; 4876 } 4877 4878 return 0; 4879 } 4880 4881 /** 4882 * amdgpu_device_recover_vram - Recover some VRAM contents 4883 * 4884 * @adev: amdgpu_device pointer 4885 * 4886 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4887 * restore things like GPUVM page tables after a GPU reset where 4888 * the contents of VRAM might be lost. 4889 * 4890 * Returns: 4891 * 0 on success, negative error code on failure. 
4892 */ 4893 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4894 { 4895 struct dma_fence *fence = NULL, *next = NULL; 4896 struct amdgpu_bo *shadow; 4897 struct amdgpu_bo_vm *vmbo; 4898 long r = 1, tmo; 4899 4900 if (amdgpu_sriov_runtime(adev)) 4901 tmo = msecs_to_jiffies(8000); 4902 else 4903 tmo = msecs_to_jiffies(100); 4904 4905 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4906 mutex_lock(&adev->shadow_list_lock); 4907 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4908 /* If vm is compute context or adev is APU, shadow will be NULL */ 4909 if (!vmbo->shadow) 4910 continue; 4911 shadow = vmbo->shadow; 4912 4913 /* No need to recover an evicted BO */ 4914 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4915 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4916 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4917 continue; 4918 4919 r = amdgpu_bo_restore_shadow(shadow, &next); 4920 if (r) 4921 break; 4922 4923 if (fence) { 4924 tmo = dma_fence_wait_timeout(fence, false, tmo); 4925 dma_fence_put(fence); 4926 fence = next; 4927 if (tmo == 0) { 4928 r = -ETIMEDOUT; 4929 break; 4930 } else if (tmo < 0) { 4931 r = tmo; 4932 break; 4933 } 4934 } else { 4935 fence = next; 4936 } 4937 } 4938 mutex_unlock(&adev->shadow_list_lock); 4939 4940 if (fence) 4941 tmo = dma_fence_wait_timeout(fence, false, tmo); 4942 dma_fence_put(fence); 4943 4944 if (r < 0 || tmo <= 0) { 4945 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4946 return -EIO; 4947 } 4948 4949 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4950 return 0; 4951 } 4952 4953 4954 /** 4955 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4956 * 4957 * @adev: amdgpu_device pointer 4958 * @from_hypervisor: request from hypervisor 4959 * 4960 * do VF FLR and reinitialize Asic 4961 * return 0 means succeeded otherwise failed 4962 */ 4963 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4964 bool from_hypervisor) 4965 { 4966 int r; 4967 struct amdgpu_hive_info *hive = NULL; 4968 int retry_limit = 0; 4969 4970 retry: 4971 amdgpu_amdkfd_pre_reset(adev); 4972 4973 amdgpu_device_stop_pending_resets(adev); 4974 4975 if (from_hypervisor) 4976 r = amdgpu_virt_request_full_gpu(adev, true); 4977 else 4978 r = amdgpu_virt_reset_gpu(adev); 4979 if (r) 4980 return r; 4981 amdgpu_irq_gpu_reset_resume_helper(adev); 4982 4983 /* some sw clean up VF needs to do before recover */ 4984 amdgpu_virt_post_reset(adev); 4985 4986 /* Resume IP prior to SMC */ 4987 r = amdgpu_device_ip_reinit_early_sriov(adev); 4988 if (r) 4989 goto error; 4990 4991 amdgpu_virt_init_data_exchange(adev); 4992 4993 r = amdgpu_device_fw_loading(adev); 4994 if (r) 4995 return r; 4996 4997 /* now we are okay to resume SMC/CP/SDMA */ 4998 r = amdgpu_device_ip_reinit_late_sriov(adev); 4999 if (r) 5000 goto error; 5001 5002 hive = amdgpu_get_xgmi_hive(adev); 5003 /* Update PSP FW topology after reset */ 5004 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5005 r = amdgpu_xgmi_update_topology(hive, adev); 5006 5007 if (hive) 5008 amdgpu_put_xgmi_hive(hive); 5009 5010 if (!r) { 5011 r = amdgpu_ib_ring_tests(adev); 5012 5013 amdgpu_amdkfd_post_reset(adev); 5014 } 5015 5016 error: 5017 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 5018 amdgpu_inc_vram_lost(adev); 5019 r = amdgpu_device_recover_vram(adev); 5020 } 5021 amdgpu_virt_release_full_gpu(adev, true); 5022 5023 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 5024 if (retry_limit < 
AMDGPU_MAX_RETRY_LIMIT) { 5025 retry_limit++; 5026 goto retry; 5027 } else 5028 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 5029 } 5030 5031 return r; 5032 } 5033 5034 /** 5035 * amdgpu_device_has_job_running - check if there is any job in mirror list 5036 * 5037 * @adev: amdgpu_device pointer 5038 * 5039 * check if there is any job in mirror list 5040 */ 5041 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5042 { 5043 int i; 5044 struct drm_sched_job *job; 5045 5046 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5047 struct amdgpu_ring *ring = adev->rings[i]; 5048 5049 if (!amdgpu_ring_sched_ready(ring)) 5050 continue; 5051 5052 spin_lock(&ring->sched.job_list_lock); 5053 job = list_first_entry_or_null(&ring->sched.pending_list, 5054 struct drm_sched_job, list); 5055 spin_unlock(&ring->sched.job_list_lock); 5056 if (job) 5057 return true; 5058 } 5059 return false; 5060 } 5061 5062 /** 5063 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5064 * 5065 * @adev: amdgpu_device pointer 5066 * 5067 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5068 * a hung GPU. 5069 */ 5070 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5071 { 5072 5073 if (amdgpu_gpu_recovery == 0) 5074 goto disabled; 5075 5076 /* Skip soft reset check in fatal error mode */ 5077 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5078 return true; 5079 5080 if (amdgpu_sriov_vf(adev)) 5081 return true; 5082 5083 if (amdgpu_gpu_recovery == -1) { 5084 switch (adev->asic_type) { 5085 #ifdef CONFIG_DRM_AMDGPU_SI 5086 case CHIP_VERDE: 5087 case CHIP_TAHITI: 5088 case CHIP_PITCAIRN: 5089 case CHIP_OLAND: 5090 case CHIP_HAINAN: 5091 #endif 5092 #ifdef CONFIG_DRM_AMDGPU_CIK 5093 case CHIP_KAVERI: 5094 case CHIP_KABINI: 5095 case CHIP_MULLINS: 5096 #endif 5097 case CHIP_CARRIZO: 5098 case CHIP_STONEY: 5099 case CHIP_CYAN_SKILLFISH: 5100 goto disabled; 5101 default: 5102 break; 5103 } 5104 } 5105 5106 return true; 5107 5108 disabled: 5109 dev_info(adev->dev, "GPU recovery disabled.\n"); 5110 return false; 5111 } 5112 5113 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5114 { 5115 u32 i; 5116 int ret = 0; 5117 5118 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5119 5120 dev_info(adev->dev, "GPU mode1 reset\n"); 5121 5122 /* disable BM */ 5123 pci_clear_master(adev->pdev); 5124 5125 amdgpu_device_cache_pci_state(adev->pdev); 5126 5127 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5128 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5129 ret = amdgpu_dpm_mode1_reset(adev); 5130 } else { 5131 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5132 ret = psp_gpu_reset(adev); 5133 } 5134 5135 if (ret) 5136 goto mode1_reset_failed; 5137 5138 amdgpu_device_load_pci_state(adev->pdev); 5139 ret = amdgpu_psp_wait_for_bootloader(adev); 5140 if (ret) 5141 goto mode1_reset_failed; 5142 5143 /* wait for asic to come out of reset */ 5144 for (i = 0; i < adev->usec_timeout; i++) { 5145 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5146 5147 if (memsize != 0xffffffff) 5148 break; 5149 udelay(1); 5150 } 5151 5152 if (i >= adev->usec_timeout) { 5153 ret = -ETIMEDOUT; 5154 goto mode1_reset_failed; 5155 } 5156 5157 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5158 5159 return 0; 5160 5161 mode1_reset_failed: 5162 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5163 return ret; 5164 } 5165 5166 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5167 struct amdgpu_reset_context *reset_context) 5168 { 5169 int i, r = 0; 5170 
struct amdgpu_job *job = NULL; 5171 bool need_full_reset = 5172 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5173 5174 if (reset_context->reset_req_dev == adev) 5175 job = reset_context->job; 5176 5177 if (amdgpu_sriov_vf(adev)) { 5178 /* stop the data exchange thread */ 5179 amdgpu_virt_fini_data_exchange(adev); 5180 } 5181 5182 amdgpu_fence_driver_isr_toggle(adev, true); 5183 5184 /* block all schedulers and reset given job's ring */ 5185 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5186 struct amdgpu_ring *ring = adev->rings[i]; 5187 5188 if (!amdgpu_ring_sched_ready(ring)) 5189 continue; 5190 5191 /* Clear job fence from fence drv to avoid force_completion 5192 * leave NULL and vm flush fence in fence drv 5193 */ 5194 amdgpu_fence_driver_clear_job_fences(ring); 5195 5196 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5197 amdgpu_fence_driver_force_completion(ring); 5198 } 5199 5200 amdgpu_fence_driver_isr_toggle(adev, false); 5201 5202 if (job && job->vm) 5203 drm_sched_increase_karma(&job->base); 5204 5205 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5206 /* If reset handler not implemented, continue; otherwise return */ 5207 if (r == -EOPNOTSUPP) 5208 r = 0; 5209 else 5210 return r; 5211 5212 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5213 if (!amdgpu_sriov_vf(adev)) { 5214 5215 if (!need_full_reset) 5216 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5217 5218 if (!need_full_reset && amdgpu_gpu_recovery && 5219 amdgpu_device_ip_check_soft_reset(adev)) { 5220 amdgpu_device_ip_pre_soft_reset(adev); 5221 r = amdgpu_device_ip_soft_reset(adev); 5222 amdgpu_device_ip_post_soft_reset(adev); 5223 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5224 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5225 need_full_reset = true; 5226 } 5227 } 5228 5229 if (need_full_reset) 5230 r = amdgpu_device_ip_suspend(adev); 5231 if (need_full_reset) 5232 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5233 else 5234 clear_bit(AMDGPU_NEED_FULL_RESET, 5235 &reset_context->flags); 5236 } 5237 5238 return r; 5239 } 5240 5241 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 5242 { 5243 int i; 5244 5245 lockdep_assert_held(&adev->reset_domain->sem); 5246 5247 for (i = 0; i < adev->reset_info.num_regs; i++) { 5248 adev->reset_info.reset_dump_reg_value[i] = 5249 RREG32(adev->reset_info.reset_dump_reg_list[i]); 5250 5251 trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i], 5252 adev->reset_info.reset_dump_reg_value[i]); 5253 } 5254 5255 return 0; 5256 } 5257 5258 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5259 struct amdgpu_reset_context *reset_context) 5260 { 5261 struct amdgpu_device *tmp_adev = NULL; 5262 bool need_full_reset, skip_hw_reset, vram_lost = false; 5263 int r = 0; 5264 5265 /* Try reset handler method first */ 5266 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5267 reset_list); 5268 amdgpu_reset_reg_dumps(tmp_adev); 5269 5270 reset_context->reset_device_list = device_list_handle; 5271 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5272 /* If reset handler not implemented, continue; otherwise return */ 5273 if (r == -EOPNOTSUPP) 5274 r = 0; 5275 else 5276 return r; 5277 5278 /* Reset handler not implemented, use the default method */ 5279 need_full_reset = 5280 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5281 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, 
&reset_context->flags); 5282 5283 /* 5284 * ASIC reset has to be done on all XGMI hive nodes ASAP 5285 * to allow proper links negotiation in FW (within 1 sec) 5286 */ 5287 if (!skip_hw_reset && need_full_reset) { 5288 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5289 /* For XGMI run all resets in parallel to speed up the process */ 5290 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5291 tmp_adev->gmc.xgmi.pending_reset = false; 5292 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5293 r = -EALREADY; 5294 } else 5295 r = amdgpu_asic_reset(tmp_adev); 5296 5297 if (r) { 5298 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5299 r, adev_to_drm(tmp_adev)->unique); 5300 goto out; 5301 } 5302 } 5303 5304 /* For XGMI wait for all resets to complete before proceed */ 5305 if (!r) { 5306 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5307 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5308 flush_work(&tmp_adev->xgmi_reset_work); 5309 r = tmp_adev->asic_reset_res; 5310 if (r) 5311 break; 5312 } 5313 } 5314 } 5315 } 5316 5317 if (!r && amdgpu_ras_intr_triggered()) { 5318 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5319 amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB); 5320 } 5321 5322 amdgpu_ras_intr_cleared(); 5323 } 5324 5325 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5326 if (need_full_reset) { 5327 /* post card */ 5328 amdgpu_ras_set_fed(tmp_adev, false); 5329 r = amdgpu_device_asic_init(tmp_adev); 5330 if (r) { 5331 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5332 } else { 5333 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5334 5335 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5336 if (r) 5337 goto out; 5338 5339 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5340 5341 amdgpu_coredump(tmp_adev, vram_lost, reset_context); 5342 5343 if (vram_lost) { 5344 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5345 amdgpu_inc_vram_lost(tmp_adev); 5346 } 5347 5348 r = amdgpu_device_fw_loading(tmp_adev); 5349 if (r) 5350 return r; 5351 5352 r = amdgpu_xcp_restore_partition_mode( 5353 tmp_adev->xcp_mgr); 5354 if (r) 5355 goto out; 5356 5357 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5358 if (r) 5359 goto out; 5360 5361 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5362 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5363 5364 if (vram_lost) 5365 amdgpu_device_fill_reset_magic(tmp_adev); 5366 5367 /* 5368 * Add this ASIC as tracked as reset was already 5369 * complete successfully. 5370 */ 5371 amdgpu_register_gpu_instance(tmp_adev); 5372 5373 if (!reset_context->hive && 5374 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5375 amdgpu_xgmi_add_device(tmp_adev); 5376 5377 r = amdgpu_device_ip_late_init(tmp_adev); 5378 if (r) 5379 goto out; 5380 5381 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5382 5383 /* 5384 * The GPU enters bad state once faulty pages 5385 * by ECC has reached the threshold, and ras 5386 * recovery is scheduled next. So add one check 5387 * here to break recovery if it indeed exceeds 5388 * bad page threshold, and remind user to 5389 * retire this GPU or setting one bigger 5390 * bad_page_threshold value to fix this once 5391 * probing driver again. 5392 */ 5393 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5394 /* must succeed. 
*/ 5395 amdgpu_ras_resume(tmp_adev); 5396 } else { 5397 r = -EINVAL; 5398 goto out; 5399 } 5400 5401 /* Update PSP FW topology after reset */ 5402 if (reset_context->hive && 5403 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5404 r = amdgpu_xgmi_update_topology( 5405 reset_context->hive, tmp_adev); 5406 } 5407 } 5408 5409 out: 5410 if (!r) { 5411 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5412 r = amdgpu_ib_ring_tests(tmp_adev); 5413 if (r) { 5414 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5415 need_full_reset = true; 5416 r = -EAGAIN; 5417 goto end; 5418 } 5419 } 5420 5421 if (!r) 5422 r = amdgpu_device_recover_vram(tmp_adev); 5423 else 5424 tmp_adev->asic_reset_res = r; 5425 } 5426 5427 end: 5428 if (need_full_reset) 5429 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5430 else 5431 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5432 return r; 5433 } 5434 5435 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5436 { 5437 5438 switch (amdgpu_asic_reset_method(adev)) { 5439 case AMD_RESET_METHOD_MODE1: 5440 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5441 break; 5442 case AMD_RESET_METHOD_MODE2: 5443 adev->mp1_state = PP_MP1_STATE_RESET; 5444 break; 5445 default: 5446 adev->mp1_state = PP_MP1_STATE_NONE; 5447 break; 5448 } 5449 } 5450 5451 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5452 { 5453 amdgpu_vf_error_trans_all(adev); 5454 adev->mp1_state = PP_MP1_STATE_NONE; 5455 } 5456 5457 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5458 { 5459 struct pci_dev *p = NULL; 5460 5461 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5462 adev->pdev->bus->number, 1); 5463 if (p) { 5464 pm_runtime_enable(&(p->dev)); 5465 pm_runtime_resume(&(p->dev)); 5466 } 5467 5468 pci_dev_put(p); 5469 } 5470 5471 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5472 { 5473 enum amd_reset_method reset_method; 5474 struct pci_dev *p = NULL; 5475 u64 expires; 5476 5477 /* 5478 * For now, only BACO and mode1 reset are confirmed 5479 * to suffer the audio issue without proper suspended. 5480 */ 5481 reset_method = amdgpu_asic_reset_method(adev); 5482 if ((reset_method != AMD_RESET_METHOD_BACO) && 5483 (reset_method != AMD_RESET_METHOD_MODE1)) 5484 return -EINVAL; 5485 5486 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5487 adev->pdev->bus->number, 1); 5488 if (!p) 5489 return -ENODEV; 5490 5491 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5492 if (!expires) 5493 /* 5494 * If we cannot get the audio device autosuspend delay, 5495 * a fixed 4S interval will be used. Considering 3S is 5496 * the audio controller default autosuspend delay setting. 5497 * 4S used here is guaranteed to cover that. 5498 */ 5499 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5500 5501 while (!pm_runtime_status_suspended(&(p->dev))) { 5502 if (!pm_runtime_suspend(&(p->dev))) 5503 break; 5504 5505 if (expires < ktime_get_mono_fast_ns()) { 5506 dev_warn(adev->dev, "failed to suspend display audio\n"); 5507 pci_dev_put(p); 5508 /* TODO: abort the succeeding gpu reset? 
*/ 5509 return -ETIMEDOUT; 5510 } 5511 } 5512 5513 pm_runtime_disable(&(p->dev)); 5514 5515 pci_dev_put(p); 5516 return 0; 5517 } 5518 5519 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5520 { 5521 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5522 5523 #if defined(CONFIG_DEBUG_FS) 5524 if (!amdgpu_sriov_vf(adev)) 5525 cancel_work(&adev->reset_work); 5526 #endif 5527 5528 if (adev->kfd.dev) 5529 cancel_work(&adev->kfd.reset_work); 5530 5531 if (amdgpu_sriov_vf(adev)) 5532 cancel_work(&adev->virt.flr_work); 5533 5534 if (con && adev->ras_enabled) 5535 cancel_work(&con->recovery_work); 5536 5537 } 5538 5539 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5540 { 5541 struct amdgpu_device *tmp_adev; 5542 int ret = 0; 5543 u32 status; 5544 5545 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5546 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 5547 if (PCI_POSSIBLE_ERROR(status)) { 5548 dev_err(tmp_adev->dev, "device lost from bus!"); 5549 ret = -ENODEV; 5550 } 5551 } 5552 5553 return ret; 5554 } 5555 5556 /** 5557 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5558 * 5559 * @adev: amdgpu_device pointer 5560 * @job: which job trigger hang 5561 * @reset_context: amdgpu reset context pointer 5562 * 5563 * Attempt to reset the GPU if it has hung (all asics). 5564 * Attempt to do soft-reset or full-reset and reinitialize Asic 5565 * Returns 0 for success or an error on failure. 5566 */ 5567 5568 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5569 struct amdgpu_job *job, 5570 struct amdgpu_reset_context *reset_context) 5571 { 5572 struct list_head device_list, *device_list_handle = NULL; 5573 bool job_signaled = false; 5574 struct amdgpu_hive_info *hive = NULL; 5575 struct amdgpu_device *tmp_adev = NULL; 5576 int i, r = 0; 5577 bool need_emergency_restart = false; 5578 bool audio_suspended = false; 5579 5580 /* 5581 * Special case: RAS triggered and full reset isn't supported 5582 */ 5583 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5584 5585 /* 5586 * Flush RAM to disk so that after reboot 5587 * the user can read log and see why the system rebooted. 5588 */ 5589 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5590 amdgpu_ras_get_context(adev)->reboot) { 5591 DRM_WARN("Emergency reboot."); 5592 5593 ksys_sync_helper(); 5594 emergency_restart(); 5595 } 5596 5597 dev_info(adev->dev, "GPU %s begin!\n", 5598 need_emergency_restart ? "jobs stop":"reset"); 5599 5600 if (!amdgpu_sriov_vf(adev)) 5601 hive = amdgpu_get_xgmi_hive(adev); 5602 if (hive) 5603 mutex_lock(&hive->hive_lock); 5604 5605 reset_context->job = job; 5606 reset_context->hive = hive; 5607 /* 5608 * Build list of devices to reset. 5609 * In case we are in XGMI hive mode, resort the device list 5610 * to put adev in the 1st position. 
5611 */ 5612 INIT_LIST_HEAD(&device_list); 5613 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5614 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5615 list_add_tail(&tmp_adev->reset_list, &device_list); 5616 if (adev->shutdown) 5617 tmp_adev->shutdown = true; 5618 } 5619 if (!list_is_first(&adev->reset_list, &device_list)) 5620 list_rotate_to_front(&adev->reset_list, &device_list); 5621 device_list_handle = &device_list; 5622 } else { 5623 list_add_tail(&adev->reset_list, &device_list); 5624 device_list_handle = &device_list; 5625 } 5626 5627 if (!amdgpu_sriov_vf(adev)) { 5628 r = amdgpu_device_health_check(device_list_handle); 5629 if (r) 5630 goto end_reset; 5631 } 5632 5633 /* We need to lock reset domain only once both for XGMI and single device */ 5634 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5635 reset_list); 5636 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5637 5638 /* block all schedulers and reset given job's ring */ 5639 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5640 5641 amdgpu_device_set_mp1_state(tmp_adev); 5642 5643 /* 5644 * Try to put the audio codec into suspend state 5645 * before gpu reset started. 5646 * 5647 * Due to the power domain of the graphics device 5648 * is shared with AZ power domain. Without this, 5649 * we may change the audio hardware from behind 5650 * the audio driver's back. That will trigger 5651 * some audio codec errors. 5652 */ 5653 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5654 audio_suspended = true; 5655 5656 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5657 5658 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5659 5660 if (!amdgpu_sriov_vf(tmp_adev)) 5661 amdgpu_amdkfd_pre_reset(tmp_adev); 5662 5663 /* 5664 * Mark these ASICs to be reseted as untracked first 5665 * And add them back after reset completed 5666 */ 5667 amdgpu_unregister_gpu_instance(tmp_adev); 5668 5669 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5670 5671 /* disable ras on ALL IPs */ 5672 if (!need_emergency_restart && 5673 amdgpu_device_ip_need_full_reset(tmp_adev)) 5674 amdgpu_ras_suspend(tmp_adev); 5675 5676 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5677 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5678 5679 if (!amdgpu_ring_sched_ready(ring)) 5680 continue; 5681 5682 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5683 5684 if (need_emergency_restart) 5685 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5686 } 5687 atomic_inc(&tmp_adev->gpu_reset_counter); 5688 } 5689 5690 if (need_emergency_restart) 5691 goto skip_sched_resume; 5692 5693 /* 5694 * Must check guilty signal here since after this point all old 5695 * HW fences are force signaled. 5696 * 5697 * job->base holds a reference to parent fence 5698 */ 5699 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5700 job_signaled = true; 5701 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5702 goto skip_hw_reset; 5703 } 5704 5705 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5706 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5707 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5708 /*TODO Should we stop ?*/ 5709 if (r) { 5710 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5711 r, adev_to_drm(tmp_adev)->unique); 5712 tmp_adev->asic_reset_res = r; 5713 } 5714 5715 if (!amdgpu_sriov_vf(tmp_adev)) 5716 /* 5717 * Drop all pending non scheduler resets. 
Scheduler resets 5718 * were already dropped during drm_sched_stop 5719 */ 5720 amdgpu_device_stop_pending_resets(tmp_adev); 5721 } 5722 5723 /* Actual ASIC resets if needed.*/ 5724 /* Host driver will handle XGMI hive reset for SRIOV */ 5725 if (amdgpu_sriov_vf(adev)) { 5726 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5727 if (r) 5728 adev->asic_reset_res = r; 5729 5730 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5731 if (amdgpu_ip_version(adev, GC_HWIP, 0) == 5732 IP_VERSION(9, 4, 2) || 5733 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5734 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5735 amdgpu_ras_resume(adev); 5736 } else { 5737 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5738 if (r && r == -EAGAIN) 5739 goto retry; 5740 } 5741 5742 skip_hw_reset: 5743 5744 /* Post ASIC reset for all devs .*/ 5745 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5746 5747 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5748 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5749 5750 if (!amdgpu_ring_sched_ready(ring)) 5751 continue; 5752 5753 drm_sched_start(&ring->sched, true); 5754 } 5755 5756 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5757 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5758 5759 if (tmp_adev->asic_reset_res) 5760 r = tmp_adev->asic_reset_res; 5761 5762 tmp_adev->asic_reset_res = 0; 5763 5764 if (r) { 5765 /* bad news, how to tell it to userspace ? */ 5766 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5767 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5768 } else { 5769 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5770 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5771 DRM_WARN("smart shift update failed\n"); 5772 } 5773 } 5774 5775 skip_sched_resume: 5776 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5777 /* unlock kfd: SRIOV would do it separately */ 5778 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5779 amdgpu_amdkfd_post_reset(tmp_adev); 5780 5781 /* kfd_post_reset will do nothing if kfd device is not initialized, 5782 * need to bring up kfd here if it's not be initialized before 5783 */ 5784 if (!adev->kfd.init_complete) 5785 amdgpu_amdkfd_device_init(adev); 5786 5787 if (audio_suspended) 5788 amdgpu_device_resume_display_audio(tmp_adev); 5789 5790 amdgpu_device_unset_mp1_state(tmp_adev); 5791 5792 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5793 } 5794 5795 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5796 reset_list); 5797 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5798 5799 end_reset: 5800 if (hive) { 5801 mutex_unlock(&hive->hive_lock); 5802 amdgpu_put_xgmi_hive(hive); 5803 } 5804 5805 if (r) 5806 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5807 5808 atomic_set(&adev->reset_domain->reset_res, r); 5809 return r; 5810 } 5811 5812 /** 5813 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 5814 * 5815 * @adev: amdgpu_device pointer 5816 * @speed: pointer to the speed of the link 5817 * @width: pointer to the width of the link 5818 * 5819 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 5820 * first physical partner to an AMD dGPU. 5821 * This will exclude any virtual switches and links. 
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	while ((parent = pci_upstream_bridge(parent))) {
		/* skip upstream/downstream switches internal to dGPU*/
		if (parent->vendor == PCI_VENDOR_ID_ATI)
			continue;
		*speed = pcie_get_speed_cap(parent);
		*width = pcie_get_width_cap(parent);
		break;
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
					&platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5919 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5920 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5921 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5922 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5923 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5924 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5925 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5926 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5927 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5928 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5929 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5930 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5931 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5932 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5933 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5934 else 5935 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5936 5937 } 5938 } 5939 if (adev->pm.pcie_mlw_mask == 0) { 5940 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5941 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5942 } else { 5943 switch (platform_link_width) { 5944 case PCIE_LNK_X32: 5945 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5946 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5947 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5948 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5949 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5950 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5951 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5952 break; 5953 case PCIE_LNK_X16: 5954 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5955 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5956 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5957 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5958 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5959 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5960 break; 5961 case PCIE_LNK_X12: 5962 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5963 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5964 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5965 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5966 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5967 break; 5968 case PCIE_LNK_X8: 5969 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5970 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5971 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5972 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5973 break; 5974 case PCIE_LNK_X4: 5975 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5976 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5977 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5978 break; 5979 case PCIE_LNK_X2: 5980 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5981 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5982 break; 5983 case PCIE_LNK_X1: 5984 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5985 break; 5986 default: 5987 break; 5988 } 5989 } 5990 } 5991 } 5992 5993 /** 5994 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5995 * 5996 * @adev: amdgpu_device pointer 5997 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5998 * 5999 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6000 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6001 * @peer_adev. 6002 */ 6003 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6004 struct amdgpu_device *peer_adev) 6005 { 6006 #ifdef CONFIG_HSA_AMD_P2P 6007 uint64_t address_mask = peer_adev->dev->dma_mask ? 
6008 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6009 resource_size_t aper_limit = 6010 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6011 bool p2p_access = 6012 !adev->gmc.xgmi.connected_to_cpu && 6013 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6014 6015 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 6016 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 6017 !(adev->gmc.aper_base & address_mask || 6018 aper_limit & address_mask)); 6019 #else 6020 return false; 6021 #endif 6022 } 6023 6024 int amdgpu_device_baco_enter(struct drm_device *dev) 6025 { 6026 struct amdgpu_device *adev = drm_to_adev(dev); 6027 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6028 6029 if (!amdgpu_device_supports_baco(dev)) 6030 return -ENOTSUPP; 6031 6032 if (ras && adev->ras_enabled && 6033 adev->nbio.funcs->enable_doorbell_interrupt) 6034 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6035 6036 return amdgpu_dpm_baco_enter(adev); 6037 } 6038 6039 int amdgpu_device_baco_exit(struct drm_device *dev) 6040 { 6041 struct amdgpu_device *adev = drm_to_adev(dev); 6042 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6043 int ret = 0; 6044 6045 if (!amdgpu_device_supports_baco(dev)) 6046 return -ENOTSUPP; 6047 6048 ret = amdgpu_dpm_baco_exit(adev); 6049 if (ret) 6050 return ret; 6051 6052 if (ras && adev->ras_enabled && 6053 adev->nbio.funcs->enable_doorbell_interrupt) 6054 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6055 6056 if (amdgpu_passthrough(adev) && 6057 adev->nbio.funcs->clear_doorbell_interrupt) 6058 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6059 6060 return 0; 6061 } 6062 6063 /** 6064 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6065 * @pdev: PCI device struct 6066 * @state: PCI channel state 6067 * 6068 * Description: Called when a PCI error is detected. 6069 * 6070 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
6071 */ 6072 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6073 { 6074 struct drm_device *dev = pci_get_drvdata(pdev); 6075 struct amdgpu_device *adev = drm_to_adev(dev); 6076 int i; 6077 6078 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 6079 6080 if (adev->gmc.xgmi.num_physical_nodes > 1) { 6081 DRM_WARN("No support for XGMI hive yet..."); 6082 return PCI_ERS_RESULT_DISCONNECT; 6083 } 6084 6085 adev->pci_channel_state = state; 6086 6087 switch (state) { 6088 case pci_channel_io_normal: 6089 return PCI_ERS_RESULT_CAN_RECOVER; 6090 /* Fatal error, prepare for slot reset */ 6091 case pci_channel_io_frozen: 6092 /* 6093 * Locking adev->reset_domain->sem will prevent any external access 6094 * to GPU during PCI error recovery 6095 */ 6096 amdgpu_device_lock_reset_domain(adev->reset_domain); 6097 amdgpu_device_set_mp1_state(adev); 6098 6099 /* 6100 * Block any work scheduling as we do for regular GPU reset 6101 * for the duration of the recovery 6102 */ 6103 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6104 struct amdgpu_ring *ring = adev->rings[i]; 6105 6106 if (!amdgpu_ring_sched_ready(ring)) 6107 continue; 6108 6109 drm_sched_stop(&ring->sched, NULL); 6110 } 6111 atomic_inc(&adev->gpu_reset_counter); 6112 return PCI_ERS_RESULT_NEED_RESET; 6113 case pci_channel_io_perm_failure: 6114 /* Permanent error, prepare for device removal */ 6115 return PCI_ERS_RESULT_DISCONNECT; 6116 } 6117 6118 return PCI_ERS_RESULT_NEED_RESET; 6119 } 6120 6121 /** 6122 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6123 * @pdev: pointer to PCI device 6124 */ 6125 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6126 { 6127 6128 DRM_INFO("PCI error: mmio enabled callback!!\n"); 6129 6130 /* TODO - dump whatever for debugging purposes */ 6131 6132 /* This called only if amdgpu_pci_error_detected returns 6133 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6134 * works, no need to reset slot. 6135 */ 6136 6137 return PCI_ERS_RESULT_RECOVERED; 6138 } 6139 6140 /** 6141 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6142 * @pdev: PCI device struct 6143 * 6144 * Description: This routine is called by the pci error recovery 6145 * code after the PCI slot has been reset, just before we 6146 * should resume normal operations. 
6147 */ 6148 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6149 { 6150 struct drm_device *dev = pci_get_drvdata(pdev); 6151 struct amdgpu_device *adev = drm_to_adev(dev); 6152 int r, i; 6153 struct amdgpu_reset_context reset_context; 6154 u32 memsize; 6155 struct list_head device_list; 6156 struct amdgpu_hive_info *hive; 6157 int hive_ras_recovery = 0; 6158 struct amdgpu_ras *ras; 6159 6160 /* PCI error slot reset should be skipped During RAS recovery */ 6161 hive = amdgpu_get_xgmi_hive(adev); 6162 if (hive) { 6163 hive_ras_recovery = atomic_read(&hive->ras_recovery); 6164 amdgpu_put_xgmi_hive(hive); 6165 } 6166 ras = amdgpu_ras_get_context(adev); 6167 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) && 6168 ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) 6169 return PCI_ERS_RESULT_RECOVERED; 6170 6171 DRM_INFO("PCI error: slot reset callback!!\n"); 6172 6173 memset(&reset_context, 0, sizeof(reset_context)); 6174 6175 INIT_LIST_HEAD(&device_list); 6176 list_add_tail(&adev->reset_list, &device_list); 6177 6178 /* wait for asic to come out of reset */ 6179 msleep(500); 6180 6181 /* Restore PCI confspace */ 6182 amdgpu_device_load_pci_state(pdev); 6183 6184 /* confirm ASIC came out of reset */ 6185 for (i = 0; i < adev->usec_timeout; i++) { 6186 memsize = amdgpu_asic_get_config_memsize(adev); 6187 6188 if (memsize != 0xffffffff) 6189 break; 6190 udelay(1); 6191 } 6192 if (memsize == 0xffffffff) { 6193 r = -ETIME; 6194 goto out; 6195 } 6196 6197 reset_context.method = AMD_RESET_METHOD_NONE; 6198 reset_context.reset_req_dev = adev; 6199 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6200 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6201 6202 adev->no_hw_access = true; 6203 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 6204 adev->no_hw_access = false; 6205 if (r) 6206 goto out; 6207 6208 r = amdgpu_do_asic_reset(&device_list, &reset_context); 6209 6210 out: 6211 if (!r) { 6212 if (amdgpu_device_cache_pci_state(adev->pdev)) 6213 pci_restore_state(adev->pdev); 6214 6215 DRM_INFO("PCIe error recovery succeeded\n"); 6216 } else { 6217 DRM_ERROR("PCIe error recovery failed, err:%d", r); 6218 amdgpu_device_unset_mp1_state(adev); 6219 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6220 } 6221 6222 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6223 } 6224 6225 /** 6226 * amdgpu_pci_resume() - resume normal ops after PCI reset 6227 * @pdev: pointer to PCI device 6228 * 6229 * Called when the error recovery driver tells us that its 6230 * OK to resume normal operation. 
6231 */ 6232 void amdgpu_pci_resume(struct pci_dev *pdev) 6233 { 6234 struct drm_device *dev = pci_get_drvdata(pdev); 6235 struct amdgpu_device *adev = drm_to_adev(dev); 6236 int i; 6237 6238 6239 DRM_INFO("PCI error: resume callback!!\n"); 6240 6241 /* Only continue execution for the case of pci_channel_io_frozen */ 6242 if (adev->pci_channel_state != pci_channel_io_frozen) 6243 return; 6244 6245 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6246 struct amdgpu_ring *ring = adev->rings[i]; 6247 6248 if (!amdgpu_ring_sched_ready(ring)) 6249 continue; 6250 6251 drm_sched_start(&ring->sched, true); 6252 } 6253 6254 amdgpu_device_unset_mp1_state(adev); 6255 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6256 } 6257 6258 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 6259 { 6260 struct drm_device *dev = pci_get_drvdata(pdev); 6261 struct amdgpu_device *adev = drm_to_adev(dev); 6262 int r; 6263 6264 r = pci_save_state(pdev); 6265 if (!r) { 6266 kfree(adev->pci_state); 6267 6268 adev->pci_state = pci_store_saved_state(pdev); 6269 6270 if (!adev->pci_state) { 6271 DRM_ERROR("Failed to store PCI saved state"); 6272 return false; 6273 } 6274 } else { 6275 DRM_WARN("Failed to save PCI state, err:%d\n", r); 6276 return false; 6277 } 6278 6279 return true; 6280 } 6281 6282 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 6283 { 6284 struct drm_device *dev = pci_get_drvdata(pdev); 6285 struct amdgpu_device *adev = drm_to_adev(dev); 6286 int r; 6287 6288 if (!adev->pci_state) 6289 return false; 6290 6291 r = pci_load_saved_state(pdev, adev->pci_state); 6292 6293 if (!r) { 6294 pci_restore_state(pdev); 6295 } else { 6296 DRM_WARN("Failed to load PCI state, err:%d\n", r); 6297 return false; 6298 } 6299 6300 return true; 6301 } 6302 6303 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 6304 struct amdgpu_ring *ring) 6305 { 6306 #ifdef CONFIG_X86_64 6307 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6308 return; 6309 #endif 6310 if (adev->gmc.xgmi.connected_to_cpu) 6311 return; 6312 6313 if (ring && ring->funcs->emit_hdp_flush) 6314 amdgpu_ring_emit_hdp_flush(ring); 6315 else 6316 amdgpu_asic_flush_hdp(adev, ring); 6317 } 6318 6319 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 6320 struct amdgpu_ring *ring) 6321 { 6322 #ifdef CONFIG_X86_64 6323 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6324 return; 6325 #endif 6326 if (adev->gmc.xgmi.connected_to_cpu) 6327 return; 6328 6329 amdgpu_asic_invalidate_hdp(adev, ring); 6330 } 6331 6332 int amdgpu_in_reset(struct amdgpu_device *adev) 6333 { 6334 return atomic_read(&adev->reset_domain->in_gpu_reset); 6335 } 6336 6337 /** 6338 * amdgpu_device_halt() - bring hardware to some kind of halt state 6339 * 6340 * @adev: amdgpu_device pointer 6341 * 6342 * Bring hardware to some kind of halt state so that no one can touch it 6343 * any more. It will help to maintain error context when error occurred. 6344 * Compare to a simple hang, the system will keep stable at least for SSH 6345 * access. Then it should be trivial to inspect the hardware state and 6346 * see what's going on. Implemented as following: 6347 * 6348 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 6349 * clears all CPU mappings to device, disallows remappings through page faults 6350 * 2. amdgpu_irq_disable_all() disables all interrupts 6351 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 6352 * 4. set adev->no_hw_access to avoid potential crashes after setp 5 6353 * 5. 
amdgpu_device_unmap_mmio() clears all MMIO mappings 6354 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 6355 * flush any in flight DMA operations 6356 */ 6357 void amdgpu_device_halt(struct amdgpu_device *adev) 6358 { 6359 struct pci_dev *pdev = adev->pdev; 6360 struct drm_device *ddev = adev_to_drm(adev); 6361 6362 amdgpu_xcp_dev_unplug(adev); 6363 drm_dev_unplug(ddev); 6364 6365 amdgpu_irq_disable_all(adev); 6366 6367 amdgpu_fence_driver_hw_fini(adev); 6368 6369 adev->no_hw_access = true; 6370 6371 amdgpu_device_unmap_mmio(adev); 6372 6373 pci_disable_device(pdev); 6374 pci_wait_for_pending_transaction(pdev); 6375 } 6376 6377 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 6378 u32 reg) 6379 { 6380 unsigned long flags, address, data; 6381 u32 r; 6382 6383 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6384 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6385 6386 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6387 WREG32(address, reg * 4); 6388 (void)RREG32(address); 6389 r = RREG32(data); 6390 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6391 return r; 6392 } 6393 6394 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 6395 u32 reg, u32 v) 6396 { 6397 unsigned long flags, address, data; 6398 6399 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6400 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6401 6402 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6403 WREG32(address, reg * 4); 6404 (void)RREG32(address); 6405 WREG32(data, v); 6406 (void)RREG32(data); 6407 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6408 } 6409 6410 /** 6411 * amdgpu_device_switch_gang - switch to a new gang 6412 * @adev: amdgpu_device pointer 6413 * @gang: the gang to switch to 6414 * 6415 * Try to switch to a new gang. 6416 * Returns: NULL if we switched to the new gang or a reference to the current 6417 * gang leader. 
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
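
/*
 * Illustrative use of amdgpu_device_wait_on_rreg() (comment only, not built).
 * The register offset regFOO_STATUS, the mask FOO_STATUS__READY_MASK and the
 * instance number are hypothetical names used for the sake of the example;
 * real callers pass offsets and masks generated from the IP-specific headers.
 *
 *	r = amdgpu_device_wait_on_rreg(adev, 0, regFOO_STATUS, "FOO_STATUS",
 *				       FOO_STATUS__READY_MASK,
 *				       FOO_STATUS__READY_MASK);
 *	if (r)
 *		return r;	// register never reached the expected value
 *
 * Note that the helper restarts its timeout whenever the polled value
 * changes, so only a register that has settled at the wrong value makes
 * it return -ETIMEDOUT.
 */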
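
/*
 * Sketch of the amdgpu_device_switch_gang() contract from a caller's point
 * of view (hypothetical snippet, shown only to clarify the return value):
 *
 *	struct dma_fence *old;
 *
 *	old = amdgpu_device_switch_gang(adev, new_gang);
 *	if (old) {
 *		// the previous gang leader has not signaled yet; treat it as
 *		// a dependency (or wait on it) and retry the switch later
 *		dma_fence_wait(old, false);
 *		dma_fence_put(old);
 *	}
 *
 * A NULL return means adev->gang_submit now refers to new_gang (or already
 * did), so submissions for the new gang may proceed.
 */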
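
/*
 * For reference: amdgpu_pci_error_detected(), amdgpu_pci_mmio_enabled(),
 * amdgpu_pci_slot_reset() and amdgpu_pci_resume() above are intended to be
 * wired into the PCI core through a struct pci_error_handlers attached to
 * the pci_driver. The field names come from include/linux/pci.h; the
 * instance name below is illustrative:
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};
 */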