/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

static const
struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		   amdgpu_device_get_pcie_replay_count, NULL);

static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
	 AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for giving board related information.
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);


/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported)
 * otherwise return 0.
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}
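
/*
 * Illustrative sketch only (not part of the driver): one way a caller might
 * combine the helpers above to pick a runtime power-management strategy.
 * The pm_mode variable and its string labels are hypothetical.
 *
 *	if (amdgpu_device_supports_px(ddev))
 *		pm_mode = "ATPX";	// legacy ATPX dGPU power control
 *	else if (amdgpu_device_supports_boco(ddev))
 *		pm_mode = "BOCO";	// ACPI power resources (PR3/hybrid)
 *	else if (amdgpu_device_supports_baco(ddev) > 0)
 *		pm_mode = "BACO";	// bus active, chip off
 *	else
 *		pm_mode = "none";
 */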

/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer pointed to by @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer pointed to by @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer pointed to by @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM index/data to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}
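
/*
 * Illustrative sketch only: a typical in-driver use of the VRAM helpers above
 * is to copy a small block out of VRAM into a kernel buffer, letting
 * amdgpu_device_vram_access() take the CPU-visible aperture path first and
 * fall back to MM_INDEX/MM_DATA for anything beyond the visible window.
 * vram_offset is a placeholder for a valid, 4-byte aligned VRAM offset.
 *
 *	u32 data[4];
 *
 *	amdgpu_device_vram_access(adev, vram_offset, data, sizeof(data), false);
 */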

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, xcc_id);
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}
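
/*
 * Illustrative sketch only: most callers go through the RREG32()/WREG32()
 * macro family, which funnels into amdgpu_device_rreg()/amdgpu_device_wreg().
 * A read-modify-write that must bypass the KIQ path would look roughly like
 * this (some_bit_mask is a placeholder):
 *
 *	u32 tmp;
 *
 *	tmp = amdgpu_device_rreg(adev, reg, AMDGPU_REGS_NO_KIQ);
 *	tmp |= some_bit_mask;
 *	amdgpu_device_wreg(adev, reg, tmp, AMDGPU_REGS_NO_KIQ);
 */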

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for the debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
721 */ 722 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 723 uint32_t reg, uint32_t v, 724 uint32_t acc_flags, uint32_t xcc_id) 725 { 726 uint32_t rlcg_flag; 727 728 if (amdgpu_device_skip_hw_access(adev)) 729 return; 730 731 if ((reg * 4) < adev->rmmio_size) { 732 if (amdgpu_sriov_vf(adev) && 733 !amdgpu_sriov_runtime(adev) && 734 adev->gfx.rlc.rlcg_reg_access_supported && 735 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 736 GC_HWIP, true, 737 &rlcg_flag)) { 738 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, xcc_id); 739 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 740 amdgpu_sriov_runtime(adev) && 741 down_read_trylock(&adev->reset_domain->sem)) { 742 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 743 up_read(&adev->reset_domain->sem); 744 } else { 745 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 746 } 747 } else { 748 adev->pcie_wreg(adev, reg * 4, v); 749 } 750 } 751 752 /** 753 * amdgpu_device_indirect_rreg - read an indirect register 754 * 755 * @adev: amdgpu_device pointer 756 * @reg_addr: indirect register address to read from 757 * 758 * Returns the value of indirect register @reg_addr 759 */ 760 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 761 u32 reg_addr) 762 { 763 unsigned long flags, pcie_index, pcie_data; 764 void __iomem *pcie_index_offset; 765 void __iomem *pcie_data_offset; 766 u32 r; 767 768 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 769 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 770 771 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 772 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 773 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 774 775 writel(reg_addr, pcie_index_offset); 776 readl(pcie_index_offset); 777 r = readl(pcie_data_offset); 778 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 779 780 return r; 781 } 782 783 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 784 u64 reg_addr) 785 { 786 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 787 u32 r; 788 void __iomem *pcie_index_offset; 789 void __iomem *pcie_index_hi_offset; 790 void __iomem *pcie_data_offset; 791 792 if (unlikely(!adev->nbio.funcs)) { 793 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 794 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 795 } else { 796 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 797 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 798 } 799 800 if (reg_addr >> 32) { 801 if (unlikely(!adev->nbio.funcs)) 802 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 803 else 804 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 805 } else { 806 pcie_index_hi = 0; 807 } 808 809 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 810 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 811 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 812 if (pcie_index_hi != 0) 813 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 814 pcie_index_hi * 4; 815 816 writel(reg_addr, pcie_index_offset); 817 readl(pcie_index_offset); 818 if (pcie_index_hi != 0) { 819 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 820 readl(pcie_index_hi_offset); 821 } 822 r = readl(pcie_data_offset); 823 824 /* clear the high bits */ 825 if (pcie_index_hi != 0) { 826 writel(0, pcie_index_hi_offset); 827 readl(pcie_index_hi_offset); 828 } 829 830 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 831 832 return r; 833 } 834 835 /** 836 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 837 * 838 * @adev: amdgpu_device 
pointer 839 * @reg_addr: indirect register address to read from 840 * 841 * Returns the value of indirect register @reg_addr 842 */ 843 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 844 u32 reg_addr) 845 { 846 unsigned long flags, pcie_index, pcie_data; 847 void __iomem *pcie_index_offset; 848 void __iomem *pcie_data_offset; 849 u64 r; 850 851 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 852 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 853 854 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 855 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 856 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 857 858 /* read low 32 bits */ 859 writel(reg_addr, pcie_index_offset); 860 readl(pcie_index_offset); 861 r = readl(pcie_data_offset); 862 /* read high 32 bits */ 863 writel(reg_addr + 4, pcie_index_offset); 864 readl(pcie_index_offset); 865 r |= ((u64)readl(pcie_data_offset) << 32); 866 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 867 868 return r; 869 } 870 871 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 872 u64 reg_addr) 873 { 874 unsigned long flags, pcie_index, pcie_data; 875 unsigned long pcie_index_hi = 0; 876 void __iomem *pcie_index_offset; 877 void __iomem *pcie_index_hi_offset; 878 void __iomem *pcie_data_offset; 879 u64 r; 880 881 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 882 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 883 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 884 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 885 886 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 887 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 888 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 889 if (pcie_index_hi != 0) 890 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 891 pcie_index_hi * 4; 892 893 /* read low 32 bits */ 894 writel(reg_addr, pcie_index_offset); 895 readl(pcie_index_offset); 896 if (pcie_index_hi != 0) { 897 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 898 readl(pcie_index_hi_offset); 899 } 900 r = readl(pcie_data_offset); 901 /* read high 32 bits */ 902 writel(reg_addr + 4, pcie_index_offset); 903 readl(pcie_index_offset); 904 if (pcie_index_hi != 0) { 905 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 906 readl(pcie_index_hi_offset); 907 } 908 r |= ((u64)readl(pcie_data_offset) << 32); 909 910 /* clear the high bits */ 911 if (pcie_index_hi != 0) { 912 writel(0, pcie_index_hi_offset); 913 readl(pcie_index_hi_offset); 914 } 915 916 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 917 918 return r; 919 } 920 921 /** 922 * amdgpu_device_indirect_wreg - write an indirect register address 923 * 924 * @adev: amdgpu_device pointer 925 * @reg_addr: indirect register offset 926 * @reg_data: indirect register data 927 * 928 */ 929 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 930 u32 reg_addr, u32 reg_data) 931 { 932 unsigned long flags, pcie_index, pcie_data; 933 void __iomem *pcie_index_offset; 934 void __iomem *pcie_data_offset; 935 936 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 937 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 938 939 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 940 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 941 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 942 943 writel(reg_addr, pcie_index_offset); 944 readl(pcie_index_offset); 945 
writel(reg_data, pcie_data_offset); 946 readl(pcie_data_offset); 947 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 948 } 949 950 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 951 u64 reg_addr, u32 reg_data) 952 { 953 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 954 void __iomem *pcie_index_offset; 955 void __iomem *pcie_index_hi_offset; 956 void __iomem *pcie_data_offset; 957 958 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 959 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 960 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 961 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 962 else 963 pcie_index_hi = 0; 964 965 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 966 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 967 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 968 if (pcie_index_hi != 0) 969 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 970 pcie_index_hi * 4; 971 972 writel(reg_addr, pcie_index_offset); 973 readl(pcie_index_offset); 974 if (pcie_index_hi != 0) { 975 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 976 readl(pcie_index_hi_offset); 977 } 978 writel(reg_data, pcie_data_offset); 979 readl(pcie_data_offset); 980 981 /* clear the high bits */ 982 if (pcie_index_hi != 0) { 983 writel(0, pcie_index_hi_offset); 984 readl(pcie_index_hi_offset); 985 } 986 987 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 988 } 989 990 /** 991 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 992 * 993 * @adev: amdgpu_device pointer 994 * @reg_addr: indirect register offset 995 * @reg_data: indirect register data 996 * 997 */ 998 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 999 u32 reg_addr, u64 reg_data) 1000 { 1001 unsigned long flags, pcie_index, pcie_data; 1002 void __iomem *pcie_index_offset; 1003 void __iomem *pcie_data_offset; 1004 1005 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1006 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1007 1008 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1009 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1010 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1011 1012 /* write low 32 bits */ 1013 writel(reg_addr, pcie_index_offset); 1014 readl(pcie_index_offset); 1015 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1016 readl(pcie_data_offset); 1017 /* write high 32 bits */ 1018 writel(reg_addr + 4, pcie_index_offset); 1019 readl(pcie_index_offset); 1020 writel((u32)(reg_data >> 32), pcie_data_offset); 1021 readl(pcie_data_offset); 1022 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1023 } 1024 1025 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1026 u64 reg_addr, u64 reg_data) 1027 { 1028 unsigned long flags, pcie_index, pcie_data; 1029 unsigned long pcie_index_hi = 0; 1030 void __iomem *pcie_index_offset; 1031 void __iomem *pcie_index_hi_offset; 1032 void __iomem *pcie_data_offset; 1033 1034 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1035 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1036 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1037 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1038 1039 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1040 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1041 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1042 if 
(pcie_index_hi != 0) 1043 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1044 pcie_index_hi * 4; 1045 1046 /* write low 32 bits */ 1047 writel(reg_addr, pcie_index_offset); 1048 readl(pcie_index_offset); 1049 if (pcie_index_hi != 0) { 1050 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1051 readl(pcie_index_hi_offset); 1052 } 1053 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1054 readl(pcie_data_offset); 1055 /* write high 32 bits */ 1056 writel(reg_addr + 4, pcie_index_offset); 1057 readl(pcie_index_offset); 1058 if (pcie_index_hi != 0) { 1059 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1060 readl(pcie_index_hi_offset); 1061 } 1062 writel((u32)(reg_data >> 32), pcie_data_offset); 1063 readl(pcie_data_offset); 1064 1065 /* clear the high bits */ 1066 if (pcie_index_hi != 0) { 1067 writel(0, pcie_index_hi_offset); 1068 readl(pcie_index_hi_offset); 1069 } 1070 1071 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1072 } 1073 1074 /** 1075 * amdgpu_device_get_rev_id - query device rev_id 1076 * 1077 * @adev: amdgpu_device pointer 1078 * 1079 * Return device rev_id 1080 */ 1081 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1082 { 1083 return adev->nbio.funcs->get_rev_id(adev); 1084 } 1085 1086 /** 1087 * amdgpu_invalid_rreg - dummy reg read function 1088 * 1089 * @adev: amdgpu_device pointer 1090 * @reg: offset of register 1091 * 1092 * Dummy register read function. Used for register blocks 1093 * that certain asics don't have (all asics). 1094 * Returns the value in the register. 1095 */ 1096 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1097 { 1098 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 1099 BUG(); 1100 return 0; 1101 } 1102 1103 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1104 { 1105 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1106 BUG(); 1107 return 0; 1108 } 1109 1110 /** 1111 * amdgpu_invalid_wreg - dummy reg write function 1112 * 1113 * @adev: amdgpu_device pointer 1114 * @reg: offset of register 1115 * @v: value to write to the register 1116 * 1117 * Dummy register read function. Used for register blocks 1118 * that certain asics don't have (all asics). 1119 */ 1120 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1121 { 1122 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 1123 reg, v); 1124 BUG(); 1125 } 1126 1127 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1128 { 1129 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 1130 reg, v); 1131 BUG(); 1132 } 1133 1134 /** 1135 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1136 * 1137 * @adev: amdgpu_device pointer 1138 * @reg: offset of register 1139 * 1140 * Dummy register read function. Used for register blocks 1141 * that certain asics don't have (all asics). 1142 * Returns the value in the register. 
1143 */ 1144 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 1145 { 1146 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 1147 BUG(); 1148 return 0; 1149 } 1150 1151 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) 1152 { 1153 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1154 BUG(); 1155 return 0; 1156 } 1157 1158 /** 1159 * amdgpu_invalid_wreg64 - dummy reg write function 1160 * 1161 * @adev: amdgpu_device pointer 1162 * @reg: offset of register 1163 * @v: value to write to the register 1164 * 1165 * Dummy register read function. Used for register blocks 1166 * that certain asics don't have (all asics). 1167 */ 1168 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 1169 { 1170 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 1171 reg, v); 1172 BUG(); 1173 } 1174 1175 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) 1176 { 1177 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", 1178 reg, v); 1179 BUG(); 1180 } 1181 1182 /** 1183 * amdgpu_block_invalid_rreg - dummy reg read function 1184 * 1185 * @adev: amdgpu_device pointer 1186 * @block: offset of instance 1187 * @reg: offset of register 1188 * 1189 * Dummy register read function. Used for register blocks 1190 * that certain asics don't have (all asics). 1191 * Returns the value in the register. 1192 */ 1193 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1194 uint32_t block, uint32_t reg) 1195 { 1196 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 1197 reg, block); 1198 BUG(); 1199 return 0; 1200 } 1201 1202 /** 1203 * amdgpu_block_invalid_wreg - dummy reg write function 1204 * 1205 * @adev: amdgpu_device pointer 1206 * @block: offset of instance 1207 * @reg: offset of register 1208 * @v: value to write to the register 1209 * 1210 * Dummy register read function. Used for register blocks 1211 * that certain asics don't have (all asics). 1212 */ 1213 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1214 uint32_t block, 1215 uint32_t reg, uint32_t v) 1216 { 1217 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1218 reg, block, v); 1219 BUG(); 1220 } 1221 1222 /** 1223 * amdgpu_device_asic_init - Wrapper for atom asic_init 1224 * 1225 * @adev: amdgpu_device pointer 1226 * 1227 * Does any asic specific work and then calls atom asic init. 1228 */ 1229 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1230 { 1231 int ret; 1232 1233 amdgpu_asic_pre_asic_init(adev); 1234 1235 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1236 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1237 amdgpu_psp_wait_for_bootloader(adev); 1238 ret = amdgpu_atomfirmware_asic_init(adev, true); 1239 return ret; 1240 } else { 1241 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1242 } 1243 1244 return 0; 1245 } 1246 1247 /** 1248 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1249 * 1250 * @adev: amdgpu_device pointer 1251 * 1252 * Allocates a scratch page of VRAM for use by various things in the 1253 * driver. 
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
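
/*
 * Illustrative sketch only: golden register tables passed to
 * amdgpu_device_program_register_sequence() are flat arrays of
 * {register, AND mask, OR mask} triplets. The offsets and values below are
 * made up for illustration; the real tables live in the per-ASIC files.
 *
 *	static const u32 example_golden_settings[] = {
 *		0x1234, 0xffffffff, 0x00000001,	// AND mask of ~0 overwrites the register
 *		0x5678, 0x0000000f, 0x00000002,	// clear the masked field, then OR in the value
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *						ARRAY_SIZE(example_golden_settings));
 */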

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}
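
/*
 * Illustrative sketch only: ring and fence code reserves a writeback slot,
 * derives the CPU and GPU addresses from the returned dword index, and
 * releases the slot again on teardown. Roughly:
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		volatile u32 *cpu_ptr = &adev->wb.wb[wb];
 *		u64 gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *
 *		// ... hand gpu_addr to the engine, poll *cpu_ptr from the CPU ...
 *
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */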

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		DRM_WARN("System can't access extended configuration space, please check!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return false;

	return true;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or if a post is needed because a hw reset was performed.
 * Returns true if a post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still needs the driver to do a vPost, otherwise the gpu
		 * hangs, while smc fw versions above 22.15 don't have this flaw, so we
		 * force vPost to be executed for smc versions below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
			  amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPUs change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
		return false;
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
						 bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory; a page is 4KB, so we have 12 bits of offset, a minimum of
 * 9 bits in the page table, and the remaining bits in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}
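
/*
 * Worked example for the block-size math above (illustration only): with 4KB
 * pages the low 12 bits of an address are the page offset,
 * amdgpu_vm_block_size bits select a PTE within one page-table block, and the
 * remaining bits index the page directory. The minimum block size of 9 thus
 * makes one page-table block cover 2^9 pages * 4KB = 2MB of address space.
 */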

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
1900 */ 1901 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1902 enum vga_switcheroo_state state) 1903 { 1904 struct drm_device *dev = pci_get_drvdata(pdev); 1905 int r; 1906 1907 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1908 return; 1909 1910 if (state == VGA_SWITCHEROO_ON) { 1911 pr_info("switched on\n"); 1912 /* don't suspend or resume card normally */ 1913 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1914 1915 pci_set_power_state(pdev, PCI_D0); 1916 amdgpu_device_load_pci_state(pdev); 1917 r = pci_enable_device(pdev); 1918 if (r) 1919 DRM_WARN("pci_enable_device failed (%d)\n", r); 1920 amdgpu_device_resume(dev, true); 1921 1922 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1923 } else { 1924 pr_info("switched off\n"); 1925 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1926 amdgpu_device_prepare(dev); 1927 amdgpu_device_suspend(dev, true); 1928 amdgpu_device_cache_pci_state(pdev); 1929 /* Shut down the device */ 1930 pci_disable_device(pdev); 1931 pci_set_power_state(pdev, PCI_D3cold); 1932 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1933 } 1934 } 1935 1936 /** 1937 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1938 * 1939 * @pdev: pci dev pointer 1940 * 1941 * Callback for the switcheroo driver. Check of the switcheroo 1942 * state can be changed. 1943 * Returns true if the state can be changed, false if not. 1944 */ 1945 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1946 { 1947 struct drm_device *dev = pci_get_drvdata(pdev); 1948 1949 /* 1950 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1951 * locking inversion with the driver load path. And the access here is 1952 * completely racy anyway. So don't bother with locking for now. 1953 */ 1954 return atomic_read(&dev->open_count) == 0; 1955 } 1956 1957 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1958 .set_gpu_state = amdgpu_switcheroo_set_state, 1959 .reprobe = NULL, 1960 .can_switch = amdgpu_switcheroo_can_switch, 1961 }; 1962 1963 /** 1964 * amdgpu_device_ip_set_clockgating_state - set the CG state 1965 * 1966 * @dev: amdgpu_device pointer 1967 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1968 * @state: clockgating state (gate or ungate) 1969 * 1970 * Sets the requested clockgating state for all instances of 1971 * the hardware IP specified. 1972 * Returns the error code from the last instance. 1973 */ 1974 int amdgpu_device_ip_set_clockgating_state(void *dev, 1975 enum amd_ip_block_type block_type, 1976 enum amd_clockgating_state state) 1977 { 1978 struct amdgpu_device *adev = dev; 1979 int i, r = 0; 1980 1981 for (i = 0; i < adev->num_ip_blocks; i++) { 1982 if (!adev->ip_blocks[i].status.valid) 1983 continue; 1984 if (adev->ip_blocks[i].version->type != block_type) 1985 continue; 1986 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1987 continue; 1988 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1989 (void *)adev, state); 1990 if (r) 1991 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1992 adev->ip_blocks[i].version->funcs->name, r); 1993 } 1994 return r; 1995 } 1996 1997 /** 1998 * amdgpu_device_ip_set_powergating_state - set the PG state 1999 * 2000 * @dev: amdgpu_device pointer 2001 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2002 * @state: powergating state (gate or ungate) 2003 * 2004 * Sets the requested powergating state for all instances of 2005 * the hardware IP specified. 
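 *
 * A typical call, e.g. gating VCN power from a power-management path
 * (illustrative sketch only):
 *
 *   r = amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *                                              AMD_PG_STATE_GATE);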
2006 * Returns the error code from the last instance. 2007 */ 2008 int amdgpu_device_ip_set_powergating_state(void *dev, 2009 enum amd_ip_block_type block_type, 2010 enum amd_powergating_state state) 2011 { 2012 struct amdgpu_device *adev = dev; 2013 int i, r = 0; 2014 2015 for (i = 0; i < adev->num_ip_blocks; i++) { 2016 if (!adev->ip_blocks[i].status.valid) 2017 continue; 2018 if (adev->ip_blocks[i].version->type != block_type) 2019 continue; 2020 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2021 continue; 2022 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2023 (void *)adev, state); 2024 if (r) 2025 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2026 adev->ip_blocks[i].version->funcs->name, r); 2027 } 2028 return r; 2029 } 2030 2031 /** 2032 * amdgpu_device_ip_get_clockgating_state - get the CG state 2033 * 2034 * @adev: amdgpu_device pointer 2035 * @flags: clockgating feature flags 2036 * 2037 * Walks the list of IPs on the device and updates the clockgating 2038 * flags for each IP. 2039 * Updates @flags with the feature flags for each hardware IP where 2040 * clockgating is enabled. 2041 */ 2042 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2043 u64 *flags) 2044 { 2045 int i; 2046 2047 for (i = 0; i < adev->num_ip_blocks; i++) { 2048 if (!adev->ip_blocks[i].status.valid) 2049 continue; 2050 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2051 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 2052 } 2053 } 2054 2055 /** 2056 * amdgpu_device_ip_wait_for_idle - wait for idle 2057 * 2058 * @adev: amdgpu_device pointer 2059 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2060 * 2061 * Waits for the request hardware IP to be idle. 2062 * Returns 0 for success or a negative error code on failure. 2063 */ 2064 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2065 enum amd_ip_block_type block_type) 2066 { 2067 int i, r; 2068 2069 for (i = 0; i < adev->num_ip_blocks; i++) { 2070 if (!adev->ip_blocks[i].status.valid) 2071 continue; 2072 if (adev->ip_blocks[i].version->type == block_type) { 2073 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 2074 if (r) 2075 return r; 2076 break; 2077 } 2078 } 2079 return 0; 2080 2081 } 2082 2083 /** 2084 * amdgpu_device_ip_is_idle - is the hardware IP idle 2085 * 2086 * @adev: amdgpu_device pointer 2087 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2088 * 2089 * Check if the hardware IP is idle or not. 2090 * Returns true if it the IP is idle, false if not. 2091 */ 2092 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 2093 enum amd_ip_block_type block_type) 2094 { 2095 int i; 2096 2097 for (i = 0; i < adev->num_ip_blocks; i++) { 2098 if (!adev->ip_blocks[i].status.valid) 2099 continue; 2100 if (adev->ip_blocks[i].version->type == block_type) 2101 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 2102 } 2103 return true; 2104 2105 } 2106 2107 /** 2108 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2109 * 2110 * @adev: amdgpu_device pointer 2111 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2112 * 2113 * Returns a pointer to the hardware IP block structure 2114 * if it exists for the asic, otherwise NULL. 
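 *
 * Example lookup (sketch):
 *
 *   struct amdgpu_ip_block *ip =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 *   if (ip)
 *           DRM_INFO("GFX IP v%u.%u\n", ip->version->major,
 *                    ip->version->minor);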
2115 */ 2116 struct amdgpu_ip_block * 2117 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2118 enum amd_ip_block_type type) 2119 { 2120 int i; 2121 2122 for (i = 0; i < adev->num_ip_blocks; i++) 2123 if (adev->ip_blocks[i].version->type == type) 2124 return &adev->ip_blocks[i]; 2125 2126 return NULL; 2127 } 2128 2129 /** 2130 * amdgpu_device_ip_block_version_cmp 2131 * 2132 * @adev: amdgpu_device pointer 2133 * @type: enum amd_ip_block_type 2134 * @major: major version 2135 * @minor: minor version 2136 * 2137 * return 0 if equal or greater 2138 * return 1 if smaller or the ip_block doesn't exist 2139 */ 2140 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2141 enum amd_ip_block_type type, 2142 u32 major, u32 minor) 2143 { 2144 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2145 2146 if (ip_block && ((ip_block->version->major > major) || 2147 ((ip_block->version->major == major) && 2148 (ip_block->version->minor >= minor)))) 2149 return 0; 2150 2151 return 1; 2152 } 2153 2154 /** 2155 * amdgpu_device_ip_block_add 2156 * 2157 * @adev: amdgpu_device pointer 2158 * @ip_block_version: pointer to the IP to add 2159 * 2160 * Adds the IP block driver information to the collection of IPs 2161 * on the asic. 2162 */ 2163 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2164 const struct amdgpu_ip_block_version *ip_block_version) 2165 { 2166 if (!ip_block_version) 2167 return -EINVAL; 2168 2169 switch (ip_block_version->type) { 2170 case AMD_IP_BLOCK_TYPE_VCN: 2171 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2172 return 0; 2173 break; 2174 case AMD_IP_BLOCK_TYPE_JPEG: 2175 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2176 return 0; 2177 break; 2178 default: 2179 break; 2180 } 2181 2182 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 2183 ip_block_version->funcs->name); 2184 2185 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2186 2187 return 0; 2188 } 2189 2190 /** 2191 * amdgpu_device_enable_virtual_display - enable virtual display feature 2192 * 2193 * @adev: amdgpu_device pointer 2194 * 2195 * Enabled the virtual display feature if the user has enabled it via 2196 * the module parameter virtual_display. This feature provides a virtual 2197 * display hardware on headless boards or in virtualized environments. 2198 * This function parses and validates the configuration string specified by 2199 * the user and configues the virtual display configuration (number of 2200 * virtual connectors, crtcs, etc.) specified. 
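 *
 * The string is a semicolon-separated list of <pci address>[,<num crtcs>]
 * entries, with "all" matching every device, e.g. (addresses are examples,
 * num_crtc is clamped to 1-6):
 *
 *   amdgpu.virtual_display=0000:01:00.0,2   (two virtual crtcs on that GPU)
 *   amdgpu.virtual_display=all,1            (one virtual crtc on every GPU)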
2201 */ 2202 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2203 { 2204 adev->enable_virtual_display = false; 2205 2206 if (amdgpu_virtual_display) { 2207 const char *pci_address_name = pci_name(adev->pdev); 2208 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2209 2210 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2211 pciaddstr_tmp = pciaddstr; 2212 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2213 pciaddname = strsep(&pciaddname_tmp, ","); 2214 if (!strcmp("all", pciaddname) 2215 || !strcmp(pci_address_name, pciaddname)) { 2216 long num_crtc; 2217 int res = -1; 2218 2219 adev->enable_virtual_display = true; 2220 2221 if (pciaddname_tmp) 2222 res = kstrtol(pciaddname_tmp, 10, 2223 &num_crtc); 2224 2225 if (!res) { 2226 if (num_crtc < 1) 2227 num_crtc = 1; 2228 if (num_crtc > 6) 2229 num_crtc = 6; 2230 adev->mode_info.num_crtc = num_crtc; 2231 } else { 2232 adev->mode_info.num_crtc = 1; 2233 } 2234 break; 2235 } 2236 } 2237 2238 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2239 amdgpu_virtual_display, pci_address_name, 2240 adev->enable_virtual_display, adev->mode_info.num_crtc); 2241 2242 kfree(pciaddstr); 2243 } 2244 } 2245 2246 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2247 { 2248 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2249 adev->mode_info.num_crtc = 1; 2250 adev->enable_virtual_display = true; 2251 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2252 adev->enable_virtual_display, adev->mode_info.num_crtc); 2253 } 2254 } 2255 2256 /** 2257 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2258 * 2259 * @adev: amdgpu_device pointer 2260 * 2261 * Parses the asic configuration parameters specified in the gpu info 2262 * firmware and makes them available to the driver for use in configuring 2263 * the asic. 2264 * Returns 0 on success, -EINVAL on failure.
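 *
 * The firmware file is selected by chip name, e.g. amdgpu/vega10_gpu_info.bin
 * or amdgpu/raven2_gpu_info.bin; ASICs that provide an IP discovery binary
 * skip this path entirely.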
2265 */ 2266 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2267 { 2268 const char *chip_name; 2269 char fw_name[40]; 2270 int err; 2271 const struct gpu_info_firmware_header_v1_0 *hdr; 2272 2273 adev->firmware.gpu_info_fw = NULL; 2274 2275 if (adev->mman.discovery_bin) 2276 return 0; 2277 2278 switch (adev->asic_type) { 2279 default: 2280 return 0; 2281 case CHIP_VEGA10: 2282 chip_name = "vega10"; 2283 break; 2284 case CHIP_VEGA12: 2285 chip_name = "vega12"; 2286 break; 2287 case CHIP_RAVEN: 2288 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2289 chip_name = "raven2"; 2290 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2291 chip_name = "picasso"; 2292 else 2293 chip_name = "raven"; 2294 break; 2295 case CHIP_ARCTURUS: 2296 chip_name = "arcturus"; 2297 break; 2298 case CHIP_NAVI12: 2299 chip_name = "navi12"; 2300 break; 2301 } 2302 2303 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2304 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 2305 if (err) { 2306 dev_err(adev->dev, 2307 "Failed to get gpu_info firmware \"%s\"\n", 2308 fw_name); 2309 goto out; 2310 } 2311 2312 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2313 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2314 2315 switch (hdr->version_major) { 2316 case 1: 2317 { 2318 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2319 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2320 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2321 2322 /* 2323 * Should be droped when DAL no longer needs it. 2324 */ 2325 if (adev->asic_type == CHIP_NAVI12) 2326 goto parse_soc_bounding_box; 2327 2328 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2329 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2330 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2331 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2332 adev->gfx.config.max_texture_channel_caches = 2333 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2334 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2335 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2336 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2337 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2338 adev->gfx.config.double_offchip_lds_buf = 2339 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2340 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2341 adev->gfx.cu_info.max_waves_per_simd = 2342 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2343 adev->gfx.cu_info.max_scratch_slots_per_cu = 2344 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2345 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2346 if (hdr->version_minor >= 1) { 2347 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2348 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2349 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2350 adev->gfx.config.num_sc_per_sh = 2351 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2352 adev->gfx.config.num_packer_per_sc = 2353 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2354 } 2355 2356 parse_soc_bounding_box: 2357 /* 2358 * soc bounding box info is not integrated in disocovery table, 2359 * we always need to parse it from gpu info firmware if needed. 
2360 */ 2361 if (hdr->version_minor == 2) { 2362 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2363 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2364 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2365 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2366 } 2367 break; 2368 } 2369 default: 2370 dev_err(adev->dev, 2371 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2372 err = -EINVAL; 2373 goto out; 2374 } 2375 out: 2376 return err; 2377 } 2378 2379 /** 2380 * amdgpu_device_ip_early_init - run early init for hardware IPs 2381 * 2382 * @adev: amdgpu_device pointer 2383 * 2384 * Early initialization pass for hardware IPs. The hardware IPs that make 2385 * up each asic are discovered each IP's early_init callback is run. This 2386 * is the first stage in initializing the asic. 2387 * Returns 0 on success, negative error code on failure. 2388 */ 2389 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2390 { 2391 struct pci_dev *parent; 2392 int i, r; 2393 bool total; 2394 2395 amdgpu_device_enable_virtual_display(adev); 2396 2397 if (amdgpu_sriov_vf(adev)) { 2398 r = amdgpu_virt_request_full_gpu(adev, true); 2399 if (r) 2400 return r; 2401 } 2402 2403 switch (adev->asic_type) { 2404 #ifdef CONFIG_DRM_AMDGPU_SI 2405 case CHIP_VERDE: 2406 case CHIP_TAHITI: 2407 case CHIP_PITCAIRN: 2408 case CHIP_OLAND: 2409 case CHIP_HAINAN: 2410 adev->family = AMDGPU_FAMILY_SI; 2411 r = si_set_ip_blocks(adev); 2412 if (r) 2413 return r; 2414 break; 2415 #endif 2416 #ifdef CONFIG_DRM_AMDGPU_CIK 2417 case CHIP_BONAIRE: 2418 case CHIP_HAWAII: 2419 case CHIP_KAVERI: 2420 case CHIP_KABINI: 2421 case CHIP_MULLINS: 2422 if (adev->flags & AMD_IS_APU) 2423 adev->family = AMDGPU_FAMILY_KV; 2424 else 2425 adev->family = AMDGPU_FAMILY_CI; 2426 2427 r = cik_set_ip_blocks(adev); 2428 if (r) 2429 return r; 2430 break; 2431 #endif 2432 case CHIP_TOPAZ: 2433 case CHIP_TONGA: 2434 case CHIP_FIJI: 2435 case CHIP_POLARIS10: 2436 case CHIP_POLARIS11: 2437 case CHIP_POLARIS12: 2438 case CHIP_VEGAM: 2439 case CHIP_CARRIZO: 2440 case CHIP_STONEY: 2441 if (adev->flags & AMD_IS_APU) 2442 adev->family = AMDGPU_FAMILY_CZ; 2443 else 2444 adev->family = AMDGPU_FAMILY_VI; 2445 2446 r = vi_set_ip_blocks(adev); 2447 if (r) 2448 return r; 2449 break; 2450 default: 2451 r = amdgpu_discovery_set_ip_blocks(adev); 2452 if (r) 2453 return r; 2454 break; 2455 } 2456 2457 if (amdgpu_has_atpx() && 2458 (amdgpu_is_atpx_hybrid() || 2459 amdgpu_has_atpx_dgpu_power_cntl()) && 2460 ((adev->flags & AMD_IS_APU) == 0) && 2461 !dev_is_removable(&adev->pdev->dev)) 2462 adev->flags |= AMD_IS_PX; 2463 2464 if (!(adev->flags & AMD_IS_APU)) { 2465 parent = pcie_find_root_port(adev->pdev); 2466 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2467 } 2468 2469 2470 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2471 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2472 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2473 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2474 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2475 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2476 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2477 2478 total = true; 2479 for (i = 0; i < adev->num_ip_blocks; i++) { 2480 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2481 DRM_WARN("disabled ip block: %d <%s>\n", 2482 i, adev->ip_blocks[i].version->funcs->name); 2483 adev->ip_blocks[i].status.valid = false; 2484 } else { 2485 if (adev->ip_blocks[i].version->funcs->early_init) { 2486 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2487 if (r == -ENOENT) { 2488 adev->ip_blocks[i].status.valid = false; 2489 } else if (r) { 2490 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2491 adev->ip_blocks[i].version->funcs->name, r); 2492 total = false; 2493 } else { 2494 adev->ip_blocks[i].status.valid = true; 2495 } 2496 } else { 2497 adev->ip_blocks[i].status.valid = true; 2498 } 2499 } 2500 /* get the vbios after the asic_funcs are set up */ 2501 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2502 r = amdgpu_device_parse_gpu_info_fw(adev); 2503 if (r) 2504 return r; 2505 2506 /* Read BIOS */ 2507 if (amdgpu_device_read_bios(adev)) { 2508 if (!amdgpu_get_bios(adev)) 2509 return -EINVAL; 2510 2511 r = amdgpu_atombios_init(adev); 2512 if (r) { 2513 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2514 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2515 return r; 2516 } 2517 } 2518 2519 /*get pf2vf msg info at it's earliest time*/ 2520 if (amdgpu_sriov_vf(adev)) 2521 amdgpu_virt_init_data_exchange(adev); 2522 2523 } 2524 } 2525 if (!total) 2526 return -ENODEV; 2527 2528 amdgpu_amdkfd_device_probe(adev); 2529 adev->cg_flags &= amdgpu_cg_mask; 2530 adev->pg_flags &= amdgpu_pg_mask; 2531 2532 return 0; 2533 } 2534 2535 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2536 { 2537 int i, r; 2538 2539 for (i = 0; i < adev->num_ip_blocks; i++) { 2540 if (!adev->ip_blocks[i].status.sw) 2541 continue; 2542 if (adev->ip_blocks[i].status.hw) 2543 continue; 2544 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2545 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2546 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2547 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2548 if (r) { 2549 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2550 adev->ip_blocks[i].version->funcs->name, r); 2551 return r; 2552 } 2553 adev->ip_blocks[i].status.hw = true; 2554 } 2555 } 2556 2557 return 0; 2558 } 2559 2560 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2561 { 2562 int i, r; 2563 2564 for (i = 0; i < adev->num_ip_blocks; i++) { 2565 if (!adev->ip_blocks[i].status.sw) 2566 continue; 2567 if (adev->ip_blocks[i].status.hw) 2568 continue; 2569 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2570 if (r) { 2571 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2572 adev->ip_blocks[i].version->funcs->name, r); 2573 return r; 2574 } 2575 adev->ip_blocks[i].status.hw = true; 2576 } 2577 2578 return 0; 2579 } 2580 2581 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2582 { 2583 int r = 0; 2584 int i; 2585 uint32_t 
smu_version; 2586 2587 if (adev->asic_type >= CHIP_VEGA10) { 2588 for (i = 0; i < adev->num_ip_blocks; i++) { 2589 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2590 continue; 2591 2592 if (!adev->ip_blocks[i].status.sw) 2593 continue; 2594 2595 /* no need to do the fw loading again if already done*/ 2596 if (adev->ip_blocks[i].status.hw == true) 2597 break; 2598 2599 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2600 r = adev->ip_blocks[i].version->funcs->resume(adev); 2601 if (r) { 2602 DRM_ERROR("resume of IP block <%s> failed %d\n", 2603 adev->ip_blocks[i].version->funcs->name, r); 2604 return r; 2605 } 2606 } else { 2607 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2608 if (r) { 2609 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2610 adev->ip_blocks[i].version->funcs->name, r); 2611 return r; 2612 } 2613 } 2614 2615 adev->ip_blocks[i].status.hw = true; 2616 break; 2617 } 2618 } 2619 2620 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2621 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2622 2623 return r; 2624 } 2625 2626 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2627 { 2628 long timeout; 2629 int r, i; 2630 2631 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2632 struct amdgpu_ring *ring = adev->rings[i]; 2633 2634 /* No need to setup the GPU scheduler for rings that don't need it */ 2635 if (!ring || ring->no_scheduler) 2636 continue; 2637 2638 switch (ring->funcs->type) { 2639 case AMDGPU_RING_TYPE_GFX: 2640 timeout = adev->gfx_timeout; 2641 break; 2642 case AMDGPU_RING_TYPE_COMPUTE: 2643 timeout = adev->compute_timeout; 2644 break; 2645 case AMDGPU_RING_TYPE_SDMA: 2646 timeout = adev->sdma_timeout; 2647 break; 2648 default: 2649 timeout = adev->video_timeout; 2650 break; 2651 } 2652 2653 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, 2654 DRM_SCHED_PRIORITY_COUNT, 2655 ring->num_hw_submission, 0, 2656 timeout, adev->reset_domain->wq, 2657 ring->sched_score, ring->name, 2658 adev->dev); 2659 if (r) { 2660 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2661 ring->name); 2662 return r; 2663 } 2664 r = amdgpu_uvd_entity_init(adev, ring); 2665 if (r) { 2666 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2667 ring->name); 2668 return r; 2669 } 2670 r = amdgpu_vce_entity_init(adev, ring); 2671 if (r) { 2672 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2673 ring->name); 2674 return r; 2675 } 2676 } 2677 2678 amdgpu_xcp_update_partition_sched_list(adev); 2679 2680 return 0; 2681 } 2682 2683 2684 /** 2685 * amdgpu_device_ip_init - run init for hardware IPs 2686 * 2687 * @adev: amdgpu_device pointer 2688 * 2689 * Main initialization pass for hardware IPs. The list of all the hardware 2690 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2691 * are run. sw_init initializes the software state associated with each IP 2692 * and hw_init initializes the hardware associated with each IP. 2693 * Returns 0 on success, negative error code on failure. 
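 *
 * Each IP block exposes these hooks through its struct amd_ip_funcs; a
 * hypothetical block would look roughly like (sketch, not a real IP):
 *
 *   static const struct amd_ip_funcs foo_ip_funcs = {
 *           .name    = "foo_ip",
 *           .sw_init = foo_sw_init,   /* allocate software state */
 *           .hw_init = foo_hw_init,   /* bring up the hardware   */
 *           .hw_fini = foo_hw_fini,
 *           .sw_fini = foo_sw_fini,
 *   };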
2694 */ 2695 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2696 { 2697 int i, r; 2698 2699 r = amdgpu_ras_init(adev); 2700 if (r) 2701 return r; 2702 2703 for (i = 0; i < adev->num_ip_blocks; i++) { 2704 if (!adev->ip_blocks[i].status.valid) 2705 continue; 2706 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2707 if (r) { 2708 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2709 adev->ip_blocks[i].version->funcs->name, r); 2710 goto init_failed; 2711 } 2712 adev->ip_blocks[i].status.sw = true; 2713 2714 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2715 /* need to do common hw init early so everything is set up for gmc */ 2716 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2717 if (r) { 2718 DRM_ERROR("hw_init %d failed %d\n", i, r); 2719 goto init_failed; 2720 } 2721 adev->ip_blocks[i].status.hw = true; 2722 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2723 /* need to do gmc hw init early so we can allocate gpu mem */ 2724 /* Try to reserve bad pages early */ 2725 if (amdgpu_sriov_vf(adev)) 2726 amdgpu_virt_exchange_data(adev); 2727 2728 r = amdgpu_device_mem_scratch_init(adev); 2729 if (r) { 2730 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2731 goto init_failed; 2732 } 2733 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2734 if (r) { 2735 DRM_ERROR("hw_init %d failed %d\n", i, r); 2736 goto init_failed; 2737 } 2738 r = amdgpu_device_wb_init(adev); 2739 if (r) { 2740 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2741 goto init_failed; 2742 } 2743 adev->ip_blocks[i].status.hw = true; 2744 2745 /* right after GMC hw init, we create CSA */ 2746 if (adev->gfx.mcbp) { 2747 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2748 AMDGPU_GEM_DOMAIN_VRAM | 2749 AMDGPU_GEM_DOMAIN_GTT, 2750 AMDGPU_CSA_SIZE); 2751 if (r) { 2752 DRM_ERROR("allocate CSA failed %d\n", r); 2753 goto init_failed; 2754 } 2755 } 2756 2757 r = amdgpu_seq64_init(adev); 2758 if (r) { 2759 DRM_ERROR("allocate seq64 failed %d\n", r); 2760 goto init_failed; 2761 } 2762 } 2763 } 2764 2765 if (amdgpu_sriov_vf(adev)) 2766 amdgpu_virt_init_data_exchange(adev); 2767 2768 r = amdgpu_ib_pool_init(adev); 2769 if (r) { 2770 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2771 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2772 goto init_failed; 2773 } 2774 2775 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2776 if (r) 2777 goto init_failed; 2778 2779 r = amdgpu_device_ip_hw_init_phase1(adev); 2780 if (r) 2781 goto init_failed; 2782 2783 r = amdgpu_device_fw_loading(adev); 2784 if (r) 2785 goto init_failed; 2786 2787 r = amdgpu_device_ip_hw_init_phase2(adev); 2788 if (r) 2789 goto init_failed; 2790 2791 /* 2792 * retired pages will be loaded from eeprom and reserved here, 2793 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2794 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2795 * for I2C communication which only true at this point. 2796 * 2797 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2798 * failure from bad gpu situation and stop amdgpu init process 2799 * accordingly. For other failed cases, it will still release all 2800 * the resource and print error message, rather than returning one 2801 * negative value to upper level. 
2802 * 2803 * Note: theoretically, this should be called before all vram allocations 2804 * to protect retired page from abusing 2805 */ 2806 r = amdgpu_ras_recovery_init(adev); 2807 if (r) 2808 goto init_failed; 2809 2810 /** 2811 * In case of XGMI grab extra reference for reset domain for this device 2812 */ 2813 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2814 if (amdgpu_xgmi_add_device(adev) == 0) { 2815 if (!amdgpu_sriov_vf(adev)) { 2816 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2817 2818 if (WARN_ON(!hive)) { 2819 r = -ENOENT; 2820 goto init_failed; 2821 } 2822 2823 if (!hive->reset_domain || 2824 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2825 r = -ENOENT; 2826 amdgpu_put_xgmi_hive(hive); 2827 goto init_failed; 2828 } 2829 2830 /* Drop the early temporary reset domain we created for device */ 2831 amdgpu_reset_put_reset_domain(adev->reset_domain); 2832 adev->reset_domain = hive->reset_domain; 2833 amdgpu_put_xgmi_hive(hive); 2834 } 2835 } 2836 } 2837 2838 r = amdgpu_device_init_schedulers(adev); 2839 if (r) 2840 goto init_failed; 2841 2842 if (adev->mman.buffer_funcs_ring->sched.ready) 2843 amdgpu_ttm_set_buffer_funcs_status(adev, true); 2844 2845 /* Don't init kfd if whole hive need to be reset during init */ 2846 if (!adev->gmc.xgmi.pending_reset) { 2847 kgd2kfd_init_zone_device(adev); 2848 amdgpu_amdkfd_device_init(adev); 2849 } 2850 2851 amdgpu_fru_get_product_info(adev); 2852 2853 init_failed: 2854 2855 return r; 2856 } 2857 2858 /** 2859 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2860 * 2861 * @adev: amdgpu_device pointer 2862 * 2863 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2864 * this function before a GPU reset. If the value is retained after a 2865 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2866 */ 2867 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2868 { 2869 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2870 } 2871 2872 /** 2873 * amdgpu_device_check_vram_lost - check if vram is valid 2874 * 2875 * @adev: amdgpu_device pointer 2876 * 2877 * Checks the reset magic value written to the gart pointer in VRAM. 2878 * The driver calls this after a GPU reset to see if the contents of 2879 * VRAM is lost or now. 2880 * returns true if vram is lost, false if not. 2881 */ 2882 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2883 { 2884 if (memcmp(adev->gart.ptr, adev->reset_magic, 2885 AMDGPU_RESET_MAGIC_NUM)) 2886 return true; 2887 2888 if (!amdgpu_in_reset(adev)) 2889 return false; 2890 2891 /* 2892 * For all ASICs with baco/mode1 reset, the VRAM is 2893 * always assumed to be lost. 2894 */ 2895 switch (amdgpu_asic_reset_method(adev)) { 2896 case AMD_RESET_METHOD_BACO: 2897 case AMD_RESET_METHOD_MODE1: 2898 return true; 2899 default: 2900 return false; 2901 } 2902 } 2903 2904 /** 2905 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2906 * 2907 * @adev: amdgpu_device pointer 2908 * @state: clockgating state (gate or ungate) 2909 * 2910 * The list of all the hardware IPs that make up the asic is walked and the 2911 * set_clockgating_state callbacks are run. 2912 * Late initialization pass enabling clockgating for hardware IPs. 2913 * Fini or suspend, pass disabling clockgating for hardware IPs. 2914 * Returns 0 on success, negative error code on failure. 
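 *
 * For example, late init gates clockgating and the fini/suspend paths
 * ungate it again (as done elsewhere in this file):
 *
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);    /* late init    */
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);  /* fini/suspend */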
2915 */ 2916 2917 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2918 enum amd_clockgating_state state) 2919 { 2920 int i, j, r; 2921 2922 if (amdgpu_emu_mode == 1) 2923 return 0; 2924 2925 for (j = 0; j < adev->num_ip_blocks; j++) { 2926 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2927 if (!adev->ip_blocks[i].status.late_initialized) 2928 continue; 2929 /* skip CG for GFX, SDMA on S0ix */ 2930 if (adev->in_s0ix && 2931 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2932 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2933 continue; 2934 /* skip CG for VCE/UVD, it's handled specially */ 2935 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2936 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2937 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2938 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2939 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2940 /* enable clockgating to save power */ 2941 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2942 state); 2943 if (r) { 2944 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2945 adev->ip_blocks[i].version->funcs->name, r); 2946 return r; 2947 } 2948 } 2949 } 2950 2951 return 0; 2952 } 2953 2954 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2955 enum amd_powergating_state state) 2956 { 2957 int i, j, r; 2958 2959 if (amdgpu_emu_mode == 1) 2960 return 0; 2961 2962 for (j = 0; j < adev->num_ip_blocks; j++) { 2963 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2964 if (!adev->ip_blocks[i].status.late_initialized) 2965 continue; 2966 /* skip PG for GFX, SDMA on S0ix */ 2967 if (adev->in_s0ix && 2968 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2969 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2970 continue; 2971 /* skip CG for VCE/UVD, it's handled specially */ 2972 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2973 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2974 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2975 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2976 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2977 /* enable powergating to save power */ 2978 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2979 state); 2980 if (r) { 2981 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2982 adev->ip_blocks[i].version->funcs->name, r); 2983 return r; 2984 } 2985 } 2986 } 2987 return 0; 2988 } 2989 2990 static int amdgpu_device_enable_mgpu_fan_boost(void) 2991 { 2992 struct amdgpu_gpu_instance *gpu_ins; 2993 struct amdgpu_device *adev; 2994 int i, ret = 0; 2995 2996 mutex_lock(&mgpu_info.mutex); 2997 2998 /* 2999 * MGPU fan boost feature should be enabled 3000 * only when there are two or more dGPUs in 3001 * the system 3002 */ 3003 if (mgpu_info.num_dgpu < 2) 3004 goto out; 3005 3006 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3007 gpu_ins = &(mgpu_info.gpu_ins[i]); 3008 adev = gpu_ins->adev; 3009 if (!(adev->flags & AMD_IS_APU) && 3010 !gpu_ins->mgpu_fan_enabled) { 3011 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3012 if (ret) 3013 break; 3014 3015 gpu_ins->mgpu_fan_enabled = 1; 3016 } 3017 } 3018 3019 out: 3020 mutex_unlock(&mgpu_info.mutex); 3021 3022 return ret; 3023 } 3024 3025 /** 3026 * amdgpu_device_ip_late_init - run late init for hardware IPs 3027 * 3028 * @adev: 
amdgpu_device pointer 3029 * 3030 * Late initialization pass for hardware IPs. The list of all the hardware 3031 * IPs that make up the asic is walked and the late_init callbacks are run. 3032 * late_init covers any special initialization that an IP requires 3033 * after all of the IPs have been initialized or something that needs to happen 3034 * late in the init process. 3035 * Returns 0 on success, negative error code on failure. 3036 */ 3037 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3038 { 3039 struct amdgpu_gpu_instance *gpu_instance; 3040 int i = 0, r; 3041 3042 for (i = 0; i < adev->num_ip_blocks; i++) { 3043 if (!adev->ip_blocks[i].status.hw) 3044 continue; 3045 if (adev->ip_blocks[i].version->funcs->late_init) { 3046 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 3047 if (r) { 3048 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3049 adev->ip_blocks[i].version->funcs->name, r); 3050 return r; 3051 } 3052 } 3053 adev->ip_blocks[i].status.late_initialized = true; 3054 } 3055 3056 r = amdgpu_ras_late_init(adev); 3057 if (r) { 3058 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3059 return r; 3060 } 3061 3062 amdgpu_ras_set_error_query_ready(adev, true); 3063 3064 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3065 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3066 3067 amdgpu_device_fill_reset_magic(adev); 3068 3069 r = amdgpu_device_enable_mgpu_fan_boost(); 3070 if (r) 3071 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3072 3073 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ 3074 if (amdgpu_passthrough(adev) && 3075 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3076 adev->asic_type == CHIP_ALDEBARAN)) 3077 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3078 3079 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3080 mutex_lock(&mgpu_info.mutex); 3081 3082 /* 3083 * Reset device p-state to low as this was booted with high. 3084 * 3085 * This should be performed only after all devices from the same 3086 * hive get initialized. 3087 * 3088 * However, it's not known in advance how many devices are in the hive, 3089 * as they are counted one by one during device initialization. 3090 * 3091 * So, we wait for all XGMI interlinked devices to be initialized. 3092 * This may bring some delays as those devices may come from 3093 * different hives. But that should be OK.
3094 */ 3095 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3096 for (i = 0; i < mgpu_info.num_gpu; i++) { 3097 gpu_instance = &(mgpu_info.gpu_ins[i]); 3098 if (gpu_instance->adev->flags & AMD_IS_APU) 3099 continue; 3100 3101 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3102 AMDGPU_XGMI_PSTATE_MIN); 3103 if (r) { 3104 DRM_ERROR("pstate setting failed (%d).\n", r); 3105 break; 3106 } 3107 } 3108 } 3109 3110 mutex_unlock(&mgpu_info.mutex); 3111 } 3112 3113 return 0; 3114 } 3115 3116 /** 3117 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3118 * 3119 * @adev: amdgpu_device pointer 3120 * 3121 * For ASICs need to disable SMC first 3122 */ 3123 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3124 { 3125 int i, r; 3126 3127 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3128 return; 3129 3130 for (i = 0; i < adev->num_ip_blocks; i++) { 3131 if (!adev->ip_blocks[i].status.hw) 3132 continue; 3133 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3134 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3135 /* XXX handle errors */ 3136 if (r) { 3137 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3138 adev->ip_blocks[i].version->funcs->name, r); 3139 } 3140 adev->ip_blocks[i].status.hw = false; 3141 break; 3142 } 3143 } 3144 } 3145 3146 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3147 { 3148 int i, r; 3149 3150 for (i = 0; i < adev->num_ip_blocks; i++) { 3151 if (!adev->ip_blocks[i].version->funcs->early_fini) 3152 continue; 3153 3154 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 3155 if (r) { 3156 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3157 adev->ip_blocks[i].version->funcs->name, r); 3158 } 3159 } 3160 3161 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3162 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3163 3164 amdgpu_amdkfd_suspend(adev, false); 3165 3166 /* Workaroud for ASICs need to disable SMC first */ 3167 amdgpu_device_smu_fini_early(adev); 3168 3169 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3170 if (!adev->ip_blocks[i].status.hw) 3171 continue; 3172 3173 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3174 /* XXX handle errors */ 3175 if (r) { 3176 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3177 adev->ip_blocks[i].version->funcs->name, r); 3178 } 3179 3180 adev->ip_blocks[i].status.hw = false; 3181 } 3182 3183 if (amdgpu_sriov_vf(adev)) { 3184 if (amdgpu_virt_release_full_gpu(adev, false)) 3185 DRM_ERROR("failed to release exclusive mode on fini\n"); 3186 } 3187 3188 return 0; 3189 } 3190 3191 /** 3192 * amdgpu_device_ip_fini - run fini for hardware IPs 3193 * 3194 * @adev: amdgpu_device pointer 3195 * 3196 * Main teardown pass for hardware IPs. The list of all the hardware 3197 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3198 * are run. hw_fini tears down the hardware associated with each IP 3199 * and sw_fini tears down any software state associated with each IP. 3200 * Returns 0 on success, negative error code on failure. 
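 *
 * Teardown walks the IP list in the reverse of the order the blocks were
 * added, mirroring init, e.g.:
 *
 *   for (i = adev->num_ip_blocks - 1; i >= 0; i--)
 *           adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);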
3201 */ 3202 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3203 { 3204 int i, r; 3205 3206 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3207 amdgpu_virt_release_ras_err_handler_data(adev); 3208 3209 if (adev->gmc.xgmi.num_physical_nodes > 1) 3210 amdgpu_xgmi_remove_device(adev); 3211 3212 amdgpu_amdkfd_device_fini_sw(adev); 3213 3214 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3215 if (!adev->ip_blocks[i].status.sw) 3216 continue; 3217 3218 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3219 amdgpu_ucode_free_bo(adev); 3220 amdgpu_free_static_csa(&adev->virt.csa_obj); 3221 amdgpu_device_wb_fini(adev); 3222 amdgpu_device_mem_scratch_fini(adev); 3223 amdgpu_ib_pool_fini(adev); 3224 amdgpu_seq64_fini(adev); 3225 } 3226 3227 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 3228 /* XXX handle errors */ 3229 if (r) { 3230 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3231 adev->ip_blocks[i].version->funcs->name, r); 3232 } 3233 adev->ip_blocks[i].status.sw = false; 3234 adev->ip_blocks[i].status.valid = false; 3235 } 3236 3237 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3238 if (!adev->ip_blocks[i].status.late_initialized) 3239 continue; 3240 if (adev->ip_blocks[i].version->funcs->late_fini) 3241 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 3242 adev->ip_blocks[i].status.late_initialized = false; 3243 } 3244 3245 amdgpu_ras_fini(adev); 3246 3247 return 0; 3248 } 3249 3250 /** 3251 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3252 * 3253 * @work: work_struct. 3254 */ 3255 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3256 { 3257 struct amdgpu_device *adev = 3258 container_of(work, struct amdgpu_device, delayed_init_work.work); 3259 int r; 3260 3261 r = amdgpu_ib_ring_tests(adev); 3262 if (r) 3263 DRM_ERROR("ib ring test failed (%d).\n", r); 3264 } 3265 3266 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3267 { 3268 struct amdgpu_device *adev = 3269 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3270 3271 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3272 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3273 3274 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3275 adev->gfx.gfx_off_state = true; 3276 } 3277 3278 /** 3279 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3280 * 3281 * @adev: amdgpu_device pointer 3282 * 3283 * Main suspend function for hardware IPs. The list of all the hardware 3284 * IPs that make up the asic is walked, clockgating is disabled and the 3285 * suspend callbacks are run. suspend puts the hardware and software state 3286 * in each IP into a state suitable for suspend. 3287 * Returns 0 on success, negative error code on failure. 3288 */ 3289 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3290 { 3291 int i, r; 3292 3293 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3294 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3295 3296 /* 3297 * Per PMFW team's suggestion, driver needs to handle gfxoff 3298 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3299 * scenario. Add the missing df cstate disablement here. 
3300 */ 3301 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3302 dev_warn(adev->dev, "Failed to disallow df cstate"); 3303 3304 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3305 if (!adev->ip_blocks[i].status.valid) 3306 continue; 3307 3308 /* displays are handled separately */ 3309 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3310 continue; 3311 3312 /* XXX handle errors */ 3313 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3314 /* XXX handle errors */ 3315 if (r) { 3316 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3317 adev->ip_blocks[i].version->funcs->name, r); 3318 return r; 3319 } 3320 3321 adev->ip_blocks[i].status.hw = false; 3322 } 3323 3324 return 0; 3325 } 3326 3327 /** 3328 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3329 * 3330 * @adev: amdgpu_device pointer 3331 * 3332 * Main suspend function for hardware IPs. The list of all the hardware 3333 * IPs that make up the asic is walked, clockgating is disabled and the 3334 * suspend callbacks are run. suspend puts the hardware and software state 3335 * in each IP into a state suitable for suspend. 3336 * Returns 0 on success, negative error code on failure. 3337 */ 3338 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3339 { 3340 int i, r; 3341 3342 if (adev->in_s0ix) 3343 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3344 3345 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3346 if (!adev->ip_blocks[i].status.valid) 3347 continue; 3348 /* displays are handled in phase1 */ 3349 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3350 continue; 3351 /* PSP lost connection when err_event_athub occurs */ 3352 if (amdgpu_ras_intr_triggered() && 3353 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3354 adev->ip_blocks[i].status.hw = false; 3355 continue; 3356 } 3357 3358 /* skip unnecessary suspend if we do not initialize them yet */ 3359 if (adev->gmc.xgmi.pending_reset && 3360 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3361 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3362 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3363 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3364 adev->ip_blocks[i].status.hw = false; 3365 continue; 3366 } 3367 3368 /* skip suspend of gfx/mes and psp for S0ix 3369 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3370 * like at runtime. PSP is also part of the always on hardware 3371 * so no need to suspend it. 3372 */ 3373 if (adev->in_s0ix && 3374 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3375 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3376 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3377 continue; 3378 3379 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3380 if (adev->in_s0ix && 3381 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3382 IP_VERSION(5, 0, 0)) && 3383 (adev->ip_blocks[i].version->type == 3384 AMD_IP_BLOCK_TYPE_SDMA)) 3385 continue; 3386 3387 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3388 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3389 * from this location and RLC Autoload automatically also gets loaded 3390 * from here based on PMFW -> PSP message during re-init sequence. 3391 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3392 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3393 */ 3394 if (amdgpu_in_reset(adev) && 3395 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3396 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3397 continue; 3398 3399 /* XXX handle errors */ 3400 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3401 /* XXX handle errors */ 3402 if (r) { 3403 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3404 adev->ip_blocks[i].version->funcs->name, r); 3405 } 3406 adev->ip_blocks[i].status.hw = false; 3407 /* handle putting the SMC in the appropriate state */ 3408 if (!amdgpu_sriov_vf(adev)) { 3409 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3410 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3411 if (r) { 3412 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3413 adev->mp1_state, r); 3414 return r; 3415 } 3416 } 3417 } 3418 } 3419 3420 return 0; 3421 } 3422 3423 /** 3424 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3425 * 3426 * @adev: amdgpu_device pointer 3427 * 3428 * Main suspend function for hardware IPs. The list of all the hardware 3429 * IPs that make up the asic is walked, clockgating is disabled and the 3430 * suspend callbacks are run. suspend puts the hardware and software state 3431 * in each IP into a state suitable for suspend. 3432 * Returns 0 on success, negative error code on failure. 3433 */ 3434 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3435 { 3436 int r; 3437 3438 if (amdgpu_sriov_vf(adev)) { 3439 amdgpu_virt_fini_data_exchange(adev); 3440 amdgpu_virt_request_full_gpu(adev, false); 3441 } 3442 3443 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3444 3445 r = amdgpu_device_ip_suspend_phase1(adev); 3446 if (r) 3447 return r; 3448 r = amdgpu_device_ip_suspend_phase2(adev); 3449 3450 if (amdgpu_sriov_vf(adev)) 3451 amdgpu_virt_release_full_gpu(adev, false); 3452 3453 return r; 3454 } 3455 3456 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3457 { 3458 int i, r; 3459 3460 static enum amd_ip_block_type ip_order[] = { 3461 AMD_IP_BLOCK_TYPE_COMMON, 3462 AMD_IP_BLOCK_TYPE_GMC, 3463 AMD_IP_BLOCK_TYPE_PSP, 3464 AMD_IP_BLOCK_TYPE_IH, 3465 }; 3466 3467 for (i = 0; i < adev->num_ip_blocks; i++) { 3468 int j; 3469 struct amdgpu_ip_block *block; 3470 3471 block = &adev->ip_blocks[i]; 3472 block->status.hw = false; 3473 3474 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3475 3476 if (block->version->type != ip_order[j] || 3477 !block->status.valid) 3478 continue; 3479 3480 r = block->version->funcs->hw_init(adev); 3481 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3482 if (r) 3483 return r; 3484 block->status.hw = true; 3485 } 3486 } 3487 3488 return 0; 3489 } 3490 3491 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3492 { 3493 int i, r; 3494 3495 static enum amd_ip_block_type ip_order[] = { 3496 AMD_IP_BLOCK_TYPE_SMC, 3497 AMD_IP_BLOCK_TYPE_DCE, 3498 AMD_IP_BLOCK_TYPE_GFX, 3499 AMD_IP_BLOCK_TYPE_SDMA, 3500 AMD_IP_BLOCK_TYPE_MES, 3501 AMD_IP_BLOCK_TYPE_UVD, 3502 AMD_IP_BLOCK_TYPE_VCE, 3503 AMD_IP_BLOCK_TYPE_VCN, 3504 AMD_IP_BLOCK_TYPE_JPEG 3505 }; 3506 3507 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3508 int j; 3509 struct amdgpu_ip_block *block; 3510 3511 for (j = 0; j < adev->num_ip_blocks; j++) { 3512 block = &adev->ip_blocks[j]; 3513 3514 if (block->version->type != ip_order[i] || 3515 !block->status.valid || 3516 block->status.hw) 3517 continue; 3518 3519 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3520 r = block->version->funcs->resume(adev); 3521 else 
3522 r = block->version->funcs->hw_init(adev); 3523 3524 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3525 if (r) 3526 return r; 3527 block->status.hw = true; 3528 } 3529 } 3530 3531 return 0; 3532 } 3533 3534 /** 3535 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3536 * 3537 * @adev: amdgpu_device pointer 3538 * 3539 * First resume function for hardware IPs. The list of all the hardware 3540 * IPs that make up the asic is walked and the resume callbacks are run for 3541 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3542 * after a suspend and updates the software state as necessary. This 3543 * function is also used for restoring the GPU after a GPU reset. 3544 * Returns 0 on success, negative error code on failure. 3545 */ 3546 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3547 { 3548 int i, r; 3549 3550 for (i = 0; i < adev->num_ip_blocks; i++) { 3551 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3552 continue; 3553 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3554 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3555 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3556 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3557 3558 r = adev->ip_blocks[i].version->funcs->resume(adev); 3559 if (r) { 3560 DRM_ERROR("resume of IP block <%s> failed %d\n", 3561 adev->ip_blocks[i].version->funcs->name, r); 3562 return r; 3563 } 3564 adev->ip_blocks[i].status.hw = true; 3565 } 3566 } 3567 3568 return 0; 3569 } 3570 3571 /** 3572 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3573 * 3574 * @adev: amdgpu_device pointer 3575 * 3576 * First resume function for hardware IPs. The list of all the hardware 3577 * IPs that make up the asic is walked and the resume callbacks are run for 3578 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3579 * functional state after a suspend and updates the software state as 3580 * necessary. This function is also used for restoring the GPU after a GPU 3581 * reset. 3582 * Returns 0 on success, negative error code on failure. 3583 */ 3584 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3585 { 3586 int i, r; 3587 3588 for (i = 0; i < adev->num_ip_blocks; i++) { 3589 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3590 continue; 3591 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3592 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3593 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3594 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3595 continue; 3596 r = adev->ip_blocks[i].version->funcs->resume(adev); 3597 if (r) { 3598 DRM_ERROR("resume of IP block <%s> failed %d\n", 3599 adev->ip_blocks[i].version->funcs->name, r); 3600 return r; 3601 } 3602 adev->ip_blocks[i].status.hw = true; 3603 } 3604 3605 return 0; 3606 } 3607 3608 /** 3609 * amdgpu_device_ip_resume - run resume for hardware IPs 3610 * 3611 * @adev: amdgpu_device pointer 3612 * 3613 * Main resume function for hardware IPs. The hardware IPs 3614 * are split into two resume functions because they are 3615 * also used in recovering from a GPU reset and some additional 3616 * steps need to be take between them. In this case (S3/S4) they are 3617 * run sequentially. 3618 * Returns 0 on success, negative error code on failure. 
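 *
 * The overall sequence mirrors the body below:
 *
 *   amdgpu_device_ip_resume_phase1(adev);  /* COMMON, GMC, IH (PSP on VF) */
 *   amdgpu_device_fw_loading(adev);        /* PSP/SMU firmware in between */
 *   amdgpu_device_ip_resume_phase2(adev);  /* everything else             */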
3619 */ 3620 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3621 { 3622 int r; 3623 3624 r = amdgpu_device_ip_resume_phase1(adev); 3625 if (r) 3626 return r; 3627 3628 r = amdgpu_device_fw_loading(adev); 3629 if (r) 3630 return r; 3631 3632 r = amdgpu_device_ip_resume_phase2(adev); 3633 3634 if (adev->mman.buffer_funcs_ring->sched.ready) 3635 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3636 3637 return r; 3638 } 3639 3640 /** 3641 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3642 * 3643 * @adev: amdgpu_device pointer 3644 * 3645 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3646 */ 3647 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3648 { 3649 if (amdgpu_sriov_vf(adev)) { 3650 if (adev->is_atom_fw) { 3651 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3652 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3653 } else { 3654 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3655 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3656 } 3657 3658 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3659 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3660 } 3661 } 3662 3663 /** 3664 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3665 * 3666 * @asic_type: AMD asic type 3667 * 3668 * Check if there is DC (new modesetting infrastructre) support for an asic. 3669 * returns true if DC has support, false if not. 3670 */ 3671 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3672 { 3673 switch (asic_type) { 3674 #ifdef CONFIG_DRM_AMDGPU_SI 3675 case CHIP_HAINAN: 3676 #endif 3677 case CHIP_TOPAZ: 3678 /* chips with no display hardware */ 3679 return false; 3680 #if defined(CONFIG_DRM_AMD_DC) 3681 case CHIP_TAHITI: 3682 case CHIP_PITCAIRN: 3683 case CHIP_VERDE: 3684 case CHIP_OLAND: 3685 /* 3686 * We have systems in the wild with these ASICs that require 3687 * LVDS and VGA support which is not supported with DC. 3688 * 3689 * Fallback to the non-DC driver here by default so as not to 3690 * cause regressions. 3691 */ 3692 #if defined(CONFIG_DRM_AMD_DC_SI) 3693 return amdgpu_dc > 0; 3694 #else 3695 return false; 3696 #endif 3697 case CHIP_BONAIRE: 3698 case CHIP_KAVERI: 3699 case CHIP_KABINI: 3700 case CHIP_MULLINS: 3701 /* 3702 * We have systems in the wild with these ASICs that require 3703 * VGA support which is not supported with DC. 3704 * 3705 * Fallback to the non-DC driver here by default so as not to 3706 * cause regressions. 
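 *
 * (Users can still opt in on these parts by booting with amdgpu.dc=1,
 *  which is what the amdgpu_dc > 0 check below tests for.)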
3707 */ 3708 return amdgpu_dc > 0; 3709 default: 3710 return amdgpu_dc != 0; 3711 #else 3712 default: 3713 if (amdgpu_dc > 0) 3714 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3715 return false; 3716 #endif 3717 } 3718 } 3719 3720 /** 3721 * amdgpu_device_has_dc_support - check if dc is supported 3722 * 3723 * @adev: amdgpu_device pointer 3724 * 3725 * Returns true for supported, false for not supported 3726 */ 3727 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3728 { 3729 if (adev->enable_virtual_display || 3730 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3731 return false; 3732 3733 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3734 } 3735 3736 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3737 { 3738 struct amdgpu_device *adev = 3739 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3740 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3741 3742 /* It's a bug to not have a hive within this function */ 3743 if (WARN_ON(!hive)) 3744 return; 3745 3746 /* 3747 * Use task barrier to synchronize all xgmi reset works across the 3748 * hive. task_barrier_enter and task_barrier_exit will block 3749 * until all the threads running the xgmi reset works reach 3750 * those points. task_barrier_full will do both blocks. 3751 */ 3752 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3753 3754 task_barrier_enter(&hive->tb); 3755 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3756 3757 if (adev->asic_reset_res) 3758 goto fail; 3759 3760 task_barrier_exit(&hive->tb); 3761 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3762 3763 if (adev->asic_reset_res) 3764 goto fail; 3765 3766 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3767 } else { 3768 3769 task_barrier_full(&hive->tb); 3770 adev->asic_reset_res = amdgpu_asic_reset(adev); 3771 } 3772 3773 fail: 3774 if (adev->asic_reset_res) 3775 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3776 adev->asic_reset_res, adev_to_drm(adev)->unique); 3777 amdgpu_put_xgmi_hive(hive); 3778 } 3779 3780 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3781 { 3782 char *input = amdgpu_lockup_timeout; 3783 char *timeout_setting = NULL; 3784 int index = 0; 3785 long timeout; 3786 int ret = 0; 3787 3788 /* 3789 * By default timeout for non compute jobs is 10000 3790 * and 60000 for compute jobs. 3791 * In SR-IOV or passthrough mode, timeout for compute 3792 * jobs are 60000 by default. 3793 */ 3794 adev->gfx_timeout = msecs_to_jiffies(10000); 3795 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3796 if (amdgpu_sriov_vf(adev)) 3797 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3798 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3799 else 3800 adev->compute_timeout = msecs_to_jiffies(60000); 3801 3802 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3803 while ((timeout_setting = strsep(&input, ",")) && 3804 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3805 ret = kstrtol(timeout_setting, 0, &timeout); 3806 if (ret) 3807 return ret; 3808 3809 if (timeout == 0) { 3810 index++; 3811 continue; 3812 } else if (timeout < 0) { 3813 timeout = MAX_SCHEDULE_TIMEOUT; 3814 dev_warn(adev->dev, "lockup timeout disabled"); 3815 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3816 } else { 3817 timeout = msecs_to_jiffies(timeout); 3818 } 3819 3820 switch (index++) { 3821 case 0: 3822 adev->gfx_timeout = timeout; 3823 break; 3824 case 1: 3825 adev->compute_timeout = timeout; 3826 break; 3827 case 2: 3828 adev->sdma_timeout = timeout; 3829 break; 3830 case 3: 3831 adev->video_timeout = timeout; 3832 break; 3833 default: 3834 break; 3835 } 3836 } 3837 /* 3838 * There is only one value specified and 3839 * it should apply to all non-compute jobs. 3840 */ 3841 if (index == 1) { 3842 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3843 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3844 adev->compute_timeout = adev->gfx_timeout; 3845 } 3846 } 3847 3848 return ret; 3849 } 3850 3851 /** 3852 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3853 * 3854 * @adev: amdgpu_device pointer 3855 * 3856 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3857 */ 3858 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3859 { 3860 struct iommu_domain *domain; 3861 3862 domain = iommu_get_domain_for_dev(adev->dev); 3863 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3864 adev->ram_is_direct_mapped = true; 3865 } 3866 3867 static const struct attribute *amdgpu_dev_attributes[] = { 3868 &dev_attr_pcie_replay_count.attr, 3869 NULL 3870 }; 3871 3872 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3873 { 3874 if (amdgpu_mcbp == 1) 3875 adev->gfx.mcbp = true; 3876 else if (amdgpu_mcbp == 0) 3877 adev->gfx.mcbp = false; 3878 3879 if (amdgpu_sriov_vf(adev)) 3880 adev->gfx.mcbp = true; 3881 3882 if (adev->gfx.mcbp) 3883 DRM_INFO("MCBP is enabled\n"); 3884 } 3885 3886 /** 3887 * amdgpu_device_init - initialize the driver 3888 * 3889 * @adev: amdgpu_device pointer 3890 * @flags: driver flags 3891 * 3892 * Initializes the driver info and hw (all asics). 3893 * Returns 0 for success or an error on failure. 3894 * Called at driver startup. 
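 *
 * At a high level the body below proceeds roughly as follows: set up the
 * locks, work items and default (invalid) register accessors, map the MMIO
 * BAR, create the reset domain, run early IP init, post/reset the ASIC if
 * required, then run full IP init and register the sysfs interfaces and the
 * GPU instance.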
3895 */ 3896 int amdgpu_device_init(struct amdgpu_device *adev, 3897 uint32_t flags) 3898 { 3899 struct drm_device *ddev = adev_to_drm(adev); 3900 struct pci_dev *pdev = adev->pdev; 3901 int r, i; 3902 bool px = false; 3903 u32 max_MBps; 3904 int tmp; 3905 3906 adev->shutdown = false; 3907 adev->flags = flags; 3908 3909 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3910 adev->asic_type = amdgpu_force_asic_type; 3911 else 3912 adev->asic_type = flags & AMD_ASIC_MASK; 3913 3914 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3915 if (amdgpu_emu_mode == 1) 3916 adev->usec_timeout *= 10; 3917 adev->gmc.gart_size = 512 * 1024 * 1024; 3918 adev->accel_working = false; 3919 adev->num_rings = 0; 3920 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3921 adev->mman.buffer_funcs = NULL; 3922 adev->mman.buffer_funcs_ring = NULL; 3923 adev->vm_manager.vm_pte_funcs = NULL; 3924 adev->vm_manager.vm_pte_num_scheds = 0; 3925 adev->gmc.gmc_funcs = NULL; 3926 adev->harvest_ip_mask = 0x0; 3927 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3928 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3929 3930 adev->smc_rreg = &amdgpu_invalid_rreg; 3931 adev->smc_wreg = &amdgpu_invalid_wreg; 3932 adev->pcie_rreg = &amdgpu_invalid_rreg; 3933 adev->pcie_wreg = &amdgpu_invalid_wreg; 3934 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3935 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3936 adev->pciep_rreg = &amdgpu_invalid_rreg; 3937 adev->pciep_wreg = &amdgpu_invalid_wreg; 3938 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3939 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3940 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 3941 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 3942 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3943 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3944 adev->didt_rreg = &amdgpu_invalid_rreg; 3945 adev->didt_wreg = &amdgpu_invalid_wreg; 3946 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3947 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3948 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3949 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3950 3951 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3952 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3953 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3954 3955 /* mutex initialization are all done here so we 3956 * can recall function without having locking issues 3957 */ 3958 mutex_init(&adev->firmware.mutex); 3959 mutex_init(&adev->pm.mutex); 3960 mutex_init(&adev->gfx.gpu_clock_mutex); 3961 mutex_init(&adev->srbm_mutex); 3962 mutex_init(&adev->gfx.pipe_reserve_mutex); 3963 mutex_init(&adev->gfx.gfx_off_mutex); 3964 mutex_init(&adev->gfx.partition_mutex); 3965 mutex_init(&adev->grbm_idx_mutex); 3966 mutex_init(&adev->mn_lock); 3967 mutex_init(&adev->virt.vf_errors.lock); 3968 hash_init(adev->mn_hash); 3969 mutex_init(&adev->psp.mutex); 3970 mutex_init(&adev->notifier_lock); 3971 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3972 mutex_init(&adev->benchmark_mutex); 3973 3974 amdgpu_device_init_apu_flags(adev); 3975 3976 r = amdgpu_device_check_arguments(adev); 3977 if (r) 3978 return r; 3979 3980 spin_lock_init(&adev->mmio_idx_lock); 3981 spin_lock_init(&adev->smc_idx_lock); 3982 spin_lock_init(&adev->pcie_idx_lock); 3983 spin_lock_init(&adev->uvd_ctx_idx_lock); 3984 spin_lock_init(&adev->didt_idx_lock); 3985 spin_lock_init(&adev->gc_cac_idx_lock); 3986 spin_lock_init(&adev->se_cac_idx_lock); 
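	/*
	 * The *_idx_lock spinlocks initialized above and below guard indirect
	 * register accesses, where an index register is written and the data
	 * register is then read or written; the lock keeps each index/data
	 * pair atomic. Rough sketch (the register names are placeholders, the
	 * real offsets come from per-ASIC callbacks):
	 *
	 *	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	 *	writel(reg_index, pcie_index_mmio);
	 *	val = readl(pcie_data_mmio);
	 *	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	 */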
	spin_lock_init(&adev->audio_endpt_idx_lock);
	spin_lock_init(&adev->mm_stats.lock);

	INIT_LIST_HEAD(&adev->shadow_list);
	mutex_init(&adev->shadow_list_lock);

	INIT_LIST_HEAD(&adev->reset_list);

	INIT_LIST_HEAD(&adev->ras_list);

	INIT_LIST_HEAD(&adev->pm.od_kobj_list);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

	adev->gfx.gfx_off_req_count = 1;
	adev->gfx.gfx_off_residency = 0;
	adev->gfx.gfx_off_entrycount = 0;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds (the ratelimited printk interval) + 1 second
	 * (waiting for the throttling interrupt) = 60 seconds.
	 */
	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);

	/* Registers mapping */
	/* TODO: block userspace mapping of io register */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (!adev->rmmio)
		return -ENOMEM;

	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
	DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);

	/*
	 * The reset domain needs to be present early, before the XGMI hive is
	 * discovered (if any) and initialized, so that the reset semaphore and
	 * the in_gpu reset flag can be used early during init and before
	 * calling RREG32.
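	 *
	 * The reset domain is, loosely, a wrapper around a reset workqueue and
	 * a recovery semaphore shared by every device that has to be reset
	 * together (a single device here; a whole hive for XGMI), and it is
	 * what amdgpu_device_lock_reset_domain() takes during GPU recovery.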
	 */
	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
	if (!adev->reset_domain)
		return -ENOMEM;

	/* detect hw virtualization here */
	amdgpu_detect_virtualization(adev);

	amdgpu_device_get_pcie_info(adev);

	r = amdgpu_device_get_job_timeout_settings(adev);
	if (r) {
		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
		return r;
	}

	amdgpu_device_set_mcbp(adev);

	/* early init functions */
	r = amdgpu_device_ip_early_init(adev);
	if (r)
		return r;

	/* Get rid of things like offb */
	r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
	if (r)
		return r;

	/* Enable TMZ based on IP_VERSION */
	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_sriov_vf(adev) &&
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
		/* VF MMIO access (except mailbox range) from CPU
		 * will be blocked during sriov runtime
		 */
		adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;

	amdgpu_gmc_noretry_set(adev);
	/* Need to get xgmi info early to decide the reset behavior */
	if (adev->gmc.xgmi.supported) {
		r = adev->gfxhub.funcs->get_xgmi_info(adev);
		if (r)
			return r;
	}

	/* enable PCIE atomic ops */
	if (amdgpu_sriov_vf(adev)) {
		if (adev->virt.fw_reserve.p_pf2vf)
			adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
				adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
				(PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	/* APUs with gfx9 and newer don't rely on PCIe atomics; their internal
	 * path natively supports atomics, so set have_atomics_support to true.
	 */
	} else if ((adev->flags & AMD_IS_APU) &&
		   (amdgpu_ip_version(adev, GC_HWIP, 0) >
		    IP_VERSION(9, 0, 0))) {
		adev->have_atomics_support = true;
	} else {
		adev->have_atomics_support =
			!pci_enable_atomic_ops_to_root(adev->pdev,
						       PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
						       PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	}

	if (!adev->have_atomics_support)
		dev_info(adev->dev, "PCIE atomic ops is not supported\n");

	/* doorbell bar mapping and doorbell index init */
	amdgpu_doorbell_init(adev);

	if (amdgpu_emu_mode == 1) {
		/* post the asic on emulation mode */
		emu_soc_asic_init(adev);
		goto fence_driver_init;
	}

	amdgpu_reset_init(adev);

	/* detect if we are with an SRIOV vbios */
	if (adev->bios)
		amdgpu_device_detect_sriov_bios(adev);

	/* check if we need to reset the asic
	 * E.g., driver was not cleanly unloaded previously, etc.
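	 * Other cases handled below: a hive-wide reset that is still pending
	 * on an XGMI setup, and parts without display hardware that are reset
	 * through the PSP instead of the default ASIC reset path.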
4132 */ 4133 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4134 if (adev->gmc.xgmi.num_physical_nodes) { 4135 dev_info(adev->dev, "Pending hive reset.\n"); 4136 adev->gmc.xgmi.pending_reset = true; 4137 /* Only need to init necessary block for SMU to handle the reset */ 4138 for (i = 0; i < adev->num_ip_blocks; i++) { 4139 if (!adev->ip_blocks[i].status.valid) 4140 continue; 4141 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4142 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4143 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4144 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 4145 DRM_DEBUG("IP %s disabled for hw_init.\n", 4146 adev->ip_blocks[i].version->funcs->name); 4147 adev->ip_blocks[i].status.hw = true; 4148 } 4149 } 4150 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4151 !amdgpu_device_has_display_hardware(adev)) { 4152 r = psp_gpu_reset(adev); 4153 } else { 4154 tmp = amdgpu_reset_method; 4155 /* It should do a default reset when loading or reloading the driver, 4156 * regardless of the module parameter reset_method. 4157 */ 4158 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4159 r = amdgpu_asic_reset(adev); 4160 amdgpu_reset_method = tmp; 4161 } 4162 4163 if (r) { 4164 dev_err(adev->dev, "asic reset on init failed\n"); 4165 goto failed; 4166 } 4167 } 4168 4169 /* Post card if necessary */ 4170 if (amdgpu_device_need_post(adev)) { 4171 if (!adev->bios) { 4172 dev_err(adev->dev, "no vBIOS found\n"); 4173 r = -EINVAL; 4174 goto failed; 4175 } 4176 DRM_INFO("GPU posting now...\n"); 4177 r = amdgpu_device_asic_init(adev); 4178 if (r) { 4179 dev_err(adev->dev, "gpu post error!\n"); 4180 goto failed; 4181 } 4182 } 4183 4184 if (adev->bios) { 4185 if (adev->is_atom_fw) { 4186 /* Initialize clocks */ 4187 r = amdgpu_atomfirmware_get_clock_info(adev); 4188 if (r) { 4189 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4190 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4191 goto failed; 4192 } 4193 } else { 4194 /* Initialize clocks */ 4195 r = amdgpu_atombios_get_clock_info(adev); 4196 if (r) { 4197 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4198 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4199 goto failed; 4200 } 4201 /* init i2c buses */ 4202 if (!amdgpu_device_has_dc_support(adev)) 4203 amdgpu_atombios_i2c_init(adev); 4204 } 4205 } 4206 4207 fence_driver_init: 4208 /* Fence driver */ 4209 r = amdgpu_fence_driver_sw_init(adev); 4210 if (r) { 4211 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4212 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4213 goto failed; 4214 } 4215 4216 /* init the mode config */ 4217 drm_mode_config_init(adev_to_drm(adev)); 4218 4219 r = amdgpu_device_ip_init(adev); 4220 if (r) { 4221 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4222 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4223 goto release_ras_con; 4224 } 4225 4226 amdgpu_fence_driver_hw_init(adev); 4227 4228 dev_info(adev->dev, 4229 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4230 adev->gfx.config.max_shader_engines, 4231 adev->gfx.config.max_sh_per_se, 4232 adev->gfx.config.max_cu_per_sh, 4233 adev->gfx.cu_info.number); 4234 4235 adev->accel_working = true; 4236 4237 amdgpu_vm_check_compute_bug(adev); 4238 4239 /* Initialize the buffer migration limit. 
 */
	if (amdgpu_moverate >= 0)
		max_MBps = amdgpu_moverate;
	else
		max_MBps = 8; /* Allow 8 MB/s. */
	/* Get a log2 for easy divisions. */
	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));

	/*
	 * Register the gpu instance before amdgpu_device_enable_mgpu_fan_boost;
	 * otherwise the mgpu fan boost feature will be skipped because the
	 * gpu instance count would come up short.
	 */
	amdgpu_register_gpu_instance(adev);

	/* enable clockgating, etc. after ib tests, etc. since some blocks require
	 * explicit gating rather than handling it automatically.
	 */
	if (!adev->gmc.xgmi.pending_reset) {
		r = amdgpu_device_ip_late_init(adev);
		if (r) {
			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
			goto release_ras_con;
		}
		/* must succeed. */
		amdgpu_ras_resume(adev);
		queue_delayed_work(system_wq, &adev->delayed_init_work,
				   msecs_to_jiffies(AMDGPU_RESUME_MS));
	}

	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_release_full_gpu(adev, true);
		flush_delayed_work(&adev->delayed_init_work);
	}

	/*
	 * Register these sysfs interfaces after `late_init`, since some of the
	 * operations performed in `late_init` can affect how the sysfs
	 * interfaces are created.
	 */
	r = amdgpu_atombios_sysfs_init(adev);
	if (r)
		drm_err(&adev->ddev,
			"registering atombios sysfs failed (%d).\n", r);

	r = amdgpu_pm_sysfs_init(adev);
	if (r)
		DRM_ERROR("registering pm sysfs failed (%d).\n", r);

	r = amdgpu_ucode_sysfs_init(adev);
	if (r) {
		adev->ucode_sysfs_en = false;
		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
	} else
		adev->ucode_sysfs_en = true;

	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
	if (r)
		dev_err(adev->dev, "Could not create amdgpu device attr\n");

	r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
	if (r)
		dev_err(adev->dev,
			"Could not create amdgpu board attributes\n");

	amdgpu_fru_sysfs_init(adev);
	amdgpu_reg_state_sysfs_init(adev);

	if (IS_ENABLED(CONFIG_PERF_EVENTS))
		r = amdgpu_pmu_init(adev);
	if (r)
		dev_err(adev->dev, "amdgpu_pmu_init failed\n");

	/* Have stored pci confspace at hand for restore in sudden PCI error */
	if (amdgpu_device_cache_pci_state(adev->pdev))
		pci_restore_state(pdev);

	/* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
	/* this will fail for cards that aren't VGA class devices, just
	 * ignore it
	 */
	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
		vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);

	px = amdgpu_device_supports_px(ddev);

	if (px || (!dev_is_removable(&adev->pdev->dev) &&
		   apple_gmux_detect(NULL, NULL)))
		vga_switcheroo_register_client(adev->pdev,
					       &amdgpu_switcheroo_ops, px);

	if (px)
		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);

	if (adev->gmc.xgmi.pending_reset)
		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
				   msecs_to_jiffies(AMDGPU_RESUME_MS));

	amdgpu_device_check_iommu_direct_map(adev);

	return 0;

release_ras_con:
	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, true);

	/* failed in exclusive mode due to timeout */
	if
(amdgpu_sriov_vf(adev) && 4348 !amdgpu_sriov_runtime(adev) && 4349 amdgpu_virt_mmio_blocked(adev) && 4350 !amdgpu_virt_wait_reset(adev)) { 4351 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4352 /* Don't send request since VF is inactive. */ 4353 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4354 adev->virt.ops = NULL; 4355 r = -EAGAIN; 4356 } 4357 amdgpu_release_ras_context(adev); 4358 4359 failed: 4360 amdgpu_vf_error_trans_all(adev); 4361 4362 return r; 4363 } 4364 4365 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4366 { 4367 4368 /* Clear all CPU mappings pointing to this device */ 4369 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4370 4371 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4372 amdgpu_doorbell_fini(adev); 4373 4374 iounmap(adev->rmmio); 4375 adev->rmmio = NULL; 4376 if (adev->mman.aper_base_kaddr) 4377 iounmap(adev->mman.aper_base_kaddr); 4378 adev->mman.aper_base_kaddr = NULL; 4379 4380 /* Memory manager related */ 4381 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4382 arch_phys_wc_del(adev->gmc.vram_mtrr); 4383 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4384 } 4385 } 4386 4387 /** 4388 * amdgpu_device_fini_hw - tear down the driver 4389 * 4390 * @adev: amdgpu_device pointer 4391 * 4392 * Tear down the driver info (all asics). 4393 * Called at driver shutdown. 4394 */ 4395 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4396 { 4397 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4398 flush_delayed_work(&adev->delayed_init_work); 4399 adev->shutdown = true; 4400 4401 /* make sure IB test finished before entering exclusive mode 4402 * to avoid preemption on IB test 4403 */ 4404 if (amdgpu_sriov_vf(adev)) { 4405 amdgpu_virt_request_full_gpu(adev, false); 4406 amdgpu_virt_fini_data_exchange(adev); 4407 } 4408 4409 /* disable all interrupts */ 4410 amdgpu_irq_disable_all(adev); 4411 if (adev->mode_info.mode_config_initialized) { 4412 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4413 drm_helper_force_disable_all(adev_to_drm(adev)); 4414 else 4415 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4416 } 4417 amdgpu_fence_driver_hw_fini(adev); 4418 4419 if (adev->mman.initialized) 4420 drain_workqueue(adev->mman.bdev.wq); 4421 4422 if (adev->pm.sysfs_initialized) 4423 amdgpu_pm_sysfs_fini(adev); 4424 if (adev->ucode_sysfs_en) 4425 amdgpu_ucode_sysfs_fini(adev); 4426 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4427 amdgpu_fru_sysfs_fini(adev); 4428 4429 amdgpu_reg_state_sysfs_fini(adev); 4430 4431 /* disable ras feature must before hw fini */ 4432 amdgpu_ras_pre_fini(adev); 4433 4434 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4435 4436 amdgpu_device_ip_fini_early(adev); 4437 4438 amdgpu_irq_fini_hw(adev); 4439 4440 if (adev->mman.initialized) 4441 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4442 4443 amdgpu_gart_dummy_page_fini(adev); 4444 4445 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4446 amdgpu_device_unmap_mmio(adev); 4447 4448 } 4449 4450 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4451 { 4452 int idx; 4453 bool px; 4454 4455 amdgpu_fence_driver_sw_fini(adev); 4456 amdgpu_device_ip_fini(adev); 4457 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4458 adev->accel_working = false; 4459 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4460 4461 amdgpu_reset_fini(adev); 4462 4463 /* free i2c buses */ 4464 if (!amdgpu_device_has_dc_support(adev)) 4465 amdgpu_i2c_fini(adev); 4466 4467 if (amdgpu_emu_mode 
!= 1) 4468 amdgpu_atombios_fini(adev); 4469 4470 kfree(adev->bios); 4471 adev->bios = NULL; 4472 4473 kfree(adev->fru_info); 4474 adev->fru_info = NULL; 4475 4476 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4477 4478 if (px || (!dev_is_removable(&adev->pdev->dev) && 4479 apple_gmux_detect(NULL, NULL))) 4480 vga_switcheroo_unregister_client(adev->pdev); 4481 4482 if (px) 4483 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4484 4485 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4486 vga_client_unregister(adev->pdev); 4487 4488 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4489 4490 iounmap(adev->rmmio); 4491 adev->rmmio = NULL; 4492 amdgpu_doorbell_fini(adev); 4493 drm_dev_exit(idx); 4494 } 4495 4496 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4497 amdgpu_pmu_fini(adev); 4498 if (adev->mman.discovery_bin) 4499 amdgpu_discovery_fini(adev); 4500 4501 amdgpu_reset_put_reset_domain(adev->reset_domain); 4502 adev->reset_domain = NULL; 4503 4504 kfree(adev->pci_state); 4505 4506 } 4507 4508 /** 4509 * amdgpu_device_evict_resources - evict device resources 4510 * @adev: amdgpu device object 4511 * 4512 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4513 * of the vram memory type. Mainly used for evicting device resources 4514 * at suspend time. 4515 * 4516 */ 4517 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4518 { 4519 int ret; 4520 4521 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4522 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4523 return 0; 4524 4525 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4526 if (ret) 4527 DRM_WARN("evicting device resources failed\n"); 4528 return ret; 4529 } 4530 4531 /* 4532 * Suspend & resume. 4533 */ 4534 /** 4535 * amdgpu_device_prepare - prepare for device suspend 4536 * 4537 * @dev: drm dev pointer 4538 * 4539 * Prepare to put the hw in the suspend state (all asics). 4540 * Returns 0 for success or an error on failure. 4541 * Called at driver suspend. 4542 */ 4543 int amdgpu_device_prepare(struct drm_device *dev) 4544 { 4545 struct amdgpu_device *adev = drm_to_adev(dev); 4546 int i, r; 4547 4548 amdgpu_choose_low_power_state(adev); 4549 4550 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4551 return 0; 4552 4553 /* Evict the majority of BOs before starting suspend sequence */ 4554 r = amdgpu_device_evict_resources(adev); 4555 if (r) 4556 goto unprepare; 4557 4558 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4559 4560 for (i = 0; i < adev->num_ip_blocks; i++) { 4561 if (!adev->ip_blocks[i].status.valid) 4562 continue; 4563 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4564 continue; 4565 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev); 4566 if (r) 4567 goto unprepare; 4568 } 4569 4570 return 0; 4571 4572 unprepare: 4573 adev->in_s0ix = adev->in_s3 = false; 4574 4575 return r; 4576 } 4577 4578 /** 4579 * amdgpu_device_suspend - initiate device suspend 4580 * 4581 * @dev: drm dev pointer 4582 * @fbcon : notify the fbdev of suspend 4583 * 4584 * Puts the hw in the suspend state (all asics). 4585 * Returns 0 for success or an error on failure. 4586 * Called at driver suspend. 
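 *
 * This is normally reached from the driver's dev_pm_ops (and from runtime
 * PM), after amdgpu_device_prepare() has already evicted most VRAM buffer
 * objects.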
4587 */ 4588 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4589 { 4590 struct amdgpu_device *adev = drm_to_adev(dev); 4591 int r = 0; 4592 4593 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4594 return 0; 4595 4596 adev->in_suspend = true; 4597 4598 if (amdgpu_sriov_vf(adev)) { 4599 amdgpu_virt_fini_data_exchange(adev); 4600 r = amdgpu_virt_request_full_gpu(adev, false); 4601 if (r) 4602 return r; 4603 } 4604 4605 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4606 DRM_WARN("smart shift update failed\n"); 4607 4608 if (fbcon) 4609 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4610 4611 cancel_delayed_work_sync(&adev->delayed_init_work); 4612 4613 amdgpu_ras_suspend(adev); 4614 4615 amdgpu_device_ip_suspend_phase1(adev); 4616 4617 if (!adev->in_s0ix) 4618 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4619 4620 r = amdgpu_device_evict_resources(adev); 4621 if (r) 4622 return r; 4623 4624 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4625 4626 amdgpu_fence_driver_hw_fini(adev); 4627 4628 amdgpu_device_ip_suspend_phase2(adev); 4629 4630 if (amdgpu_sriov_vf(adev)) 4631 amdgpu_virt_release_full_gpu(adev, false); 4632 4633 r = amdgpu_dpm_notify_rlc_state(adev, false); 4634 if (r) 4635 return r; 4636 4637 return 0; 4638 } 4639 4640 /** 4641 * amdgpu_device_resume - initiate device resume 4642 * 4643 * @dev: drm dev pointer 4644 * @fbcon : notify the fbdev of resume 4645 * 4646 * Bring the hw back to operating state (all asics). 4647 * Returns 0 for success or an error on failure. 4648 * Called at driver resume. 4649 */ 4650 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4651 { 4652 struct amdgpu_device *adev = drm_to_adev(dev); 4653 int r = 0; 4654 4655 if (amdgpu_sriov_vf(adev)) { 4656 r = amdgpu_virt_request_full_gpu(adev, true); 4657 if (r) 4658 return r; 4659 } 4660 4661 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4662 return 0; 4663 4664 if (adev->in_s0ix) 4665 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4666 4667 /* post card */ 4668 if (amdgpu_device_need_post(adev)) { 4669 r = amdgpu_device_asic_init(adev); 4670 if (r) 4671 dev_err(adev->dev, "amdgpu asic init failed\n"); 4672 } 4673 4674 r = amdgpu_device_ip_resume(adev); 4675 4676 if (r) { 4677 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4678 goto exit; 4679 } 4680 amdgpu_fence_driver_hw_init(adev); 4681 4682 if (!adev->in_s0ix) { 4683 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4684 if (r) 4685 goto exit; 4686 } 4687 4688 r = amdgpu_device_ip_late_init(adev); 4689 if (r) 4690 goto exit; 4691 4692 queue_delayed_work(system_wq, &adev->delayed_init_work, 4693 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4694 exit: 4695 if (amdgpu_sriov_vf(adev)) { 4696 amdgpu_virt_init_data_exchange(adev); 4697 amdgpu_virt_release_full_gpu(adev, true); 4698 } 4699 4700 if (r) 4701 return r; 4702 4703 /* Make sure IB tests flushed */ 4704 flush_delayed_work(&adev->delayed_init_work); 4705 4706 if (fbcon) 4707 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4708 4709 amdgpu_ras_resume(adev); 4710 4711 if (adev->mode_info.num_crtc) { 4712 /* 4713 * Most of the connector probing functions try to acquire runtime pm 4714 * refs to ensure that the GPU is powered on when connector polling is 4715 * performed. Since we're calling this from a runtime PM callback, 4716 * trying to acquire rpm refs will cause us to deadlock. 
4717 * 4718 * Since we're guaranteed to be holding the rpm lock, it's safe to 4719 * temporarily disable the rpm helpers so this doesn't deadlock us. 4720 */ 4721 #ifdef CONFIG_PM 4722 dev->dev->power.disable_depth++; 4723 #endif 4724 if (!adev->dc_enabled) 4725 drm_helper_hpd_irq_event(dev); 4726 else 4727 drm_kms_helper_hotplug_event(dev); 4728 #ifdef CONFIG_PM 4729 dev->dev->power.disable_depth--; 4730 #endif 4731 } 4732 adev->in_suspend = false; 4733 4734 if (adev->enable_mes) 4735 amdgpu_mes_self_test(adev); 4736 4737 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4738 DRM_WARN("smart shift update failed\n"); 4739 4740 return 0; 4741 } 4742 4743 /** 4744 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4745 * 4746 * @adev: amdgpu_device pointer 4747 * 4748 * The list of all the hardware IPs that make up the asic is walked and 4749 * the check_soft_reset callbacks are run. check_soft_reset determines 4750 * if the asic is still hung or not. 4751 * Returns true if any of the IPs are still in a hung state, false if not. 4752 */ 4753 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4754 { 4755 int i; 4756 bool asic_hang = false; 4757 4758 if (amdgpu_sriov_vf(adev)) 4759 return true; 4760 4761 if (amdgpu_asic_need_full_reset(adev)) 4762 return true; 4763 4764 for (i = 0; i < adev->num_ip_blocks; i++) { 4765 if (!adev->ip_blocks[i].status.valid) 4766 continue; 4767 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4768 adev->ip_blocks[i].status.hang = 4769 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4770 if (adev->ip_blocks[i].status.hang) { 4771 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4772 asic_hang = true; 4773 } 4774 } 4775 return asic_hang; 4776 } 4777 4778 /** 4779 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4780 * 4781 * @adev: amdgpu_device pointer 4782 * 4783 * The list of all the hardware IPs that make up the asic is walked and the 4784 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4785 * handles any IP specific hardware or software state changes that are 4786 * necessary for a soft reset to succeed. 4787 * Returns 0 on success, negative error code on failure. 4788 */ 4789 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4790 { 4791 int i, r = 0; 4792 4793 for (i = 0; i < adev->num_ip_blocks; i++) { 4794 if (!adev->ip_blocks[i].status.valid) 4795 continue; 4796 if (adev->ip_blocks[i].status.hang && 4797 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4798 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4799 if (r) 4800 return r; 4801 } 4802 } 4803 4804 return 0; 4805 } 4806 4807 /** 4808 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4809 * 4810 * @adev: amdgpu_device pointer 4811 * 4812 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4813 * reset is necessary to recover. 4814 * Returns true if a full asic reset is required, false if not. 
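 *
 * In practice a full reset is requested whenever one of the GMC, SMC, ACP,
 * DCE or PSP blocks reports itself as hung, since those blocks have no
 * usable soft reset path.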
4815 */ 4816 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4817 { 4818 int i; 4819 4820 if (amdgpu_asic_need_full_reset(adev)) 4821 return true; 4822 4823 for (i = 0; i < adev->num_ip_blocks; i++) { 4824 if (!adev->ip_blocks[i].status.valid) 4825 continue; 4826 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4827 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4828 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4829 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4830 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4831 if (adev->ip_blocks[i].status.hang) { 4832 dev_info(adev->dev, "Some block need full reset!\n"); 4833 return true; 4834 } 4835 } 4836 } 4837 return false; 4838 } 4839 4840 /** 4841 * amdgpu_device_ip_soft_reset - do a soft reset 4842 * 4843 * @adev: amdgpu_device pointer 4844 * 4845 * The list of all the hardware IPs that make up the asic is walked and the 4846 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4847 * IP specific hardware or software state changes that are necessary to soft 4848 * reset the IP. 4849 * Returns 0 on success, negative error code on failure. 4850 */ 4851 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4852 { 4853 int i, r = 0; 4854 4855 for (i = 0; i < adev->num_ip_blocks; i++) { 4856 if (!adev->ip_blocks[i].status.valid) 4857 continue; 4858 if (adev->ip_blocks[i].status.hang && 4859 adev->ip_blocks[i].version->funcs->soft_reset) { 4860 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4861 if (r) 4862 return r; 4863 } 4864 } 4865 4866 return 0; 4867 } 4868 4869 /** 4870 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4871 * 4872 * @adev: amdgpu_device pointer 4873 * 4874 * The list of all the hardware IPs that make up the asic is walked and the 4875 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4876 * handles any IP specific hardware or software state changes that are 4877 * necessary after the IP has been soft reset. 4878 * Returns 0 on success, negative error code on failure. 4879 */ 4880 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4881 { 4882 int i, r = 0; 4883 4884 for (i = 0; i < adev->num_ip_blocks; i++) { 4885 if (!adev->ip_blocks[i].status.valid) 4886 continue; 4887 if (adev->ip_blocks[i].status.hang && 4888 adev->ip_blocks[i].version->funcs->post_soft_reset) 4889 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4890 if (r) 4891 return r; 4892 } 4893 4894 return 0; 4895 } 4896 4897 /** 4898 * amdgpu_device_recover_vram - Recover some VRAM contents 4899 * 4900 * @adev: amdgpu_device pointer 4901 * 4902 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4903 * restore things like GPUVM page tables after a GPU reset where 4904 * the contents of VRAM might be lost. 4905 * 4906 * Returns: 4907 * 0 on success, negative error code on failure. 
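 *
 * Only buffers on adev->shadow_list with a valid GTT-resident shadow are
 * restored; compute VMs and APUs do not allocate shadows and are skipped.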
4908 */ 4909 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4910 { 4911 struct dma_fence *fence = NULL, *next = NULL; 4912 struct amdgpu_bo *shadow; 4913 struct amdgpu_bo_vm *vmbo; 4914 long r = 1, tmo; 4915 4916 if (amdgpu_sriov_runtime(adev)) 4917 tmo = msecs_to_jiffies(8000); 4918 else 4919 tmo = msecs_to_jiffies(100); 4920 4921 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4922 mutex_lock(&adev->shadow_list_lock); 4923 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4924 /* If vm is compute context or adev is APU, shadow will be NULL */ 4925 if (!vmbo->shadow) 4926 continue; 4927 shadow = vmbo->shadow; 4928 4929 /* No need to recover an evicted BO */ 4930 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4931 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4932 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4933 continue; 4934 4935 r = amdgpu_bo_restore_shadow(shadow, &next); 4936 if (r) 4937 break; 4938 4939 if (fence) { 4940 tmo = dma_fence_wait_timeout(fence, false, tmo); 4941 dma_fence_put(fence); 4942 fence = next; 4943 if (tmo == 0) { 4944 r = -ETIMEDOUT; 4945 break; 4946 } else if (tmo < 0) { 4947 r = tmo; 4948 break; 4949 } 4950 } else { 4951 fence = next; 4952 } 4953 } 4954 mutex_unlock(&adev->shadow_list_lock); 4955 4956 if (fence) 4957 tmo = dma_fence_wait_timeout(fence, false, tmo); 4958 dma_fence_put(fence); 4959 4960 if (r < 0 || tmo <= 0) { 4961 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4962 return -EIO; 4963 } 4964 4965 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4966 return 0; 4967 } 4968 4969 4970 /** 4971 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4972 * 4973 * @adev: amdgpu_device pointer 4974 * @from_hypervisor: request from hypervisor 4975 * 4976 * do VF FLR and reinitialize Asic 4977 * return 0 means succeeded otherwise failed 4978 */ 4979 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4980 bool from_hypervisor) 4981 { 4982 int r; 4983 struct amdgpu_hive_info *hive = NULL; 4984 int retry_limit = 0; 4985 4986 retry: 4987 amdgpu_amdkfd_pre_reset(adev); 4988 4989 amdgpu_device_stop_pending_resets(adev); 4990 4991 if (from_hypervisor) 4992 r = amdgpu_virt_request_full_gpu(adev, true); 4993 else 4994 r = amdgpu_virt_reset_gpu(adev); 4995 if (r) 4996 return r; 4997 amdgpu_ras_set_fed(adev, false); 4998 amdgpu_irq_gpu_reset_resume_helper(adev); 4999 5000 /* some sw clean up VF needs to do before recover */ 5001 amdgpu_virt_post_reset(adev); 5002 5003 /* Resume IP prior to SMC */ 5004 r = amdgpu_device_ip_reinit_early_sriov(adev); 5005 if (r) 5006 goto error; 5007 5008 amdgpu_virt_init_data_exchange(adev); 5009 5010 r = amdgpu_device_fw_loading(adev); 5011 if (r) 5012 return r; 5013 5014 /* now we are okay to resume SMC/CP/SDMA */ 5015 r = amdgpu_device_ip_reinit_late_sriov(adev); 5016 if (r) 5017 goto error; 5018 5019 hive = amdgpu_get_xgmi_hive(adev); 5020 /* Update PSP FW topology after reset */ 5021 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5022 r = amdgpu_xgmi_update_topology(hive, adev); 5023 5024 if (hive) 5025 amdgpu_put_xgmi_hive(hive); 5026 5027 if (!r) { 5028 r = amdgpu_ib_ring_tests(adev); 5029 5030 amdgpu_amdkfd_post_reset(adev); 5031 } 5032 5033 error: 5034 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 5035 amdgpu_inc_vram_lost(adev); 5036 r = amdgpu_device_recover_vram(adev); 5037 } 5038 amdgpu_virt_release_full_gpu(adev, true); 5039 5040 if 
(AMDGPU_RETRY_SRIOV_RESET(r)) { 5041 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 5042 retry_limit++; 5043 goto retry; 5044 } else 5045 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 5046 } 5047 5048 return r; 5049 } 5050 5051 /** 5052 * amdgpu_device_has_job_running - check if there is any job in mirror list 5053 * 5054 * @adev: amdgpu_device pointer 5055 * 5056 * check if there is any job in mirror list 5057 */ 5058 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5059 { 5060 int i; 5061 struct drm_sched_job *job; 5062 5063 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5064 struct amdgpu_ring *ring = adev->rings[i]; 5065 5066 if (!amdgpu_ring_sched_ready(ring)) 5067 continue; 5068 5069 spin_lock(&ring->sched.job_list_lock); 5070 job = list_first_entry_or_null(&ring->sched.pending_list, 5071 struct drm_sched_job, list); 5072 spin_unlock(&ring->sched.job_list_lock); 5073 if (job) 5074 return true; 5075 } 5076 return false; 5077 } 5078 5079 /** 5080 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5081 * 5082 * @adev: amdgpu_device pointer 5083 * 5084 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5085 * a hung GPU. 5086 */ 5087 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5088 { 5089 5090 if (amdgpu_gpu_recovery == 0) 5091 goto disabled; 5092 5093 /* Skip soft reset check in fatal error mode */ 5094 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5095 return true; 5096 5097 if (amdgpu_sriov_vf(adev)) 5098 return true; 5099 5100 if (amdgpu_gpu_recovery == -1) { 5101 switch (adev->asic_type) { 5102 #ifdef CONFIG_DRM_AMDGPU_SI 5103 case CHIP_VERDE: 5104 case CHIP_TAHITI: 5105 case CHIP_PITCAIRN: 5106 case CHIP_OLAND: 5107 case CHIP_HAINAN: 5108 #endif 5109 #ifdef CONFIG_DRM_AMDGPU_CIK 5110 case CHIP_KAVERI: 5111 case CHIP_KABINI: 5112 case CHIP_MULLINS: 5113 #endif 5114 case CHIP_CARRIZO: 5115 case CHIP_STONEY: 5116 case CHIP_CYAN_SKILLFISH: 5117 goto disabled; 5118 default: 5119 break; 5120 } 5121 } 5122 5123 return true; 5124 5125 disabled: 5126 dev_info(adev->dev, "GPU recovery disabled.\n"); 5127 return false; 5128 } 5129 5130 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5131 { 5132 u32 i; 5133 int ret = 0; 5134 5135 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5136 5137 dev_info(adev->dev, "GPU mode1 reset\n"); 5138 5139 /* disable BM */ 5140 pci_clear_master(adev->pdev); 5141 5142 amdgpu_device_cache_pci_state(adev->pdev); 5143 5144 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5145 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5146 ret = amdgpu_dpm_mode1_reset(adev); 5147 } else { 5148 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5149 ret = psp_gpu_reset(adev); 5150 } 5151 5152 if (ret) 5153 goto mode1_reset_failed; 5154 5155 amdgpu_device_load_pci_state(adev->pdev); 5156 ret = amdgpu_psp_wait_for_bootloader(adev); 5157 if (ret) 5158 goto mode1_reset_failed; 5159 5160 /* wait for asic to come out of reset */ 5161 for (i = 0; i < adev->usec_timeout; i++) { 5162 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5163 5164 if (memsize != 0xffffffff) 5165 break; 5166 udelay(1); 5167 } 5168 5169 if (i >= adev->usec_timeout) { 5170 ret = -ETIMEDOUT; 5171 goto mode1_reset_failed; 5172 } 5173 5174 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5175 5176 return 0; 5177 5178 mode1_reset_failed: 5179 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5180 return ret; 5181 } 5182 5183 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5184 struct 
amdgpu_reset_context *reset_context) 5185 { 5186 int i, r = 0; 5187 struct amdgpu_job *job = NULL; 5188 bool need_full_reset = 5189 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5190 5191 if (reset_context->reset_req_dev == adev) 5192 job = reset_context->job; 5193 5194 if (amdgpu_sriov_vf(adev)) { 5195 /* stop the data exchange thread */ 5196 amdgpu_virt_fini_data_exchange(adev); 5197 } 5198 5199 amdgpu_fence_driver_isr_toggle(adev, true); 5200 5201 /* block all schedulers and reset given job's ring */ 5202 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5203 struct amdgpu_ring *ring = adev->rings[i]; 5204 5205 if (!amdgpu_ring_sched_ready(ring)) 5206 continue; 5207 5208 /* Clear job fence from fence drv to avoid force_completion 5209 * leave NULL and vm flush fence in fence drv 5210 */ 5211 amdgpu_fence_driver_clear_job_fences(ring); 5212 5213 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5214 amdgpu_fence_driver_force_completion(ring); 5215 } 5216 5217 amdgpu_fence_driver_isr_toggle(adev, false); 5218 5219 if (job && job->vm) 5220 drm_sched_increase_karma(&job->base); 5221 5222 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5223 /* If reset handler not implemented, continue; otherwise return */ 5224 if (r == -EOPNOTSUPP) 5225 r = 0; 5226 else 5227 return r; 5228 5229 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5230 if (!amdgpu_sriov_vf(adev)) { 5231 5232 if (!need_full_reset) 5233 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5234 5235 if (!need_full_reset && amdgpu_gpu_recovery && 5236 amdgpu_device_ip_check_soft_reset(adev)) { 5237 amdgpu_device_ip_pre_soft_reset(adev); 5238 r = amdgpu_device_ip_soft_reset(adev); 5239 amdgpu_device_ip_post_soft_reset(adev); 5240 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5241 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5242 need_full_reset = true; 5243 } 5244 } 5245 5246 if (need_full_reset) 5247 r = amdgpu_device_ip_suspend(adev); 5248 if (need_full_reset) 5249 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5250 else 5251 clear_bit(AMDGPU_NEED_FULL_RESET, 5252 &reset_context->flags); 5253 } 5254 5255 return r; 5256 } 5257 5258 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 5259 { 5260 int i; 5261 5262 lockdep_assert_held(&adev->reset_domain->sem); 5263 5264 for (i = 0; i < adev->reset_info.num_regs; i++) { 5265 adev->reset_info.reset_dump_reg_value[i] = 5266 RREG32(adev->reset_info.reset_dump_reg_list[i]); 5267 5268 trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i], 5269 adev->reset_info.reset_dump_reg_value[i]); 5270 } 5271 5272 return 0; 5273 } 5274 5275 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5276 struct amdgpu_reset_context *reset_context) 5277 { 5278 struct amdgpu_device *tmp_adev = NULL; 5279 bool need_full_reset, skip_hw_reset, vram_lost = false; 5280 int r = 0; 5281 5282 /* Try reset handler method first */ 5283 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5284 reset_list); 5285 amdgpu_reset_reg_dumps(tmp_adev); 5286 5287 reset_context->reset_device_list = device_list_handle; 5288 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5289 /* If reset handler not implemented, continue; otherwise return */ 5290 if (r == -EOPNOTSUPP) 5291 r = 0; 5292 else 5293 return r; 5294 5295 /* Reset handler not implemented, use the default method */ 5296 need_full_reset = 5297 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 
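	/*
	 * AMDGPU_NEED_FULL_RESET requests a whole-ASIC reset followed by
	 * re-posting and re-initializing the IP blocks below, while
	 * AMDGPU_SKIP_HW_RESET leaves the hardware untouched and only runs
	 * the software side of recovery.
	 */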
5298 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5299 5300 /* 5301 * ASIC reset has to be done on all XGMI hive nodes ASAP 5302 * to allow proper links negotiation in FW (within 1 sec) 5303 */ 5304 if (!skip_hw_reset && need_full_reset) { 5305 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5306 /* For XGMI run all resets in parallel to speed up the process */ 5307 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5308 tmp_adev->gmc.xgmi.pending_reset = false; 5309 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5310 r = -EALREADY; 5311 } else 5312 r = amdgpu_asic_reset(tmp_adev); 5313 5314 if (r) { 5315 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5316 r, adev_to_drm(tmp_adev)->unique); 5317 goto out; 5318 } 5319 } 5320 5321 /* For XGMI wait for all resets to complete before proceed */ 5322 if (!r) { 5323 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5324 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5325 flush_work(&tmp_adev->xgmi_reset_work); 5326 r = tmp_adev->asic_reset_res; 5327 if (r) 5328 break; 5329 } 5330 } 5331 } 5332 } 5333 5334 if (!r && amdgpu_ras_intr_triggered()) { 5335 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5336 amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB); 5337 } 5338 5339 amdgpu_ras_intr_cleared(); 5340 } 5341 5342 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5343 if (need_full_reset) { 5344 /* post card */ 5345 amdgpu_ras_set_fed(tmp_adev, false); 5346 r = amdgpu_device_asic_init(tmp_adev); 5347 if (r) { 5348 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5349 } else { 5350 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5351 5352 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5353 if (r) 5354 goto out; 5355 5356 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5357 5358 amdgpu_coredump(tmp_adev, vram_lost, reset_context); 5359 5360 if (vram_lost) { 5361 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5362 amdgpu_inc_vram_lost(tmp_adev); 5363 } 5364 5365 r = amdgpu_device_fw_loading(tmp_adev); 5366 if (r) 5367 return r; 5368 5369 r = amdgpu_xcp_restore_partition_mode( 5370 tmp_adev->xcp_mgr); 5371 if (r) 5372 goto out; 5373 5374 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5375 if (r) 5376 goto out; 5377 5378 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5379 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5380 5381 if (vram_lost) 5382 amdgpu_device_fill_reset_magic(tmp_adev); 5383 5384 /* 5385 * Add this ASIC as tracked as reset was already 5386 * complete successfully. 5387 */ 5388 amdgpu_register_gpu_instance(tmp_adev); 5389 5390 if (!reset_context->hive && 5391 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5392 amdgpu_xgmi_add_device(tmp_adev); 5393 5394 r = amdgpu_device_ip_late_init(tmp_adev); 5395 if (r) 5396 goto out; 5397 5398 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5399 5400 /* 5401 * The GPU enters bad state once faulty pages 5402 * by ECC has reached the threshold, and ras 5403 * recovery is scheduled next. So add one check 5404 * here to break recovery if it indeed exceeds 5405 * bad page threshold, and remind user to 5406 * retire this GPU or setting one bigger 5407 * bad_page_threshold value to fix this once 5408 * probing driver again. 5409 */ 5410 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5411 /* must succeed. 
*/ 5412 amdgpu_ras_resume(tmp_adev); 5413 } else { 5414 r = -EINVAL; 5415 goto out; 5416 } 5417 5418 /* Update PSP FW topology after reset */ 5419 if (reset_context->hive && 5420 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5421 r = amdgpu_xgmi_update_topology( 5422 reset_context->hive, tmp_adev); 5423 } 5424 } 5425 5426 out: 5427 if (!r) { 5428 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5429 r = amdgpu_ib_ring_tests(tmp_adev); 5430 if (r) { 5431 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5432 need_full_reset = true; 5433 r = -EAGAIN; 5434 goto end; 5435 } 5436 } 5437 5438 if (!r) 5439 r = amdgpu_device_recover_vram(tmp_adev); 5440 else 5441 tmp_adev->asic_reset_res = r; 5442 } 5443 5444 end: 5445 if (need_full_reset) 5446 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5447 else 5448 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5449 return r; 5450 } 5451 5452 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5453 { 5454 5455 switch (amdgpu_asic_reset_method(adev)) { 5456 case AMD_RESET_METHOD_MODE1: 5457 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5458 break; 5459 case AMD_RESET_METHOD_MODE2: 5460 adev->mp1_state = PP_MP1_STATE_RESET; 5461 break; 5462 default: 5463 adev->mp1_state = PP_MP1_STATE_NONE; 5464 break; 5465 } 5466 } 5467 5468 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5469 { 5470 amdgpu_vf_error_trans_all(adev); 5471 adev->mp1_state = PP_MP1_STATE_NONE; 5472 } 5473 5474 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5475 { 5476 struct pci_dev *p = NULL; 5477 5478 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5479 adev->pdev->bus->number, 1); 5480 if (p) { 5481 pm_runtime_enable(&(p->dev)); 5482 pm_runtime_resume(&(p->dev)); 5483 } 5484 5485 pci_dev_put(p); 5486 } 5487 5488 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5489 { 5490 enum amd_reset_method reset_method; 5491 struct pci_dev *p = NULL; 5492 u64 expires; 5493 5494 /* 5495 * For now, only BACO and mode1 reset are confirmed 5496 * to suffer the audio issue without proper suspended. 5497 */ 5498 reset_method = amdgpu_asic_reset_method(adev); 5499 if ((reset_method != AMD_RESET_METHOD_BACO) && 5500 (reset_method != AMD_RESET_METHOD_MODE1)) 5501 return -EINVAL; 5502 5503 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5504 adev->pdev->bus->number, 1); 5505 if (!p) 5506 return -ENODEV; 5507 5508 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5509 if (!expires) 5510 /* 5511 * If we cannot get the audio device autosuspend delay, 5512 * a fixed 4S interval will be used. Considering 3S is 5513 * the audio controller default autosuspend delay setting. 5514 * 4S used here is guaranteed to cover that. 5515 */ 5516 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5517 5518 while (!pm_runtime_status_suspended(&(p->dev))) { 5519 if (!pm_runtime_suspend(&(p->dev))) 5520 break; 5521 5522 if (expires < ktime_get_mono_fast_ns()) { 5523 dev_warn(adev->dev, "failed to suspend display audio\n"); 5524 pci_dev_put(p); 5525 /* TODO: abort the succeeding gpu reset? 
*/ 5526 return -ETIMEDOUT; 5527 } 5528 } 5529 5530 pm_runtime_disable(&(p->dev)); 5531 5532 pci_dev_put(p); 5533 return 0; 5534 } 5535 5536 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5537 { 5538 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5539 5540 #if defined(CONFIG_DEBUG_FS) 5541 if (!amdgpu_sriov_vf(adev)) 5542 cancel_work(&adev->reset_work); 5543 #endif 5544 5545 if (adev->kfd.dev) 5546 cancel_work(&adev->kfd.reset_work); 5547 5548 if (amdgpu_sriov_vf(adev)) 5549 cancel_work(&adev->virt.flr_work); 5550 5551 if (con && adev->ras_enabled) 5552 cancel_work(&con->recovery_work); 5553 5554 } 5555 5556 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5557 { 5558 struct amdgpu_device *tmp_adev; 5559 int ret = 0; 5560 u32 status; 5561 5562 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5563 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 5564 if (PCI_POSSIBLE_ERROR(status)) { 5565 dev_err(tmp_adev->dev, "device lost from bus!"); 5566 ret = -ENODEV; 5567 } 5568 } 5569 5570 return ret; 5571 } 5572 5573 /** 5574 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5575 * 5576 * @adev: amdgpu_device pointer 5577 * @job: which job trigger hang 5578 * @reset_context: amdgpu reset context pointer 5579 * 5580 * Attempt to reset the GPU if it has hung (all asics). 5581 * Attempt to do soft-reset or full-reset and reinitialize Asic 5582 * Returns 0 for success or an error on failure. 5583 */ 5584 5585 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5586 struct amdgpu_job *job, 5587 struct amdgpu_reset_context *reset_context) 5588 { 5589 struct list_head device_list, *device_list_handle = NULL; 5590 bool job_signaled = false; 5591 struct amdgpu_hive_info *hive = NULL; 5592 struct amdgpu_device *tmp_adev = NULL; 5593 int i, r = 0; 5594 bool need_emergency_restart = false; 5595 bool audio_suspended = false; 5596 5597 /* 5598 * Special case: RAS triggered and full reset isn't supported 5599 */ 5600 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5601 5602 /* 5603 * Flush RAM to disk so that after reboot 5604 * the user can read log and see why the system rebooted. 5605 */ 5606 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5607 amdgpu_ras_get_context(adev)->reboot) { 5608 DRM_WARN("Emergency reboot."); 5609 5610 ksys_sync_helper(); 5611 emergency_restart(); 5612 } 5613 5614 dev_info(adev->dev, "GPU %s begin!\n", 5615 need_emergency_restart ? "jobs stop":"reset"); 5616 5617 if (!amdgpu_sriov_vf(adev)) 5618 hive = amdgpu_get_xgmi_hive(adev); 5619 if (hive) 5620 mutex_lock(&hive->hive_lock); 5621 5622 reset_context->job = job; 5623 reset_context->hive = hive; 5624 /* 5625 * Build list of devices to reset. 5626 * In case we are in XGMI hive mode, resort the device list 5627 * to put adev in the 1st position. 
5628 */ 5629 INIT_LIST_HEAD(&device_list); 5630 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5631 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5632 list_add_tail(&tmp_adev->reset_list, &device_list); 5633 if (adev->shutdown) 5634 tmp_adev->shutdown = true; 5635 } 5636 if (!list_is_first(&adev->reset_list, &device_list)) 5637 list_rotate_to_front(&adev->reset_list, &device_list); 5638 device_list_handle = &device_list; 5639 } else { 5640 list_add_tail(&adev->reset_list, &device_list); 5641 device_list_handle = &device_list; 5642 } 5643 5644 if (!amdgpu_sriov_vf(adev)) { 5645 r = amdgpu_device_health_check(device_list_handle); 5646 if (r) 5647 goto end_reset; 5648 } 5649 5650 /* We need to lock reset domain only once both for XGMI and single device */ 5651 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5652 reset_list); 5653 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5654 5655 /* block all schedulers and reset given job's ring */ 5656 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5657 5658 amdgpu_device_set_mp1_state(tmp_adev); 5659 5660 /* 5661 * Try to put the audio codec into suspend state 5662 * before gpu reset started. 5663 * 5664 * Due to the power domain of the graphics device 5665 * is shared with AZ power domain. Without this, 5666 * we may change the audio hardware from behind 5667 * the audio driver's back. That will trigger 5668 * some audio codec errors. 5669 */ 5670 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5671 audio_suspended = true; 5672 5673 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5674 5675 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5676 5677 if (!amdgpu_sriov_vf(tmp_adev)) 5678 amdgpu_amdkfd_pre_reset(tmp_adev); 5679 5680 /* 5681 * Mark these ASICs to be reseted as untracked first 5682 * And add them back after reset completed 5683 */ 5684 amdgpu_unregister_gpu_instance(tmp_adev); 5685 5686 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5687 5688 /* disable ras on ALL IPs */ 5689 if (!need_emergency_restart && 5690 amdgpu_device_ip_need_full_reset(tmp_adev)) 5691 amdgpu_ras_suspend(tmp_adev); 5692 5693 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5694 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5695 5696 if (!amdgpu_ring_sched_ready(ring)) 5697 continue; 5698 5699 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5700 5701 if (need_emergency_restart) 5702 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5703 } 5704 atomic_inc(&tmp_adev->gpu_reset_counter); 5705 } 5706 5707 if (need_emergency_restart) 5708 goto skip_sched_resume; 5709 5710 /* 5711 * Must check guilty signal here since after this point all old 5712 * HW fences are force signaled. 5713 * 5714 * job->base holds a reference to parent fence 5715 */ 5716 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5717 job_signaled = true; 5718 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5719 goto skip_hw_reset; 5720 } 5721 5722 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5723 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5724 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5725 /*TODO Should we stop ?*/ 5726 if (r) { 5727 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5728 r, adev_to_drm(tmp_adev)->unique); 5729 tmp_adev->asic_reset_res = r; 5730 } 5731 5732 if (!amdgpu_sriov_vf(tmp_adev)) 5733 /* 5734 * Drop all pending non scheduler resets. 
			 * scheduler resets were already dropped during
			 * drm_sched_stop().
			 */
			amdgpu_device_stop_pending_resets(tmp_adev);
	}

	/* Actual ASIC resets if needed. */
	/* Host driver will handle XGMI hive reset for SRIOV */
	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_reset_sriov(adev, job ? false : true);
		if (r)
			adev->asic_reset_res = r;

		/* Aldebaran and gfx_11_0_3 support RAS in SRIOV, so RAS needs to be resumed during reset */
		if (amdgpu_ip_version(adev, GC_HWIP, 0) ==
			    IP_VERSION(9, 4, 2) ||
		    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
		    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
			amdgpu_ras_resume(adev);
	} else {
		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
		if (r == -EAGAIN)
			goto retry;
	}

skip_hw_reset:

	/* Post ASIC reset for all devs. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!amdgpu_ring_sched_ready(ring))
				continue;

			drm_sched_start(&ring->sched, true);
		}

		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));

		if (tmp_adev->asic_reset_res)
			r = tmp_adev->asic_reset_res;

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/* bad news, how do we tell it to userspace? */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
				DRM_WARN("smart shift update failed\n");
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/* kfd_post_reset will do nothing if the kfd device is not initialized,
		 * so bring up kfd here if it wasn't initialized before.
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);

		amdgpu_device_unset_mp1_state(tmp_adev);

		amdgpu_ras_set_error_query_ready(tmp_adev, true);
	}

	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

end_reset:
	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);

	atomic_set(&adev->reset_domain->reset_res, r);
	return r;
}

/**
 * amdgpu_device_partner_bandwidth - find the bandwidth of the appropriate partner
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * first physical partner to an AMD dGPU.
 * This will exclude any virtual switches and links.
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	while ((parent = pci_upstream_bridge(parent))) {
		/* skip upstream/downstream switches internal to the dGPU */
		if (parent->vendor == PCI_VENDOR_ID_ATI)
			continue;
		*speed = pcie_get_speed_cap(parent);
		*width = pcie_get_width_cap(parent);
		break;
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIe capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIe config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
					&platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5936 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5937 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5938 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5939 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5940 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5941 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5942 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5943 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5944 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5945 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5946 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5947 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5948 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5949 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5950 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5951 else 5952 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5953 5954 } 5955 } 5956 if (adev->pm.pcie_mlw_mask == 0) { 5957 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5958 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5959 } else { 5960 switch (platform_link_width) { 5961 case PCIE_LNK_X32: 5962 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5963 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5964 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5965 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5966 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5967 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5968 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5969 break; 5970 case PCIE_LNK_X16: 5971 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5972 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5973 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5974 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5975 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5976 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5977 break; 5978 case PCIE_LNK_X12: 5979 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5980 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5981 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5982 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5983 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5984 break; 5985 case PCIE_LNK_X8: 5986 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5987 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5988 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5989 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5990 break; 5991 case PCIE_LNK_X4: 5992 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5993 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5994 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5995 break; 5996 case PCIE_LNK_X2: 5997 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5998 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5999 break; 6000 case PCIE_LNK_X1: 6001 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6002 break; 6003 default: 6004 break; 6005 } 6006 } 6007 } 6008 } 6009 6010 /** 6011 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6012 * 6013 * @adev: amdgpu_device pointer 6014 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6015 * 6016 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6017 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6018 * @peer_adev. 6019 */ 6020 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6021 struct amdgpu_device *peer_adev) 6022 { 6023 #ifdef CONFIG_HSA_AMD_P2P 6024 uint64_t address_mask = peer_adev->dev->dma_mask ? 
6025 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6026 resource_size_t aper_limit = 6027 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6028 bool p2p_access = 6029 !adev->gmc.xgmi.connected_to_cpu && 6030 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6031 6032 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 6033 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 6034 !(adev->gmc.aper_base & address_mask || 6035 aper_limit & address_mask)); 6036 #else 6037 return false; 6038 #endif 6039 } 6040 6041 int amdgpu_device_baco_enter(struct drm_device *dev) 6042 { 6043 struct amdgpu_device *adev = drm_to_adev(dev); 6044 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6045 6046 if (!amdgpu_device_supports_baco(dev)) 6047 return -ENOTSUPP; 6048 6049 if (ras && adev->ras_enabled && 6050 adev->nbio.funcs->enable_doorbell_interrupt) 6051 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6052 6053 return amdgpu_dpm_baco_enter(adev); 6054 } 6055 6056 int amdgpu_device_baco_exit(struct drm_device *dev) 6057 { 6058 struct amdgpu_device *adev = drm_to_adev(dev); 6059 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6060 int ret = 0; 6061 6062 if (!amdgpu_device_supports_baco(dev)) 6063 return -ENOTSUPP; 6064 6065 ret = amdgpu_dpm_baco_exit(adev); 6066 if (ret) 6067 return ret; 6068 6069 if (ras && adev->ras_enabled && 6070 adev->nbio.funcs->enable_doorbell_interrupt) 6071 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6072 6073 if (amdgpu_passthrough(adev) && 6074 adev->nbio.funcs->clear_doorbell_interrupt) 6075 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6076 6077 return 0; 6078 } 6079 6080 /** 6081 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6082 * @pdev: PCI device struct 6083 * @state: PCI channel state 6084 * 6085 * Description: Called when a PCI error is detected. 6086 * 6087 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
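 *
 * Illustrative sketch (not taken from this file): together with the other
 * amdgpu_pci_* callbacks below, this is the kind of hook that is wired into
 * a &struct pci_error_handlers instance, e.g.:
 *
 *	static const struct pci_error_handlers example_pci_err_handlers = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};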
6088 */ 6089 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6090 { 6091 struct drm_device *dev = pci_get_drvdata(pdev); 6092 struct amdgpu_device *adev = drm_to_adev(dev); 6093 int i; 6094 6095 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 6096 6097 if (adev->gmc.xgmi.num_physical_nodes > 1) { 6098 DRM_WARN("No support for XGMI hive yet..."); 6099 return PCI_ERS_RESULT_DISCONNECT; 6100 } 6101 6102 adev->pci_channel_state = state; 6103 6104 switch (state) { 6105 case pci_channel_io_normal: 6106 return PCI_ERS_RESULT_CAN_RECOVER; 6107 /* Fatal error, prepare for slot reset */ 6108 case pci_channel_io_frozen: 6109 /* 6110 * Locking adev->reset_domain->sem will prevent any external access 6111 * to GPU during PCI error recovery 6112 */ 6113 amdgpu_device_lock_reset_domain(adev->reset_domain); 6114 amdgpu_device_set_mp1_state(adev); 6115 6116 /* 6117 * Block any work scheduling as we do for regular GPU reset 6118 * for the duration of the recovery 6119 */ 6120 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6121 struct amdgpu_ring *ring = adev->rings[i]; 6122 6123 if (!amdgpu_ring_sched_ready(ring)) 6124 continue; 6125 6126 drm_sched_stop(&ring->sched, NULL); 6127 } 6128 atomic_inc(&adev->gpu_reset_counter); 6129 return PCI_ERS_RESULT_NEED_RESET; 6130 case pci_channel_io_perm_failure: 6131 /* Permanent error, prepare for device removal */ 6132 return PCI_ERS_RESULT_DISCONNECT; 6133 } 6134 6135 return PCI_ERS_RESULT_NEED_RESET; 6136 } 6137 6138 /** 6139 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6140 * @pdev: pointer to PCI device 6141 */ 6142 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6143 { 6144 6145 DRM_INFO("PCI error: mmio enabled callback!!\n"); 6146 6147 /* TODO - dump whatever for debugging purposes */ 6148 6149 /* This called only if amdgpu_pci_error_detected returns 6150 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6151 * works, no need to reset slot. 6152 */ 6153 6154 return PCI_ERS_RESULT_RECOVERED; 6155 } 6156 6157 /** 6158 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6159 * @pdev: PCI device struct 6160 * 6161 * Description: This routine is called by the pci error recovery 6162 * code after the PCI slot has been reset, just before we 6163 * should resume normal operations. 
6164 */ 6165 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6166 { 6167 struct drm_device *dev = pci_get_drvdata(pdev); 6168 struct amdgpu_device *adev = drm_to_adev(dev); 6169 int r, i; 6170 struct amdgpu_reset_context reset_context; 6171 u32 memsize; 6172 struct list_head device_list; 6173 struct amdgpu_hive_info *hive; 6174 int hive_ras_recovery = 0; 6175 struct amdgpu_ras *ras; 6176 6177 /* PCI error slot reset should be skipped During RAS recovery */ 6178 hive = amdgpu_get_xgmi_hive(adev); 6179 if (hive) { 6180 hive_ras_recovery = atomic_read(&hive->ras_recovery); 6181 amdgpu_put_xgmi_hive(hive); 6182 } 6183 ras = amdgpu_ras_get_context(adev); 6184 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) && 6185 ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) 6186 return PCI_ERS_RESULT_RECOVERED; 6187 6188 DRM_INFO("PCI error: slot reset callback!!\n"); 6189 6190 memset(&reset_context, 0, sizeof(reset_context)); 6191 6192 INIT_LIST_HEAD(&device_list); 6193 list_add_tail(&adev->reset_list, &device_list); 6194 6195 /* wait for asic to come out of reset */ 6196 msleep(500); 6197 6198 /* Restore PCI confspace */ 6199 amdgpu_device_load_pci_state(pdev); 6200 6201 /* confirm ASIC came out of reset */ 6202 for (i = 0; i < adev->usec_timeout; i++) { 6203 memsize = amdgpu_asic_get_config_memsize(adev); 6204 6205 if (memsize != 0xffffffff) 6206 break; 6207 udelay(1); 6208 } 6209 if (memsize == 0xffffffff) { 6210 r = -ETIME; 6211 goto out; 6212 } 6213 6214 reset_context.method = AMD_RESET_METHOD_NONE; 6215 reset_context.reset_req_dev = adev; 6216 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6217 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6218 6219 adev->no_hw_access = true; 6220 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 6221 adev->no_hw_access = false; 6222 if (r) 6223 goto out; 6224 6225 r = amdgpu_do_asic_reset(&device_list, &reset_context); 6226 6227 out: 6228 if (!r) { 6229 if (amdgpu_device_cache_pci_state(adev->pdev)) 6230 pci_restore_state(adev->pdev); 6231 6232 DRM_INFO("PCIe error recovery succeeded\n"); 6233 } else { 6234 DRM_ERROR("PCIe error recovery failed, err:%d", r); 6235 amdgpu_device_unset_mp1_state(adev); 6236 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6237 } 6238 6239 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6240 } 6241 6242 /** 6243 * amdgpu_pci_resume() - resume normal ops after PCI reset 6244 * @pdev: pointer to PCI device 6245 * 6246 * Called when the error recovery driver tells us that its 6247 * OK to resume normal operation. 
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!amdgpu_ring_sched_ready(ring))
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. This helps to preserve the error context when an error occurs.
 * Compared to a simple hang, the system will at least stay stable enough
 * for SSH access. It should then be trivial to inspect the hardware state
 * and see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs,
 * etc), clears all CPU mappings to the device and disallows remappings
 * through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5.
amdgpu_device_unmap_mmio() clears all MMIO mappings 6371 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 6372 * flush any in flight DMA operations 6373 */ 6374 void amdgpu_device_halt(struct amdgpu_device *adev) 6375 { 6376 struct pci_dev *pdev = adev->pdev; 6377 struct drm_device *ddev = adev_to_drm(adev); 6378 6379 amdgpu_xcp_dev_unplug(adev); 6380 drm_dev_unplug(ddev); 6381 6382 amdgpu_irq_disable_all(adev); 6383 6384 amdgpu_fence_driver_hw_fini(adev); 6385 6386 adev->no_hw_access = true; 6387 6388 amdgpu_device_unmap_mmio(adev); 6389 6390 pci_disable_device(pdev); 6391 pci_wait_for_pending_transaction(pdev); 6392 } 6393 6394 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 6395 u32 reg) 6396 { 6397 unsigned long flags, address, data; 6398 u32 r; 6399 6400 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6401 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6402 6403 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6404 WREG32(address, reg * 4); 6405 (void)RREG32(address); 6406 r = RREG32(data); 6407 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6408 return r; 6409 } 6410 6411 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 6412 u32 reg, u32 v) 6413 { 6414 unsigned long flags, address, data; 6415 6416 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6417 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6418 6419 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6420 WREG32(address, reg * 4); 6421 (void)RREG32(address); 6422 WREG32(data, v); 6423 (void)RREG32(data); 6424 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6425 } 6426 6427 /** 6428 * amdgpu_device_switch_gang - switch to a new gang 6429 * @adev: amdgpu_device pointer 6430 * @gang: the gang to switch to 6431 * 6432 * Try to switch to a new gang. 6433 * Returns: NULL if we switched to the new gang or a reference to the current 6434 * gang leader. 
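 *
 * Illustrative caller sketch (hypothetical, not lifted from a real call
 * site): a caller may wait on the returned fence before retrying the
 * switch, or treat it as a scheduling dependency instead.
 *
 *	old = amdgpu_device_switch_gang(adev, gang);
 *	if (old) {
 *		dma_fence_wait(old, false);
 *		dma_fence_put(old);
 *	}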
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
		      uint32_t inst, uint32_t reg_addr, char reg_name[],
		      uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
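
/*
 * Usage sketch for amdgpu_device_wait_on_rreg() (illustrative only; the
 * instance, register offset and bit values below are hypothetical):
 *
 *	if (amdgpu_device_wait_on_rreg(adev, 0, some_status_reg_offset,
 *				       "SOME_STATUS_REG", 0x1, 0x1))
 *		dev_err(adev->dev, "status register never became ready\n");
 */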