/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		   amdgpu_device_get_pcie_replay_count, NULL);
"HAWAII", 112 "MULLINS", 113 "TOPAZ", 114 "TONGA", 115 "FIJI", 116 "CARRIZO", 117 "STONEY", 118 "POLARIS10", 119 "POLARIS11", 120 "POLARIS12", 121 "VEGAM", 122 "VEGA10", 123 "VEGA12", 124 "VEGA20", 125 "RAVEN", 126 "ARCTURUS", 127 "RENOIR", 128 "ALDEBARAN", 129 "NAVI10", 130 "CYAN_SKILLFISH", 131 "NAVI14", 132 "NAVI12", 133 "SIENNA_CICHLID", 134 "NAVY_FLOUNDER", 135 "VANGOGH", 136 "DIMGREY_CAVEFISH", 137 "BEIGE_GOBY", 138 "YELLOW_CARP", 139 "IP DISCOVERY", 140 "LAST", 141 }; 142 143 /** 144 * DOC: pcie_replay_count 145 * 146 * The amdgpu driver provides a sysfs API for reporting the total number 147 * of PCIe replays (NAKs) 148 * The file pcie_replay_count is used for this and returns the total 149 * number of replays as a sum of the NAKs generated and NAKs received 150 */ 151 152 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 153 struct device_attribute *attr, char *buf) 154 { 155 struct drm_device *ddev = dev_get_drvdata(dev); 156 struct amdgpu_device *adev = drm_to_adev(ddev); 157 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 158 159 return sysfs_emit(buf, "%llu\n", cnt); 160 } 161 162 static DEVICE_ATTR(pcie_replay_count, 0444, 163 amdgpu_device_get_pcie_replay_count, NULL); 164 165 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 166 struct bin_attribute *attr, char *buf, 167 loff_t ppos, size_t count) 168 { 169 struct device *dev = kobj_to_dev(kobj); 170 struct drm_device *ddev = dev_get_drvdata(dev); 171 struct amdgpu_device *adev = drm_to_adev(ddev); 172 ssize_t bytes_read; 173 174 switch (ppos) { 175 case AMDGPU_SYS_REG_STATE_XGMI: 176 bytes_read = amdgpu_asic_get_reg_state( 177 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 178 break; 179 case AMDGPU_SYS_REG_STATE_WAFL: 180 bytes_read = amdgpu_asic_get_reg_state( 181 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 182 break; 183 case AMDGPU_SYS_REG_STATE_PCIE: 184 bytes_read = amdgpu_asic_get_reg_state( 185 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 186 break; 187 case AMDGPU_SYS_REG_STATE_USR: 188 bytes_read = amdgpu_asic_get_reg_state( 189 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 190 break; 191 case AMDGPU_SYS_REG_STATE_USR_1: 192 bytes_read = amdgpu_asic_get_reg_state( 193 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 194 break; 195 default: 196 return -EINVAL; 197 } 198 199 return bytes_read; 200 } 201 202 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 203 AMDGPU_SYS_REG_STATE_END); 204 205 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 206 { 207 int ret; 208 209 if (!amdgpu_asic_get_reg_state_supported(adev)) 210 return 0; 211 212 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 213 214 return ret; 215 } 216 217 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 218 { 219 if (!amdgpu_asic_get_reg_state_supported(adev)) 220 return; 221 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 222 } 223 224 /** 225 * DOC: board_info 226 * 227 * The amdgpu driver provides a sysfs API for giving board related information. 

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for giving board related information.
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}
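
/*
 * Summary of the power-control checks above (descriptive comment only):
 * PX powers the dGPU off through the ATPX ACPI method, BOCO uses ACPI power
 * resources (PR3 or hybrid ATPX), BACO keeps the bus active while the chip
 * is powered off, and Smart Shift additionally requires ACPI power-shift
 * control on a BOCO-capable platform.
 */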

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}
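
/*
 * Note on the helper above: MM_INDEX/MM_DATA expose VRAM through a small
 * indexed register window. MM_INDEX_HI carries the offset bits above bit 31,
 * which is why the loop reprograms it whenever (pos >> 31) changes, and all
 * accesses use the NO_KIQ register path while holding mmio_idx_lock.
 */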

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}
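
/*
 * Illustrative use of the helper above (variable names are hypothetical,
 * not a call site in this file):
 *
 *   u32 data[4];
 *
 *   amdgpu_device_vram_access(adev, vram_offset, data, sizeof(data), false);
 *
 * The helper prefers the CPU-visible aperture and falls back to
 * MM_INDEX/MM_DATA for any remainder beyond visible VRAM.
 */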

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}
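
/*
 * The MMIO helpers below share the same access-path selection: registers
 * inside the BAR (reg * 4 < rmmio_size) use readl()/writel() directly,
 * unless SR-IOV runtime mode is active and KIQ access is allowed, in which
 * case the access is tunnelled through the KIQ ring; registers beyond the
 * BAR go through the PCIE index/data pair.
 */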

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, xcc_id);
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}
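
/*
 * Note on the xcc variants: when running as an SR-IOV VF outside runtime
 * mode and the ASIC supports RLCG register access, GC registers flagged by
 * amdgpu_virt_get_rlcg_reg_access_flag() are routed through
 * amdgpu_virt_rlcg_reg_rw() instead of plain MMIO.
 */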

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, xcc_id);
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}
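
/*
 * The indirect register helpers below use the NBIO-provided PCIE_INDEX /
 * PCIE_DATA register pair: the target address is written to the index
 * register and the payload is then read from or written to the data
 * register, all under pcie_idx_lock. Each writel() is followed by a readl()
 * of the same register so the posted write reaches the device before the
 * next access.
 */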

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}
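
/*
 * The *_ext variants extend the indirect access to 64-bit register
 * addresses: when the address has bits above 31 set and the NBIO block
 * exposes a PCIE_INDEX_HI offset, bits 39:32 are programmed there for the
 * access and cleared again afterwards so later non-ext accesses are
 * unaffected.
 */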

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
			pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Returns the device rev_id.
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	int ret;

	amdgpu_asic_pre_asic_init(adev);

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		ret = amdgpu_atomfirmware_asic_init(adev, true);
		/* TODO: check the return val and stop device initialization if boot fails */
		amdgpu_psp_query_boot_status(adev);
		return ret;
	} else {
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND/OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
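
/*
 * Golden register lists passed to the helper above are flat arrays of
 * {reg, and_mask, or_mask} triplets. A purely illustrative sketch (the
 * register name and values are hypothetical, not taken from this file):
 *
 *   static const u32 golden_settings_example[] = {
 *           mmSOME_REG, 0x0000ffff, 0x00001234,
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, golden_settings_example,
 *                                           ARRAY_SIZE(golden_settings_example));
 *
 * With and_mask == 0xffffffff the or_mask is written verbatim; otherwise the
 * register is read, the and_mask bits are cleared and the or_mask applied.
 */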

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}
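
/*
 * Illustrative use of the writeback helpers above (variable names are
 * hypothetical): each slot is 256 bits wide and the returned index is
 * already a dword offset into adev->wb.wb.
 *
 *   u32 wb_idx;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb_idx)) {
 *           u64 wb_gpu_addr = adev->wb.gpu_addr + (wb_idx * 4);
 *           ...
 *           amdgpu_device_wb_free(adev, wb_idx);
 *   }
 */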

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return false;

	return true;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup,
 * or if post is needed because a hw reset was performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In the whole-GPU pass-through virtualization case, after a
		 * VM reboot some old SMC firmware still needs the driver to do a vPost,
		 * otherwise the GPU hangs. SMC firmware above version 22.15 doesn't have
		 * this flaw, so we force vPost for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
			  amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return adev->ip_versions[DCE_HWIP][0] >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPUs change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
		return false;
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory; a page is 4KB so we have 12 bits of offset, a minimum of
 * 9 bits in the page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}
1985 * Returns the error code from the last instance. 1986 */ 1987 int amdgpu_device_ip_set_powergating_state(void *dev, 1988 enum amd_ip_block_type block_type, 1989 enum amd_powergating_state state) 1990 { 1991 struct amdgpu_device *adev = dev; 1992 int i, r = 0; 1993 1994 for (i = 0; i < adev->num_ip_blocks; i++) { 1995 if (!adev->ip_blocks[i].status.valid) 1996 continue; 1997 if (adev->ip_blocks[i].version->type != block_type) 1998 continue; 1999 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2000 continue; 2001 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2002 (void *)adev, state); 2003 if (r) 2004 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2005 adev->ip_blocks[i].version->funcs->name, r); 2006 } 2007 return r; 2008 } 2009 2010 /** 2011 * amdgpu_device_ip_get_clockgating_state - get the CG state 2012 * 2013 * @adev: amdgpu_device pointer 2014 * @flags: clockgating feature flags 2015 * 2016 * Walks the list of IPs on the device and updates the clockgating 2017 * flags for each IP. 2018 * Updates @flags with the feature flags for each hardware IP where 2019 * clockgating is enabled. 2020 */ 2021 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2022 u64 *flags) 2023 { 2024 int i; 2025 2026 for (i = 0; i < adev->num_ip_blocks; i++) { 2027 if (!adev->ip_blocks[i].status.valid) 2028 continue; 2029 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2030 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 2031 } 2032 } 2033 2034 /** 2035 * amdgpu_device_ip_wait_for_idle - wait for idle 2036 * 2037 * @adev: amdgpu_device pointer 2038 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2039 * 2040 * Waits for the requested hardware IP to be idle. 2041 * Returns 0 for success or a negative error code on failure. 2042 */ 2043 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2044 enum amd_ip_block_type block_type) 2045 { 2046 int i, r; 2047 2048 for (i = 0; i < adev->num_ip_blocks; i++) { 2049 if (!adev->ip_blocks[i].status.valid) 2050 continue; 2051 if (adev->ip_blocks[i].version->type == block_type) { 2052 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 2053 if (r) 2054 return r; 2055 break; 2056 } 2057 } 2058 return 0; 2059 2060 } 2061 2062 /** 2063 * amdgpu_device_ip_is_idle - is the hardware IP idle 2064 * 2065 * @adev: amdgpu_device pointer 2066 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2067 * 2068 * Check if the hardware IP is idle or not. 2069 * Returns true if the IP is idle, false if not. 2070 */ 2071 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 2072 enum amd_ip_block_type block_type) 2073 { 2074 int i; 2075 2076 for (i = 0; i < adev->num_ip_blocks; i++) { 2077 if (!adev->ip_blocks[i].status.valid) 2078 continue; 2079 if (adev->ip_blocks[i].version->type == block_type) 2080 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 2081 } 2082 return true; 2083 2084 } 2085 2086 /** 2087 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2088 * 2089 * @adev: amdgpu_device pointer 2090 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2091 * 2092 * Returns a pointer to the hardware IP block structure 2093 * if it exists for the asic, otherwise NULL.
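 *
 * Usage sketch (illustrative only): a caller looking up the GFX block would do
 * struct amdgpu_ip_block *ip = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 * and must check the result for NULL, since the block may not exist on a given asic.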
2094 */ 2095 struct amdgpu_ip_block * 2096 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2097 enum amd_ip_block_type type) 2098 { 2099 int i; 2100 2101 for (i = 0; i < adev->num_ip_blocks; i++) 2102 if (adev->ip_blocks[i].version->type == type) 2103 return &adev->ip_blocks[i]; 2104 2105 return NULL; 2106 } 2107 2108 /** 2109 * amdgpu_device_ip_block_version_cmp 2110 * 2111 * @adev: amdgpu_device pointer 2112 * @type: enum amd_ip_block_type 2113 * @major: major version 2114 * @minor: minor version 2115 * 2116 * return 0 if equal or greater 2117 * return 1 if smaller or the ip_block doesn't exist 2118 */ 2119 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2120 enum amd_ip_block_type type, 2121 u32 major, u32 minor) 2122 { 2123 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2124 2125 if (ip_block && ((ip_block->version->major > major) || 2126 ((ip_block->version->major == major) && 2127 (ip_block->version->minor >= minor)))) 2128 return 0; 2129 2130 return 1; 2131 } 2132 2133 /** 2134 * amdgpu_device_ip_block_add 2135 * 2136 * @adev: amdgpu_device pointer 2137 * @ip_block_version: pointer to the IP to add 2138 * 2139 * Adds the IP block driver information to the collection of IPs 2140 * on the asic. 2141 */ 2142 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2143 const struct amdgpu_ip_block_version *ip_block_version) 2144 { 2145 if (!ip_block_version) 2146 return -EINVAL; 2147 2148 switch (ip_block_version->type) { 2149 case AMD_IP_BLOCK_TYPE_VCN: 2150 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2151 return 0; 2152 break; 2153 case AMD_IP_BLOCK_TYPE_JPEG: 2154 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2155 return 0; 2156 break; 2157 default: 2158 break; 2159 } 2160 2161 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 2162 ip_block_version->funcs->name); 2163 2164 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2165 2166 return 0; 2167 } 2168 2169 /** 2170 * amdgpu_device_enable_virtual_display - enable virtual display feature 2171 * 2172 * @adev: amdgpu_device pointer 2173 * 2174 * Enables the virtual display feature if the user has enabled it via 2175 * the module parameter virtual_display. This feature provides virtual 2176 * display hardware on headless boards or in virtualized environments. 2177 * This function parses and validates the configuration string specified by 2178 * the user and configures the virtual display configuration (number of 2179 virtual connectors, crtcs, etc.) specified.
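 *
 * Based on the parsing below, the string is a semicolon-separated list of entries of
 * the form <pci address|"all">[,<number of crtcs>], with the crtc count clamped to 1-6;
 * for example (hypothetical addresses) virtual_display=0000:23:00.0,2 or
 * virtual_display=all,1.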
2180 */ 2181 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2182 { 2183 adev->enable_virtual_display = false; 2184 2185 if (amdgpu_virtual_display) { 2186 const char *pci_address_name = pci_name(adev->pdev); 2187 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2188 2189 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2190 pciaddstr_tmp = pciaddstr; 2191 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2192 pciaddname = strsep(&pciaddname_tmp, ","); 2193 if (!strcmp("all", pciaddname) 2194 || !strcmp(pci_address_name, pciaddname)) { 2195 long num_crtc; 2196 int res = -1; 2197 2198 adev->enable_virtual_display = true; 2199 2200 if (pciaddname_tmp) 2201 res = kstrtol(pciaddname_tmp, 10, 2202 &num_crtc); 2203 2204 if (!res) { 2205 if (num_crtc < 1) 2206 num_crtc = 1; 2207 if (num_crtc > 6) 2208 num_crtc = 6; 2209 adev->mode_info.num_crtc = num_crtc; 2210 } else { 2211 adev->mode_info.num_crtc = 1; 2212 } 2213 break; 2214 } 2215 } 2216 2217 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2218 amdgpu_virtual_display, pci_address_name, 2219 adev->enable_virtual_display, adev->mode_info.num_crtc); 2220 2221 kfree(pciaddstr); 2222 } 2223 } 2224 2225 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2226 { 2227 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2228 adev->mode_info.num_crtc = 1; 2229 adev->enable_virtual_display = true; 2230 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2231 adev->enable_virtual_display, adev->mode_info.num_crtc); 2232 } 2233 } 2234 2235 /** 2236 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2237 * 2238 * @adev: amdgpu_device pointer 2239 * 2240 * Parses the asic configuration parameters specified in the gpu info 2241 * firmware and makes them available to the driver for use in configuring 2242 * the asic. 2243 * Returns 0 on success, -EINVAL on failure. 2244 */ 2245 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2246 { 2247 const char *chip_name; 2248 char fw_name[40]; 2249 int err; 2250 const struct gpu_info_firmware_header_v1_0 *hdr; 2251 2252 adev->firmware.gpu_info_fw = NULL; 2253 2254 if (adev->mman.discovery_bin) { 2255 /* 2256 * FIXME: The bounding box is still needed by Navi12, so 2257 * temporarily read it from gpu_info firmware. Should be dropped 2258 * when DAL no longer needs it.
2259 */ 2260 if (adev->asic_type != CHIP_NAVI12) 2261 return 0; 2262 } 2263 2264 switch (adev->asic_type) { 2265 default: 2266 return 0; 2267 case CHIP_VEGA10: 2268 chip_name = "vega10"; 2269 break; 2270 case CHIP_VEGA12: 2271 chip_name = "vega12"; 2272 break; 2273 case CHIP_RAVEN: 2274 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2275 chip_name = "raven2"; 2276 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2277 chip_name = "picasso"; 2278 else 2279 chip_name = "raven"; 2280 break; 2281 case CHIP_ARCTURUS: 2282 chip_name = "arcturus"; 2283 break; 2284 case CHIP_NAVI12: 2285 chip_name = "navi12"; 2286 break; 2287 } 2288 2289 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2290 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 2291 if (err) { 2292 dev_err(adev->dev, 2293 "Failed to get gpu_info firmware \"%s\"\n", 2294 fw_name); 2295 goto out; 2296 } 2297 2298 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2299 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2300 2301 switch (hdr->version_major) { 2302 case 1: 2303 { 2304 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2305 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2306 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2307 2308 /* 2309 * Should be dropped when DAL no longer needs it. 2310 */ 2311 if (adev->asic_type == CHIP_NAVI12) 2312 goto parse_soc_bounding_box; 2313 2314 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2315 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2316 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2317 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2318 adev->gfx.config.max_texture_channel_caches = 2319 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2320 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2321 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2322 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2323 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2324 adev->gfx.config.double_offchip_lds_buf = 2325 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2326 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2327 adev->gfx.cu_info.max_waves_per_simd = 2328 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2329 adev->gfx.cu_info.max_scratch_slots_per_cu = 2330 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2331 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2332 if (hdr->version_minor >= 1) { 2333 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2334 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2335 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2336 adev->gfx.config.num_sc_per_sh = 2337 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2338 adev->gfx.config.num_packer_per_sc = 2339 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2340 } 2341 2342 parse_soc_bounding_box: 2343 /* 2344 * soc bounding box info is not integrated in the discovery table, 2345 * we always need to parse it from gpu info firmware if needed.
2346 */ 2347 if (hdr->version_minor == 2) { 2348 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2349 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2350 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2351 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2352 } 2353 break; 2354 } 2355 default: 2356 dev_err(adev->dev, 2357 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2358 err = -EINVAL; 2359 goto out; 2360 } 2361 out: 2362 return err; 2363 } 2364 2365 /** 2366 * amdgpu_device_ip_early_init - run early init for hardware IPs 2367 * 2368 * @adev: amdgpu_device pointer 2369 * 2370 * Early initialization pass for hardware IPs. The hardware IPs that make 2371 * up each asic are discovered each IP's early_init callback is run. This 2372 * is the first stage in initializing the asic. 2373 * Returns 0 on success, negative error code on failure. 2374 */ 2375 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2376 { 2377 struct pci_dev *parent; 2378 int i, r; 2379 bool total; 2380 2381 amdgpu_device_enable_virtual_display(adev); 2382 2383 if (amdgpu_sriov_vf(adev)) { 2384 r = amdgpu_virt_request_full_gpu(adev, true); 2385 if (r) 2386 return r; 2387 } 2388 2389 switch (adev->asic_type) { 2390 #ifdef CONFIG_DRM_AMDGPU_SI 2391 case CHIP_VERDE: 2392 case CHIP_TAHITI: 2393 case CHIP_PITCAIRN: 2394 case CHIP_OLAND: 2395 case CHIP_HAINAN: 2396 adev->family = AMDGPU_FAMILY_SI; 2397 r = si_set_ip_blocks(adev); 2398 if (r) 2399 return r; 2400 break; 2401 #endif 2402 #ifdef CONFIG_DRM_AMDGPU_CIK 2403 case CHIP_BONAIRE: 2404 case CHIP_HAWAII: 2405 case CHIP_KAVERI: 2406 case CHIP_KABINI: 2407 case CHIP_MULLINS: 2408 if (adev->flags & AMD_IS_APU) 2409 adev->family = AMDGPU_FAMILY_KV; 2410 else 2411 adev->family = AMDGPU_FAMILY_CI; 2412 2413 r = cik_set_ip_blocks(adev); 2414 if (r) 2415 return r; 2416 break; 2417 #endif 2418 case CHIP_TOPAZ: 2419 case CHIP_TONGA: 2420 case CHIP_FIJI: 2421 case CHIP_POLARIS10: 2422 case CHIP_POLARIS11: 2423 case CHIP_POLARIS12: 2424 case CHIP_VEGAM: 2425 case CHIP_CARRIZO: 2426 case CHIP_STONEY: 2427 if (adev->flags & AMD_IS_APU) 2428 adev->family = AMDGPU_FAMILY_CZ; 2429 else 2430 adev->family = AMDGPU_FAMILY_VI; 2431 2432 r = vi_set_ip_blocks(adev); 2433 if (r) 2434 return r; 2435 break; 2436 default: 2437 r = amdgpu_discovery_set_ip_blocks(adev); 2438 if (r) 2439 return r; 2440 break; 2441 } 2442 2443 if (amdgpu_has_atpx() && 2444 (amdgpu_is_atpx_hybrid() || 2445 amdgpu_has_atpx_dgpu_power_cntl()) && 2446 ((adev->flags & AMD_IS_APU) == 0) && 2447 !dev_is_removable(&adev->pdev->dev)) 2448 adev->flags |= AMD_IS_PX; 2449 2450 if (!(adev->flags & AMD_IS_APU)) { 2451 parent = pcie_find_root_port(adev->pdev); 2452 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2453 } 2454 2455 2456 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2457 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2458 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2459 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2460 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2461 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2462 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2463 2464 total = true; 2465 for (i = 0; i < adev->num_ip_blocks; i++) { 2466 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2467 DRM_WARN("disabled ip block: %d <%s>\n", 2468 i, adev->ip_blocks[i].version->funcs->name); 2469 adev->ip_blocks[i].status.valid = false; 2470 } else { 2471 if (adev->ip_blocks[i].version->funcs->early_init) { 2472 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2473 if (r == -ENOENT) { 2474 adev->ip_blocks[i].status.valid = false; 2475 } else if (r) { 2476 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2477 adev->ip_blocks[i].version->funcs->name, r); 2478 total = false; 2479 } else { 2480 adev->ip_blocks[i].status.valid = true; 2481 } 2482 } else { 2483 adev->ip_blocks[i].status.valid = true; 2484 } 2485 } 2486 /* get the vbios after the asic_funcs are set up */ 2487 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2488 r = amdgpu_device_parse_gpu_info_fw(adev); 2489 if (r) 2490 return r; 2491 2492 /* Read BIOS */ 2493 if (amdgpu_device_read_bios(adev)) { 2494 if (!amdgpu_get_bios(adev)) 2495 return -EINVAL; 2496 2497 r = amdgpu_atombios_init(adev); 2498 if (r) { 2499 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2500 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2501 return r; 2502 } 2503 } 2504 2505 /*get pf2vf msg info at it's earliest time*/ 2506 if (amdgpu_sriov_vf(adev)) 2507 amdgpu_virt_init_data_exchange(adev); 2508 2509 } 2510 } 2511 if (!total) 2512 return -ENODEV; 2513 2514 amdgpu_amdkfd_device_probe(adev); 2515 adev->cg_flags &= amdgpu_cg_mask; 2516 adev->pg_flags &= amdgpu_pg_mask; 2517 2518 return 0; 2519 } 2520 2521 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2522 { 2523 int i, r; 2524 2525 for (i = 0; i < adev->num_ip_blocks; i++) { 2526 if (!adev->ip_blocks[i].status.sw) 2527 continue; 2528 if (adev->ip_blocks[i].status.hw) 2529 continue; 2530 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2531 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2532 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2533 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2534 if (r) { 2535 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2536 adev->ip_blocks[i].version->funcs->name, r); 2537 return r; 2538 } 2539 adev->ip_blocks[i].status.hw = true; 2540 } 2541 } 2542 2543 return 0; 2544 } 2545 2546 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2547 { 2548 int i, r; 2549 2550 for (i = 0; i < adev->num_ip_blocks; i++) { 2551 if (!adev->ip_blocks[i].status.sw) 2552 continue; 2553 if (adev->ip_blocks[i].status.hw) 2554 continue; 2555 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2556 if (r) { 2557 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2558 adev->ip_blocks[i].version->funcs->name, r); 2559 return r; 2560 } 2561 adev->ip_blocks[i].status.hw = true; 2562 } 2563 2564 return 0; 2565 } 2566 2567 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2568 { 2569 int r = 0; 2570 int i; 2571 uint32_t 
smu_version; 2572 2573 if (adev->asic_type >= CHIP_VEGA10) { 2574 for (i = 0; i < adev->num_ip_blocks; i++) { 2575 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2576 continue; 2577 2578 if (!adev->ip_blocks[i].status.sw) 2579 continue; 2580 2581 /* no need to do the fw loading again if already done*/ 2582 if (adev->ip_blocks[i].status.hw == true) 2583 break; 2584 2585 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2586 r = adev->ip_blocks[i].version->funcs->resume(adev); 2587 if (r) { 2588 DRM_ERROR("resume of IP block <%s> failed %d\n", 2589 adev->ip_blocks[i].version->funcs->name, r); 2590 return r; 2591 } 2592 } else { 2593 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2594 if (r) { 2595 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2596 adev->ip_blocks[i].version->funcs->name, r); 2597 return r; 2598 } 2599 } 2600 2601 adev->ip_blocks[i].status.hw = true; 2602 break; 2603 } 2604 } 2605 2606 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2607 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2608 2609 return r; 2610 } 2611 2612 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2613 { 2614 long timeout; 2615 int r, i; 2616 2617 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2618 struct amdgpu_ring *ring = adev->rings[i]; 2619 2620 /* No need to setup the GPU scheduler for rings that don't need it */ 2621 if (!ring || ring->no_scheduler) 2622 continue; 2623 2624 switch (ring->funcs->type) { 2625 case AMDGPU_RING_TYPE_GFX: 2626 timeout = adev->gfx_timeout; 2627 break; 2628 case AMDGPU_RING_TYPE_COMPUTE: 2629 timeout = adev->compute_timeout; 2630 break; 2631 case AMDGPU_RING_TYPE_SDMA: 2632 timeout = adev->sdma_timeout; 2633 break; 2634 default: 2635 timeout = adev->video_timeout; 2636 break; 2637 } 2638 2639 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, 2640 DRM_SCHED_PRIORITY_COUNT, 2641 ring->num_hw_submission, 0, 2642 timeout, adev->reset_domain->wq, 2643 ring->sched_score, ring->name, 2644 adev->dev); 2645 if (r) { 2646 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2647 ring->name); 2648 return r; 2649 } 2650 r = amdgpu_uvd_entity_init(adev, ring); 2651 if (r) { 2652 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2653 ring->name); 2654 return r; 2655 } 2656 r = amdgpu_vce_entity_init(adev, ring); 2657 if (r) { 2658 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2659 ring->name); 2660 return r; 2661 } 2662 } 2663 2664 amdgpu_xcp_update_partition_sched_list(adev); 2665 2666 return 0; 2667 } 2668 2669 2670 /** 2671 * amdgpu_device_ip_init - run init for hardware IPs 2672 * 2673 * @adev: amdgpu_device pointer 2674 * 2675 * Main initialization pass for hardware IPs. The list of all the hardware 2676 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2677 * are run. sw_init initializes the software state associated with each IP 2678 * and hw_init initializes the hardware associated with each IP. 2679 * Returns 0 on success, negative error code on failure. 
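 *
 * As implemented below, the bring-up is staged: sw_init runs for every valid block
 * (with COMMON and GMC hw_init pulled forward so GPU memory becomes usable), then
 * amdgpu_device_ip_hw_init_phase1(), amdgpu_device_fw_loading() and
 * amdgpu_device_ip_hw_init_phase2() complete hardware init for the remaining blocks.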
2680 */ 2681 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2682 { 2683 int i, r; 2684 2685 r = amdgpu_ras_init(adev); 2686 if (r) 2687 return r; 2688 2689 for (i = 0; i < adev->num_ip_blocks; i++) { 2690 if (!adev->ip_blocks[i].status.valid) 2691 continue; 2692 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2693 if (r) { 2694 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2695 adev->ip_blocks[i].version->funcs->name, r); 2696 goto init_failed; 2697 } 2698 adev->ip_blocks[i].status.sw = true; 2699 2700 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2701 /* need to do common hw init early so everything is set up for gmc */ 2702 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2703 if (r) { 2704 DRM_ERROR("hw_init %d failed %d\n", i, r); 2705 goto init_failed; 2706 } 2707 adev->ip_blocks[i].status.hw = true; 2708 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2709 /* need to do gmc hw init early so we can allocate gpu mem */ 2710 /* Try to reserve bad pages early */ 2711 if (amdgpu_sriov_vf(adev)) 2712 amdgpu_virt_exchange_data(adev); 2713 2714 r = amdgpu_device_mem_scratch_init(adev); 2715 if (r) { 2716 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2717 goto init_failed; 2718 } 2719 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2720 if (r) { 2721 DRM_ERROR("hw_init %d failed %d\n", i, r); 2722 goto init_failed; 2723 } 2724 r = amdgpu_device_wb_init(adev); 2725 if (r) { 2726 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2727 goto init_failed; 2728 } 2729 adev->ip_blocks[i].status.hw = true; 2730 2731 /* right after GMC hw init, we create CSA */ 2732 if (adev->gfx.mcbp) { 2733 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2734 AMDGPU_GEM_DOMAIN_VRAM | 2735 AMDGPU_GEM_DOMAIN_GTT, 2736 AMDGPU_CSA_SIZE); 2737 if (r) { 2738 DRM_ERROR("allocate CSA failed %d\n", r); 2739 goto init_failed; 2740 } 2741 } 2742 2743 r = amdgpu_seq64_init(adev); 2744 if (r) { 2745 DRM_ERROR("allocate seq64 failed %d\n", r); 2746 goto init_failed; 2747 } 2748 } 2749 } 2750 2751 if (amdgpu_sriov_vf(adev)) 2752 amdgpu_virt_init_data_exchange(adev); 2753 2754 r = amdgpu_ib_pool_init(adev); 2755 if (r) { 2756 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2757 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2758 goto init_failed; 2759 } 2760 2761 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2762 if (r) 2763 goto init_failed; 2764 2765 r = amdgpu_device_ip_hw_init_phase1(adev); 2766 if (r) 2767 goto init_failed; 2768 2769 r = amdgpu_device_fw_loading(adev); 2770 if (r) 2771 goto init_failed; 2772 2773 r = amdgpu_device_ip_hw_init_phase2(adev); 2774 if (r) 2775 goto init_failed; 2776 2777 /* 2778 * retired pages will be loaded from eeprom and reserved here, 2779 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2780 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2781 * for I2C communication which only true at this point. 2782 * 2783 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2784 * failure from bad gpu situation and stop amdgpu init process 2785 * accordingly. For other failed cases, it will still release all 2786 * the resource and print error message, rather than returning one 2787 * negative value to upper level. 
2788 * 2789 * Note: theoretically, this should be called before all vram allocations 2790 * to protect retired pages from being abused 2791 */ 2792 r = amdgpu_ras_recovery_init(adev); 2793 if (r) 2794 goto init_failed; 2795 2796 /* 2797 * In case of XGMI grab extra reference for reset domain for this device 2798 */ 2799 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2800 if (amdgpu_xgmi_add_device(adev) == 0) { 2801 if (!amdgpu_sriov_vf(adev)) { 2802 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2803 2804 if (WARN_ON(!hive)) { 2805 r = -ENOENT; 2806 goto init_failed; 2807 } 2808 2809 if (!hive->reset_domain || 2810 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2811 r = -ENOENT; 2812 amdgpu_put_xgmi_hive(hive); 2813 goto init_failed; 2814 } 2815 2816 /* Drop the early temporary reset domain we created for device */ 2817 amdgpu_reset_put_reset_domain(adev->reset_domain); 2818 adev->reset_domain = hive->reset_domain; 2819 amdgpu_put_xgmi_hive(hive); 2820 } 2821 } 2822 } 2823 2824 r = amdgpu_device_init_schedulers(adev); 2825 if (r) 2826 goto init_failed; 2827 2828 if (adev->mman.buffer_funcs_ring->sched.ready) 2829 amdgpu_ttm_set_buffer_funcs_status(adev, true); 2830 2831 /* Don't init kfd if the whole hive needs to be reset during init */ 2832 if (!adev->gmc.xgmi.pending_reset) { 2833 kgd2kfd_init_zone_device(adev); 2834 amdgpu_amdkfd_device_init(adev); 2835 } 2836 2837 amdgpu_fru_get_product_info(adev); 2838 2839 init_failed: 2840 2841 return r; 2842 } 2843 2844 /** 2845 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2846 * 2847 * @adev: amdgpu_device pointer 2848 * 2849 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2850 * this function before a GPU reset. If the value is retained after a 2851 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2852 */ 2853 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2854 { 2855 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2856 } 2857 2858 /** 2859 * amdgpu_device_check_vram_lost - check if vram is valid 2860 * 2861 * @adev: amdgpu_device pointer 2862 * 2863 * Checks the reset magic value written to the gart pointer in VRAM. 2864 * The driver calls this after a GPU reset to see if the contents of 2865 * VRAM is lost or not. 2866 * returns true if vram is lost, false if not. 2867 */ 2868 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2869 { 2870 if (memcmp(adev->gart.ptr, adev->reset_magic, 2871 AMDGPU_RESET_MAGIC_NUM)) 2872 return true; 2873 2874 if (!amdgpu_in_reset(adev)) 2875 return false; 2876 2877 /* 2878 * For all ASICs with baco/mode1 reset, the VRAM is 2879 * always assumed to be lost. 2880 */ 2881 switch (amdgpu_asic_reset_method(adev)) { 2882 case AMD_RESET_METHOD_BACO: 2883 case AMD_RESET_METHOD_MODE1: 2884 return true; 2885 default: 2886 return false; 2887 } 2888 } 2889 2890 /** 2891 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2892 * 2893 * @adev: amdgpu_device pointer 2894 * @state: clockgating state (gate or ungate) 2895 * 2896 * The list of all the hardware IPs that make up the asic is walked and the 2897 * set_clockgating_state callbacks are run. 2898 * On late init, this pass enables clockgating for the hardware IPs. 2899 * On fini or suspend, it is used to disable clockgating again. 2900 * Returns 0 on success, negative error code on failure.
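 *
 * Example of how this is used elsewhere in this file: amdgpu_device_ip_late_init()
 * calls amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE) to enable clockgating,
 * while amdgpu_device_ip_fini_early() passes AMD_CG_STATE_UNGATE to disable it.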
2901 */ 2902 2903 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2904 enum amd_clockgating_state state) 2905 { 2906 int i, j, r; 2907 2908 if (amdgpu_emu_mode == 1) 2909 return 0; 2910 2911 for (j = 0; j < adev->num_ip_blocks; j++) { 2912 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2913 if (!adev->ip_blocks[i].status.late_initialized) 2914 continue; 2915 /* skip CG for GFX, SDMA on S0ix */ 2916 if (adev->in_s0ix && 2917 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2918 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2919 continue; 2920 /* skip CG for VCE/UVD, it's handled specially */ 2921 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2922 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2923 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2924 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2925 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2926 /* enable clockgating to save power */ 2927 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2928 state); 2929 if (r) { 2930 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2931 adev->ip_blocks[i].version->funcs->name, r); 2932 return r; 2933 } 2934 } 2935 } 2936 2937 return 0; 2938 } 2939 2940 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2941 enum amd_powergating_state state) 2942 { 2943 int i, j, r; 2944 2945 if (amdgpu_emu_mode == 1) 2946 return 0; 2947 2948 for (j = 0; j < adev->num_ip_blocks; j++) { 2949 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2950 if (!adev->ip_blocks[i].status.late_initialized) 2951 continue; 2952 /* skip PG for GFX, SDMA on S0ix */ 2953 if (adev->in_s0ix && 2954 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2955 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2956 continue; 2957 /* skip CG for VCE/UVD, it's handled specially */ 2958 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2959 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2960 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2961 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2962 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2963 /* enable powergating to save power */ 2964 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2965 state); 2966 if (r) { 2967 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2968 adev->ip_blocks[i].version->funcs->name, r); 2969 return r; 2970 } 2971 } 2972 } 2973 return 0; 2974 } 2975 2976 static int amdgpu_device_enable_mgpu_fan_boost(void) 2977 { 2978 struct amdgpu_gpu_instance *gpu_ins; 2979 struct amdgpu_device *adev; 2980 int i, ret = 0; 2981 2982 mutex_lock(&mgpu_info.mutex); 2983 2984 /* 2985 * MGPU fan boost feature should be enabled 2986 * only when there are two or more dGPUs in 2987 * the system 2988 */ 2989 if (mgpu_info.num_dgpu < 2) 2990 goto out; 2991 2992 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2993 gpu_ins = &(mgpu_info.gpu_ins[i]); 2994 adev = gpu_ins->adev; 2995 if (!(adev->flags & AMD_IS_APU) && 2996 !gpu_ins->mgpu_fan_enabled) { 2997 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2998 if (ret) 2999 break; 3000 3001 gpu_ins->mgpu_fan_enabled = 1; 3002 } 3003 } 3004 3005 out: 3006 mutex_unlock(&mgpu_info.mutex); 3007 3008 return ret; 3009 } 3010 3011 /** 3012 * amdgpu_device_ip_late_init - run late init for hardware IPs 3013 * 3014 * @adev: 
amdgpu_device pointer 3015 * 3016 * Late initialization pass for hardware IPs. The list of all the hardware 3017 * IPs that make up the asic is walked and the late_init callbacks are run. 3018 * late_init covers any special initialization that an IP requires 3019 * after all of the IPs have been initialized or something that needs to happen 3020 * late in the init process. 3021 * Returns 0 on success, negative error code on failure. 3022 */ 3023 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3024 { 3025 struct amdgpu_gpu_instance *gpu_instance; 3026 int i = 0, r; 3027 3028 for (i = 0; i < adev->num_ip_blocks; i++) { 3029 if (!adev->ip_blocks[i].status.hw) 3030 continue; 3031 if (adev->ip_blocks[i].version->funcs->late_init) { 3032 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 3033 if (r) { 3034 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3035 adev->ip_blocks[i].version->funcs->name, r); 3036 return r; 3037 } 3038 } 3039 adev->ip_blocks[i].status.late_initialized = true; 3040 } 3041 3042 r = amdgpu_ras_late_init(adev); 3043 if (r) { 3044 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3045 return r; 3046 } 3047 3048 amdgpu_ras_set_error_query_ready(adev, true); 3049 3050 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3051 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3052 3053 amdgpu_device_fill_reset_magic(adev); 3054 3055 r = amdgpu_device_enable_mgpu_fan_boost(); 3056 if (r) 3057 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3058 3059 /* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */ 3060 if (amdgpu_passthrough(adev) && 3061 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3062 adev->asic_type == CHIP_ALDEBARAN)) 3063 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3064 3065 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3066 mutex_lock(&mgpu_info.mutex); 3067 3068 /* 3069 * Reset device p-state to low as this was booted with high. 3070 * 3071 * This should be performed only after all devices from the same 3072 * hive get initialized. 3073 * 3074 * However, the number of devices in the hive is not known in advance, 3075 * as it is counted one by one during device initialization. 3076 * 3077 * So we wait until all XGMI interlinked devices are initialized. 3078 * This may bring some delays as those devices may come from 3079 * different hives. But that should be OK.
3080 */ 3081 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3082 for (i = 0; i < mgpu_info.num_gpu; i++) { 3083 gpu_instance = &(mgpu_info.gpu_ins[i]); 3084 if (gpu_instance->adev->flags & AMD_IS_APU) 3085 continue; 3086 3087 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3088 AMDGPU_XGMI_PSTATE_MIN); 3089 if (r) { 3090 DRM_ERROR("pstate setting failed (%d).\n", r); 3091 break; 3092 } 3093 } 3094 } 3095 3096 mutex_unlock(&mgpu_info.mutex); 3097 } 3098 3099 return 0; 3100 } 3101 3102 /** 3103 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3104 * 3105 * @adev: amdgpu_device pointer 3106 * 3107 * For ASICs that need to disable the SMC first 3108 */ 3109 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3110 { 3111 int i, r; 3112 3113 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3114 return; 3115 3116 for (i = 0; i < adev->num_ip_blocks; i++) { 3117 if (!adev->ip_blocks[i].status.hw) 3118 continue; 3119 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3120 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3121 /* XXX handle errors */ 3122 if (r) { 3123 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3124 adev->ip_blocks[i].version->funcs->name, r); 3125 } 3126 adev->ip_blocks[i].status.hw = false; 3127 break; 3128 } 3129 } 3130 } 3131 3132 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3133 { 3134 int i, r; 3135 3136 for (i = 0; i < adev->num_ip_blocks; i++) { 3137 if (!adev->ip_blocks[i].version->funcs->early_fini) 3138 continue; 3139 3140 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 3141 if (r) { 3142 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3143 adev->ip_blocks[i].version->funcs->name, r); 3144 } 3145 } 3146 3147 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3148 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3149 3150 amdgpu_amdkfd_suspend(adev, false); 3151 3152 /* Workaround for ASICs that need to disable the SMC first */ 3153 amdgpu_device_smu_fini_early(adev); 3154 3155 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3156 if (!adev->ip_blocks[i].status.hw) 3157 continue; 3158 3159 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3160 /* XXX handle errors */ 3161 if (r) { 3162 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3163 adev->ip_blocks[i].version->funcs->name, r); 3164 } 3165 3166 adev->ip_blocks[i].status.hw = false; 3167 } 3168 3169 if (amdgpu_sriov_vf(adev)) { 3170 if (amdgpu_virt_release_full_gpu(adev, false)) 3171 DRM_ERROR("failed to release exclusive mode on fini\n"); 3172 } 3173 3174 return 0; 3175 } 3176 3177 /** 3178 * amdgpu_device_ip_fini - run fini for hardware IPs 3179 * 3180 * @adev: amdgpu_device pointer 3181 * 3182 * Main teardown pass for hardware IPs. The list of all the hardware 3183 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3184 * are run. hw_fini tears down the hardware associated with each IP 3185 * and sw_fini tears down any software state associated with each IP. 3186 * Returns 0 on success, negative error code on failure.
3187 */ 3188 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3189 { 3190 int i, r; 3191 3192 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3193 amdgpu_virt_release_ras_err_handler_data(adev); 3194 3195 if (adev->gmc.xgmi.num_physical_nodes > 1) 3196 amdgpu_xgmi_remove_device(adev); 3197 3198 amdgpu_amdkfd_device_fini_sw(adev); 3199 3200 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3201 if (!adev->ip_blocks[i].status.sw) 3202 continue; 3203 3204 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3205 amdgpu_ucode_free_bo(adev); 3206 amdgpu_free_static_csa(&adev->virt.csa_obj); 3207 amdgpu_device_wb_fini(adev); 3208 amdgpu_device_mem_scratch_fini(adev); 3209 amdgpu_ib_pool_fini(adev); 3210 amdgpu_seq64_fini(adev); 3211 } 3212 3213 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 3214 /* XXX handle errors */ 3215 if (r) { 3216 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3217 adev->ip_blocks[i].version->funcs->name, r); 3218 } 3219 adev->ip_blocks[i].status.sw = false; 3220 adev->ip_blocks[i].status.valid = false; 3221 } 3222 3223 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3224 if (!adev->ip_blocks[i].status.late_initialized) 3225 continue; 3226 if (adev->ip_blocks[i].version->funcs->late_fini) 3227 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 3228 adev->ip_blocks[i].status.late_initialized = false; 3229 } 3230 3231 amdgpu_ras_fini(adev); 3232 3233 return 0; 3234 } 3235 3236 /** 3237 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3238 * 3239 * @work: work_struct. 3240 */ 3241 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3242 { 3243 struct amdgpu_device *adev = 3244 container_of(work, struct amdgpu_device, delayed_init_work.work); 3245 int r; 3246 3247 r = amdgpu_ib_ring_tests(adev); 3248 if (r) 3249 DRM_ERROR("ib ring test failed (%d).\n", r); 3250 } 3251 3252 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3253 { 3254 struct amdgpu_device *adev = 3255 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3256 3257 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3258 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3259 3260 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3261 adev->gfx.gfx_off_state = true; 3262 } 3263 3264 /** 3265 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3266 * 3267 * @adev: amdgpu_device pointer 3268 * 3269 * Main suspend function for hardware IPs. The list of all the hardware 3270 * IPs that make up the asic is walked, clockgating is disabled and the 3271 * suspend callbacks are run. suspend puts the hardware and software state 3272 * in each IP into a state suitable for suspend. 3273 * Returns 0 on success, negative error code on failure. 3274 */ 3275 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3276 { 3277 int i, r; 3278 3279 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3280 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3281 3282 /* 3283 * Per PMFW team's suggestion, driver needs to handle gfxoff 3284 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3285 * scenario. Add the missing df cstate disablement here. 
3286 */ 3287 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3288 dev_warn(adev->dev, "Failed to disallow df cstate"); 3289 3290 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3291 if (!adev->ip_blocks[i].status.valid) 3292 continue; 3293 3294 /* displays are handled separately */ 3295 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3296 continue; 3297 3298 /* XXX handle errors */ 3299 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3300 /* XXX handle errors */ 3301 if (r) { 3302 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3303 adev->ip_blocks[i].version->funcs->name, r); 3304 return r; 3305 } 3306 3307 adev->ip_blocks[i].status.hw = false; 3308 } 3309 3310 return 0; 3311 } 3312 3313 /** 3314 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3315 * 3316 * @adev: amdgpu_device pointer 3317 * 3318 * Main suspend function for hardware IPs. The list of all the hardware 3319 * IPs that make up the asic is walked, clockgating is disabled and the 3320 * suspend callbacks are run. suspend puts the hardware and software state 3321 * in each IP into a state suitable for suspend. 3322 * Returns 0 on success, negative error code on failure. 3323 */ 3324 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3325 { 3326 int i, r; 3327 3328 if (adev->in_s0ix) 3329 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3330 3331 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3332 if (!adev->ip_blocks[i].status.valid) 3333 continue; 3334 /* displays are handled in phase1 */ 3335 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3336 continue; 3337 /* PSP lost connection when err_event_athub occurs */ 3338 if (amdgpu_ras_intr_triggered() && 3339 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3340 adev->ip_blocks[i].status.hw = false; 3341 continue; 3342 } 3343 3344 /* skip unnecessary suspend if we do not initialize them yet */ 3345 if (adev->gmc.xgmi.pending_reset && 3346 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3347 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3348 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3349 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3350 adev->ip_blocks[i].status.hw = false; 3351 continue; 3352 } 3353 3354 /* skip suspend of gfx/mes and psp for S0ix 3355 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3356 * like at runtime. PSP is also part of the always on hardware 3357 * so no need to suspend it. 3358 */ 3359 if (adev->in_s0ix && 3360 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3361 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3362 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3363 continue; 3364 3365 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3366 if (adev->in_s0ix && 3367 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3368 IP_VERSION(5, 0, 0)) && 3369 (adev->ip_blocks[i].version->type == 3370 AMD_IP_BLOCK_TYPE_SDMA)) 3371 continue; 3372 3373 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3374 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3375 * from this location and RLC Autoload automatically also gets loaded 3376 * from here based on PMFW -> PSP message during re-init sequence. 3377 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3378 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3379 */ 3380 if (amdgpu_in_reset(adev) && 3381 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3382 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3383 continue; 3384 3385 /* XXX handle errors */ 3386 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3387 /* XXX handle errors */ 3388 if (r) { 3389 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3390 adev->ip_blocks[i].version->funcs->name, r); 3391 } 3392 adev->ip_blocks[i].status.hw = false; 3393 /* handle putting the SMC in the appropriate state */ 3394 if (!amdgpu_sriov_vf(adev)) { 3395 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3396 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3397 if (r) { 3398 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3399 adev->mp1_state, r); 3400 return r; 3401 } 3402 } 3403 } 3404 } 3405 3406 return 0; 3407 } 3408 3409 /** 3410 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3411 * 3412 * @adev: amdgpu_device pointer 3413 * 3414 * Main suspend function for hardware IPs. The list of all the hardware 3415 * IPs that make up the asic is walked, clockgating is disabled and the 3416 * suspend callbacks are run. suspend puts the hardware and software state 3417 * in each IP into a state suitable for suspend. 3418 * Returns 0 on success, negative error code on failure. 3419 */ 3420 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3421 { 3422 int r; 3423 3424 if (amdgpu_sriov_vf(adev)) { 3425 amdgpu_virt_fini_data_exchange(adev); 3426 amdgpu_virt_request_full_gpu(adev, false); 3427 } 3428 3429 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3430 3431 r = amdgpu_device_ip_suspend_phase1(adev); 3432 if (r) 3433 return r; 3434 r = amdgpu_device_ip_suspend_phase2(adev); 3435 3436 if (amdgpu_sriov_vf(adev)) 3437 amdgpu_virt_release_full_gpu(adev, false); 3438 3439 return r; 3440 } 3441 3442 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3443 { 3444 int i, r; 3445 3446 static enum amd_ip_block_type ip_order[] = { 3447 AMD_IP_BLOCK_TYPE_COMMON, 3448 AMD_IP_BLOCK_TYPE_GMC, 3449 AMD_IP_BLOCK_TYPE_PSP, 3450 AMD_IP_BLOCK_TYPE_IH, 3451 }; 3452 3453 for (i = 0; i < adev->num_ip_blocks; i++) { 3454 int j; 3455 struct amdgpu_ip_block *block; 3456 3457 block = &adev->ip_blocks[i]; 3458 block->status.hw = false; 3459 3460 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3461 3462 if (block->version->type != ip_order[j] || 3463 !block->status.valid) 3464 continue; 3465 3466 r = block->version->funcs->hw_init(adev); 3467 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3468 if (r) 3469 return r; 3470 block->status.hw = true; 3471 } 3472 } 3473 3474 return 0; 3475 } 3476 3477 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3478 { 3479 int i, r; 3480 3481 static enum amd_ip_block_type ip_order[] = { 3482 AMD_IP_BLOCK_TYPE_SMC, 3483 AMD_IP_BLOCK_TYPE_DCE, 3484 AMD_IP_BLOCK_TYPE_GFX, 3485 AMD_IP_BLOCK_TYPE_SDMA, 3486 AMD_IP_BLOCK_TYPE_MES, 3487 AMD_IP_BLOCK_TYPE_UVD, 3488 AMD_IP_BLOCK_TYPE_VCE, 3489 AMD_IP_BLOCK_TYPE_VCN, 3490 AMD_IP_BLOCK_TYPE_JPEG 3491 }; 3492 3493 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3494 int j; 3495 struct amdgpu_ip_block *block; 3496 3497 for (j = 0; j < adev->num_ip_blocks; j++) { 3498 block = &adev->ip_blocks[j]; 3499 3500 if (block->version->type != ip_order[i] || 3501 !block->status.valid || 3502 block->status.hw) 3503 continue; 3504 3505 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3506 r = block->version->funcs->resume(adev); 3507 else 
3508 r = block->version->funcs->hw_init(adev); 3509 3510 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3511 if (r) 3512 return r; 3513 block->status.hw = true; 3514 } 3515 } 3516 3517 return 0; 3518 } 3519 3520 /** 3521 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3522 * 3523 * @adev: amdgpu_device pointer 3524 * 3525 * First resume function for hardware IPs. The list of all the hardware 3526 * IPs that make up the asic is walked and the resume callbacks are run for 3527 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3528 * after a suspend and updates the software state as necessary. This 3529 * function is also used for restoring the GPU after a GPU reset. 3530 * Returns 0 on success, negative error code on failure. 3531 */ 3532 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3533 { 3534 int i, r; 3535 3536 for (i = 0; i < adev->num_ip_blocks; i++) { 3537 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3538 continue; 3539 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3540 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3541 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3542 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3543 3544 r = adev->ip_blocks[i].version->funcs->resume(adev); 3545 if (r) { 3546 DRM_ERROR("resume of IP block <%s> failed %d\n", 3547 adev->ip_blocks[i].version->funcs->name, r); 3548 return r; 3549 } 3550 adev->ip_blocks[i].status.hw = true; 3551 } 3552 } 3553 3554 return 0; 3555 } 3556 3557 /** 3558 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3559 * 3560 * @adev: amdgpu_device pointer 3561 * 3562 * Second resume function for hardware IPs. The list of all the hardware 3563 * IPs that make up the asic is walked and the resume callbacks are run for 3564 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3565 * functional state after a suspend and updates the software state as 3566 * necessary. This function is also used for restoring the GPU after a GPU 3567 * reset. 3568 * Returns 0 on success, negative error code on failure. 3569 */ 3570 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3571 { 3572 int i, r; 3573 3574 for (i = 0; i < adev->num_ip_blocks; i++) { 3575 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3576 continue; 3577 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3578 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3579 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3580 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3581 continue; 3582 r = adev->ip_blocks[i].version->funcs->resume(adev); 3583 if (r) { 3584 DRM_ERROR("resume of IP block <%s> failed %d\n", 3585 adev->ip_blocks[i].version->funcs->name, r); 3586 return r; 3587 } 3588 adev->ip_blocks[i].status.hw = true; 3589 } 3590 3591 return 0; 3592 } 3593 3594 /** 3595 * amdgpu_device_ip_resume - run resume for hardware IPs 3596 * 3597 * @adev: amdgpu_device pointer 3598 * 3599 * Main resume function for hardware IPs. The hardware IPs 3600 * are split into two resume functions because they are 3601 * also used in recovering from a GPU reset and some additional 3602 * steps need to be taken between them. In this case (S3/S4) they are 3603 * run sequentially. 3604 * Returns 0 on success, negative error code on failure.
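 *
 * Concretely, the sequence below is amdgpu_device_ip_resume_phase1() (COMMON, GMC,
 * IH, plus PSP under SR-IOV), then amdgpu_device_fw_loading(), then
 * amdgpu_device_ip_resume_phase2() for the remaining blocks.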
3605 */ 3606 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3607 { 3608 int r; 3609 3610 r = amdgpu_device_ip_resume_phase1(adev); 3611 if (r) 3612 return r; 3613 3614 r = amdgpu_device_fw_loading(adev); 3615 if (r) 3616 return r; 3617 3618 r = amdgpu_device_ip_resume_phase2(adev); 3619 3620 if (adev->mman.buffer_funcs_ring->sched.ready) 3621 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3622 3623 return r; 3624 } 3625 3626 /** 3627 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3628 * 3629 * @adev: amdgpu_device pointer 3630 * 3631 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3632 */ 3633 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3634 { 3635 if (amdgpu_sriov_vf(adev)) { 3636 if (adev->is_atom_fw) { 3637 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3638 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3639 } else { 3640 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3641 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3642 } 3643 3644 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3645 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3646 } 3647 } 3648 3649 /** 3650 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3651 * 3652 * @asic_type: AMD asic type 3653 * 3654 * Check if there is DC (new modesetting infrastructure) support for an asic. 3655 * returns true if DC has support, false if not. 3656 */ 3657 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3658 { 3659 switch (asic_type) { 3660 #ifdef CONFIG_DRM_AMDGPU_SI 3661 case CHIP_HAINAN: 3662 #endif 3663 case CHIP_TOPAZ: 3664 /* chips with no display hardware */ 3665 return false; 3666 #if defined(CONFIG_DRM_AMD_DC) 3667 case CHIP_TAHITI: 3668 case CHIP_PITCAIRN: 3669 case CHIP_VERDE: 3670 case CHIP_OLAND: 3671 /* 3672 * We have systems in the wild with these ASICs that require 3673 * LVDS and VGA support which is not supported with DC. 3674 * 3675 * Fall back to the non-DC driver here by default so as not to 3676 * cause regressions. 3677 */ 3678 #if defined(CONFIG_DRM_AMD_DC_SI) 3679 return amdgpu_dc > 0; 3680 #else 3681 return false; 3682 #endif 3683 case CHIP_BONAIRE: 3684 case CHIP_KAVERI: 3685 case CHIP_KABINI: 3686 case CHIP_MULLINS: 3687 /* 3688 * We have systems in the wild with these ASICs that require 3689 * VGA support which is not supported with DC. 3690 * 3691 * Fall back to the non-DC driver here by default so as not to 3692 * cause regressions.
3693 */ 3694 return amdgpu_dc > 0; 3695 default: 3696 return amdgpu_dc != 0; 3697 #else 3698 default: 3699 if (amdgpu_dc > 0) 3700 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3701 return false; 3702 #endif 3703 } 3704 } 3705 3706 /** 3707 * amdgpu_device_has_dc_support - check if dc is supported 3708 * 3709 * @adev: amdgpu_device pointer 3710 * 3711 * Returns true for supported, false for not supported 3712 */ 3713 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3714 { 3715 if (adev->enable_virtual_display || 3716 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3717 return false; 3718 3719 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3720 } 3721 3722 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3723 { 3724 struct amdgpu_device *adev = 3725 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3726 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3727 3728 /* It's a bug to not have a hive within this function */ 3729 if (WARN_ON(!hive)) 3730 return; 3731 3732 /* 3733 * Use task barrier to synchronize all xgmi reset works across the 3734 * hive. task_barrier_enter and task_barrier_exit will block 3735 * until all the threads running the xgmi reset works reach 3736 * those points. task_barrier_full will do both blocks. 3737 */ 3738 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3739 3740 task_barrier_enter(&hive->tb); 3741 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3742 3743 if (adev->asic_reset_res) 3744 goto fail; 3745 3746 task_barrier_exit(&hive->tb); 3747 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3748 3749 if (adev->asic_reset_res) 3750 goto fail; 3751 3752 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3753 } else { 3754 3755 task_barrier_full(&hive->tb); 3756 adev->asic_reset_res = amdgpu_asic_reset(adev); 3757 } 3758 3759 fail: 3760 if (adev->asic_reset_res) 3761 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3762 adev->asic_reset_res, adev_to_drm(adev)->unique); 3763 amdgpu_put_xgmi_hive(hive); 3764 } 3765 3766 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3767 { 3768 char *input = amdgpu_lockup_timeout; 3769 char *timeout_setting = NULL; 3770 int index = 0; 3771 long timeout; 3772 int ret = 0; 3773 3774 /* 3775 * By default timeout for non compute jobs is 10000 3776 * and 60000 for compute jobs. 3777 * In SR-IOV or passthrough mode, timeout for compute 3778 * jobs are 60000 by default. 3779 */ 3780 adev->gfx_timeout = msecs_to_jiffies(10000); 3781 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3782 if (amdgpu_sriov_vf(adev)) 3783 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3784 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3785 else 3786 adev->compute_timeout = msecs_to_jiffies(60000); 3787 3788 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3789 while ((timeout_setting = strsep(&input, ",")) && 3790 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3791 ret = kstrtol(timeout_setting, 0, &timeout); 3792 if (ret) 3793 return ret; 3794 3795 if (timeout == 0) { 3796 index++; 3797 continue; 3798 } else if (timeout < 0) { 3799 timeout = MAX_SCHEDULE_TIMEOUT; 3800 dev_warn(adev->dev, "lockup timeout disabled"); 3801 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3802 } else { 3803 timeout = msecs_to_jiffies(timeout); 3804 } 3805 3806 switch (index++) { 3807 case 0: 3808 adev->gfx_timeout = timeout; 3809 break; 3810 case 1: 3811 adev->compute_timeout = timeout; 3812 break; 3813 case 2: 3814 adev->sdma_timeout = timeout; 3815 break; 3816 case 3: 3817 adev->video_timeout = timeout; 3818 break; 3819 default: 3820 break; 3821 } 3822 } 3823 /* 3824 * There is only one value specified and 3825 * it should apply to all non-compute jobs. 3826 */ 3827 if (index == 1) { 3828 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3829 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3830 adev->compute_timeout = adev->gfx_timeout; 3831 } 3832 } 3833 3834 return ret; 3835 } 3836 3837 /** 3838 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3839 * 3840 * @adev: amdgpu_device pointer 3841 * 3842 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3843 */ 3844 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3845 { 3846 struct iommu_domain *domain; 3847 3848 domain = iommu_get_domain_for_dev(adev->dev); 3849 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3850 adev->ram_is_direct_mapped = true; 3851 } 3852 3853 static const struct attribute *amdgpu_dev_attributes[] = { 3854 &dev_attr_pcie_replay_count.attr, 3855 NULL 3856 }; 3857 3858 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3859 { 3860 if (amdgpu_mcbp == 1) 3861 adev->gfx.mcbp = true; 3862 else if (amdgpu_mcbp == 0) 3863 adev->gfx.mcbp = false; 3864 3865 if (amdgpu_sriov_vf(adev)) 3866 adev->gfx.mcbp = true; 3867 3868 if (adev->gfx.mcbp) 3869 DRM_INFO("MCBP is enabled\n"); 3870 } 3871 3872 /** 3873 * amdgpu_device_init - initialize the driver 3874 * 3875 * @adev: amdgpu_device pointer 3876 * @flags: driver flags 3877 * 3878 * Initializes the driver info and hw (all asics). 3879 * Returns 0 for success or an error on failure. 3880 * Called at driver startup. 
3881 */ 3882 int amdgpu_device_init(struct amdgpu_device *adev, 3883 uint32_t flags) 3884 { 3885 struct drm_device *ddev = adev_to_drm(adev); 3886 struct pci_dev *pdev = adev->pdev; 3887 int r, i; 3888 bool px = false; 3889 u32 max_MBps; 3890 int tmp; 3891 3892 adev->shutdown = false; 3893 adev->flags = flags; 3894 3895 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3896 adev->asic_type = amdgpu_force_asic_type; 3897 else 3898 adev->asic_type = flags & AMD_ASIC_MASK; 3899 3900 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3901 if (amdgpu_emu_mode == 1) 3902 adev->usec_timeout *= 10; 3903 adev->gmc.gart_size = 512 * 1024 * 1024; 3904 adev->accel_working = false; 3905 adev->num_rings = 0; 3906 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3907 adev->mman.buffer_funcs = NULL; 3908 adev->mman.buffer_funcs_ring = NULL; 3909 adev->vm_manager.vm_pte_funcs = NULL; 3910 adev->vm_manager.vm_pte_num_scheds = 0; 3911 adev->gmc.gmc_funcs = NULL; 3912 adev->harvest_ip_mask = 0x0; 3913 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3914 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3915 3916 adev->smc_rreg = &amdgpu_invalid_rreg; 3917 adev->smc_wreg = &amdgpu_invalid_wreg; 3918 adev->pcie_rreg = &amdgpu_invalid_rreg; 3919 adev->pcie_wreg = &amdgpu_invalid_wreg; 3920 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3921 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3922 adev->pciep_rreg = &amdgpu_invalid_rreg; 3923 adev->pciep_wreg = &amdgpu_invalid_wreg; 3924 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3925 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3926 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 3927 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 3928 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3929 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3930 adev->didt_rreg = &amdgpu_invalid_rreg; 3931 adev->didt_wreg = &amdgpu_invalid_wreg; 3932 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3933 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3934 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3935 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3936 3937 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3938 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3939 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3940 3941 /* mutex initialization are all done here so we 3942 * can recall function without having locking issues 3943 */ 3944 mutex_init(&adev->firmware.mutex); 3945 mutex_init(&adev->pm.mutex); 3946 mutex_init(&adev->gfx.gpu_clock_mutex); 3947 mutex_init(&adev->srbm_mutex); 3948 mutex_init(&adev->gfx.pipe_reserve_mutex); 3949 mutex_init(&adev->gfx.gfx_off_mutex); 3950 mutex_init(&adev->gfx.partition_mutex); 3951 mutex_init(&adev->grbm_idx_mutex); 3952 mutex_init(&adev->mn_lock); 3953 mutex_init(&adev->virt.vf_errors.lock); 3954 hash_init(adev->mn_hash); 3955 mutex_init(&adev->psp.mutex); 3956 mutex_init(&adev->notifier_lock); 3957 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3958 mutex_init(&adev->benchmark_mutex); 3959 3960 amdgpu_device_init_apu_flags(adev); 3961 3962 r = amdgpu_device_check_arguments(adev); 3963 if (r) 3964 return r; 3965 3966 spin_lock_init(&adev->mmio_idx_lock); 3967 spin_lock_init(&adev->smc_idx_lock); 3968 spin_lock_init(&adev->pcie_idx_lock); 3969 spin_lock_init(&adev->uvd_ctx_idx_lock); 3970 spin_lock_init(&adev->didt_idx_lock); 3971 spin_lock_init(&adev->gc_cac_idx_lock); 3972 spin_lock_init(&adev->se_cac_idx_lock); 
3973 spin_lock_init(&adev->audio_endpt_idx_lock); 3974 spin_lock_init(&adev->mm_stats.lock); 3975 3976 INIT_LIST_HEAD(&adev->shadow_list); 3977 mutex_init(&adev->shadow_list_lock); 3978 3979 INIT_LIST_HEAD(&adev->reset_list); 3980 3981 INIT_LIST_HEAD(&adev->ras_list); 3982 3983 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 3984 3985 INIT_DELAYED_WORK(&adev->delayed_init_work, 3986 amdgpu_device_delayed_init_work_handler); 3987 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3988 amdgpu_device_delay_enable_gfx_off); 3989 3990 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3991 3992 adev->gfx.gfx_off_req_count = 1; 3993 adev->gfx.gfx_off_residency = 0; 3994 adev->gfx.gfx_off_entrycount = 0; 3995 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3996 3997 atomic_set(&adev->throttling_logging_enabled, 1); 3998 /* 3999 * If throttling continues, logging will be performed every minute 4000 * to avoid log flooding. "-1" is subtracted since the thermal 4001 * throttling interrupt comes every second. Thus, the total logging 4002 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 4003 * for the throttling interrupt) = 60 seconds. 4004 */ 4005 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4006 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4007 4008 /* Registers mapping */ 4009 /* TODO: block userspace mapping of io register */ 4010 if (adev->asic_type >= CHIP_BONAIRE) { 4011 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4012 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4013 } else { 4014 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4015 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4016 } 4017 4018 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4019 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4020 4021 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4022 if (!adev->rmmio) 4023 return -ENOMEM; 4024 4025 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 4026 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 4027 4028 /* 4029 * The reset domain needs to be present early, before the XGMI hive is 4030 * discovered (if any) and initialized, so that the reset sem and in_gpu 4031 * reset flag can be used early on during init and before calling RREG32.
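 * The SINGLE_DEVICE domain created below is released again in amdgpu_device_fini_sw().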
4032 */ 4033 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4034 if (!adev->reset_domain) 4035 return -ENOMEM; 4036 4037 /* detect hw virtualization here */ 4038 amdgpu_detect_virtualization(adev); 4039 4040 amdgpu_device_get_pcie_info(adev); 4041 4042 r = amdgpu_device_get_job_timeout_settings(adev); 4043 if (r) { 4044 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4045 return r; 4046 } 4047 4048 /* early init functions */ 4049 r = amdgpu_device_ip_early_init(adev); 4050 if (r) 4051 return r; 4052 4053 amdgpu_device_set_mcbp(adev); 4054 4055 /* Get rid of things like offb */ 4056 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 4057 if (r) 4058 return r; 4059 4060 /* Enable TMZ based on IP_VERSION */ 4061 amdgpu_gmc_tmz_set(adev); 4062 4063 amdgpu_gmc_noretry_set(adev); 4064 /* Need to get xgmi info early to decide the reset behavior */ 4065 if (adev->gmc.xgmi.supported) { 4066 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4067 if (r) 4068 return r; 4069 } 4070 4071 /* enable PCIE atomic ops */ 4072 if (amdgpu_sriov_vf(adev)) { 4073 if (adev->virt.fw_reserve.p_pf2vf) 4074 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4075 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4076 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4077 /* APUs with gfx9 onwards don't rely on PCIe atomics; rather, an internal 4078 * path natively supports atomics, so set have_atomics_support to true. 4079 */ 4080 } else if ((adev->flags & AMD_IS_APU) && 4081 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4082 IP_VERSION(9, 0, 0))) { 4083 adev->have_atomics_support = true; 4084 } else { 4085 adev->have_atomics_support = 4086 !pci_enable_atomic_ops_to_root(adev->pdev, 4087 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4088 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4089 } 4090 4091 if (!adev->have_atomics_support) 4092 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4093 4094 /* doorbell bar mapping and doorbell index init */ 4095 amdgpu_doorbell_init(adev); 4096 4097 if (amdgpu_emu_mode == 1) { 4098 /* post the asic on emulation mode */ 4099 emu_soc_asic_init(adev); 4100 goto fence_driver_init; 4101 } 4102 4103 amdgpu_reset_init(adev); 4104 4105 /* detect if we are with an SRIOV vbios */ 4106 if (adev->bios) 4107 amdgpu_device_detect_sriov_bios(adev); 4108 4109 /* check if we need to reset the asic 4110 * E.g., driver was not cleanly unloaded previously, etc.
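 * For devices in an XGMI hive only the IP blocks needed by the SMU are brought up here and the reset itself is deferred to the hive; otherwise the ASIC is reset immediately (via PSP or the default method, depending on the MP1 version).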
4111 */ 4112 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4113 if (adev->gmc.xgmi.num_physical_nodes) { 4114 dev_info(adev->dev, "Pending hive reset.\n"); 4115 adev->gmc.xgmi.pending_reset = true; 4116 /* Only need to init necessary block for SMU to handle the reset */ 4117 for (i = 0; i < adev->num_ip_blocks; i++) { 4118 if (!adev->ip_blocks[i].status.valid) 4119 continue; 4120 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4121 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4122 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4123 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 4124 DRM_DEBUG("IP %s disabled for hw_init.\n", 4125 adev->ip_blocks[i].version->funcs->name); 4126 adev->ip_blocks[i].status.hw = true; 4127 } 4128 } 4129 } else { 4130 switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { 4131 case IP_VERSION(13, 0, 0): 4132 case IP_VERSION(13, 0, 7): 4133 case IP_VERSION(13, 0, 10): 4134 r = psp_gpu_reset(adev); 4135 break; 4136 default: 4137 tmp = amdgpu_reset_method; 4138 /* It should do a default reset when loading or reloading the driver, 4139 * regardless of the module parameter reset_method. 4140 */ 4141 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4142 r = amdgpu_asic_reset(adev); 4143 amdgpu_reset_method = tmp; 4144 break; 4145 } 4146 4147 if (r) { 4148 dev_err(adev->dev, "asic reset on init failed\n"); 4149 goto failed; 4150 } 4151 } 4152 } 4153 4154 /* Post card if necessary */ 4155 if (amdgpu_device_need_post(adev)) { 4156 if (!adev->bios) { 4157 dev_err(adev->dev, "no vBIOS found\n"); 4158 r = -EINVAL; 4159 goto failed; 4160 } 4161 DRM_INFO("GPU posting now...\n"); 4162 r = amdgpu_device_asic_init(adev); 4163 if (r) { 4164 dev_err(adev->dev, "gpu post error!\n"); 4165 goto failed; 4166 } 4167 } 4168 4169 if (adev->bios) { 4170 if (adev->is_atom_fw) { 4171 /* Initialize clocks */ 4172 r = amdgpu_atomfirmware_get_clock_info(adev); 4173 if (r) { 4174 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4175 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4176 goto failed; 4177 } 4178 } else { 4179 /* Initialize clocks */ 4180 r = amdgpu_atombios_get_clock_info(adev); 4181 if (r) { 4182 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4183 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4184 goto failed; 4185 } 4186 /* init i2c buses */ 4187 if (!amdgpu_device_has_dc_support(adev)) 4188 amdgpu_atombios_i2c_init(adev); 4189 } 4190 } 4191 4192 fence_driver_init: 4193 /* Fence driver */ 4194 r = amdgpu_fence_driver_sw_init(adev); 4195 if (r) { 4196 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4197 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4198 goto failed; 4199 } 4200 4201 /* init the mode config */ 4202 drm_mode_config_init(adev_to_drm(adev)); 4203 4204 r = amdgpu_device_ip_init(adev); 4205 if (r) { 4206 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4207 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4208 goto release_ras_con; 4209 } 4210 4211 amdgpu_fence_driver_hw_init(adev); 4212 4213 dev_info(adev->dev, 4214 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4215 adev->gfx.config.max_shader_engines, 4216 adev->gfx.config.max_sh_per_se, 4217 adev->gfx.config.max_cu_per_sh, 4218 adev->gfx.cu_info.number); 4219 4220 adev->accel_working = true; 4221 4222 amdgpu_vm_check_compute_bug(adev); 4223 4224 /* Initialize the buffer migration 
limit. */ 4225 if (amdgpu_moverate >= 0) 4226 max_MBps = amdgpu_moverate; 4227 else 4228 max_MBps = 8; /* Allow 8 MB/s. */ 4229 /* Get a log2 for easy divisions. */ 4230 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4231 4232 /* 4233 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4234 * Otherwise the mgpu fan boost feature will be skipped because the 4235 * gpu instance count would be too low. 4236 */ 4237 amdgpu_register_gpu_instance(adev); 4238 4239 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4240 * explicit gating rather than handling it automatically. 4241 */ 4242 if (!adev->gmc.xgmi.pending_reset) { 4243 r = amdgpu_device_ip_late_init(adev); 4244 if (r) { 4245 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4246 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4247 goto release_ras_con; 4248 } 4249 /* must succeed. */ 4250 amdgpu_ras_resume(adev); 4251 queue_delayed_work(system_wq, &adev->delayed_init_work, 4252 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4253 } 4254 4255 if (amdgpu_sriov_vf(adev)) { 4256 amdgpu_virt_release_full_gpu(adev, true); 4257 flush_delayed_work(&adev->delayed_init_work); 4258 } 4259 4260 /* 4261 * Register these sysfs interfaces after `late_init`, as some of the 4262 * operations performed in `late_init` might affect the creation 4263 * of the sysfs interfaces. 4264 */ 4265 r = amdgpu_atombios_sysfs_init(adev); 4266 if (r) 4267 drm_err(&adev->ddev, 4268 "registering atombios sysfs failed (%d).\n", r); 4269 4270 r = amdgpu_pm_sysfs_init(adev); 4271 if (r) 4272 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4273 4274 r = amdgpu_ucode_sysfs_init(adev); 4275 if (r) { 4276 adev->ucode_sysfs_en = false; 4277 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4278 } else 4279 adev->ucode_sysfs_en = true; 4280 4281 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4282 if (r) 4283 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4284 4285 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4286 if (r) 4287 dev_err(adev->dev, 4288 "Could not create amdgpu board attributes\n"); 4289 4290 amdgpu_fru_sysfs_init(adev); 4291 amdgpu_reg_state_sysfs_init(adev); 4292 4293 if (IS_ENABLED(CONFIG_PERF_EVENTS)) { 4294 r = amdgpu_pmu_init(adev); 4295 if (r) 4296 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4297 } 4298 /* Keep the stored PCI config space at hand for restore in case of a sudden PCI error */ 4299 if (amdgpu_device_cache_pci_state(adev->pdev)) 4300 pci_restore_state(pdev); 4301 4302 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4303 /* this will fail for cards that aren't VGA class devices, just 4304 * ignore it 4305 */ 4306 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4307 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4308 4309 px = amdgpu_device_supports_px(ddev); 4310 4311 if (px || (!dev_is_removable(&adev->pdev->dev) && 4312 apple_gmux_detect(NULL, NULL))) 4313 vga_switcheroo_register_client(adev->pdev, 4314 &amdgpu_switcheroo_ops, px); 4315 4316 if (px) 4317 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4318 4319 if (adev->gmc.xgmi.pending_reset) 4320 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4321 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4322 4323 amdgpu_device_check_iommu_direct_map(adev); 4324 4325 return 0; 4326 4327 release_ras_con: 4328 if (amdgpu_sriov_vf(adev)) 4329 amdgpu_virt_release_full_gpu(adev, true); 4330 4331 /* failed in exclusive mode due to timeout */
4332 if (amdgpu_sriov_vf(adev) && 4333 !amdgpu_sriov_runtime(adev) && 4334 amdgpu_virt_mmio_blocked(adev) && 4335 !amdgpu_virt_wait_reset(adev)) { 4336 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4337 /* Don't send request since VF is inactive. */ 4338 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4339 adev->virt.ops = NULL; 4340 r = -EAGAIN; 4341 } 4342 amdgpu_release_ras_context(adev); 4343 4344 failed: 4345 amdgpu_vf_error_trans_all(adev); 4346 4347 return r; 4348 } 4349 4350 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4351 { 4352 4353 /* Clear all CPU mappings pointing to this device */ 4354 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4355 4356 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4357 amdgpu_doorbell_fini(adev); 4358 4359 iounmap(adev->rmmio); 4360 adev->rmmio = NULL; 4361 if (adev->mman.aper_base_kaddr) 4362 iounmap(adev->mman.aper_base_kaddr); 4363 adev->mman.aper_base_kaddr = NULL; 4364 4365 /* Memory manager related */ 4366 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4367 arch_phys_wc_del(adev->gmc.vram_mtrr); 4368 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4369 } 4370 } 4371 4372 /** 4373 * amdgpu_device_fini_hw - tear down the driver 4374 * 4375 * @adev: amdgpu_device pointer 4376 * 4377 * Tear down the driver info (all asics). 4378 * Called at driver shutdown. 4379 */ 4380 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4381 { 4382 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4383 flush_delayed_work(&adev->delayed_init_work); 4384 adev->shutdown = true; 4385 4386 /* make sure the IB tests have finished before entering exclusive mode 4387 * to avoid preemption during an IB test 4388 */ 4389 if (amdgpu_sriov_vf(adev)) { 4390 amdgpu_virt_request_full_gpu(adev, false); 4391 amdgpu_virt_fini_data_exchange(adev); 4392 } 4393 4394 /* disable all interrupts */ 4395 amdgpu_irq_disable_all(adev); 4396 if (adev->mode_info.mode_config_initialized) { 4397 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4398 drm_helper_force_disable_all(adev_to_drm(adev)); 4399 else 4400 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4401 } 4402 amdgpu_fence_driver_hw_fini(adev); 4403 4404 if (adev->mman.initialized) 4405 drain_workqueue(adev->mman.bdev.wq); 4406 4407 if (adev->pm.sysfs_initialized) 4408 amdgpu_pm_sysfs_fini(adev); 4409 if (adev->ucode_sysfs_en) 4410 amdgpu_ucode_sysfs_fini(adev); 4411 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4412 amdgpu_fru_sysfs_fini(adev); 4413 4414 amdgpu_reg_state_sysfs_fini(adev); 4415 4416 /* RAS features must be disabled before hw fini */ 4417 amdgpu_ras_pre_fini(adev); 4418 4419 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4420 4421 amdgpu_device_ip_fini_early(adev); 4422 4423 amdgpu_irq_fini_hw(adev); 4424 4425 if (adev->mman.initialized) 4426 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4427 4428 amdgpu_gart_dummy_page_fini(adev); 4429 4430 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4431 amdgpu_device_unmap_mmio(adev); 4432 4433 } 4434 4435 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4436 { 4437 int idx; 4438 bool px; 4439 4440 amdgpu_fence_driver_sw_fini(adev); 4441 amdgpu_device_ip_fini(adev); 4442 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4443 adev->accel_working = false; 4444 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4445 4446 amdgpu_reset_fini(adev); 4447 4448 /* free i2c buses */ 4449 if (!amdgpu_device_has_dc_support(adev)) 4450 amdgpu_i2c_fini(adev); 4451 4452 if
(amdgpu_emu_mode != 1) 4453 amdgpu_atombios_fini(adev); 4454 4455 kfree(adev->bios); 4456 adev->bios = NULL; 4457 4458 kfree(adev->fru_info); 4459 adev->fru_info = NULL; 4460 4461 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4462 4463 if (px || (!dev_is_removable(&adev->pdev->dev) && 4464 apple_gmux_detect(NULL, NULL))) 4465 vga_switcheroo_unregister_client(adev->pdev); 4466 4467 if (px) 4468 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4469 4470 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4471 vga_client_unregister(adev->pdev); 4472 4473 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4474 4475 iounmap(adev->rmmio); 4476 adev->rmmio = NULL; 4477 amdgpu_doorbell_fini(adev); 4478 drm_dev_exit(idx); 4479 } 4480 4481 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4482 amdgpu_pmu_fini(adev); 4483 if (adev->mman.discovery_bin) 4484 amdgpu_discovery_fini(adev); 4485 4486 amdgpu_reset_put_reset_domain(adev->reset_domain); 4487 adev->reset_domain = NULL; 4488 4489 kfree(adev->pci_state); 4490 4491 } 4492 4493 /** 4494 * amdgpu_device_evict_resources - evict device resources 4495 * @adev: amdgpu device object 4496 * 4497 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4498 * of the vram memory type. Mainly used for evicting device resources 4499 * at suspend time. 4500 * 4501 */ 4502 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4503 { 4504 int ret; 4505 4506 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4507 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4508 return 0; 4509 4510 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4511 if (ret) 4512 DRM_WARN("evicting device resources failed\n"); 4513 return ret; 4514 } 4515 4516 /* 4517 * Suspend & resume. 4518 */ 4519 /** 4520 * amdgpu_device_prepare - prepare for device suspend 4521 * 4522 * @dev: drm dev pointer 4523 * 4524 * Prepare to put the hw in the suspend state (all asics). 4525 * Returns 0 for success or an error on failure. 4526 * Called at driver suspend. 4527 */ 4528 int amdgpu_device_prepare(struct drm_device *dev) 4529 { 4530 struct amdgpu_device *adev = drm_to_adev(dev); 4531 int i, r; 4532 4533 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4534 return 0; 4535 4536 /* Evict the majority of BOs before starting suspend sequence */ 4537 r = amdgpu_device_evict_resources(adev); 4538 if (r) 4539 return r; 4540 4541 for (i = 0; i < adev->num_ip_blocks; i++) { 4542 if (!adev->ip_blocks[i].status.valid) 4543 continue; 4544 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4545 continue; 4546 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev); 4547 if (r) 4548 return r; 4549 } 4550 4551 return 0; 4552 } 4553 4554 /** 4555 * amdgpu_device_suspend - initiate device suspend 4556 * 4557 * @dev: drm dev pointer 4558 * @fbcon : notify the fbdev of suspend 4559 * 4560 * Puts the hw in the suspend state (all asics). 4561 * Returns 0 for success or an error on failure. 4562 * Called at driver suspend. 
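 * The fbdev is suspended first (if requested), then resources are evicted, the IP blocks are suspended in two phases and the fence driver hardware state is torn down.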
4563 */ 4564 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4565 { 4566 struct amdgpu_device *adev = drm_to_adev(dev); 4567 int r = 0; 4568 4569 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4570 return 0; 4571 4572 adev->in_suspend = true; 4573 4574 if (amdgpu_sriov_vf(adev)) { 4575 amdgpu_virt_fini_data_exchange(adev); 4576 r = amdgpu_virt_request_full_gpu(adev, false); 4577 if (r) 4578 return r; 4579 } 4580 4581 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4582 DRM_WARN("smart shift update failed\n"); 4583 4584 if (fbcon) 4585 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4586 4587 cancel_delayed_work_sync(&adev->delayed_init_work); 4588 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4589 4590 amdgpu_ras_suspend(adev); 4591 4592 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4593 4594 amdgpu_device_ip_suspend_phase1(adev); 4595 4596 if (!adev->in_s0ix) 4597 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4598 4599 r = amdgpu_device_evict_resources(adev); 4600 if (r) 4601 return r; 4602 4603 amdgpu_fence_driver_hw_fini(adev); 4604 4605 amdgpu_device_ip_suspend_phase2(adev); 4606 4607 if (amdgpu_sriov_vf(adev)) 4608 amdgpu_virt_release_full_gpu(adev, false); 4609 4610 r = amdgpu_dpm_notify_rlc_state(adev, false); 4611 if (r) 4612 return r; 4613 4614 return 0; 4615 } 4616 4617 /** 4618 * amdgpu_device_resume - initiate device resume 4619 * 4620 * @dev: drm dev pointer 4621 * @fbcon : notify the fbdev of resume 4622 * 4623 * Bring the hw back to operating state (all asics). 4624 * Returns 0 for success or an error on failure. 4625 * Called at driver resume. 4626 */ 4627 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4628 { 4629 struct amdgpu_device *adev = drm_to_adev(dev); 4630 int r = 0; 4631 4632 if (amdgpu_sriov_vf(adev)) { 4633 r = amdgpu_virt_request_full_gpu(adev, true); 4634 if (r) 4635 return r; 4636 } 4637 4638 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4639 return 0; 4640 4641 if (adev->in_s0ix) 4642 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4643 4644 /* post card */ 4645 if (amdgpu_device_need_post(adev)) { 4646 r = amdgpu_device_asic_init(adev); 4647 if (r) 4648 dev_err(adev->dev, "amdgpu asic init failed\n"); 4649 } 4650 4651 r = amdgpu_device_ip_resume(adev); 4652 4653 if (r) { 4654 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4655 goto exit; 4656 } 4657 amdgpu_fence_driver_hw_init(adev); 4658 4659 if (!adev->in_s0ix) { 4660 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4661 if (r) 4662 goto exit; 4663 } 4664 4665 r = amdgpu_device_ip_late_init(adev); 4666 if (r) 4667 goto exit; 4668 4669 queue_delayed_work(system_wq, &adev->delayed_init_work, 4670 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4671 exit: 4672 if (amdgpu_sriov_vf(adev)) { 4673 amdgpu_virt_init_data_exchange(adev); 4674 amdgpu_virt_release_full_gpu(adev, true); 4675 } 4676 4677 if (r) 4678 return r; 4679 4680 /* Make sure IB tests flushed */ 4681 flush_delayed_work(&adev->delayed_init_work); 4682 4683 if (fbcon) 4684 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4685 4686 amdgpu_ras_resume(adev); 4687 4688 if (adev->mode_info.num_crtc) { 4689 /* 4690 * Most of the connector probing functions try to acquire runtime pm 4691 * refs to ensure that the GPU is powered on when connector polling is 4692 * performed. Since we're calling this from a runtime PM callback, 4693 * trying to acquire rpm refs will cause us to deadlock. 
4694 * 4695 * Since we're guaranteed to be holding the rpm lock, it's safe to 4696 * temporarily disable the rpm helpers so this doesn't deadlock us. 4697 */ 4698 #ifdef CONFIG_PM 4699 dev->dev->power.disable_depth++; 4700 #endif 4701 if (!adev->dc_enabled) 4702 drm_helper_hpd_irq_event(dev); 4703 else 4704 drm_kms_helper_hotplug_event(dev); 4705 #ifdef CONFIG_PM 4706 dev->dev->power.disable_depth--; 4707 #endif 4708 } 4709 adev->in_suspend = false; 4710 4711 if (adev->enable_mes) 4712 amdgpu_mes_self_test(adev); 4713 4714 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4715 DRM_WARN("smart shift update failed\n"); 4716 4717 return 0; 4718 } 4719 4720 /** 4721 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4722 * 4723 * @adev: amdgpu_device pointer 4724 * 4725 * The list of all the hardware IPs that make up the asic is walked and 4726 * the check_soft_reset callbacks are run. check_soft_reset determines 4727 * if the asic is still hung or not. 4728 * Returns true if any of the IPs are still in a hung state, false if not. 4729 */ 4730 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4731 { 4732 int i; 4733 bool asic_hang = false; 4734 4735 if (amdgpu_sriov_vf(adev)) 4736 return true; 4737 4738 if (amdgpu_asic_need_full_reset(adev)) 4739 return true; 4740 4741 for (i = 0; i < adev->num_ip_blocks; i++) { 4742 if (!adev->ip_blocks[i].status.valid) 4743 continue; 4744 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4745 adev->ip_blocks[i].status.hang = 4746 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4747 if (adev->ip_blocks[i].status.hang) { 4748 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4749 asic_hang = true; 4750 } 4751 } 4752 return asic_hang; 4753 } 4754 4755 /** 4756 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4757 * 4758 * @adev: amdgpu_device pointer 4759 * 4760 * The list of all the hardware IPs that make up the asic is walked and the 4761 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4762 * handles any IP specific hardware or software state changes that are 4763 * necessary for a soft reset to succeed. 4764 * Returns 0 on success, negative error code on failure. 4765 */ 4766 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4767 { 4768 int i, r = 0; 4769 4770 for (i = 0; i < adev->num_ip_blocks; i++) { 4771 if (!adev->ip_blocks[i].status.valid) 4772 continue; 4773 if (adev->ip_blocks[i].status.hang && 4774 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4775 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4776 if (r) 4777 return r; 4778 } 4779 } 4780 4781 return 0; 4782 } 4783 4784 /** 4785 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4786 * 4787 * @adev: amdgpu_device pointer 4788 * 4789 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4790 * reset is necessary to recover. 4791 * Returns true if a full asic reset is required, false if not. 
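 * GMC, SMC, ACP, DCE and PSP are the block types treated as requiring a full reset when hung.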
4792 */ 4793 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4794 { 4795 int i; 4796 4797 if (amdgpu_asic_need_full_reset(adev)) 4798 return true; 4799 4800 for (i = 0; i < adev->num_ip_blocks; i++) { 4801 if (!adev->ip_blocks[i].status.valid) 4802 continue; 4803 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4804 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4805 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4806 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4807 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4808 if (adev->ip_blocks[i].status.hang) { 4809 dev_info(adev->dev, "Some block need full reset!\n"); 4810 return true; 4811 } 4812 } 4813 } 4814 return false; 4815 } 4816 4817 /** 4818 * amdgpu_device_ip_soft_reset - do a soft reset 4819 * 4820 * @adev: amdgpu_device pointer 4821 * 4822 * The list of all the hardware IPs that make up the asic is walked and the 4823 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4824 * IP specific hardware or software state changes that are necessary to soft 4825 * reset the IP. 4826 * Returns 0 on success, negative error code on failure. 4827 */ 4828 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4829 { 4830 int i, r = 0; 4831 4832 for (i = 0; i < adev->num_ip_blocks; i++) { 4833 if (!adev->ip_blocks[i].status.valid) 4834 continue; 4835 if (adev->ip_blocks[i].status.hang && 4836 adev->ip_blocks[i].version->funcs->soft_reset) { 4837 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4838 if (r) 4839 return r; 4840 } 4841 } 4842 4843 return 0; 4844 } 4845 4846 /** 4847 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4848 * 4849 * @adev: amdgpu_device pointer 4850 * 4851 * The list of all the hardware IPs that make up the asic is walked and the 4852 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4853 * handles any IP specific hardware or software state changes that are 4854 * necessary after the IP has been soft reset. 4855 * Returns 0 on success, negative error code on failure. 4856 */ 4857 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4858 { 4859 int i, r = 0; 4860 4861 for (i = 0; i < adev->num_ip_blocks; i++) { 4862 if (!adev->ip_blocks[i].status.valid) 4863 continue; 4864 if (adev->ip_blocks[i].status.hang && 4865 adev->ip_blocks[i].version->funcs->post_soft_reset) 4866 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4867 if (r) 4868 return r; 4869 } 4870 4871 return 0; 4872 } 4873 4874 /** 4875 * amdgpu_device_recover_vram - Recover some VRAM contents 4876 * 4877 * @adev: amdgpu_device pointer 4878 * 4879 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4880 * restore things like GPUVM page tables after a GPU reset where 4881 * the contents of VRAM might be lost. 4882 * 4883 * Returns: 4884 * 0 on success, negative error code on failure. 
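 * Note that only BOs whose shadow is still resident in GTT (and whose parent is expected in VRAM) are restored.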
4885 */ 4886 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4887 { 4888 struct dma_fence *fence = NULL, *next = NULL; 4889 struct amdgpu_bo *shadow; 4890 struct amdgpu_bo_vm *vmbo; 4891 long r = 1, tmo; 4892 4893 if (amdgpu_sriov_runtime(adev)) 4894 tmo = msecs_to_jiffies(8000); 4895 else 4896 tmo = msecs_to_jiffies(100); 4897 4898 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4899 mutex_lock(&adev->shadow_list_lock); 4900 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4901 /* If vm is compute context or adev is APU, shadow will be NULL */ 4902 if (!vmbo->shadow) 4903 continue; 4904 shadow = vmbo->shadow; 4905 4906 /* No need to recover an evicted BO */ 4907 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4908 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4909 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4910 continue; 4911 4912 r = amdgpu_bo_restore_shadow(shadow, &next); 4913 if (r) 4914 break; 4915 4916 if (fence) { 4917 tmo = dma_fence_wait_timeout(fence, false, tmo); 4918 dma_fence_put(fence); 4919 fence = next; 4920 if (tmo == 0) { 4921 r = -ETIMEDOUT; 4922 break; 4923 } else if (tmo < 0) { 4924 r = tmo; 4925 break; 4926 } 4927 } else { 4928 fence = next; 4929 } 4930 } 4931 mutex_unlock(&adev->shadow_list_lock); 4932 4933 if (fence) 4934 tmo = dma_fence_wait_timeout(fence, false, tmo); 4935 dma_fence_put(fence); 4936 4937 if (r < 0 || tmo <= 0) { 4938 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4939 return -EIO; 4940 } 4941 4942 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4943 return 0; 4944 } 4945 4946 4947 /** 4948 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4949 * 4950 * @adev: amdgpu_device pointer 4951 * @from_hypervisor: request from hypervisor 4952 * 4953 * do VF FLR and reinitialize Asic 4954 * return 0 means succeeded otherwise failed 4955 */ 4956 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4957 bool from_hypervisor) 4958 { 4959 int r; 4960 struct amdgpu_hive_info *hive = NULL; 4961 int retry_limit = 0; 4962 4963 retry: 4964 amdgpu_amdkfd_pre_reset(adev); 4965 4966 if (from_hypervisor) 4967 r = amdgpu_virt_request_full_gpu(adev, true); 4968 else 4969 r = amdgpu_virt_reset_gpu(adev); 4970 if (r) 4971 return r; 4972 amdgpu_irq_gpu_reset_resume_helper(adev); 4973 4974 /* some sw clean up VF needs to do before recover */ 4975 amdgpu_virt_post_reset(adev); 4976 4977 /* Resume IP prior to SMC */ 4978 r = amdgpu_device_ip_reinit_early_sriov(adev); 4979 if (r) 4980 goto error; 4981 4982 amdgpu_virt_init_data_exchange(adev); 4983 4984 r = amdgpu_device_fw_loading(adev); 4985 if (r) 4986 return r; 4987 4988 /* now we are okay to resume SMC/CP/SDMA */ 4989 r = amdgpu_device_ip_reinit_late_sriov(adev); 4990 if (r) 4991 goto error; 4992 4993 hive = amdgpu_get_xgmi_hive(adev); 4994 /* Update PSP FW topology after reset */ 4995 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4996 r = amdgpu_xgmi_update_topology(hive, adev); 4997 4998 if (hive) 4999 amdgpu_put_xgmi_hive(hive); 5000 5001 if (!r) { 5002 r = amdgpu_ib_ring_tests(adev); 5003 5004 amdgpu_amdkfd_post_reset(adev); 5005 } 5006 5007 error: 5008 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 5009 amdgpu_inc_vram_lost(adev); 5010 r = amdgpu_device_recover_vram(adev); 5011 } 5012 amdgpu_virt_release_full_gpu(adev, true); 5013 5014 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 5015 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 5016 retry_limit++; 5017 goto 
retry; 5018 } else 5019 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 5020 } 5021 5022 return r; 5023 } 5024 5025 /** 5026 * amdgpu_device_has_job_running - check if there is any job in mirror list 5027 * 5028 * @adev: amdgpu_device pointer 5029 * 5030 * check if there is any job in mirror list 5031 */ 5032 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5033 { 5034 int i; 5035 struct drm_sched_job *job; 5036 5037 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5038 struct amdgpu_ring *ring = adev->rings[i]; 5039 5040 if (!ring || !drm_sched_wqueue_ready(&ring->sched)) 5041 continue; 5042 5043 spin_lock(&ring->sched.job_list_lock); 5044 job = list_first_entry_or_null(&ring->sched.pending_list, 5045 struct drm_sched_job, list); 5046 spin_unlock(&ring->sched.job_list_lock); 5047 if (job) 5048 return true; 5049 } 5050 return false; 5051 } 5052 5053 /** 5054 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5055 * 5056 * @adev: amdgpu_device pointer 5057 * 5058 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5059 * a hung GPU. 5060 */ 5061 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5062 { 5063 5064 if (amdgpu_gpu_recovery == 0) 5065 goto disabled; 5066 5067 /* Skip soft reset check in fatal error mode */ 5068 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5069 return true; 5070 5071 if (amdgpu_sriov_vf(adev)) 5072 return true; 5073 5074 if (amdgpu_gpu_recovery == -1) { 5075 switch (adev->asic_type) { 5076 #ifdef CONFIG_DRM_AMDGPU_SI 5077 case CHIP_VERDE: 5078 case CHIP_TAHITI: 5079 case CHIP_PITCAIRN: 5080 case CHIP_OLAND: 5081 case CHIP_HAINAN: 5082 #endif 5083 #ifdef CONFIG_DRM_AMDGPU_CIK 5084 case CHIP_KAVERI: 5085 case CHIP_KABINI: 5086 case CHIP_MULLINS: 5087 #endif 5088 case CHIP_CARRIZO: 5089 case CHIP_STONEY: 5090 case CHIP_CYAN_SKILLFISH: 5091 goto disabled; 5092 default: 5093 break; 5094 } 5095 } 5096 5097 return true; 5098 5099 disabled: 5100 dev_info(adev->dev, "GPU recovery disabled.\n"); 5101 return false; 5102 } 5103 5104 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5105 { 5106 u32 i; 5107 int ret = 0; 5108 5109 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5110 5111 dev_info(adev->dev, "GPU mode1 reset\n"); 5112 5113 /* disable BM */ 5114 pci_clear_master(adev->pdev); 5115 5116 amdgpu_device_cache_pci_state(adev->pdev); 5117 5118 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5119 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5120 ret = amdgpu_dpm_mode1_reset(adev); 5121 } else { 5122 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5123 ret = psp_gpu_reset(adev); 5124 } 5125 5126 if (ret) 5127 goto mode1_reset_failed; 5128 5129 amdgpu_device_load_pci_state(adev->pdev); 5130 ret = amdgpu_psp_wait_for_bootloader(adev); 5131 if (ret) 5132 goto mode1_reset_failed; 5133 5134 /* wait for asic to come out of reset */ 5135 for (i = 0; i < adev->usec_timeout; i++) { 5136 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5137 5138 if (memsize != 0xffffffff) 5139 break; 5140 udelay(1); 5141 } 5142 5143 if (i >= adev->usec_timeout) { 5144 ret = -ETIMEDOUT; 5145 goto mode1_reset_failed; 5146 } 5147 5148 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5149 5150 return 0; 5151 5152 mode1_reset_failed: 5153 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5154 return ret; 5155 } 5156 5157 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5158 struct amdgpu_reset_context *reset_context) 5159 { 5160 int i, r = 0; 5161 struct amdgpu_job *job = NULL; 5162 bool 
need_full_reset = 5163 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5164 5165 if (reset_context->reset_req_dev == adev) 5166 job = reset_context->job; 5167 5168 if (amdgpu_sriov_vf(adev)) { 5169 /* stop the data exchange thread */ 5170 amdgpu_virt_fini_data_exchange(adev); 5171 } 5172 5173 amdgpu_fence_driver_isr_toggle(adev, true); 5174 5175 /* block all schedulers and reset given job's ring */ 5176 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5177 struct amdgpu_ring *ring = adev->rings[i]; 5178 5179 if (!ring || !drm_sched_wqueue_ready(&ring->sched)) 5180 continue; 5181 5182 /* Clear job fence from fence drv to avoid force_completion 5183 * leave NULL and vm flush fence in fence drv 5184 */ 5185 amdgpu_fence_driver_clear_job_fences(ring); 5186 5187 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5188 amdgpu_fence_driver_force_completion(ring); 5189 } 5190 5191 amdgpu_fence_driver_isr_toggle(adev, false); 5192 5193 if (job && job->vm) 5194 drm_sched_increase_karma(&job->base); 5195 5196 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5197 /* If reset handler not implemented, continue; otherwise return */ 5198 if (r == -EOPNOTSUPP) 5199 r = 0; 5200 else 5201 return r; 5202 5203 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5204 if (!amdgpu_sriov_vf(adev)) { 5205 5206 if (!need_full_reset) 5207 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5208 5209 if (!need_full_reset && amdgpu_gpu_recovery && 5210 amdgpu_device_ip_check_soft_reset(adev)) { 5211 amdgpu_device_ip_pre_soft_reset(adev); 5212 r = amdgpu_device_ip_soft_reset(adev); 5213 amdgpu_device_ip_post_soft_reset(adev); 5214 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5215 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5216 need_full_reset = true; 5217 } 5218 } 5219 5220 if (need_full_reset) 5221 r = amdgpu_device_ip_suspend(adev); 5222 if (need_full_reset) 5223 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5224 else 5225 clear_bit(AMDGPU_NEED_FULL_RESET, 5226 &reset_context->flags); 5227 } 5228 5229 return r; 5230 } 5231 5232 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 5233 { 5234 int i; 5235 5236 lockdep_assert_held(&adev->reset_domain->sem); 5237 5238 for (i = 0; i < adev->reset_info.num_regs; i++) { 5239 adev->reset_info.reset_dump_reg_value[i] = 5240 RREG32(adev->reset_info.reset_dump_reg_list[i]); 5241 5242 trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i], 5243 adev->reset_info.reset_dump_reg_value[i]); 5244 } 5245 5246 return 0; 5247 } 5248 5249 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5250 struct amdgpu_reset_context *reset_context) 5251 { 5252 struct amdgpu_device *tmp_adev = NULL; 5253 bool need_full_reset, skip_hw_reset, vram_lost = false; 5254 int r = 0; 5255 bool gpu_reset_for_dev_remove = 0; 5256 5257 /* Try reset handler method first */ 5258 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5259 reset_list); 5260 amdgpu_reset_reg_dumps(tmp_adev); 5261 5262 reset_context->reset_device_list = device_list_handle; 5263 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5264 /* If reset handler not implemented, continue; otherwise return */ 5265 if (r == -EOPNOTSUPP) 5266 r = 0; 5267 else 5268 return r; 5269 5270 /* Reset handler not implemented, use the default method */ 5271 need_full_reset = 5272 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5273 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, 
&reset_context->flags); 5274 5275 gpu_reset_for_dev_remove = 5276 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5277 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5278 5279 /* 5280 * ASIC reset has to be done on all XGMI hive nodes ASAP 5281 * to allow proper links negotiation in FW (within 1 sec) 5282 */ 5283 if (!skip_hw_reset && need_full_reset) { 5284 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5285 /* For XGMI run all resets in parallel to speed up the process */ 5286 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5287 tmp_adev->gmc.xgmi.pending_reset = false; 5288 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5289 r = -EALREADY; 5290 } else 5291 r = amdgpu_asic_reset(tmp_adev); 5292 5293 if (r) { 5294 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5295 r, adev_to_drm(tmp_adev)->unique); 5296 goto out; 5297 } 5298 } 5299 5300 /* For XGMI wait for all resets to complete before proceed */ 5301 if (!r) { 5302 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5303 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5304 flush_work(&tmp_adev->xgmi_reset_work); 5305 r = tmp_adev->asic_reset_res; 5306 if (r) 5307 break; 5308 } 5309 } 5310 } 5311 } 5312 5313 if (!r && amdgpu_ras_intr_triggered()) { 5314 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5315 amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB); 5316 } 5317 5318 amdgpu_ras_intr_cleared(); 5319 } 5320 5321 /* Since the mode1 reset affects base ip blocks, the 5322 * phase1 ip blocks need to be resumed. Otherwise there 5323 * will be a BIOS signature error and the psp bootloader 5324 * can't load kdb on the next amdgpu install. 5325 */ 5326 if (gpu_reset_for_dev_remove) { 5327 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 5328 amdgpu_device_ip_resume_phase1(tmp_adev); 5329 5330 goto end; 5331 } 5332 5333 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5334 if (need_full_reset) { 5335 /* post card */ 5336 r = amdgpu_device_asic_init(tmp_adev); 5337 if (r) { 5338 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5339 } else { 5340 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5341 5342 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5343 if (r) 5344 goto out; 5345 5346 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5347 5348 amdgpu_coredump(tmp_adev, vram_lost, reset_context); 5349 5350 if (vram_lost) { 5351 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5352 amdgpu_inc_vram_lost(tmp_adev); 5353 } 5354 5355 r = amdgpu_device_fw_loading(tmp_adev); 5356 if (r) 5357 return r; 5358 5359 r = amdgpu_xcp_restore_partition_mode( 5360 tmp_adev->xcp_mgr); 5361 if (r) 5362 goto out; 5363 5364 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5365 if (r) 5366 goto out; 5367 5368 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5369 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5370 5371 if (vram_lost) 5372 amdgpu_device_fill_reset_magic(tmp_adev); 5373 5374 /* 5375 * Add this ASIC as tracked as reset was already 5376 * complete successfully. 
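 * If no hive context was passed in and the device is part of an XGMI setup, it is also re-added to its hive right below.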
5377 */ 5378 amdgpu_register_gpu_instance(tmp_adev); 5379 5380 if (!reset_context->hive && 5381 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5382 amdgpu_xgmi_add_device(tmp_adev); 5383 5384 r = amdgpu_device_ip_late_init(tmp_adev); 5385 if (r) 5386 goto out; 5387 5388 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5389 5390 /* 5391 * The GPU enters a bad state once the number of faulty pages 5392 * detected by ECC reaches the threshold, and RAS recovery is 5393 * scheduled next. So add one check here to break recovery 5394 * if the bad page threshold has indeed been exceeded, 5395 * and remind the user to retire this GPU or to set 5396 * a bigger bad_page_threshold value to fix this 5397 * the next time the driver 5398 * is probed. 5399 */ 5400 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5401 /* must succeed. */ 5402 amdgpu_ras_resume(tmp_adev); 5403 } else { 5404 r = -EINVAL; 5405 goto out; 5406 } 5407 5408 /* Update PSP FW topology after reset */ 5409 if (reset_context->hive && 5410 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5411 r = amdgpu_xgmi_update_topology( 5412 reset_context->hive, tmp_adev); 5413 } 5414 } 5415 5416 out: 5417 if (!r) { 5418 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5419 r = amdgpu_ib_ring_tests(tmp_adev); 5420 if (r) { 5421 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5422 need_full_reset = true; 5423 r = -EAGAIN; 5424 goto end; 5425 } 5426 } 5427 5428 if (!r) 5429 r = amdgpu_device_recover_vram(tmp_adev); 5430 else 5431 tmp_adev->asic_reset_res = r; 5432 } 5433 5434 end: 5435 if (need_full_reset) 5436 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5437 else 5438 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5439 return r; 5440 } 5441 5442 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5443 { 5444 5445 switch (amdgpu_asic_reset_method(adev)) { 5446 case AMD_RESET_METHOD_MODE1: 5447 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5448 break; 5449 case AMD_RESET_METHOD_MODE2: 5450 adev->mp1_state = PP_MP1_STATE_RESET; 5451 break; 5452 default: 5453 adev->mp1_state = PP_MP1_STATE_NONE; 5454 break; 5455 } 5456 } 5457 5458 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5459 { 5460 amdgpu_vf_error_trans_all(adev); 5461 adev->mp1_state = PP_MP1_STATE_NONE; 5462 } 5463 5464 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5465 { 5466 struct pci_dev *p = NULL; 5467 5468 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5469 adev->pdev->bus->number, 1); 5470 if (p) { 5471 pm_runtime_enable(&(p->dev)); 5472 pm_runtime_resume(&(p->dev)); 5473 } 5474 5475 pci_dev_put(p); 5476 } 5477 5478 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5479 { 5480 enum amd_reset_method reset_method; 5481 struct pci_dev *p = NULL; 5482 u64 expires; 5483 5484 /* 5485 * For now, only BACO and mode1 reset are confirmed to 5486 * suffer from the audio issue if not properly suspended. 5487 */ 5488 reset_method = amdgpu_asic_reset_method(adev); 5489 if ((reset_method != AMD_RESET_METHOD_BACO) && 5490 (reset_method != AMD_RESET_METHOD_MODE1)) 5491 return -EINVAL; 5492 5493 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5494 adev->pdev->bus->number, 1); 5495 if (!p) 5496 return -ENODEV; 5497 5498 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5499 if (!expires) 5500 /* 5501 * If we cannot get the audio device autosuspend delay, 5502 * a fixed 4S interval will be used. Considering 3S is 5503 * the audio controller's default autosuspend delay setting.
5504 * 4S used here is guaranteed to cover that. 5505 */ 5506 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5507 5508 while (!pm_runtime_status_suspended(&(p->dev))) { 5509 if (!pm_runtime_suspend(&(p->dev))) 5510 break; 5511 5512 if (expires < ktime_get_mono_fast_ns()) { 5513 dev_warn(adev->dev, "failed to suspend display audio\n"); 5514 pci_dev_put(p); 5515 /* TODO: abort the succeeding gpu reset? */ 5516 return -ETIMEDOUT; 5517 } 5518 } 5519 5520 pm_runtime_disable(&(p->dev)); 5521 5522 pci_dev_put(p); 5523 return 0; 5524 } 5525 5526 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5527 { 5528 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5529 5530 #if defined(CONFIG_DEBUG_FS) 5531 if (!amdgpu_sriov_vf(adev)) 5532 cancel_work(&adev->reset_work); 5533 #endif 5534 5535 if (adev->kfd.dev) 5536 cancel_work(&adev->kfd.reset_work); 5537 5538 if (amdgpu_sriov_vf(adev)) 5539 cancel_work(&adev->virt.flr_work); 5540 5541 if (con && adev->ras_enabled) 5542 cancel_work(&con->recovery_work); 5543 5544 } 5545 5546 /** 5547 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5548 * 5549 * @adev: amdgpu_device pointer 5550 * @job: which job trigger hang 5551 * @reset_context: amdgpu reset context pointer 5552 * 5553 * Attempt to reset the GPU if it has hung (all asics). 5554 * Attempt to do soft-reset or full-reset and reinitialize Asic 5555 * Returns 0 for success or an error on failure. 5556 */ 5557 5558 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5559 struct amdgpu_job *job, 5560 struct amdgpu_reset_context *reset_context) 5561 { 5562 struct list_head device_list, *device_list_handle = NULL; 5563 bool job_signaled = false; 5564 struct amdgpu_hive_info *hive = NULL; 5565 struct amdgpu_device *tmp_adev = NULL; 5566 int i, r = 0; 5567 bool need_emergency_restart = false; 5568 bool audio_suspended = false; 5569 bool gpu_reset_for_dev_remove = false; 5570 5571 gpu_reset_for_dev_remove = 5572 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5573 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5574 5575 /* 5576 * Special case: RAS triggered and full reset isn't supported 5577 */ 5578 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5579 5580 /* 5581 * Flush RAM to disk so that after reboot 5582 * the user can read log and see why the system rebooted. 5583 */ 5584 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5585 amdgpu_ras_get_context(adev)->reboot) { 5586 DRM_WARN("Emergency reboot."); 5587 5588 ksys_sync_helper(); 5589 emergency_restart(); 5590 } 5591 5592 dev_info(adev->dev, "GPU %s begin!\n", 5593 need_emergency_restart ? "jobs stop":"reset"); 5594 5595 if (!amdgpu_sriov_vf(adev)) 5596 hive = amdgpu_get_xgmi_hive(adev); 5597 if (hive) 5598 mutex_lock(&hive->hive_lock); 5599 5600 reset_context->job = job; 5601 reset_context->hive = hive; 5602 /* 5603 * Build list of devices to reset. 5604 * In case we are in XGMI hive mode, resort the device list 5605 * to put adev in the 1st position. 
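 * For SR-IOV only adev itself is added, since the host driver handles the XGMI hive reset.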
5606 */ 5607 INIT_LIST_HEAD(&device_list); 5608 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5609 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5610 list_add_tail(&tmp_adev->reset_list, &device_list); 5611 if (gpu_reset_for_dev_remove && adev->shutdown) 5612 tmp_adev->shutdown = true; 5613 } 5614 if (!list_is_first(&adev->reset_list, &device_list)) 5615 list_rotate_to_front(&adev->reset_list, &device_list); 5616 device_list_handle = &device_list; 5617 } else { 5618 list_add_tail(&adev->reset_list, &device_list); 5619 device_list_handle = &device_list; 5620 } 5621 5622 /* We need to lock reset domain only once both for XGMI and single device */ 5623 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5624 reset_list); 5625 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5626 5627 /* block all schedulers and reset given job's ring */ 5628 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5629 5630 amdgpu_device_set_mp1_state(tmp_adev); 5631 5632 /* 5633 * Try to put the audio codec into suspend state 5634 * before gpu reset started. 5635 * 5636 * Because the power domain of the graphics device is 5637 * shared with the AZ power domain, without this we 5638 * may change the audio hardware from behind 5639 * the audio driver's back. That will trigger 5640 * some audio codec errors. 5641 */ 5642 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5643 audio_suspended = true; 5644 5645 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5646 5647 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5648 5649 if (!amdgpu_sriov_vf(tmp_adev)) 5650 amdgpu_amdkfd_pre_reset(tmp_adev); 5651 5652 /* 5653 * Mark these ASICs to be reset as untracked first, 5654 * and add them back after the reset completes 5655 */ 5656 amdgpu_unregister_gpu_instance(tmp_adev); 5657 5658 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5659 5660 /* disable ras on ALL IPs */ 5661 if (!need_emergency_restart && 5662 amdgpu_device_ip_need_full_reset(tmp_adev)) 5663 amdgpu_ras_suspend(tmp_adev); 5664 5665 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5666 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5667 5668 if (!ring || !drm_sched_wqueue_ready(&ring->sched)) 5669 continue; 5670 5671 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5672 5673 if (need_emergency_restart) 5674 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5675 } 5676 atomic_inc(&tmp_adev->gpu_reset_counter); 5677 } 5678 5679 if (need_emergency_restart) 5680 goto skip_sched_resume; 5681 5682 /* 5683 * Must check guilty signal here since after this point all old 5684 * HW fences are force signaled. 5685 * 5686 * job->base holds a reference to parent fence 5687 */ 5688 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5689 job_signaled = true; 5690 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5691 goto skip_hw_reset; 5692 } 5693 5694 retry: /* Rest of adevs pre asic reset from XGMI hive.
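 * On -EAGAIN from amdgpu_do_asic_reset() we jump back here and redo the pre-reset steps for every device.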
*/ 5695 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5696 if (gpu_reset_for_dev_remove) { 5697 /* Workaround for ASICs that need to disable SMC first */ 5698 amdgpu_device_smu_fini_early(tmp_adev); 5699 } 5700 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5701 /* TODO: Should we stop? */ 5702 if (r) { 5703 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5704 r, adev_to_drm(tmp_adev)->unique); 5705 tmp_adev->asic_reset_res = r; 5706 } 5707 5708 /* 5709 * Drop all pending non scheduler resets. Scheduler resets 5710 * were already dropped during drm_sched_stop 5711 */ 5712 amdgpu_device_stop_pending_resets(tmp_adev); 5713 } 5714 5715 /* Actual ASIC resets if needed. */ 5716 /* Host driver will handle XGMI hive reset for SRIOV */ 5717 if (amdgpu_sriov_vf(adev)) { 5718 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5719 if (r) 5720 adev->asic_reset_res = r; 5721 5722 /* Aldebaran and gfx_11_0_3 support RAS in SR-IOV, so we need to resume RAS during reset */ 5723 if (amdgpu_ip_version(adev, GC_HWIP, 0) == 5724 IP_VERSION(9, 4, 2) || 5725 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5726 amdgpu_ras_resume(adev); 5727 } else { 5728 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5729 if (r && r == -EAGAIN) 5730 goto retry; 5731 5732 if (!r && gpu_reset_for_dev_remove) 5733 goto recover_end; 5734 } 5735 5736 skip_hw_reset: 5737 5738 /* Post ASIC reset for all devs. */ 5739 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5740 5741 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5742 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5743 5744 if (!ring || !drm_sched_wqueue_ready(&ring->sched)) 5745 continue; 5746 5747 drm_sched_start(&ring->sched, true); 5748 } 5749 5750 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5751 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5752 5753 if (tmp_adev->asic_reset_res) 5754 r = tmp_adev->asic_reset_res; 5755 5756 tmp_adev->asic_reset_res = 0; 5757 5758 if (r) { 5759 /* bad news, how do we tell it to userspace?
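 * For now the failure is only logged and recorded as a VF error event.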
			 */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
				DRM_WARN("smart shift update failed\n");
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/* kfd_post_reset will do nothing if the kfd device is not initialized,
		 * so bring up kfd here if it was not initialized before.
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);

		amdgpu_device_unset_mp1_state(tmp_adev);

		amdgpu_ras_set_error_query_ready(tmp_adev, true);
	}

recover_end:
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);

	atomic_set(&adev->reset_domain->reset_res, r);
	return r;
}

/**
 * amdgpu_device_partner_bandwidth - find the bandwidth of the appropriate partner
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * first physical partner to an AMD dGPU.
 * This will exclude any virtual switches and links.
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	while ((parent = pci_upstream_bridge(parent))) {
		/* skip upstream/downstream switches internal to the dGPU */
		if (parent->vendor == PCI_VENDOR_ID_ATI)
			continue;
		*speed = pcie_get_speed_cap(parent);
		*width = pcie_get_width_cap(parent);
		break;
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIe capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIe config space may not be available.
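 *
 * Note that the amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap overrides,
 * when set, take precedence over anything detected here (see the checks
 * at the top of the function).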
5847 */ 5848 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5849 { 5850 struct pci_dev *pdev; 5851 enum pci_bus_speed speed_cap, platform_speed_cap; 5852 enum pcie_link_width platform_link_width; 5853 5854 if (amdgpu_pcie_gen_cap) 5855 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5856 5857 if (amdgpu_pcie_lane_cap) 5858 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5859 5860 /* covers APUs as well */ 5861 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5862 if (adev->pm.pcie_gen_mask == 0) 5863 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5864 if (adev->pm.pcie_mlw_mask == 0) 5865 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5866 return; 5867 } 5868 5869 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5870 return; 5871 5872 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 5873 &platform_link_width); 5874 5875 if (adev->pm.pcie_gen_mask == 0) { 5876 /* asic caps */ 5877 pdev = adev->pdev; 5878 speed_cap = pcie_get_speed_cap(pdev); 5879 if (speed_cap == PCI_SPEED_UNKNOWN) { 5880 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5881 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5882 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5883 } else { 5884 if (speed_cap == PCIE_SPEED_32_0GT) 5885 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5886 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5887 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5888 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5889 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5890 else if (speed_cap == PCIE_SPEED_16_0GT) 5891 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5892 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5893 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5894 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5895 else if (speed_cap == PCIE_SPEED_8_0GT) 5896 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5897 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5898 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5899 else if (speed_cap == PCIE_SPEED_5_0GT) 5900 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5901 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5902 else 5903 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5904 } 5905 /* platform caps */ 5906 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5907 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5908 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5909 } else { 5910 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5911 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5912 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5913 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5914 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5915 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5916 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5917 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5918 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5919 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5920 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5921 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5922 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5923 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5924 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5925 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5926 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5927 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5928 else 5929 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5930 5931 } 5932 } 5933 if (adev->pm.pcie_mlw_mask == 0) { 5934 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 
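			/* No link width information from the platform; fall back
			 * to the driver's default link width mask. */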
5935 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5936 } else { 5937 switch (platform_link_width) { 5938 case PCIE_LNK_X32: 5939 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5940 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5941 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5942 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5943 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5944 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5945 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5946 break; 5947 case PCIE_LNK_X16: 5948 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5949 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5950 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5951 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5952 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5953 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5954 break; 5955 case PCIE_LNK_X12: 5956 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5957 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5958 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5959 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5960 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5961 break; 5962 case PCIE_LNK_X8: 5963 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5964 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5965 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5966 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5967 break; 5968 case PCIE_LNK_X4: 5969 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5970 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5971 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5972 break; 5973 case PCIE_LNK_X2: 5974 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5975 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5976 break; 5977 case PCIE_LNK_X1: 5978 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5979 break; 5980 default: 5981 break; 5982 } 5983 } 5984 } 5985 } 5986 5987 /** 5988 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5989 * 5990 * @adev: amdgpu_device pointer 5991 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5992 * 5993 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5994 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5995 * @peer_adev. 5996 */ 5997 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5998 struct amdgpu_device *peer_adev) 5999 { 6000 #ifdef CONFIG_HSA_AMD_P2P 6001 uint64_t address_mask = peer_adev->dev->dma_mask ? 
6002 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6003 resource_size_t aper_limit = 6004 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6005 bool p2p_access = 6006 !adev->gmc.xgmi.connected_to_cpu && 6007 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6008 6009 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 6010 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 6011 !(adev->gmc.aper_base & address_mask || 6012 aper_limit & address_mask)); 6013 #else 6014 return false; 6015 #endif 6016 } 6017 6018 int amdgpu_device_baco_enter(struct drm_device *dev) 6019 { 6020 struct amdgpu_device *adev = drm_to_adev(dev); 6021 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6022 6023 if (!amdgpu_device_supports_baco(dev)) 6024 return -ENOTSUPP; 6025 6026 if (ras && adev->ras_enabled && 6027 adev->nbio.funcs->enable_doorbell_interrupt) 6028 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6029 6030 return amdgpu_dpm_baco_enter(adev); 6031 } 6032 6033 int amdgpu_device_baco_exit(struct drm_device *dev) 6034 { 6035 struct amdgpu_device *adev = drm_to_adev(dev); 6036 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6037 int ret = 0; 6038 6039 if (!amdgpu_device_supports_baco(dev)) 6040 return -ENOTSUPP; 6041 6042 ret = amdgpu_dpm_baco_exit(adev); 6043 if (ret) 6044 return ret; 6045 6046 if (ras && adev->ras_enabled && 6047 adev->nbio.funcs->enable_doorbell_interrupt) 6048 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6049 6050 if (amdgpu_passthrough(adev) && 6051 adev->nbio.funcs->clear_doorbell_interrupt) 6052 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6053 6054 return 0; 6055 } 6056 6057 /** 6058 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6059 * @pdev: PCI device struct 6060 * @state: PCI channel state 6061 * 6062 * Description: Called when a PCI error is detected. 6063 * 6064 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
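 * (For pci_channel_io_frozen the reset domain is locked and all schedulers
 * are stopped before PCI_ERS_RESULT_NEED_RESET is returned.)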
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to the GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for a regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !drm_sched_wqueue_ready(&ring->sched))
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{

	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever is needed for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, so there is no need to reset the slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when the PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the PCI error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
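 *
 * Return: PCI_ERS_RESULT_RECOVERED if the ASIC came back and could be
 * reset successfully, PCI_ERS_RESULT_DISCONNECT otherwise.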
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for the ASIC to come out of reset */
	msleep(500);

	/* Restore the PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm the ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
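 *
 * Restarts the schedulers that were stopped in amdgpu_pci_error_detected()
 * and releases the reset domain again.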
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !drm_sched_wqueue_ready(&ring->sched))
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring the hardware to some kind of halt state so that no one can touch it
 * any more. It helps to maintain error context when an error occurs.
 * Compared to a simple hang, the system will stay stable at least for SSH
 * access, so it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 *    clears all CPU mappings to the device and disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5.
amdgpu_device_unmap_mmio() clears all MMIO mappings 6334 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 6335 * flush any in flight DMA operations 6336 */ 6337 void amdgpu_device_halt(struct amdgpu_device *adev) 6338 { 6339 struct pci_dev *pdev = adev->pdev; 6340 struct drm_device *ddev = adev_to_drm(adev); 6341 6342 amdgpu_xcp_dev_unplug(adev); 6343 drm_dev_unplug(ddev); 6344 6345 amdgpu_irq_disable_all(adev); 6346 6347 amdgpu_fence_driver_hw_fini(adev); 6348 6349 adev->no_hw_access = true; 6350 6351 amdgpu_device_unmap_mmio(adev); 6352 6353 pci_disable_device(pdev); 6354 pci_wait_for_pending_transaction(pdev); 6355 } 6356 6357 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 6358 u32 reg) 6359 { 6360 unsigned long flags, address, data; 6361 u32 r; 6362 6363 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6364 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6365 6366 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6367 WREG32(address, reg * 4); 6368 (void)RREG32(address); 6369 r = RREG32(data); 6370 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6371 return r; 6372 } 6373 6374 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 6375 u32 reg, u32 v) 6376 { 6377 unsigned long flags, address, data; 6378 6379 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6380 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6381 6382 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6383 WREG32(address, reg * 4); 6384 (void)RREG32(address); 6385 WREG32(data, v); 6386 (void)RREG32(data); 6387 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6388 } 6389 6390 /** 6391 * amdgpu_device_switch_gang - switch to a new gang 6392 * @adev: amdgpu_device pointer 6393 * @gang: the gang to switch to 6394 * 6395 * Try to switch to a new gang. 6396 * Returns: NULL if we switched to the new gang or a reference to the current 6397 * gang leader. 
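 * If the fence of the currently active gang has not signaled yet, no switch
 * is performed and that fence is returned instead.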
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				  inst, reg_name, (uint32_t)expected_value,
				  (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}