1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/pci-p2pdma.h> 36 #include <linux/apple-gmux.h> 37 38 #include <drm/drm_aperture.h> 39 #include <drm/drm_atomic_helper.h> 40 #include <drm/drm_crtc_helper.h> 41 #include <drm/drm_fb_helper.h> 42 #include <drm/drm_probe_helper.h> 43 #include <drm/amdgpu_drm.h> 44 #include <linux/device.h> 45 #include <linux/vgaarb.h> 46 #include <linux/vga_switcheroo.h> 47 #include <linux/efi.h> 48 #include "amdgpu.h" 49 #include "amdgpu_trace.h" 50 #include "amdgpu_i2c.h" 51 #include "atom.h" 52 #include "amdgpu_atombios.h" 53 #include "amdgpu_atomfirmware.h" 54 #include "amd_pcie.h" 55 #ifdef CONFIG_DRM_AMDGPU_SI 56 #include "si.h" 57 #endif 58 #ifdef CONFIG_DRM_AMDGPU_CIK 59 #include "cik.h" 60 #endif 61 #include "vi.h" 62 #include "soc15.h" 63 #include "nv.h" 64 #include "bif/bif_4_1_d.h" 65 #include <linux/firmware.h> 66 #include "amdgpu_vf_error.h" 67 68 #include "amdgpu_amdkfd.h" 69 #include "amdgpu_pm.h" 70 71 #include "amdgpu_xgmi.h" 72 #include "amdgpu_ras.h" 73 #include "amdgpu_pmu.h" 74 #include "amdgpu_fru_eeprom.h" 75 #include "amdgpu_reset.h" 76 #include "amdgpu_virt.h" 77 78 #include <linux/suspend.h> 79 #include <drm/task_barrier.h> 80 #include <linux/pm_runtime.h> 81 82 #include <drm/drm_drv.h> 83 84 #if IS_ENABLED(CONFIG_X86) 85 #include <asm/intel-family.h> 86 #endif 87 88 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 89 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 95 96 #define AMDGPU_RESUME_MS 2000 97 #define AMDGPU_MAX_RETRY_LIMIT 2 98 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 99 100 static const struct drm_driver amdgpu_kms_driver; 101 102 const char *amdgpu_asic_name[] = { 103 "TAHITI", 104 "PITCAIRN", 105 "VERDE", 106 "OLAND", 107 "HAINAN", 108 "BONAIRE", 109 "KAVERI", 110 "KABINI", 111 
"HAWAII", 112 "MULLINS", 113 "TOPAZ", 114 "TONGA", 115 "FIJI", 116 "CARRIZO", 117 "STONEY", 118 "POLARIS10", 119 "POLARIS11", 120 "POLARIS12", 121 "VEGAM", 122 "VEGA10", 123 "VEGA12", 124 "VEGA20", 125 "RAVEN", 126 "ARCTURUS", 127 "RENOIR", 128 "ALDEBARAN", 129 "NAVI10", 130 "CYAN_SKILLFISH", 131 "NAVI14", 132 "NAVI12", 133 "SIENNA_CICHLID", 134 "NAVY_FLOUNDER", 135 "VANGOGH", 136 "DIMGREY_CAVEFISH", 137 "BEIGE_GOBY", 138 "YELLOW_CARP", 139 "IP DISCOVERY", 140 "LAST", 141 }; 142 143 /** 144 * DOC: pcie_replay_count 145 * 146 * The amdgpu driver provides a sysfs API for reporting the total number 147 * of PCIe replays (NAKs) 148 * The file pcie_replay_count is used for this and returns the total 149 * number of replays as a sum of the NAKs generated and NAKs received 150 */ 151 152 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 153 struct device_attribute *attr, char *buf) 154 { 155 struct drm_device *ddev = dev_get_drvdata(dev); 156 struct amdgpu_device *adev = drm_to_adev(ddev); 157 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 158 159 return sysfs_emit(buf, "%llu\n", cnt); 160 } 161 162 static DEVICE_ATTR(pcie_replay_count, 0444, 163 amdgpu_device_get_pcie_replay_count, NULL); 164 165 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 166 struct bin_attribute *attr, char *buf, 167 loff_t ppos, size_t count) 168 { 169 struct device *dev = kobj_to_dev(kobj); 170 struct drm_device *ddev = dev_get_drvdata(dev); 171 struct amdgpu_device *adev = drm_to_adev(ddev); 172 ssize_t bytes_read; 173 174 switch (ppos) { 175 case AMDGPU_SYS_REG_STATE_XGMI: 176 bytes_read = amdgpu_asic_get_reg_state( 177 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 178 break; 179 case AMDGPU_SYS_REG_STATE_WAFL: 180 bytes_read = amdgpu_asic_get_reg_state( 181 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 182 break; 183 case AMDGPU_SYS_REG_STATE_PCIE: 184 bytes_read = amdgpu_asic_get_reg_state( 185 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 186 break; 187 case AMDGPU_SYS_REG_STATE_USR: 188 bytes_read = amdgpu_asic_get_reg_state( 189 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 190 break; 191 case AMDGPU_SYS_REG_STATE_USR_1: 192 bytes_read = amdgpu_asic_get_reg_state( 193 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 194 break; 195 default: 196 return -EINVAL; 197 } 198 199 return bytes_read; 200 } 201 202 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 203 AMDGPU_SYS_REG_STATE_END); 204 205 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 206 { 207 int ret; 208 209 if (!amdgpu_asic_get_reg_state_supported(adev)) 210 return 0; 211 212 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 213 214 return ret; 215 } 216 217 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 218 { 219 if (!amdgpu_asic_get_reg_state_supported(adev)) 220 return; 221 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 222 } 223 224 /** 225 * DOC: board_info 226 * 227 * The amdgpu driver provides a sysfs API for giving board related information. 
228 * It provides the form factor information in the format 229 * 230 * type : form factor 231 * 232 * Possible form factor values 233 * 234 * - "cem" - PCIE CEM card 235 * - "oam" - Open Compute Accelerator Module 236 * - "unknown" - Not known 237 * 238 */ 239 240 static ssize_t amdgpu_device_get_board_info(struct device *dev, 241 struct device_attribute *attr, 242 char *buf) 243 { 244 struct drm_device *ddev = dev_get_drvdata(dev); 245 struct amdgpu_device *adev = drm_to_adev(ddev); 246 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 247 const char *pkg; 248 249 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 250 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 251 252 switch (pkg_type) { 253 case AMDGPU_PKG_TYPE_CEM: 254 pkg = "cem"; 255 break; 256 case AMDGPU_PKG_TYPE_OAM: 257 pkg = "oam"; 258 break; 259 default: 260 pkg = "unknown"; 261 break; 262 } 263 264 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 265 } 266 267 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 268 269 static struct attribute *amdgpu_board_attrs[] = { 270 &dev_attr_board_info.attr, 271 NULL, 272 }; 273 274 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 275 struct attribute *attr, int n) 276 { 277 struct device *dev = kobj_to_dev(kobj); 278 struct drm_device *ddev = dev_get_drvdata(dev); 279 struct amdgpu_device *adev = drm_to_adev(ddev); 280 281 if (adev->flags & AMD_IS_APU) 282 return 0; 283 284 return attr->mode; 285 } 286 287 static const struct attribute_group amdgpu_board_attrs_group = { 288 .attrs = amdgpu_board_attrs, 289 .is_visible = amdgpu_board_attrs_is_visible 290 }; 291 292 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 293 294 295 /** 296 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 297 * 298 * @dev: drm_device pointer 299 * 300 * Returns true if the device is a dGPU with ATPX power control, 301 * otherwise return false. 302 */ 303 bool amdgpu_device_supports_px(struct drm_device *dev) 304 { 305 struct amdgpu_device *adev = drm_to_adev(dev); 306 307 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 308 return true; 309 return false; 310 } 311 312 /** 313 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 314 * 315 * @dev: drm_device pointer 316 * 317 * Returns true if the device is a dGPU with ACPI power control, 318 * otherwise return false. 319 */ 320 bool amdgpu_device_supports_boco(struct drm_device *dev) 321 { 322 struct amdgpu_device *adev = drm_to_adev(dev); 323 324 if (adev->has_pr3 || 325 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 326 return true; 327 return false; 328 } 329 330 /** 331 * amdgpu_device_supports_baco - Does the device support BACO 332 * 333 * @dev: drm_device pointer 334 * 335 * Returns true if the device supporte BACO, 336 * otherwise return false. 337 */ 338 bool amdgpu_device_supports_baco(struct drm_device *dev) 339 { 340 struct amdgpu_device *adev = drm_to_adev(dev); 341 342 return amdgpu_asic_supports_baco(adev); 343 } 344 345 /** 346 * amdgpu_device_supports_smart_shift - Is the device dGPU with 347 * smart shift support 348 * 349 * @dev: drm_device pointer 350 * 351 * Returns true if the device is a dGPU with Smart Shift support, 352 * otherwise returns false. 
353 */ 354 bool amdgpu_device_supports_smart_shift(struct drm_device *dev) 355 { 356 return (amdgpu_device_supports_boco(dev) && 357 amdgpu_acpi_is_power_shift_control_supported()); 358 } 359 360 /* 361 * VRAM access helper functions 362 */ 363 364 /** 365 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA 366 * 367 * @adev: amdgpu_device pointer 368 * @pos: offset of the buffer in vram 369 * @buf: virtual address of the buffer in system memory 370 * @size: read/write size, sizeof(@buf) must > @size 371 * @write: true - write to vram, otherwise - read from vram 372 */ 373 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, 374 void *buf, size_t size, bool write) 375 { 376 unsigned long flags; 377 uint32_t hi = ~0, tmp = 0; 378 uint32_t *data = buf; 379 uint64_t last; 380 int idx; 381 382 if (!drm_dev_enter(adev_to_drm(adev), &idx)) 383 return; 384 385 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)); 386 387 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 388 for (last = pos + size; pos < last; pos += 4) { 389 tmp = pos >> 31; 390 391 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 392 if (tmp != hi) { 393 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 394 hi = tmp; 395 } 396 if (write) 397 WREG32_NO_KIQ(mmMM_DATA, *data++); 398 else 399 *data++ = RREG32_NO_KIQ(mmMM_DATA); 400 } 401 402 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 403 drm_dev_exit(idx); 404 } 405 406 /** 407 * amdgpu_device_aper_access - access vram by vram aperature 408 * 409 * @adev: amdgpu_device pointer 410 * @pos: offset of the buffer in vram 411 * @buf: virtual address of the buffer in system memory 412 * @size: read/write size, sizeof(@buf) must > @size 413 * @write: true - write to vram, otherwise - read from vram 414 * 415 * The return value means how many bytes have been transferred. 
416 */ 417 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos, 418 void *buf, size_t size, bool write) 419 { 420 #ifdef CONFIG_64BIT 421 void __iomem *addr; 422 size_t count = 0; 423 uint64_t last; 424 425 if (!adev->mman.aper_base_kaddr) 426 return 0; 427 428 last = min(pos + size, adev->gmc.visible_vram_size); 429 if (last > pos) { 430 addr = adev->mman.aper_base_kaddr + pos; 431 count = last - pos; 432 433 if (write) { 434 memcpy_toio(addr, buf, count); 435 /* Make sure HDP write cache flush happens without any reordering 436 * after the system memory contents are sent over PCIe device 437 */ 438 mb(); 439 amdgpu_device_flush_hdp(adev, NULL); 440 } else { 441 amdgpu_device_invalidate_hdp(adev, NULL); 442 /* Make sure HDP read cache is invalidated before issuing a read 443 * to the PCIe device 444 */ 445 mb(); 446 memcpy_fromio(buf, addr, count); 447 } 448 449 } 450 451 return count; 452 #else 453 return 0; 454 #endif 455 } 456 457 /** 458 * amdgpu_device_vram_access - read/write a buffer in vram 459 * 460 * @adev: amdgpu_device pointer 461 * @pos: offset of the buffer in vram 462 * @buf: virtual address of the buffer in system memory 463 * @size: read/write size, sizeof(@buf) must > @size 464 * @write: true - write to vram, otherwise - read from vram 465 */ 466 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 467 void *buf, size_t size, bool write) 468 { 469 size_t count; 470 471 /* try to using vram apreature to access vram first */ 472 count = amdgpu_device_aper_access(adev, pos, buf, size, write); 473 size -= count; 474 if (size) { 475 /* using MM to access rest vram */ 476 pos += count; 477 buf += count; 478 amdgpu_device_mm_access(adev, pos, buf, size, write); 479 } 480 } 481 482 /* 483 * register access helper functions. 484 */ 485 486 /* Check if hw access should be skipped because of hotplug or device error */ 487 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) 488 { 489 if (adev->no_hw_access) 490 return true; 491 492 #ifdef CONFIG_LOCKDEP 493 /* 494 * This is a bit complicated to understand, so worth a comment. What we assert 495 * here is that the GPU reset is not running on another thread in parallel. 496 * 497 * For this we trylock the read side of the reset semaphore, if that succeeds 498 * we know that the reset is not running in paralell. 499 * 500 * If the trylock fails we assert that we are either already holding the read 501 * side of the lock or are the reset thread itself and hold the write side of 502 * the lock. 503 */ 504 if (in_task()) { 505 if (down_read_trylock(&adev->reset_domain->sem)) 506 up_read(&adev->reset_domain->sem); 507 else 508 lockdep_assert_held(&adev->reset_domain->sem); 509 } 510 #endif 511 return false; 512 } 513 514 /** 515 * amdgpu_device_rreg - read a memory mapped IO or indirect register 516 * 517 * @adev: amdgpu_device pointer 518 * @reg: dword aligned register offset 519 * @acc_flags: access flags which require special behavior 520 * 521 * Returns the 32 bit value from the offset specified. 
522 */ 523 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 524 uint32_t reg, uint32_t acc_flags) 525 { 526 uint32_t ret; 527 528 if (amdgpu_device_skip_hw_access(adev)) 529 return 0; 530 531 if ((reg * 4) < adev->rmmio_size) { 532 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 533 amdgpu_sriov_runtime(adev) && 534 down_read_trylock(&adev->reset_domain->sem)) { 535 ret = amdgpu_kiq_rreg(adev, reg, 0); 536 up_read(&adev->reset_domain->sem); 537 } else { 538 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 539 } 540 } else { 541 ret = adev->pcie_rreg(adev, reg * 4); 542 } 543 544 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 545 546 return ret; 547 } 548 549 /* 550 * MMIO register read with bytes helper functions 551 * @offset:bytes offset from MMIO start 552 */ 553 554 /** 555 * amdgpu_mm_rreg8 - read a memory mapped IO register 556 * 557 * @adev: amdgpu_device pointer 558 * @offset: byte aligned register offset 559 * 560 * Returns the 8 bit value from the offset specified. 561 */ 562 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 563 { 564 if (amdgpu_device_skip_hw_access(adev)) 565 return 0; 566 567 if (offset < adev->rmmio_size) 568 return (readb(adev->rmmio + offset)); 569 BUG(); 570 } 571 572 573 /** 574 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC 575 * 576 * @adev: amdgpu_device pointer 577 * @reg: dword aligned register offset 578 * @acc_flags: access flags which require special behavior 579 * @xcc_id: xcc accelerated compute core id 580 * 581 * Returns the 32 bit value from the offset specified. 582 */ 583 uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev, 584 uint32_t reg, uint32_t acc_flags, 585 uint32_t xcc_id) 586 { 587 uint32_t ret, rlcg_flag; 588 589 if (amdgpu_device_skip_hw_access(adev)) 590 return 0; 591 592 if ((reg * 4) < adev->rmmio_size) { 593 if (amdgpu_sriov_vf(adev) && 594 !amdgpu_sriov_runtime(adev) && 595 adev->gfx.rlc.rlcg_reg_access_supported && 596 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 597 GC_HWIP, false, 598 &rlcg_flag)) { 599 ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, xcc_id); 600 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 601 amdgpu_sriov_runtime(adev) && 602 down_read_trylock(&adev->reset_domain->sem)) { 603 ret = amdgpu_kiq_rreg(adev, reg, xcc_id); 604 up_read(&adev->reset_domain->sem); 605 } else { 606 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 607 } 608 } else { 609 ret = adev->pcie_rreg(adev, reg * 4); 610 } 611 612 return ret; 613 } 614 615 /* 616 * MMIO register write with bytes helper functions 617 * @offset:bytes offset from MMIO start 618 * @value: the value want to be written to the register 619 */ 620 621 /** 622 * amdgpu_mm_wreg8 - read a memory mapped IO register 623 * 624 * @adev: amdgpu_device pointer 625 * @offset: byte aligned register offset 626 * @value: 8 bit value to write 627 * 628 * Writes the value specified to the offset specified. 
629 */ 630 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 631 { 632 if (amdgpu_device_skip_hw_access(adev)) 633 return; 634 635 if (offset < adev->rmmio_size) 636 writeb(value, adev->rmmio + offset); 637 else 638 BUG(); 639 } 640 641 /** 642 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 643 * 644 * @adev: amdgpu_device pointer 645 * @reg: dword aligned register offset 646 * @v: 32 bit value to write to the register 647 * @acc_flags: access flags which require special behavior 648 * 649 * Writes the value specified to the offset specified. 650 */ 651 void amdgpu_device_wreg(struct amdgpu_device *adev, 652 uint32_t reg, uint32_t v, 653 uint32_t acc_flags) 654 { 655 if (amdgpu_device_skip_hw_access(adev)) 656 return; 657 658 if ((reg * 4) < adev->rmmio_size) { 659 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 660 amdgpu_sriov_runtime(adev) && 661 down_read_trylock(&adev->reset_domain->sem)) { 662 amdgpu_kiq_wreg(adev, reg, v, 0); 663 up_read(&adev->reset_domain->sem); 664 } else { 665 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 666 } 667 } else { 668 adev->pcie_wreg(adev, reg * 4, v); 669 } 670 671 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 672 } 673 674 /** 675 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 676 * 677 * @adev: amdgpu_device pointer 678 * @reg: mmio/rlc register 679 * @v: value to write 680 * @xcc_id: xcc accelerated compute core id 681 * 682 * this function is invoked only for the debugfs register access 683 */ 684 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 685 uint32_t reg, uint32_t v, 686 uint32_t xcc_id) 687 { 688 if (amdgpu_device_skip_hw_access(adev)) 689 return; 690 691 if (amdgpu_sriov_fullaccess(adev) && 692 adev->gfx.rlc.funcs && 693 adev->gfx.rlc.funcs->is_rlcg_access_range) { 694 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 695 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 696 } else if ((reg * 4) >= adev->rmmio_size) { 697 adev->pcie_wreg(adev, reg * 4, v); 698 } else { 699 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 700 } 701 } 702 703 /** 704 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 705 * 706 * @adev: amdgpu_device pointer 707 * @reg: dword aligned register offset 708 * @v: 32 bit value to write to the register 709 * @acc_flags: access flags which require special behavior 710 * @xcc_id: xcc accelerated compute core id 711 * 712 * Writes the value specified to the offset specified. 
713 */ 714 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 715 uint32_t reg, uint32_t v, 716 uint32_t acc_flags, uint32_t xcc_id) 717 { 718 uint32_t rlcg_flag; 719 720 if (amdgpu_device_skip_hw_access(adev)) 721 return; 722 723 if ((reg * 4) < adev->rmmio_size) { 724 if (amdgpu_sriov_vf(adev) && 725 !amdgpu_sriov_runtime(adev) && 726 adev->gfx.rlc.rlcg_reg_access_supported && 727 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 728 GC_HWIP, true, 729 &rlcg_flag)) { 730 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, xcc_id); 731 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 732 amdgpu_sriov_runtime(adev) && 733 down_read_trylock(&adev->reset_domain->sem)) { 734 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 735 up_read(&adev->reset_domain->sem); 736 } else { 737 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 738 } 739 } else { 740 adev->pcie_wreg(adev, reg * 4, v); 741 } 742 } 743 744 /** 745 * amdgpu_device_indirect_rreg - read an indirect register 746 * 747 * @adev: amdgpu_device pointer 748 * @reg_addr: indirect register address to read from 749 * 750 * Returns the value of indirect register @reg_addr 751 */ 752 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 753 u32 reg_addr) 754 { 755 unsigned long flags, pcie_index, pcie_data; 756 void __iomem *pcie_index_offset; 757 void __iomem *pcie_data_offset; 758 u32 r; 759 760 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 761 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 762 763 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 764 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 765 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 766 767 writel(reg_addr, pcie_index_offset); 768 readl(pcie_index_offset); 769 r = readl(pcie_data_offset); 770 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 771 772 return r; 773 } 774 775 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 776 u64 reg_addr) 777 { 778 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 779 u32 r; 780 void __iomem *pcie_index_offset; 781 void __iomem *pcie_index_hi_offset; 782 void __iomem *pcie_data_offset; 783 784 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 785 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 786 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 787 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 788 else 789 pcie_index_hi = 0; 790 791 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 792 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 793 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 794 if (pcie_index_hi != 0) 795 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 796 pcie_index_hi * 4; 797 798 writel(reg_addr, pcie_index_offset); 799 readl(pcie_index_offset); 800 if (pcie_index_hi != 0) { 801 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 802 readl(pcie_index_hi_offset); 803 } 804 r = readl(pcie_data_offset); 805 806 /* clear the high bits */ 807 if (pcie_index_hi != 0) { 808 writel(0, pcie_index_hi_offset); 809 readl(pcie_index_hi_offset); 810 } 811 812 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 813 814 return r; 815 } 816 817 /** 818 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 819 * 820 * @adev: amdgpu_device pointer 821 * @reg_addr: indirect register address to read from 822 * 823 * Returns the value of indirect register @reg_addr 824 */ 825 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 826 u32 
reg_addr) 827 { 828 unsigned long flags, pcie_index, pcie_data; 829 void __iomem *pcie_index_offset; 830 void __iomem *pcie_data_offset; 831 u64 r; 832 833 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 834 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 835 836 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 837 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 838 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 839 840 /* read low 32 bits */ 841 writel(reg_addr, pcie_index_offset); 842 readl(pcie_index_offset); 843 r = readl(pcie_data_offset); 844 /* read high 32 bits */ 845 writel(reg_addr + 4, pcie_index_offset); 846 readl(pcie_index_offset); 847 r |= ((u64)readl(pcie_data_offset) << 32); 848 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 849 850 return r; 851 } 852 853 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 854 u64 reg_addr) 855 { 856 unsigned long flags, pcie_index, pcie_data; 857 unsigned long pcie_index_hi = 0; 858 void __iomem *pcie_index_offset; 859 void __iomem *pcie_index_hi_offset; 860 void __iomem *pcie_data_offset; 861 u64 r; 862 863 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 864 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 865 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 866 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 867 868 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 869 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 870 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 871 if (pcie_index_hi != 0) 872 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 873 pcie_index_hi * 4; 874 875 /* read low 32 bits */ 876 writel(reg_addr, pcie_index_offset); 877 readl(pcie_index_offset); 878 if (pcie_index_hi != 0) { 879 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 880 readl(pcie_index_hi_offset); 881 } 882 r = readl(pcie_data_offset); 883 /* read high 32 bits */ 884 writel(reg_addr + 4, pcie_index_offset); 885 readl(pcie_index_offset); 886 if (pcie_index_hi != 0) { 887 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 888 readl(pcie_index_hi_offset); 889 } 890 r |= ((u64)readl(pcie_data_offset) << 32); 891 892 /* clear the high bits */ 893 if (pcie_index_hi != 0) { 894 writel(0, pcie_index_hi_offset); 895 readl(pcie_index_hi_offset); 896 } 897 898 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 899 900 return r; 901 } 902 903 /** 904 * amdgpu_device_indirect_wreg - write an indirect register address 905 * 906 * @adev: amdgpu_device pointer 907 * @reg_addr: indirect register offset 908 * @reg_data: indirect register data 909 * 910 */ 911 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 912 u32 reg_addr, u32 reg_data) 913 { 914 unsigned long flags, pcie_index, pcie_data; 915 void __iomem *pcie_index_offset; 916 void __iomem *pcie_data_offset; 917 918 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 919 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 920 921 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 922 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 923 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 924 925 writel(reg_addr, pcie_index_offset); 926 readl(pcie_index_offset); 927 writel(reg_data, pcie_data_offset); 928 readl(pcie_data_offset); 929 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 930 } 931 932 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 933 u64 reg_addr, 
u32 reg_data) 934 { 935 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 936 void __iomem *pcie_index_offset; 937 void __iomem *pcie_index_hi_offset; 938 void __iomem *pcie_data_offset; 939 940 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 941 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 942 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 943 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 944 else 945 pcie_index_hi = 0; 946 947 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 948 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 949 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 950 if (pcie_index_hi != 0) 951 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 952 pcie_index_hi * 4; 953 954 writel(reg_addr, pcie_index_offset); 955 readl(pcie_index_offset); 956 if (pcie_index_hi != 0) { 957 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 958 readl(pcie_index_hi_offset); 959 } 960 writel(reg_data, pcie_data_offset); 961 readl(pcie_data_offset); 962 963 /* clear the high bits */ 964 if (pcie_index_hi != 0) { 965 writel(0, pcie_index_hi_offset); 966 readl(pcie_index_hi_offset); 967 } 968 969 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 970 } 971 972 /** 973 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 974 * 975 * @adev: amdgpu_device pointer 976 * @reg_addr: indirect register offset 977 * @reg_data: indirect register data 978 * 979 */ 980 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 981 u32 reg_addr, u64 reg_data) 982 { 983 unsigned long flags, pcie_index, pcie_data; 984 void __iomem *pcie_index_offset; 985 void __iomem *pcie_data_offset; 986 987 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 988 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 989 990 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 991 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 992 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 993 994 /* write low 32 bits */ 995 writel(reg_addr, pcie_index_offset); 996 readl(pcie_index_offset); 997 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 998 readl(pcie_data_offset); 999 /* write high 32 bits */ 1000 writel(reg_addr + 4, pcie_index_offset); 1001 readl(pcie_index_offset); 1002 writel((u32)(reg_data >> 32), pcie_data_offset); 1003 readl(pcie_data_offset); 1004 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1005 } 1006 1007 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1008 u64 reg_addr, u64 reg_data) 1009 { 1010 unsigned long flags, pcie_index, pcie_data; 1011 unsigned long pcie_index_hi = 0; 1012 void __iomem *pcie_index_offset; 1013 void __iomem *pcie_index_hi_offset; 1014 void __iomem *pcie_data_offset; 1015 1016 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1017 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1018 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1019 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1020 1021 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1022 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1023 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1024 if (pcie_index_hi != 0) 1025 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1026 pcie_index_hi * 4; 1027 1028 /* write low 32 bits */ 1029 writel(reg_addr, pcie_index_offset); 1030 readl(pcie_index_offset); 1031 if (pcie_index_hi != 0) 
{ 1032 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1033 readl(pcie_index_hi_offset); 1034 } 1035 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1036 readl(pcie_data_offset); 1037 /* write high 32 bits */ 1038 writel(reg_addr + 4, pcie_index_offset); 1039 readl(pcie_index_offset); 1040 if (pcie_index_hi != 0) { 1041 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1042 readl(pcie_index_hi_offset); 1043 } 1044 writel((u32)(reg_data >> 32), pcie_data_offset); 1045 readl(pcie_data_offset); 1046 1047 /* clear the high bits */ 1048 if (pcie_index_hi != 0) { 1049 writel(0, pcie_index_hi_offset); 1050 readl(pcie_index_hi_offset); 1051 } 1052 1053 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1054 } 1055 1056 /** 1057 * amdgpu_device_get_rev_id - query device rev_id 1058 * 1059 * @adev: amdgpu_device pointer 1060 * 1061 * Return device rev_id 1062 */ 1063 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1064 { 1065 return adev->nbio.funcs->get_rev_id(adev); 1066 } 1067 1068 /** 1069 * amdgpu_invalid_rreg - dummy reg read function 1070 * 1071 * @adev: amdgpu_device pointer 1072 * @reg: offset of register 1073 * 1074 * Dummy register read function. Used for register blocks 1075 * that certain asics don't have (all asics). 1076 * Returns the value in the register. 1077 */ 1078 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1079 { 1080 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 1081 BUG(); 1082 return 0; 1083 } 1084 1085 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1086 { 1087 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1088 BUG(); 1089 return 0; 1090 } 1091 1092 /** 1093 * amdgpu_invalid_wreg - dummy reg write function 1094 * 1095 * @adev: amdgpu_device pointer 1096 * @reg: offset of register 1097 * @v: value to write to the register 1098 * 1099 * Dummy register read function. Used for register blocks 1100 * that certain asics don't have (all asics). 1101 */ 1102 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1103 { 1104 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 1105 reg, v); 1106 BUG(); 1107 } 1108 1109 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1110 { 1111 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 1112 reg, v); 1113 BUG(); 1114 } 1115 1116 /** 1117 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1118 * 1119 * @adev: amdgpu_device pointer 1120 * @reg: offset of register 1121 * 1122 * Dummy register read function. Used for register blocks 1123 * that certain asics don't have (all asics). 1124 * Returns the value in the register. 1125 */ 1126 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 1127 { 1128 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 1129 BUG(); 1130 return 0; 1131 } 1132 1133 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) 1134 { 1135 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1136 BUG(); 1137 return 0; 1138 } 1139 1140 /** 1141 * amdgpu_invalid_wreg64 - dummy reg write function 1142 * 1143 * @adev: amdgpu_device pointer 1144 * @reg: offset of register 1145 * @v: value to write to the register 1146 * 1147 * Dummy register read function. Used for register blocks 1148 * that certain asics don't have (all asics). 
1149 */ 1150 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 1151 { 1152 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 1153 reg, v); 1154 BUG(); 1155 } 1156 1157 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) 1158 { 1159 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", 1160 reg, v); 1161 BUG(); 1162 } 1163 1164 /** 1165 * amdgpu_block_invalid_rreg - dummy reg read function 1166 * 1167 * @adev: amdgpu_device pointer 1168 * @block: offset of instance 1169 * @reg: offset of register 1170 * 1171 * Dummy register read function. Used for register blocks 1172 * that certain asics don't have (all asics). 1173 * Returns the value in the register. 1174 */ 1175 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1176 uint32_t block, uint32_t reg) 1177 { 1178 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 1179 reg, block); 1180 BUG(); 1181 return 0; 1182 } 1183 1184 /** 1185 * amdgpu_block_invalid_wreg - dummy reg write function 1186 * 1187 * @adev: amdgpu_device pointer 1188 * @block: offset of instance 1189 * @reg: offset of register 1190 * @v: value to write to the register 1191 * 1192 * Dummy register read function. Used for register blocks 1193 * that certain asics don't have (all asics). 1194 */ 1195 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1196 uint32_t block, 1197 uint32_t reg, uint32_t v) 1198 { 1199 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1200 reg, block, v); 1201 BUG(); 1202 } 1203 1204 /** 1205 * amdgpu_device_asic_init - Wrapper for atom asic_init 1206 * 1207 * @adev: amdgpu_device pointer 1208 * 1209 * Does any asic specific work and then calls atom asic init. 1210 */ 1211 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1212 { 1213 int ret; 1214 1215 amdgpu_asic_pre_asic_init(adev); 1216 1217 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1218 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1219 amdgpu_psp_wait_for_bootloader(adev); 1220 ret = amdgpu_atomfirmware_asic_init(adev, true); 1221 /* TODO: check the return val and stop device initialization if boot fails */ 1222 amdgpu_psp_query_boot_status(adev); 1223 return ret; 1224 } else { 1225 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1226 } 1227 1228 return 0; 1229 } 1230 1231 /** 1232 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1233 * 1234 * @adev: amdgpu_device pointer 1235 * 1236 * Allocates a scratch page of VRAM for use by various things in the 1237 * driver. 1238 */ 1239 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 1240 { 1241 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 1242 AMDGPU_GEM_DOMAIN_VRAM | 1243 AMDGPU_GEM_DOMAIN_GTT, 1244 &adev->mem_scratch.robj, 1245 &adev->mem_scratch.gpu_addr, 1246 (void **)&adev->mem_scratch.ptr); 1247 } 1248 1249 /** 1250 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 1251 * 1252 * @adev: amdgpu_device pointer 1253 * 1254 * Frees the VRAM scratch page. 1255 */ 1256 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 1257 { 1258 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 1259 } 1260 1261 /** 1262 * amdgpu_device_program_register_sequence - program an array of registers. 
1263 * 1264 * @adev: amdgpu_device pointer 1265 * @registers: pointer to the register array 1266 * @array_size: size of the register array 1267 * 1268 * Programs an array or registers with and or masks. 1269 * This is a helper for setting golden registers. 1270 */ 1271 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 1272 const u32 *registers, 1273 const u32 array_size) 1274 { 1275 u32 tmp, reg, and_mask, or_mask; 1276 int i; 1277 1278 if (array_size % 3) 1279 return; 1280 1281 for (i = 0; i < array_size; i += 3) { 1282 reg = registers[i + 0]; 1283 and_mask = registers[i + 1]; 1284 or_mask = registers[i + 2]; 1285 1286 if (and_mask == 0xffffffff) { 1287 tmp = or_mask; 1288 } else { 1289 tmp = RREG32(reg); 1290 tmp &= ~and_mask; 1291 if (adev->family >= AMDGPU_FAMILY_AI) 1292 tmp |= (or_mask & and_mask); 1293 else 1294 tmp |= or_mask; 1295 } 1296 WREG32(reg, tmp); 1297 } 1298 } 1299 1300 /** 1301 * amdgpu_device_pci_config_reset - reset the GPU 1302 * 1303 * @adev: amdgpu_device pointer 1304 * 1305 * Resets the GPU using the pci config reset sequence. 1306 * Only applicable to asics prior to vega10. 1307 */ 1308 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1309 { 1310 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1311 } 1312 1313 /** 1314 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1315 * 1316 * @adev: amdgpu_device pointer 1317 * 1318 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1319 */ 1320 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1321 { 1322 return pci_reset_function(adev->pdev); 1323 } 1324 1325 /* 1326 * amdgpu_device_wb_*() 1327 * Writeback is the method by which the GPU updates special pages in memory 1328 * with the status of certain GPU events (fences, ring pointers,etc.). 1329 */ 1330 1331 /** 1332 * amdgpu_device_wb_fini - Disable Writeback and free memory 1333 * 1334 * @adev: amdgpu_device pointer 1335 * 1336 * Disables Writeback and frees the Writeback memory (all asics). 1337 * Used at driver shutdown. 1338 */ 1339 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1340 { 1341 if (adev->wb.wb_obj) { 1342 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1343 &adev->wb.gpu_addr, 1344 (void **)&adev->wb.wb); 1345 adev->wb.wb_obj = NULL; 1346 } 1347 } 1348 1349 /** 1350 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1351 * 1352 * @adev: amdgpu_device pointer 1353 * 1354 * Initializes writeback and allocates writeback memory (all asics). 1355 * Used at driver startup. 1356 * Returns 0 on success or an -error on failure. 1357 */ 1358 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1359 { 1360 int r; 1361 1362 if (adev->wb.wb_obj == NULL) { 1363 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1364 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1365 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1366 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1367 (void **)&adev->wb.wb); 1368 if (r) { 1369 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1370 return r; 1371 } 1372 1373 adev->wb.num_wb = AMDGPU_MAX_WB; 1374 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1375 1376 /* clear wb memory */ 1377 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1378 } 1379 1380 return 0; 1381 } 1382 1383 /** 1384 * amdgpu_device_wb_get - Allocate a wb entry 1385 * 1386 * @adev: amdgpu_device pointer 1387 * @wb: wb index 1388 * 1389 * Allocate a wb slot for use by the driver (all asics). 
1390 * Returns 0 on success or -EINVAL on failure. 1391 */ 1392 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1393 { 1394 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1395 1396 if (offset < adev->wb.num_wb) { 1397 __set_bit(offset, adev->wb.used); 1398 *wb = offset << 3; /* convert to dw offset */ 1399 return 0; 1400 } else { 1401 return -EINVAL; 1402 } 1403 } 1404 1405 /** 1406 * amdgpu_device_wb_free - Free a wb entry 1407 * 1408 * @adev: amdgpu_device pointer 1409 * @wb: wb index 1410 * 1411 * Free a wb slot allocated for use by the driver (all asics) 1412 */ 1413 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1414 { 1415 wb >>= 3; 1416 if (wb < adev->wb.num_wb) 1417 __clear_bit(wb, adev->wb.used); 1418 } 1419 1420 /** 1421 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1422 * 1423 * @adev: amdgpu_device pointer 1424 * 1425 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1426 * to fail, but if any of the BARs is not accessible after the size we abort 1427 * driver loading by returning -ENODEV. 1428 */ 1429 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1430 { 1431 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1432 struct pci_bus *root; 1433 struct resource *res; 1434 unsigned int i; 1435 u16 cmd; 1436 int r; 1437 1438 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1439 return 0; 1440 1441 /* Bypass for VF */ 1442 if (amdgpu_sriov_vf(adev)) 1443 return 0; 1444 1445 /* skip if the bios has already enabled large BAR */ 1446 if (adev->gmc.real_vram_size && 1447 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1448 return 0; 1449 1450 /* Check if the root BUS has 64bit memory resources */ 1451 root = adev->pdev->bus; 1452 while (root->parent) 1453 root = root->parent; 1454 1455 pci_bus_for_each_resource(root, res, i) { 1456 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1457 res->start > 0x100000000ull) 1458 break; 1459 } 1460 1461 /* Trying to resize is pointless without a root hub window above 4GB */ 1462 if (!res) 1463 return 0; 1464 1465 /* Limit the BAR size to what is available */ 1466 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1467 rbar_size); 1468 1469 /* Disable memory decoding while we change the BAR addresses and size */ 1470 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1471 pci_write_config_word(adev->pdev, PCI_COMMAND, 1472 cmd & ~PCI_COMMAND_MEMORY); 1473 1474 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1475 amdgpu_doorbell_fini(adev); 1476 if (adev->asic_type >= CHIP_BONAIRE) 1477 pci_release_resource(adev->pdev, 2); 1478 1479 pci_release_resource(adev->pdev, 0); 1480 1481 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1482 if (r == -ENOSPC) 1483 DRM_INFO("Not enough PCI address space for a large BAR."); 1484 else if (r && r != -ENOTSUPP) 1485 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1486 1487 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1488 1489 /* When the doorbell or fb BAR isn't available we have no chance of 1490 * using the device. 
1491 */ 1492 r = amdgpu_doorbell_init(adev); 1493 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1494 return -ENODEV; 1495 1496 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1497 1498 return 0; 1499 } 1500 1501 static bool amdgpu_device_read_bios(struct amdgpu_device *adev) 1502 { 1503 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1504 return false; 1505 1506 return true; 1507 } 1508 1509 /* 1510 * GPU helpers function. 1511 */ 1512 /** 1513 * amdgpu_device_need_post - check if the hw need post or not 1514 * 1515 * @adev: amdgpu_device pointer 1516 * 1517 * Check if the asic has been initialized (all asics) at driver startup 1518 * or post is needed if hw reset is performed. 1519 * Returns true if need or false if not. 1520 */ 1521 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1522 { 1523 uint32_t reg; 1524 1525 if (amdgpu_sriov_vf(adev)) 1526 return false; 1527 1528 if (!amdgpu_device_read_bios(adev)) 1529 return false; 1530 1531 if (amdgpu_passthrough(adev)) { 1532 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1533 * some old smc fw still need driver do vPost otherwise gpu hang, while 1534 * those smc fw version above 22.15 doesn't have this flaw, so we force 1535 * vpost executed for smc version below 22.15 1536 */ 1537 if (adev->asic_type == CHIP_FIJI) { 1538 int err; 1539 uint32_t fw_ver; 1540 1541 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1542 /* force vPost if error occured */ 1543 if (err) 1544 return true; 1545 1546 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1547 release_firmware(adev->pm.fw); 1548 if (fw_ver < 0x00160e00) 1549 return true; 1550 } 1551 } 1552 1553 /* Don't post if we need to reset whole hive on init */ 1554 if (adev->gmc.xgmi.pending_reset) 1555 return false; 1556 1557 if (adev->has_hw_reset) { 1558 adev->has_hw_reset = false; 1559 return true; 1560 } 1561 1562 /* bios scratch used on CIK+ */ 1563 if (adev->asic_type >= CHIP_BONAIRE) 1564 return amdgpu_atombios_scratch_need_asic_init(adev); 1565 1566 /* check MEM_SIZE for older asics */ 1567 reg = amdgpu_asic_get_config_memsize(adev); 1568 1569 if ((reg != 0) && (reg != 0xffffffff)) 1570 return false; 1571 1572 return true; 1573 } 1574 1575 /* 1576 * Check whether seamless boot is supported. 1577 * 1578 * So far we only support seamless boot on DCE 3.0 or later. 1579 * If users report that it works on older ASICS as well, we may 1580 * loosen this. 1581 */ 1582 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1583 { 1584 switch (amdgpu_seamless) { 1585 case -1: 1586 break; 1587 case 1: 1588 return true; 1589 case 0: 1590 return false; 1591 default: 1592 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n", 1593 amdgpu_seamless); 1594 return false; 1595 } 1596 1597 if (!(adev->flags & AMD_IS_APU)) 1598 return false; 1599 1600 if (adev->mman.keep_stolen_vga_memory) 1601 return false; 1602 1603 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1604 } 1605 1606 /* 1607 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1608 * don't support dynamic speed switching. Until we have confirmation from Intel 1609 * that a specific host supports it, it's safer that we keep it disabled for all. 
1610 * 1611 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1612 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1613 */ 1614 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1615 { 1616 #if IS_ENABLED(CONFIG_X86) 1617 struct cpuinfo_x86 *c = &cpu_data(0); 1618 1619 /* eGPU change speeds based on USB4 fabric conditions */ 1620 if (dev_is_removable(adev->dev)) 1621 return true; 1622 1623 if (c->x86_vendor == X86_VENDOR_INTEL) 1624 return false; 1625 #endif 1626 return true; 1627 } 1628 1629 /** 1630 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1631 * 1632 * @adev: amdgpu_device pointer 1633 * 1634 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1635 * be set for this device. 1636 * 1637 * Returns true if it should be used or false if not. 1638 */ 1639 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1640 { 1641 switch (amdgpu_aspm) { 1642 case -1: 1643 break; 1644 case 0: 1645 return false; 1646 case 1: 1647 return true; 1648 default: 1649 return false; 1650 } 1651 if (adev->flags & AMD_IS_APU) 1652 return false; 1653 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) 1654 return false; 1655 return pcie_aspm_enabled(adev->pdev); 1656 } 1657 1658 /* if we get transitioned to only one device, take VGA back */ 1659 /** 1660 * amdgpu_device_vga_set_decode - enable/disable vga decode 1661 * 1662 * @pdev: PCI device pointer 1663 * @state: enable/disable vga decode 1664 * 1665 * Enable/disable vga decode (all asics). 1666 * Returns VGA resource flags. 1667 */ 1668 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1669 bool state) 1670 { 1671 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1672 1673 amdgpu_asic_set_vga_state(adev, state); 1674 if (state) 1675 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1676 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1677 else 1678 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1679 } 1680 1681 /** 1682 * amdgpu_device_check_block_size - validate the vm block size 1683 * 1684 * @adev: amdgpu_device pointer 1685 * 1686 * Validates the vm block size specified via module parameter. 1687 * The vm block size defines number of bits in page table versus page directory, 1688 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1689 * page table and the remaining bits are in the page directory. 1690 */ 1691 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1692 { 1693 /* defines number of bits in page table versus page directory, 1694 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1695 * page table and the remaining bits are in the page directory 1696 */ 1697 if (amdgpu_vm_block_size == -1) 1698 return; 1699 1700 if (amdgpu_vm_block_size < 9) { 1701 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1702 amdgpu_vm_block_size); 1703 amdgpu_vm_block_size = -1; 1704 } 1705 } 1706 1707 /** 1708 * amdgpu_device_check_vm_size - validate the vm size 1709 * 1710 * @adev: amdgpu_device pointer 1711 * 1712 * Validates the vm size in GB specified via module parameter. 1713 * The VM size is the size of the GPU virtual memory space in GB. 
1714 */ 1715 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1716 { 1717 /* no need to check the default value */ 1718 if (amdgpu_vm_size == -1) 1719 return; 1720 1721 if (amdgpu_vm_size < 1) { 1722 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1723 amdgpu_vm_size); 1724 amdgpu_vm_size = -1; 1725 } 1726 } 1727 1728 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1729 { 1730 struct sysinfo si; 1731 bool is_os_64 = (sizeof(void *) == 8); 1732 uint64_t total_memory; 1733 uint64_t dram_size_seven_GB = 0x1B8000000; 1734 uint64_t dram_size_three_GB = 0xB8000000; 1735 1736 if (amdgpu_smu_memory_pool_size == 0) 1737 return; 1738 1739 if (!is_os_64) { 1740 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1741 goto def_value; 1742 } 1743 si_meminfo(&si); 1744 total_memory = (uint64_t)si.totalram * si.mem_unit; 1745 1746 if ((amdgpu_smu_memory_pool_size == 1) || 1747 (amdgpu_smu_memory_pool_size == 2)) { 1748 if (total_memory < dram_size_three_GB) 1749 goto def_value1; 1750 } else if ((amdgpu_smu_memory_pool_size == 4) || 1751 (amdgpu_smu_memory_pool_size == 8)) { 1752 if (total_memory < dram_size_seven_GB) 1753 goto def_value1; 1754 } else { 1755 DRM_WARN("Smu memory pool size not supported\n"); 1756 goto def_value; 1757 } 1758 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1759 1760 return; 1761 1762 def_value1: 1763 DRM_WARN("No enough system memory\n"); 1764 def_value: 1765 adev->pm.smu_prv_buffer_size = 0; 1766 } 1767 1768 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1769 { 1770 if (!(adev->flags & AMD_IS_APU) || 1771 adev->asic_type < CHIP_RAVEN) 1772 return 0; 1773 1774 switch (adev->asic_type) { 1775 case CHIP_RAVEN: 1776 if (adev->pdev->device == 0x15dd) 1777 adev->apu_flags |= AMD_APU_IS_RAVEN; 1778 if (adev->pdev->device == 0x15d8) 1779 adev->apu_flags |= AMD_APU_IS_PICASSO; 1780 break; 1781 case CHIP_RENOIR: 1782 if ((adev->pdev->device == 0x1636) || 1783 (adev->pdev->device == 0x164c)) 1784 adev->apu_flags |= AMD_APU_IS_RENOIR; 1785 else 1786 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1787 break; 1788 case CHIP_VANGOGH: 1789 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1790 break; 1791 case CHIP_YELLOW_CARP: 1792 break; 1793 case CHIP_CYAN_SKILLFISH: 1794 if ((adev->pdev->device == 0x13FE) || 1795 (adev->pdev->device == 0x143F)) 1796 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1797 break; 1798 default: 1799 break; 1800 } 1801 1802 return 0; 1803 } 1804 1805 /** 1806 * amdgpu_device_check_arguments - validate module params 1807 * 1808 * @adev: amdgpu_device pointer 1809 * 1810 * Validates certain module parameters and updates 1811 * the associated values used by the driver (all asics). 
1812 */ 1813 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1814 { 1815 if (amdgpu_sched_jobs < 4) { 1816 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1817 amdgpu_sched_jobs); 1818 amdgpu_sched_jobs = 4; 1819 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1820 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1821 amdgpu_sched_jobs); 1822 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1823 } 1824 1825 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1826 /* gart size must be greater or equal to 32M */ 1827 dev_warn(adev->dev, "gart size (%d) too small\n", 1828 amdgpu_gart_size); 1829 amdgpu_gart_size = -1; 1830 } 1831 1832 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1833 /* gtt size must be greater or equal to 32M */ 1834 dev_warn(adev->dev, "gtt size (%d) too small\n", 1835 amdgpu_gtt_size); 1836 amdgpu_gtt_size = -1; 1837 } 1838 1839 /* valid range is between 4 and 9 inclusive */ 1840 if (amdgpu_vm_fragment_size != -1 && 1841 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1842 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1843 amdgpu_vm_fragment_size = -1; 1844 } 1845 1846 if (amdgpu_sched_hw_submission < 2) { 1847 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1848 amdgpu_sched_hw_submission); 1849 amdgpu_sched_hw_submission = 2; 1850 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1851 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1852 amdgpu_sched_hw_submission); 1853 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1854 } 1855 1856 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1857 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1858 amdgpu_reset_method = -1; 1859 } 1860 1861 amdgpu_device_check_smu_prv_buffer_size(adev); 1862 1863 amdgpu_device_check_vm_size(adev); 1864 1865 amdgpu_device_check_block_size(adev); 1866 1867 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1868 1869 return 0; 1870 } 1871 1872 /** 1873 * amdgpu_switcheroo_set_state - set switcheroo state 1874 * 1875 * @pdev: pci dev pointer 1876 * @state: vga_switcheroo state 1877 * 1878 * Callback for the switcheroo driver. Suspends or resumes 1879 * the asics before or after it is powered up using ACPI methods. 
1880 */ 1881 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1882 enum vga_switcheroo_state state) 1883 { 1884 struct drm_device *dev = pci_get_drvdata(pdev); 1885 int r; 1886 1887 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1888 return; 1889 1890 if (state == VGA_SWITCHEROO_ON) { 1891 pr_info("switched on\n"); 1892 /* don't suspend or resume card normally */ 1893 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1894 1895 pci_set_power_state(pdev, PCI_D0); 1896 amdgpu_device_load_pci_state(pdev); 1897 r = pci_enable_device(pdev); 1898 if (r) 1899 DRM_WARN("pci_enable_device failed (%d)\n", r); 1900 amdgpu_device_resume(dev, true); 1901 1902 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1903 } else { 1904 pr_info("switched off\n"); 1905 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1906 amdgpu_device_prepare(dev); 1907 amdgpu_device_suspend(dev, true); 1908 amdgpu_device_cache_pci_state(pdev); 1909 /* Shut down the device */ 1910 pci_disable_device(pdev); 1911 pci_set_power_state(pdev, PCI_D3cold); 1912 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1913 } 1914 } 1915 1916 /** 1917 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1918 * 1919 * @pdev: pci dev pointer 1920 * 1921 * Callback for the switcheroo driver. Check of the switcheroo 1922 * state can be changed. 1923 * Returns true if the state can be changed, false if not. 1924 */ 1925 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1926 { 1927 struct drm_device *dev = pci_get_drvdata(pdev); 1928 1929 /* 1930 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1931 * locking inversion with the driver load path. And the access here is 1932 * completely racy anyway. So don't bother with locking for now. 1933 */ 1934 return atomic_read(&dev->open_count) == 0; 1935 } 1936 1937 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1938 .set_gpu_state = amdgpu_switcheroo_set_state, 1939 .reprobe = NULL, 1940 .can_switch = amdgpu_switcheroo_can_switch, 1941 }; 1942 1943 /** 1944 * amdgpu_device_ip_set_clockgating_state - set the CG state 1945 * 1946 * @dev: amdgpu_device pointer 1947 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1948 * @state: clockgating state (gate or ungate) 1949 * 1950 * Sets the requested clockgating state for all instances of 1951 * the hardware IP specified. 1952 * Returns the error code from the last instance. 1953 */ 1954 int amdgpu_device_ip_set_clockgating_state(void *dev, 1955 enum amd_ip_block_type block_type, 1956 enum amd_clockgating_state state) 1957 { 1958 struct amdgpu_device *adev = dev; 1959 int i, r = 0; 1960 1961 for (i = 0; i < adev->num_ip_blocks; i++) { 1962 if (!adev->ip_blocks[i].status.valid) 1963 continue; 1964 if (adev->ip_blocks[i].version->type != block_type) 1965 continue; 1966 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1967 continue; 1968 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1969 (void *)adev, state); 1970 if (r) 1971 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1972 adev->ip_blocks[i].version->funcs->name, r); 1973 } 1974 return r; 1975 } 1976 1977 /** 1978 * amdgpu_device_ip_set_powergating_state - set the PG state 1979 * 1980 * @dev: amdgpu_device pointer 1981 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1982 * @state: powergating state (gate or ungate) 1983 * 1984 * Sets the requested powergating state for all instances of 1985 * the hardware IP specified. 
1986 * Returns the error code from the last instance. 1987 */ 1988 int amdgpu_device_ip_set_powergating_state(void *dev, 1989 enum amd_ip_block_type block_type, 1990 enum amd_powergating_state state) 1991 { 1992 struct amdgpu_device *adev = dev; 1993 int i, r = 0; 1994 1995 for (i = 0; i < adev->num_ip_blocks; i++) { 1996 if (!adev->ip_blocks[i].status.valid) 1997 continue; 1998 if (adev->ip_blocks[i].version->type != block_type) 1999 continue; 2000 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2001 continue; 2002 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2003 (void *)adev, state); 2004 if (r) 2005 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2006 adev->ip_blocks[i].version->funcs->name, r); 2007 } 2008 return r; 2009 } 2010 2011 /** 2012 * amdgpu_device_ip_get_clockgating_state - get the CG state 2013 * 2014 * @adev: amdgpu_device pointer 2015 * @flags: clockgating feature flags 2016 * 2017 * Walks the list of IPs on the device and updates the clockgating 2018 * flags for each IP. 2019 * Updates @flags with the feature flags for each hardware IP where 2020 * clockgating is enabled. 2021 */ 2022 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2023 u64 *flags) 2024 { 2025 int i; 2026 2027 for (i = 0; i < adev->num_ip_blocks; i++) { 2028 if (!adev->ip_blocks[i].status.valid) 2029 continue; 2030 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2031 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 2032 } 2033 } 2034 2035 /** 2036 * amdgpu_device_ip_wait_for_idle - wait for idle 2037 * 2038 * @adev: amdgpu_device pointer 2039 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2040 * 2041 * Waits for the request hardware IP to be idle. 2042 * Returns 0 for success or a negative error code on failure. 2043 */ 2044 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2045 enum amd_ip_block_type block_type) 2046 { 2047 int i, r; 2048 2049 for (i = 0; i < adev->num_ip_blocks; i++) { 2050 if (!adev->ip_blocks[i].status.valid) 2051 continue; 2052 if (adev->ip_blocks[i].version->type == block_type) { 2053 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 2054 if (r) 2055 return r; 2056 break; 2057 } 2058 } 2059 return 0; 2060 2061 } 2062 2063 /** 2064 * amdgpu_device_ip_is_idle - is the hardware IP idle 2065 * 2066 * @adev: amdgpu_device pointer 2067 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2068 * 2069 * Check if the hardware IP is idle or not. 2070 * Returns true if it the IP is idle, false if not. 2071 */ 2072 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 2073 enum amd_ip_block_type block_type) 2074 { 2075 int i; 2076 2077 for (i = 0; i < adev->num_ip_blocks; i++) { 2078 if (!adev->ip_blocks[i].status.valid) 2079 continue; 2080 if (adev->ip_blocks[i].version->type == block_type) 2081 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 2082 } 2083 return true; 2084 2085 } 2086 2087 /** 2088 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2089 * 2090 * @adev: amdgpu_device pointer 2091 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2092 * 2093 * Returns a pointer to the hardware IP block structure 2094 * if it exists for the asic, otherwise NULL. 
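 *
 * For illustration only, a caller that needs the GFX block could do:
 *   struct amdgpu_ip_block *ip_block =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 * and must check the result for NULL before dereferencing it.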
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * Returns 0 if the IP block on the asic is equal to or newer than
 * @major.@minor, 1 if it is older or the ip_block doesn't exist.
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
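 *
 * As an illustration of the accepted format (the PCI address below is made
 * up), virtual_display=0000:03:00.0,2 enables two virtual crtcs on that one
 * device, while virtual_display=all,1 enables a single virtual crtc on every
 * device; the crtc count is clamped to the 1..6 range by the parser below.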
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
		adev->mode_info.num_crtc = 1;
		adev->enable_virtual_display = true;
		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
			 adev->enable_virtual_display, adev->mode_info.num_crtc);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
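 *
 * The table is loaded from an amdgpu/<chip>_gpu_info.bin file (for example
 * amdgpu/raven2_gpu_info.bin for a Raven2 based APU) and is skipped entirely
 * when the IP discovery binary is available instead.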
2245 */ 2246 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2247 { 2248 const char *chip_name; 2249 char fw_name[40]; 2250 int err; 2251 const struct gpu_info_firmware_header_v1_0 *hdr; 2252 2253 adev->firmware.gpu_info_fw = NULL; 2254 2255 if (adev->mman.discovery_bin) 2256 return 0; 2257 2258 switch (adev->asic_type) { 2259 default: 2260 return 0; 2261 case CHIP_VEGA10: 2262 chip_name = "vega10"; 2263 break; 2264 case CHIP_VEGA12: 2265 chip_name = "vega12"; 2266 break; 2267 case CHIP_RAVEN: 2268 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2269 chip_name = "raven2"; 2270 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2271 chip_name = "picasso"; 2272 else 2273 chip_name = "raven"; 2274 break; 2275 case CHIP_ARCTURUS: 2276 chip_name = "arcturus"; 2277 break; 2278 case CHIP_NAVI12: 2279 chip_name = "navi12"; 2280 break; 2281 } 2282 2283 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2284 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 2285 if (err) { 2286 dev_err(adev->dev, 2287 "Failed to get gpu_info firmware \"%s\"\n", 2288 fw_name); 2289 goto out; 2290 } 2291 2292 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2293 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2294 2295 switch (hdr->version_major) { 2296 case 1: 2297 { 2298 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2299 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2300 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2301 2302 /* 2303 * Should be droped when DAL no longer needs it. 2304 */ 2305 if (adev->asic_type == CHIP_NAVI12) 2306 goto parse_soc_bounding_box; 2307 2308 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2309 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2310 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2311 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2312 adev->gfx.config.max_texture_channel_caches = 2313 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2314 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2315 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2316 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2317 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2318 adev->gfx.config.double_offchip_lds_buf = 2319 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2320 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2321 adev->gfx.cu_info.max_waves_per_simd = 2322 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2323 adev->gfx.cu_info.max_scratch_slots_per_cu = 2324 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2325 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2326 if (hdr->version_minor >= 1) { 2327 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2328 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2329 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2330 adev->gfx.config.num_sc_per_sh = 2331 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2332 adev->gfx.config.num_packer_per_sc = 2333 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2334 } 2335 2336 parse_soc_bounding_box: 2337 /* 2338 * soc bounding box info is not integrated in disocovery table, 2339 * we always need to parse it from gpu info firmware if needed. 
2340 */ 2341 if (hdr->version_minor == 2) { 2342 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2343 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2344 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2345 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2346 } 2347 break; 2348 } 2349 default: 2350 dev_err(adev->dev, 2351 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2352 err = -EINVAL; 2353 goto out; 2354 } 2355 out: 2356 return err; 2357 } 2358 2359 /** 2360 * amdgpu_device_ip_early_init - run early init for hardware IPs 2361 * 2362 * @adev: amdgpu_device pointer 2363 * 2364 * Early initialization pass for hardware IPs. The hardware IPs that make 2365 * up each asic are discovered each IP's early_init callback is run. This 2366 * is the first stage in initializing the asic. 2367 * Returns 0 on success, negative error code on failure. 2368 */ 2369 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2370 { 2371 struct pci_dev *parent; 2372 int i, r; 2373 bool total; 2374 2375 amdgpu_device_enable_virtual_display(adev); 2376 2377 if (amdgpu_sriov_vf(adev)) { 2378 r = amdgpu_virt_request_full_gpu(adev, true); 2379 if (r) 2380 return r; 2381 } 2382 2383 switch (adev->asic_type) { 2384 #ifdef CONFIG_DRM_AMDGPU_SI 2385 case CHIP_VERDE: 2386 case CHIP_TAHITI: 2387 case CHIP_PITCAIRN: 2388 case CHIP_OLAND: 2389 case CHIP_HAINAN: 2390 adev->family = AMDGPU_FAMILY_SI; 2391 r = si_set_ip_blocks(adev); 2392 if (r) 2393 return r; 2394 break; 2395 #endif 2396 #ifdef CONFIG_DRM_AMDGPU_CIK 2397 case CHIP_BONAIRE: 2398 case CHIP_HAWAII: 2399 case CHIP_KAVERI: 2400 case CHIP_KABINI: 2401 case CHIP_MULLINS: 2402 if (adev->flags & AMD_IS_APU) 2403 adev->family = AMDGPU_FAMILY_KV; 2404 else 2405 adev->family = AMDGPU_FAMILY_CI; 2406 2407 r = cik_set_ip_blocks(adev); 2408 if (r) 2409 return r; 2410 break; 2411 #endif 2412 case CHIP_TOPAZ: 2413 case CHIP_TONGA: 2414 case CHIP_FIJI: 2415 case CHIP_POLARIS10: 2416 case CHIP_POLARIS11: 2417 case CHIP_POLARIS12: 2418 case CHIP_VEGAM: 2419 case CHIP_CARRIZO: 2420 case CHIP_STONEY: 2421 if (adev->flags & AMD_IS_APU) 2422 adev->family = AMDGPU_FAMILY_CZ; 2423 else 2424 adev->family = AMDGPU_FAMILY_VI; 2425 2426 r = vi_set_ip_blocks(adev); 2427 if (r) 2428 return r; 2429 break; 2430 default: 2431 r = amdgpu_discovery_set_ip_blocks(adev); 2432 if (r) 2433 return r; 2434 break; 2435 } 2436 2437 if (amdgpu_has_atpx() && 2438 (amdgpu_is_atpx_hybrid() || 2439 amdgpu_has_atpx_dgpu_power_cntl()) && 2440 ((adev->flags & AMD_IS_APU) == 0) && 2441 !dev_is_removable(&adev->pdev->dev)) 2442 adev->flags |= AMD_IS_PX; 2443 2444 if (!(adev->flags & AMD_IS_APU)) { 2445 parent = pcie_find_root_port(adev->pdev); 2446 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2447 } 2448 2449 2450 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2451 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2452 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2453 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2454 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2455 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2456 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2457 2458 total = true; 2459 for (i = 0; i < adev->num_ip_blocks; i++) { 2460 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2461 DRM_WARN("disabled ip block: %d <%s>\n", 2462 i, adev->ip_blocks[i].version->funcs->name); 2463 adev->ip_blocks[i].status.valid = false; 2464 } else { 2465 if (adev->ip_blocks[i].version->funcs->early_init) { 2466 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2467 if (r == -ENOENT) { 2468 adev->ip_blocks[i].status.valid = false; 2469 } else if (r) { 2470 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2471 adev->ip_blocks[i].version->funcs->name, r); 2472 total = false; 2473 } else { 2474 adev->ip_blocks[i].status.valid = true; 2475 } 2476 } else { 2477 adev->ip_blocks[i].status.valid = true; 2478 } 2479 } 2480 /* get the vbios after the asic_funcs are set up */ 2481 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2482 r = amdgpu_device_parse_gpu_info_fw(adev); 2483 if (r) 2484 return r; 2485 2486 /* Read BIOS */ 2487 if (amdgpu_device_read_bios(adev)) { 2488 if (!amdgpu_get_bios(adev)) 2489 return -EINVAL; 2490 2491 r = amdgpu_atombios_init(adev); 2492 if (r) { 2493 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2494 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2495 return r; 2496 } 2497 } 2498 2499 /*get pf2vf msg info at it's earliest time*/ 2500 if (amdgpu_sriov_vf(adev)) 2501 amdgpu_virt_init_data_exchange(adev); 2502 2503 } 2504 } 2505 if (!total) 2506 return -ENODEV; 2507 2508 amdgpu_amdkfd_device_probe(adev); 2509 adev->cg_flags &= amdgpu_cg_mask; 2510 adev->pg_flags &= amdgpu_pg_mask; 2511 2512 return 0; 2513 } 2514 2515 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2516 { 2517 int i, r; 2518 2519 for (i = 0; i < adev->num_ip_blocks; i++) { 2520 if (!adev->ip_blocks[i].status.sw) 2521 continue; 2522 if (adev->ip_blocks[i].status.hw) 2523 continue; 2524 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2525 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2526 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2527 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2528 if (r) { 2529 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2530 adev->ip_blocks[i].version->funcs->name, r); 2531 return r; 2532 } 2533 adev->ip_blocks[i].status.hw = true; 2534 } 2535 } 2536 2537 return 0; 2538 } 2539 2540 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2541 { 2542 int i, r; 2543 2544 for (i = 0; i < adev->num_ip_blocks; i++) { 2545 if (!adev->ip_blocks[i].status.sw) 2546 continue; 2547 if (adev->ip_blocks[i].status.hw) 2548 continue; 2549 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2550 if (r) { 2551 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2552 adev->ip_blocks[i].version->funcs->name, r); 2553 return r; 2554 } 2555 adev->ip_blocks[i].status.hw = true; 2556 } 2557 2558 return 0; 2559 } 2560 2561 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2562 { 2563 int r = 0; 2564 int i; 2565 uint32_t 
smu_version; 2566 2567 if (adev->asic_type >= CHIP_VEGA10) { 2568 for (i = 0; i < adev->num_ip_blocks; i++) { 2569 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2570 continue; 2571 2572 if (!adev->ip_blocks[i].status.sw) 2573 continue; 2574 2575 /* no need to do the fw loading again if already done*/ 2576 if (adev->ip_blocks[i].status.hw == true) 2577 break; 2578 2579 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2580 r = adev->ip_blocks[i].version->funcs->resume(adev); 2581 if (r) { 2582 DRM_ERROR("resume of IP block <%s> failed %d\n", 2583 adev->ip_blocks[i].version->funcs->name, r); 2584 return r; 2585 } 2586 } else { 2587 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2588 if (r) { 2589 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2590 adev->ip_blocks[i].version->funcs->name, r); 2591 return r; 2592 } 2593 } 2594 2595 adev->ip_blocks[i].status.hw = true; 2596 break; 2597 } 2598 } 2599 2600 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2601 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2602 2603 return r; 2604 } 2605 2606 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2607 { 2608 long timeout; 2609 int r, i; 2610 2611 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2612 struct amdgpu_ring *ring = adev->rings[i]; 2613 2614 /* No need to setup the GPU scheduler for rings that don't need it */ 2615 if (!ring || ring->no_scheduler) 2616 continue; 2617 2618 switch (ring->funcs->type) { 2619 case AMDGPU_RING_TYPE_GFX: 2620 timeout = adev->gfx_timeout; 2621 break; 2622 case AMDGPU_RING_TYPE_COMPUTE: 2623 timeout = adev->compute_timeout; 2624 break; 2625 case AMDGPU_RING_TYPE_SDMA: 2626 timeout = adev->sdma_timeout; 2627 break; 2628 default: 2629 timeout = adev->video_timeout; 2630 break; 2631 } 2632 2633 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, 2634 DRM_SCHED_PRIORITY_COUNT, 2635 ring->num_hw_submission, 0, 2636 timeout, adev->reset_domain->wq, 2637 ring->sched_score, ring->name, 2638 adev->dev); 2639 if (r) { 2640 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2641 ring->name); 2642 return r; 2643 } 2644 r = amdgpu_uvd_entity_init(adev, ring); 2645 if (r) { 2646 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2647 ring->name); 2648 return r; 2649 } 2650 r = amdgpu_vce_entity_init(adev, ring); 2651 if (r) { 2652 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2653 ring->name); 2654 return r; 2655 } 2656 } 2657 2658 amdgpu_xcp_update_partition_sched_list(adev); 2659 2660 return 0; 2661 } 2662 2663 2664 /** 2665 * amdgpu_device_ip_init - run init for hardware IPs 2666 * 2667 * @adev: amdgpu_device pointer 2668 * 2669 * Main initialization pass for hardware IPs. The list of all the hardware 2670 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2671 * are run. sw_init initializes the software state associated with each IP 2672 * and hw_init initializes the hardware associated with each IP. 2673 * Returns 0 on success, negative error code on failure. 
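 *
 * Roughly, the order below is: sw_init for every valid block, an early
 * hw_init of the COMMON and GMC blocks so that GPU memory can be used,
 * IB pool and ucode buffer creation, hw_init phase 1 (IH and, under SR-IOV,
 * PSP), firmware loading, and finally hw_init phase 2 for the remaining
 * blocks.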
2674 */ 2675 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2676 { 2677 int i, r; 2678 2679 r = amdgpu_ras_init(adev); 2680 if (r) 2681 return r; 2682 2683 for (i = 0; i < adev->num_ip_blocks; i++) { 2684 if (!adev->ip_blocks[i].status.valid) 2685 continue; 2686 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2687 if (r) { 2688 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2689 adev->ip_blocks[i].version->funcs->name, r); 2690 goto init_failed; 2691 } 2692 adev->ip_blocks[i].status.sw = true; 2693 2694 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2695 /* need to do common hw init early so everything is set up for gmc */ 2696 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2697 if (r) { 2698 DRM_ERROR("hw_init %d failed %d\n", i, r); 2699 goto init_failed; 2700 } 2701 adev->ip_blocks[i].status.hw = true; 2702 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2703 /* need to do gmc hw init early so we can allocate gpu mem */ 2704 /* Try to reserve bad pages early */ 2705 if (amdgpu_sriov_vf(adev)) 2706 amdgpu_virt_exchange_data(adev); 2707 2708 r = amdgpu_device_mem_scratch_init(adev); 2709 if (r) { 2710 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2711 goto init_failed; 2712 } 2713 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2714 if (r) { 2715 DRM_ERROR("hw_init %d failed %d\n", i, r); 2716 goto init_failed; 2717 } 2718 r = amdgpu_device_wb_init(adev); 2719 if (r) { 2720 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2721 goto init_failed; 2722 } 2723 adev->ip_blocks[i].status.hw = true; 2724 2725 /* right after GMC hw init, we create CSA */ 2726 if (adev->gfx.mcbp) { 2727 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2728 AMDGPU_GEM_DOMAIN_VRAM | 2729 AMDGPU_GEM_DOMAIN_GTT, 2730 AMDGPU_CSA_SIZE); 2731 if (r) { 2732 DRM_ERROR("allocate CSA failed %d\n", r); 2733 goto init_failed; 2734 } 2735 } 2736 2737 r = amdgpu_seq64_init(adev); 2738 if (r) { 2739 DRM_ERROR("allocate seq64 failed %d\n", r); 2740 goto init_failed; 2741 } 2742 } 2743 } 2744 2745 if (amdgpu_sriov_vf(adev)) 2746 amdgpu_virt_init_data_exchange(adev); 2747 2748 r = amdgpu_ib_pool_init(adev); 2749 if (r) { 2750 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2751 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2752 goto init_failed; 2753 } 2754 2755 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2756 if (r) 2757 goto init_failed; 2758 2759 r = amdgpu_device_ip_hw_init_phase1(adev); 2760 if (r) 2761 goto init_failed; 2762 2763 r = amdgpu_device_fw_loading(adev); 2764 if (r) 2765 goto init_failed; 2766 2767 r = amdgpu_device_ip_hw_init_phase2(adev); 2768 if (r) 2769 goto init_failed; 2770 2771 /* 2772 * retired pages will be loaded from eeprom and reserved here, 2773 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2774 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2775 * for I2C communication which only true at this point. 2776 * 2777 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2778 * failure from bad gpu situation and stop amdgpu init process 2779 * accordingly. For other failed cases, it will still release all 2780 * the resource and print error message, rather than returning one 2781 * negative value to upper level. 
2782 * 2783 * Note: theoretically, this should be called before all vram allocations 2784 * to protect retired page from abusing 2785 */ 2786 r = amdgpu_ras_recovery_init(adev); 2787 if (r) 2788 goto init_failed; 2789 2790 /** 2791 * In case of XGMI grab extra reference for reset domain for this device 2792 */ 2793 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2794 if (amdgpu_xgmi_add_device(adev) == 0) { 2795 if (!amdgpu_sriov_vf(adev)) { 2796 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2797 2798 if (WARN_ON(!hive)) { 2799 r = -ENOENT; 2800 goto init_failed; 2801 } 2802 2803 if (!hive->reset_domain || 2804 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2805 r = -ENOENT; 2806 amdgpu_put_xgmi_hive(hive); 2807 goto init_failed; 2808 } 2809 2810 /* Drop the early temporary reset domain we created for device */ 2811 amdgpu_reset_put_reset_domain(adev->reset_domain); 2812 adev->reset_domain = hive->reset_domain; 2813 amdgpu_put_xgmi_hive(hive); 2814 } 2815 } 2816 } 2817 2818 r = amdgpu_device_init_schedulers(adev); 2819 if (r) 2820 goto init_failed; 2821 2822 if (adev->mman.buffer_funcs_ring->sched.ready) 2823 amdgpu_ttm_set_buffer_funcs_status(adev, true); 2824 2825 /* Don't init kfd if whole hive need to be reset during init */ 2826 if (!adev->gmc.xgmi.pending_reset) { 2827 kgd2kfd_init_zone_device(adev); 2828 amdgpu_amdkfd_device_init(adev); 2829 } 2830 2831 amdgpu_fru_get_product_info(adev); 2832 2833 init_failed: 2834 2835 return r; 2836 } 2837 2838 /** 2839 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2840 * 2841 * @adev: amdgpu_device pointer 2842 * 2843 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2844 * this function before a GPU reset. If the value is retained after a 2845 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2846 */ 2847 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2848 { 2849 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2850 } 2851 2852 /** 2853 * amdgpu_device_check_vram_lost - check if vram is valid 2854 * 2855 * @adev: amdgpu_device pointer 2856 * 2857 * Checks the reset magic value written to the gart pointer in VRAM. 2858 * The driver calls this after a GPU reset to see if the contents of 2859 * VRAM is lost or now. 2860 * returns true if vram is lost, false if not. 2861 */ 2862 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2863 { 2864 if (memcmp(adev->gart.ptr, adev->reset_magic, 2865 AMDGPU_RESET_MAGIC_NUM)) 2866 return true; 2867 2868 if (!amdgpu_in_reset(adev)) 2869 return false; 2870 2871 /* 2872 * For all ASICs with baco/mode1 reset, the VRAM is 2873 * always assumed to be lost. 2874 */ 2875 switch (amdgpu_asic_reset_method(adev)) { 2876 case AMD_RESET_METHOD_BACO: 2877 case AMD_RESET_METHOD_MODE1: 2878 return true; 2879 default: 2880 return false; 2881 } 2882 } 2883 2884 /** 2885 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2886 * 2887 * @adev: amdgpu_device pointer 2888 * @state: clockgating state (gate or ungate) 2889 * 2890 * The list of all the hardware IPs that make up the asic is walked and the 2891 * set_clockgating_state callbacks are run. 2892 * Late initialization pass enabling clockgating for hardware IPs. 2893 * Fini or suspend, pass disabling clockgating for hardware IPs. 2894 * Returns 0 on success, negative error code on failure. 
2895 */ 2896 2897 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2898 enum amd_clockgating_state state) 2899 { 2900 int i, j, r; 2901 2902 if (amdgpu_emu_mode == 1) 2903 return 0; 2904 2905 for (j = 0; j < adev->num_ip_blocks; j++) { 2906 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2907 if (!adev->ip_blocks[i].status.late_initialized) 2908 continue; 2909 /* skip CG for GFX, SDMA on S0ix */ 2910 if (adev->in_s0ix && 2911 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2912 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2913 continue; 2914 /* skip CG for VCE/UVD, it's handled specially */ 2915 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2916 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2917 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2918 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2919 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2920 /* enable clockgating to save power */ 2921 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2922 state); 2923 if (r) { 2924 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2925 adev->ip_blocks[i].version->funcs->name, r); 2926 return r; 2927 } 2928 } 2929 } 2930 2931 return 0; 2932 } 2933 2934 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2935 enum amd_powergating_state state) 2936 { 2937 int i, j, r; 2938 2939 if (amdgpu_emu_mode == 1) 2940 return 0; 2941 2942 for (j = 0; j < adev->num_ip_blocks; j++) { 2943 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2944 if (!adev->ip_blocks[i].status.late_initialized) 2945 continue; 2946 /* skip PG for GFX, SDMA on S0ix */ 2947 if (adev->in_s0ix && 2948 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2949 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2950 continue; 2951 /* skip CG for VCE/UVD, it's handled specially */ 2952 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2953 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2954 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2955 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2956 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2957 /* enable powergating to save power */ 2958 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2959 state); 2960 if (r) { 2961 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2962 adev->ip_blocks[i].version->funcs->name, r); 2963 return r; 2964 } 2965 } 2966 } 2967 return 0; 2968 } 2969 2970 static int amdgpu_device_enable_mgpu_fan_boost(void) 2971 { 2972 struct amdgpu_gpu_instance *gpu_ins; 2973 struct amdgpu_device *adev; 2974 int i, ret = 0; 2975 2976 mutex_lock(&mgpu_info.mutex); 2977 2978 /* 2979 * MGPU fan boost feature should be enabled 2980 * only when there are two or more dGPUs in 2981 * the system 2982 */ 2983 if (mgpu_info.num_dgpu < 2) 2984 goto out; 2985 2986 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2987 gpu_ins = &(mgpu_info.gpu_ins[i]); 2988 adev = gpu_ins->adev; 2989 if (!(adev->flags & AMD_IS_APU) && 2990 !gpu_ins->mgpu_fan_enabled) { 2991 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2992 if (ret) 2993 break; 2994 2995 gpu_ins->mgpu_fan_enabled = 1; 2996 } 2997 } 2998 2999 out: 3000 mutex_unlock(&mgpu_info.mutex); 3001 3002 return ret; 3003 } 3004 3005 /** 3006 * amdgpu_device_ip_late_init - run late init for hardware IPs 3007 * 3008 * @adev: 
amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of the IPs have been initialized or something that needs to
 * happen late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	r = amdgpu_ras_late_init(adev);
	if (r) {
		DRM_ERROR("amdgpu_ras_late_init failed %d", r);
		return r;
	}

	amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
	if (amdgpu_passthrough(adev) &&
	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
	     adev->asic_type == CHIP_ALDEBARAN))
		amdgpu_dpm_handle_passthrough_sbr(adev, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset device p-state to low as this was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, the number of devices in the hive is not known in
		 * advance; it is only counted up one by one as each device
		 * initializes.
		 *
		 * So, we wait for all XGMI interlinked devices to be initialized.
		 * This may bring some delays as those devices may come from
		 * different hives. But that should be OK.
3074 */ 3075 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3076 for (i = 0; i < mgpu_info.num_gpu; i++) { 3077 gpu_instance = &(mgpu_info.gpu_ins[i]); 3078 if (gpu_instance->adev->flags & AMD_IS_APU) 3079 continue; 3080 3081 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3082 AMDGPU_XGMI_PSTATE_MIN); 3083 if (r) { 3084 DRM_ERROR("pstate setting failed (%d).\n", r); 3085 break; 3086 } 3087 } 3088 } 3089 3090 mutex_unlock(&mgpu_info.mutex); 3091 } 3092 3093 return 0; 3094 } 3095 3096 /** 3097 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3098 * 3099 * @adev: amdgpu_device pointer 3100 * 3101 * For ASICs need to disable SMC first 3102 */ 3103 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3104 { 3105 int i, r; 3106 3107 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3108 return; 3109 3110 for (i = 0; i < adev->num_ip_blocks; i++) { 3111 if (!adev->ip_blocks[i].status.hw) 3112 continue; 3113 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3114 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3115 /* XXX handle errors */ 3116 if (r) { 3117 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3118 adev->ip_blocks[i].version->funcs->name, r); 3119 } 3120 adev->ip_blocks[i].status.hw = false; 3121 break; 3122 } 3123 } 3124 } 3125 3126 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3127 { 3128 int i, r; 3129 3130 for (i = 0; i < adev->num_ip_blocks; i++) { 3131 if (!adev->ip_blocks[i].version->funcs->early_fini) 3132 continue; 3133 3134 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 3135 if (r) { 3136 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3137 adev->ip_blocks[i].version->funcs->name, r); 3138 } 3139 } 3140 3141 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3142 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3143 3144 amdgpu_amdkfd_suspend(adev, false); 3145 3146 /* Workaroud for ASICs need to disable SMC first */ 3147 amdgpu_device_smu_fini_early(adev); 3148 3149 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3150 if (!adev->ip_blocks[i].status.hw) 3151 continue; 3152 3153 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3154 /* XXX handle errors */ 3155 if (r) { 3156 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3157 adev->ip_blocks[i].version->funcs->name, r); 3158 } 3159 3160 adev->ip_blocks[i].status.hw = false; 3161 } 3162 3163 if (amdgpu_sriov_vf(adev)) { 3164 if (amdgpu_virt_release_full_gpu(adev, false)) 3165 DRM_ERROR("failed to release exclusive mode on fini\n"); 3166 } 3167 3168 return 0; 3169 } 3170 3171 /** 3172 * amdgpu_device_ip_fini - run fini for hardware IPs 3173 * 3174 * @adev: amdgpu_device pointer 3175 * 3176 * Main teardown pass for hardware IPs. The list of all the hardware 3177 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3178 * are run. hw_fini tears down the hardware associated with each IP 3179 * and sw_fini tears down any software state associated with each IP. 3180 * Returns 0 on success, negative error code on failure. 
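 *
 * The IP list is walked in reverse order here, so the blocks that were added
 * and initialized last are the first ones to be torn down.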
3181 */ 3182 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3183 { 3184 int i, r; 3185 3186 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3187 amdgpu_virt_release_ras_err_handler_data(adev); 3188 3189 if (adev->gmc.xgmi.num_physical_nodes > 1) 3190 amdgpu_xgmi_remove_device(adev); 3191 3192 amdgpu_amdkfd_device_fini_sw(adev); 3193 3194 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3195 if (!adev->ip_blocks[i].status.sw) 3196 continue; 3197 3198 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3199 amdgpu_ucode_free_bo(adev); 3200 amdgpu_free_static_csa(&adev->virt.csa_obj); 3201 amdgpu_device_wb_fini(adev); 3202 amdgpu_device_mem_scratch_fini(adev); 3203 amdgpu_ib_pool_fini(adev); 3204 amdgpu_seq64_fini(adev); 3205 } 3206 3207 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 3208 /* XXX handle errors */ 3209 if (r) { 3210 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3211 adev->ip_blocks[i].version->funcs->name, r); 3212 } 3213 adev->ip_blocks[i].status.sw = false; 3214 adev->ip_blocks[i].status.valid = false; 3215 } 3216 3217 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3218 if (!adev->ip_blocks[i].status.late_initialized) 3219 continue; 3220 if (adev->ip_blocks[i].version->funcs->late_fini) 3221 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 3222 adev->ip_blocks[i].status.late_initialized = false; 3223 } 3224 3225 amdgpu_ras_fini(adev); 3226 3227 return 0; 3228 } 3229 3230 /** 3231 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3232 * 3233 * @work: work_struct. 3234 */ 3235 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3236 { 3237 struct amdgpu_device *adev = 3238 container_of(work, struct amdgpu_device, delayed_init_work.work); 3239 int r; 3240 3241 r = amdgpu_ib_ring_tests(adev); 3242 if (r) 3243 DRM_ERROR("ib ring test failed (%d).\n", r); 3244 } 3245 3246 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3247 { 3248 struct amdgpu_device *adev = 3249 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3250 3251 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3252 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3253 3254 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3255 adev->gfx.gfx_off_state = true; 3256 } 3257 3258 /** 3259 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3260 * 3261 * @adev: amdgpu_device pointer 3262 * 3263 * Main suspend function for hardware IPs. The list of all the hardware 3264 * IPs that make up the asic is walked, clockgating is disabled and the 3265 * suspend callbacks are run. suspend puts the hardware and software state 3266 * in each IP into a state suitable for suspend. 3267 * Returns 0 on success, negative error code on failure. 3268 */ 3269 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3270 { 3271 int i, r; 3272 3273 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3274 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3275 3276 /* 3277 * Per PMFW team's suggestion, driver needs to handle gfxoff 3278 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3279 * scenario. Add the missing df cstate disablement here. 
3280 */ 3281 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3282 dev_warn(adev->dev, "Failed to disallow df cstate"); 3283 3284 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3285 if (!adev->ip_blocks[i].status.valid) 3286 continue; 3287 3288 /* displays are handled separately */ 3289 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3290 continue; 3291 3292 /* XXX handle errors */ 3293 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3294 /* XXX handle errors */ 3295 if (r) { 3296 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3297 adev->ip_blocks[i].version->funcs->name, r); 3298 return r; 3299 } 3300 3301 adev->ip_blocks[i].status.hw = false; 3302 } 3303 3304 return 0; 3305 } 3306 3307 /** 3308 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3309 * 3310 * @adev: amdgpu_device pointer 3311 * 3312 * Main suspend function for hardware IPs. The list of all the hardware 3313 * IPs that make up the asic is walked, clockgating is disabled and the 3314 * suspend callbacks are run. suspend puts the hardware and software state 3315 * in each IP into a state suitable for suspend. 3316 * Returns 0 on success, negative error code on failure. 3317 */ 3318 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3319 { 3320 int i, r; 3321 3322 if (adev->in_s0ix) 3323 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3324 3325 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3326 if (!adev->ip_blocks[i].status.valid) 3327 continue; 3328 /* displays are handled in phase1 */ 3329 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3330 continue; 3331 /* PSP lost connection when err_event_athub occurs */ 3332 if (amdgpu_ras_intr_triggered() && 3333 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3334 adev->ip_blocks[i].status.hw = false; 3335 continue; 3336 } 3337 3338 /* skip unnecessary suspend if we do not initialize them yet */ 3339 if (adev->gmc.xgmi.pending_reset && 3340 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3341 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3342 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3343 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3344 adev->ip_blocks[i].status.hw = false; 3345 continue; 3346 } 3347 3348 /* skip suspend of gfx/mes and psp for S0ix 3349 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3350 * like at runtime. PSP is also part of the always on hardware 3351 * so no need to suspend it. 3352 */ 3353 if (adev->in_s0ix && 3354 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3355 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3356 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3357 continue; 3358 3359 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3360 if (adev->in_s0ix && 3361 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3362 IP_VERSION(5, 0, 0)) && 3363 (adev->ip_blocks[i].version->type == 3364 AMD_IP_BLOCK_TYPE_SDMA)) 3365 continue; 3366 3367 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3368 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3369 * from this location and RLC Autoload automatically also gets loaded 3370 * from here based on PMFW -> PSP message during re-init sequence. 3371 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3372 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3373 */ 3374 if (amdgpu_in_reset(adev) && 3375 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3376 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3377 continue; 3378 3379 /* XXX handle errors */ 3380 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3381 /* XXX handle errors */ 3382 if (r) { 3383 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3384 adev->ip_blocks[i].version->funcs->name, r); 3385 } 3386 adev->ip_blocks[i].status.hw = false; 3387 /* handle putting the SMC in the appropriate state */ 3388 if (!amdgpu_sriov_vf(adev)) { 3389 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3390 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3391 if (r) { 3392 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3393 adev->mp1_state, r); 3394 return r; 3395 } 3396 } 3397 } 3398 } 3399 3400 return 0; 3401 } 3402 3403 /** 3404 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3405 * 3406 * @adev: amdgpu_device pointer 3407 * 3408 * Main suspend function for hardware IPs. The list of all the hardware 3409 * IPs that make up the asic is walked, clockgating is disabled and the 3410 * suspend callbacks are run. suspend puts the hardware and software state 3411 * in each IP into a state suitable for suspend. 3412 * Returns 0 on success, negative error code on failure. 3413 */ 3414 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3415 { 3416 int r; 3417 3418 if (amdgpu_sriov_vf(adev)) { 3419 amdgpu_virt_fini_data_exchange(adev); 3420 amdgpu_virt_request_full_gpu(adev, false); 3421 } 3422 3423 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3424 3425 r = amdgpu_device_ip_suspend_phase1(adev); 3426 if (r) 3427 return r; 3428 r = amdgpu_device_ip_suspend_phase2(adev); 3429 3430 if (amdgpu_sriov_vf(adev)) 3431 amdgpu_virt_release_full_gpu(adev, false); 3432 3433 return r; 3434 } 3435 3436 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3437 { 3438 int i, r; 3439 3440 static enum amd_ip_block_type ip_order[] = { 3441 AMD_IP_BLOCK_TYPE_COMMON, 3442 AMD_IP_BLOCK_TYPE_GMC, 3443 AMD_IP_BLOCK_TYPE_PSP, 3444 AMD_IP_BLOCK_TYPE_IH, 3445 }; 3446 3447 for (i = 0; i < adev->num_ip_blocks; i++) { 3448 int j; 3449 struct amdgpu_ip_block *block; 3450 3451 block = &adev->ip_blocks[i]; 3452 block->status.hw = false; 3453 3454 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3455 3456 if (block->version->type != ip_order[j] || 3457 !block->status.valid) 3458 continue; 3459 3460 r = block->version->funcs->hw_init(adev); 3461 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3462 if (r) 3463 return r; 3464 block->status.hw = true; 3465 } 3466 } 3467 3468 return 0; 3469 } 3470 3471 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3472 { 3473 int i, r; 3474 3475 static enum amd_ip_block_type ip_order[] = { 3476 AMD_IP_BLOCK_TYPE_SMC, 3477 AMD_IP_BLOCK_TYPE_DCE, 3478 AMD_IP_BLOCK_TYPE_GFX, 3479 AMD_IP_BLOCK_TYPE_SDMA, 3480 AMD_IP_BLOCK_TYPE_MES, 3481 AMD_IP_BLOCK_TYPE_UVD, 3482 AMD_IP_BLOCK_TYPE_VCE, 3483 AMD_IP_BLOCK_TYPE_VCN, 3484 AMD_IP_BLOCK_TYPE_JPEG 3485 }; 3486 3487 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3488 int j; 3489 struct amdgpu_ip_block *block; 3490 3491 for (j = 0; j < adev->num_ip_blocks; j++) { 3492 block = &adev->ip_blocks[j]; 3493 3494 if (block->version->type != ip_order[i] || 3495 !block->status.valid || 3496 block->status.hw) 3497 continue; 3498 3499 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3500 r = block->version->funcs->resume(adev); 3501 else 
3502 r = block->version->funcs->hw_init(adev); 3503 3504 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3505 if (r) 3506 return r; 3507 block->status.hw = true; 3508 } 3509 } 3510 3511 return 0; 3512 } 3513 3514 /** 3515 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3516 * 3517 * @adev: amdgpu_device pointer 3518 * 3519 * First resume function for hardware IPs. The list of all the hardware 3520 * IPs that make up the asic is walked and the resume callbacks are run for 3521 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3522 * after a suspend and updates the software state as necessary. This 3523 * function is also used for restoring the GPU after a GPU reset. 3524 * Returns 0 on success, negative error code on failure. 3525 */ 3526 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3527 { 3528 int i, r; 3529 3530 for (i = 0; i < adev->num_ip_blocks; i++) { 3531 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3532 continue; 3533 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3534 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3535 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3536 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3537 3538 r = adev->ip_blocks[i].version->funcs->resume(adev); 3539 if (r) { 3540 DRM_ERROR("resume of IP block <%s> failed %d\n", 3541 adev->ip_blocks[i].version->funcs->name, r); 3542 return r; 3543 } 3544 adev->ip_blocks[i].status.hw = true; 3545 } 3546 } 3547 3548 return 0; 3549 } 3550 3551 /** 3552 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3553 * 3554 * @adev: amdgpu_device pointer 3555 * 3556 * First resume function for hardware IPs. The list of all the hardware 3557 * IPs that make up the asic is walked and the resume callbacks are run for 3558 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3559 * functional state after a suspend and updates the software state as 3560 * necessary. This function is also used for restoring the GPU after a GPU 3561 * reset. 3562 * Returns 0 on success, negative error code on failure. 3563 */ 3564 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3565 { 3566 int i, r; 3567 3568 for (i = 0; i < adev->num_ip_blocks; i++) { 3569 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3570 continue; 3571 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3572 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3573 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3574 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3575 continue; 3576 r = adev->ip_blocks[i].version->funcs->resume(adev); 3577 if (r) { 3578 DRM_ERROR("resume of IP block <%s> failed %d\n", 3579 adev->ip_blocks[i].version->funcs->name, r); 3580 return r; 3581 } 3582 adev->ip_blocks[i].status.hw = true; 3583 } 3584 3585 return 0; 3586 } 3587 3588 /** 3589 * amdgpu_device_ip_resume - run resume for hardware IPs 3590 * 3591 * @adev: amdgpu_device pointer 3592 * 3593 * Main resume function for hardware IPs. The hardware IPs 3594 * are split into two resume functions because they are 3595 * also used in recovering from a GPU reset and some additional 3596 * steps need to be take between them. In this case (S3/S4) they are 3597 * run sequentially. 3598 * Returns 0 on success, negative error code on failure. 
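 *
 * Between the two phases amdgpu_device_fw_loading() is called so that the
 * SMU/PSP managed firmware is in place before the remaining blocks resume.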
3599 */ 3600 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3601 { 3602 int r; 3603 3604 r = amdgpu_device_ip_resume_phase1(adev); 3605 if (r) 3606 return r; 3607 3608 r = amdgpu_device_fw_loading(adev); 3609 if (r) 3610 return r; 3611 3612 r = amdgpu_device_ip_resume_phase2(adev); 3613 3614 if (adev->mman.buffer_funcs_ring->sched.ready) 3615 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3616 3617 return r; 3618 } 3619 3620 /** 3621 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3622 * 3623 * @adev: amdgpu_device pointer 3624 * 3625 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3626 */ 3627 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3628 { 3629 if (amdgpu_sriov_vf(adev)) { 3630 if (adev->is_atom_fw) { 3631 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3632 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3633 } else { 3634 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3635 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3636 } 3637 3638 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3639 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3640 } 3641 } 3642 3643 /** 3644 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3645 * 3646 * @asic_type: AMD asic type 3647 * 3648 * Check if there is DC (new modesetting infrastructre) support for an asic. 3649 * returns true if DC has support, false if not. 3650 */ 3651 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3652 { 3653 switch (asic_type) { 3654 #ifdef CONFIG_DRM_AMDGPU_SI 3655 case CHIP_HAINAN: 3656 #endif 3657 case CHIP_TOPAZ: 3658 /* chips with no display hardware */ 3659 return false; 3660 #if defined(CONFIG_DRM_AMD_DC) 3661 case CHIP_TAHITI: 3662 case CHIP_PITCAIRN: 3663 case CHIP_VERDE: 3664 case CHIP_OLAND: 3665 /* 3666 * We have systems in the wild with these ASICs that require 3667 * LVDS and VGA support which is not supported with DC. 3668 * 3669 * Fallback to the non-DC driver here by default so as not to 3670 * cause regressions. 3671 */ 3672 #if defined(CONFIG_DRM_AMD_DC_SI) 3673 return amdgpu_dc > 0; 3674 #else 3675 return false; 3676 #endif 3677 case CHIP_BONAIRE: 3678 case CHIP_KAVERI: 3679 case CHIP_KABINI: 3680 case CHIP_MULLINS: 3681 /* 3682 * We have systems in the wild with these ASICs that require 3683 * VGA support which is not supported with DC. 3684 * 3685 * Fallback to the non-DC driver here by default so as not to 3686 * cause regressions. 
3687 */ 3688 return amdgpu_dc > 0; 3689 default: 3690 return amdgpu_dc != 0; 3691 #else 3692 default: 3693 if (amdgpu_dc > 0) 3694 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3695 return false; 3696 #endif 3697 } 3698 } 3699 3700 /** 3701 * amdgpu_device_has_dc_support - check if dc is supported 3702 * 3703 * @adev: amdgpu_device pointer 3704 * 3705 * Returns true for supported, false for not supported 3706 */ 3707 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3708 { 3709 if (adev->enable_virtual_display || 3710 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3711 return false; 3712 3713 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3714 } 3715 3716 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3717 { 3718 struct amdgpu_device *adev = 3719 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3720 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3721 3722 /* It's a bug to not have a hive within this function */ 3723 if (WARN_ON(!hive)) 3724 return; 3725 3726 /* 3727 * Use task barrier to synchronize all xgmi reset works across the 3728 * hive. task_barrier_enter and task_barrier_exit will block 3729 * until all the threads running the xgmi reset works reach 3730 * those points. task_barrier_full will do both blocks. 3731 */ 3732 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3733 3734 task_barrier_enter(&hive->tb); 3735 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3736 3737 if (adev->asic_reset_res) 3738 goto fail; 3739 3740 task_barrier_exit(&hive->tb); 3741 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3742 3743 if (adev->asic_reset_res) 3744 goto fail; 3745 3746 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3747 } else { 3748 3749 task_barrier_full(&hive->tb); 3750 adev->asic_reset_res = amdgpu_asic_reset(adev); 3751 } 3752 3753 fail: 3754 if (adev->asic_reset_res) 3755 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3756 adev->asic_reset_res, adev_to_drm(adev)->unique); 3757 amdgpu_put_xgmi_hive(hive); 3758 } 3759 3760 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3761 { 3762 char *input = amdgpu_lockup_timeout; 3763 char *timeout_setting = NULL; 3764 int index = 0; 3765 long timeout; 3766 int ret = 0; 3767 3768 /* 3769 * By default timeout for non compute jobs is 10000 3770 * and 60000 for compute jobs. 3771 * In SR-IOV or passthrough mode, timeout for compute 3772 * jobs are 60000 by default. 3773 */ 3774 adev->gfx_timeout = msecs_to_jiffies(10000); 3775 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3776 if (amdgpu_sriov_vf(adev)) 3777 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3778 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3779 else 3780 adev->compute_timeout = msecs_to_jiffies(60000); 3781 3782 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3783 while ((timeout_setting = strsep(&input, ",")) && 3784 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3785 ret = kstrtol(timeout_setting, 0, &timeout); 3786 if (ret) 3787 return ret; 3788 3789 if (timeout == 0) { 3790 index++; 3791 continue; 3792 } else if (timeout < 0) { 3793 timeout = MAX_SCHEDULE_TIMEOUT; 3794 dev_warn(adev->dev, "lockup timeout disabled"); 3795 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3796 } else { 3797 timeout = msecs_to_jiffies(timeout); 3798 } 3799 3800 switch (index++) { 3801 case 0: 3802 adev->gfx_timeout = timeout; 3803 break; 3804 case 1: 3805 adev->compute_timeout = timeout; 3806 break; 3807 case 2: 3808 adev->sdma_timeout = timeout; 3809 break; 3810 case 3: 3811 adev->video_timeout = timeout; 3812 break; 3813 default: 3814 break; 3815 } 3816 } 3817 /* 3818 * There is only one value specified and 3819 * it should apply to all non-compute jobs. 3820 */ 3821 if (index == 1) { 3822 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3823 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3824 adev->compute_timeout = adev->gfx_timeout; 3825 } 3826 } 3827 3828 return ret; 3829 } 3830 3831 /** 3832 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3833 * 3834 * @adev: amdgpu_device pointer 3835 * 3836 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3837 */ 3838 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3839 { 3840 struct iommu_domain *domain; 3841 3842 domain = iommu_get_domain_for_dev(adev->dev); 3843 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3844 adev->ram_is_direct_mapped = true; 3845 } 3846 3847 static const struct attribute *amdgpu_dev_attributes[] = { 3848 &dev_attr_pcie_replay_count.attr, 3849 NULL 3850 }; 3851 3852 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3853 { 3854 if (amdgpu_mcbp == 1) 3855 adev->gfx.mcbp = true; 3856 else if (amdgpu_mcbp == 0) 3857 adev->gfx.mcbp = false; 3858 3859 if (amdgpu_sriov_vf(adev)) 3860 adev->gfx.mcbp = true; 3861 3862 if (adev->gfx.mcbp) 3863 DRM_INFO("MCBP is enabled\n"); 3864 } 3865 3866 /** 3867 * amdgpu_device_init - initialize the driver 3868 * 3869 * @adev: amdgpu_device pointer 3870 * @flags: driver flags 3871 * 3872 * Initializes the driver info and hw (all asics). 3873 * Returns 0 for success or an error on failure. 3874 * Called at driver startup. 
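 *
 * The first part of this function only sets up software state: defaults,
 * invalid-register accessor stubs, mutexes, spinlocks and work items. The
 * hardware itself is not touched until the MMIO BAR is mapped further down.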
3875 */ 3876 int amdgpu_device_init(struct amdgpu_device *adev, 3877 uint32_t flags) 3878 { 3879 struct drm_device *ddev = adev_to_drm(adev); 3880 struct pci_dev *pdev = adev->pdev; 3881 int r, i; 3882 bool px = false; 3883 u32 max_MBps; 3884 int tmp; 3885 3886 adev->shutdown = false; 3887 adev->flags = flags; 3888 3889 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3890 adev->asic_type = amdgpu_force_asic_type; 3891 else 3892 adev->asic_type = flags & AMD_ASIC_MASK; 3893 3894 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3895 if (amdgpu_emu_mode == 1) 3896 adev->usec_timeout *= 10; 3897 adev->gmc.gart_size = 512 * 1024 * 1024; 3898 adev->accel_working = false; 3899 adev->num_rings = 0; 3900 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3901 adev->mman.buffer_funcs = NULL; 3902 adev->mman.buffer_funcs_ring = NULL; 3903 adev->vm_manager.vm_pte_funcs = NULL; 3904 adev->vm_manager.vm_pte_num_scheds = 0; 3905 adev->gmc.gmc_funcs = NULL; 3906 adev->harvest_ip_mask = 0x0; 3907 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3908 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3909 3910 adev->smc_rreg = &amdgpu_invalid_rreg; 3911 adev->smc_wreg = &amdgpu_invalid_wreg; 3912 adev->pcie_rreg = &amdgpu_invalid_rreg; 3913 adev->pcie_wreg = &amdgpu_invalid_wreg; 3914 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3915 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3916 adev->pciep_rreg = &amdgpu_invalid_rreg; 3917 adev->pciep_wreg = &amdgpu_invalid_wreg; 3918 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3919 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3920 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 3921 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 3922 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3923 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3924 adev->didt_rreg = &amdgpu_invalid_rreg; 3925 adev->didt_wreg = &amdgpu_invalid_wreg; 3926 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3927 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3928 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3929 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3930 3931 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3932 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3933 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3934 3935 /* mutex initialization are all done here so we 3936 * can recall function without having locking issues 3937 */ 3938 mutex_init(&adev->firmware.mutex); 3939 mutex_init(&adev->pm.mutex); 3940 mutex_init(&adev->gfx.gpu_clock_mutex); 3941 mutex_init(&adev->srbm_mutex); 3942 mutex_init(&adev->gfx.pipe_reserve_mutex); 3943 mutex_init(&adev->gfx.gfx_off_mutex); 3944 mutex_init(&adev->gfx.partition_mutex); 3945 mutex_init(&adev->grbm_idx_mutex); 3946 mutex_init(&adev->mn_lock); 3947 mutex_init(&adev->virt.vf_errors.lock); 3948 hash_init(adev->mn_hash); 3949 mutex_init(&adev->psp.mutex); 3950 mutex_init(&adev->notifier_lock); 3951 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3952 mutex_init(&adev->benchmark_mutex); 3953 3954 amdgpu_device_init_apu_flags(adev); 3955 3956 r = amdgpu_device_check_arguments(adev); 3957 if (r) 3958 return r; 3959 3960 spin_lock_init(&adev->mmio_idx_lock); 3961 spin_lock_init(&adev->smc_idx_lock); 3962 spin_lock_init(&adev->pcie_idx_lock); 3963 spin_lock_init(&adev->uvd_ctx_idx_lock); 3964 spin_lock_init(&adev->didt_idx_lock); 3965 spin_lock_init(&adev->gc_cac_idx_lock); 3966 spin_lock_init(&adev->se_cac_idx_lock); 
3967 spin_lock_init(&adev->audio_endpt_idx_lock); 3968 spin_lock_init(&adev->mm_stats.lock); 3969 3970 INIT_LIST_HEAD(&adev->shadow_list); 3971 mutex_init(&adev->shadow_list_lock); 3972 3973 INIT_LIST_HEAD(&adev->reset_list); 3974 3975 INIT_LIST_HEAD(&adev->ras_list); 3976 3977 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 3978 3979 INIT_DELAYED_WORK(&adev->delayed_init_work, 3980 amdgpu_device_delayed_init_work_handler); 3981 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3982 amdgpu_device_delay_enable_gfx_off); 3983 3984 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3985 3986 adev->gfx.gfx_off_req_count = 1; 3987 adev->gfx.gfx_off_residency = 0; 3988 adev->gfx.gfx_off_entrycount = 0; 3989 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3990 3991 atomic_set(&adev->throttling_logging_enabled, 1); 3992 /* 3993 * If throttling continues, logging will be performed every minute 3994 * to avoid log flooding. "-1" is subtracted since the thermal 3995 * throttling interrupt comes every second. Thus, the total logging 3996 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3997 * for the throttling interrupt) = 60 seconds. 3998 */ 3999 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4000 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4001 4002 /* Register mapping */ 4003 /* TODO: block userspace mapping of io register */ 4004 if (adev->asic_type >= CHIP_BONAIRE) { 4005 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4006 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4007 } else { 4008 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4009 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4010 } 4011 4012 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4013 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4014 4015 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4016 if (!adev->rmmio) 4017 return -ENOMEM; 4018 4019 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 4020 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 4021 4022 /* 4023 * The reset domain needs to be present early, before the XGMI hive is 4024 * discovered (if any) and initialized, so the reset sem and in_gpu_reset 4025 * flag can be used early during init and before any call to RREG32.
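* The same reset domain also provides the reset semaphore used during GPU recovery (see amdgpu_device_lock_reset_domain() below).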
4026 */ 4027 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4028 if (!adev->reset_domain) 4029 return -ENOMEM; 4030 4031 /* detect hw virtualization here */ 4032 amdgpu_detect_virtualization(adev); 4033 4034 amdgpu_device_get_pcie_info(adev); 4035 4036 r = amdgpu_device_get_job_timeout_settings(adev); 4037 if (r) { 4038 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4039 return r; 4040 } 4041 4042 /* early init functions */ 4043 r = amdgpu_device_ip_early_init(adev); 4044 if (r) 4045 return r; 4046 4047 amdgpu_device_set_mcbp(adev); 4048 4049 /* Get rid of things like offb */ 4050 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 4051 if (r) 4052 return r; 4053 4054 /* Enable TMZ based on IP_VERSION */ 4055 amdgpu_gmc_tmz_set(adev); 4056 4057 amdgpu_gmc_noretry_set(adev); 4058 /* Need to get xgmi info early to decide the reset behavior */ 4059 if (adev->gmc.xgmi.supported) { 4060 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4061 if (r) 4062 return r; 4063 } 4064 4065 /* enable PCIE atomic ops */ 4066 if (amdgpu_sriov_vf(adev)) { 4067 if (adev->virt.fw_reserve.p_pf2vf) 4068 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4069 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4070 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4071 /* APUs with gfx9 onwards don't rely on PCIe atomics; their 4072 * internal path natively supports atomics, so set have_atomics_support to true. 4073 */ 4074 } else if ((adev->flags & AMD_IS_APU) && 4075 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4076 IP_VERSION(9, 0, 0))) { 4077 adev->have_atomics_support = true; 4078 } else { 4079 adev->have_atomics_support = 4080 !pci_enable_atomic_ops_to_root(adev->pdev, 4081 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4082 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4083 } 4084 4085 if (!adev->have_atomics_support) 4086 dev_info(adev->dev, "PCIE atomic ops are not supported\n"); 4087 4088 /* doorbell bar mapping and doorbell index init */ 4089 amdgpu_doorbell_init(adev); 4090 4091 if (amdgpu_emu_mode == 1) { 4092 /* post the asic in emulation mode */ 4093 emu_soc_asic_init(adev); 4094 goto fence_driver_init; 4095 } 4096 4097 amdgpu_reset_init(adev); 4098 4099 /* detect if we have an SR-IOV vbios */ 4100 if (adev->bios) 4101 amdgpu_device_detect_sriov_bios(adev); 4102 4103 /* check if we need to reset the asic, 4104 * e.g., the driver was not cleanly unloaded previously, etc.
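* For an XGMI hive the actual reset is deferred (pending_reset) so the SMU can handle it later; for a single device a default reset is performed here regardless of the reset_method module parameter.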
4105 */ 4106 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4107 if (adev->gmc.xgmi.num_physical_nodes) { 4108 dev_info(adev->dev, "Pending hive reset.\n"); 4109 adev->gmc.xgmi.pending_reset = true; 4110 /* Only need to init necessary block for SMU to handle the reset */ 4111 for (i = 0; i < adev->num_ip_blocks; i++) { 4112 if (!adev->ip_blocks[i].status.valid) 4113 continue; 4114 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4115 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4116 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4117 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 4118 DRM_DEBUG("IP %s disabled for hw_init.\n", 4119 adev->ip_blocks[i].version->funcs->name); 4120 adev->ip_blocks[i].status.hw = true; 4121 } 4122 } 4123 } else { 4124 switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { 4125 case IP_VERSION(13, 0, 0): 4126 case IP_VERSION(13, 0, 7): 4127 case IP_VERSION(13, 0, 10): 4128 r = psp_gpu_reset(adev); 4129 break; 4130 default: 4131 tmp = amdgpu_reset_method; 4132 /* It should do a default reset when loading or reloading the driver, 4133 * regardless of the module parameter reset_method. 4134 */ 4135 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4136 r = amdgpu_asic_reset(adev); 4137 amdgpu_reset_method = tmp; 4138 break; 4139 } 4140 4141 if (r) { 4142 dev_err(adev->dev, "asic reset on init failed\n"); 4143 goto failed; 4144 } 4145 } 4146 } 4147 4148 /* Post card if necessary */ 4149 if (amdgpu_device_need_post(adev)) { 4150 if (!adev->bios) { 4151 dev_err(adev->dev, "no vBIOS found\n"); 4152 r = -EINVAL; 4153 goto failed; 4154 } 4155 DRM_INFO("GPU posting now...\n"); 4156 r = amdgpu_device_asic_init(adev); 4157 if (r) { 4158 dev_err(adev->dev, "gpu post error!\n"); 4159 goto failed; 4160 } 4161 } 4162 4163 if (adev->bios) { 4164 if (adev->is_atom_fw) { 4165 /* Initialize clocks */ 4166 r = amdgpu_atomfirmware_get_clock_info(adev); 4167 if (r) { 4168 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4169 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4170 goto failed; 4171 } 4172 } else { 4173 /* Initialize clocks */ 4174 r = amdgpu_atombios_get_clock_info(adev); 4175 if (r) { 4176 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4177 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4178 goto failed; 4179 } 4180 /* init i2c buses */ 4181 if (!amdgpu_device_has_dc_support(adev)) 4182 amdgpu_atombios_i2c_init(adev); 4183 } 4184 } 4185 4186 fence_driver_init: 4187 /* Fence driver */ 4188 r = amdgpu_fence_driver_sw_init(adev); 4189 if (r) { 4190 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4191 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4192 goto failed; 4193 } 4194 4195 /* init the mode config */ 4196 drm_mode_config_init(adev_to_drm(adev)); 4197 4198 r = amdgpu_device_ip_init(adev); 4199 if (r) { 4200 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4201 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4202 goto release_ras_con; 4203 } 4204 4205 amdgpu_fence_driver_hw_init(adev); 4206 4207 dev_info(adev->dev, 4208 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4209 adev->gfx.config.max_shader_engines, 4210 adev->gfx.config.max_sh_per_se, 4211 adev->gfx.config.max_cu_per_sh, 4212 adev->gfx.cu_info.number); 4213 4214 adev->accel_working = true; 4215 4216 amdgpu_vm_check_compute_bug(adev); 4217 4218 /* Initialize the buffer migration 
limit. */ 4219 if (amdgpu_moverate >= 0) 4220 max_MBps = amdgpu_moverate; 4221 else 4222 max_MBps = 8; /* Allow 8 MB/s. */ 4223 /* Get a log2 for easy divisions. */ 4224 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4225 4226 /* 4227 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4228 * Otherwise the mgpu fan boost feature will be skipped because the 4229 * gpu instance count would be too low. 4230 */ 4231 amdgpu_register_gpu_instance(adev); 4232 4233 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4234 * explicit gating rather than handling it automatically. 4235 */ 4236 if (!adev->gmc.xgmi.pending_reset) { 4237 r = amdgpu_device_ip_late_init(adev); 4238 if (r) { 4239 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4240 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4241 goto release_ras_con; 4242 } 4243 /* must succeed. */ 4244 amdgpu_ras_resume(adev); 4245 queue_delayed_work(system_wq, &adev->delayed_init_work, 4246 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4247 } 4248 4249 if (amdgpu_sriov_vf(adev)) { 4250 amdgpu_virt_release_full_gpu(adev, true); 4251 flush_delayed_work(&adev->delayed_init_work); 4252 } 4253 4254 /* 4255 * Register these sysfs interfaces after `late_init`, since some of the 4256 * operations performed in `late_init` might affect their 4257 * creation. 4258 */ 4259 r = amdgpu_atombios_sysfs_init(adev); 4260 if (r) 4261 drm_err(&adev->ddev, 4262 "registering atombios sysfs failed (%d).\n", r); 4263 4264 r = amdgpu_pm_sysfs_init(adev); 4265 if (r) 4266 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4267 4268 r = amdgpu_ucode_sysfs_init(adev); 4269 if (r) { 4270 adev->ucode_sysfs_en = false; 4271 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4272 } else 4273 adev->ucode_sysfs_en = true; 4274 4275 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4276 if (r) 4277 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4278 4279 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4280 if (r) 4281 dev_err(adev->dev, 4282 "Could not create amdgpu board attributes\n"); 4283 4284 amdgpu_fru_sysfs_init(adev); 4285 amdgpu_reg_state_sysfs_init(adev); 4286 4287 if (IS_ENABLED(CONFIG_PERF_EVENTS)) { 4288 r = amdgpu_pmu_init(adev); 4289 if (r) 4290 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4291 } 4292 /* Keep the stored PCI config space at hand so it can be restored after a sudden PCI error */ 4293 if (amdgpu_device_cache_pci_state(adev->pdev)) 4294 pci_restore_state(pdev); 4295 4296 /* if we have more than one VGA card, then disable the amdgpu VGA resources */ 4297 /* this will fail for cards that aren't VGA class devices, just 4298 * ignore it 4299 */ 4300 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4301 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4302 4303 px = amdgpu_device_supports_px(ddev); 4304 4305 if (px || (!dev_is_removable(&adev->pdev->dev) && 4306 apple_gmux_detect(NULL, NULL))) 4307 vga_switcheroo_register_client(adev->pdev, 4308 &amdgpu_switcheroo_ops, px); 4309 4310 if (px) 4311 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4312 4313 if (adev->gmc.xgmi.pending_reset) 4314 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4315 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4316 4317 amdgpu_device_check_iommu_direct_map(adev); 4318 4319 return 0; 4320 4321 release_ras_con: 4322 if (amdgpu_sriov_vf(adev)) 4323 amdgpu_virt_release_full_gpu(adev, true); 4324 4325 /* failed in exclusive mode due to timeout */
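/* In that case MMIO access is blocked: clear the SR-IOV runtime capability, drop the virt ops and return -EAGAIN instead of the original error. */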
4326 if (amdgpu_sriov_vf(adev) && 4327 !amdgpu_sriov_runtime(adev) && 4328 amdgpu_virt_mmio_blocked(adev) && 4329 !amdgpu_virt_wait_reset(adev)) { 4330 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4331 /* Don't send request since VF is inactive. */ 4332 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4333 adev->virt.ops = NULL; 4334 r = -EAGAIN; 4335 } 4336 amdgpu_release_ras_context(adev); 4337 4338 failed: 4339 amdgpu_vf_error_trans_all(adev); 4340 4341 return r; 4342 } 4343 4344 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4345 { 4346 4347 /* Clear all CPU mappings pointing to this device */ 4348 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4349 4350 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4351 amdgpu_doorbell_fini(adev); 4352 4353 iounmap(adev->rmmio); 4354 adev->rmmio = NULL; 4355 if (adev->mman.aper_base_kaddr) 4356 iounmap(adev->mman.aper_base_kaddr); 4357 adev->mman.aper_base_kaddr = NULL; 4358 4359 /* Memory manager related */ 4360 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4361 arch_phys_wc_del(adev->gmc.vram_mtrr); 4362 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4363 } 4364 } 4365 4366 /** 4367 * amdgpu_device_fini_hw - tear down the driver 4368 * 4369 * @adev: amdgpu_device pointer 4370 * 4371 * Tear down the driver info (all asics). 4372 * Called at driver shutdown. 4373 */ 4374 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4375 { 4376 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4377 flush_delayed_work(&adev->delayed_init_work); 4378 adev->shutdown = true; 4379 4380 /* make sure IB test finished before entering exclusive mode 4381 * to avoid preemption on IB test 4382 */ 4383 if (amdgpu_sriov_vf(adev)) { 4384 amdgpu_virt_request_full_gpu(adev, false); 4385 amdgpu_virt_fini_data_exchange(adev); 4386 } 4387 4388 /* disable all interrupts */ 4389 amdgpu_irq_disable_all(adev); 4390 if (adev->mode_info.mode_config_initialized) { 4391 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4392 drm_helper_force_disable_all(adev_to_drm(adev)); 4393 else 4394 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4395 } 4396 amdgpu_fence_driver_hw_fini(adev); 4397 4398 if (adev->mman.initialized) 4399 drain_workqueue(adev->mman.bdev.wq); 4400 4401 if (adev->pm.sysfs_initialized) 4402 amdgpu_pm_sysfs_fini(adev); 4403 if (adev->ucode_sysfs_en) 4404 amdgpu_ucode_sysfs_fini(adev); 4405 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4406 amdgpu_fru_sysfs_fini(adev); 4407 4408 amdgpu_reg_state_sysfs_fini(adev); 4409 4410 /* disable ras feature must before hw fini */ 4411 amdgpu_ras_pre_fini(adev); 4412 4413 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4414 4415 amdgpu_device_ip_fini_early(adev); 4416 4417 amdgpu_irq_fini_hw(adev); 4418 4419 if (adev->mman.initialized) 4420 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4421 4422 amdgpu_gart_dummy_page_fini(adev); 4423 4424 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4425 amdgpu_device_unmap_mmio(adev); 4426 4427 } 4428 4429 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4430 { 4431 int idx; 4432 bool px; 4433 4434 amdgpu_fence_driver_sw_fini(adev); 4435 amdgpu_device_ip_fini(adev); 4436 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4437 adev->accel_working = false; 4438 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4439 4440 amdgpu_reset_fini(adev); 4441 4442 /* free i2c buses */ 4443 if (!amdgpu_device_has_dc_support(adev)) 4444 amdgpu_i2c_fini(adev); 4445 4446 if 
(amdgpu_emu_mode != 1) 4447 amdgpu_atombios_fini(adev); 4448 4449 kfree(adev->bios); 4450 adev->bios = NULL; 4451 4452 kfree(adev->fru_info); 4453 adev->fru_info = NULL; 4454 4455 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4456 4457 if (px || (!dev_is_removable(&adev->pdev->dev) && 4458 apple_gmux_detect(NULL, NULL))) 4459 vga_switcheroo_unregister_client(adev->pdev); 4460 4461 if (px) 4462 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4463 4464 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4465 vga_client_unregister(adev->pdev); 4466 4467 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4468 4469 iounmap(adev->rmmio); 4470 adev->rmmio = NULL; 4471 amdgpu_doorbell_fini(adev); 4472 drm_dev_exit(idx); 4473 } 4474 4475 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4476 amdgpu_pmu_fini(adev); 4477 if (adev->mman.discovery_bin) 4478 amdgpu_discovery_fini(adev); 4479 4480 amdgpu_reset_put_reset_domain(adev->reset_domain); 4481 adev->reset_domain = NULL; 4482 4483 kfree(adev->pci_state); 4484 4485 } 4486 4487 /** 4488 * amdgpu_device_evict_resources - evict device resources 4489 * @adev: amdgpu device object 4490 * 4491 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4492 * of the vram memory type. Mainly used for evicting device resources 4493 * at suspend time. 4494 * 4495 */ 4496 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4497 { 4498 int ret; 4499 4500 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4501 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4502 return 0; 4503 4504 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4505 if (ret) 4506 DRM_WARN("evicting device resources failed\n"); 4507 return ret; 4508 } 4509 4510 /* 4511 * Suspend & resume. 4512 */ 4513 /** 4514 * amdgpu_device_prepare - prepare for device suspend 4515 * 4516 * @dev: drm dev pointer 4517 * 4518 * Prepare to put the hw in the suspend state (all asics). 4519 * Returns 0 for success or an error on failure. 4520 * Called at driver suspend. 4521 */ 4522 int amdgpu_device_prepare(struct drm_device *dev) 4523 { 4524 struct amdgpu_device *adev = drm_to_adev(dev); 4525 int i, r; 4526 4527 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4528 return 0; 4529 4530 /* Evict the majority of BOs before starting suspend sequence */ 4531 r = amdgpu_device_evict_resources(adev); 4532 if (r) 4533 return r; 4534 4535 for (i = 0; i < adev->num_ip_blocks; i++) { 4536 if (!adev->ip_blocks[i].status.valid) 4537 continue; 4538 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4539 continue; 4540 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev); 4541 if (r) 4542 return r; 4543 } 4544 4545 return 0; 4546 } 4547 4548 /** 4549 * amdgpu_device_suspend - initiate device suspend 4550 * 4551 * @dev: drm dev pointer 4552 * @fbcon : notify the fbdev of suspend 4553 * 4554 * Puts the hw in the suspend state (all asics). 4555 * Returns 0 for success or an error on failure. 4556 * Called at driver suspend. 
4557 */ 4558 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4559 { 4560 struct amdgpu_device *adev = drm_to_adev(dev); 4561 int r = 0; 4562 4563 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4564 return 0; 4565 4566 adev->in_suspend = true; 4567 4568 if (amdgpu_sriov_vf(adev)) { 4569 amdgpu_virt_fini_data_exchange(adev); 4570 r = amdgpu_virt_request_full_gpu(adev, false); 4571 if (r) 4572 return r; 4573 } 4574 4575 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4576 DRM_WARN("smart shift update failed\n"); 4577 4578 if (fbcon) 4579 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4580 4581 cancel_delayed_work_sync(&adev->delayed_init_work); 4582 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4583 4584 amdgpu_ras_suspend(adev); 4585 4586 amdgpu_device_ip_suspend_phase1(adev); 4587 4588 if (!adev->in_s0ix) 4589 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4590 4591 r = amdgpu_device_evict_resources(adev); 4592 if (r) 4593 return r; 4594 4595 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4596 4597 amdgpu_fence_driver_hw_fini(adev); 4598 4599 amdgpu_device_ip_suspend_phase2(adev); 4600 4601 if (amdgpu_sriov_vf(adev)) 4602 amdgpu_virt_release_full_gpu(adev, false); 4603 4604 r = amdgpu_dpm_notify_rlc_state(adev, false); 4605 if (r) 4606 return r; 4607 4608 return 0; 4609 } 4610 4611 /** 4612 * amdgpu_device_resume - initiate device resume 4613 * 4614 * @dev: drm dev pointer 4615 * @fbcon : notify the fbdev of resume 4616 * 4617 * Bring the hw back to operating state (all asics). 4618 * Returns 0 for success or an error on failure. 4619 * Called at driver resume. 4620 */ 4621 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4622 { 4623 struct amdgpu_device *adev = drm_to_adev(dev); 4624 int r = 0; 4625 4626 if (amdgpu_sriov_vf(adev)) { 4627 r = amdgpu_virt_request_full_gpu(adev, true); 4628 if (r) 4629 return r; 4630 } 4631 4632 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4633 return 0; 4634 4635 if (adev->in_s0ix) 4636 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4637 4638 /* post card */ 4639 if (amdgpu_device_need_post(adev)) { 4640 r = amdgpu_device_asic_init(adev); 4641 if (r) 4642 dev_err(adev->dev, "amdgpu asic init failed\n"); 4643 } 4644 4645 r = amdgpu_device_ip_resume(adev); 4646 4647 if (r) { 4648 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4649 goto exit; 4650 } 4651 amdgpu_fence_driver_hw_init(adev); 4652 4653 if (!adev->in_s0ix) { 4654 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4655 if (r) 4656 goto exit; 4657 } 4658 4659 r = amdgpu_device_ip_late_init(adev); 4660 if (r) 4661 goto exit; 4662 4663 queue_delayed_work(system_wq, &adev->delayed_init_work, 4664 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4665 exit: 4666 if (amdgpu_sriov_vf(adev)) { 4667 amdgpu_virt_init_data_exchange(adev); 4668 amdgpu_virt_release_full_gpu(adev, true); 4669 } 4670 4671 if (r) 4672 return r; 4673 4674 /* Make sure IB tests flushed */ 4675 flush_delayed_work(&adev->delayed_init_work); 4676 4677 if (fbcon) 4678 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4679 4680 amdgpu_ras_resume(adev); 4681 4682 if (adev->mode_info.num_crtc) { 4683 /* 4684 * Most of the connector probing functions try to acquire runtime pm 4685 * refs to ensure that the GPU is powered on when connector polling is 4686 * performed. Since we're calling this from a runtime PM callback, 4687 * trying to acquire rpm refs will cause us to deadlock. 
4688 * 4689 * Since we're guaranteed to be holding the rpm lock, it's safe to 4690 * temporarily disable the rpm helpers so this doesn't deadlock us. 4691 */ 4692 #ifdef CONFIG_PM 4693 dev->dev->power.disable_depth++; 4694 #endif 4695 if (!adev->dc_enabled) 4696 drm_helper_hpd_irq_event(dev); 4697 else 4698 drm_kms_helper_hotplug_event(dev); 4699 #ifdef CONFIG_PM 4700 dev->dev->power.disable_depth--; 4701 #endif 4702 } 4703 adev->in_suspend = false; 4704 4705 if (adev->enable_mes) 4706 amdgpu_mes_self_test(adev); 4707 4708 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4709 DRM_WARN("smart shift update failed\n"); 4710 4711 return 0; 4712 } 4713 4714 /** 4715 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4716 * 4717 * @adev: amdgpu_device pointer 4718 * 4719 * The list of all the hardware IPs that make up the asic is walked and 4720 * the check_soft_reset callbacks are run. check_soft_reset determines 4721 * if the asic is still hung or not. 4722 * Returns true if any of the IPs are still in a hung state, false if not. 4723 */ 4724 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4725 { 4726 int i; 4727 bool asic_hang = false; 4728 4729 if (amdgpu_sriov_vf(adev)) 4730 return true; 4731 4732 if (amdgpu_asic_need_full_reset(adev)) 4733 return true; 4734 4735 for (i = 0; i < adev->num_ip_blocks; i++) { 4736 if (!adev->ip_blocks[i].status.valid) 4737 continue; 4738 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4739 adev->ip_blocks[i].status.hang = 4740 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4741 if (adev->ip_blocks[i].status.hang) { 4742 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4743 asic_hang = true; 4744 } 4745 } 4746 return asic_hang; 4747 } 4748 4749 /** 4750 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4751 * 4752 * @adev: amdgpu_device pointer 4753 * 4754 * The list of all the hardware IPs that make up the asic is walked and the 4755 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4756 * handles any IP specific hardware or software state changes that are 4757 * necessary for a soft reset to succeed. 4758 * Returns 0 on success, negative error code on failure. 4759 */ 4760 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4761 { 4762 int i, r = 0; 4763 4764 for (i = 0; i < adev->num_ip_blocks; i++) { 4765 if (!adev->ip_blocks[i].status.valid) 4766 continue; 4767 if (adev->ip_blocks[i].status.hang && 4768 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4769 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4770 if (r) 4771 return r; 4772 } 4773 } 4774 4775 return 0; 4776 } 4777 4778 /** 4779 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4780 * 4781 * @adev: amdgpu_device pointer 4782 * 4783 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4784 * reset is necessary to recover. 4785 * Returns true if a full asic reset is required, false if not. 
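* The IP block types checked below (GMC, SMC, ACP, DCE and PSP) are the ones that cannot be soft reset.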
4786 */ 4787 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4788 { 4789 int i; 4790 4791 if (amdgpu_asic_need_full_reset(adev)) 4792 return true; 4793 4794 for (i = 0; i < adev->num_ip_blocks; i++) { 4795 if (!adev->ip_blocks[i].status.valid) 4796 continue; 4797 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4798 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4799 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4800 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4801 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4802 if (adev->ip_blocks[i].status.hang) { 4803 dev_info(adev->dev, "Some block need full reset!\n"); 4804 return true; 4805 } 4806 } 4807 } 4808 return false; 4809 } 4810 4811 /** 4812 * amdgpu_device_ip_soft_reset - do a soft reset 4813 * 4814 * @adev: amdgpu_device pointer 4815 * 4816 * The list of all the hardware IPs that make up the asic is walked and the 4817 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4818 * IP specific hardware or software state changes that are necessary to soft 4819 * reset the IP. 4820 * Returns 0 on success, negative error code on failure. 4821 */ 4822 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4823 { 4824 int i, r = 0; 4825 4826 for (i = 0; i < adev->num_ip_blocks; i++) { 4827 if (!adev->ip_blocks[i].status.valid) 4828 continue; 4829 if (adev->ip_blocks[i].status.hang && 4830 adev->ip_blocks[i].version->funcs->soft_reset) { 4831 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4832 if (r) 4833 return r; 4834 } 4835 } 4836 4837 return 0; 4838 } 4839 4840 /** 4841 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4842 * 4843 * @adev: amdgpu_device pointer 4844 * 4845 * The list of all the hardware IPs that make up the asic is walked and the 4846 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4847 * handles any IP specific hardware or software state changes that are 4848 * necessary after the IP has been soft reset. 4849 * Returns 0 on success, negative error code on failure. 4850 */ 4851 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4852 { 4853 int i, r = 0; 4854 4855 for (i = 0; i < adev->num_ip_blocks; i++) { 4856 if (!adev->ip_blocks[i].status.valid) 4857 continue; 4858 if (adev->ip_blocks[i].status.hang && 4859 adev->ip_blocks[i].version->funcs->post_soft_reset) 4860 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4861 if (r) 4862 return r; 4863 } 4864 4865 return 0; 4866 } 4867 4868 /** 4869 * amdgpu_device_recover_vram - Recover some VRAM contents 4870 * 4871 * @adev: amdgpu_device pointer 4872 * 4873 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4874 * restore things like GPUVM page tables after a GPU reset where 4875 * the contents of VRAM might be lost. 4876 * 4877 * Returns: 4878 * 0 on success, negative error code on failure. 
4879 */ 4880 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4881 { 4882 struct dma_fence *fence = NULL, *next = NULL; 4883 struct amdgpu_bo *shadow; 4884 struct amdgpu_bo_vm *vmbo; 4885 long r = 1, tmo; 4886 4887 if (amdgpu_sriov_runtime(adev)) 4888 tmo = msecs_to_jiffies(8000); 4889 else 4890 tmo = msecs_to_jiffies(100); 4891 4892 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4893 mutex_lock(&adev->shadow_list_lock); 4894 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4895 /* If vm is compute context or adev is APU, shadow will be NULL */ 4896 if (!vmbo->shadow) 4897 continue; 4898 shadow = vmbo->shadow; 4899 4900 /* No need to recover an evicted BO */ 4901 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4902 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4903 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4904 continue; 4905 4906 r = amdgpu_bo_restore_shadow(shadow, &next); 4907 if (r) 4908 break; 4909 4910 if (fence) { 4911 tmo = dma_fence_wait_timeout(fence, false, tmo); 4912 dma_fence_put(fence); 4913 fence = next; 4914 if (tmo == 0) { 4915 r = -ETIMEDOUT; 4916 break; 4917 } else if (tmo < 0) { 4918 r = tmo; 4919 break; 4920 } 4921 } else { 4922 fence = next; 4923 } 4924 } 4925 mutex_unlock(&adev->shadow_list_lock); 4926 4927 if (fence) 4928 tmo = dma_fence_wait_timeout(fence, false, tmo); 4929 dma_fence_put(fence); 4930 4931 if (r < 0 || tmo <= 0) { 4932 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4933 return -EIO; 4934 } 4935 4936 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4937 return 0; 4938 } 4939 4940 4941 /** 4942 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4943 * 4944 * @adev: amdgpu_device pointer 4945 * @from_hypervisor: request from hypervisor 4946 * 4947 * do VF FLR and reinitialize Asic 4948 * return 0 means succeeded otherwise failed 4949 */ 4950 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4951 bool from_hypervisor) 4952 { 4953 int r; 4954 struct amdgpu_hive_info *hive = NULL; 4955 int retry_limit = 0; 4956 4957 retry: 4958 amdgpu_amdkfd_pre_reset(adev); 4959 4960 if (from_hypervisor) 4961 r = amdgpu_virt_request_full_gpu(adev, true); 4962 else 4963 r = amdgpu_virt_reset_gpu(adev); 4964 if (r) 4965 return r; 4966 amdgpu_irq_gpu_reset_resume_helper(adev); 4967 4968 /* some sw clean up VF needs to do before recover */ 4969 amdgpu_virt_post_reset(adev); 4970 4971 /* Resume IP prior to SMC */ 4972 r = amdgpu_device_ip_reinit_early_sriov(adev); 4973 if (r) 4974 goto error; 4975 4976 amdgpu_virt_init_data_exchange(adev); 4977 4978 r = amdgpu_device_fw_loading(adev); 4979 if (r) 4980 return r; 4981 4982 /* now we are okay to resume SMC/CP/SDMA */ 4983 r = amdgpu_device_ip_reinit_late_sriov(adev); 4984 if (r) 4985 goto error; 4986 4987 hive = amdgpu_get_xgmi_hive(adev); 4988 /* Update PSP FW topology after reset */ 4989 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4990 r = amdgpu_xgmi_update_topology(hive, adev); 4991 4992 if (hive) 4993 amdgpu_put_xgmi_hive(hive); 4994 4995 if (!r) { 4996 r = amdgpu_ib_ring_tests(adev); 4997 4998 amdgpu_amdkfd_post_reset(adev); 4999 } 5000 5001 error: 5002 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 5003 amdgpu_inc_vram_lost(adev); 5004 r = amdgpu_device_recover_vram(adev); 5005 } 5006 amdgpu_virt_release_full_gpu(adev, true); 5007 5008 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 5009 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 5010 retry_limit++; 5011 goto 
retry; 5012 } else 5013 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 5014 } 5015 5016 return r; 5017 } 5018 5019 /** 5020 * amdgpu_device_has_job_running - check if there is any job in mirror list 5021 * 5022 * @adev: amdgpu_device pointer 5023 * 5024 * check if there is any job in mirror list 5025 */ 5026 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5027 { 5028 int i; 5029 struct drm_sched_job *job; 5030 5031 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5032 struct amdgpu_ring *ring = adev->rings[i]; 5033 5034 if (!ring || !drm_sched_wqueue_ready(&ring->sched)) 5035 continue; 5036 5037 spin_lock(&ring->sched.job_list_lock); 5038 job = list_first_entry_or_null(&ring->sched.pending_list, 5039 struct drm_sched_job, list); 5040 spin_unlock(&ring->sched.job_list_lock); 5041 if (job) 5042 return true; 5043 } 5044 return false; 5045 } 5046 5047 /** 5048 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5049 * 5050 * @adev: amdgpu_device pointer 5051 * 5052 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5053 * a hung GPU. 5054 */ 5055 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5056 { 5057 5058 if (amdgpu_gpu_recovery == 0) 5059 goto disabled; 5060 5061 /* Skip soft reset check in fatal error mode */ 5062 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5063 return true; 5064 5065 if (amdgpu_sriov_vf(adev)) 5066 return true; 5067 5068 if (amdgpu_gpu_recovery == -1) { 5069 switch (adev->asic_type) { 5070 #ifdef CONFIG_DRM_AMDGPU_SI 5071 case CHIP_VERDE: 5072 case CHIP_TAHITI: 5073 case CHIP_PITCAIRN: 5074 case CHIP_OLAND: 5075 case CHIP_HAINAN: 5076 #endif 5077 #ifdef CONFIG_DRM_AMDGPU_CIK 5078 case CHIP_KAVERI: 5079 case CHIP_KABINI: 5080 case CHIP_MULLINS: 5081 #endif 5082 case CHIP_CARRIZO: 5083 case CHIP_STONEY: 5084 case CHIP_CYAN_SKILLFISH: 5085 goto disabled; 5086 default: 5087 break; 5088 } 5089 } 5090 5091 return true; 5092 5093 disabled: 5094 dev_info(adev->dev, "GPU recovery disabled.\n"); 5095 return false; 5096 } 5097 5098 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5099 { 5100 u32 i; 5101 int ret = 0; 5102 5103 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5104 5105 dev_info(adev->dev, "GPU mode1 reset\n"); 5106 5107 /* disable BM */ 5108 pci_clear_master(adev->pdev); 5109 5110 amdgpu_device_cache_pci_state(adev->pdev); 5111 5112 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5113 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5114 ret = amdgpu_dpm_mode1_reset(adev); 5115 } else { 5116 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5117 ret = psp_gpu_reset(adev); 5118 } 5119 5120 if (ret) 5121 goto mode1_reset_failed; 5122 5123 amdgpu_device_load_pci_state(adev->pdev); 5124 ret = amdgpu_psp_wait_for_bootloader(adev); 5125 if (ret) 5126 goto mode1_reset_failed; 5127 5128 /* wait for asic to come out of reset */ 5129 for (i = 0; i < adev->usec_timeout; i++) { 5130 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5131 5132 if (memsize != 0xffffffff) 5133 break; 5134 udelay(1); 5135 } 5136 5137 if (i >= adev->usec_timeout) { 5138 ret = -ETIMEDOUT; 5139 goto mode1_reset_failed; 5140 } 5141 5142 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5143 5144 return 0; 5145 5146 mode1_reset_failed: 5147 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5148 return ret; 5149 } 5150 5151 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5152 struct amdgpu_reset_context *reset_context) 5153 { 5154 int i, r = 0; 5155 struct amdgpu_job *job = NULL; 5156 bool 
need_full_reset = 5157 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5158 5159 if (reset_context->reset_req_dev == adev) 5160 job = reset_context->job; 5161 5162 if (amdgpu_sriov_vf(adev)) { 5163 /* stop the data exchange thread */ 5164 amdgpu_virt_fini_data_exchange(adev); 5165 } 5166 5167 amdgpu_fence_driver_isr_toggle(adev, true); 5168 5169 /* block all schedulers and reset given job's ring */ 5170 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5171 struct amdgpu_ring *ring = adev->rings[i]; 5172 5173 if (!ring || !drm_sched_wqueue_ready(&ring->sched)) 5174 continue; 5175 5176 /* Clear job fence from fence drv to avoid force_completion 5177 * leave NULL and vm flush fence in fence drv 5178 */ 5179 amdgpu_fence_driver_clear_job_fences(ring); 5180 5181 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5182 amdgpu_fence_driver_force_completion(ring); 5183 } 5184 5185 amdgpu_fence_driver_isr_toggle(adev, false); 5186 5187 if (job && job->vm) 5188 drm_sched_increase_karma(&job->base); 5189 5190 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5191 /* If reset handler not implemented, continue; otherwise return */ 5192 if (r == -EOPNOTSUPP) 5193 r = 0; 5194 else 5195 return r; 5196 5197 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5198 if (!amdgpu_sriov_vf(adev)) { 5199 5200 if (!need_full_reset) 5201 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5202 5203 if (!need_full_reset && amdgpu_gpu_recovery && 5204 amdgpu_device_ip_check_soft_reset(adev)) { 5205 amdgpu_device_ip_pre_soft_reset(adev); 5206 r = amdgpu_device_ip_soft_reset(adev); 5207 amdgpu_device_ip_post_soft_reset(adev); 5208 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5209 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5210 need_full_reset = true; 5211 } 5212 } 5213 5214 if (need_full_reset) 5215 r = amdgpu_device_ip_suspend(adev); 5216 if (need_full_reset) 5217 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5218 else 5219 clear_bit(AMDGPU_NEED_FULL_RESET, 5220 &reset_context->flags); 5221 } 5222 5223 return r; 5224 } 5225 5226 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 5227 { 5228 int i; 5229 5230 lockdep_assert_held(&adev->reset_domain->sem); 5231 5232 for (i = 0; i < adev->reset_info.num_regs; i++) { 5233 adev->reset_info.reset_dump_reg_value[i] = 5234 RREG32(adev->reset_info.reset_dump_reg_list[i]); 5235 5236 trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i], 5237 adev->reset_info.reset_dump_reg_value[i]); 5238 } 5239 5240 return 0; 5241 } 5242 5243 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5244 struct amdgpu_reset_context *reset_context) 5245 { 5246 struct amdgpu_device *tmp_adev = NULL; 5247 bool need_full_reset, skip_hw_reset, vram_lost = false; 5248 int r = 0; 5249 5250 /* Try reset handler method first */ 5251 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5252 reset_list); 5253 amdgpu_reset_reg_dumps(tmp_adev); 5254 5255 reset_context->reset_device_list = device_list_handle; 5256 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5257 /* If reset handler not implemented, continue; otherwise return */ 5258 if (r == -EOPNOTSUPP) 5259 r = 0; 5260 else 5261 return r; 5262 5263 /* Reset handler not implemented, use the default method */ 5264 need_full_reset = 5265 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5266 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5267 5268 /* 5269 * 
ASIC reset has to be done on all XGMI hive nodes ASAP 5270 * to allow proper links negotiation in FW (within 1 sec) 5271 */ 5272 if (!skip_hw_reset && need_full_reset) { 5273 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5274 /* For XGMI run all resets in parallel to speed up the process */ 5275 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5276 tmp_adev->gmc.xgmi.pending_reset = false; 5277 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5278 r = -EALREADY; 5279 } else 5280 r = amdgpu_asic_reset(tmp_adev); 5281 5282 if (r) { 5283 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5284 r, adev_to_drm(tmp_adev)->unique); 5285 goto out; 5286 } 5287 } 5288 5289 /* For XGMI wait for all resets to complete before proceed */ 5290 if (!r) { 5291 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5292 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5293 flush_work(&tmp_adev->xgmi_reset_work); 5294 r = tmp_adev->asic_reset_res; 5295 if (r) 5296 break; 5297 } 5298 } 5299 } 5300 } 5301 5302 if (!r && amdgpu_ras_intr_triggered()) { 5303 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5304 amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB); 5305 } 5306 5307 amdgpu_ras_intr_cleared(); 5308 } 5309 5310 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5311 if (need_full_reset) { 5312 /* post card */ 5313 r = amdgpu_device_asic_init(tmp_adev); 5314 if (r) { 5315 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5316 } else { 5317 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5318 5319 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5320 if (r) 5321 goto out; 5322 5323 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5324 5325 amdgpu_coredump(tmp_adev, vram_lost, reset_context); 5326 5327 if (vram_lost) { 5328 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5329 amdgpu_inc_vram_lost(tmp_adev); 5330 } 5331 5332 r = amdgpu_device_fw_loading(tmp_adev); 5333 if (r) 5334 return r; 5335 5336 r = amdgpu_xcp_restore_partition_mode( 5337 tmp_adev->xcp_mgr); 5338 if (r) 5339 goto out; 5340 5341 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5342 if (r) 5343 goto out; 5344 5345 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5346 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5347 5348 if (vram_lost) 5349 amdgpu_device_fill_reset_magic(tmp_adev); 5350 5351 /* 5352 * Add this ASIC as tracked as reset was already 5353 * complete successfully. 5354 */ 5355 amdgpu_register_gpu_instance(tmp_adev); 5356 5357 if (!reset_context->hive && 5358 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5359 amdgpu_xgmi_add_device(tmp_adev); 5360 5361 r = amdgpu_device_ip_late_init(tmp_adev); 5362 if (r) 5363 goto out; 5364 5365 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5366 5367 /* 5368 * The GPU enters bad state once faulty pages 5369 * by ECC has reached the threshold, and ras 5370 * recovery is scheduled next. So add one check 5371 * here to break recovery if it indeed exceeds 5372 * bad page threshold, and remind user to 5373 * retire this GPU or setting one bigger 5374 * bad_page_threshold value to fix this once 5375 * probing driver again. 5376 */ 5377 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5378 /* must succeed. 
*/ 5379 amdgpu_ras_resume(tmp_adev); 5380 } else { 5381 r = -EINVAL; 5382 goto out; 5383 } 5384 5385 /* Update PSP FW topology after reset */ 5386 if (reset_context->hive && 5387 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5388 r = amdgpu_xgmi_update_topology( 5389 reset_context->hive, tmp_adev); 5390 } 5391 } 5392 5393 out: 5394 if (!r) { 5395 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5396 r = amdgpu_ib_ring_tests(tmp_adev); 5397 if (r) { 5398 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5399 need_full_reset = true; 5400 r = -EAGAIN; 5401 goto end; 5402 } 5403 } 5404 5405 if (!r) 5406 r = amdgpu_device_recover_vram(tmp_adev); 5407 else 5408 tmp_adev->asic_reset_res = r; 5409 } 5410 5411 end: 5412 if (need_full_reset) 5413 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5414 else 5415 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5416 return r; 5417 } 5418 5419 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5420 { 5421 5422 switch (amdgpu_asic_reset_method(adev)) { 5423 case AMD_RESET_METHOD_MODE1: 5424 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5425 break; 5426 case AMD_RESET_METHOD_MODE2: 5427 adev->mp1_state = PP_MP1_STATE_RESET; 5428 break; 5429 default: 5430 adev->mp1_state = PP_MP1_STATE_NONE; 5431 break; 5432 } 5433 } 5434 5435 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5436 { 5437 amdgpu_vf_error_trans_all(adev); 5438 adev->mp1_state = PP_MP1_STATE_NONE; 5439 } 5440 5441 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5442 { 5443 struct pci_dev *p = NULL; 5444 5445 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5446 adev->pdev->bus->number, 1); 5447 if (p) { 5448 pm_runtime_enable(&(p->dev)); 5449 pm_runtime_resume(&(p->dev)); 5450 } 5451 5452 pci_dev_put(p); 5453 } 5454 5455 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5456 { 5457 enum amd_reset_method reset_method; 5458 struct pci_dev *p = NULL; 5459 u64 expires; 5460 5461 /* 5462 * For now, only BACO and mode1 reset are confirmed 5463 * to suffer the audio issue without proper suspended. 5464 */ 5465 reset_method = amdgpu_asic_reset_method(adev); 5466 if ((reset_method != AMD_RESET_METHOD_BACO) && 5467 (reset_method != AMD_RESET_METHOD_MODE1)) 5468 return -EINVAL; 5469 5470 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5471 adev->pdev->bus->number, 1); 5472 if (!p) 5473 return -ENODEV; 5474 5475 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5476 if (!expires) 5477 /* 5478 * If we cannot get the audio device autosuspend delay, 5479 * a fixed 4S interval will be used. Considering 3S is 5480 * the audio controller default autosuspend delay setting. 5481 * 4S used here is guaranteed to cover that. 5482 */ 5483 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5484 5485 while (!pm_runtime_status_suspended(&(p->dev))) { 5486 if (!pm_runtime_suspend(&(p->dev))) 5487 break; 5488 5489 if (expires < ktime_get_mono_fast_ns()) { 5490 dev_warn(adev->dev, "failed to suspend display audio\n"); 5491 pci_dev_put(p); 5492 /* TODO: abort the succeeding gpu reset? 
*/ 5493 return -ETIMEDOUT; 5494 } 5495 } 5496 5497 pm_runtime_disable(&(p->dev)); 5498 5499 pci_dev_put(p); 5500 return 0; 5501 } 5502 5503 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5504 { 5505 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5506 5507 #if defined(CONFIG_DEBUG_FS) 5508 if (!amdgpu_sriov_vf(adev)) 5509 cancel_work(&adev->reset_work); 5510 #endif 5511 5512 if (adev->kfd.dev) 5513 cancel_work(&adev->kfd.reset_work); 5514 5515 if (amdgpu_sriov_vf(adev)) 5516 cancel_work(&adev->virt.flr_work); 5517 5518 if (con && adev->ras_enabled) 5519 cancel_work(&con->recovery_work); 5520 5521 } 5522 5523 /** 5524 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5525 * 5526 * @adev: amdgpu_device pointer 5527 * @job: which job trigger hang 5528 * @reset_context: amdgpu reset context pointer 5529 * 5530 * Attempt to reset the GPU if it has hung (all asics). 5531 * Attempt to do soft-reset or full-reset and reinitialize Asic 5532 * Returns 0 for success or an error on failure. 5533 */ 5534 5535 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5536 struct amdgpu_job *job, 5537 struct amdgpu_reset_context *reset_context) 5538 { 5539 struct list_head device_list, *device_list_handle = NULL; 5540 bool job_signaled = false; 5541 struct amdgpu_hive_info *hive = NULL; 5542 struct amdgpu_device *tmp_adev = NULL; 5543 int i, r = 0; 5544 bool need_emergency_restart = false; 5545 bool audio_suspended = false; 5546 5547 /* 5548 * Special case: RAS triggered and full reset isn't supported 5549 */ 5550 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5551 5552 /* 5553 * Flush RAM to disk so that after reboot 5554 * the user can read log and see why the system rebooted. 5555 */ 5556 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5557 amdgpu_ras_get_context(adev)->reboot) { 5558 DRM_WARN("Emergency reboot."); 5559 5560 ksys_sync_helper(); 5561 emergency_restart(); 5562 } 5563 5564 dev_info(adev->dev, "GPU %s begin!\n", 5565 need_emergency_restart ? "jobs stop":"reset"); 5566 5567 if (!amdgpu_sriov_vf(adev)) 5568 hive = amdgpu_get_xgmi_hive(adev); 5569 if (hive) 5570 mutex_lock(&hive->hive_lock); 5571 5572 reset_context->job = job; 5573 reset_context->hive = hive; 5574 /* 5575 * Build list of devices to reset. 5576 * In case we are in XGMI hive mode, resort the device list 5577 * to put adev in the 1st position. 5578 */ 5579 INIT_LIST_HEAD(&device_list); 5580 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5581 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5582 list_add_tail(&tmp_adev->reset_list, &device_list); 5583 if (adev->shutdown) 5584 tmp_adev->shutdown = true; 5585 } 5586 if (!list_is_first(&adev->reset_list, &device_list)) 5587 list_rotate_to_front(&adev->reset_list, &device_list); 5588 device_list_handle = &device_list; 5589 } else { 5590 list_add_tail(&adev->reset_list, &device_list); 5591 device_list_handle = &device_list; 5592 } 5593 5594 /* We need to lock reset domain only once both for XGMI and single device */ 5595 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5596 reset_list); 5597 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5598 5599 /* block all schedulers and reset given job's ring */ 5600 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5601 5602 amdgpu_device_set_mp1_state(tmp_adev); 5603 5604 /* 5605 * Try to put the audio codec into suspend state 5606 * before gpu reset started. 
5607 * 5608 * Because the power domain of the graphics device 5609 * is shared with the AZ power domain, without this 5610 * we may change the audio hardware from behind 5611 * the audio driver's back and trigger 5612 * audio codec errors. 5613 */ 5614 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5615 audio_suspended = true; 5616 5617 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5618 5619 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5620 5621 if (!amdgpu_sriov_vf(tmp_adev)) 5622 amdgpu_amdkfd_pre_reset(tmp_adev); 5623 5624 /* 5625 * Mark these ASICs to be reset as untracked first, 5626 * and add them back after the reset completes. 5627 */ 5628 amdgpu_unregister_gpu_instance(tmp_adev); 5629 5630 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5631 5632 /* disable ras on ALL IPs */ 5633 if (!need_emergency_restart && 5634 amdgpu_device_ip_need_full_reset(tmp_adev)) 5635 amdgpu_ras_suspend(tmp_adev); 5636 5637 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5638 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5639 5640 if (!ring || !drm_sched_wqueue_ready(&ring->sched)) 5641 continue; 5642 5643 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5644 5645 if (need_emergency_restart) 5646 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5647 } 5648 atomic_inc(&tmp_adev->gpu_reset_counter); 5649 } 5650 5651 if (need_emergency_restart) 5652 goto skip_sched_resume; 5653 5654 /* 5655 * Must check guilty signal here since after this point all old 5656 * HW fences are force signaled. 5657 * 5658 * job->base holds a reference to the parent fence 5659 */ 5660 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5661 job_signaled = true; 5662 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5663 goto skip_hw_reset; 5664 } 5665 5666 retry: /* Pre asic reset for the rest of the adevs in the XGMI hive. */ 5667 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5668 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5669 /* TODO: Should we stop? */ 5670 if (r) { 5671 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5672 r, adev_to_drm(tmp_adev)->unique); 5673 tmp_adev->asic_reset_res = r; 5674 } 5675 5676 /* 5677 * Drop all pending non-scheduler resets. Scheduler resets 5678 * were already dropped during drm_sched_stop. 5679 */ 5680 amdgpu_device_stop_pending_resets(tmp_adev); 5681 } 5682 5683 /* Actual ASIC resets if needed. */ 5684 /* Host driver will handle XGMI hive reset for SRIOV */ 5685 if (amdgpu_sriov_vf(adev)) { 5686 r = amdgpu_device_reset_sriov(adev, job ?
false : true); 5687 if (r) 5688 adev->asic_reset_res = r; 5689 5690 /* Aldebaran and gfx_11_0_3 support RAS in SRIOV, so we need to resume RAS during reset */ 5691 if (amdgpu_ip_version(adev, GC_HWIP, 0) == 5692 IP_VERSION(9, 4, 2) || 5693 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5694 amdgpu_ras_resume(adev); 5695 } else { 5696 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5697 if (r && r == -EAGAIN) 5698 goto retry; 5699 } 5700 5701 skip_hw_reset: 5702 5703 /* Post ASIC reset for all devs. */ 5704 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5705 5706 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5707 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5708 5709 if (!ring || !drm_sched_wqueue_ready(&ring->sched)) 5710 continue; 5711 5712 drm_sched_start(&ring->sched, true); 5713 } 5714 5715 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5716 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5717 5718 if (tmp_adev->asic_reset_res) 5719 r = tmp_adev->asic_reset_res; 5720 5721 tmp_adev->asic_reset_res = 0; 5722 5723 if (r) { 5724 /* bad news, how do we tell it to userspace? */ 5725 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5726 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5727 } else { 5728 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5729 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5730 DRM_WARN("smart shift update failed\n"); 5731 } 5732 } 5733 5734 skip_sched_resume: 5735 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5736 /* unlock kfd: SRIOV would do it separately */ 5737 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5738 amdgpu_amdkfd_post_reset(tmp_adev); 5739 5740 /* kfd_post_reset will do nothing if the kfd device is not initialized, 5741 * so bring up kfd here if it has not been initialized before 5742 */ 5743 if (!adev->kfd.init_complete) 5744 amdgpu_amdkfd_device_init(adev); 5745 5746 if (audio_suspended) 5747 amdgpu_device_resume_display_audio(tmp_adev); 5748 5749 amdgpu_device_unset_mp1_state(tmp_adev); 5750 5751 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5752 } 5753 5754 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5755 reset_list); 5756 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5757 5758 if (hive) { 5759 mutex_unlock(&hive->hive_lock); 5760 amdgpu_put_xgmi_hive(hive); 5761 } 5762 5763 if (r) 5764 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5765 5766 atomic_set(&adev->reset_domain->reset_res, r); 5767 return r; 5768 } 5769 5770 /** 5771 * amdgpu_device_partner_bandwidth - find the bandwidth of the appropriate partner 5772 * 5773 * @adev: amdgpu_device pointer 5774 * @speed: pointer to the speed of the link 5775 * @width: pointer to the width of the link 5776 * 5777 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 5778 * first physical partner to an AMD dGPU. 5779 * This will exclude any virtual switches and links.
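* Bridges that report the AMD/ATI vendor ID are treated as internal to the dGPU and skipped.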
5780 */ 5781 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 5782 enum pci_bus_speed *speed, 5783 enum pcie_link_width *width) 5784 { 5785 struct pci_dev *parent = adev->pdev; 5786 5787 if (!speed || !width) 5788 return; 5789 5790 *speed = PCI_SPEED_UNKNOWN; 5791 *width = PCIE_LNK_WIDTH_UNKNOWN; 5792 5793 while ((parent = pci_upstream_bridge(parent))) { 5794 /* skip upstream/downstream switches internal to the dGPU */ 5795 if (parent->vendor == PCI_VENDOR_ID_ATI) 5796 continue; 5797 *speed = pcie_get_speed_cap(parent); 5798 *width = pcie_get_width_cap(parent); 5799 break; 5800 } 5801 } 5802 5803 /** 5804 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot 5805 * 5806 * @adev: amdgpu_device pointer 5807 * 5808 * Fetches and stores in the driver the PCIe capabilities (gen speed 5809 * and lanes) of the slot the device is in. Handles APUs and 5810 * virtualized environments where PCIe config space may not be available. 5811 */ 5812 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5813 { 5814 struct pci_dev *pdev; 5815 enum pci_bus_speed speed_cap, platform_speed_cap; 5816 enum pcie_link_width platform_link_width; 5817 5818 if (amdgpu_pcie_gen_cap) 5819 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5820 5821 if (amdgpu_pcie_lane_cap) 5822 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5823 5824 /* covers APUs as well */ 5825 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5826 if (adev->pm.pcie_gen_mask == 0) 5827 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5828 if (adev->pm.pcie_mlw_mask == 0) 5829 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5830 return; 5831 } 5832 5833 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5834 return; 5835 5836 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 5837 &platform_link_width); 5838 5839 if (adev->pm.pcie_gen_mask == 0) { 5840 /* asic caps */ 5841 pdev = adev->pdev; 5842 speed_cap = pcie_get_speed_cap(pdev); 5843 if (speed_cap == PCI_SPEED_UNKNOWN) { 5844 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5845 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5846 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5847 } else { 5848 if (speed_cap == PCIE_SPEED_32_0GT) 5849 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5850 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5851 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5852 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5853 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5854 else if (speed_cap == PCIE_SPEED_16_0GT) 5855 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5856 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5857 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5858 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5859 else if (speed_cap == PCIE_SPEED_8_0GT) 5860 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5861 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5862 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5863 else if (speed_cap == PCIE_SPEED_5_0GT) 5864 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5865 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5866 else 5867 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5868 } 5869 /* platform caps */ 5870 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5871 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5872 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5873 } else { 5874 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5875 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5876
CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5877 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5878 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5879 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5880 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5881 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5882 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5883 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5884 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5885 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5886 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5887 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5888 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5889 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5890 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5891 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5892 else 5893 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5894 5895 } 5896 } 5897 if (adev->pm.pcie_mlw_mask == 0) { 5898 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5899 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5900 } else { 5901 switch (platform_link_width) { 5902 case PCIE_LNK_X32: 5903 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5904 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5905 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5906 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5907 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5908 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5909 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5910 break; 5911 case PCIE_LNK_X16: 5912 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5913 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5914 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5915 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5916 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5917 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5918 break; 5919 case PCIE_LNK_X12: 5920 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5921 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5922 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5923 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5924 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5925 break; 5926 case PCIE_LNK_X8: 5927 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5928 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5929 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5930 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5931 break; 5932 case PCIE_LNK_X4: 5933 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5934 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5935 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5936 break; 5937 case PCIE_LNK_X2: 5938 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5939 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5940 break; 5941 case PCIE_LNK_X1: 5942 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5943 break; 5944 default: 5945 break; 5946 } 5947 } 5948 } 5949 } 5950 5951 /** 5952 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5953 * 5954 * @adev: amdgpu_device pointer 5955 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5956 * 5957 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5958 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5959 * @peer_adev. 5960 */ 5961 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5962 struct amdgpu_device *peer_adev) 5963 { 5964 #ifdef CONFIG_HSA_AMD_P2P 5965 uint64_t address_mask = peer_adev->dev->dma_mask ? 
5966 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 5967 resource_size_t aper_limit = 5968 adev->gmc.aper_base + adev->gmc.aper_size - 1; 5969 bool p2p_access = 5970 !adev->gmc.xgmi.connected_to_cpu && 5971 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 5972 5973 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 5974 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 5975 !(adev->gmc.aper_base & address_mask || 5976 aper_limit & address_mask)); 5977 #else 5978 return false; 5979 #endif 5980 } 5981 5982 int amdgpu_device_baco_enter(struct drm_device *dev) 5983 { 5984 struct amdgpu_device *adev = drm_to_adev(dev); 5985 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5986 5987 if (!amdgpu_device_supports_baco(dev)) 5988 return -ENOTSUPP; 5989 5990 if (ras && adev->ras_enabled && 5991 adev->nbio.funcs->enable_doorbell_interrupt) 5992 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5993 5994 return amdgpu_dpm_baco_enter(adev); 5995 } 5996 5997 int amdgpu_device_baco_exit(struct drm_device *dev) 5998 { 5999 struct amdgpu_device *adev = drm_to_adev(dev); 6000 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6001 int ret = 0; 6002 6003 if (!amdgpu_device_supports_baco(dev)) 6004 return -ENOTSUPP; 6005 6006 ret = amdgpu_dpm_baco_exit(adev); 6007 if (ret) 6008 return ret; 6009 6010 if (ras && adev->ras_enabled && 6011 adev->nbio.funcs->enable_doorbell_interrupt) 6012 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6013 6014 if (amdgpu_passthrough(adev) && 6015 adev->nbio.funcs->clear_doorbell_interrupt) 6016 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6017 6018 return 0; 6019 } 6020 6021 /** 6022 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6023 * @pdev: PCI device struct 6024 * @state: PCI channel state 6025 * 6026 * Description: Called when a PCI error is detected. 6027 * 6028 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
6029 */ 6030 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6031 { 6032 struct drm_device *dev = pci_get_drvdata(pdev); 6033 struct amdgpu_device *adev = drm_to_adev(dev); 6034 int i; 6035 6036 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 6037 6038 if (adev->gmc.xgmi.num_physical_nodes > 1) { 6039 DRM_WARN("No support for XGMI hive yet..."); 6040 return PCI_ERS_RESULT_DISCONNECT; 6041 } 6042 6043 adev->pci_channel_state = state; 6044 6045 switch (state) { 6046 case pci_channel_io_normal: 6047 return PCI_ERS_RESULT_CAN_RECOVER; 6048 /* Fatal error, prepare for slot reset */ 6049 case pci_channel_io_frozen: 6050 /* 6051 * Locking adev->reset_domain->sem will prevent any external access 6052 * to the GPU during PCI error recovery 6053 */ 6054 amdgpu_device_lock_reset_domain(adev->reset_domain); 6055 amdgpu_device_set_mp1_state(adev); 6056 6057 /* 6058 * Block any work scheduling as we do for regular GPU reset 6059 * for the duration of the recovery 6060 */ 6061 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6062 struct amdgpu_ring *ring = adev->rings[i]; 6063 6064 if (!ring || !drm_sched_wqueue_ready(&ring->sched)) 6065 continue; 6066 6067 drm_sched_stop(&ring->sched, NULL); 6068 } 6069 atomic_inc(&adev->gpu_reset_counter); 6070 return PCI_ERS_RESULT_NEED_RESET; 6071 case pci_channel_io_perm_failure: 6072 /* Permanent error, prepare for device removal */ 6073 return PCI_ERS_RESULT_DISCONNECT; 6074 } 6075 6076 return PCI_ERS_RESULT_NEED_RESET; 6077 } 6078 6079 /** 6080 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6081 * @pdev: pointer to PCI device 6082 */ 6083 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6084 { 6085 6086 DRM_INFO("PCI error: mmio enabled callback!!\n"); 6087 6088 /* TODO - dump whatever for debugging purposes */ 6089 6090 /* This is called only if amdgpu_pci_error_detected returns 6091 * PCI_ERS_RESULT_CAN_RECOVER. Reads/writes to the device still 6092 * work, so there is no need to reset the slot. 6093 */ 6094 6095 return PCI_ERS_RESULT_RECOVERED; 6096 } 6097 6098 /** 6099 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6100 * @pdev: PCI device struct 6101 * 6102 * Description: This routine is called by the pci error recovery 6103 * code after the PCI slot has been reset, just before we 6104 * should resume normal operations.
6105 */ 6106 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6107 { 6108 struct drm_device *dev = pci_get_drvdata(pdev); 6109 struct amdgpu_device *adev = drm_to_adev(dev); 6110 int r, i; 6111 struct amdgpu_reset_context reset_context; 6112 u32 memsize; 6113 struct list_head device_list; 6114 6115 DRM_INFO("PCI error: slot reset callback!!\n"); 6116 6117 memset(&reset_context, 0, sizeof(reset_context)); 6118 6119 INIT_LIST_HEAD(&device_list); 6120 list_add_tail(&adev->reset_list, &device_list); 6121 6122 /* wait for asic to come out of reset */ 6123 msleep(500); 6124 6125 /* Restore PCI config space */ 6126 amdgpu_device_load_pci_state(pdev); 6127 6128 /* confirm ASIC came out of reset */ 6129 for (i = 0; i < adev->usec_timeout; i++) { 6130 memsize = amdgpu_asic_get_config_memsize(adev); 6131 6132 if (memsize != 0xffffffff) 6133 break; 6134 udelay(1); 6135 } 6136 if (memsize == 0xffffffff) { 6137 r = -ETIME; 6138 goto out; 6139 } 6140 6141 reset_context.method = AMD_RESET_METHOD_NONE; 6142 reset_context.reset_req_dev = adev; 6143 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6144 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6145 6146 adev->no_hw_access = true; 6147 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 6148 adev->no_hw_access = false; 6149 if (r) 6150 goto out; 6151 6152 r = amdgpu_do_asic_reset(&device_list, &reset_context); 6153 6154 out: 6155 if (!r) { 6156 if (amdgpu_device_cache_pci_state(adev->pdev)) 6157 pci_restore_state(adev->pdev); 6158 6159 DRM_INFO("PCIe error recovery succeeded\n"); 6160 } else { 6161 DRM_ERROR("PCIe error recovery failed, err:%d\n", r); 6162 amdgpu_device_unset_mp1_state(adev); 6163 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6164 } 6165 6166 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6167 } 6168 6169 /** 6170 * amdgpu_pci_resume() - resume normal ops after PCI reset 6171 * @pdev: pointer to PCI device 6172 * 6173 * Called when the error recovery driver tells us that it's 6174 * OK to resume normal operation.
6175 */ 6176 void amdgpu_pci_resume(struct pci_dev *pdev) 6177 { 6178 struct drm_device *dev = pci_get_drvdata(pdev); 6179 struct amdgpu_device *adev = drm_to_adev(dev); 6180 int i; 6181 6182 6183 DRM_INFO("PCI error: resume callback!!\n"); 6184 6185 /* Only continue execution for the case of pci_channel_io_frozen */ 6186 if (adev->pci_channel_state != pci_channel_io_frozen) 6187 return; 6188 6189 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6190 struct amdgpu_ring *ring = adev->rings[i]; 6191 6192 if (!ring || !drm_sched_wqueue_ready(&ring->sched)) 6193 continue; 6194 6195 drm_sched_start(&ring->sched, true); 6196 } 6197 6198 amdgpu_device_unset_mp1_state(adev); 6199 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6200 } 6201 6202 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 6203 { 6204 struct drm_device *dev = pci_get_drvdata(pdev); 6205 struct amdgpu_device *adev = drm_to_adev(dev); 6206 int r; 6207 6208 r = pci_save_state(pdev); 6209 if (!r) { 6210 kfree(adev->pci_state); 6211 6212 adev->pci_state = pci_store_saved_state(pdev); 6213 6214 if (!adev->pci_state) { 6215 DRM_ERROR("Failed to store PCI saved state\n"); 6216 return false; 6217 } 6218 } else { 6219 DRM_WARN("Failed to save PCI state, err:%d\n", r); 6220 return false; 6221 } 6222 6223 return true; 6224 } 6225 6226 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 6227 { 6228 struct drm_device *dev = pci_get_drvdata(pdev); 6229 struct amdgpu_device *adev = drm_to_adev(dev); 6230 int r; 6231 6232 if (!adev->pci_state) 6233 return false; 6234 6235 r = pci_load_saved_state(pdev, adev->pci_state); 6236 6237 if (!r) { 6238 pci_restore_state(pdev); 6239 } else { 6240 DRM_WARN("Failed to load PCI state, err:%d\n", r); 6241 return false; 6242 } 6243 6244 return true; 6245 } 6246 6247 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 6248 struct amdgpu_ring *ring) 6249 { 6250 #ifdef CONFIG_X86_64 6251 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6252 return; 6253 #endif 6254 if (adev->gmc.xgmi.connected_to_cpu) 6255 return; 6256 6257 if (ring && ring->funcs->emit_hdp_flush) 6258 amdgpu_ring_emit_hdp_flush(ring); 6259 else 6260 amdgpu_asic_flush_hdp(adev, ring); 6261 } 6262 6263 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 6264 struct amdgpu_ring *ring) 6265 { 6266 #ifdef CONFIG_X86_64 6267 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6268 return; 6269 #endif 6270 if (adev->gmc.xgmi.connected_to_cpu) 6271 return; 6272 6273 amdgpu_asic_invalidate_hdp(adev, ring); 6274 } 6275 6276 int amdgpu_in_reset(struct amdgpu_device *adev) 6277 { 6278 return atomic_read(&adev->reset_domain->in_gpu_reset); 6279 } 6280 6281 /** 6282 * amdgpu_device_halt() - bring hardware to some kind of halt state 6283 * 6284 * @adev: amdgpu_device pointer 6285 * 6286 * Bring the hardware to some kind of halt state so that no one can touch it 6287 * any more. It helps to maintain the error context when an error occurs. 6288 * Compared to a simple hang, the system will stay stable at least for SSH 6289 * access. Then it should be trivial to inspect the hardware state and 6290 * see what's going on. Implemented as follows: 6291 * 6292 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.), 6293 * clears all CPU mappings to the device, and disallows remappings through page faults 6294 * 2. amdgpu_irq_disable_all() disables all interrupts 6295 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 6296 * 4. set adev->no_hw_access to avoid potential crashes after step 5 6297 * 5.
amdgpu_device_unmap_mmio() clears all MMIO mappings 6298 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 6299 * flush any in flight DMA operations 6300 */ 6301 void amdgpu_device_halt(struct amdgpu_device *adev) 6302 { 6303 struct pci_dev *pdev = adev->pdev; 6304 struct drm_device *ddev = adev_to_drm(adev); 6305 6306 amdgpu_xcp_dev_unplug(adev); 6307 drm_dev_unplug(ddev); 6308 6309 amdgpu_irq_disable_all(adev); 6310 6311 amdgpu_fence_driver_hw_fini(adev); 6312 6313 adev->no_hw_access = true; 6314 6315 amdgpu_device_unmap_mmio(adev); 6316 6317 pci_disable_device(pdev); 6318 pci_wait_for_pending_transaction(pdev); 6319 } 6320 6321 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 6322 u32 reg) 6323 { 6324 unsigned long flags, address, data; 6325 u32 r; 6326 6327 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6328 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6329 6330 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6331 WREG32(address, reg * 4); 6332 (void)RREG32(address); 6333 r = RREG32(data); 6334 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6335 return r; 6336 } 6337 6338 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 6339 u32 reg, u32 v) 6340 { 6341 unsigned long flags, address, data; 6342 6343 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6344 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6345 6346 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6347 WREG32(address, reg * 4); 6348 (void)RREG32(address); 6349 WREG32(data, v); 6350 (void)RREG32(data); 6351 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6352 } 6353 6354 /** 6355 * amdgpu_device_switch_gang - switch to a new gang 6356 * @adev: amdgpu_device pointer 6357 * @gang: the gang to switch to 6358 * 6359 * Try to switch to a new gang. 6360 * Returns: NULL if we switched to the new gang or a reference to the current 6361 * gang leader. 
6362 */ 6363 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 6364 struct dma_fence *gang) 6365 { 6366 struct dma_fence *old = NULL; 6367 6368 do { 6369 dma_fence_put(old); 6370 rcu_read_lock(); 6371 old = dma_fence_get_rcu_safe(&adev->gang_submit); 6372 rcu_read_unlock(); 6373 6374 if (old == gang) 6375 break; 6376 6377 if (!dma_fence_is_signaled(old)) 6378 return old; 6379 6380 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 6381 old, gang) != old); 6382 6383 dma_fence_put(old); 6384 return NULL; 6385 } 6386 6387 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) 6388 { 6389 switch (adev->asic_type) { 6390 #ifdef CONFIG_DRM_AMDGPU_SI 6391 case CHIP_HAINAN: 6392 #endif 6393 case CHIP_TOPAZ: 6394 /* chips with no display hardware */ 6395 return false; 6396 #ifdef CONFIG_DRM_AMDGPU_SI 6397 case CHIP_TAHITI: 6398 case CHIP_PITCAIRN: 6399 case CHIP_VERDE: 6400 case CHIP_OLAND: 6401 #endif 6402 #ifdef CONFIG_DRM_AMDGPU_CIK 6403 case CHIP_BONAIRE: 6404 case CHIP_HAWAII: 6405 case CHIP_KAVERI: 6406 case CHIP_KABINI: 6407 case CHIP_MULLINS: 6408 #endif 6409 case CHIP_TONGA: 6410 case CHIP_FIJI: 6411 case CHIP_POLARIS10: 6412 case CHIP_POLARIS11: 6413 case CHIP_POLARIS12: 6414 case CHIP_VEGAM: 6415 case CHIP_CARRIZO: 6416 case CHIP_STONEY: 6417 /* chips with display hardware */ 6418 return true; 6419 default: 6420 /* IP discovery */ 6421 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) || 6422 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 6423 return false; 6424 return true; 6425 } 6426 } 6427 6428 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, 6429 uint32_t inst, uint32_t reg_addr, char reg_name[], 6430 uint32_t expected_value, uint32_t mask) 6431 { 6432 uint32_t ret = 0; 6433 uint32_t old_ = 0; 6434 uint32_t tmp_ = RREG32(reg_addr); 6435 uint32_t loop = adev->usec_timeout; 6436 6437 while ((tmp_ & (mask)) != (expected_value)) { 6438 if (old_ != tmp_) { 6439 loop = adev->usec_timeout; 6440 old_ = tmp_; 6441 } else 6442 udelay(1); 6443 tmp_ = RREG32(reg_addr); 6444 loop--; 6445 if (!loop) { 6446 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n", 6447 inst, reg_name, (uint32_t)expected_value, 6448 (uint32_t)(tmp_ & (mask))); 6449 ret = -ETIMEDOUT; 6450 break; 6451 } 6452 } 6453 return ret; 6454 } 6455
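/*
 * Illustrative sketch only (not part of the driver): how a caller might use
 * amdgpu_device_wait_on_rreg() above to poll a status register until a busy
 * bit clears. The register offset (0x1234), register name and bit mask below
 * are hypothetical placeholders, not real amdgpu definitions, so the block is
 * kept out of the build.
 */
#if 0
static int example_wait_for_block_idle(struct amdgpu_device *adev)
{
	/*
	 * Wait until bit 0 of the hypothetical status register reads back as 0.
	 * The helper re-arms its timeout whenever the register value changes
	 * and warns and returns -ETIMEDOUT once adev->usec_timeout polls pass
	 * without reaching the expected value.
	 */
	return amdgpu_device_wait_on_rreg(adev, 0 /* inst */,
					  0x1234 /* hypothetical dword offset */,
					  "EXAMPLE_STATUS",
					  0x0 /* expected value */,
					  0x1 /* busy bit mask */);
}
#endif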