/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
"HAWAII", 112 "MULLINS", 113 "TOPAZ", 114 "TONGA", 115 "FIJI", 116 "CARRIZO", 117 "STONEY", 118 "POLARIS10", 119 "POLARIS11", 120 "POLARIS12", 121 "VEGAM", 122 "VEGA10", 123 "VEGA12", 124 "VEGA20", 125 "RAVEN", 126 "ARCTURUS", 127 "RENOIR", 128 "ALDEBARAN", 129 "NAVI10", 130 "CYAN_SKILLFISH", 131 "NAVI14", 132 "NAVI12", 133 "SIENNA_CICHLID", 134 "NAVY_FLOUNDER", 135 "VANGOGH", 136 "DIMGREY_CAVEFISH", 137 "BEIGE_GOBY", 138 "YELLOW_CARP", 139 "IP DISCOVERY", 140 "LAST", 141 }; 142 143 /** 144 * DOC: pcie_replay_count 145 * 146 * The amdgpu driver provides a sysfs API for reporting the total number 147 * of PCIe replays (NAKs) 148 * The file pcie_replay_count is used for this and returns the total 149 * number of replays as a sum of the NAKs generated and NAKs received 150 */ 151 152 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 153 struct device_attribute *attr, char *buf) 154 { 155 struct drm_device *ddev = dev_get_drvdata(dev); 156 struct amdgpu_device *adev = drm_to_adev(ddev); 157 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 158 159 return sysfs_emit(buf, "%llu\n", cnt); 160 } 161 162 static DEVICE_ATTR(pcie_replay_count, 0444, 163 amdgpu_device_get_pcie_replay_count, NULL); 164 165 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 166 struct bin_attribute *attr, char *buf, 167 loff_t ppos, size_t count) 168 { 169 struct device *dev = kobj_to_dev(kobj); 170 struct drm_device *ddev = dev_get_drvdata(dev); 171 struct amdgpu_device *adev = drm_to_adev(ddev); 172 ssize_t bytes_read; 173 174 switch (ppos) { 175 case AMDGPU_SYS_REG_STATE_XGMI: 176 bytes_read = amdgpu_asic_get_reg_state( 177 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 178 break; 179 case AMDGPU_SYS_REG_STATE_WAFL: 180 bytes_read = amdgpu_asic_get_reg_state( 181 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 182 break; 183 case AMDGPU_SYS_REG_STATE_PCIE: 184 bytes_read = amdgpu_asic_get_reg_state( 185 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 186 break; 187 case AMDGPU_SYS_REG_STATE_USR: 188 bytes_read = amdgpu_asic_get_reg_state( 189 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 190 break; 191 case AMDGPU_SYS_REG_STATE_USR_1: 192 bytes_read = amdgpu_asic_get_reg_state( 193 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 194 break; 195 default: 196 return -EINVAL; 197 } 198 199 return bytes_read; 200 } 201 202 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 203 AMDGPU_SYS_REG_STATE_END); 204 205 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 206 { 207 int ret; 208 209 if (!amdgpu_asic_get_reg_state_supported(adev)) 210 return 0; 211 212 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 213 214 return ret; 215 } 216 217 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 218 { 219 if (!amdgpu_asic_get_reg_state_supported(adev)) 220 return; 221 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 222 } 223 224 /** 225 * DOC: board_info 226 * 227 * The amdgpu driver provides a sysfs API for giving board related information. 
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);


/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value is the number of bytes that have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, xcc_id);
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * this function is invoked only for the debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, xcc_id);
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/* Same as amdgpu_device_indirect_rreg(), but supports 64 bit register
 * addresses by programming the PCIE index-hi register when the NBIO
 * provides one.
 */
u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/* 64 bit indirect register read that also supports 64 bit register
 * addresses via the PCIE index-hi register when the NBIO provides one.
 */
u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/* Same as amdgpu_device_indirect_wreg(), but supports 64 bit register
 * addresses via the PCIE index-hi register when the NBIO provides one.
 */
void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/* 64 bit indirect register write that also supports 64 bit register
 * addresses via the PCIE index-hi register when the NBIO provides one.
 */
void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	int ret;

	amdgpu_asic_pre_asic_init(adev);

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		ret = amdgpu_atomfirmware_asic_init(adev, true);
		/* TODO: check the return val and stop device initialization if boot fails */
		amdgpu_psp_query_boot_status(adev);
		return ret;
	} else {
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the size we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return false;

	return true;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old SMC firmware still needs the driver to do a vPost, otherwise
		 * the GPU hangs. SMC firmware versions above 22.15 don't have this flaw,
		 * so force vPost for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
			  amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return adev->ip_versions[DCE_HWIP][0] >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPUs change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
		return false;
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_prepare(dev);
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;
}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;
}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
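 * For example (illustrative only), amdgpu_device_ip_get_ip_block(adev,
 * AMD_IP_BLOCK_TYPE_GFX) returns the descriptor of the GFX block, or NULL
 * if no GFX block was registered for this asic.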
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if the installed version is equal to or greater than @major.@minor
 * return 1 if it is smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
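 *
 * The string is a semicolon separated list of <pci address>,<crtc count>
 * entries, as parsed below; the address may also be the literal "all" and
 * the crtc count is clamped to the range 1..6. An illustrative example
 * (the PCI addresses are made up):
 *
 *   virtual_display=0000:03:00.0,2;0000:04:00.0,1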
2180 */ 2181 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2182 { 2183 adev->enable_virtual_display = false; 2184 2185 if (amdgpu_virtual_display) { 2186 const char *pci_address_name = pci_name(adev->pdev); 2187 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2188 2189 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2190 pciaddstr_tmp = pciaddstr; 2191 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2192 pciaddname = strsep(&pciaddname_tmp, ","); 2193 if (!strcmp("all", pciaddname) 2194 || !strcmp(pci_address_name, pciaddname)) { 2195 long num_crtc; 2196 int res = -1; 2197 2198 adev->enable_virtual_display = true; 2199 2200 if (pciaddname_tmp) 2201 res = kstrtol(pciaddname_tmp, 10, 2202 &num_crtc); 2203 2204 if (!res) { 2205 if (num_crtc < 1) 2206 num_crtc = 1; 2207 if (num_crtc > 6) 2208 num_crtc = 6; 2209 adev->mode_info.num_crtc = num_crtc; 2210 } else { 2211 adev->mode_info.num_crtc = 1; 2212 } 2213 break; 2214 } 2215 } 2216 2217 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2218 amdgpu_virtual_display, pci_address_name, 2219 adev->enable_virtual_display, adev->mode_info.num_crtc); 2220 2221 kfree(pciaddstr); 2222 } 2223 } 2224 2225 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2226 { 2227 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2228 adev->mode_info.num_crtc = 1; 2229 adev->enable_virtual_display = true; 2230 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2231 adev->enable_virtual_display, adev->mode_info.num_crtc); 2232 } 2233 } 2234 2235 /** 2236 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2237 * 2238 * @adev: amdgpu_device pointer 2239 * 2240 * Parses the asic configuration parameters specified in the gpu info 2241 * firmware and makes them availale to the driver for use in configuring 2242 * the asic. 2243 * Returns 0 on success, -EINVAL on failure. 2244 */ 2245 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2246 { 2247 const char *chip_name; 2248 char fw_name[40]; 2249 int err; 2250 const struct gpu_info_firmware_header_v1_0 *hdr; 2251 2252 adev->firmware.gpu_info_fw = NULL; 2253 2254 if (adev->mman.discovery_bin) { 2255 /* 2256 * FIXME: The bounding box is still needed by Navi12, so 2257 * temporarily read it from gpu_info firmware. Should be dropped 2258 * when DAL no longer needs it. 
2259 */ 2260 if (adev->asic_type != CHIP_NAVI12) 2261 return 0; 2262 } 2263 2264 switch (adev->asic_type) { 2265 default: 2266 return 0; 2267 case CHIP_VEGA10: 2268 chip_name = "vega10"; 2269 break; 2270 case CHIP_VEGA12: 2271 chip_name = "vega12"; 2272 break; 2273 case CHIP_RAVEN: 2274 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2275 chip_name = "raven2"; 2276 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2277 chip_name = "picasso"; 2278 else 2279 chip_name = "raven"; 2280 break; 2281 case CHIP_ARCTURUS: 2282 chip_name = "arcturus"; 2283 break; 2284 case CHIP_NAVI12: 2285 chip_name = "navi12"; 2286 break; 2287 } 2288 2289 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2290 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 2291 if (err) { 2292 dev_err(adev->dev, 2293 "Failed to get gpu_info firmware \"%s\"\n", 2294 fw_name); 2295 goto out; 2296 } 2297 2298 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2299 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2300 2301 switch (hdr->version_major) { 2302 case 1: 2303 { 2304 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2305 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2306 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2307 2308 /* 2309 * Should be droped when DAL no longer needs it. 2310 */ 2311 if (adev->asic_type == CHIP_NAVI12) 2312 goto parse_soc_bounding_box; 2313 2314 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2315 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2316 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2317 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2318 adev->gfx.config.max_texture_channel_caches = 2319 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2320 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2321 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2322 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2323 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2324 adev->gfx.config.double_offchip_lds_buf = 2325 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2326 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2327 adev->gfx.cu_info.max_waves_per_simd = 2328 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2329 adev->gfx.cu_info.max_scratch_slots_per_cu = 2330 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2331 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2332 if (hdr->version_minor >= 1) { 2333 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2334 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2335 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2336 adev->gfx.config.num_sc_per_sh = 2337 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2338 adev->gfx.config.num_packer_per_sc = 2339 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2340 } 2341 2342 parse_soc_bounding_box: 2343 /* 2344 * soc bounding box info is not integrated in disocovery table, 2345 * we always need to parse it from gpu info firmware if needed. 
2346 */ 2347 if (hdr->version_minor == 2) { 2348 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2349 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2350 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2351 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2352 } 2353 break; 2354 } 2355 default: 2356 dev_err(adev->dev, 2357 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2358 err = -EINVAL; 2359 goto out; 2360 } 2361 out: 2362 return err; 2363 } 2364 2365 /** 2366 * amdgpu_device_ip_early_init - run early init for hardware IPs 2367 * 2368 * @adev: amdgpu_device pointer 2369 * 2370 * Early initialization pass for hardware IPs. The hardware IPs that make 2371 * up each asic are discovered each IP's early_init callback is run. This 2372 * is the first stage in initializing the asic. 2373 * Returns 0 on success, negative error code on failure. 2374 */ 2375 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2376 { 2377 struct pci_dev *parent; 2378 int i, r; 2379 bool total; 2380 2381 amdgpu_device_enable_virtual_display(adev); 2382 2383 if (amdgpu_sriov_vf(adev)) { 2384 r = amdgpu_virt_request_full_gpu(adev, true); 2385 if (r) 2386 return r; 2387 } 2388 2389 switch (adev->asic_type) { 2390 #ifdef CONFIG_DRM_AMDGPU_SI 2391 case CHIP_VERDE: 2392 case CHIP_TAHITI: 2393 case CHIP_PITCAIRN: 2394 case CHIP_OLAND: 2395 case CHIP_HAINAN: 2396 adev->family = AMDGPU_FAMILY_SI; 2397 r = si_set_ip_blocks(adev); 2398 if (r) 2399 return r; 2400 break; 2401 #endif 2402 #ifdef CONFIG_DRM_AMDGPU_CIK 2403 case CHIP_BONAIRE: 2404 case CHIP_HAWAII: 2405 case CHIP_KAVERI: 2406 case CHIP_KABINI: 2407 case CHIP_MULLINS: 2408 if (adev->flags & AMD_IS_APU) 2409 adev->family = AMDGPU_FAMILY_KV; 2410 else 2411 adev->family = AMDGPU_FAMILY_CI; 2412 2413 r = cik_set_ip_blocks(adev); 2414 if (r) 2415 return r; 2416 break; 2417 #endif 2418 case CHIP_TOPAZ: 2419 case CHIP_TONGA: 2420 case CHIP_FIJI: 2421 case CHIP_POLARIS10: 2422 case CHIP_POLARIS11: 2423 case CHIP_POLARIS12: 2424 case CHIP_VEGAM: 2425 case CHIP_CARRIZO: 2426 case CHIP_STONEY: 2427 if (adev->flags & AMD_IS_APU) 2428 adev->family = AMDGPU_FAMILY_CZ; 2429 else 2430 adev->family = AMDGPU_FAMILY_VI; 2431 2432 r = vi_set_ip_blocks(adev); 2433 if (r) 2434 return r; 2435 break; 2436 default: 2437 r = amdgpu_discovery_set_ip_blocks(adev); 2438 if (r) 2439 return r; 2440 break; 2441 } 2442 2443 if (amdgpu_has_atpx() && 2444 (amdgpu_is_atpx_hybrid() || 2445 amdgpu_has_atpx_dgpu_power_cntl()) && 2446 ((adev->flags & AMD_IS_APU) == 0) && 2447 !dev_is_removable(&adev->pdev->dev)) 2448 adev->flags |= AMD_IS_PX; 2449 2450 if (!(adev->flags & AMD_IS_APU)) { 2451 parent = pcie_find_root_port(adev->pdev); 2452 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2453 } 2454 2455 2456 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2457 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2458 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2459 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2460 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2461 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2462 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2463 2464 total = true; 2465 for (i = 0; i < adev->num_ip_blocks; i++) { 2466 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2467 DRM_WARN("disabled ip block: %d <%s>\n", 2468 i, adev->ip_blocks[i].version->funcs->name); 2469 adev->ip_blocks[i].status.valid = false; 2470 } else { 2471 if (adev->ip_blocks[i].version->funcs->early_init) { 2472 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2473 if (r == -ENOENT) { 2474 adev->ip_blocks[i].status.valid = false; 2475 } else if (r) { 2476 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2477 adev->ip_blocks[i].version->funcs->name, r); 2478 total = false; 2479 } else { 2480 adev->ip_blocks[i].status.valid = true; 2481 } 2482 } else { 2483 adev->ip_blocks[i].status.valid = true; 2484 } 2485 } 2486 /* get the vbios after the asic_funcs are set up */ 2487 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2488 r = amdgpu_device_parse_gpu_info_fw(adev); 2489 if (r) 2490 return r; 2491 2492 /* Read BIOS */ 2493 if (amdgpu_device_read_bios(adev)) { 2494 if (!amdgpu_get_bios(adev)) 2495 return -EINVAL; 2496 2497 r = amdgpu_atombios_init(adev); 2498 if (r) { 2499 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2500 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2501 return r; 2502 } 2503 } 2504 2505 /*get pf2vf msg info at it's earliest time*/ 2506 if (amdgpu_sriov_vf(adev)) 2507 amdgpu_virt_init_data_exchange(adev); 2508 2509 } 2510 } 2511 if (!total) 2512 return -ENODEV; 2513 2514 amdgpu_amdkfd_device_probe(adev); 2515 adev->cg_flags &= amdgpu_cg_mask; 2516 adev->pg_flags &= amdgpu_pg_mask; 2517 2518 return 0; 2519 } 2520 2521 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2522 { 2523 int i, r; 2524 2525 for (i = 0; i < adev->num_ip_blocks; i++) { 2526 if (!adev->ip_blocks[i].status.sw) 2527 continue; 2528 if (adev->ip_blocks[i].status.hw) 2529 continue; 2530 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2531 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2532 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2533 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2534 if (r) { 2535 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2536 adev->ip_blocks[i].version->funcs->name, r); 2537 return r; 2538 } 2539 adev->ip_blocks[i].status.hw = true; 2540 } 2541 } 2542 2543 return 0; 2544 } 2545 2546 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2547 { 2548 int i, r; 2549 2550 for (i = 0; i < adev->num_ip_blocks; i++) { 2551 if (!adev->ip_blocks[i].status.sw) 2552 continue; 2553 if (adev->ip_blocks[i].status.hw) 2554 continue; 2555 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2556 if (r) { 2557 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2558 adev->ip_blocks[i].version->funcs->name, r); 2559 return r; 2560 } 2561 adev->ip_blocks[i].status.hw = true; 2562 } 2563 2564 return 0; 2565 } 2566 2567 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2568 { 2569 int r = 0; 2570 int i; 2571 uint32_t 
smu_version; 2572 2573 if (adev->asic_type >= CHIP_VEGA10) { 2574 for (i = 0; i < adev->num_ip_blocks; i++) { 2575 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2576 continue; 2577 2578 if (!adev->ip_blocks[i].status.sw) 2579 continue; 2580 2581 /* no need to do the fw loading again if already done*/ 2582 if (adev->ip_blocks[i].status.hw == true) 2583 break; 2584 2585 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2586 r = adev->ip_blocks[i].version->funcs->resume(adev); 2587 if (r) { 2588 DRM_ERROR("resume of IP block <%s> failed %d\n", 2589 adev->ip_blocks[i].version->funcs->name, r); 2590 return r; 2591 } 2592 } else { 2593 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2594 if (r) { 2595 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2596 adev->ip_blocks[i].version->funcs->name, r); 2597 return r; 2598 } 2599 } 2600 2601 adev->ip_blocks[i].status.hw = true; 2602 break; 2603 } 2604 } 2605 2606 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2607 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2608 2609 return r; 2610 } 2611 2612 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2613 { 2614 long timeout; 2615 int r, i; 2616 2617 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2618 struct amdgpu_ring *ring = adev->rings[i]; 2619 2620 /* No need to setup the GPU scheduler for rings that don't need it */ 2621 if (!ring || ring->no_scheduler) 2622 continue; 2623 2624 switch (ring->funcs->type) { 2625 case AMDGPU_RING_TYPE_GFX: 2626 timeout = adev->gfx_timeout; 2627 break; 2628 case AMDGPU_RING_TYPE_COMPUTE: 2629 timeout = adev->compute_timeout; 2630 break; 2631 case AMDGPU_RING_TYPE_SDMA: 2632 timeout = adev->sdma_timeout; 2633 break; 2634 default: 2635 timeout = adev->video_timeout; 2636 break; 2637 } 2638 2639 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, 2640 DRM_SCHED_PRIORITY_COUNT, 2641 ring->num_hw_submission, 0, 2642 timeout, adev->reset_domain->wq, 2643 ring->sched_score, ring->name, 2644 adev->dev); 2645 if (r) { 2646 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2647 ring->name); 2648 return r; 2649 } 2650 r = amdgpu_uvd_entity_init(adev, ring); 2651 if (r) { 2652 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2653 ring->name); 2654 return r; 2655 } 2656 r = amdgpu_vce_entity_init(adev, ring); 2657 if (r) { 2658 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2659 ring->name); 2660 return r; 2661 } 2662 } 2663 2664 amdgpu_xcp_update_partition_sched_list(adev); 2665 2666 return 0; 2667 } 2668 2669 2670 /** 2671 * amdgpu_device_ip_init - run init for hardware IPs 2672 * 2673 * @adev: amdgpu_device pointer 2674 * 2675 * Main initialization pass for hardware IPs. The list of all the hardware 2676 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2677 * are run. sw_init initializes the software state associated with each IP 2678 * and hw_init initializes the hardware associated with each IP. 2679 * Returns 0 on success, negative error code on failure. 
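 *
 * Each IP block exposes these hooks through its struct amd_ip_funcs; a
 * minimal sketch of the shape this pass expects (block and function names
 * are purely illustrative) is:
 *
 *   static const struct amd_ip_funcs foo_ip_funcs = {
 *           .name    = "foo",
 *           .sw_init = foo_sw_init,
 *           .hw_init = foo_hw_init,
 *   };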
2680 */ 2681 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2682 { 2683 int i, r; 2684 2685 r = amdgpu_ras_init(adev); 2686 if (r) 2687 return r; 2688 2689 for (i = 0; i < adev->num_ip_blocks; i++) { 2690 if (!adev->ip_blocks[i].status.valid) 2691 continue; 2692 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2693 if (r) { 2694 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2695 adev->ip_blocks[i].version->funcs->name, r); 2696 goto init_failed; 2697 } 2698 adev->ip_blocks[i].status.sw = true; 2699 2700 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2701 /* need to do common hw init early so everything is set up for gmc */ 2702 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2703 if (r) { 2704 DRM_ERROR("hw_init %d failed %d\n", i, r); 2705 goto init_failed; 2706 } 2707 adev->ip_blocks[i].status.hw = true; 2708 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2709 /* need to do gmc hw init early so we can allocate gpu mem */ 2710 /* Try to reserve bad pages early */ 2711 if (amdgpu_sriov_vf(adev)) 2712 amdgpu_virt_exchange_data(adev); 2713 2714 r = amdgpu_device_mem_scratch_init(adev); 2715 if (r) { 2716 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2717 goto init_failed; 2718 } 2719 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2720 if (r) { 2721 DRM_ERROR("hw_init %d failed %d\n", i, r); 2722 goto init_failed; 2723 } 2724 r = amdgpu_device_wb_init(adev); 2725 if (r) { 2726 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2727 goto init_failed; 2728 } 2729 adev->ip_blocks[i].status.hw = true; 2730 2731 /* right after GMC hw init, we create CSA */ 2732 if (adev->gfx.mcbp) { 2733 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2734 AMDGPU_GEM_DOMAIN_VRAM | 2735 AMDGPU_GEM_DOMAIN_GTT, 2736 AMDGPU_CSA_SIZE); 2737 if (r) { 2738 DRM_ERROR("allocate CSA failed %d\n", r); 2739 goto init_failed; 2740 } 2741 } 2742 2743 r = amdgpu_seq64_init(adev); 2744 if (r) { 2745 DRM_ERROR("allocate seq64 failed %d\n", r); 2746 goto init_failed; 2747 } 2748 } 2749 } 2750 2751 if (amdgpu_sriov_vf(adev)) 2752 amdgpu_virt_init_data_exchange(adev); 2753 2754 r = amdgpu_ib_pool_init(adev); 2755 if (r) { 2756 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2757 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2758 goto init_failed; 2759 } 2760 2761 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2762 if (r) 2763 goto init_failed; 2764 2765 r = amdgpu_device_ip_hw_init_phase1(adev); 2766 if (r) 2767 goto init_failed; 2768 2769 r = amdgpu_device_fw_loading(adev); 2770 if (r) 2771 goto init_failed; 2772 2773 r = amdgpu_device_ip_hw_init_phase2(adev); 2774 if (r) 2775 goto init_failed; 2776 2777 /* 2778 * retired pages will be loaded from eeprom and reserved here, 2779 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2780 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2781 * for I2C communication which only true at this point. 2782 * 2783 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2784 * failure from bad gpu situation and stop amdgpu init process 2785 * accordingly. For other failed cases, it will still release all 2786 * the resource and print error message, rather than returning one 2787 * negative value to upper level. 
	 *
	 * Note: theoretically, this should be called before all vram allocations
	 * to protect retired pages from being abused
	 */
	r = amdgpu_ras_recovery_init(adev);
	if (r)
		goto init_failed;

	/*
	 * In case of XGMI grab extra reference for reset domain for this device
	 */
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (amdgpu_xgmi_add_device(adev) == 0) {
			if (!amdgpu_sriov_vf(adev)) {
				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

				if (WARN_ON(!hive)) {
					r = -ENOENT;
					goto init_failed;
				}

				if (!hive->reset_domain ||
				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
					r = -ENOENT;
					amdgpu_put_xgmi_hive(hive);
					goto init_failed;
				}

				/* Drop the early temporary reset domain we created for device */
				amdgpu_reset_put_reset_domain(adev->reset_domain);
				adev->reset_domain = hive->reset_domain;
				amdgpu_put_xgmi_hive(hive);
			}
		}
	}

	r = amdgpu_device_init_schedulers(adev);
	if (r)
		goto init_failed;

	if (adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);

	/* Don't init kfd if the whole hive needs to be reset during init */
	if (!adev->gmc.xgmi.pending_reset) {
		kgd2kfd_init_zone_device(adev);
		amdgpu_amdkfd_device_init(adev);
	}

	amdgpu_fru_get_product_info(adev);

init_failed:

	return r;
}

/**
 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
 *
 * @adev: amdgpu_device pointer
 *
 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
 * this function before a GPU reset. If the value is retained after a
 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
 */
static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
{
	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
}

/**
 * amdgpu_device_check_vram_lost - check if vram is valid
 *
 * @adev: amdgpu_device pointer
 *
 * Checks the reset magic value written to the gart pointer in VRAM.
 * The driver calls this after a GPU reset to see if the contents of
 * VRAM are lost or not.
 * returns true if vram is lost, false if not.
 */
static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
{
	if (memcmp(adev->gart.ptr, adev->reset_magic,
		   AMDGPU_RESET_MAGIC_NUM))
		return true;

	if (!amdgpu_in_reset(adev))
		return false;

	/*
	 * For all ASICs with baco/mode1 reset, the VRAM is
	 * always assumed to be lost.
	 */
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_BACO:
	case AMD_RESET_METHOD_MODE1:
		return true;
	default:
		return false;
	}
}

/**
 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 * @state: clockgating state (gate or ungate)
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * set_clockgating_state callbacks are run: the late initialization pass
 * enables clockgating for hardware IPs, while the fini or suspend pass
 * disables it.
 * Returns 0 on success, negative error code on failure.
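 *
 * For example, amdgpu_device_ip_late_init() below calls
 * amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE) to enable clockgating,
 * while amdgpu_device_ip_fini_early() calls it with AMD_CG_STATE_UNGATE to
 * disable clockgating again before hardware teardown.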
2901 */ 2902 2903 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2904 enum amd_clockgating_state state) 2905 { 2906 int i, j, r; 2907 2908 if (amdgpu_emu_mode == 1) 2909 return 0; 2910 2911 for (j = 0; j < adev->num_ip_blocks; j++) { 2912 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2913 if (!adev->ip_blocks[i].status.late_initialized) 2914 continue; 2915 /* skip CG for GFX, SDMA on S0ix */ 2916 if (adev->in_s0ix && 2917 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2918 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2919 continue; 2920 /* skip CG for VCE/UVD, it's handled specially */ 2921 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2922 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2923 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2924 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2925 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2926 /* enable clockgating to save power */ 2927 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2928 state); 2929 if (r) { 2930 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2931 adev->ip_blocks[i].version->funcs->name, r); 2932 return r; 2933 } 2934 } 2935 } 2936 2937 return 0; 2938 } 2939 2940 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2941 enum amd_powergating_state state) 2942 { 2943 int i, j, r; 2944 2945 if (amdgpu_emu_mode == 1) 2946 return 0; 2947 2948 for (j = 0; j < adev->num_ip_blocks; j++) { 2949 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2950 if (!adev->ip_blocks[i].status.late_initialized) 2951 continue; 2952 /* skip PG for GFX, SDMA on S0ix */ 2953 if (adev->in_s0ix && 2954 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2955 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2956 continue; 2957 /* skip CG for VCE/UVD, it's handled specially */ 2958 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2959 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2960 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2961 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2962 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2963 /* enable powergating to save power */ 2964 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2965 state); 2966 if (r) { 2967 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2968 adev->ip_blocks[i].version->funcs->name, r); 2969 return r; 2970 } 2971 } 2972 } 2973 return 0; 2974 } 2975 2976 static int amdgpu_device_enable_mgpu_fan_boost(void) 2977 { 2978 struct amdgpu_gpu_instance *gpu_ins; 2979 struct amdgpu_device *adev; 2980 int i, ret = 0; 2981 2982 mutex_lock(&mgpu_info.mutex); 2983 2984 /* 2985 * MGPU fan boost feature should be enabled 2986 * only when there are two or more dGPUs in 2987 * the system 2988 */ 2989 if (mgpu_info.num_dgpu < 2) 2990 goto out; 2991 2992 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2993 gpu_ins = &(mgpu_info.gpu_ins[i]); 2994 adev = gpu_ins->adev; 2995 if (!(adev->flags & AMD_IS_APU) && 2996 !gpu_ins->mgpu_fan_enabled) { 2997 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2998 if (ret) 2999 break; 3000 3001 gpu_ins->mgpu_fan_enabled = 1; 3002 } 3003 } 3004 3005 out: 3006 mutex_unlock(&mgpu_info.mutex); 3007 3008 return ret; 3009 } 3010 3011 /** 3012 * amdgpu_device_ip_late_init - run late init for hardware IPs 3013 * 3014 * @adev: 
amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of the IP blocks have been initialized, or anything that needs
 * to happen late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	r = amdgpu_ras_late_init(adev);
	if (r) {
		DRM_ERROR("amdgpu_ras_late_init failed %d", r);
		return r;
	}

	amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* For passthrough configurations on arcturus and aldebaran, enable special SBR handling */
	if (amdgpu_passthrough(adev) &&
	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
	     adev->asic_type == CHIP_ALDEBARAN))
		amdgpu_dpm_handle_passthrough_sbr(adev, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset device p-state to low as this was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, the number of devices in a hive is not known in
		 * advance; it is only counted one by one as the devices
		 * initialize.
		 *
		 * So, we wait until all XGMI interlinked devices have initialized.
		 * This may bring some delays as those devices may come from
		 * different hives. But that should be OK.
3080 */ 3081 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3082 for (i = 0; i < mgpu_info.num_gpu; i++) { 3083 gpu_instance = &(mgpu_info.gpu_ins[i]); 3084 if (gpu_instance->adev->flags & AMD_IS_APU) 3085 continue; 3086 3087 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3088 AMDGPU_XGMI_PSTATE_MIN); 3089 if (r) { 3090 DRM_ERROR("pstate setting failed (%d).\n", r); 3091 break; 3092 } 3093 } 3094 } 3095 3096 mutex_unlock(&mgpu_info.mutex); 3097 } 3098 3099 return 0; 3100 } 3101 3102 /** 3103 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3104 * 3105 * @adev: amdgpu_device pointer 3106 * 3107 * For ASICs need to disable SMC first 3108 */ 3109 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3110 { 3111 int i, r; 3112 3113 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3114 return; 3115 3116 for (i = 0; i < adev->num_ip_blocks; i++) { 3117 if (!adev->ip_blocks[i].status.hw) 3118 continue; 3119 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3120 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3121 /* XXX handle errors */ 3122 if (r) { 3123 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3124 adev->ip_blocks[i].version->funcs->name, r); 3125 } 3126 adev->ip_blocks[i].status.hw = false; 3127 break; 3128 } 3129 } 3130 } 3131 3132 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3133 { 3134 int i, r; 3135 3136 for (i = 0; i < adev->num_ip_blocks; i++) { 3137 if (!adev->ip_blocks[i].version->funcs->early_fini) 3138 continue; 3139 3140 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 3141 if (r) { 3142 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3143 adev->ip_blocks[i].version->funcs->name, r); 3144 } 3145 } 3146 3147 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3148 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3149 3150 amdgpu_amdkfd_suspend(adev, false); 3151 3152 /* Workaroud for ASICs need to disable SMC first */ 3153 amdgpu_device_smu_fini_early(adev); 3154 3155 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3156 if (!adev->ip_blocks[i].status.hw) 3157 continue; 3158 3159 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3160 /* XXX handle errors */ 3161 if (r) { 3162 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3163 adev->ip_blocks[i].version->funcs->name, r); 3164 } 3165 3166 adev->ip_blocks[i].status.hw = false; 3167 } 3168 3169 if (amdgpu_sriov_vf(adev)) { 3170 if (amdgpu_virt_release_full_gpu(adev, false)) 3171 DRM_ERROR("failed to release exclusive mode on fini\n"); 3172 } 3173 3174 return 0; 3175 } 3176 3177 /** 3178 * amdgpu_device_ip_fini - run fini for hardware IPs 3179 * 3180 * @adev: amdgpu_device pointer 3181 * 3182 * Main teardown pass for hardware IPs. The list of all the hardware 3183 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3184 * are run. hw_fini tears down the hardware associated with each IP 3185 * and sw_fini tears down any software state associated with each IP. 3186 * Returns 0 on success, negative error code on failure. 
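 * Blocks are torn down in the reverse of the order in which they were added,
 * so the last IP block initialized is the first one finalized.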
3187 */ 3188 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3189 { 3190 int i, r; 3191 3192 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3193 amdgpu_virt_release_ras_err_handler_data(adev); 3194 3195 if (adev->gmc.xgmi.num_physical_nodes > 1) 3196 amdgpu_xgmi_remove_device(adev); 3197 3198 amdgpu_amdkfd_device_fini_sw(adev); 3199 3200 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3201 if (!adev->ip_blocks[i].status.sw) 3202 continue; 3203 3204 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3205 amdgpu_ucode_free_bo(adev); 3206 amdgpu_free_static_csa(&adev->virt.csa_obj); 3207 amdgpu_device_wb_fini(adev); 3208 amdgpu_device_mem_scratch_fini(adev); 3209 amdgpu_ib_pool_fini(adev); 3210 amdgpu_seq64_fini(adev); 3211 } 3212 3213 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 3214 /* XXX handle errors */ 3215 if (r) { 3216 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3217 adev->ip_blocks[i].version->funcs->name, r); 3218 } 3219 adev->ip_blocks[i].status.sw = false; 3220 adev->ip_blocks[i].status.valid = false; 3221 } 3222 3223 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3224 if (!adev->ip_blocks[i].status.late_initialized) 3225 continue; 3226 if (adev->ip_blocks[i].version->funcs->late_fini) 3227 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 3228 adev->ip_blocks[i].status.late_initialized = false; 3229 } 3230 3231 amdgpu_ras_fini(adev); 3232 3233 return 0; 3234 } 3235 3236 /** 3237 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3238 * 3239 * @work: work_struct. 3240 */ 3241 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3242 { 3243 struct amdgpu_device *adev = 3244 container_of(work, struct amdgpu_device, delayed_init_work.work); 3245 int r; 3246 3247 r = amdgpu_ib_ring_tests(adev); 3248 if (r) 3249 DRM_ERROR("ib ring test failed (%d).\n", r); 3250 } 3251 3252 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3253 { 3254 struct amdgpu_device *adev = 3255 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3256 3257 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3258 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3259 3260 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3261 adev->gfx.gfx_off_state = true; 3262 } 3263 3264 /** 3265 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3266 * 3267 * @adev: amdgpu_device pointer 3268 * 3269 * Main suspend function for hardware IPs. The list of all the hardware 3270 * IPs that make up the asic is walked, clockgating is disabled and the 3271 * suspend callbacks are run. suspend puts the hardware and software state 3272 * in each IP into a state suitable for suspend. 3273 * Returns 0 on success, negative error code on failure. 3274 */ 3275 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3276 { 3277 int i, r; 3278 3279 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3280 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3281 3282 /* 3283 * Per PMFW team's suggestion, driver needs to handle gfxoff 3284 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3285 * scenario. Add the missing df cstate disablement here. 
3286 */ 3287 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3288 dev_warn(adev->dev, "Failed to disallow df cstate"); 3289 3290 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3291 if (!adev->ip_blocks[i].status.valid) 3292 continue; 3293 3294 /* displays are handled separately */ 3295 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3296 continue; 3297 3298 /* XXX handle errors */ 3299 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3300 /* XXX handle errors */ 3301 if (r) { 3302 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3303 adev->ip_blocks[i].version->funcs->name, r); 3304 return r; 3305 } 3306 3307 adev->ip_blocks[i].status.hw = false; 3308 } 3309 3310 return 0; 3311 } 3312 3313 /** 3314 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3315 * 3316 * @adev: amdgpu_device pointer 3317 * 3318 * Main suspend function for hardware IPs. The list of all the hardware 3319 * IPs that make up the asic is walked, clockgating is disabled and the 3320 * suspend callbacks are run. suspend puts the hardware and software state 3321 * in each IP into a state suitable for suspend. 3322 * Returns 0 on success, negative error code on failure. 3323 */ 3324 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3325 { 3326 int i, r; 3327 3328 if (adev->in_s0ix) 3329 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3330 3331 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3332 if (!adev->ip_blocks[i].status.valid) 3333 continue; 3334 /* displays are handled in phase1 */ 3335 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3336 continue; 3337 /* PSP lost connection when err_event_athub occurs */ 3338 if (amdgpu_ras_intr_triggered() && 3339 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3340 adev->ip_blocks[i].status.hw = false; 3341 continue; 3342 } 3343 3344 /* skip unnecessary suspend if we do not initialize them yet */ 3345 if (adev->gmc.xgmi.pending_reset && 3346 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3347 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3348 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3349 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3350 adev->ip_blocks[i].status.hw = false; 3351 continue; 3352 } 3353 3354 /* skip suspend of gfx/mes and psp for S0ix 3355 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3356 * like at runtime. PSP is also part of the always on hardware 3357 * so no need to suspend it. 3358 */ 3359 if (adev->in_s0ix && 3360 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3361 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3362 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3363 continue; 3364 3365 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3366 if (adev->in_s0ix && 3367 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3368 IP_VERSION(5, 0, 0)) && 3369 (adev->ip_blocks[i].version->type == 3370 AMD_IP_BLOCK_TYPE_SDMA)) 3371 continue; 3372 3373 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3374 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3375 * from this location and RLC Autoload automatically also gets loaded 3376 * from here based on PMFW -> PSP message during re-init sequence. 3377 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3378 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3379 */ 3380 if (amdgpu_in_reset(adev) && 3381 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3382 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3383 continue; 3384 3385 /* XXX handle errors */ 3386 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3387 /* XXX handle errors */ 3388 if (r) { 3389 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3390 adev->ip_blocks[i].version->funcs->name, r); 3391 } 3392 adev->ip_blocks[i].status.hw = false; 3393 /* handle putting the SMC in the appropriate state */ 3394 if (!amdgpu_sriov_vf(adev)) { 3395 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3396 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3397 if (r) { 3398 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3399 adev->mp1_state, r); 3400 return r; 3401 } 3402 } 3403 } 3404 } 3405 3406 return 0; 3407 } 3408 3409 /** 3410 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3411 * 3412 * @adev: amdgpu_device pointer 3413 * 3414 * Main suspend function for hardware IPs. The list of all the hardware 3415 * IPs that make up the asic is walked, clockgating is disabled and the 3416 * suspend callbacks are run. suspend puts the hardware and software state 3417 * in each IP into a state suitable for suspend. 3418 * Returns 0 on success, negative error code on failure. 3419 */ 3420 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3421 { 3422 int r; 3423 3424 if (amdgpu_sriov_vf(adev)) { 3425 amdgpu_virt_fini_data_exchange(adev); 3426 amdgpu_virt_request_full_gpu(adev, false); 3427 } 3428 3429 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3430 3431 r = amdgpu_device_ip_suspend_phase1(adev); 3432 if (r) 3433 return r; 3434 r = amdgpu_device_ip_suspend_phase2(adev); 3435 3436 if (amdgpu_sriov_vf(adev)) 3437 amdgpu_virt_release_full_gpu(adev, false); 3438 3439 return r; 3440 } 3441 3442 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3443 { 3444 int i, r; 3445 3446 static enum amd_ip_block_type ip_order[] = { 3447 AMD_IP_BLOCK_TYPE_COMMON, 3448 AMD_IP_BLOCK_TYPE_GMC, 3449 AMD_IP_BLOCK_TYPE_PSP, 3450 AMD_IP_BLOCK_TYPE_IH, 3451 }; 3452 3453 for (i = 0; i < adev->num_ip_blocks; i++) { 3454 int j; 3455 struct amdgpu_ip_block *block; 3456 3457 block = &adev->ip_blocks[i]; 3458 block->status.hw = false; 3459 3460 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3461 3462 if (block->version->type != ip_order[j] || 3463 !block->status.valid) 3464 continue; 3465 3466 r = block->version->funcs->hw_init(adev); 3467 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3468 if (r) 3469 return r; 3470 block->status.hw = true; 3471 } 3472 } 3473 3474 return 0; 3475 } 3476 3477 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3478 { 3479 int i, r; 3480 3481 static enum amd_ip_block_type ip_order[] = { 3482 AMD_IP_BLOCK_TYPE_SMC, 3483 AMD_IP_BLOCK_TYPE_DCE, 3484 AMD_IP_BLOCK_TYPE_GFX, 3485 AMD_IP_BLOCK_TYPE_SDMA, 3486 AMD_IP_BLOCK_TYPE_MES, 3487 AMD_IP_BLOCK_TYPE_UVD, 3488 AMD_IP_BLOCK_TYPE_VCE, 3489 AMD_IP_BLOCK_TYPE_VCN, 3490 AMD_IP_BLOCK_TYPE_JPEG 3491 }; 3492 3493 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3494 int j; 3495 struct amdgpu_ip_block *block; 3496 3497 for (j = 0; j < adev->num_ip_blocks; j++) { 3498 block = &adev->ip_blocks[j]; 3499 3500 if (block->version->type != ip_order[i] || 3501 !block->status.valid || 3502 block->status.hw) 3503 continue; 3504 3505 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3506 r = block->version->funcs->resume(adev); 3507 else 
				r = block->version->funcs->hw_init(adev);

			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
			if (r)
				return r;
			block->status.hw = true;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * First resume function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the resume callbacks are run for
 * COMMON, GMC, and IH. resume puts the hardware into a functional state
 * after a suspend and updates the software state as necessary. This
 * function is also used for restoring the GPU after a GPU reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {

			r = adev->ip_blocks[i].version->funcs->resume(adev);
			if (r) {
				DRM_ERROR("resume of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
			adev->ip_blocks[i].status.hw = true;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Second resume function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the resume callbacks are run for
 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
 * functional state after a suspend and updates the software state as
 * necessary. This function is also used for restoring the GPU after a GPU
 * reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
			continue;
		r = adev->ip_blocks[i].version->funcs->resume(adev);
		if (r) {
			DRM_ERROR("resume of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
			return r;
		}
		adev->ip_blocks[i].status.hw = true;
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs. The hardware IPs
 * are split into two resume functions because they are
 * also used in recovering from a GPU reset and some additional
 * steps need to be taken between them. In this case (S3/S4) they are
 * run sequentially.
 * Returns 0 on success, negative error code on failure.
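 * The resulting call flow, as implemented below, is:
 *
 *   amdgpu_device_ip_resume_phase1()   (COMMON, GMC, IH, PSP on SR-IOV)
 *   amdgpu_device_fw_loading()
 *   amdgpu_device_ip_resume_phase2()   (all remaining blocks)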
3605 */ 3606 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3607 { 3608 int r; 3609 3610 r = amdgpu_device_ip_resume_phase1(adev); 3611 if (r) 3612 return r; 3613 3614 r = amdgpu_device_fw_loading(adev); 3615 if (r) 3616 return r; 3617 3618 r = amdgpu_device_ip_resume_phase2(adev); 3619 3620 if (adev->mman.buffer_funcs_ring->sched.ready) 3621 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3622 3623 return r; 3624 } 3625 3626 /** 3627 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3628 * 3629 * @adev: amdgpu_device pointer 3630 * 3631 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3632 */ 3633 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3634 { 3635 if (amdgpu_sriov_vf(adev)) { 3636 if (adev->is_atom_fw) { 3637 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3638 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3639 } else { 3640 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3641 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3642 } 3643 3644 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3645 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3646 } 3647 } 3648 3649 /** 3650 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3651 * 3652 * @asic_type: AMD asic type 3653 * 3654 * Check if there is DC (new modesetting infrastructre) support for an asic. 3655 * returns true if DC has support, false if not. 3656 */ 3657 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3658 { 3659 switch (asic_type) { 3660 #ifdef CONFIG_DRM_AMDGPU_SI 3661 case CHIP_HAINAN: 3662 #endif 3663 case CHIP_TOPAZ: 3664 /* chips with no display hardware */ 3665 return false; 3666 #if defined(CONFIG_DRM_AMD_DC) 3667 case CHIP_TAHITI: 3668 case CHIP_PITCAIRN: 3669 case CHIP_VERDE: 3670 case CHIP_OLAND: 3671 /* 3672 * We have systems in the wild with these ASICs that require 3673 * LVDS and VGA support which is not supported with DC. 3674 * 3675 * Fallback to the non-DC driver here by default so as not to 3676 * cause regressions. 3677 */ 3678 #if defined(CONFIG_DRM_AMD_DC_SI) 3679 return amdgpu_dc > 0; 3680 #else 3681 return false; 3682 #endif 3683 case CHIP_BONAIRE: 3684 case CHIP_KAVERI: 3685 case CHIP_KABINI: 3686 case CHIP_MULLINS: 3687 /* 3688 * We have systems in the wild with these ASICs that require 3689 * VGA support which is not supported with DC. 3690 * 3691 * Fallback to the non-DC driver here by default so as not to 3692 * cause regressions. 
3693 */ 3694 return amdgpu_dc > 0; 3695 default: 3696 return amdgpu_dc != 0; 3697 #else 3698 default: 3699 if (amdgpu_dc > 0) 3700 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3701 return false; 3702 #endif 3703 } 3704 } 3705 3706 /** 3707 * amdgpu_device_has_dc_support - check if dc is supported 3708 * 3709 * @adev: amdgpu_device pointer 3710 * 3711 * Returns true for supported, false for not supported 3712 */ 3713 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3714 { 3715 if (adev->enable_virtual_display || 3716 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3717 return false; 3718 3719 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3720 } 3721 3722 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3723 { 3724 struct amdgpu_device *adev = 3725 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3726 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3727 3728 /* It's a bug to not have a hive within this function */ 3729 if (WARN_ON(!hive)) 3730 return; 3731 3732 /* 3733 * Use task barrier to synchronize all xgmi reset works across the 3734 * hive. task_barrier_enter and task_barrier_exit will block 3735 * until all the threads running the xgmi reset works reach 3736 * those points. task_barrier_full will do both blocks. 3737 */ 3738 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3739 3740 task_barrier_enter(&hive->tb); 3741 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3742 3743 if (adev->asic_reset_res) 3744 goto fail; 3745 3746 task_barrier_exit(&hive->tb); 3747 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3748 3749 if (adev->asic_reset_res) 3750 goto fail; 3751 3752 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3753 } else { 3754 3755 task_barrier_full(&hive->tb); 3756 adev->asic_reset_res = amdgpu_asic_reset(adev); 3757 } 3758 3759 fail: 3760 if (adev->asic_reset_res) 3761 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3762 adev->asic_reset_res, adev_to_drm(adev)->unique); 3763 amdgpu_put_xgmi_hive(hive); 3764 } 3765 3766 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3767 { 3768 char *input = amdgpu_lockup_timeout; 3769 char *timeout_setting = NULL; 3770 int index = 0; 3771 long timeout; 3772 int ret = 0; 3773 3774 /* 3775 * By default timeout for non compute jobs is 10000 3776 * and 60000 for compute jobs. 3777 * In SR-IOV or passthrough mode, timeout for compute 3778 * jobs are 60000 by default. 3779 */ 3780 adev->gfx_timeout = msecs_to_jiffies(10000); 3781 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3782 if (amdgpu_sriov_vf(adev)) 3783 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3784 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3785 else 3786 adev->compute_timeout = msecs_to_jiffies(60000); 3787 3788 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3789 while ((timeout_setting = strsep(&input, ",")) && 3790 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3791 ret = kstrtol(timeout_setting, 0, &timeout); 3792 if (ret) 3793 return ret; 3794 3795 if (timeout == 0) { 3796 index++; 3797 continue; 3798 } else if (timeout < 0) { 3799 timeout = MAX_SCHEDULE_TIMEOUT; 3800 dev_warn(adev->dev, "lockup timeout disabled"); 3801 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3802 } else { 3803 timeout = msecs_to_jiffies(timeout); 3804 } 3805 3806 switch (index++) { 3807 case 0: 3808 adev->gfx_timeout = timeout; 3809 break; 3810 case 1: 3811 adev->compute_timeout = timeout; 3812 break; 3813 case 2: 3814 adev->sdma_timeout = timeout; 3815 break; 3816 case 3: 3817 adev->video_timeout = timeout; 3818 break; 3819 default: 3820 break; 3821 } 3822 } 3823 /* 3824 * There is only one value specified and 3825 * it should apply to all non-compute jobs. 3826 */ 3827 if (index == 1) { 3828 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3829 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3830 adev->compute_timeout = adev->gfx_timeout; 3831 } 3832 } 3833 3834 return ret; 3835 } 3836 3837 /** 3838 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3839 * 3840 * @adev: amdgpu_device pointer 3841 * 3842 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3843 */ 3844 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3845 { 3846 struct iommu_domain *domain; 3847 3848 domain = iommu_get_domain_for_dev(adev->dev); 3849 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3850 adev->ram_is_direct_mapped = true; 3851 } 3852 3853 static const struct attribute *amdgpu_dev_attributes[] = { 3854 &dev_attr_pcie_replay_count.attr, 3855 NULL 3856 }; 3857 3858 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3859 { 3860 if (amdgpu_mcbp == 1) 3861 adev->gfx.mcbp = true; 3862 else if (amdgpu_mcbp == 0) 3863 adev->gfx.mcbp = false; 3864 else if ((amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 0, 0)) && 3865 (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(10, 0, 0)) && 3866 adev->gfx.num_gfx_rings) 3867 adev->gfx.mcbp = true; 3868 3869 if (amdgpu_sriov_vf(adev)) 3870 adev->gfx.mcbp = true; 3871 3872 if (adev->gfx.mcbp) 3873 DRM_INFO("MCBP is enabled\n"); 3874 } 3875 3876 /** 3877 * amdgpu_device_init - initialize the driver 3878 * 3879 * @adev: amdgpu_device pointer 3880 * @flags: driver flags 3881 * 3882 * Initializes the driver info and hw (all asics). 3883 * Returns 0 for success or an error on failure. 3884 * Called at driver startup. 
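 * (Expected to be invoked once per device from the PCI probe / KMS load path.)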
3885 */ 3886 int amdgpu_device_init(struct amdgpu_device *adev, 3887 uint32_t flags) 3888 { 3889 struct drm_device *ddev = adev_to_drm(adev); 3890 struct pci_dev *pdev = adev->pdev; 3891 int r, i; 3892 bool px = false; 3893 u32 max_MBps; 3894 int tmp; 3895 3896 adev->shutdown = false; 3897 adev->flags = flags; 3898 3899 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3900 adev->asic_type = amdgpu_force_asic_type; 3901 else 3902 adev->asic_type = flags & AMD_ASIC_MASK; 3903 3904 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3905 if (amdgpu_emu_mode == 1) 3906 adev->usec_timeout *= 10; 3907 adev->gmc.gart_size = 512 * 1024 * 1024; 3908 adev->accel_working = false; 3909 adev->num_rings = 0; 3910 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3911 adev->mman.buffer_funcs = NULL; 3912 adev->mman.buffer_funcs_ring = NULL; 3913 adev->vm_manager.vm_pte_funcs = NULL; 3914 adev->vm_manager.vm_pte_num_scheds = 0; 3915 adev->gmc.gmc_funcs = NULL; 3916 adev->harvest_ip_mask = 0x0; 3917 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3918 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3919 3920 adev->smc_rreg = &amdgpu_invalid_rreg; 3921 adev->smc_wreg = &amdgpu_invalid_wreg; 3922 adev->pcie_rreg = &amdgpu_invalid_rreg; 3923 adev->pcie_wreg = &amdgpu_invalid_wreg; 3924 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3925 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3926 adev->pciep_rreg = &amdgpu_invalid_rreg; 3927 adev->pciep_wreg = &amdgpu_invalid_wreg; 3928 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3929 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3930 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 3931 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 3932 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3933 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3934 adev->didt_rreg = &amdgpu_invalid_rreg; 3935 adev->didt_wreg = &amdgpu_invalid_wreg; 3936 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3937 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3938 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3939 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3940 3941 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3942 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3943 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3944 3945 /* mutex initialization are all done here so we 3946 * can recall function without having locking issues 3947 */ 3948 mutex_init(&adev->firmware.mutex); 3949 mutex_init(&adev->pm.mutex); 3950 mutex_init(&adev->gfx.gpu_clock_mutex); 3951 mutex_init(&adev->srbm_mutex); 3952 mutex_init(&adev->gfx.pipe_reserve_mutex); 3953 mutex_init(&adev->gfx.gfx_off_mutex); 3954 mutex_init(&adev->gfx.partition_mutex); 3955 mutex_init(&adev->grbm_idx_mutex); 3956 mutex_init(&adev->mn_lock); 3957 mutex_init(&adev->virt.vf_errors.lock); 3958 hash_init(adev->mn_hash); 3959 mutex_init(&adev->psp.mutex); 3960 mutex_init(&adev->notifier_lock); 3961 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3962 mutex_init(&adev->benchmark_mutex); 3963 3964 amdgpu_device_init_apu_flags(adev); 3965 3966 r = amdgpu_device_check_arguments(adev); 3967 if (r) 3968 return r; 3969 3970 spin_lock_init(&adev->mmio_idx_lock); 3971 spin_lock_init(&adev->smc_idx_lock); 3972 spin_lock_init(&adev->pcie_idx_lock); 3973 spin_lock_init(&adev->uvd_ctx_idx_lock); 3974 spin_lock_init(&adev->didt_idx_lock); 3975 spin_lock_init(&adev->gc_cac_idx_lock); 3976 spin_lock_init(&adev->se_cac_idx_lock); 
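	/*
	 * Editorial note (illustrative, not from the original source): the
	 * spinlocks initialized around here serialize the index/data style
	 * register accessors installed above (smc, pcie, uvd_ctx, didt,
	 * gc_cac, audio_endpt). A typical accessor is expected to look like:
	 *
	 *   spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	 *   ... program the index register, access the data register ...
	 *   spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	 */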
3977 spin_lock_init(&adev->audio_endpt_idx_lock); 3978 spin_lock_init(&adev->mm_stats.lock); 3979 3980 INIT_LIST_HEAD(&adev->shadow_list); 3981 mutex_init(&adev->shadow_list_lock); 3982 3983 INIT_LIST_HEAD(&adev->reset_list); 3984 3985 INIT_LIST_HEAD(&adev->ras_list); 3986 3987 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 3988 3989 INIT_DELAYED_WORK(&adev->delayed_init_work, 3990 amdgpu_device_delayed_init_work_handler); 3991 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3992 amdgpu_device_delay_enable_gfx_off); 3993 3994 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3995 3996 adev->gfx.gfx_off_req_count = 1; 3997 adev->gfx.gfx_off_residency = 0; 3998 adev->gfx.gfx_off_entrycount = 0; 3999 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4000 4001 atomic_set(&adev->throttling_logging_enabled, 1); 4002 /* 4003 * If throttling continues, logging will be performed every minute 4004 * to avoid log flooding. "-1" is subtracted since the thermal 4005 * throttling interrupt comes every second. Thus, the total logging 4006 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 4007 * for throttling interrupt) = 60 seconds. 4008 */ 4009 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4010 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4011 4012 /* Registers mapping */ 4013 /* TODO: block userspace mapping of io register */ 4014 if (adev->asic_type >= CHIP_BONAIRE) { 4015 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4016 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4017 } else { 4018 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4019 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4020 } 4021 4022 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4023 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4024 4025 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4026 if (!adev->rmmio) 4027 return -ENOMEM; 4028 4029 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 4030 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 4031 4032 /* 4033 * Reset domain needs to be present early, before the XGMI hive is 4034 * discovered (if any) and initialized, so the reset sem and in_gpu_reset 4035 * flag can be used early during init and before calling RREG32.
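 * (Editorial note: concretely, amdgpu_reset_create_reset_domain() below is
 * expected to run before amdgpu_detect_virtualization(),
 * amdgpu_device_get_pcie_info() and the IP early-init step, which may
 * already perform register access.)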
4036 */ 4037 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4038 if (!adev->reset_domain) 4039 return -ENOMEM; 4040 4041 /* detect hw virtualization here */ 4042 amdgpu_detect_virtualization(adev); 4043 4044 amdgpu_device_get_pcie_info(adev); 4045 4046 r = amdgpu_device_get_job_timeout_settings(adev); 4047 if (r) { 4048 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4049 return r; 4050 } 4051 4052 /* early init functions */ 4053 r = amdgpu_device_ip_early_init(adev); 4054 if (r) 4055 return r; 4056 4057 amdgpu_device_set_mcbp(adev); 4058 4059 /* Get rid of things like offb */ 4060 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 4061 if (r) 4062 return r; 4063 4064 /* Enable TMZ based on IP_VERSION */ 4065 amdgpu_gmc_tmz_set(adev); 4066 4067 amdgpu_gmc_noretry_set(adev); 4068 /* Need to get xgmi info early to decide the reset behavior*/ 4069 if (adev->gmc.xgmi.supported) { 4070 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4071 if (r) 4072 return r; 4073 } 4074 4075 /* enable PCIE atomic ops */ 4076 if (amdgpu_sriov_vf(adev)) { 4077 if (adev->virt.fw_reserve.p_pf2vf) 4078 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4079 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4080 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4081 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 4082 * internal path natively support atomics, set have_atomics_support to true. 4083 */ 4084 } else if ((adev->flags & AMD_IS_APU) && 4085 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4086 IP_VERSION(9, 0, 0))) { 4087 adev->have_atomics_support = true; 4088 } else { 4089 adev->have_atomics_support = 4090 !pci_enable_atomic_ops_to_root(adev->pdev, 4091 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4092 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4093 } 4094 4095 if (!adev->have_atomics_support) 4096 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4097 4098 /* doorbell bar mapping and doorbell index init*/ 4099 amdgpu_doorbell_init(adev); 4100 4101 if (amdgpu_emu_mode == 1) { 4102 /* post the asic on emulation mode */ 4103 emu_soc_asic_init(adev); 4104 goto fence_driver_init; 4105 } 4106 4107 amdgpu_reset_init(adev); 4108 4109 /* detect if we are with an SRIOV vbios */ 4110 if (adev->bios) 4111 amdgpu_device_detect_sriov_bios(adev); 4112 4113 /* check if we need to reset the asic 4114 * E.g., driver was not cleanly unloaded previously, etc. 
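 * (Editorial note: typical triggers are assumed to be a previous driver
 * crash or a kexec/kdump kernel leaving the ASIC partially initialized;
 * the exact condition is decided by amdgpu_asic_need_reset_on_init() below.)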
4115 */ 4116 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4117 if (adev->gmc.xgmi.num_physical_nodes) { 4118 dev_info(adev->dev, "Pending hive reset.\n"); 4119 adev->gmc.xgmi.pending_reset = true; 4120 /* Only need to init necessary block for SMU to handle the reset */ 4121 for (i = 0; i < adev->num_ip_blocks; i++) { 4122 if (!adev->ip_blocks[i].status.valid) 4123 continue; 4124 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4125 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4126 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4127 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 4128 DRM_DEBUG("IP %s disabled for hw_init.\n", 4129 adev->ip_blocks[i].version->funcs->name); 4130 adev->ip_blocks[i].status.hw = true; 4131 } 4132 } 4133 } else { 4134 switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { 4135 case IP_VERSION(13, 0, 0): 4136 case IP_VERSION(13, 0, 7): 4137 case IP_VERSION(13, 0, 10): 4138 r = psp_gpu_reset(adev); 4139 break; 4140 default: 4141 tmp = amdgpu_reset_method; 4142 /* It should do a default reset when loading or reloading the driver, 4143 * regardless of the module parameter reset_method. 4144 */ 4145 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4146 r = amdgpu_asic_reset(adev); 4147 amdgpu_reset_method = tmp; 4148 break; 4149 } 4150 4151 if (r) { 4152 dev_err(adev->dev, "asic reset on init failed\n"); 4153 goto failed; 4154 } 4155 } 4156 } 4157 4158 /* Post card if necessary */ 4159 if (amdgpu_device_need_post(adev)) { 4160 if (!adev->bios) { 4161 dev_err(adev->dev, "no vBIOS found\n"); 4162 r = -EINVAL; 4163 goto failed; 4164 } 4165 DRM_INFO("GPU posting now...\n"); 4166 r = amdgpu_device_asic_init(adev); 4167 if (r) { 4168 dev_err(adev->dev, "gpu post error!\n"); 4169 goto failed; 4170 } 4171 } 4172 4173 if (adev->bios) { 4174 if (adev->is_atom_fw) { 4175 /* Initialize clocks */ 4176 r = amdgpu_atomfirmware_get_clock_info(adev); 4177 if (r) { 4178 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4179 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4180 goto failed; 4181 } 4182 } else { 4183 /* Initialize clocks */ 4184 r = amdgpu_atombios_get_clock_info(adev); 4185 if (r) { 4186 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4187 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4188 goto failed; 4189 } 4190 /* init i2c buses */ 4191 if (!amdgpu_device_has_dc_support(adev)) 4192 amdgpu_atombios_i2c_init(adev); 4193 } 4194 } 4195 4196 fence_driver_init: 4197 /* Fence driver */ 4198 r = amdgpu_fence_driver_sw_init(adev); 4199 if (r) { 4200 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4201 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4202 goto failed; 4203 } 4204 4205 /* init the mode config */ 4206 drm_mode_config_init(adev_to_drm(adev)); 4207 4208 r = amdgpu_device_ip_init(adev); 4209 if (r) { 4210 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4211 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4212 goto release_ras_con; 4213 } 4214 4215 amdgpu_fence_driver_hw_init(adev); 4216 4217 dev_info(adev->dev, 4218 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4219 adev->gfx.config.max_shader_engines, 4220 adev->gfx.config.max_sh_per_se, 4221 adev->gfx.config.max_cu_per_sh, 4222 adev->gfx.cu_info.number); 4223 4224 adev->accel_working = true; 4225 4226 amdgpu_vm_check_compute_bug(adev); 4227 4228 /* Initialize the buffer migration 
limit. */ 4229 if (amdgpu_moverate >= 0) 4230 max_MBps = amdgpu_moverate; 4231 else 4232 max_MBps = 8; /* Allow 8 MB/s. */ 4233 /* Get a log2 for easy divisions. */ 4234 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4235 4236 /* 4237 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4238 * Otherwise the mgpu fan boost feature will be skipped due to the 4239 * gpu instance is counted less. 4240 */ 4241 amdgpu_register_gpu_instance(adev); 4242 4243 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4244 * explicit gating rather than handling it automatically. 4245 */ 4246 if (!adev->gmc.xgmi.pending_reset) { 4247 r = amdgpu_device_ip_late_init(adev); 4248 if (r) { 4249 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4250 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4251 goto release_ras_con; 4252 } 4253 /* must succeed. */ 4254 amdgpu_ras_resume(adev); 4255 queue_delayed_work(system_wq, &adev->delayed_init_work, 4256 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4257 } 4258 4259 if (amdgpu_sriov_vf(adev)) { 4260 amdgpu_virt_release_full_gpu(adev, true); 4261 flush_delayed_work(&adev->delayed_init_work); 4262 } 4263 4264 /* 4265 * Place those sysfs registering after `late_init`. As some of those 4266 * operations performed in `late_init` might affect the sysfs 4267 * interfaces creating. 4268 */ 4269 r = amdgpu_atombios_sysfs_init(adev); 4270 if (r) 4271 drm_err(&adev->ddev, 4272 "registering atombios sysfs failed (%d).\n", r); 4273 4274 r = amdgpu_pm_sysfs_init(adev); 4275 if (r) 4276 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4277 4278 r = amdgpu_ucode_sysfs_init(adev); 4279 if (r) { 4280 adev->ucode_sysfs_en = false; 4281 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4282 } else 4283 adev->ucode_sysfs_en = true; 4284 4285 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4286 if (r) 4287 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4288 4289 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4290 if (r) 4291 dev_err(adev->dev, 4292 "Could not create amdgpu board attributes\n"); 4293 4294 amdgpu_fru_sysfs_init(adev); 4295 amdgpu_reg_state_sysfs_init(adev); 4296 4297 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4298 r = amdgpu_pmu_init(adev); 4299 if (r) 4300 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4301 4302 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4303 if (amdgpu_device_cache_pci_state(adev->pdev)) 4304 pci_restore_state(pdev); 4305 4306 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4307 /* this will fail for cards that aren't VGA class devices, just 4308 * ignore it 4309 */ 4310 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4311 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4312 4313 px = amdgpu_device_supports_px(ddev); 4314 4315 if (px || (!dev_is_removable(&adev->pdev->dev) && 4316 apple_gmux_detect(NULL, NULL))) 4317 vga_switcheroo_register_client(adev->pdev, 4318 &amdgpu_switcheroo_ops, px); 4319 4320 if (px) 4321 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4322 4323 if (adev->gmc.xgmi.pending_reset) 4324 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4325 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4326 4327 amdgpu_device_check_iommu_direct_map(adev); 4328 4329 return 0; 4330 4331 release_ras_con: 4332 if (amdgpu_sriov_vf(adev)) 4333 amdgpu_virt_release_full_gpu(adev, true); 4334 4335 /* failed in exclusive mode due to timeout */ 
 */
4336 if (amdgpu_sriov_vf(adev) && 4337 !amdgpu_sriov_runtime(adev) && 4338 amdgpu_virt_mmio_blocked(adev) && 4339 !amdgpu_virt_wait_reset(adev)) { 4340 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4341 /* Don't send request since VF is inactive. */ 4342 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4343 adev->virt.ops = NULL; 4344 r = -EAGAIN; 4345 } 4346 amdgpu_release_ras_context(adev); 4347 4348 failed: 4349 amdgpu_vf_error_trans_all(adev); 4350 4351 return r; 4352 } 4353 4354 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4355 { 4356 4357 /* Clear all CPU mappings pointing to this device */ 4358 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4359 4360 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4361 amdgpu_doorbell_fini(adev); 4362 4363 iounmap(adev->rmmio); 4364 adev->rmmio = NULL; 4365 if (adev->mman.aper_base_kaddr) 4366 iounmap(adev->mman.aper_base_kaddr); 4367 adev->mman.aper_base_kaddr = NULL; 4368 4369 /* Memory manager related */ 4370 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4371 arch_phys_wc_del(adev->gmc.vram_mtrr); 4372 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4373 } 4374 } 4375 4376 /** 4377 * amdgpu_device_fini_hw - tear down the driver 4378 * 4379 * @adev: amdgpu_device pointer 4380 * 4381 * Tear down the driver info (all asics). 4382 * Called at driver shutdown. 4383 */ 4384 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4385 { 4386 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4387 flush_delayed_work(&adev->delayed_init_work); 4388 adev->shutdown = true; 4389 4390 /* make sure IB test finished before entering exclusive mode 4391 * to avoid preemption on IB test 4392 */ 4393 if (amdgpu_sriov_vf(adev)) { 4394 amdgpu_virt_request_full_gpu(adev, false); 4395 amdgpu_virt_fini_data_exchange(adev); 4396 } 4397 4398 /* disable all interrupts */ 4399 amdgpu_irq_disable_all(adev); 4400 if (adev->mode_info.mode_config_initialized) { 4401 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4402 drm_helper_force_disable_all(adev_to_drm(adev)); 4403 else 4404 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4405 } 4406 amdgpu_fence_driver_hw_fini(adev); 4407 4408 if (adev->mman.initialized) 4409 drain_workqueue(adev->mman.bdev.wq); 4410 4411 if (adev->pm.sysfs_initialized) 4412 amdgpu_pm_sysfs_fini(adev); 4413 if (adev->ucode_sysfs_en) 4414 amdgpu_ucode_sysfs_fini(adev); 4415 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4416 amdgpu_fru_sysfs_fini(adev); 4417 4418 amdgpu_reg_state_sysfs_fini(adev); 4419 4420 /* disable ras feature must before hw fini */ 4421 amdgpu_ras_pre_fini(adev); 4422 4423 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4424 4425 amdgpu_device_ip_fini_early(adev); 4426 4427 amdgpu_irq_fini_hw(adev); 4428 4429 if (adev->mman.initialized) 4430 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4431 4432 amdgpu_gart_dummy_page_fini(adev); 4433 4434 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4435 amdgpu_device_unmap_mmio(adev); 4436 4437 } 4438 4439 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4440 { 4441 int idx; 4442 bool px; 4443 4444 amdgpu_fence_driver_sw_fini(adev); 4445 amdgpu_device_ip_fini(adev); 4446 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4447 adev->accel_working = false; 4448 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4449 4450 amdgpu_reset_fini(adev); 4451 4452 /* free i2c buses */ 4453 if (!amdgpu_device_has_dc_support(adev)) 4454 amdgpu_i2c_fini(adev); 4455 4456 if 
(amdgpu_emu_mode != 1) 4457 amdgpu_atombios_fini(adev); 4458 4459 kfree(adev->bios); 4460 adev->bios = NULL; 4461 4462 kfree(adev->fru_info); 4463 adev->fru_info = NULL; 4464 4465 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4466 4467 if (px || (!dev_is_removable(&adev->pdev->dev) && 4468 apple_gmux_detect(NULL, NULL))) 4469 vga_switcheroo_unregister_client(adev->pdev); 4470 4471 if (px) 4472 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4473 4474 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4475 vga_client_unregister(adev->pdev); 4476 4477 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4478 4479 iounmap(adev->rmmio); 4480 adev->rmmio = NULL; 4481 amdgpu_doorbell_fini(adev); 4482 drm_dev_exit(idx); 4483 } 4484 4485 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4486 amdgpu_pmu_fini(adev); 4487 if (adev->mman.discovery_bin) 4488 amdgpu_discovery_fini(adev); 4489 4490 amdgpu_reset_put_reset_domain(adev->reset_domain); 4491 adev->reset_domain = NULL; 4492 4493 kfree(adev->pci_state); 4494 4495 } 4496 4497 /** 4498 * amdgpu_device_evict_resources - evict device resources 4499 * @adev: amdgpu device object 4500 * 4501 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4502 * of the vram memory type. Mainly used for evicting device resources 4503 * at suspend time. 4504 * 4505 */ 4506 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4507 { 4508 int ret; 4509 4510 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4511 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4512 return 0; 4513 4514 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4515 if (ret) 4516 DRM_WARN("evicting device resources failed\n"); 4517 return ret; 4518 } 4519 4520 /* 4521 * Suspend & resume. 4522 */ 4523 /** 4524 * amdgpu_device_prepare - prepare for device suspend 4525 * 4526 * @dev: drm dev pointer 4527 * 4528 * Prepare to put the hw in the suspend state (all asics). 4529 * Returns 0 for success or an error on failure. 4530 * Called at driver suspend. 4531 */ 4532 int amdgpu_device_prepare(struct drm_device *dev) 4533 { 4534 struct amdgpu_device *adev = drm_to_adev(dev); 4535 int i, r; 4536 4537 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4538 return 0; 4539 4540 /* Evict the majority of BOs before starting suspend sequence */ 4541 r = amdgpu_device_evict_resources(adev); 4542 if (r) 4543 return r; 4544 4545 for (i = 0; i < adev->num_ip_blocks; i++) { 4546 if (!adev->ip_blocks[i].status.valid) 4547 continue; 4548 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4549 continue; 4550 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev); 4551 if (r) 4552 return r; 4553 } 4554 4555 return 0; 4556 } 4557 4558 /** 4559 * amdgpu_device_suspend - initiate device suspend 4560 * 4561 * @dev: drm dev pointer 4562 * @fbcon : notify the fbdev of suspend 4563 * 4564 * Puts the hw in the suspend state (all asics). 4565 * Returns 0 for success or an error on failure. 4566 * Called at driver suspend. 
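 *
 * Illustrative ordering sketch (editorial addition; the PM callback wiring
 * shown is an assumption, not taken from this file):
 *
 *   r = amdgpu_device_prepare(drm_dev);            // .prepare phase
 *   if (!r)
 *       r = amdgpu_device_suspend(drm_dev, true);  // .suspend phase
 *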
4567 */ 4568 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4569 { 4570 struct amdgpu_device *adev = drm_to_adev(dev); 4571 int r = 0; 4572 4573 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4574 return 0; 4575 4576 adev->in_suspend = true; 4577 4578 if (amdgpu_sriov_vf(adev)) { 4579 amdgpu_virt_fini_data_exchange(adev); 4580 r = amdgpu_virt_request_full_gpu(adev, false); 4581 if (r) 4582 return r; 4583 } 4584 4585 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4586 DRM_WARN("smart shift update failed\n"); 4587 4588 if (fbcon) 4589 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4590 4591 cancel_delayed_work_sync(&adev->delayed_init_work); 4592 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4593 4594 amdgpu_ras_suspend(adev); 4595 4596 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4597 4598 amdgpu_device_ip_suspend_phase1(adev); 4599 4600 if (!adev->in_s0ix) 4601 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4602 4603 r = amdgpu_device_evict_resources(adev); 4604 if (r) 4605 return r; 4606 4607 amdgpu_fence_driver_hw_fini(adev); 4608 4609 amdgpu_device_ip_suspend_phase2(adev); 4610 4611 if (amdgpu_sriov_vf(adev)) 4612 amdgpu_virt_release_full_gpu(adev, false); 4613 4614 r = amdgpu_dpm_notify_rlc_state(adev, false); 4615 if (r) 4616 return r; 4617 4618 return 0; 4619 } 4620 4621 /** 4622 * amdgpu_device_resume - initiate device resume 4623 * 4624 * @dev: drm dev pointer 4625 * @fbcon : notify the fbdev of resume 4626 * 4627 * Bring the hw back to operating state (all asics). 4628 * Returns 0 for success or an error on failure. 4629 * Called at driver resume. 4630 */ 4631 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4632 { 4633 struct amdgpu_device *adev = drm_to_adev(dev); 4634 int r = 0; 4635 4636 if (amdgpu_sriov_vf(adev)) { 4637 r = amdgpu_virt_request_full_gpu(adev, true); 4638 if (r) 4639 return r; 4640 } 4641 4642 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4643 return 0; 4644 4645 if (adev->in_s0ix) 4646 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4647 4648 /* post card */ 4649 if (amdgpu_device_need_post(adev)) { 4650 r = amdgpu_device_asic_init(adev); 4651 if (r) 4652 dev_err(adev->dev, "amdgpu asic init failed\n"); 4653 } 4654 4655 r = amdgpu_device_ip_resume(adev); 4656 4657 if (r) { 4658 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4659 goto exit; 4660 } 4661 amdgpu_fence_driver_hw_init(adev); 4662 4663 if (!adev->in_s0ix) { 4664 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4665 if (r) 4666 goto exit; 4667 } 4668 4669 r = amdgpu_device_ip_late_init(adev); 4670 if (r) 4671 goto exit; 4672 4673 queue_delayed_work(system_wq, &adev->delayed_init_work, 4674 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4675 exit: 4676 if (amdgpu_sriov_vf(adev)) { 4677 amdgpu_virt_init_data_exchange(adev); 4678 amdgpu_virt_release_full_gpu(adev, true); 4679 } 4680 4681 if (r) 4682 return r; 4683 4684 /* Make sure IB tests flushed */ 4685 flush_delayed_work(&adev->delayed_init_work); 4686 4687 if (fbcon) 4688 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4689 4690 amdgpu_ras_resume(adev); 4691 4692 if (adev->mode_info.num_crtc) { 4693 /* 4694 * Most of the connector probing functions try to acquire runtime pm 4695 * refs to ensure that the GPU is powered on when connector polling is 4696 * performed. Since we're calling this from a runtime PM callback, 4697 * trying to acquire rpm refs will cause us to deadlock. 
4698 * 4699 * Since we're guaranteed to be holding the rpm lock, it's safe to 4700 * temporarily disable the rpm helpers so this doesn't deadlock us. 4701 */ 4702 #ifdef CONFIG_PM 4703 dev->dev->power.disable_depth++; 4704 #endif 4705 if (!adev->dc_enabled) 4706 drm_helper_hpd_irq_event(dev); 4707 else 4708 drm_kms_helper_hotplug_event(dev); 4709 #ifdef CONFIG_PM 4710 dev->dev->power.disable_depth--; 4711 #endif 4712 } 4713 adev->in_suspend = false; 4714 4715 if (adev->enable_mes) 4716 amdgpu_mes_self_test(adev); 4717 4718 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4719 DRM_WARN("smart shift update failed\n"); 4720 4721 return 0; 4722 } 4723 4724 /** 4725 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4726 * 4727 * @adev: amdgpu_device pointer 4728 * 4729 * The list of all the hardware IPs that make up the asic is walked and 4730 * the check_soft_reset callbacks are run. check_soft_reset determines 4731 * if the asic is still hung or not. 4732 * Returns true if any of the IPs are still in a hung state, false if not. 4733 */ 4734 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4735 { 4736 int i; 4737 bool asic_hang = false; 4738 4739 if (amdgpu_sriov_vf(adev)) 4740 return true; 4741 4742 if (amdgpu_asic_need_full_reset(adev)) 4743 return true; 4744 4745 for (i = 0; i < adev->num_ip_blocks; i++) { 4746 if (!adev->ip_blocks[i].status.valid) 4747 continue; 4748 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4749 adev->ip_blocks[i].status.hang = 4750 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4751 if (adev->ip_blocks[i].status.hang) { 4752 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4753 asic_hang = true; 4754 } 4755 } 4756 return asic_hang; 4757 } 4758 4759 /** 4760 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4761 * 4762 * @adev: amdgpu_device pointer 4763 * 4764 * The list of all the hardware IPs that make up the asic is walked and the 4765 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4766 * handles any IP specific hardware or software state changes that are 4767 * necessary for a soft reset to succeed. 4768 * Returns 0 on success, negative error code on failure. 4769 */ 4770 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4771 { 4772 int i, r = 0; 4773 4774 for (i = 0; i < adev->num_ip_blocks; i++) { 4775 if (!adev->ip_blocks[i].status.valid) 4776 continue; 4777 if (adev->ip_blocks[i].status.hang && 4778 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4779 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4780 if (r) 4781 return r; 4782 } 4783 } 4784 4785 return 0; 4786 } 4787 4788 /** 4789 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4790 * 4791 * @adev: amdgpu_device pointer 4792 * 4793 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4794 * reset is necessary to recover. 4795 * Returns true if a full asic reset is required, false if not. 
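 * (Editorial note: per the check below, a hang in a GMC, SMC, ACP, DCE or
 * PSP block is what forces the full reset.)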
4796 */ 4797 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4798 { 4799 int i; 4800 4801 if (amdgpu_asic_need_full_reset(adev)) 4802 return true; 4803 4804 for (i = 0; i < adev->num_ip_blocks; i++) { 4805 if (!adev->ip_blocks[i].status.valid) 4806 continue; 4807 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4808 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4809 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4810 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4811 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4812 if (adev->ip_blocks[i].status.hang) { 4813 dev_info(adev->dev, "Some block need full reset!\n"); 4814 return true; 4815 } 4816 } 4817 } 4818 return false; 4819 } 4820 4821 /** 4822 * amdgpu_device_ip_soft_reset - do a soft reset 4823 * 4824 * @adev: amdgpu_device pointer 4825 * 4826 * The list of all the hardware IPs that make up the asic is walked and the 4827 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4828 * IP specific hardware or software state changes that are necessary to soft 4829 * reset the IP. 4830 * Returns 0 on success, negative error code on failure. 4831 */ 4832 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4833 { 4834 int i, r = 0; 4835 4836 for (i = 0; i < adev->num_ip_blocks; i++) { 4837 if (!adev->ip_blocks[i].status.valid) 4838 continue; 4839 if (adev->ip_blocks[i].status.hang && 4840 adev->ip_blocks[i].version->funcs->soft_reset) { 4841 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4842 if (r) 4843 return r; 4844 } 4845 } 4846 4847 return 0; 4848 } 4849 4850 /** 4851 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4852 * 4853 * @adev: amdgpu_device pointer 4854 * 4855 * The list of all the hardware IPs that make up the asic is walked and the 4856 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4857 * handles any IP specific hardware or software state changes that are 4858 * necessary after the IP has been soft reset. 4859 * Returns 0 on success, negative error code on failure. 4860 */ 4861 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4862 { 4863 int i, r = 0; 4864 4865 for (i = 0; i < adev->num_ip_blocks; i++) { 4866 if (!adev->ip_blocks[i].status.valid) 4867 continue; 4868 if (adev->ip_blocks[i].status.hang && 4869 adev->ip_blocks[i].version->funcs->post_soft_reset) 4870 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4871 if (r) 4872 return r; 4873 } 4874 4875 return 0; 4876 } 4877 4878 /** 4879 * amdgpu_device_recover_vram - Recover some VRAM contents 4880 * 4881 * @adev: amdgpu_device pointer 4882 * 4883 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4884 * restore things like GPUVM page tables after a GPU reset where 4885 * the contents of VRAM might be lost. 4886 * 4887 * Returns: 4888 * 0 on success, negative error code on failure. 
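 * (Editorial note: as implemented below, only shadowed BOs whose shadow is
 * currently resident in GTT and whose parent BO sits in VRAM are restored;
 * evicted shadows are skipped.)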
4889 */ 4890 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4891 { 4892 struct dma_fence *fence = NULL, *next = NULL; 4893 struct amdgpu_bo *shadow; 4894 struct amdgpu_bo_vm *vmbo; 4895 long r = 1, tmo; 4896 4897 if (amdgpu_sriov_runtime(adev)) 4898 tmo = msecs_to_jiffies(8000); 4899 else 4900 tmo = msecs_to_jiffies(100); 4901 4902 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4903 mutex_lock(&adev->shadow_list_lock); 4904 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4905 /* If vm is compute context or adev is APU, shadow will be NULL */ 4906 if (!vmbo->shadow) 4907 continue; 4908 shadow = vmbo->shadow; 4909 4910 /* No need to recover an evicted BO */ 4911 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4912 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4913 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4914 continue; 4915 4916 r = amdgpu_bo_restore_shadow(shadow, &next); 4917 if (r) 4918 break; 4919 4920 if (fence) { 4921 tmo = dma_fence_wait_timeout(fence, false, tmo); 4922 dma_fence_put(fence); 4923 fence = next; 4924 if (tmo == 0) { 4925 r = -ETIMEDOUT; 4926 break; 4927 } else if (tmo < 0) { 4928 r = tmo; 4929 break; 4930 } 4931 } else { 4932 fence = next; 4933 } 4934 } 4935 mutex_unlock(&adev->shadow_list_lock); 4936 4937 if (fence) 4938 tmo = dma_fence_wait_timeout(fence, false, tmo); 4939 dma_fence_put(fence); 4940 4941 if (r < 0 || tmo <= 0) { 4942 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4943 return -EIO; 4944 } 4945 4946 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4947 return 0; 4948 } 4949 4950 4951 /** 4952 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4953 * 4954 * @adev: amdgpu_device pointer 4955 * @from_hypervisor: request from hypervisor 4956 * 4957 * do VF FLR and reinitialize Asic 4958 * return 0 means succeeded otherwise failed 4959 */ 4960 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4961 bool from_hypervisor) 4962 { 4963 int r; 4964 struct amdgpu_hive_info *hive = NULL; 4965 int retry_limit = 0; 4966 4967 retry: 4968 amdgpu_amdkfd_pre_reset(adev); 4969 4970 if (from_hypervisor) 4971 r = amdgpu_virt_request_full_gpu(adev, true); 4972 else 4973 r = amdgpu_virt_reset_gpu(adev); 4974 if (r) 4975 return r; 4976 amdgpu_irq_gpu_reset_resume_helper(adev); 4977 4978 /* some sw clean up VF needs to do before recover */ 4979 amdgpu_virt_post_reset(adev); 4980 4981 /* Resume IP prior to SMC */ 4982 r = amdgpu_device_ip_reinit_early_sriov(adev); 4983 if (r) 4984 goto error; 4985 4986 amdgpu_virt_init_data_exchange(adev); 4987 4988 r = amdgpu_device_fw_loading(adev); 4989 if (r) 4990 return r; 4991 4992 /* now we are okay to resume SMC/CP/SDMA */ 4993 r = amdgpu_device_ip_reinit_late_sriov(adev); 4994 if (r) 4995 goto error; 4996 4997 hive = amdgpu_get_xgmi_hive(adev); 4998 /* Update PSP FW topology after reset */ 4999 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5000 r = amdgpu_xgmi_update_topology(hive, adev); 5001 5002 if (hive) 5003 amdgpu_put_xgmi_hive(hive); 5004 5005 if (!r) { 5006 r = amdgpu_ib_ring_tests(adev); 5007 5008 amdgpu_amdkfd_post_reset(adev); 5009 } 5010 5011 error: 5012 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 5013 amdgpu_inc_vram_lost(adev); 5014 r = amdgpu_device_recover_vram(adev); 5015 } 5016 amdgpu_virt_release_full_gpu(adev, true); 5017 5018 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 5019 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 5020 retry_limit++; 5021 goto 
retry; 5022 } else 5023 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 5024 } 5025 5026 return r; 5027 } 5028 5029 /** 5030 * amdgpu_device_has_job_running - check if there is any job in mirror list 5031 * 5032 * @adev: amdgpu_device pointer 5033 * 5034 * check if there is any job in mirror list 5035 */ 5036 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5037 { 5038 int i; 5039 struct drm_sched_job *job; 5040 5041 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5042 struct amdgpu_ring *ring = adev->rings[i]; 5043 5044 if (!ring || !drm_sched_wqueue_ready(&ring->sched)) 5045 continue; 5046 5047 spin_lock(&ring->sched.job_list_lock); 5048 job = list_first_entry_or_null(&ring->sched.pending_list, 5049 struct drm_sched_job, list); 5050 spin_unlock(&ring->sched.job_list_lock); 5051 if (job) 5052 return true; 5053 } 5054 return false; 5055 } 5056 5057 /** 5058 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5059 * 5060 * @adev: amdgpu_device pointer 5061 * 5062 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5063 * a hung GPU. 5064 */ 5065 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5066 { 5067 5068 if (amdgpu_gpu_recovery == 0) 5069 goto disabled; 5070 5071 /* Skip soft reset check in fatal error mode */ 5072 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5073 return true; 5074 5075 if (amdgpu_sriov_vf(adev)) 5076 return true; 5077 5078 if (amdgpu_gpu_recovery == -1) { 5079 switch (adev->asic_type) { 5080 #ifdef CONFIG_DRM_AMDGPU_SI 5081 case CHIP_VERDE: 5082 case CHIP_TAHITI: 5083 case CHIP_PITCAIRN: 5084 case CHIP_OLAND: 5085 case CHIP_HAINAN: 5086 #endif 5087 #ifdef CONFIG_DRM_AMDGPU_CIK 5088 case CHIP_KAVERI: 5089 case CHIP_KABINI: 5090 case CHIP_MULLINS: 5091 #endif 5092 case CHIP_CARRIZO: 5093 case CHIP_STONEY: 5094 case CHIP_CYAN_SKILLFISH: 5095 goto disabled; 5096 default: 5097 break; 5098 } 5099 } 5100 5101 return true; 5102 5103 disabled: 5104 dev_info(adev->dev, "GPU recovery disabled.\n"); 5105 return false; 5106 } 5107 5108 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5109 { 5110 u32 i; 5111 int ret = 0; 5112 5113 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5114 5115 dev_info(adev->dev, "GPU mode1 reset\n"); 5116 5117 /* disable BM */ 5118 pci_clear_master(adev->pdev); 5119 5120 amdgpu_device_cache_pci_state(adev->pdev); 5121 5122 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5123 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5124 ret = amdgpu_dpm_mode1_reset(adev); 5125 } else { 5126 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5127 ret = psp_gpu_reset(adev); 5128 } 5129 5130 if (ret) 5131 goto mode1_reset_failed; 5132 5133 amdgpu_device_load_pci_state(adev->pdev); 5134 ret = amdgpu_psp_wait_for_bootloader(adev); 5135 if (ret) 5136 goto mode1_reset_failed; 5137 5138 /* wait for asic to come out of reset */ 5139 for (i = 0; i < adev->usec_timeout; i++) { 5140 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5141 5142 if (memsize != 0xffffffff) 5143 break; 5144 udelay(1); 5145 } 5146 5147 if (i >= adev->usec_timeout) { 5148 ret = -ETIMEDOUT; 5149 goto mode1_reset_failed; 5150 } 5151 5152 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5153 5154 return 0; 5155 5156 mode1_reset_failed: 5157 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5158 return ret; 5159 } 5160 5161 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5162 struct amdgpu_reset_context *reset_context) 5163 { 5164 int i, r = 0; 5165 struct amdgpu_job *job = NULL; 5166 bool 
need_full_reset = 5167 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5168 5169 if (reset_context->reset_req_dev == adev) 5170 job = reset_context->job; 5171 5172 if (amdgpu_sriov_vf(adev)) { 5173 /* stop the data exchange thread */ 5174 amdgpu_virt_fini_data_exchange(adev); 5175 } 5176 5177 amdgpu_fence_driver_isr_toggle(adev, true); 5178 5179 /* block all schedulers and reset given job's ring */ 5180 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5181 struct amdgpu_ring *ring = adev->rings[i]; 5182 5183 if (!ring || !drm_sched_wqueue_ready(&ring->sched)) 5184 continue; 5185 5186 /* Clear job fence from fence drv to avoid force_completion 5187 * leave NULL and vm flush fence in fence drv 5188 */ 5189 amdgpu_fence_driver_clear_job_fences(ring); 5190 5191 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5192 amdgpu_fence_driver_force_completion(ring); 5193 } 5194 5195 amdgpu_fence_driver_isr_toggle(adev, false); 5196 5197 if (job && job->vm) 5198 drm_sched_increase_karma(&job->base); 5199 5200 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5201 /* If reset handler not implemented, continue; otherwise return */ 5202 if (r == -EOPNOTSUPP) 5203 r = 0; 5204 else 5205 return r; 5206 5207 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5208 if (!amdgpu_sriov_vf(adev)) { 5209 5210 if (!need_full_reset) 5211 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5212 5213 if (!need_full_reset && amdgpu_gpu_recovery && 5214 amdgpu_device_ip_check_soft_reset(adev)) { 5215 amdgpu_device_ip_pre_soft_reset(adev); 5216 r = amdgpu_device_ip_soft_reset(adev); 5217 amdgpu_device_ip_post_soft_reset(adev); 5218 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5219 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5220 need_full_reset = true; 5221 } 5222 } 5223 5224 if (need_full_reset) 5225 r = amdgpu_device_ip_suspend(adev); 5226 if (need_full_reset) 5227 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5228 else 5229 clear_bit(AMDGPU_NEED_FULL_RESET, 5230 &reset_context->flags); 5231 } 5232 5233 return r; 5234 } 5235 5236 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 5237 { 5238 int i; 5239 5240 lockdep_assert_held(&adev->reset_domain->sem); 5241 5242 for (i = 0; i < adev->reset_info.num_regs; i++) { 5243 adev->reset_info.reset_dump_reg_value[i] = 5244 RREG32(adev->reset_info.reset_dump_reg_list[i]); 5245 5246 trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i], 5247 adev->reset_info.reset_dump_reg_value[i]); 5248 } 5249 5250 return 0; 5251 } 5252 5253 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5254 struct amdgpu_reset_context *reset_context) 5255 { 5256 struct amdgpu_device *tmp_adev = NULL; 5257 bool need_full_reset, skip_hw_reset, vram_lost = false; 5258 int r = 0; 5259 bool gpu_reset_for_dev_remove = 0; 5260 5261 /* Try reset handler method first */ 5262 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5263 reset_list); 5264 amdgpu_reset_reg_dumps(tmp_adev); 5265 5266 reset_context->reset_device_list = device_list_handle; 5267 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5268 /* If reset handler not implemented, continue; otherwise return */ 5269 if (r == -EOPNOTSUPP) 5270 r = 0; 5271 else 5272 return r; 5273 5274 /* Reset handler not implemented, use the default method */ 5275 need_full_reset = 5276 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5277 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, 
&reset_context->flags); 5278 5279 gpu_reset_for_dev_remove = 5280 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5281 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5282 5283 /* 5284 * ASIC reset has to be done on all XGMI hive nodes ASAP 5285 * to allow proper links negotiation in FW (within 1 sec) 5286 */ 5287 if (!skip_hw_reset && need_full_reset) { 5288 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5289 /* For XGMI run all resets in parallel to speed up the process */ 5290 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5291 tmp_adev->gmc.xgmi.pending_reset = false; 5292 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5293 r = -EALREADY; 5294 } else 5295 r = amdgpu_asic_reset(tmp_adev); 5296 5297 if (r) { 5298 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5299 r, adev_to_drm(tmp_adev)->unique); 5300 goto out; 5301 } 5302 } 5303 5304 /* For XGMI wait for all resets to complete before proceed */ 5305 if (!r) { 5306 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5307 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5308 flush_work(&tmp_adev->xgmi_reset_work); 5309 r = tmp_adev->asic_reset_res; 5310 if (r) 5311 break; 5312 } 5313 } 5314 } 5315 } 5316 5317 if (!r && amdgpu_ras_intr_triggered()) { 5318 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5319 amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB); 5320 } 5321 5322 amdgpu_ras_intr_cleared(); 5323 } 5324 5325 /* Since the mode1 reset affects base ip blocks, the 5326 * phase1 ip blocks need to be resumed. Otherwise there 5327 * will be a BIOS signature error and the psp bootloader 5328 * can't load kdb on the next amdgpu install. 5329 */ 5330 if (gpu_reset_for_dev_remove) { 5331 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 5332 amdgpu_device_ip_resume_phase1(tmp_adev); 5333 5334 goto end; 5335 } 5336 5337 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5338 if (need_full_reset) { 5339 /* post card */ 5340 r = amdgpu_device_asic_init(tmp_adev); 5341 if (r) { 5342 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5343 } else { 5344 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5345 5346 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5347 if (r) 5348 goto out; 5349 5350 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5351 5352 amdgpu_coredump(tmp_adev, vram_lost, reset_context); 5353 5354 if (vram_lost) { 5355 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5356 amdgpu_inc_vram_lost(tmp_adev); 5357 } 5358 5359 r = amdgpu_device_fw_loading(tmp_adev); 5360 if (r) 5361 return r; 5362 5363 r = amdgpu_xcp_restore_partition_mode( 5364 tmp_adev->xcp_mgr); 5365 if (r) 5366 goto out; 5367 5368 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5369 if (r) 5370 goto out; 5371 5372 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5373 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5374 5375 if (vram_lost) 5376 amdgpu_device_fill_reset_magic(tmp_adev); 5377 5378 /* 5379 * Add this ASIC as tracked as reset was already 5380 * complete successfully. 
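 * (Editorial note: this pairs with the amdgpu_unregister_gpu_instance()
 * call made before the reset in amdgpu_device_gpu_recover().)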
5381 */ 5382 amdgpu_register_gpu_instance(tmp_adev); 5383 5384 if (!reset_context->hive && 5385 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5386 amdgpu_xgmi_add_device(tmp_adev); 5387 5388 r = amdgpu_device_ip_late_init(tmp_adev); 5389 if (r) 5390 goto out; 5391 5392 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5393 5394 /* 5395 * The GPU enters bad state once faulty pages 5396 * by ECC has reached the threshold, and ras 5397 * recovery is scheduled next. So add one check 5398 * here to break recovery if it indeed exceeds 5399 * bad page threshold, and remind user to 5400 * retire this GPU or setting one bigger 5401 * bad_page_threshold value to fix this once 5402 * probing driver again. 5403 */ 5404 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5405 /* must succeed. */ 5406 amdgpu_ras_resume(tmp_adev); 5407 } else { 5408 r = -EINVAL; 5409 goto out; 5410 } 5411 5412 /* Update PSP FW topology after reset */ 5413 if (reset_context->hive && 5414 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5415 r = amdgpu_xgmi_update_topology( 5416 reset_context->hive, tmp_adev); 5417 } 5418 } 5419 5420 out: 5421 if (!r) { 5422 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5423 r = amdgpu_ib_ring_tests(tmp_adev); 5424 if (r) { 5425 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5426 need_full_reset = true; 5427 r = -EAGAIN; 5428 goto end; 5429 } 5430 } 5431 5432 if (!r) 5433 r = amdgpu_device_recover_vram(tmp_adev); 5434 else 5435 tmp_adev->asic_reset_res = r; 5436 } 5437 5438 end: 5439 if (need_full_reset) 5440 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5441 else 5442 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5443 return r; 5444 } 5445 5446 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5447 { 5448 5449 switch (amdgpu_asic_reset_method(adev)) { 5450 case AMD_RESET_METHOD_MODE1: 5451 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5452 break; 5453 case AMD_RESET_METHOD_MODE2: 5454 adev->mp1_state = PP_MP1_STATE_RESET; 5455 break; 5456 default: 5457 adev->mp1_state = PP_MP1_STATE_NONE; 5458 break; 5459 } 5460 } 5461 5462 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5463 { 5464 amdgpu_vf_error_trans_all(adev); 5465 adev->mp1_state = PP_MP1_STATE_NONE; 5466 } 5467 5468 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5469 { 5470 struct pci_dev *p = NULL; 5471 5472 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5473 adev->pdev->bus->number, 1); 5474 if (p) { 5475 pm_runtime_enable(&(p->dev)); 5476 pm_runtime_resume(&(p->dev)); 5477 } 5478 5479 pci_dev_put(p); 5480 } 5481 5482 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5483 { 5484 enum amd_reset_method reset_method; 5485 struct pci_dev *p = NULL; 5486 u64 expires; 5487 5488 /* 5489 * For now, only BACO and mode1 reset are confirmed 5490 * to suffer the audio issue without proper suspended. 5491 */ 5492 reset_method = amdgpu_asic_reset_method(adev); 5493 if ((reset_method != AMD_RESET_METHOD_BACO) && 5494 (reset_method != AMD_RESET_METHOD_MODE1)) 5495 return -EINVAL; 5496 5497 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5498 adev->pdev->bus->number, 1); 5499 if (!p) 5500 return -ENODEV; 5501 5502 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5503 if (!expires) 5504 /* 5505 * If we cannot get the audio device autosuspend delay, 5506 * a fixed 4S interval will be used. Considering 3S is 5507 * the audio controller default autosuspend delay setting. 
5508 * 4S used here is guaranteed to cover that. 5509 */ 5510 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5511 5512 while (!pm_runtime_status_suspended(&(p->dev))) { 5513 if (!pm_runtime_suspend(&(p->dev))) 5514 break; 5515 5516 if (expires < ktime_get_mono_fast_ns()) { 5517 dev_warn(adev->dev, "failed to suspend display audio\n"); 5518 pci_dev_put(p); 5519 /* TODO: abort the succeeding gpu reset? */ 5520 return -ETIMEDOUT; 5521 } 5522 } 5523 5524 pm_runtime_disable(&(p->dev)); 5525 5526 pci_dev_put(p); 5527 return 0; 5528 } 5529 5530 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5531 { 5532 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5533 5534 #if defined(CONFIG_DEBUG_FS) 5535 if (!amdgpu_sriov_vf(adev)) 5536 cancel_work(&adev->reset_work); 5537 #endif 5538 5539 if (adev->kfd.dev) 5540 cancel_work(&adev->kfd.reset_work); 5541 5542 if (amdgpu_sriov_vf(adev)) 5543 cancel_work(&adev->virt.flr_work); 5544 5545 if (con && adev->ras_enabled) 5546 cancel_work(&con->recovery_work); 5547 5548 } 5549 5550 /** 5551 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5552 * 5553 * @adev: amdgpu_device pointer 5554 * @job: which job trigger hang 5555 * @reset_context: amdgpu reset context pointer 5556 * 5557 * Attempt to reset the GPU if it has hung (all asics). 5558 * Attempt to do soft-reset or full-reset and reinitialize Asic 5559 * Returns 0 for success or an error on failure. 5560 */ 5561 5562 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5563 struct amdgpu_job *job, 5564 struct amdgpu_reset_context *reset_context) 5565 { 5566 struct list_head device_list, *device_list_handle = NULL; 5567 bool job_signaled = false; 5568 struct amdgpu_hive_info *hive = NULL; 5569 struct amdgpu_device *tmp_adev = NULL; 5570 int i, r = 0; 5571 bool need_emergency_restart = false; 5572 bool audio_suspended = false; 5573 bool gpu_reset_for_dev_remove = false; 5574 5575 gpu_reset_for_dev_remove = 5576 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5577 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5578 5579 /* 5580 * Special case: RAS triggered and full reset isn't supported 5581 */ 5582 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5583 5584 /* 5585 * Flush RAM to disk so that after reboot 5586 * the user can read log and see why the system rebooted. 5587 */ 5588 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5589 amdgpu_ras_get_context(adev)->reboot) { 5590 DRM_WARN("Emergency reboot."); 5591 5592 ksys_sync_helper(); 5593 emergency_restart(); 5594 } 5595 5596 dev_info(adev->dev, "GPU %s begin!\n", 5597 need_emergency_restart ? "jobs stop":"reset"); 5598 5599 if (!amdgpu_sriov_vf(adev)) 5600 hive = amdgpu_get_xgmi_hive(adev); 5601 if (hive) 5602 mutex_lock(&hive->hive_lock); 5603 5604 reset_context->job = job; 5605 reset_context->hive = hive; 5606 /* 5607 * Build list of devices to reset. 5608 * In case we are in XGMI hive mode, resort the device list 5609 * to put adev in the 1st position. 
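 * (Editorial example: a hive enumerated as [A, B, C, D] with adev == C is
 * rotated below to [C, D, A, B]; relative order is preserved.)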
5610 */ 5611 INIT_LIST_HEAD(&device_list); 5612 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5613 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5614 list_add_tail(&tmp_adev->reset_list, &device_list); 5615 if (gpu_reset_for_dev_remove && adev->shutdown) 5616 tmp_adev->shutdown = true; 5617 } 5618 if (!list_is_first(&adev->reset_list, &device_list)) 5619 list_rotate_to_front(&adev->reset_list, &device_list); 5620 device_list_handle = &device_list; 5621 } else { 5622 list_add_tail(&adev->reset_list, &device_list); 5623 device_list_handle = &device_list; 5624 } 5625 5626 /* We need to lock reset domain only once both for XGMI and single device */ 5627 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5628 reset_list); 5629 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5630 5631 /* block all schedulers and reset given job's ring */ 5632 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5633 5634 amdgpu_device_set_mp1_state(tmp_adev); 5635 5636 /* 5637 * Try to put the audio codec into suspend state 5638 * before gpu reset started. 5639 * 5640 * Due to the power domain of the graphics device 5641 * is shared with AZ power domain. Without this, 5642 * we may change the audio hardware from behind 5643 * the audio driver's back. That will trigger 5644 * some audio codec errors. 5645 */ 5646 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5647 audio_suspended = true; 5648 5649 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5650 5651 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5652 5653 if (!amdgpu_sriov_vf(tmp_adev)) 5654 amdgpu_amdkfd_pre_reset(tmp_adev); 5655 5656 /* 5657 * Mark these ASICs to be reseted as untracked first 5658 * And add them back after reset completed 5659 */ 5660 amdgpu_unregister_gpu_instance(tmp_adev); 5661 5662 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5663 5664 /* disable ras on ALL IPs */ 5665 if (!need_emergency_restart && 5666 amdgpu_device_ip_need_full_reset(tmp_adev)) 5667 amdgpu_ras_suspend(tmp_adev); 5668 5669 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5670 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5671 5672 if (!ring || !drm_sched_wqueue_ready(&ring->sched)) 5673 continue; 5674 5675 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5676 5677 if (need_emergency_restart) 5678 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5679 } 5680 atomic_inc(&tmp_adev->gpu_reset_counter); 5681 } 5682 5683 if (need_emergency_restart) 5684 goto skip_sched_resume; 5685 5686 /* 5687 * Must check guilty signal here since after this point all old 5688 * HW fences are force signaled. 5689 * 5690 * job->base holds a reference to parent fence 5691 */ 5692 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5693 job_signaled = true; 5694 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5695 goto skip_hw_reset; 5696 } 5697 5698 retry: /* Rest of adevs pre asic reset from XGMI hive. 
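 * (Editorial note: this label is re-entered when amdgpu_do_asic_reset()
 * below returns -EAGAIN.)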
*/ 5699 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5700 if (gpu_reset_for_dev_remove) { 5701 /* Workaround for ASICs that need to disable SMC first */ 5702 amdgpu_device_smu_fini_early(tmp_adev); 5703 } 5704 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5705 /* TODO: Should we stop? */ 5706 if (r) { 5707 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5708 r, adev_to_drm(tmp_adev)->unique); 5709 tmp_adev->asic_reset_res = r; 5710 } 5711 5712 /* 5713 * Drop all pending non-scheduler resets. Scheduler resets 5714 * were already dropped during drm_sched_stop. 5715 */ 5716 amdgpu_device_stop_pending_resets(tmp_adev); 5717 } 5718 5719 /* Actual ASIC resets if needed. */ 5720 /* Host driver will handle XGMI hive reset for SRIOV */ 5721 if (amdgpu_sriov_vf(adev)) { 5722 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5723 if (r) 5724 adev->asic_reset_res = r; 5725 5726 /* Aldebaran and gfx_11_0_3 support RAS in SRIOV, so RAS needs to be resumed during reset */ 5727 if (amdgpu_ip_version(adev, GC_HWIP, 0) == 5728 IP_VERSION(9, 4, 2) || 5729 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5730 amdgpu_ras_resume(adev); 5731 } else { 5732 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5733 if (r && r == -EAGAIN) 5734 goto retry; 5735 5736 if (!r && gpu_reset_for_dev_remove) 5737 goto recover_end; 5738 } 5739 5740 skip_hw_reset: 5741 5742 /* Post ASIC reset for all devs. */ 5743 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5744 5745 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5746 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5747 5748 if (!ring || !drm_sched_wqueue_ready(&ring->sched)) 5749 continue; 5750 5751 drm_sched_start(&ring->sched, true); 5752 } 5753 5754 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5755 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5756 5757 if (tmp_adev->asic_reset_res) 5758 r = tmp_adev->asic_reset_res; 5759 5760 tmp_adev->asic_reset_res = 0; 5761 5762 if (r) { 5763 /* bad news, how to tell it to userspace? */
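			/*
			 * Editorial note: besides the messages below, the failure is also
			 * latched in adev->reset_domain->reset_res at the end of this
			 * function and returned to the caller.
			 */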
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
				DRM_WARN("smart shift update failed\n");
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/* kfd_post_reset will do nothing if kfd device is not initialized,
		 * need to bring up kfd here if it was not initialized before
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);

		amdgpu_device_unset_mp1_state(tmp_adev);

		amdgpu_ras_set_error_query_ready(tmp_adev, true);
	}

recover_end:
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);

	atomic_set(&adev->reset_domain->reset_res, r);
	return r;
}

/**
 * amdgpu_device_partner_bandwidth - find the bandwidth of the appropriate partner
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * first physical partner to an AMD dGPU.
 * This will exclude any virtual switches and links.
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	while ((parent = pci_upstream_bridge(parent))) {
		/* skip upstream/downstream switches internal to the dGPU */
		if (parent->vendor == PCI_VENDOR_ID_ATI)
			continue;
		*speed = pcie_get_speed_cap(parent);
		*width = pcie_get_width_cap(parent);
		break;
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
					&platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;

		}
	}
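	/* Derive the supported link-width mask from the platform capability. */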
	if (adev->pm.pcie_mlw_mask == 0) {
		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
		} else {
			switch (platform_link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
	}
}

/**
 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
 *
 * @adev: amdgpu_device pointer
 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
 *
 * Return true if @peer_adev can access (DMA) @adev through the PCIe
 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
 * @peer_adev.
 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
				      struct amdgpu_device *peer_adev)
{
#ifdef CONFIG_HSA_AMD_P2P
	uint64_t address_mask = peer_adev->dev->dma_mask ?
		~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
	resource_size_t aper_limit =
		adev->gmc.aper_base + adev->gmc.aper_size - 1;
	bool p2p_access =
		!adev->gmc.xgmi.connected_to_cpu &&
		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);

	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
		adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
		!(adev->gmc.aper_base & address_mask ||
		  aper_limit & address_mask));
#else
	return false;
#endif
}

int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}

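/*
 * PCI error recovery callbacks.  These are meant to be wired into the PCI
 * core via struct pci_error_handlers (the registration itself lives outside
 * this file); the core invokes them in the usual AER recovery order:
 * error_detected, then mmio_enabled or slot_reset, and finally resume.
 */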
/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !drm_sched_wqueue_ready(&ring->sched))
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{

	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !drm_sched_wqueue_ready(&ring->sched))
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

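/*
 * Illustrative usage sketch (an assumption about typical callers, not code
 * from this file): flush the HDP cache after the CPU has written data that
 * the GPU will read through the BAR, and invalidate it before the CPU reads
 * back data that the GPU has just written, e.g.
 *
 *	amdgpu_device_flush_hdp(adev, ring);       // after CPU -> VRAM writes
 *	...
 *	amdgpu_device_invalidate_hdp(adev, ring);  // before CPU <- VRAM reads
 */
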
int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. This helps to maintain the error context when an error occurs.
 * Compared to a simple hang, the system will stay stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs,
 *    etc.), clears all CPU mappings to the device and disallows remappings
 *    through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

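/*
 * Indirect PCIe port register accessors: the NBIO block exposes an
 * index/data register pair, so each access programs the dword register
 * index, reads it back to post the write, and then accesses the data
 * register, all under pcie_idx_lock.
 */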
u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
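
/*
 * Hypothetical example for amdgpu_device_wait_on_rreg(): poll a status
 * register until the masked value matches the expected one, e.g.
 *
 *	if (amdgpu_device_wait_on_rreg(adev, 0, HYPOTHETICAL_STATUS_REG,
 *				       "STATUS", 0x1, 0x1))
 *		dev_err(adev->dev, "status bit never asserted\n");
 *
 * HYPOTHETICAL_STATUS_REG is a placeholder, not a real register offset.
 */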