/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/devcoredump.h>
#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
"KABINI", 111 "HAWAII", 112 "MULLINS", 113 "TOPAZ", 114 "TONGA", 115 "FIJI", 116 "CARRIZO", 117 "STONEY", 118 "POLARIS10", 119 "POLARIS11", 120 "POLARIS12", 121 "VEGAM", 122 "VEGA10", 123 "VEGA12", 124 "VEGA20", 125 "RAVEN", 126 "ARCTURUS", 127 "RENOIR", 128 "ALDEBARAN", 129 "NAVI10", 130 "CYAN_SKILLFISH", 131 "NAVI14", 132 "NAVI12", 133 "SIENNA_CICHLID", 134 "NAVY_FLOUNDER", 135 "VANGOGH", 136 "DIMGREY_CAVEFISH", 137 "BEIGE_GOBY", 138 "YELLOW_CARP", 139 "IP DISCOVERY", 140 "LAST", 141 }; 142 143 /** 144 * DOC: pcie_replay_count 145 * 146 * The amdgpu driver provides a sysfs API for reporting the total number 147 * of PCIe replays (NAKs) 148 * The file pcie_replay_count is used for this and returns the total 149 * number of replays as a sum of the NAKs generated and NAKs received 150 */ 151 152 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 153 struct device_attribute *attr, char *buf) 154 { 155 struct drm_device *ddev = dev_get_drvdata(dev); 156 struct amdgpu_device *adev = drm_to_adev(ddev); 157 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 158 159 return sysfs_emit(buf, "%llu\n", cnt); 160 } 161 162 static DEVICE_ATTR(pcie_replay_count, 0444, 163 amdgpu_device_get_pcie_replay_count, NULL); 164 165 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 166 167 168 /** 169 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 170 * 171 * @dev: drm_device pointer 172 * 173 * Returns true if the device is a dGPU with ATPX power control, 174 * otherwise return false. 175 */ 176 bool amdgpu_device_supports_px(struct drm_device *dev) 177 { 178 struct amdgpu_device *adev = drm_to_adev(dev); 179 180 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 181 return true; 182 return false; 183 } 184 185 /** 186 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 187 * 188 * @dev: drm_device pointer 189 * 190 * Returns true if the device is a dGPU with ACPI power control, 191 * otherwise return false. 192 */ 193 bool amdgpu_device_supports_boco(struct drm_device *dev) 194 { 195 struct amdgpu_device *adev = drm_to_adev(dev); 196 197 if (adev->has_pr3 || 198 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 199 return true; 200 return false; 201 } 202 203 /** 204 * amdgpu_device_supports_baco - Does the device support BACO 205 * 206 * @dev: drm_device pointer 207 * 208 * Returns true if the device supporte BACO, 209 * otherwise return false. 210 */ 211 bool amdgpu_device_supports_baco(struct drm_device *dev) 212 { 213 struct amdgpu_device *adev = drm_to_adev(dev); 214 215 return amdgpu_asic_supports_baco(adev); 216 } 217 218 /** 219 * amdgpu_device_supports_smart_shift - Is the device dGPU with 220 * smart shift support 221 * 222 * @dev: drm_device pointer 223 * 224 * Returns true if the device is a dGPU with Smart Shift support, 225 * otherwise returns false. 
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be > @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be > @size
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be > @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for the debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if (adev->nbio.funcs->get_pcie_index_hi_offset)
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
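		/* Note (added comment): the readl() after each index writel()
		 * above flushes the posted MMIO write, so the index registers
		 * are programmed before the data port is accessed; the same
		 * pattern is used by the other indirect access helpers here.
		 */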
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if (adev->nbio.funcs->get_pcie_index_hi_offset)
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	int ret;

	amdgpu_asic_pre_asic_init(adev);

	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
	    adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with and/or masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return false;

	return true;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still needs the driver to do a vPost, otherwise the gpu
		 * hangs. smc fw versions above 22.15 don't have this flaw, so we force
		 * vPost for smc versions below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
 * speed switching. Until we have confirmation from Intel that a specific host
 * supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
bool amdgpu_device_pcie_dynamic_switching_supported(void)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	return pcie_aspm_enabled(adev->pdev);
}

bool amdgpu_device_aspm_support_quirk(void)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
#else
	return true;
#endif
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory; a page is 4KB so we have 12 bits of offset, a minimum of
 * 9 bits in the page table, and the remaining bits in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
		adev->mode_info.num_crtc = 1;
		adev->enable_virtual_display = true;
		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
			 adev->enable_virtual_display, adev->mode_info.num_crtc);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
1903 */ 1904 if (adev->asic_type != CHIP_NAVI12) 1905 return 0; 1906 } 1907 1908 switch (adev->asic_type) { 1909 default: 1910 return 0; 1911 case CHIP_VEGA10: 1912 chip_name = "vega10"; 1913 break; 1914 case CHIP_VEGA12: 1915 chip_name = "vega12"; 1916 break; 1917 case CHIP_RAVEN: 1918 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1919 chip_name = "raven2"; 1920 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1921 chip_name = "picasso"; 1922 else 1923 chip_name = "raven"; 1924 break; 1925 case CHIP_ARCTURUS: 1926 chip_name = "arcturus"; 1927 break; 1928 case CHIP_NAVI12: 1929 chip_name = "navi12"; 1930 break; 1931 } 1932 1933 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1934 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 1935 if (err) { 1936 dev_err(adev->dev, 1937 "Failed to get gpu_info firmware \"%s\"\n", 1938 fw_name); 1939 goto out; 1940 } 1941 1942 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1943 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1944 1945 switch (hdr->version_major) { 1946 case 1: 1947 { 1948 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1949 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1950 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1951 1952 /* 1953 * Should be droped when DAL no longer needs it. 1954 */ 1955 if (adev->asic_type == CHIP_NAVI12) 1956 goto parse_soc_bounding_box; 1957 1958 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1959 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1960 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1961 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1962 adev->gfx.config.max_texture_channel_caches = 1963 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1964 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1965 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1966 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1967 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1968 adev->gfx.config.double_offchip_lds_buf = 1969 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1970 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1971 adev->gfx.cu_info.max_waves_per_simd = 1972 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 1973 adev->gfx.cu_info.max_scratch_slots_per_cu = 1974 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 1975 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 1976 if (hdr->version_minor >= 1) { 1977 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 1978 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 1979 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1980 adev->gfx.config.num_sc_per_sh = 1981 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 1982 adev->gfx.config.num_packer_per_sc = 1983 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 1984 } 1985 1986 parse_soc_bounding_box: 1987 /* 1988 * soc bounding box info is not integrated in disocovery table, 1989 * we always need to parse it from gpu info firmware if needed. 
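 * (The bounding box is only carried by version 1.2 of the gpu_info firmware,
 * hence the version_minor == 2 check below.)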
1990 */ 1991 if (hdr->version_minor == 2) { 1992 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 1993 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 1994 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1995 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 1996 } 1997 break; 1998 } 1999 default: 2000 dev_err(adev->dev, 2001 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2002 err = -EINVAL; 2003 goto out; 2004 } 2005 out: 2006 return err; 2007 } 2008 2009 /** 2010 * amdgpu_device_ip_early_init - run early init for hardware IPs 2011 * 2012 * @adev: amdgpu_device pointer 2013 * 2014 * Early initialization pass for hardware IPs. The hardware IPs that make 2015 * up each asic are discovered each IP's early_init callback is run. This 2016 * is the first stage in initializing the asic. 2017 * Returns 0 on success, negative error code on failure. 2018 */ 2019 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2020 { 2021 struct drm_device *dev = adev_to_drm(adev); 2022 struct pci_dev *parent; 2023 int i, r; 2024 bool total; 2025 2026 amdgpu_device_enable_virtual_display(adev); 2027 2028 if (amdgpu_sriov_vf(adev)) { 2029 r = amdgpu_virt_request_full_gpu(adev, true); 2030 if (r) 2031 return r; 2032 } 2033 2034 switch (adev->asic_type) { 2035 #ifdef CONFIG_DRM_AMDGPU_SI 2036 case CHIP_VERDE: 2037 case CHIP_TAHITI: 2038 case CHIP_PITCAIRN: 2039 case CHIP_OLAND: 2040 case CHIP_HAINAN: 2041 adev->family = AMDGPU_FAMILY_SI; 2042 r = si_set_ip_blocks(adev); 2043 if (r) 2044 return r; 2045 break; 2046 #endif 2047 #ifdef CONFIG_DRM_AMDGPU_CIK 2048 case CHIP_BONAIRE: 2049 case CHIP_HAWAII: 2050 case CHIP_KAVERI: 2051 case CHIP_KABINI: 2052 case CHIP_MULLINS: 2053 if (adev->flags & AMD_IS_APU) 2054 adev->family = AMDGPU_FAMILY_KV; 2055 else 2056 adev->family = AMDGPU_FAMILY_CI; 2057 2058 r = cik_set_ip_blocks(adev); 2059 if (r) 2060 return r; 2061 break; 2062 #endif 2063 case CHIP_TOPAZ: 2064 case CHIP_TONGA: 2065 case CHIP_FIJI: 2066 case CHIP_POLARIS10: 2067 case CHIP_POLARIS11: 2068 case CHIP_POLARIS12: 2069 case CHIP_VEGAM: 2070 case CHIP_CARRIZO: 2071 case CHIP_STONEY: 2072 if (adev->flags & AMD_IS_APU) 2073 adev->family = AMDGPU_FAMILY_CZ; 2074 else 2075 adev->family = AMDGPU_FAMILY_VI; 2076 2077 r = vi_set_ip_blocks(adev); 2078 if (r) 2079 return r; 2080 break; 2081 default: 2082 r = amdgpu_discovery_set_ip_blocks(adev); 2083 if (r) 2084 return r; 2085 break; 2086 } 2087 2088 if (amdgpu_has_atpx() && 2089 (amdgpu_is_atpx_hybrid() || 2090 amdgpu_has_atpx_dgpu_power_cntl()) && 2091 ((adev->flags & AMD_IS_APU) == 0) && 2092 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2093 adev->flags |= AMD_IS_PX; 2094 2095 if (!(adev->flags & AMD_IS_APU)) { 2096 parent = pci_upstream_bridge(adev->pdev); 2097 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2098 } 2099 2100 2101 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2102 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2103 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2104 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2105 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2106 2107 total = true; 2108 for (i = 0; i < adev->num_ip_blocks; i++) { 2109 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2110 DRM_WARN("disabled ip block: %d <%s>\n", 2111 i, adev->ip_blocks[i].version->funcs->name); 2112 adev->ip_blocks[i].status.valid = false; 2113 } else { 2114 if (adev->ip_blocks[i].version->funcs->early_init) { 2115 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2116 if (r == -ENOENT) { 2117 adev->ip_blocks[i].status.valid = false; 2118 } else if (r) { 2119 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2120 adev->ip_blocks[i].version->funcs->name, r); 2121 total = false; 2122 } else { 2123 adev->ip_blocks[i].status.valid = true; 2124 } 2125 } else { 2126 adev->ip_blocks[i].status.valid = true; 2127 } 2128 } 2129 /* get the vbios after the asic_funcs are set up */ 2130 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2131 r = amdgpu_device_parse_gpu_info_fw(adev); 2132 if (r) 2133 return r; 2134 2135 /* Read BIOS */ 2136 if (amdgpu_device_read_bios(adev)) { 2137 if (!amdgpu_get_bios(adev)) 2138 return -EINVAL; 2139 2140 r = amdgpu_atombios_init(adev); 2141 if (r) { 2142 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2143 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2144 return r; 2145 } 2146 } 2147 2148 /*get pf2vf msg info at it's earliest time*/ 2149 if (amdgpu_sriov_vf(adev)) 2150 amdgpu_virt_init_data_exchange(adev); 2151 2152 } 2153 } 2154 if (!total) 2155 return -ENODEV; 2156 2157 amdgpu_amdkfd_device_probe(adev); 2158 adev->cg_flags &= amdgpu_cg_mask; 2159 adev->pg_flags &= amdgpu_pg_mask; 2160 2161 return 0; 2162 } 2163 2164 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2165 { 2166 int i, r; 2167 2168 for (i = 0; i < adev->num_ip_blocks; i++) { 2169 if (!adev->ip_blocks[i].status.sw) 2170 continue; 2171 if (adev->ip_blocks[i].status.hw) 2172 continue; 2173 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2174 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2175 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2176 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2177 if (r) { 2178 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2179 adev->ip_blocks[i].version->funcs->name, r); 2180 return r; 2181 } 2182 adev->ip_blocks[i].status.hw = true; 2183 } 2184 } 2185 2186 return 0; 2187 } 2188 2189 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2190 { 2191 int i, r; 2192 2193 for (i = 0; i < adev->num_ip_blocks; i++) { 2194 if (!adev->ip_blocks[i].status.sw) 2195 continue; 2196 if (adev->ip_blocks[i].status.hw) 2197 continue; 2198 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2199 if (r) { 2200 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2201 adev->ip_blocks[i].version->funcs->name, r); 2202 return r; 2203 } 2204 adev->ip_blocks[i].status.hw = true; 2205 } 2206 2207 return 0; 2208 } 2209 2210 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2211 { 2212 int r = 0; 2213 int i; 2214 uint32_t smu_version; 2215 2216 if (adev->asic_type >= CHIP_VEGA10) { 2217 for (i = 0; i < adev->num_ip_blocks; i++) { 2218 if 
(adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2219 continue; 2220 2221 if (!adev->ip_blocks[i].status.sw) 2222 continue; 2223 2224 /* no need to do the fw loading again if already done*/ 2225 if (adev->ip_blocks[i].status.hw == true) 2226 break; 2227 2228 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2229 r = adev->ip_blocks[i].version->funcs->resume(adev); 2230 if (r) { 2231 DRM_ERROR("resume of IP block <%s> failed %d\n", 2232 adev->ip_blocks[i].version->funcs->name, r); 2233 return r; 2234 } 2235 } else { 2236 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2237 if (r) { 2238 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2239 adev->ip_blocks[i].version->funcs->name, r); 2240 return r; 2241 } 2242 } 2243 2244 adev->ip_blocks[i].status.hw = true; 2245 break; 2246 } 2247 } 2248 2249 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2250 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2251 2252 return r; 2253 } 2254 2255 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2256 { 2257 long timeout; 2258 int r, i; 2259 2260 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2261 struct amdgpu_ring *ring = adev->rings[i]; 2262 2263 /* No need to setup the GPU scheduler for rings that don't need it */ 2264 if (!ring || ring->no_scheduler) 2265 continue; 2266 2267 switch (ring->funcs->type) { 2268 case AMDGPU_RING_TYPE_GFX: 2269 timeout = adev->gfx_timeout; 2270 break; 2271 case AMDGPU_RING_TYPE_COMPUTE: 2272 timeout = adev->compute_timeout; 2273 break; 2274 case AMDGPU_RING_TYPE_SDMA: 2275 timeout = adev->sdma_timeout; 2276 break; 2277 default: 2278 timeout = adev->video_timeout; 2279 break; 2280 } 2281 2282 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, 2283 DRM_SCHED_PRIORITY_COUNT, 2284 ring->num_hw_submission, 0, 2285 timeout, adev->reset_domain->wq, 2286 ring->sched_score, ring->name, 2287 adev->dev); 2288 if (r) { 2289 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2290 ring->name); 2291 return r; 2292 } 2293 } 2294 2295 amdgpu_xcp_update_partition_sched_list(adev); 2296 2297 return 0; 2298 } 2299 2300 2301 /** 2302 * amdgpu_device_ip_init - run init for hardware IPs 2303 * 2304 * @adev: amdgpu_device pointer 2305 * 2306 * Main initialization pass for hardware IPs. The list of all the hardware 2307 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2308 * are run. sw_init initializes the software state associated with each IP 2309 * and hw_init initializes the hardware associated with each IP. 2310 * Returns 0 on success, negative error code on failure. 
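 *
 * Roughly, the sequence below is: sw_init for every valid block (COMMON and
 * GMC also get an early hw_init), IB pool and ucode BO creation, hw_init
 * phase 1 (COMMON/IH, plus PSP under SR-IOV), firmware loading, and finally
 * hw_init phase 2 for the remaining blocks.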
2311 */ 2312 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2313 { 2314 int i, r; 2315 2316 r = amdgpu_ras_init(adev); 2317 if (r) 2318 return r; 2319 2320 for (i = 0; i < adev->num_ip_blocks; i++) { 2321 if (!adev->ip_blocks[i].status.valid) 2322 continue; 2323 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2324 if (r) { 2325 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2326 adev->ip_blocks[i].version->funcs->name, r); 2327 goto init_failed; 2328 } 2329 adev->ip_blocks[i].status.sw = true; 2330 2331 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2332 /* need to do common hw init early so everything is set up for gmc */ 2333 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2334 if (r) { 2335 DRM_ERROR("hw_init %d failed %d\n", i, r); 2336 goto init_failed; 2337 } 2338 adev->ip_blocks[i].status.hw = true; 2339 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2340 /* need to do gmc hw init early so we can allocate gpu mem */ 2341 /* Try to reserve bad pages early */ 2342 if (amdgpu_sriov_vf(adev)) 2343 amdgpu_virt_exchange_data(adev); 2344 2345 r = amdgpu_device_mem_scratch_init(adev); 2346 if (r) { 2347 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2348 goto init_failed; 2349 } 2350 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2351 if (r) { 2352 DRM_ERROR("hw_init %d failed %d\n", i, r); 2353 goto init_failed; 2354 } 2355 r = amdgpu_device_wb_init(adev); 2356 if (r) { 2357 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2358 goto init_failed; 2359 } 2360 adev->ip_blocks[i].status.hw = true; 2361 2362 /* right after GMC hw init, we create CSA */ 2363 if (adev->gfx.mcbp) { 2364 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2365 AMDGPU_GEM_DOMAIN_VRAM | 2366 AMDGPU_GEM_DOMAIN_GTT, 2367 AMDGPU_CSA_SIZE); 2368 if (r) { 2369 DRM_ERROR("allocate CSA failed %d\n", r); 2370 goto init_failed; 2371 } 2372 } 2373 } 2374 } 2375 2376 if (amdgpu_sriov_vf(adev)) 2377 amdgpu_virt_init_data_exchange(adev); 2378 2379 r = amdgpu_ib_pool_init(adev); 2380 if (r) { 2381 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2382 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2383 goto init_failed; 2384 } 2385 2386 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2387 if (r) 2388 goto init_failed; 2389 2390 r = amdgpu_device_ip_hw_init_phase1(adev); 2391 if (r) 2392 goto init_failed; 2393 2394 r = amdgpu_device_fw_loading(adev); 2395 if (r) 2396 goto init_failed; 2397 2398 r = amdgpu_device_ip_hw_init_phase2(adev); 2399 if (r) 2400 goto init_failed; 2401 2402 /* 2403 * retired pages will be loaded from eeprom and reserved here, 2404 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2405 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2406 * for I2C communication which only true at this point. 2407 * 2408 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2409 * failure from bad gpu situation and stop amdgpu init process 2410 * accordingly. For other failed cases, it will still release all 2411 * the resource and print error message, rather than returning one 2412 * negative value to upper level. 
2413 * 2414 * Note: theoretically, this should be called before all vram allocations 2415 * to protect retired page from abusing 2416 */ 2417 r = amdgpu_ras_recovery_init(adev); 2418 if (r) 2419 goto init_failed; 2420 2421 /** 2422 * In case of XGMI grab extra reference for reset domain for this device 2423 */ 2424 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2425 if (amdgpu_xgmi_add_device(adev) == 0) { 2426 if (!amdgpu_sriov_vf(adev)) { 2427 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2428 2429 if (WARN_ON(!hive)) { 2430 r = -ENOENT; 2431 goto init_failed; 2432 } 2433 2434 if (!hive->reset_domain || 2435 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2436 r = -ENOENT; 2437 amdgpu_put_xgmi_hive(hive); 2438 goto init_failed; 2439 } 2440 2441 /* Drop the early temporary reset domain we created for device */ 2442 amdgpu_reset_put_reset_domain(adev->reset_domain); 2443 adev->reset_domain = hive->reset_domain; 2444 amdgpu_put_xgmi_hive(hive); 2445 } 2446 } 2447 } 2448 2449 r = amdgpu_device_init_schedulers(adev); 2450 if (r) 2451 goto init_failed; 2452 2453 if (adev->mman.buffer_funcs_ring->sched.ready) 2454 amdgpu_ttm_set_buffer_funcs_status(adev, true); 2455 2456 /* Don't init kfd if whole hive need to be reset during init */ 2457 if (!adev->gmc.xgmi.pending_reset) { 2458 kgd2kfd_init_zone_device(adev); 2459 amdgpu_amdkfd_device_init(adev); 2460 } 2461 2462 amdgpu_fru_get_product_info(adev); 2463 2464 init_failed: 2465 2466 return r; 2467 } 2468 2469 /** 2470 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2471 * 2472 * @adev: amdgpu_device pointer 2473 * 2474 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2475 * this function before a GPU reset. If the value is retained after a 2476 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2477 */ 2478 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2479 { 2480 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2481 } 2482 2483 /** 2484 * amdgpu_device_check_vram_lost - check if vram is valid 2485 * 2486 * @adev: amdgpu_device pointer 2487 * 2488 * Checks the reset magic value written to the gart pointer in VRAM. 2489 * The driver calls this after a GPU reset to see if the contents of 2490 * VRAM is lost or now. 2491 * returns true if vram is lost, false if not. 2492 */ 2493 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2494 { 2495 if (memcmp(adev->gart.ptr, adev->reset_magic, 2496 AMDGPU_RESET_MAGIC_NUM)) 2497 return true; 2498 2499 if (!amdgpu_in_reset(adev)) 2500 return false; 2501 2502 /* 2503 * For all ASICs with baco/mode1 reset, the VRAM is 2504 * always assumed to be lost. 2505 */ 2506 switch (amdgpu_asic_reset_method(adev)) { 2507 case AMD_RESET_METHOD_BACO: 2508 case AMD_RESET_METHOD_MODE1: 2509 return true; 2510 default: 2511 return false; 2512 } 2513 } 2514 2515 /** 2516 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2517 * 2518 * @adev: amdgpu_device pointer 2519 * @state: clockgating state (gate or ungate) 2520 * 2521 * The list of all the hardware IPs that make up the asic is walked and the 2522 * set_clockgating_state callbacks are run. 2523 * Late initialization pass enabling clockgating for hardware IPs. 2524 * Fini or suspend, pass disabling clockgating for hardware IPs. 2525 * Returns 0 on success, negative error code on failure. 
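 *
 * Blocks are walked front to back when gating and back to front when
 * ungating; UVD/VCE/VCN/JPEG are skipped here because their clockgating is
 * handled separately. A typical call from late init is
 * amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE).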
2526 */ 2527 2528 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2529 enum amd_clockgating_state state) 2530 { 2531 int i, j, r; 2532 2533 if (amdgpu_emu_mode == 1) 2534 return 0; 2535 2536 for (j = 0; j < adev->num_ip_blocks; j++) { 2537 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2538 if (!adev->ip_blocks[i].status.late_initialized) 2539 continue; 2540 /* skip CG for GFX, SDMA on S0ix */ 2541 if (adev->in_s0ix && 2542 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2543 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2544 continue; 2545 /* skip CG for VCE/UVD, it's handled specially */ 2546 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2547 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2548 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2549 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2550 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2551 /* enable clockgating to save power */ 2552 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2553 state); 2554 if (r) { 2555 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2556 adev->ip_blocks[i].version->funcs->name, r); 2557 return r; 2558 } 2559 } 2560 } 2561 2562 return 0; 2563 } 2564 2565 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2566 enum amd_powergating_state state) 2567 { 2568 int i, j, r; 2569 2570 if (amdgpu_emu_mode == 1) 2571 return 0; 2572 2573 for (j = 0; j < adev->num_ip_blocks; j++) { 2574 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2575 if (!adev->ip_blocks[i].status.late_initialized) 2576 continue; 2577 /* skip PG for GFX, SDMA on S0ix */ 2578 if (adev->in_s0ix && 2579 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2580 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2581 continue; 2582 /* skip CG for VCE/UVD, it's handled specially */ 2583 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2584 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2585 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2586 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2587 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2588 /* enable powergating to save power */ 2589 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2590 state); 2591 if (r) { 2592 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2593 adev->ip_blocks[i].version->funcs->name, r); 2594 return r; 2595 } 2596 } 2597 } 2598 return 0; 2599 } 2600 2601 static int amdgpu_device_enable_mgpu_fan_boost(void) 2602 { 2603 struct amdgpu_gpu_instance *gpu_ins; 2604 struct amdgpu_device *adev; 2605 int i, ret = 0; 2606 2607 mutex_lock(&mgpu_info.mutex); 2608 2609 /* 2610 * MGPU fan boost feature should be enabled 2611 * only when there are two or more dGPUs in 2612 * the system 2613 */ 2614 if (mgpu_info.num_dgpu < 2) 2615 goto out; 2616 2617 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2618 gpu_ins = &(mgpu_info.gpu_ins[i]); 2619 adev = gpu_ins->adev; 2620 if (!(adev->flags & AMD_IS_APU) && 2621 !gpu_ins->mgpu_fan_enabled) { 2622 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2623 if (ret) 2624 break; 2625 2626 gpu_ins->mgpu_fan_enabled = 1; 2627 } 2628 } 2629 2630 out: 2631 mutex_unlock(&mgpu_info.mutex); 2632 2633 return ret; 2634 } 2635 2636 /** 2637 * amdgpu_device_ip_late_init - run late init for hardware IPs 2638 * 2639 * @adev: 
amdgpu_device pointer 2640 * 2641 * Late initialization pass for hardware IPs. The list of all the hardware 2642 * IPs that make up the asic is walked and the late_init callbacks are run. 2643 * late_init covers any special initialization that an IP requires 2644 * after all of the IPs have been initialized or something that needs to happen 2645 * late in the init process. 2646 * Returns 0 on success, negative error code on failure. 2647 */ 2648 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2649 { 2650 struct amdgpu_gpu_instance *gpu_instance; 2651 int i = 0, r; 2652 2653 for (i = 0; i < adev->num_ip_blocks; i++) { 2654 if (!adev->ip_blocks[i].status.hw) 2655 continue; 2656 if (adev->ip_blocks[i].version->funcs->late_init) { 2657 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2658 if (r) { 2659 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2660 adev->ip_blocks[i].version->funcs->name, r); 2661 return r; 2662 } 2663 } 2664 adev->ip_blocks[i].status.late_initialized = true; 2665 } 2666 2667 r = amdgpu_ras_late_init(adev); 2668 if (r) { 2669 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2670 return r; 2671 } 2672 2673 amdgpu_ras_set_error_query_ready(adev, true); 2674 2675 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2676 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2677 2678 amdgpu_device_fill_reset_magic(adev); 2679 2680 r = amdgpu_device_enable_mgpu_fan_boost(); 2681 if (r) 2682 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2683 2684 /* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */ 2685 if (amdgpu_passthrough(adev) && 2686 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 2687 adev->asic_type == CHIP_ALDEBARAN)) 2688 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2689 2690 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2691 mutex_lock(&mgpu_info.mutex); 2692 2693 /* 2694 * Reset device p-state to low as this was booted with high. 2695 * 2696 * This should be performed only after all devices from the same 2697 * hive get initialized. 2698 * 2699 * However, the number of devices in the hive is not known in advance; 2700 * it is counted one by one as each device is initialized. 2701 * 2702 * So, we wait for all XGMI interlinked devices to be initialized. 2703 * This may bring some delays as those devices may come from 2704 * different hives. But that should be OK.
2705 */ 2706 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2707 for (i = 0; i < mgpu_info.num_gpu; i++) { 2708 gpu_instance = &(mgpu_info.gpu_ins[i]); 2709 if (gpu_instance->adev->flags & AMD_IS_APU) 2710 continue; 2711 2712 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2713 AMDGPU_XGMI_PSTATE_MIN); 2714 if (r) { 2715 DRM_ERROR("pstate setting failed (%d).\n", r); 2716 break; 2717 } 2718 } 2719 } 2720 2721 mutex_unlock(&mgpu_info.mutex); 2722 } 2723 2724 return 0; 2725 } 2726 2727 /** 2728 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2729 * 2730 * @adev: amdgpu_device pointer 2731 * 2732 * For ASICs need to disable SMC first 2733 */ 2734 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2735 { 2736 int i, r; 2737 2738 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2739 return; 2740 2741 for (i = 0; i < adev->num_ip_blocks; i++) { 2742 if (!adev->ip_blocks[i].status.hw) 2743 continue; 2744 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2745 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2746 /* XXX handle errors */ 2747 if (r) { 2748 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2749 adev->ip_blocks[i].version->funcs->name, r); 2750 } 2751 adev->ip_blocks[i].status.hw = false; 2752 break; 2753 } 2754 } 2755 } 2756 2757 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2758 { 2759 int i, r; 2760 2761 for (i = 0; i < adev->num_ip_blocks; i++) { 2762 if (!adev->ip_blocks[i].version->funcs->early_fini) 2763 continue; 2764 2765 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2766 if (r) { 2767 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2768 adev->ip_blocks[i].version->funcs->name, r); 2769 } 2770 } 2771 2772 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2773 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2774 2775 amdgpu_amdkfd_suspend(adev, false); 2776 2777 /* Workaroud for ASICs need to disable SMC first */ 2778 amdgpu_device_smu_fini_early(adev); 2779 2780 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2781 if (!adev->ip_blocks[i].status.hw) 2782 continue; 2783 2784 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2785 /* XXX handle errors */ 2786 if (r) { 2787 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2788 adev->ip_blocks[i].version->funcs->name, r); 2789 } 2790 2791 adev->ip_blocks[i].status.hw = false; 2792 } 2793 2794 if (amdgpu_sriov_vf(adev)) { 2795 if (amdgpu_virt_release_full_gpu(adev, false)) 2796 DRM_ERROR("failed to release exclusive mode on fini\n"); 2797 } 2798 2799 return 0; 2800 } 2801 2802 /** 2803 * amdgpu_device_ip_fini - run fini for hardware IPs 2804 * 2805 * @adev: amdgpu_device pointer 2806 * 2807 * Main teardown pass for hardware IPs. The list of all the hardware 2808 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2809 * are run. hw_fini tears down the hardware associated with each IP 2810 * and sw_fini tears down any software state associated with each IP. 2811 * Returns 0 on success, negative error code on failure. 
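 *
 * Teardown runs in reverse IP order; when the GMC block is reached, the
 * ucode BO, static CSA, writeback, scratch memory and IB pool are released
 * before its sw_fini, and late_fini callbacks run in a final reverse pass.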
2812 */ 2813 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2814 { 2815 int i, r; 2816 2817 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2818 amdgpu_virt_release_ras_err_handler_data(adev); 2819 2820 if (adev->gmc.xgmi.num_physical_nodes > 1) 2821 amdgpu_xgmi_remove_device(adev); 2822 2823 amdgpu_amdkfd_device_fini_sw(adev); 2824 2825 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2826 if (!adev->ip_blocks[i].status.sw) 2827 continue; 2828 2829 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2830 amdgpu_ucode_free_bo(adev); 2831 amdgpu_free_static_csa(&adev->virt.csa_obj); 2832 amdgpu_device_wb_fini(adev); 2833 amdgpu_device_mem_scratch_fini(adev); 2834 amdgpu_ib_pool_fini(adev); 2835 } 2836 2837 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2838 /* XXX handle errors */ 2839 if (r) { 2840 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2841 adev->ip_blocks[i].version->funcs->name, r); 2842 } 2843 adev->ip_blocks[i].status.sw = false; 2844 adev->ip_blocks[i].status.valid = false; 2845 } 2846 2847 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2848 if (!adev->ip_blocks[i].status.late_initialized) 2849 continue; 2850 if (adev->ip_blocks[i].version->funcs->late_fini) 2851 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2852 adev->ip_blocks[i].status.late_initialized = false; 2853 } 2854 2855 amdgpu_ras_fini(adev); 2856 2857 return 0; 2858 } 2859 2860 /** 2861 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2862 * 2863 * @work: work_struct. 2864 */ 2865 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2866 { 2867 struct amdgpu_device *adev = 2868 container_of(work, struct amdgpu_device, delayed_init_work.work); 2869 int r; 2870 2871 r = amdgpu_ib_ring_tests(adev); 2872 if (r) 2873 DRM_ERROR("ib ring test failed (%d).\n", r); 2874 } 2875 2876 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2877 { 2878 struct amdgpu_device *adev = 2879 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2880 2881 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2882 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2883 2884 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2885 adev->gfx.gfx_off_state = true; 2886 } 2887 2888 /** 2889 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2890 * 2891 * @adev: amdgpu_device pointer 2892 * 2893 * Main suspend function for hardware IPs. The list of all the hardware 2894 * IPs that make up the asic is walked, clockgating is disabled and the 2895 * suspend callbacks are run. suspend puts the hardware and software state 2896 * in each IP into a state suitable for suspend. 2897 * Returns 0 on success, negative error code on failure. 2898 */ 2899 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2900 { 2901 int i, r; 2902 2903 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2904 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2905 2906 /* 2907 * Per PMFW team's suggestion, driver needs to handle gfxoff 2908 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2909 * scenario. Add the missing df cstate disablement here. 
2910 */ 2911 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2912 dev_warn(adev->dev, "Failed to disallow df cstate"); 2913 2914 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2915 if (!adev->ip_blocks[i].status.valid) 2916 continue; 2917 2918 /* displays are handled separately */ 2919 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2920 continue; 2921 2922 /* XXX handle errors */ 2923 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2924 /* XXX handle errors */ 2925 if (r) { 2926 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2927 adev->ip_blocks[i].version->funcs->name, r); 2928 return r; 2929 } 2930 2931 adev->ip_blocks[i].status.hw = false; 2932 } 2933 2934 return 0; 2935 } 2936 2937 /** 2938 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2939 * 2940 * @adev: amdgpu_device pointer 2941 * 2942 * Main suspend function for hardware IPs. The list of all the hardware 2943 * IPs that make up the asic is walked, clockgating is disabled and the 2944 * suspend callbacks are run. suspend puts the hardware and software state 2945 * in each IP into a state suitable for suspend. 2946 * Returns 0 on success, negative error code on failure. 2947 */ 2948 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2949 { 2950 int i, r; 2951 2952 if (adev->in_s0ix) 2953 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 2954 2955 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2956 if (!adev->ip_blocks[i].status.valid) 2957 continue; 2958 /* displays are handled in phase1 */ 2959 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2960 continue; 2961 /* PSP lost connection when err_event_athub occurs */ 2962 if (amdgpu_ras_intr_triggered() && 2963 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2964 adev->ip_blocks[i].status.hw = false; 2965 continue; 2966 } 2967 2968 /* skip unnecessary suspend if we do not initialize them yet */ 2969 if (adev->gmc.xgmi.pending_reset && 2970 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2971 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2972 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2973 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 2974 adev->ip_blocks[i].status.hw = false; 2975 continue; 2976 } 2977 2978 /* skip suspend of gfx/mes and psp for S0ix 2979 * gfx is in gfxoff state, so on resume it will exit gfxoff just 2980 * like at runtime. PSP is also part of the always on hardware 2981 * so no need to suspend it. 2982 */ 2983 if (adev->in_s0ix && 2984 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 2985 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2986 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 2987 continue; 2988 2989 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 2990 if (adev->in_s0ix && 2991 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 2992 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2993 continue; 2994 2995 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 2996 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 2997 * from this location and RLC Autoload automatically also gets loaded 2998 * from here based on PMFW -> PSP message during re-init sequence. 2999 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3000 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3001 */ 3002 if (amdgpu_in_reset(adev) && 3003 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3004 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3005 continue; 3006 3007 /* XXX handle errors */ 3008 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3009 /* XXX handle errors */ 3010 if (r) { 3011 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3012 adev->ip_blocks[i].version->funcs->name, r); 3013 } 3014 adev->ip_blocks[i].status.hw = false; 3015 /* handle putting the SMC in the appropriate state */ 3016 if (!amdgpu_sriov_vf(adev)) { 3017 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3018 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3019 if (r) { 3020 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3021 adev->mp1_state, r); 3022 return r; 3023 } 3024 } 3025 } 3026 } 3027 3028 return 0; 3029 } 3030 3031 /** 3032 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3033 * 3034 * @adev: amdgpu_device pointer 3035 * 3036 * Main suspend function for hardware IPs. The list of all the hardware 3037 * IPs that make up the asic is walked, clockgating is disabled and the 3038 * suspend callbacks are run. suspend puts the hardware and software state 3039 * in each IP into a state suitable for suspend. 3040 * Returns 0 on success, negative error code on failure. 3041 */ 3042 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3043 { 3044 int r; 3045 3046 if (amdgpu_sriov_vf(adev)) { 3047 amdgpu_virt_fini_data_exchange(adev); 3048 amdgpu_virt_request_full_gpu(adev, false); 3049 } 3050 3051 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3052 3053 r = amdgpu_device_ip_suspend_phase1(adev); 3054 if (r) 3055 return r; 3056 r = amdgpu_device_ip_suspend_phase2(adev); 3057 3058 if (amdgpu_sriov_vf(adev)) 3059 amdgpu_virt_release_full_gpu(adev, false); 3060 3061 return r; 3062 } 3063 3064 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3065 { 3066 int i, r; 3067 3068 static enum amd_ip_block_type ip_order[] = { 3069 AMD_IP_BLOCK_TYPE_COMMON, 3070 AMD_IP_BLOCK_TYPE_GMC, 3071 AMD_IP_BLOCK_TYPE_PSP, 3072 AMD_IP_BLOCK_TYPE_IH, 3073 }; 3074 3075 for (i = 0; i < adev->num_ip_blocks; i++) { 3076 int j; 3077 struct amdgpu_ip_block *block; 3078 3079 block = &adev->ip_blocks[i]; 3080 block->status.hw = false; 3081 3082 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3083 3084 if (block->version->type != ip_order[j] || 3085 !block->status.valid) 3086 continue; 3087 3088 r = block->version->funcs->hw_init(adev); 3089 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3090 if (r) 3091 return r; 3092 block->status.hw = true; 3093 } 3094 } 3095 3096 return 0; 3097 } 3098 3099 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3100 { 3101 int i, r; 3102 3103 static enum amd_ip_block_type ip_order[] = { 3104 AMD_IP_BLOCK_TYPE_SMC, 3105 AMD_IP_BLOCK_TYPE_DCE, 3106 AMD_IP_BLOCK_TYPE_GFX, 3107 AMD_IP_BLOCK_TYPE_SDMA, 3108 AMD_IP_BLOCK_TYPE_MES, 3109 AMD_IP_BLOCK_TYPE_UVD, 3110 AMD_IP_BLOCK_TYPE_VCE, 3111 AMD_IP_BLOCK_TYPE_VCN, 3112 AMD_IP_BLOCK_TYPE_JPEG 3113 }; 3114 3115 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3116 int j; 3117 struct amdgpu_ip_block *block; 3118 3119 for (j = 0; j < adev->num_ip_blocks; j++) { 3120 block = &adev->ip_blocks[j]; 3121 3122 if (block->version->type != ip_order[i] || 3123 !block->status.valid || 3124 block->status.hw) 3125 continue; 3126 3127 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3128 r = block->version->funcs->resume(adev); 3129 else 
3130 r = block->version->funcs->hw_init(adev); 3131 3132 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3133 if (r) 3134 return r; 3135 block->status.hw = true; 3136 } 3137 } 3138 3139 return 0; 3140 } 3141 3142 /** 3143 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3144 * 3145 * @adev: amdgpu_device pointer 3146 * 3147 * First resume function for hardware IPs. The list of all the hardware 3148 * IPs that make up the asic is walked and the resume callbacks are run for 3149 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3150 * after a suspend and updates the software state as necessary. This 3151 * function is also used for restoring the GPU after a GPU reset. 3152 * Returns 0 on success, negative error code on failure. 3153 */ 3154 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3155 { 3156 int i, r; 3157 3158 for (i = 0; i < adev->num_ip_blocks; i++) { 3159 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3160 continue; 3161 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3162 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3163 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3164 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3165 3166 r = adev->ip_blocks[i].version->funcs->resume(adev); 3167 if (r) { 3168 DRM_ERROR("resume of IP block <%s> failed %d\n", 3169 adev->ip_blocks[i].version->funcs->name, r); 3170 return r; 3171 } 3172 adev->ip_blocks[i].status.hw = true; 3173 } 3174 } 3175 3176 return 0; 3177 } 3178 3179 /** 3180 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3181 * 3182 * @adev: amdgpu_device pointer 3183 * 3184 * First resume function for hardware IPs. The list of all the hardware 3185 * IPs that make up the asic is walked and the resume callbacks are run for 3186 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3187 * functional state after a suspend and updates the software state as 3188 * necessary. This function is also used for restoring the GPU after a GPU 3189 * reset. 3190 * Returns 0 on success, negative error code on failure. 3191 */ 3192 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3193 { 3194 int i, r; 3195 3196 for (i = 0; i < adev->num_ip_blocks; i++) { 3197 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3198 continue; 3199 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3200 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3201 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3202 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3203 continue; 3204 r = adev->ip_blocks[i].version->funcs->resume(adev); 3205 if (r) { 3206 DRM_ERROR("resume of IP block <%s> failed %d\n", 3207 adev->ip_blocks[i].version->funcs->name, r); 3208 return r; 3209 } 3210 adev->ip_blocks[i].status.hw = true; 3211 } 3212 3213 return 0; 3214 } 3215 3216 /** 3217 * amdgpu_device_ip_resume - run resume for hardware IPs 3218 * 3219 * @adev: amdgpu_device pointer 3220 * 3221 * Main resume function for hardware IPs. The hardware IPs 3222 * are split into two resume functions because they are 3223 * also used in recovering from a GPU reset and some additional 3224 * steps need to be take between them. In this case (S3/S4) they are 3225 * run sequentially. 3226 * Returns 0 on success, negative error code on failure. 
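 *
 * Phase 1 resumes COMMON, GMC and IH (plus PSP when running under SR-IOV),
 * firmware is then reloaded, and phase 2 resumes the remaining blocks before
 * the TTM buffer functions are re-enabled.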
3227 */ 3228 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3229 { 3230 int r; 3231 3232 r = amdgpu_device_ip_resume_phase1(adev); 3233 if (r) 3234 return r; 3235 3236 r = amdgpu_device_fw_loading(adev); 3237 if (r) 3238 return r; 3239 3240 r = amdgpu_device_ip_resume_phase2(adev); 3241 3242 if (adev->mman.buffer_funcs_ring->sched.ready) 3243 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3244 3245 return r; 3246 } 3247 3248 /** 3249 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3250 * 3251 * @adev: amdgpu_device pointer 3252 * 3253 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3254 */ 3255 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3256 { 3257 if (amdgpu_sriov_vf(adev)) { 3258 if (adev->is_atom_fw) { 3259 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3260 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3261 } else { 3262 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3263 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3264 } 3265 3266 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3267 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3268 } 3269 } 3270 3271 /** 3272 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3273 * 3274 * @asic_type: AMD asic type 3275 * 3276 * Check if there is DC (new modesetting infrastructre) support for an asic. 3277 * returns true if DC has support, false if not. 3278 */ 3279 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3280 { 3281 switch (asic_type) { 3282 #ifdef CONFIG_DRM_AMDGPU_SI 3283 case CHIP_HAINAN: 3284 #endif 3285 case CHIP_TOPAZ: 3286 /* chips with no display hardware */ 3287 return false; 3288 #if defined(CONFIG_DRM_AMD_DC) 3289 case CHIP_TAHITI: 3290 case CHIP_PITCAIRN: 3291 case CHIP_VERDE: 3292 case CHIP_OLAND: 3293 /* 3294 * We have systems in the wild with these ASICs that require 3295 * LVDS and VGA support which is not supported with DC. 3296 * 3297 * Fallback to the non-DC driver here by default so as not to 3298 * cause regressions. 3299 */ 3300 #if defined(CONFIG_DRM_AMD_DC_SI) 3301 return amdgpu_dc > 0; 3302 #else 3303 return false; 3304 #endif 3305 case CHIP_BONAIRE: 3306 case CHIP_KAVERI: 3307 case CHIP_KABINI: 3308 case CHIP_MULLINS: 3309 /* 3310 * We have systems in the wild with these ASICs that require 3311 * VGA support which is not supported with DC. 3312 * 3313 * Fallback to the non-DC driver here by default so as not to 3314 * cause regressions. 
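 * (amdgpu_dc > 0 means DC was explicitly requested via the amdgpu.dc module
 * parameter rather than left at its automatic default.)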
3315 */ 3316 return amdgpu_dc > 0; 3317 default: 3318 return amdgpu_dc != 0; 3319 #else 3320 default: 3321 if (amdgpu_dc > 0) 3322 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3323 return false; 3324 #endif 3325 } 3326 } 3327 3328 /** 3329 * amdgpu_device_has_dc_support - check if dc is supported 3330 * 3331 * @adev: amdgpu_device pointer 3332 * 3333 * Returns true for supported, false for not supported 3334 */ 3335 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3336 { 3337 if (adev->enable_virtual_display || 3338 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3339 return false; 3340 3341 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3342 } 3343 3344 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3345 { 3346 struct amdgpu_device *adev = 3347 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3348 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3349 3350 /* It's a bug to not have a hive within this function */ 3351 if (WARN_ON(!hive)) 3352 return; 3353 3354 /* 3355 * Use task barrier to synchronize all xgmi reset works across the 3356 * hive. task_barrier_enter and task_barrier_exit will block 3357 * until all the threads running the xgmi reset works reach 3358 * those points. task_barrier_full will do both blocks. 3359 */ 3360 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3361 3362 task_barrier_enter(&hive->tb); 3363 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3364 3365 if (adev->asic_reset_res) 3366 goto fail; 3367 3368 task_barrier_exit(&hive->tb); 3369 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3370 3371 if (adev->asic_reset_res) 3372 goto fail; 3373 3374 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3375 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3376 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3377 } else { 3378 3379 task_barrier_full(&hive->tb); 3380 adev->asic_reset_res = amdgpu_asic_reset(adev); 3381 } 3382 3383 fail: 3384 if (adev->asic_reset_res) 3385 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3386 adev->asic_reset_res, adev_to_drm(adev)->unique); 3387 amdgpu_put_xgmi_hive(hive); 3388 } 3389 3390 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3391 { 3392 char *input = amdgpu_lockup_timeout; 3393 char *timeout_setting = NULL; 3394 int index = 0; 3395 long timeout; 3396 int ret = 0; 3397 3398 /* 3399 * By default timeout for non compute jobs is 10000 3400 * and 60000 for compute jobs. 3401 * In SR-IOV or passthrough mode, timeout for compute 3402 * jobs are 60000 by default. 3403 */ 3404 adev->gfx_timeout = msecs_to_jiffies(10000); 3405 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3406 if (amdgpu_sriov_vf(adev)) 3407 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3408 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3409 else 3410 adev->compute_timeout = msecs_to_jiffies(60000); 3411 3412 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3413 while ((timeout_setting = strsep(&input, ",")) && 3414 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3415 ret = kstrtol(timeout_setting, 0, &timeout); 3416 if (ret) 3417 return ret; 3418 3419 if (timeout == 0) { 3420 index++; 3421 continue; 3422 } else if (timeout < 0) { 3423 timeout = MAX_SCHEDULE_TIMEOUT; 3424 dev_warn(adev->dev, "lockup timeout disabled"); 3425 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3426 } else { 3427 timeout = msecs_to_jiffies(timeout); 3428 } 3429 3430 switch (index++) { 3431 case 0: 3432 adev->gfx_timeout = timeout; 3433 break; 3434 case 1: 3435 adev->compute_timeout = timeout; 3436 break; 3437 case 2: 3438 adev->sdma_timeout = timeout; 3439 break; 3440 case 3: 3441 adev->video_timeout = timeout; 3442 break; 3443 default: 3444 break; 3445 } 3446 } 3447 /* 3448 * There is only one value specified and 3449 * it should apply to all non-compute jobs. 3450 */ 3451 if (index == 1) { 3452 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3453 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3454 adev->compute_timeout = adev->gfx_timeout; 3455 } 3456 } 3457 3458 return ret; 3459 } 3460 3461 /** 3462 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3463 * 3464 * @adev: amdgpu_device pointer 3465 * 3466 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3467 */ 3468 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3469 { 3470 struct iommu_domain *domain; 3471 3472 domain = iommu_get_domain_for_dev(adev->dev); 3473 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3474 adev->ram_is_direct_mapped = true; 3475 } 3476 3477 static const struct attribute *amdgpu_dev_attributes[] = { 3478 &dev_attr_pcie_replay_count.attr, 3479 NULL 3480 }; 3481 3482 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3483 { 3484 if (amdgpu_mcbp == 1) 3485 adev->gfx.mcbp = true; 3486 else if (amdgpu_mcbp == 0) 3487 adev->gfx.mcbp = false; 3488 else if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) && 3489 (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) && 3490 adev->gfx.num_gfx_rings) 3491 adev->gfx.mcbp = true; 3492 3493 if (amdgpu_sriov_vf(adev)) 3494 adev->gfx.mcbp = true; 3495 3496 if (adev->gfx.mcbp) 3497 DRM_INFO("MCBP is enabled\n"); 3498 } 3499 3500 /** 3501 * amdgpu_device_init - initialize the driver 3502 * 3503 * @adev: amdgpu_device pointer 3504 * @flags: driver flags 3505 * 3506 * Initializes the driver info and hw (all asics). 3507 * Returns 0 for success or an error on failure. 3508 * Called at driver startup. 
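 *
 * A minimal call sketch (the error label is hypothetical):
 *   r = amdgpu_device_init(adev, flags);
 *   if (r)
 *       goto err_out;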
3509 */ 3510 int amdgpu_device_init(struct amdgpu_device *adev, 3511 uint32_t flags) 3512 { 3513 struct drm_device *ddev = adev_to_drm(adev); 3514 struct pci_dev *pdev = adev->pdev; 3515 int r, i; 3516 bool px = false; 3517 u32 max_MBps; 3518 int tmp; 3519 3520 adev->shutdown = false; 3521 adev->flags = flags; 3522 3523 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3524 adev->asic_type = amdgpu_force_asic_type; 3525 else 3526 adev->asic_type = flags & AMD_ASIC_MASK; 3527 3528 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3529 if (amdgpu_emu_mode == 1) 3530 adev->usec_timeout *= 10; 3531 adev->gmc.gart_size = 512 * 1024 * 1024; 3532 adev->accel_working = false; 3533 adev->num_rings = 0; 3534 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3535 adev->mman.buffer_funcs = NULL; 3536 adev->mman.buffer_funcs_ring = NULL; 3537 adev->vm_manager.vm_pte_funcs = NULL; 3538 adev->vm_manager.vm_pte_num_scheds = 0; 3539 adev->gmc.gmc_funcs = NULL; 3540 adev->harvest_ip_mask = 0x0; 3541 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3542 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3543 3544 adev->smc_rreg = &amdgpu_invalid_rreg; 3545 adev->smc_wreg = &amdgpu_invalid_wreg; 3546 adev->pcie_rreg = &amdgpu_invalid_rreg; 3547 adev->pcie_wreg = &amdgpu_invalid_wreg; 3548 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3549 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3550 adev->pciep_rreg = &amdgpu_invalid_rreg; 3551 adev->pciep_wreg = &amdgpu_invalid_wreg; 3552 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3553 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3554 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3555 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3556 adev->didt_rreg = &amdgpu_invalid_rreg; 3557 adev->didt_wreg = &amdgpu_invalid_wreg; 3558 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3559 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3560 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3561 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3562 3563 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3564 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3565 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3566 3567 /* mutex initialization are all done here so we 3568 * can recall function without having locking issues 3569 */ 3570 mutex_init(&adev->firmware.mutex); 3571 mutex_init(&adev->pm.mutex); 3572 mutex_init(&adev->gfx.gpu_clock_mutex); 3573 mutex_init(&adev->srbm_mutex); 3574 mutex_init(&adev->gfx.pipe_reserve_mutex); 3575 mutex_init(&adev->gfx.gfx_off_mutex); 3576 mutex_init(&adev->gfx.partition_mutex); 3577 mutex_init(&adev->grbm_idx_mutex); 3578 mutex_init(&adev->mn_lock); 3579 mutex_init(&adev->virt.vf_errors.lock); 3580 hash_init(adev->mn_hash); 3581 mutex_init(&adev->psp.mutex); 3582 mutex_init(&adev->notifier_lock); 3583 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3584 mutex_init(&adev->benchmark_mutex); 3585 3586 amdgpu_device_init_apu_flags(adev); 3587 3588 r = amdgpu_device_check_arguments(adev); 3589 if (r) 3590 return r; 3591 3592 spin_lock_init(&adev->mmio_idx_lock); 3593 spin_lock_init(&adev->smc_idx_lock); 3594 spin_lock_init(&adev->pcie_idx_lock); 3595 spin_lock_init(&adev->uvd_ctx_idx_lock); 3596 spin_lock_init(&adev->didt_idx_lock); 3597 spin_lock_init(&adev->gc_cac_idx_lock); 3598 spin_lock_init(&adev->se_cac_idx_lock); 3599 spin_lock_init(&adev->audio_endpt_idx_lock); 3600 spin_lock_init(&adev->mm_stats.lock); 3601 3602 
INIT_LIST_HEAD(&adev->shadow_list); 3603 mutex_init(&adev->shadow_list_lock); 3604 3605 INIT_LIST_HEAD(&adev->reset_list); 3606 3607 INIT_LIST_HEAD(&adev->ras_list); 3608 3609 INIT_DELAYED_WORK(&adev->delayed_init_work, 3610 amdgpu_device_delayed_init_work_handler); 3611 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3612 amdgpu_device_delay_enable_gfx_off); 3613 3614 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3615 3616 adev->gfx.gfx_off_req_count = 1; 3617 adev->gfx.gfx_off_residency = 0; 3618 adev->gfx.gfx_off_entrycount = 0; 3619 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3620 3621 atomic_set(&adev->throttling_logging_enabled, 1); 3622 /* 3623 * If throttling continues, logging will be performed every minute 3624 * to avoid log flooding. "-1" is subtracted since the thermal 3625 * throttling interrupt comes every second. Thus, the total logging 3626 * interval is 59 seconds (the ratelimited printk interval) + 1 (waiting 3627 * for the throttling interrupt) = 60 seconds. 3628 */ 3629 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3630 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3631 3632 /* Registers mapping */ 3633 /* TODO: block userspace mapping of io register */ 3634 if (adev->asic_type >= CHIP_BONAIRE) { 3635 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3636 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3637 } else { 3638 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3639 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3640 } 3641 3642 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3643 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3644 3645 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3646 if (!adev->rmmio) 3647 return -ENOMEM; 3648 3649 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3650 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 3651 3652 /* 3653 * The reset domain needs to be present early, before the XGMI hive (if any) 3654 * is discovered and initialized, so that the reset sem and in_gpu reset flag 3655 * can be used early on during init and before calling RREG32.
3656 */ 3657 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3658 if (!adev->reset_domain) 3659 return -ENOMEM; 3660 3661 /* detect hw virtualization here */ 3662 amdgpu_detect_virtualization(adev); 3663 3664 amdgpu_device_get_pcie_info(adev); 3665 3666 r = amdgpu_device_get_job_timeout_settings(adev); 3667 if (r) { 3668 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3669 return r; 3670 } 3671 3672 /* early init functions */ 3673 r = amdgpu_device_ip_early_init(adev); 3674 if (r) 3675 return r; 3676 3677 amdgpu_device_set_mcbp(adev); 3678 3679 /* Get rid of things like offb */ 3680 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3681 if (r) 3682 return r; 3683 3684 /* Enable TMZ based on IP_VERSION */ 3685 amdgpu_gmc_tmz_set(adev); 3686 3687 amdgpu_gmc_noretry_set(adev); 3688 /* Need to get xgmi info early to decide the reset behavior*/ 3689 if (adev->gmc.xgmi.supported) { 3690 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3691 if (r) 3692 return r; 3693 } 3694 3695 /* enable PCIE atomic ops */ 3696 if (amdgpu_sriov_vf(adev)) { 3697 if (adev->virt.fw_reserve.p_pf2vf) 3698 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3699 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3700 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3701 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 3702 * internal path natively support atomics, set have_atomics_support to true. 3703 */ 3704 } else if ((adev->flags & AMD_IS_APU) && 3705 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) { 3706 adev->have_atomics_support = true; 3707 } else { 3708 adev->have_atomics_support = 3709 !pci_enable_atomic_ops_to_root(adev->pdev, 3710 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3711 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3712 } 3713 3714 if (!adev->have_atomics_support) 3715 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3716 3717 /* doorbell bar mapping and doorbell index init*/ 3718 amdgpu_doorbell_init(adev); 3719 3720 if (amdgpu_emu_mode == 1) { 3721 /* post the asic on emulation mode */ 3722 emu_soc_asic_init(adev); 3723 goto fence_driver_init; 3724 } 3725 3726 amdgpu_reset_init(adev); 3727 3728 /* detect if we are with an SRIOV vbios */ 3729 if (adev->bios) 3730 amdgpu_device_detect_sriov_bios(adev); 3731 3732 /* check if we need to reset the asic 3733 * E.g., driver was not cleanly unloaded previously, etc. 3734 */ 3735 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3736 if (adev->gmc.xgmi.num_physical_nodes) { 3737 dev_info(adev->dev, "Pending hive reset.\n"); 3738 adev->gmc.xgmi.pending_reset = true; 3739 /* Only need to init necessary block for SMU to handle the reset */ 3740 for (i = 0; i < adev->num_ip_blocks; i++) { 3741 if (!adev->ip_blocks[i].status.valid) 3742 continue; 3743 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3744 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3745 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3746 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3747 DRM_DEBUG("IP %s disabled for hw_init.\n", 3748 adev->ip_blocks[i].version->funcs->name); 3749 adev->ip_blocks[i].status.hw = true; 3750 } 3751 } 3752 } else { 3753 tmp = amdgpu_reset_method; 3754 /* It should do a default reset when loading or reloading the driver, 3755 * regardless of the module parameter reset_method. 
3756 */ 3757 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3758 r = amdgpu_asic_reset(adev); 3759 amdgpu_reset_method = tmp; 3760 if (r) { 3761 dev_err(adev->dev, "asic reset on init failed\n"); 3762 goto failed; 3763 } 3764 } 3765 } 3766 3767 /* Post card if necessary */ 3768 if (amdgpu_device_need_post(adev)) { 3769 if (!adev->bios) { 3770 dev_err(adev->dev, "no vBIOS found\n"); 3771 r = -EINVAL; 3772 goto failed; 3773 } 3774 DRM_INFO("GPU posting now...\n"); 3775 r = amdgpu_device_asic_init(adev); 3776 if (r) { 3777 dev_err(adev->dev, "gpu post error!\n"); 3778 goto failed; 3779 } 3780 } 3781 3782 if (adev->bios) { 3783 if (adev->is_atom_fw) { 3784 /* Initialize clocks */ 3785 r = amdgpu_atomfirmware_get_clock_info(adev); 3786 if (r) { 3787 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3788 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3789 goto failed; 3790 } 3791 } else { 3792 /* Initialize clocks */ 3793 r = amdgpu_atombios_get_clock_info(adev); 3794 if (r) { 3795 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3796 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3797 goto failed; 3798 } 3799 /* init i2c buses */ 3800 if (!amdgpu_device_has_dc_support(adev)) 3801 amdgpu_atombios_i2c_init(adev); 3802 } 3803 } 3804 3805 fence_driver_init: 3806 /* Fence driver */ 3807 r = amdgpu_fence_driver_sw_init(adev); 3808 if (r) { 3809 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3810 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3811 goto failed; 3812 } 3813 3814 /* init the mode config */ 3815 drm_mode_config_init(adev_to_drm(adev)); 3816 3817 r = amdgpu_device_ip_init(adev); 3818 if (r) { 3819 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3820 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3821 goto release_ras_con; 3822 } 3823 3824 amdgpu_fence_driver_hw_init(adev); 3825 3826 dev_info(adev->dev, 3827 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3828 adev->gfx.config.max_shader_engines, 3829 adev->gfx.config.max_sh_per_se, 3830 adev->gfx.config.max_cu_per_sh, 3831 adev->gfx.cu_info.number); 3832 3833 adev->accel_working = true; 3834 3835 amdgpu_vm_check_compute_bug(adev); 3836 3837 /* Initialize the buffer migration limit. */ 3838 if (amdgpu_moverate >= 0) 3839 max_MBps = amdgpu_moverate; 3840 else 3841 max_MBps = 8; /* Allow 8 MB/s. */ 3842 /* Get a log2 for easy divisions. */ 3843 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3844 3845 r = amdgpu_atombios_sysfs_init(adev); 3846 if (r) 3847 drm_err(&adev->ddev, 3848 "registering atombios sysfs failed (%d).\n", r); 3849 3850 r = amdgpu_pm_sysfs_init(adev); 3851 if (r) 3852 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 3853 3854 r = amdgpu_ucode_sysfs_init(adev); 3855 if (r) { 3856 adev->ucode_sysfs_en = false; 3857 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3858 } else 3859 adev->ucode_sysfs_en = true; 3860 3861 /* 3862 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3863 * Otherwise the mgpu fan boost feature will be skipped due to the 3864 * gpu instance is counted less. 3865 */ 3866 amdgpu_register_gpu_instance(adev); 3867 3868 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3869 * explicit gating rather than handling it automatically. 
3870 */ 3871 if (!adev->gmc.xgmi.pending_reset) { 3872 r = amdgpu_device_ip_late_init(adev); 3873 if (r) { 3874 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3875 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3876 goto release_ras_con; 3877 } 3878 /* must succeed. */ 3879 amdgpu_ras_resume(adev); 3880 queue_delayed_work(system_wq, &adev->delayed_init_work, 3881 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3882 } 3883 3884 if (amdgpu_sriov_vf(adev)) { 3885 amdgpu_virt_release_full_gpu(adev, true); 3886 flush_delayed_work(&adev->delayed_init_work); 3887 } 3888 3889 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3890 if (r) 3891 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3892 3893 amdgpu_fru_sysfs_init(adev); 3894 3895 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3896 r = amdgpu_pmu_init(adev); 3897 if (r) 3898 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3899 3900 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3901 if (amdgpu_device_cache_pci_state(adev->pdev)) 3902 pci_restore_state(pdev); 3903 3904 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3905 /* this will fail for cards that aren't VGA class devices, just 3906 * ignore it 3907 */ 3908 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3909 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3910 3911 px = amdgpu_device_supports_px(ddev); 3912 3913 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 3914 apple_gmux_detect(NULL, NULL))) 3915 vga_switcheroo_register_client(adev->pdev, 3916 &amdgpu_switcheroo_ops, px); 3917 3918 if (px) 3919 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3920 3921 if (adev->gmc.xgmi.pending_reset) 3922 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3923 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3924 3925 amdgpu_device_check_iommu_direct_map(adev); 3926 3927 return 0; 3928 3929 release_ras_con: 3930 if (amdgpu_sriov_vf(adev)) 3931 amdgpu_virt_release_full_gpu(adev, true); 3932 3933 /* failed in exclusive mode due to timeout */ 3934 if (amdgpu_sriov_vf(adev) && 3935 !amdgpu_sriov_runtime(adev) && 3936 amdgpu_virt_mmio_blocked(adev) && 3937 !amdgpu_virt_wait_reset(adev)) { 3938 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3939 /* Don't send request since VF is inactive. */ 3940 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3941 adev->virt.ops = NULL; 3942 r = -EAGAIN; 3943 } 3944 amdgpu_release_ras_context(adev); 3945 3946 failed: 3947 amdgpu_vf_error_trans_all(adev); 3948 3949 return r; 3950 } 3951 3952 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3953 { 3954 3955 /* Clear all CPU mappings pointing to this device */ 3956 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3957 3958 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3959 amdgpu_doorbell_fini(adev); 3960 3961 iounmap(adev->rmmio); 3962 adev->rmmio = NULL; 3963 if (adev->mman.aper_base_kaddr) 3964 iounmap(adev->mman.aper_base_kaddr); 3965 adev->mman.aper_base_kaddr = NULL; 3966 3967 /* Memory manager related */ 3968 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 3969 arch_phys_wc_del(adev->gmc.vram_mtrr); 3970 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3971 } 3972 } 3973 3974 /** 3975 * amdgpu_device_fini_hw - tear down the driver 3976 * 3977 * @adev: amdgpu_device pointer 3978 * 3979 * Tear down the driver info (all asics). 3980 * Called at driver shutdown. 
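 * Disables interrupts, shuts down the displays, tears down the fence
 * driver hardware state, and unmaps the MMIO registers if the DRM
 * device has already been unplugged.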
3981 */ 3982 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3983 { 3984 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3985 flush_delayed_work(&adev->delayed_init_work); 3986 adev->shutdown = true; 3987 3988 /* make sure IB test finished before entering exclusive mode 3989 * to avoid preemption on IB test 3990 */ 3991 if (amdgpu_sriov_vf(adev)) { 3992 amdgpu_virt_request_full_gpu(adev, false); 3993 amdgpu_virt_fini_data_exchange(adev); 3994 } 3995 3996 /* disable all interrupts */ 3997 amdgpu_irq_disable_all(adev); 3998 if (adev->mode_info.mode_config_initialized) { 3999 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4000 drm_helper_force_disable_all(adev_to_drm(adev)); 4001 else 4002 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4003 } 4004 amdgpu_fence_driver_hw_fini(adev); 4005 4006 if (adev->mman.initialized) 4007 drain_workqueue(adev->mman.bdev.wq); 4008 4009 if (adev->pm.sysfs_initialized) 4010 amdgpu_pm_sysfs_fini(adev); 4011 if (adev->ucode_sysfs_en) 4012 amdgpu_ucode_sysfs_fini(adev); 4013 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4014 amdgpu_fru_sysfs_fini(adev); 4015 4016 /* disable ras feature must before hw fini */ 4017 amdgpu_ras_pre_fini(adev); 4018 4019 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4020 4021 amdgpu_device_ip_fini_early(adev); 4022 4023 amdgpu_irq_fini_hw(adev); 4024 4025 if (adev->mman.initialized) 4026 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4027 4028 amdgpu_gart_dummy_page_fini(adev); 4029 4030 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4031 amdgpu_device_unmap_mmio(adev); 4032 4033 } 4034 4035 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4036 { 4037 int idx; 4038 bool px; 4039 4040 amdgpu_fence_driver_sw_fini(adev); 4041 amdgpu_device_ip_fini(adev); 4042 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4043 adev->accel_working = false; 4044 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4045 4046 amdgpu_reset_fini(adev); 4047 4048 /* free i2c buses */ 4049 if (!amdgpu_device_has_dc_support(adev)) 4050 amdgpu_i2c_fini(adev); 4051 4052 if (amdgpu_emu_mode != 1) 4053 amdgpu_atombios_fini(adev); 4054 4055 kfree(adev->bios); 4056 adev->bios = NULL; 4057 4058 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4059 4060 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4061 apple_gmux_detect(NULL, NULL))) 4062 vga_switcheroo_unregister_client(adev->pdev); 4063 4064 if (px) 4065 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4066 4067 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4068 vga_client_unregister(adev->pdev); 4069 4070 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4071 4072 iounmap(adev->rmmio); 4073 adev->rmmio = NULL; 4074 amdgpu_doorbell_fini(adev); 4075 drm_dev_exit(idx); 4076 } 4077 4078 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4079 amdgpu_pmu_fini(adev); 4080 if (adev->mman.discovery_bin) 4081 amdgpu_discovery_fini(adev); 4082 4083 amdgpu_reset_put_reset_domain(adev->reset_domain); 4084 adev->reset_domain = NULL; 4085 4086 kfree(adev->pci_state); 4087 4088 } 4089 4090 /** 4091 * amdgpu_device_evict_resources - evict device resources 4092 * @adev: amdgpu device object 4093 * 4094 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4095 * of the vram memory type. Mainly used for evicting device resources 4096 * at suspend time. 
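 * Returns 0 on success (or when eviction is skipped for an APU in S3/s2idle),
 * otherwise the error code returned by amdgpu_ttm_evict_resources().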
4097 * 4098 */ 4099 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4100 { 4101 int ret; 4102 4103 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4104 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4105 return 0; 4106 4107 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4108 if (ret) 4109 DRM_WARN("evicting device resources failed\n"); 4110 return ret; 4111 } 4112 4113 /* 4114 * Suspend & resume. 4115 */ 4116 /** 4117 * amdgpu_device_suspend - initiate device suspend 4118 * 4119 * @dev: drm dev pointer 4120 * @fbcon : notify the fbdev of suspend 4121 * 4122 * Puts the hw in the suspend state (all asics). 4123 * Returns 0 for success or an error on failure. 4124 * Called at driver suspend. 4125 */ 4126 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4127 { 4128 struct amdgpu_device *adev = drm_to_adev(dev); 4129 int r = 0; 4130 4131 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4132 return 0; 4133 4134 adev->in_suspend = true; 4135 4136 /* Evict the majority of BOs before grabbing the full access */ 4137 r = amdgpu_device_evict_resources(adev); 4138 if (r) 4139 return r; 4140 4141 if (amdgpu_sriov_vf(adev)) { 4142 amdgpu_virt_fini_data_exchange(adev); 4143 r = amdgpu_virt_request_full_gpu(adev, false); 4144 if (r) 4145 return r; 4146 } 4147 4148 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4149 DRM_WARN("smart shift update failed\n"); 4150 4151 if (fbcon) 4152 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4153 4154 cancel_delayed_work_sync(&adev->delayed_init_work); 4155 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4156 4157 amdgpu_ras_suspend(adev); 4158 4159 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4160 4161 amdgpu_device_ip_suspend_phase1(adev); 4162 4163 if (!adev->in_s0ix) 4164 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4165 4166 r = amdgpu_device_evict_resources(adev); 4167 if (r) 4168 return r; 4169 4170 amdgpu_fence_driver_hw_fini(adev); 4171 4172 amdgpu_device_ip_suspend_phase2(adev); 4173 4174 if (amdgpu_sriov_vf(adev)) 4175 amdgpu_virt_release_full_gpu(adev, false); 4176 4177 return 0; 4178 } 4179 4180 /** 4181 * amdgpu_device_resume - initiate device resume 4182 * 4183 * @dev: drm dev pointer 4184 * @fbcon : notify the fbdev of resume 4185 * 4186 * Bring the hw back to operating state (all asics). 4187 * Returns 0 for success or an error on failure. 4188 * Called at driver resume. 
4189 */ 4190 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4191 { 4192 struct amdgpu_device *adev = drm_to_adev(dev); 4193 int r = 0; 4194 4195 if (amdgpu_sriov_vf(adev)) { 4196 r = amdgpu_virt_request_full_gpu(adev, true); 4197 if (r) 4198 return r; 4199 } 4200 4201 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4202 return 0; 4203 4204 if (adev->in_s0ix) 4205 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4206 4207 /* post card */ 4208 if (amdgpu_device_need_post(adev)) { 4209 r = amdgpu_device_asic_init(adev); 4210 if (r) 4211 dev_err(adev->dev, "amdgpu asic init failed\n"); 4212 } 4213 4214 r = amdgpu_device_ip_resume(adev); 4215 4216 if (r) { 4217 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4218 goto exit; 4219 } 4220 amdgpu_fence_driver_hw_init(adev); 4221 4222 r = amdgpu_device_ip_late_init(adev); 4223 if (r) 4224 goto exit; 4225 4226 queue_delayed_work(system_wq, &adev->delayed_init_work, 4227 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4228 4229 if (!adev->in_s0ix) { 4230 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4231 if (r) 4232 goto exit; 4233 } 4234 4235 exit: 4236 if (amdgpu_sriov_vf(adev)) { 4237 amdgpu_virt_init_data_exchange(adev); 4238 amdgpu_virt_release_full_gpu(adev, true); 4239 } 4240 4241 if (r) 4242 return r; 4243 4244 /* Make sure IB tests flushed */ 4245 flush_delayed_work(&adev->delayed_init_work); 4246 4247 if (fbcon) 4248 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4249 4250 amdgpu_ras_resume(adev); 4251 4252 if (adev->mode_info.num_crtc) { 4253 /* 4254 * Most of the connector probing functions try to acquire runtime pm 4255 * refs to ensure that the GPU is powered on when connector polling is 4256 * performed. Since we're calling this from a runtime PM callback, 4257 * trying to acquire rpm refs will cause us to deadlock. 4258 * 4259 * Since we're guaranteed to be holding the rpm lock, it's safe to 4260 * temporarily disable the rpm helpers so this doesn't deadlock us. 4261 */ 4262 #ifdef CONFIG_PM 4263 dev->dev->power.disable_depth++; 4264 #endif 4265 if (!adev->dc_enabled) 4266 drm_helper_hpd_irq_event(dev); 4267 else 4268 drm_kms_helper_hotplug_event(dev); 4269 #ifdef CONFIG_PM 4270 dev->dev->power.disable_depth--; 4271 #endif 4272 } 4273 adev->in_suspend = false; 4274 4275 if (adev->enable_mes) 4276 amdgpu_mes_self_test(adev); 4277 4278 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4279 DRM_WARN("smart shift update failed\n"); 4280 4281 return 0; 4282 } 4283 4284 /** 4285 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4286 * 4287 * @adev: amdgpu_device pointer 4288 * 4289 * The list of all the hardware IPs that make up the asic is walked and 4290 * the check_soft_reset callbacks are run. check_soft_reset determines 4291 * if the asic is still hung or not. 4292 * Returns true if any of the IPs are still in a hung state, false if not. 
4293 */ 4294 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4295 { 4296 int i; 4297 bool asic_hang = false; 4298 4299 if (amdgpu_sriov_vf(adev)) 4300 return true; 4301 4302 if (amdgpu_asic_need_full_reset(adev)) 4303 return true; 4304 4305 for (i = 0; i < adev->num_ip_blocks; i++) { 4306 if (!adev->ip_blocks[i].status.valid) 4307 continue; 4308 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4309 adev->ip_blocks[i].status.hang = 4310 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4311 if (adev->ip_blocks[i].status.hang) { 4312 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4313 asic_hang = true; 4314 } 4315 } 4316 return asic_hang; 4317 } 4318 4319 /** 4320 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4321 * 4322 * @adev: amdgpu_device pointer 4323 * 4324 * The list of all the hardware IPs that make up the asic is walked and the 4325 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4326 * handles any IP specific hardware or software state changes that are 4327 * necessary for a soft reset to succeed. 4328 * Returns 0 on success, negative error code on failure. 4329 */ 4330 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4331 { 4332 int i, r = 0; 4333 4334 for (i = 0; i < adev->num_ip_blocks; i++) { 4335 if (!adev->ip_blocks[i].status.valid) 4336 continue; 4337 if (adev->ip_blocks[i].status.hang && 4338 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4339 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4340 if (r) 4341 return r; 4342 } 4343 } 4344 4345 return 0; 4346 } 4347 4348 /** 4349 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4350 * 4351 * @adev: amdgpu_device pointer 4352 * 4353 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4354 * reset is necessary to recover. 4355 * Returns true if a full asic reset is required, false if not. 4356 */ 4357 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4358 { 4359 int i; 4360 4361 if (amdgpu_asic_need_full_reset(adev)) 4362 return true; 4363 4364 for (i = 0; i < adev->num_ip_blocks; i++) { 4365 if (!adev->ip_blocks[i].status.valid) 4366 continue; 4367 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4368 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4369 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4370 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4371 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4372 if (adev->ip_blocks[i].status.hang) { 4373 dev_info(adev->dev, "Some block need full reset!\n"); 4374 return true; 4375 } 4376 } 4377 } 4378 return false; 4379 } 4380 4381 /** 4382 * amdgpu_device_ip_soft_reset - do a soft reset 4383 * 4384 * @adev: amdgpu_device pointer 4385 * 4386 * The list of all the hardware IPs that make up the asic is walked and the 4387 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4388 * IP specific hardware or software state changes that are necessary to soft 4389 * reset the IP. 4390 * Returns 0 on success, negative error code on failure. 
4391 */ 4392 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4393 { 4394 int i, r = 0; 4395 4396 for (i = 0; i < adev->num_ip_blocks; i++) { 4397 if (!adev->ip_blocks[i].status.valid) 4398 continue; 4399 if (adev->ip_blocks[i].status.hang && 4400 adev->ip_blocks[i].version->funcs->soft_reset) { 4401 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4402 if (r) 4403 return r; 4404 } 4405 } 4406 4407 return 0; 4408 } 4409 4410 /** 4411 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4412 * 4413 * @adev: amdgpu_device pointer 4414 * 4415 * The list of all the hardware IPs that make up the asic is walked and the 4416 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4417 * handles any IP specific hardware or software state changes that are 4418 * necessary after the IP has been soft reset. 4419 * Returns 0 on success, negative error code on failure. 4420 */ 4421 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4422 { 4423 int i, r = 0; 4424 4425 for (i = 0; i < adev->num_ip_blocks; i++) { 4426 if (!adev->ip_blocks[i].status.valid) 4427 continue; 4428 if (adev->ip_blocks[i].status.hang && 4429 adev->ip_blocks[i].version->funcs->post_soft_reset) 4430 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4431 if (r) 4432 return r; 4433 } 4434 4435 return 0; 4436 } 4437 4438 /** 4439 * amdgpu_device_recover_vram - Recover some VRAM contents 4440 * 4441 * @adev: amdgpu_device pointer 4442 * 4443 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4444 * restore things like GPUVM page tables after a GPU reset where 4445 * the contents of VRAM might be lost. 4446 * 4447 * Returns: 4448 * 0 on success, negative error code on failure. 4449 */ 4450 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4451 { 4452 struct dma_fence *fence = NULL, *next = NULL; 4453 struct amdgpu_bo *shadow; 4454 struct amdgpu_bo_vm *vmbo; 4455 long r = 1, tmo; 4456 4457 if (amdgpu_sriov_runtime(adev)) 4458 tmo = msecs_to_jiffies(8000); 4459 else 4460 tmo = msecs_to_jiffies(100); 4461 4462 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4463 mutex_lock(&adev->shadow_list_lock); 4464 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4465 /* If vm is compute context or adev is APU, shadow will be NULL */ 4466 if (!vmbo->shadow) 4467 continue; 4468 shadow = vmbo->shadow; 4469 4470 /* No need to recover an evicted BO */ 4471 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4472 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4473 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4474 continue; 4475 4476 r = amdgpu_bo_restore_shadow(shadow, &next); 4477 if (r) 4478 break; 4479 4480 if (fence) { 4481 tmo = dma_fence_wait_timeout(fence, false, tmo); 4482 dma_fence_put(fence); 4483 fence = next; 4484 if (tmo == 0) { 4485 r = -ETIMEDOUT; 4486 break; 4487 } else if (tmo < 0) { 4488 r = tmo; 4489 break; 4490 } 4491 } else { 4492 fence = next; 4493 } 4494 } 4495 mutex_unlock(&adev->shadow_list_lock); 4496 4497 if (fence) 4498 tmo = dma_fence_wait_timeout(fence, false, tmo); 4499 dma_fence_put(fence); 4500 4501 if (r < 0 || tmo <= 0) { 4502 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4503 return -EIO; 4504 } 4505 4506 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4507 return 0; 4508 } 4509 4510 4511 /** 4512 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4513 * 4514 * @adev: amdgpu_device pointer 4515 * 
@from_hypervisor: request from hypervisor 4516 * 4517 * do VF FLR and reinitialize Asic 4518 * return 0 means succeeded otherwise failed 4519 */ 4520 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4521 bool from_hypervisor) 4522 { 4523 int r; 4524 struct amdgpu_hive_info *hive = NULL; 4525 int retry_limit = 0; 4526 4527 retry: 4528 amdgpu_amdkfd_pre_reset(adev); 4529 4530 if (from_hypervisor) 4531 r = amdgpu_virt_request_full_gpu(adev, true); 4532 else 4533 r = amdgpu_virt_reset_gpu(adev); 4534 if (r) 4535 return r; 4536 amdgpu_irq_gpu_reset_resume_helper(adev); 4537 4538 /* some sw clean up VF needs to do before recover */ 4539 amdgpu_virt_post_reset(adev); 4540 4541 /* Resume IP prior to SMC */ 4542 r = amdgpu_device_ip_reinit_early_sriov(adev); 4543 if (r) 4544 goto error; 4545 4546 amdgpu_virt_init_data_exchange(adev); 4547 4548 r = amdgpu_device_fw_loading(adev); 4549 if (r) 4550 return r; 4551 4552 /* now we are okay to resume SMC/CP/SDMA */ 4553 r = amdgpu_device_ip_reinit_late_sriov(adev); 4554 if (r) 4555 goto error; 4556 4557 hive = amdgpu_get_xgmi_hive(adev); 4558 /* Update PSP FW topology after reset */ 4559 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4560 r = amdgpu_xgmi_update_topology(hive, adev); 4561 4562 if (hive) 4563 amdgpu_put_xgmi_hive(hive); 4564 4565 if (!r) { 4566 r = amdgpu_ib_ring_tests(adev); 4567 4568 amdgpu_amdkfd_post_reset(adev); 4569 } 4570 4571 error: 4572 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4573 amdgpu_inc_vram_lost(adev); 4574 r = amdgpu_device_recover_vram(adev); 4575 } 4576 amdgpu_virt_release_full_gpu(adev, true); 4577 4578 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4579 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4580 retry_limit++; 4581 goto retry; 4582 } else 4583 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4584 } 4585 4586 return r; 4587 } 4588 4589 /** 4590 * amdgpu_device_has_job_running - check if there is any job in mirror list 4591 * 4592 * @adev: amdgpu_device pointer 4593 * 4594 * check if there is any job in mirror list 4595 */ 4596 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4597 { 4598 int i; 4599 struct drm_sched_job *job; 4600 4601 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4602 struct amdgpu_ring *ring = adev->rings[i]; 4603 4604 if (!ring || !drm_sched_wqueue_ready(&ring->sched)) 4605 continue; 4606 4607 spin_lock(&ring->sched.job_list_lock); 4608 job = list_first_entry_or_null(&ring->sched.pending_list, 4609 struct drm_sched_job, list); 4610 spin_unlock(&ring->sched.job_list_lock); 4611 if (job) 4612 return true; 4613 } 4614 return false; 4615 } 4616 4617 /** 4618 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4619 * 4620 * @adev: amdgpu_device pointer 4621 * 4622 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4623 * a hung GPU. 
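 * Returns true if recovery should be attempted, false if it is disabled via
 * the amdgpu_gpu_recovery module parameter or not supported on this ASIC.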
4624 */ 4625 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4626 { 4627 4628 if (amdgpu_gpu_recovery == 0) 4629 goto disabled; 4630 4631 /* Skip soft reset check in fatal error mode */ 4632 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4633 return true; 4634 4635 if (amdgpu_sriov_vf(adev)) 4636 return true; 4637 4638 if (amdgpu_gpu_recovery == -1) { 4639 switch (adev->asic_type) { 4640 #ifdef CONFIG_DRM_AMDGPU_SI 4641 case CHIP_VERDE: 4642 case CHIP_TAHITI: 4643 case CHIP_PITCAIRN: 4644 case CHIP_OLAND: 4645 case CHIP_HAINAN: 4646 #endif 4647 #ifdef CONFIG_DRM_AMDGPU_CIK 4648 case CHIP_KAVERI: 4649 case CHIP_KABINI: 4650 case CHIP_MULLINS: 4651 #endif 4652 case CHIP_CARRIZO: 4653 case CHIP_STONEY: 4654 case CHIP_CYAN_SKILLFISH: 4655 goto disabled; 4656 default: 4657 break; 4658 } 4659 } 4660 4661 return true; 4662 4663 disabled: 4664 dev_info(adev->dev, "GPU recovery disabled.\n"); 4665 return false; 4666 } 4667 4668 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4669 { 4670 u32 i; 4671 int ret = 0; 4672 4673 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4674 4675 dev_info(adev->dev, "GPU mode1 reset\n"); 4676 4677 /* disable BM */ 4678 pci_clear_master(adev->pdev); 4679 4680 amdgpu_device_cache_pci_state(adev->pdev); 4681 4682 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4683 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4684 ret = amdgpu_dpm_mode1_reset(adev); 4685 } else { 4686 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4687 ret = psp_gpu_reset(adev); 4688 } 4689 4690 if (ret) 4691 goto mode1_reset_failed; 4692 4693 amdgpu_device_load_pci_state(adev->pdev); 4694 ret = amdgpu_psp_wait_for_bootloader(adev); 4695 if (ret) 4696 goto mode1_reset_failed; 4697 4698 /* wait for asic to come out of reset */ 4699 for (i = 0; i < adev->usec_timeout; i++) { 4700 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4701 4702 if (memsize != 0xffffffff) 4703 break; 4704 udelay(1); 4705 } 4706 4707 if (i >= adev->usec_timeout) { 4708 ret = -ETIMEDOUT; 4709 goto mode1_reset_failed; 4710 } 4711 4712 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4713 4714 return 0; 4715 4716 mode1_reset_failed: 4717 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4718 return ret; 4719 } 4720 4721 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4722 struct amdgpu_reset_context *reset_context) 4723 { 4724 int i, r = 0; 4725 struct amdgpu_job *job = NULL; 4726 bool need_full_reset = 4727 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4728 4729 if (reset_context->reset_req_dev == adev) 4730 job = reset_context->job; 4731 4732 if (amdgpu_sriov_vf(adev)) { 4733 /* stop the data exchange thread */ 4734 amdgpu_virt_fini_data_exchange(adev); 4735 } 4736 4737 amdgpu_fence_driver_isr_toggle(adev, true); 4738 4739 /* block all schedulers and reset given job's ring */ 4740 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4741 struct amdgpu_ring *ring = adev->rings[i]; 4742 4743 if (!ring || !drm_sched_wqueue_ready(&ring->sched)) 4744 continue; 4745 4746 /* Clear job fence from fence drv to avoid force_completion 4747 * leave NULL and vm flush fence in fence drv 4748 */ 4749 amdgpu_fence_driver_clear_job_fences(ring); 4750 4751 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4752 amdgpu_fence_driver_force_completion(ring); 4753 } 4754 4755 amdgpu_fence_driver_isr_toggle(adev, false); 4756 4757 if (job && job->vm) 4758 drm_sched_increase_karma(&job->base); 4759 4760 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4761 /* If 
reset handler not implemented, continue; otherwise return */ 4762 if (r == -EOPNOTSUPP) 4763 r = 0; 4764 else 4765 return r; 4766 4767 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4768 if (!amdgpu_sriov_vf(adev)) { 4769 4770 if (!need_full_reset) 4771 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4772 4773 if (!need_full_reset && amdgpu_gpu_recovery && 4774 amdgpu_device_ip_check_soft_reset(adev)) { 4775 amdgpu_device_ip_pre_soft_reset(adev); 4776 r = amdgpu_device_ip_soft_reset(adev); 4777 amdgpu_device_ip_post_soft_reset(adev); 4778 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4779 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4780 need_full_reset = true; 4781 } 4782 } 4783 4784 if (need_full_reset) 4785 r = amdgpu_device_ip_suspend(adev); 4786 if (need_full_reset) 4787 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4788 else 4789 clear_bit(AMDGPU_NEED_FULL_RESET, 4790 &reset_context->flags); 4791 } 4792 4793 return r; 4794 } 4795 4796 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4797 { 4798 int i; 4799 4800 lockdep_assert_held(&adev->reset_domain->sem); 4801 4802 for (i = 0; i < adev->num_regs; i++) { 4803 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4804 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4805 adev->reset_dump_reg_value[i]); 4806 } 4807 4808 return 0; 4809 } 4810 4811 #ifdef CONFIG_DEV_COREDUMP 4812 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4813 size_t count, void *data, size_t datalen) 4814 { 4815 struct drm_printer p; 4816 struct amdgpu_device *adev = data; 4817 struct drm_print_iterator iter; 4818 int i; 4819 4820 iter.data = buffer; 4821 iter.offset = 0; 4822 iter.start = offset; 4823 iter.remain = count; 4824 4825 p = drm_coredump_printer(&iter); 4826 4827 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4828 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4829 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4830 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4831 if (adev->reset_task_info.pid) 4832 drm_printf(&p, "process_name: %s PID: %d\n", 4833 adev->reset_task_info.process_name, 4834 adev->reset_task_info.pid); 4835 4836 if (adev->reset_vram_lost) 4837 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4838 if (adev->num_regs) { 4839 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4840 4841 for (i = 0; i < adev->num_regs; i++) 4842 drm_printf(&p, "0x%08x: 0x%08x\n", 4843 adev->reset_dump_reg_list[i], 4844 adev->reset_dump_reg_value[i]); 4845 } 4846 4847 return count - iter.remain; 4848 } 4849 4850 static void amdgpu_devcoredump_free(void *data) 4851 { 4852 } 4853 4854 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4855 { 4856 struct drm_device *dev = adev_to_drm(adev); 4857 4858 ktime_get_ts64(&adev->reset_time); 4859 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT, 4860 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 4861 } 4862 #endif 4863 4864 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4865 struct amdgpu_reset_context *reset_context) 4866 { 4867 struct amdgpu_device *tmp_adev = NULL; 4868 bool need_full_reset, skip_hw_reset, vram_lost = false; 4869 int r = 0; 4870 bool gpu_reset_for_dev_remove = 0; 4871 4872 /* Try reset handler method first */ 4873 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4874 reset_list); 4875 amdgpu_reset_reg_dumps(tmp_adev); 4876 4877 
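/* Make the device list available to the reset handlers via the reset context */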
reset_context->reset_device_list = device_list_handle; 4878 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4879 /* If reset handler not implemented, continue; otherwise return */ 4880 if (r == -EOPNOTSUPP) 4881 r = 0; 4882 else 4883 return r; 4884 4885 /* Reset handler not implemented, use the default method */ 4886 need_full_reset = 4887 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4888 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4889 4890 gpu_reset_for_dev_remove = 4891 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 4892 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4893 4894 /* 4895 * ASIC reset has to be done on all XGMI hive nodes ASAP 4896 * to allow proper links negotiation in FW (within 1 sec) 4897 */ 4898 if (!skip_hw_reset && need_full_reset) { 4899 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4900 /* For XGMI run all resets in parallel to speed up the process */ 4901 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4902 tmp_adev->gmc.xgmi.pending_reset = false; 4903 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4904 r = -EALREADY; 4905 } else 4906 r = amdgpu_asic_reset(tmp_adev); 4907 4908 if (r) { 4909 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4910 r, adev_to_drm(tmp_adev)->unique); 4911 break; 4912 } 4913 } 4914 4915 /* For XGMI wait for all resets to complete before proceed */ 4916 if (!r) { 4917 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4918 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4919 flush_work(&tmp_adev->xgmi_reset_work); 4920 r = tmp_adev->asic_reset_res; 4921 if (r) 4922 break; 4923 } 4924 } 4925 } 4926 } 4927 4928 if (!r && amdgpu_ras_intr_triggered()) { 4929 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4930 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 4931 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4932 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4933 } 4934 4935 amdgpu_ras_intr_cleared(); 4936 } 4937 4938 /* Since the mode1 reset affects base ip blocks, the 4939 * phase1 ip blocks need to be resumed. Otherwise there 4940 * will be a BIOS signature error and the psp bootloader 4941 * can't load kdb on the next amdgpu install. 
4942 */ 4943 if (gpu_reset_for_dev_remove) { 4944 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 4945 amdgpu_device_ip_resume_phase1(tmp_adev); 4946 4947 goto end; 4948 } 4949 4950 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4951 if (need_full_reset) { 4952 /* post card */ 4953 r = amdgpu_device_asic_init(tmp_adev); 4954 if (r) { 4955 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4956 } else { 4957 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4958 4959 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4960 if (r) 4961 goto out; 4962 4963 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4964 #ifdef CONFIG_DEV_COREDUMP 4965 tmp_adev->reset_vram_lost = vram_lost; 4966 memset(&tmp_adev->reset_task_info, 0, 4967 sizeof(tmp_adev->reset_task_info)); 4968 if (reset_context->job && reset_context->job->vm) 4969 tmp_adev->reset_task_info = 4970 reset_context->job->vm->task_info; 4971 amdgpu_reset_capture_coredumpm(tmp_adev); 4972 #endif 4973 if (vram_lost) { 4974 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4975 amdgpu_inc_vram_lost(tmp_adev); 4976 } 4977 4978 r = amdgpu_device_fw_loading(tmp_adev); 4979 if (r) 4980 return r; 4981 4982 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4983 if (r) 4984 goto out; 4985 4986 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 4987 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 4988 4989 if (vram_lost) 4990 amdgpu_device_fill_reset_magic(tmp_adev); 4991 4992 /* 4993 * Add this ASIC back as tracked since the reset has 4994 * already completed successfully. 4995 */ 4996 amdgpu_register_gpu_instance(tmp_adev); 4997 4998 if (!reset_context->hive && 4999 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5000 amdgpu_xgmi_add_device(tmp_adev); 5001 5002 r = amdgpu_device_ip_late_init(tmp_adev); 5003 if (r) 5004 goto out; 5005 5006 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5007 5008 /* 5009 * The GPU enters a bad state once the number of faulty 5010 * pages detected by ECC reaches the threshold, and RAS 5011 * recovery is scheduled next. So add a check here to 5012 * break out of recovery if the bad page threshold has 5013 * indeed been exceeded, and remind the user to either 5014 * retire this GPU or set a bigger bad_page_threshold 5015 * value to work around this the next time the driver 5016 * is probed. 5017 */ 5018 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5019 /* must succeed.
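 * amdgpu_ras_resume() re-enables RAS features that may have been suspended earlier in the recovery path.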
*/ 5020 amdgpu_ras_resume(tmp_adev); 5021 } else { 5022 r = -EINVAL; 5023 goto out; 5024 } 5025 5026 /* Update PSP FW topology after reset */ 5027 if (reset_context->hive && 5028 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5029 r = amdgpu_xgmi_update_topology( 5030 reset_context->hive, tmp_adev); 5031 } 5032 } 5033 5034 out: 5035 if (!r) { 5036 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5037 r = amdgpu_ib_ring_tests(tmp_adev); 5038 if (r) { 5039 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5040 need_full_reset = true; 5041 r = -EAGAIN; 5042 goto end; 5043 } 5044 } 5045 5046 if (!r) 5047 r = amdgpu_device_recover_vram(tmp_adev); 5048 else 5049 tmp_adev->asic_reset_res = r; 5050 } 5051 5052 end: 5053 if (need_full_reset) 5054 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5055 else 5056 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5057 return r; 5058 } 5059 5060 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5061 { 5062 5063 switch (amdgpu_asic_reset_method(adev)) { 5064 case AMD_RESET_METHOD_MODE1: 5065 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5066 break; 5067 case AMD_RESET_METHOD_MODE2: 5068 adev->mp1_state = PP_MP1_STATE_RESET; 5069 break; 5070 default: 5071 adev->mp1_state = PP_MP1_STATE_NONE; 5072 break; 5073 } 5074 } 5075 5076 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5077 { 5078 amdgpu_vf_error_trans_all(adev); 5079 adev->mp1_state = PP_MP1_STATE_NONE; 5080 } 5081 5082 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5083 { 5084 struct pci_dev *p = NULL; 5085 5086 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5087 adev->pdev->bus->number, 1); 5088 if (p) { 5089 pm_runtime_enable(&(p->dev)); 5090 pm_runtime_resume(&(p->dev)); 5091 } 5092 5093 pci_dev_put(p); 5094 } 5095 5096 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5097 { 5098 enum amd_reset_method reset_method; 5099 struct pci_dev *p = NULL; 5100 u64 expires; 5101 5102 /* 5103 * For now, only BACO and mode1 reset are confirmed 5104 * to suffer the audio issue without proper suspended. 5105 */ 5106 reset_method = amdgpu_asic_reset_method(adev); 5107 if ((reset_method != AMD_RESET_METHOD_BACO) && 5108 (reset_method != AMD_RESET_METHOD_MODE1)) 5109 return -EINVAL; 5110 5111 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5112 adev->pdev->bus->number, 1); 5113 if (!p) 5114 return -ENODEV; 5115 5116 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5117 if (!expires) 5118 /* 5119 * If we cannot get the audio device autosuspend delay, 5120 * a fixed 4S interval will be used. Considering 3S is 5121 * the audio controller default autosuspend delay setting. 5122 * 4S used here is guaranteed to cover that. 5123 */ 5124 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5125 5126 while (!pm_runtime_status_suspended(&(p->dev))) { 5127 if (!pm_runtime_suspend(&(p->dev))) 5128 break; 5129 5130 if (expires < ktime_get_mono_fast_ns()) { 5131 dev_warn(adev->dev, "failed to suspend display audio\n"); 5132 pci_dev_put(p); 5133 /* TODO: abort the succeeding gpu reset? 
*/ 5134 return -ETIMEDOUT; 5135 } 5136 } 5137 5138 pm_runtime_disable(&(p->dev)); 5139 5140 pci_dev_put(p); 5141 return 0; 5142 } 5143 5144 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5145 { 5146 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5147 5148 #if defined(CONFIG_DEBUG_FS) 5149 if (!amdgpu_sriov_vf(adev)) 5150 cancel_work(&adev->reset_work); 5151 #endif 5152 5153 if (adev->kfd.dev) 5154 cancel_work(&adev->kfd.reset_work); 5155 5156 if (amdgpu_sriov_vf(adev)) 5157 cancel_work(&adev->virt.flr_work); 5158 5159 if (con && adev->ras_enabled) 5160 cancel_work(&con->recovery_work); 5161 5162 } 5163 5164 /** 5165 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5166 * 5167 * @adev: amdgpu_device pointer 5168 * @job: which job trigger hang 5169 * @reset_context: amdgpu reset context pointer 5170 * 5171 * Attempt to reset the GPU if it has hung (all asics). 5172 * Attempt to do soft-reset or full-reset and reinitialize Asic 5173 * Returns 0 for success or an error on failure. 5174 */ 5175 5176 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5177 struct amdgpu_job *job, 5178 struct amdgpu_reset_context *reset_context) 5179 { 5180 struct list_head device_list, *device_list_handle = NULL; 5181 bool job_signaled = false; 5182 struct amdgpu_hive_info *hive = NULL; 5183 struct amdgpu_device *tmp_adev = NULL; 5184 int i, r = 0; 5185 bool need_emergency_restart = false; 5186 bool audio_suspended = false; 5187 bool gpu_reset_for_dev_remove = false; 5188 5189 gpu_reset_for_dev_remove = 5190 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5191 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5192 5193 /* 5194 * Special case: RAS triggered and full reset isn't supported 5195 */ 5196 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5197 5198 /* 5199 * Flush RAM to disk so that after reboot 5200 * the user can read log and see why the system rebooted. 5201 */ 5202 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5203 DRM_WARN("Emergency reboot."); 5204 5205 ksys_sync_helper(); 5206 emergency_restart(); 5207 } 5208 5209 dev_info(adev->dev, "GPU %s begin!\n", 5210 need_emergency_restart ? "jobs stop":"reset"); 5211 5212 if (!amdgpu_sriov_vf(adev)) 5213 hive = amdgpu_get_xgmi_hive(adev); 5214 if (hive) 5215 mutex_lock(&hive->hive_lock); 5216 5217 reset_context->job = job; 5218 reset_context->hive = hive; 5219 /* 5220 * Build list of devices to reset. 5221 * In case we are in XGMI hive mode, resort the device list 5222 * to put adev in the 1st position. 
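 * The device at the head of the list is also the one used to lock the reset domain once for the whole list.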
5223 */ 5224 INIT_LIST_HEAD(&device_list); 5225 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5226 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5227 list_add_tail(&tmp_adev->reset_list, &device_list); 5228 if (gpu_reset_for_dev_remove && adev->shutdown) 5229 tmp_adev->shutdown = true; 5230 } 5231 if (!list_is_first(&adev->reset_list, &device_list)) 5232 list_rotate_to_front(&adev->reset_list, &device_list); 5233 device_list_handle = &device_list; 5234 } else { 5235 list_add_tail(&adev->reset_list, &device_list); 5236 device_list_handle = &device_list; 5237 } 5238 5239 /* We need to lock reset domain only once both for XGMI and single device */ 5240 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5241 reset_list); 5242 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5243 5244 /* block all schedulers and reset given job's ring */ 5245 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5246 5247 amdgpu_device_set_mp1_state(tmp_adev); 5248 5249 /* 5250 * Try to put the audio codec into suspend state 5251 * before the gpu reset starts. 5252 * 5253 * The power domain of the graphics device is shared 5254 * with the AZ (audio) power domain. Without this, 5255 * we may change the audio hardware from behind 5256 * the audio driver's back, which will trigger 5257 * some audio codec errors. 5258 */ 5259 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5260 audio_suspended = true; 5261 5262 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5263 5264 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5265 5266 if (!amdgpu_sriov_vf(tmp_adev)) 5267 amdgpu_amdkfd_pre_reset(tmp_adev); 5268 5269 /* 5270 * Mark these ASICs to be reset as untracked first, 5271 * and add them back after the reset completes. 5272 */ 5273 amdgpu_unregister_gpu_instance(tmp_adev); 5274 5275 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5276 5277 /* disable ras on ALL IPs */ 5278 if (!need_emergency_restart && 5279 amdgpu_device_ip_need_full_reset(tmp_adev)) 5280 amdgpu_ras_suspend(tmp_adev); 5281 5282 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5283 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5284 5285 if (!ring || !drm_sched_wqueue_ready(&ring->sched)) 5286 continue; 5287 5288 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5289 5290 if (need_emergency_restart) 5291 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5292 } 5293 atomic_inc(&tmp_adev->gpu_reset_counter); 5294 } 5295 5296 if (need_emergency_restart) 5297 goto skip_sched_resume; 5298 5299 /* 5300 * Must check guilty signal here since after this point all old 5301 * HW fences are force signaled. 5302 * 5303 * job->base holds a reference to the parent fence 5304 */ 5305 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5306 job_signaled = true; 5307 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5308 goto skip_hw_reset; 5309 } 5310 5311 retry: /* Rest of adevs pre-ASIC reset from the XGMI hive.
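 * Control returns to this label when amdgpu_do_asic_reset() asks for a retry by returning -EAGAIN.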
*/ 5312 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5313 if (gpu_reset_for_dev_remove) { 5314 /* Workaround for ASICs that need to disable SMC first */ 5315 amdgpu_device_smu_fini_early(tmp_adev); 5316 } 5317 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5318 /* TODO: should we stop here? */ 5319 if (r) { 5320 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5321 r, adev_to_drm(tmp_adev)->unique); 5322 tmp_adev->asic_reset_res = r; 5323 } 5324 5325 /* 5326 * Drop all pending non-scheduler resets. Scheduler resets 5327 * were already dropped during drm_sched_stop. 5328 */ 5329 amdgpu_device_stop_pending_resets(tmp_adev); 5330 } 5331 5332 /* Actual ASIC resets if needed. */ 5333 /* Host driver will handle XGMI hive reset for SRIOV */ 5334 if (amdgpu_sriov_vf(adev)) { 5335 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5336 if (r) 5337 adev->asic_reset_res = r; 5338 5339 /* Aldebaran and gfx_11_0_3 support RAS in SRIOV, so RAS needs to be resumed during reset */ 5340 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) || 5341 adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) 5342 amdgpu_ras_resume(adev); 5343 } else { 5344 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5345 if (r && r == -EAGAIN) 5346 goto retry; 5347 5348 if (!r && gpu_reset_for_dev_remove) 5349 goto recover_end; 5350 } 5351 5352 skip_hw_reset: 5353 5354 /* Post ASIC reset for all devs. */ 5355 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5356 5357 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5358 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5359 5360 if (!ring || !drm_sched_wqueue_ready(&ring->sched)) 5361 continue; 5362 5363 drm_sched_start(&ring->sched, true); 5364 } 5365 5366 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5367 amdgpu_mes_self_test(tmp_adev); 5368 5369 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5370 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5371 5372 if (tmp_adev->asic_reset_res) 5373 r = tmp_adev->asic_reset_res; 5374 5375 tmp_adev->asic_reset_res = 0; 5376 5377 if (r) { 5378 /* bad news, how do we tell it to userspace?
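 * For now the failure is logged and reported through amdgpu_vf_error_put().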
*/ 5379 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5380 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5381 } else { 5382 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5383 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5384 DRM_WARN("smart shift update failed\n"); 5385 } 5386 } 5387 5388 skip_sched_resume: 5389 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5390 /* unlock kfd: SRIOV would do it separately */ 5391 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5392 amdgpu_amdkfd_post_reset(tmp_adev); 5393 5394 /* kfd_post_reset will do nothing if the kfd device is not initialized, 5395 * so bring up kfd here if it was not initialized before 5396 */ 5397 if (!adev->kfd.init_complete) 5398 amdgpu_amdkfd_device_init(adev); 5399 5400 if (audio_suspended) 5401 amdgpu_device_resume_display_audio(tmp_adev); 5402 5403 amdgpu_device_unset_mp1_state(tmp_adev); 5404 5405 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5406 } 5407 5408 recover_end: 5409 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5410 reset_list); 5411 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5412 5413 if (hive) { 5414 mutex_unlock(&hive->hive_lock); 5415 amdgpu_put_xgmi_hive(hive); 5416 } 5417 5418 if (r) 5419 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5420 5421 atomic_set(&adev->reset_domain->reset_res, r); 5422 return r; 5423 } 5424 5425 /** 5426 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot 5427 * 5428 * @adev: amdgpu_device pointer 5429 * 5430 * Fetches and stores in the driver the PCIE capabilities (gen speed 5431 * and lanes) of the slot the device is in. Handles APUs and 5432 * virtualized environments where PCIE config space may not be available.
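 * The results are stored in adev->pm.pcie_gen_mask and adev->pm.pcie_mlw_mask.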
5433 */ 5434 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5435 { 5436 struct pci_dev *pdev; 5437 enum pci_bus_speed speed_cap, platform_speed_cap; 5438 enum pcie_link_width platform_link_width; 5439 5440 if (amdgpu_pcie_gen_cap) 5441 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5442 5443 if (amdgpu_pcie_lane_cap) 5444 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5445 5446 /* covers APUs as well */ 5447 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5448 if (adev->pm.pcie_gen_mask == 0) 5449 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5450 if (adev->pm.pcie_mlw_mask == 0) 5451 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5452 return; 5453 } 5454 5455 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5456 return; 5457 5458 pcie_bandwidth_available(adev->pdev, NULL, 5459 &platform_speed_cap, &platform_link_width); 5460 5461 if (adev->pm.pcie_gen_mask == 0) { 5462 /* asic caps */ 5463 pdev = adev->pdev; 5464 speed_cap = pcie_get_speed_cap(pdev); 5465 if (speed_cap == PCI_SPEED_UNKNOWN) { 5466 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5467 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5468 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5469 } else { 5470 if (speed_cap == PCIE_SPEED_32_0GT) 5471 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5472 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5473 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5474 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5475 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5476 else if (speed_cap == PCIE_SPEED_16_0GT) 5477 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5478 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5479 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5480 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5481 else if (speed_cap == PCIE_SPEED_8_0GT) 5482 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5483 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5484 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5485 else if (speed_cap == PCIE_SPEED_5_0GT) 5486 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5487 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5488 else 5489 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5490 } 5491 /* platform caps */ 5492 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5493 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5494 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5495 } else { 5496 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5497 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5498 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5499 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5500 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5501 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5502 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5503 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5504 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5505 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5506 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5507 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5508 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5509 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5510 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5511 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5512 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5513 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5514 else 5515 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5516 5517 } 5518 } 5519 if (adev->pm.pcie_mlw_mask == 0) { 5520 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) 
{ 5521 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5522 } else { 5523 switch (platform_link_width) { 5524 case PCIE_LNK_X32: 5525 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5526 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5527 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5528 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5529 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5530 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5531 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5532 break; 5533 case PCIE_LNK_X16: 5534 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5535 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5536 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5537 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5538 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5539 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5540 break; 5541 case PCIE_LNK_X12: 5542 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5543 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5544 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5545 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5546 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5547 break; 5548 case PCIE_LNK_X8: 5549 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5550 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5551 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5552 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5553 break; 5554 case PCIE_LNK_X4: 5555 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5556 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5557 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5558 break; 5559 case PCIE_LNK_X2: 5560 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5561 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5562 break; 5563 case PCIE_LNK_X1: 5564 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5565 break; 5566 default: 5567 break; 5568 } 5569 } 5570 } 5571 } 5572 5573 /** 5574 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5575 * 5576 * @adev: amdgpu_device pointer 5577 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5578 * 5579 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5580 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5581 * @peer_adev. 5582 */ 5583 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5584 struct amdgpu_device *peer_adev) 5585 { 5586 #ifdef CONFIG_HSA_AMD_P2P 5587 uint64_t address_mask = peer_adev->dev->dma_mask ? 
5588 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 5589 resource_size_t aper_limit = 5590 adev->gmc.aper_base + adev->gmc.aper_size - 1; 5591 bool p2p_access = 5592 !adev->gmc.xgmi.connected_to_cpu && 5593 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 5594 5595 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 5596 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 5597 !(adev->gmc.aper_base & address_mask || 5598 aper_limit & address_mask)); 5599 #else 5600 return false; 5601 #endif 5602 } 5603 5604 int amdgpu_device_baco_enter(struct drm_device *dev) 5605 { 5606 struct amdgpu_device *adev = drm_to_adev(dev); 5607 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5608 5609 if (!amdgpu_device_supports_baco(dev)) 5610 return -ENOTSUPP; 5611 5612 if (ras && adev->ras_enabled && 5613 adev->nbio.funcs->enable_doorbell_interrupt) 5614 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5615 5616 return amdgpu_dpm_baco_enter(adev); 5617 } 5618 5619 int amdgpu_device_baco_exit(struct drm_device *dev) 5620 { 5621 struct amdgpu_device *adev = drm_to_adev(dev); 5622 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5623 int ret = 0; 5624 5625 if (!amdgpu_device_supports_baco(dev)) 5626 return -ENOTSUPP; 5627 5628 ret = amdgpu_dpm_baco_exit(adev); 5629 if (ret) 5630 return ret; 5631 5632 if (ras && adev->ras_enabled && 5633 adev->nbio.funcs->enable_doorbell_interrupt) 5634 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5635 5636 if (amdgpu_passthrough(adev) && 5637 adev->nbio.funcs->clear_doorbell_interrupt) 5638 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5639 5640 return 0; 5641 } 5642 5643 /** 5644 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5645 * @pdev: PCI device struct 5646 * @state: PCI channel state 5647 * 5648 * Description: Called when a PCI error is detected. 5649 * 5650 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
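 * PCI_ERS_RESULT_CAN_RECOVER is returned when the channel state is pci_channel_io_normal.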

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_CAN_RECOVER, PCI_ERS_RESULT_NEED_RESET or
 * PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !drm_sched_wqueue_ready(&ring->sched))
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 *
 * Return: PCI_ERS_RESULT_RECOVERED.
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the PCI error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 *
 * Return: PCI_ERS_RESULT_RECOVERED if the ASIC recovered, otherwise
 * PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's OK to
 * resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !drm_sched_wqueue_ready(&ring->sched))
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}
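
/*
 * Illustrative sketch only, not part of the original file: the four callbacks
 * above are meant to be plugged into a struct pci_error_handlers during PCI
 * driver registration, roughly as below. The field names are the generic PCI
 * error recovery API; the variable name here is hypothetical.
 */
static const struct pci_error_handlers amdgpu_example_pci_err_handlers __maybe_unused = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};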

/**
 * amdgpu_device_cache_pci_state - cache the PCI config space of the device
 * @pdev: PCI device struct
 *
 * Save the current PCI config space and keep a copy in adev->pci_state so it
 * can be restored later, e.g. after a GPU or slot reset.
 *
 * Return: true on success, false otherwise.
 */
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

/**
 * amdgpu_device_load_pci_state - restore the cached PCI config space
 * @pdev: PCI device struct
 *
 * Return: true if the state cached by amdgpu_device_cache_pci_state() was
 * restored, false otherwise.
 */
bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

/**
 * amdgpu_device_flush_hdp - flush the HDP (Host Data Path) write cache
 * @adev: amdgpu_device pointer
 * @ring: optional ring used to emit a packet-based flush; otherwise the
 *        ASIC callback is used
 *
 * Make CPU writes that went through the HDP visible to the GPU. Not needed
 * (and skipped) for APUs outside of passthrough mode and for GPUs connected
 * to the CPU via XGMI.
 */
void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

/**
 * amdgpu_device_invalidate_hdp - invalidate the HDP read cache
 * @adev: amdgpu_device pointer
 * @ring: optional ring, passed through to the ASIC callback
 *
 * Drop potentially stale data from the HDP read cache. Skipped in the same
 * cases as amdgpu_device_flush_hdp().
 */
void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}
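
/*
 * Illustrative sketch only, not part of the original file: the usual pairing
 * of the HDP helpers above, assuming a hypothetical buffer that the CPU fills
 * through the BAR before GPU work consumes it and writes results back.
 */
static void __maybe_unused amdgpu_example_hdp_sync(struct amdgpu_device *adev,
						   struct amdgpu_ring *ring)
{
	/* CPU -> GPU: make CPU writes that went through the HDP visible. */
	amdgpu_device_flush_hdp(adev, ring);

	/* ... GPU work runs and writes results to VRAM ... */

	/* GPU -> CPU: drop stale HDP read-cache data before the CPU reads back. */
	amdgpu_device_invalidate_hdp(adev, ring);
}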

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to maintain the error context when an error occurs.
 * Compared to a simple hang, the system remains stable at least for SSH
 * access, so it should be trivial to inspect the hardware state and see
 * what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs,
 *    etc.), clears all CPU mappings to the device and disallows remappings
 *    through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

/**
 * amdgpu_device_pcie_port_rreg - read a PCIe port register
 * @adev: amdgpu_device pointer
 * @reg: dword register index
 *
 * Indirect read through the NBIO PCIe port index/data pair, serialized by
 * pcie_idx_lock.
 *
 * Return: the register value.
 */
u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

/**
 * amdgpu_device_pcie_port_wreg - write a PCIe port register
 * @adev: amdgpu_device pointer
 * @reg: dword register index
 * @v: value to write
 *
 * Indirect write through the NBIO PCIe port index/data pair, serialized by
 * pcie_idx_lock.
 */
void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}
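
/*
 * Illustrative sketch only, not part of the original file: a hypothetical
 * submission path using the gang switch above. When the previous gang leader
 * has not signaled yet, the switch fails and returns that fence; the caller
 * is expected to wait on it (or schedule behind it) and try again.
 */
static int __maybe_unused amdgpu_example_submit_as_gang(struct amdgpu_device *adev,
							struct dma_fence *gang)
{
	struct dma_fence *old = amdgpu_device_switch_gang(adev, gang);

	if (old) {
		/* Previous gang still running; wait for its leader to finish. */
		long r = dma_fence_wait(old, true);

		dma_fence_put(old);
		if (r)
			return r;
		/* A real caller would retry the switch at this point. */
	}

	/* The gang is now the current one; push its jobs to the rings here. */
	return 0;
}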

/**
 * amdgpu_device_has_display_hardware - check whether the ASIC has display hardware
 * @adev: amdgpu_device pointer
 *
 * Return: true if the chip has (non-harvested) display hardware, false otherwise.
 */
bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

/**
 * amdgpu_device_wait_on_rreg - poll a register until it reaches an expected value
 * @adev: amdgpu_device pointer
 * @inst: instance number, only used in the timeout warning
 * @reg_addr: register offset to poll
 * @reg_name: register name, only used in the timeout warning
 * @expected_value: value to wait for (after masking)
 * @mask: bits of the register to compare
 *
 * Poll @reg_addr until (value & @mask) == @expected_value, restarting the
 * timeout whenever the read value changes.
 *
 * Return: 0 on success, -ETIMEDOUT if the register never reached the value.
 */
uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				  inst, reg_name, (uint32_t)expected_value,
				  (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
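
/*
 * Illustrative sketch only, not part of the original file: polling a
 * hypothetical status register with the helper above until its BUSY bit
 * clears. The offset, register name and mask are placeholders, not real
 * hardware definitions.
 */
static int __maybe_unused amdgpu_example_wait_not_busy(struct amdgpu_device *adev)
{
	/* Wait until (STATUS & BUSY) == 0, or warn and return -ETIMEDOUT. */
	return amdgpu_device_wait_on_rreg(adev, 0 /* instance */,
					  0x1a0 /* hypothetical status offset */,
					  "EXAMPLE_STATUS",
					  0x0 /* expected masked value */,
					  0x80000000 /* hypothetical BUSY bit */);
}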