1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/pci-p2pdma.h> 36 #include <linux/apple-gmux.h> 37 38 #include <drm/drm_aperture.h> 39 #include <drm/drm_atomic_helper.h> 40 #include <drm/drm_crtc_helper.h> 41 #include <drm/drm_fb_helper.h> 42 #include <drm/drm_probe_helper.h> 43 #include <drm/amdgpu_drm.h> 44 #include <linux/vgaarb.h> 45 #include <linux/vga_switcheroo.h> 46 #include <linux/efi.h> 47 #include "amdgpu.h" 48 #include "amdgpu_trace.h" 49 #include "amdgpu_i2c.h" 50 #include "atom.h" 51 #include "amdgpu_atombios.h" 52 #include "amdgpu_atomfirmware.h" 53 #include "amd_pcie.h" 54 #ifdef CONFIG_DRM_AMDGPU_SI 55 #include "si.h" 56 #endif 57 #ifdef CONFIG_DRM_AMDGPU_CIK 58 #include "cik.h" 59 #endif 60 #include "vi.h" 61 #include "soc15.h" 62 #include "nv.h" 63 #include "bif/bif_4_1_d.h" 64 #include <linux/firmware.h> 65 #include "amdgpu_vf_error.h" 66 67 #include "amdgpu_amdkfd.h" 68 #include "amdgpu_pm.h" 69 70 #include "amdgpu_xgmi.h" 71 #include "amdgpu_ras.h" 72 #include "amdgpu_pmu.h" 73 #include "amdgpu_fru_eeprom.h" 74 #include "amdgpu_reset.h" 75 76 #include <linux/suspend.h> 77 #include <drm/task_barrier.h> 78 #include <linux/pm_runtime.h> 79 80 #include <drm/drm_drv.h> 81 82 #if IS_ENABLED(CONFIG_X86) 83 #include <asm/intel-family.h> 84 #endif 85 86 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 87 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 88 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 89 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 93 94 #define AMDGPU_RESUME_MS 2000 95 #define AMDGPU_MAX_RETRY_LIMIT 2 96 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 97 98 static const struct drm_driver amdgpu_kms_driver; 99 100 const char *amdgpu_asic_name[] = { 101 "TAHITI", 102 "PITCAIRN", 103 "VERDE", 104 "OLAND", 105 "HAINAN", 106 "BONAIRE", 107 "KAVERI", 108 "KABINI", 109 "HAWAII", 110 "MULLINS", 111 "TOPAZ", 112 "TONGA", 113 
"FIJI", 114 "CARRIZO", 115 "STONEY", 116 "POLARIS10", 117 "POLARIS11", 118 "POLARIS12", 119 "VEGAM", 120 "VEGA10", 121 "VEGA12", 122 "VEGA20", 123 "RAVEN", 124 "ARCTURUS", 125 "RENOIR", 126 "ALDEBARAN", 127 "NAVI10", 128 "CYAN_SKILLFISH", 129 "NAVI14", 130 "NAVI12", 131 "SIENNA_CICHLID", 132 "NAVY_FLOUNDER", 133 "VANGOGH", 134 "DIMGREY_CAVEFISH", 135 "BEIGE_GOBY", 136 "YELLOW_CARP", 137 "IP DISCOVERY", 138 "LAST", 139 }; 140 141 /** 142 * DOC: pcie_replay_count 143 * 144 * The amdgpu driver provides a sysfs API for reporting the total number 145 * of PCIe replays (NAKs) 146 * The file pcie_replay_count is used for this and returns the total 147 * number of replays as a sum of the NAKs generated and NAKs received 148 */ 149 150 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 151 struct device_attribute *attr, char *buf) 152 { 153 struct drm_device *ddev = dev_get_drvdata(dev); 154 struct amdgpu_device *adev = drm_to_adev(ddev); 155 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 156 157 return sysfs_emit(buf, "%llu\n", cnt); 158 } 159 160 static DEVICE_ATTR(pcie_replay_count, 0444, 161 amdgpu_device_get_pcie_replay_count, NULL); 162 163 /** 164 * DOC: board_info 165 * 166 * The amdgpu driver provides a sysfs API for giving board related information. 167 * It provides the form factor information in the format 168 * 169 * type : form factor 170 * 171 * Possible form factor values 172 * 173 * - "cem" - PCIE CEM card 174 * - "oam" - Open Compute Accelerator Module 175 * - "unknown" - Not known 176 * 177 */ 178 179 static ssize_t amdgpu_device_get_board_info(struct device *dev, 180 struct device_attribute *attr, 181 char *buf) 182 { 183 struct drm_device *ddev = dev_get_drvdata(dev); 184 struct amdgpu_device *adev = drm_to_adev(ddev); 185 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 186 const char *pkg; 187 188 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 189 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 190 191 switch (pkg_type) { 192 case AMDGPU_PKG_TYPE_CEM: 193 pkg = "cem"; 194 break; 195 case AMDGPU_PKG_TYPE_OAM: 196 pkg = "oam"; 197 break; 198 default: 199 pkg = "unknown"; 200 break; 201 } 202 203 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 204 } 205 206 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 207 208 static struct attribute *amdgpu_board_attrs[] = { 209 &dev_attr_board_info.attr, 210 NULL, 211 }; 212 213 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 214 struct attribute *attr, int n) 215 { 216 struct device *dev = kobj_to_dev(kobj); 217 struct drm_device *ddev = dev_get_drvdata(dev); 218 struct amdgpu_device *adev = drm_to_adev(ddev); 219 220 if (adev->flags & AMD_IS_APU) 221 return 0; 222 223 return attr->mode; 224 } 225 226 static const struct attribute_group amdgpu_board_attrs_group = { 227 .attrs = amdgpu_board_attrs, 228 .is_visible = amdgpu_board_attrs_is_visible 229 }; 230 231 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 232 233 234 /** 235 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 236 * 237 * @dev: drm_device pointer 238 * 239 * Returns true if the device is a dGPU with ATPX power control, 240 * otherwise return false. 
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer pointed to by @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer pointed to by @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value is the number of bytes transferred.
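 *
 * Only the CPU-visible part of VRAM (up to adev->gmc.visible_vram_size) can be
 * reached through the aperture, so the returned count may be smaller than
 * @size; amdgpu_device_vram_access() falls back to MM_INDEX/MM_DATA for the
 * remainder.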
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer pointed to by @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore; if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
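 *
 * Note: unless @acc_flags contains AMDGPU_REGS_NO_KIQ, registers inside the
 * MMIO range are read through the KIQ when running as an SR-IOV guest at
 * runtime; otherwise a plain MMIO read (or the PCIe indirect path for offsets
 * beyond the MMIO range) is used.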
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
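 *
 * Note: most callers do not use this function directly but typically go
 * through the RREG32()/WREG32() family of convenience macros, which supply
 * the @acc_flags appropriate for their context.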
546 */ 547 void amdgpu_device_wreg(struct amdgpu_device *adev, 548 uint32_t reg, uint32_t v, 549 uint32_t acc_flags) 550 { 551 if (amdgpu_device_skip_hw_access(adev)) 552 return; 553 554 if ((reg * 4) < adev->rmmio_size) { 555 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 556 amdgpu_sriov_runtime(adev) && 557 down_read_trylock(&adev->reset_domain->sem)) { 558 amdgpu_kiq_wreg(adev, reg, v); 559 up_read(&adev->reset_domain->sem); 560 } else { 561 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 562 } 563 } else { 564 adev->pcie_wreg(adev, reg * 4, v); 565 } 566 567 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 568 } 569 570 /** 571 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 572 * 573 * @adev: amdgpu_device pointer 574 * @reg: mmio/rlc register 575 * @v: value to write 576 * @xcc_id: xcc accelerated compute core id 577 * 578 * this function is invoked only for the debugfs register access 579 */ 580 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 581 uint32_t reg, uint32_t v, 582 uint32_t xcc_id) 583 { 584 if (amdgpu_device_skip_hw_access(adev)) 585 return; 586 587 if (amdgpu_sriov_fullaccess(adev) && 588 adev->gfx.rlc.funcs && 589 adev->gfx.rlc.funcs->is_rlcg_access_range) { 590 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 591 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 592 } else if ((reg * 4) >= adev->rmmio_size) { 593 adev->pcie_wreg(adev, reg * 4, v); 594 } else { 595 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 596 } 597 } 598 599 /** 600 * amdgpu_device_indirect_rreg - read an indirect register 601 * 602 * @adev: amdgpu_device pointer 603 * @reg_addr: indirect register address to read from 604 * 605 * Returns the value of indirect register @reg_addr 606 */ 607 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 608 u32 reg_addr) 609 { 610 unsigned long flags, pcie_index, pcie_data; 611 void __iomem *pcie_index_offset; 612 void __iomem *pcie_data_offset; 613 u32 r; 614 615 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 616 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 617 618 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 619 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 620 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 621 622 writel(reg_addr, pcie_index_offset); 623 readl(pcie_index_offset); 624 r = readl(pcie_data_offset); 625 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 626 627 return r; 628 } 629 630 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 631 u64 reg_addr) 632 { 633 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 634 u32 r; 635 void __iomem *pcie_index_offset; 636 void __iomem *pcie_index_hi_offset; 637 void __iomem *pcie_data_offset; 638 639 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 640 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 641 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 642 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 643 else 644 pcie_index_hi = 0; 645 646 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 647 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 648 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 649 if (pcie_index_hi != 0) 650 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 651 pcie_index_hi * 4; 652 653 writel(reg_addr, pcie_index_offset); 654 readl(pcie_index_offset); 655 if (pcie_index_hi != 0) { 656 writel((reg_addr >> 
32) & 0xff, pcie_index_hi_offset); 657 readl(pcie_index_hi_offset); 658 } 659 r = readl(pcie_data_offset); 660 661 /* clear the high bits */ 662 if (pcie_index_hi != 0) { 663 writel(0, pcie_index_hi_offset); 664 readl(pcie_index_hi_offset); 665 } 666 667 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 668 669 return r; 670 } 671 672 /** 673 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 674 * 675 * @adev: amdgpu_device pointer 676 * @reg_addr: indirect register address to read from 677 * 678 * Returns the value of indirect register @reg_addr 679 */ 680 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 681 u32 reg_addr) 682 { 683 unsigned long flags, pcie_index, pcie_data; 684 void __iomem *pcie_index_offset; 685 void __iomem *pcie_data_offset; 686 u64 r; 687 688 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 689 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 690 691 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 692 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 693 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 694 695 /* read low 32 bits */ 696 writel(reg_addr, pcie_index_offset); 697 readl(pcie_index_offset); 698 r = readl(pcie_data_offset); 699 /* read high 32 bits */ 700 writel(reg_addr + 4, pcie_index_offset); 701 readl(pcie_index_offset); 702 r |= ((u64)readl(pcie_data_offset) << 32); 703 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 704 705 return r; 706 } 707 708 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 709 u64 reg_addr) 710 { 711 unsigned long flags, pcie_index, pcie_data; 712 unsigned long pcie_index_hi = 0; 713 void __iomem *pcie_index_offset; 714 void __iomem *pcie_index_hi_offset; 715 void __iomem *pcie_data_offset; 716 u64 r; 717 718 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 719 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 720 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 721 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 722 723 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 724 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 725 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 726 if (pcie_index_hi != 0) 727 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 728 pcie_index_hi * 4; 729 730 /* read low 32 bits */ 731 writel(reg_addr, pcie_index_offset); 732 readl(pcie_index_offset); 733 if (pcie_index_hi != 0) { 734 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 735 readl(pcie_index_hi_offset); 736 } 737 r = readl(pcie_data_offset); 738 /* read high 32 bits */ 739 writel(reg_addr + 4, pcie_index_offset); 740 readl(pcie_index_offset); 741 if (pcie_index_hi != 0) { 742 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 743 readl(pcie_index_hi_offset); 744 } 745 r |= ((u64)readl(pcie_data_offset) << 32); 746 747 /* clear the high bits */ 748 if (pcie_index_hi != 0) { 749 writel(0, pcie_index_hi_offset); 750 readl(pcie_index_hi_offset); 751 } 752 753 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 754 755 return r; 756 } 757 758 /** 759 * amdgpu_device_indirect_wreg - write an indirect register address 760 * 761 * @adev: amdgpu_device pointer 762 * @reg_addr: indirect register offset 763 * @reg_data: indirect register data 764 * 765 */ 766 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 767 u32 reg_addr, u32 reg_data) 768 { 769 unsigned long flags, pcie_index, pcie_data; 770 void __iomem *pcie_index_offset; 771 
void __iomem *pcie_data_offset; 772 773 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 774 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 775 776 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 777 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 778 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 779 780 writel(reg_addr, pcie_index_offset); 781 readl(pcie_index_offset); 782 writel(reg_data, pcie_data_offset); 783 readl(pcie_data_offset); 784 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 785 } 786 787 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 788 u64 reg_addr, u32 reg_data) 789 { 790 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 791 void __iomem *pcie_index_offset; 792 void __iomem *pcie_index_hi_offset; 793 void __iomem *pcie_data_offset; 794 795 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 796 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 797 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 798 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 799 else 800 pcie_index_hi = 0; 801 802 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 803 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 804 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 805 if (pcie_index_hi != 0) 806 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 807 pcie_index_hi * 4; 808 809 writel(reg_addr, pcie_index_offset); 810 readl(pcie_index_offset); 811 if (pcie_index_hi != 0) { 812 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 813 readl(pcie_index_hi_offset); 814 } 815 writel(reg_data, pcie_data_offset); 816 readl(pcie_data_offset); 817 818 /* clear the high bits */ 819 if (pcie_index_hi != 0) { 820 writel(0, pcie_index_hi_offset); 821 readl(pcie_index_hi_offset); 822 } 823 824 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 825 } 826 827 /** 828 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 829 * 830 * @adev: amdgpu_device pointer 831 * @reg_addr: indirect register offset 832 * @reg_data: indirect register data 833 * 834 */ 835 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 836 u32 reg_addr, u64 reg_data) 837 { 838 unsigned long flags, pcie_index, pcie_data; 839 void __iomem *pcie_index_offset; 840 void __iomem *pcie_data_offset; 841 842 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 843 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 844 845 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 846 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 847 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 848 849 /* write low 32 bits */ 850 writel(reg_addr, pcie_index_offset); 851 readl(pcie_index_offset); 852 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 853 readl(pcie_data_offset); 854 /* write high 32 bits */ 855 writel(reg_addr + 4, pcie_index_offset); 856 readl(pcie_index_offset); 857 writel((u32)(reg_data >> 32), pcie_data_offset); 858 readl(pcie_data_offset); 859 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 860 } 861 862 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 863 u64 reg_addr, u64 reg_data) 864 { 865 unsigned long flags, pcie_index, pcie_data; 866 unsigned long pcie_index_hi = 0; 867 void __iomem *pcie_index_offset; 868 void __iomem *pcie_index_hi_offset; 869 void __iomem *pcie_data_offset; 870 871 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 872 pcie_data = 
adev->nbio.funcs->get_pcie_data_offset(adev); 873 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 874 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 875 876 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 877 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 878 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 879 if (pcie_index_hi != 0) 880 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 881 pcie_index_hi * 4; 882 883 /* write low 32 bits */ 884 writel(reg_addr, pcie_index_offset); 885 readl(pcie_index_offset); 886 if (pcie_index_hi != 0) { 887 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 888 readl(pcie_index_hi_offset); 889 } 890 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 891 readl(pcie_data_offset); 892 /* write high 32 bits */ 893 writel(reg_addr + 4, pcie_index_offset); 894 readl(pcie_index_offset); 895 if (pcie_index_hi != 0) { 896 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 897 readl(pcie_index_hi_offset); 898 } 899 writel((u32)(reg_data >> 32), pcie_data_offset); 900 readl(pcie_data_offset); 901 902 /* clear the high bits */ 903 if (pcie_index_hi != 0) { 904 writel(0, pcie_index_hi_offset); 905 readl(pcie_index_hi_offset); 906 } 907 908 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 909 } 910 911 /** 912 * amdgpu_device_get_rev_id - query device rev_id 913 * 914 * @adev: amdgpu_device pointer 915 * 916 * Return device rev_id 917 */ 918 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 919 { 920 return adev->nbio.funcs->get_rev_id(adev); 921 } 922 923 /** 924 * amdgpu_invalid_rreg - dummy reg read function 925 * 926 * @adev: amdgpu_device pointer 927 * @reg: offset of register 928 * 929 * Dummy register read function. Used for register blocks 930 * that certain asics don't have (all asics). 931 * Returns the value in the register. 932 */ 933 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 934 { 935 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 936 BUG(); 937 return 0; 938 } 939 940 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 941 { 942 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 943 BUG(); 944 return 0; 945 } 946 947 /** 948 * amdgpu_invalid_wreg - dummy reg write function 949 * 950 * @adev: amdgpu_device pointer 951 * @reg: offset of register 952 * @v: value to write to the register 953 * 954 * Dummy register read function. Used for register blocks 955 * that certain asics don't have (all asics). 956 */ 957 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 958 { 959 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 960 reg, v); 961 BUG(); 962 } 963 964 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 965 { 966 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 967 reg, v); 968 BUG(); 969 } 970 971 /** 972 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 973 * 974 * @adev: amdgpu_device pointer 975 * @reg: offset of register 976 * 977 * Dummy register read function. Used for register blocks 978 * that certain asics don't have (all asics). 979 * Returns the value in the register. 
980 */ 981 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 982 { 983 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 984 BUG(); 985 return 0; 986 } 987 988 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) 989 { 990 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 991 BUG(); 992 return 0; 993 } 994 995 /** 996 * amdgpu_invalid_wreg64 - dummy reg write function 997 * 998 * @adev: amdgpu_device pointer 999 * @reg: offset of register 1000 * @v: value to write to the register 1001 * 1002 * Dummy register read function. Used for register blocks 1003 * that certain asics don't have (all asics). 1004 */ 1005 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 1006 { 1007 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 1008 reg, v); 1009 BUG(); 1010 } 1011 1012 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) 1013 { 1014 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", 1015 reg, v); 1016 BUG(); 1017 } 1018 1019 /** 1020 * amdgpu_block_invalid_rreg - dummy reg read function 1021 * 1022 * @adev: amdgpu_device pointer 1023 * @block: offset of instance 1024 * @reg: offset of register 1025 * 1026 * Dummy register read function. Used for register blocks 1027 * that certain asics don't have (all asics). 1028 * Returns the value in the register. 1029 */ 1030 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1031 uint32_t block, uint32_t reg) 1032 { 1033 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 1034 reg, block); 1035 BUG(); 1036 return 0; 1037 } 1038 1039 /** 1040 * amdgpu_block_invalid_wreg - dummy reg write function 1041 * 1042 * @adev: amdgpu_device pointer 1043 * @block: offset of instance 1044 * @reg: offset of register 1045 * @v: value to write to the register 1046 * 1047 * Dummy register read function. Used for register blocks 1048 * that certain asics don't have (all asics). 1049 */ 1050 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1051 uint32_t block, 1052 uint32_t reg, uint32_t v) 1053 { 1054 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1055 reg, block, v); 1056 BUG(); 1057 } 1058 1059 /** 1060 * amdgpu_device_asic_init - Wrapper for atom asic_init 1061 * 1062 * @adev: amdgpu_device pointer 1063 * 1064 * Does any asic specific work and then calls atom asic init. 1065 */ 1066 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1067 { 1068 int ret; 1069 1070 amdgpu_asic_pre_asic_init(adev); 1071 1072 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1073 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1074 amdgpu_psp_wait_for_bootloader(adev); 1075 ret = amdgpu_atomfirmware_asic_init(adev, true); 1076 return ret; 1077 } else { 1078 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1079 } 1080 1081 return 0; 1082 } 1083 1084 /** 1085 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1086 * 1087 * @adev: amdgpu_device pointer 1088 * 1089 * Allocates a scratch page of VRAM for use by various things in the 1090 * driver. 
1091 */ 1092 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 1093 { 1094 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 1095 AMDGPU_GEM_DOMAIN_VRAM | 1096 AMDGPU_GEM_DOMAIN_GTT, 1097 &adev->mem_scratch.robj, 1098 &adev->mem_scratch.gpu_addr, 1099 (void **)&adev->mem_scratch.ptr); 1100 } 1101 1102 /** 1103 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 1104 * 1105 * @adev: amdgpu_device pointer 1106 * 1107 * Frees the VRAM scratch page. 1108 */ 1109 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 1110 { 1111 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 1112 } 1113 1114 /** 1115 * amdgpu_device_program_register_sequence - program an array of registers. 1116 * 1117 * @adev: amdgpu_device pointer 1118 * @registers: pointer to the register array 1119 * @array_size: size of the register array 1120 * 1121 * Programs an array or registers with and or masks. 1122 * This is a helper for setting golden registers. 1123 */ 1124 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 1125 const u32 *registers, 1126 const u32 array_size) 1127 { 1128 u32 tmp, reg, and_mask, or_mask; 1129 int i; 1130 1131 if (array_size % 3) 1132 return; 1133 1134 for (i = 0; i < array_size; i += 3) { 1135 reg = registers[i + 0]; 1136 and_mask = registers[i + 1]; 1137 or_mask = registers[i + 2]; 1138 1139 if (and_mask == 0xffffffff) { 1140 tmp = or_mask; 1141 } else { 1142 tmp = RREG32(reg); 1143 tmp &= ~and_mask; 1144 if (adev->family >= AMDGPU_FAMILY_AI) 1145 tmp |= (or_mask & and_mask); 1146 else 1147 tmp |= or_mask; 1148 } 1149 WREG32(reg, tmp); 1150 } 1151 } 1152 1153 /** 1154 * amdgpu_device_pci_config_reset - reset the GPU 1155 * 1156 * @adev: amdgpu_device pointer 1157 * 1158 * Resets the GPU using the pci config reset sequence. 1159 * Only applicable to asics prior to vega10. 1160 */ 1161 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1162 { 1163 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1164 } 1165 1166 /** 1167 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1168 * 1169 * @adev: amdgpu_device pointer 1170 * 1171 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1172 */ 1173 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1174 { 1175 return pci_reset_function(adev->pdev); 1176 } 1177 1178 /* 1179 * amdgpu_device_wb_*() 1180 * Writeback is the method by which the GPU updates special pages in memory 1181 * with the status of certain GPU events (fences, ring pointers,etc.). 1182 */ 1183 1184 /** 1185 * amdgpu_device_wb_fini - Disable Writeback and free memory 1186 * 1187 * @adev: amdgpu_device pointer 1188 * 1189 * Disables Writeback and frees the Writeback memory (all asics). 1190 * Used at driver shutdown. 1191 */ 1192 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1193 { 1194 if (adev->wb.wb_obj) { 1195 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1196 &adev->wb.gpu_addr, 1197 (void **)&adev->wb.wb); 1198 adev->wb.wb_obj = NULL; 1199 } 1200 } 1201 1202 /** 1203 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1204 * 1205 * @adev: amdgpu_device pointer 1206 * 1207 * Initializes writeback and allocates writeback memory (all asics). 1208 * Used at driver startup. 1209 * Returns 0 on success or an -error on failure. 
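 *
 * Each writeback slot is 256 bits (8 dwords) wide; amdgpu_device_wb_get()
 * therefore hands out indices in dword units (offset << 3) and
 * amdgpu_device_wb_free() converts them back before clearing the bitmap bit.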
1210 */ 1211 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1212 { 1213 int r; 1214 1215 if (adev->wb.wb_obj == NULL) { 1216 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1217 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1218 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1219 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1220 (void **)&adev->wb.wb); 1221 if (r) { 1222 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1223 return r; 1224 } 1225 1226 adev->wb.num_wb = AMDGPU_MAX_WB; 1227 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1228 1229 /* clear wb memory */ 1230 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1231 } 1232 1233 return 0; 1234 } 1235 1236 /** 1237 * amdgpu_device_wb_get - Allocate a wb entry 1238 * 1239 * @adev: amdgpu_device pointer 1240 * @wb: wb index 1241 * 1242 * Allocate a wb slot for use by the driver (all asics). 1243 * Returns 0 on success or -EINVAL on failure. 1244 */ 1245 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1246 { 1247 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1248 1249 if (offset < adev->wb.num_wb) { 1250 __set_bit(offset, adev->wb.used); 1251 *wb = offset << 3; /* convert to dw offset */ 1252 return 0; 1253 } else { 1254 return -EINVAL; 1255 } 1256 } 1257 1258 /** 1259 * amdgpu_device_wb_free - Free a wb entry 1260 * 1261 * @adev: amdgpu_device pointer 1262 * @wb: wb index 1263 * 1264 * Free a wb slot allocated for use by the driver (all asics) 1265 */ 1266 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1267 { 1268 wb >>= 3; 1269 if (wb < adev->wb.num_wb) 1270 __clear_bit(wb, adev->wb.used); 1271 } 1272 1273 /** 1274 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1275 * 1276 * @adev: amdgpu_device pointer 1277 * 1278 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1279 * to fail, but if any of the BARs is not accessible after the size we abort 1280 * driver loading by returning -ENODEV. 1281 */ 1282 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1283 { 1284 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1285 struct pci_bus *root; 1286 struct resource *res; 1287 unsigned int i; 1288 u16 cmd; 1289 int r; 1290 1291 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1292 return 0; 1293 1294 /* Bypass for VF */ 1295 if (amdgpu_sriov_vf(adev)) 1296 return 0; 1297 1298 /* skip if the bios has already enabled large BAR */ 1299 if (adev->gmc.real_vram_size && 1300 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1301 return 0; 1302 1303 /* Check if the root BUS has 64bit memory resources */ 1304 root = adev->pdev->bus; 1305 while (root->parent) 1306 root = root->parent; 1307 1308 pci_bus_for_each_resource(root, res, i) { 1309 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1310 res->start > 0x100000000ull) 1311 break; 1312 } 1313 1314 /* Trying to resize is pointless without a root hub window above 4GB */ 1315 if (!res) 1316 return 0; 1317 1318 /* Limit the BAR size to what is available */ 1319 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1320 rbar_size); 1321 1322 /* Disable memory decoding while we change the BAR addresses and size */ 1323 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1324 pci_write_config_word(adev->pdev, PCI_COMMAND, 1325 cmd & ~PCI_COMMAND_MEMORY); 1326 1327 /* Free the VRAM and doorbell BAR, we most likely need to move both. 
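 * Releasing BAR 0 (VRAM) and, on BONAIRE and later, BAR 2 (doorbell) here
 * lets pci_resize_resource() and pci_assign_unassigned_bus_resources()
 * re-place them below.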
 */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return false;

	return true;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In the whole-GPU pass-through virtualization case, after a
		 * VM reboot some old SMC firmware still needs the driver to do a vPost,
		 * otherwise the GPU hangs. SMC firmware versions above 22.15 don't have
		 * this flaw, so only force a vPost for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICs as well, we may
 * loosen this.
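 *
 * The amdgpu.seamless module parameter overrides the checks below:
 * -1 (the default) keeps the automatic checks in this function, 1 forces
 * seamless boot on and 0 forces it off.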
1433 */ 1434 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1435 { 1436 switch (amdgpu_seamless) { 1437 case -1: 1438 break; 1439 case 1: 1440 return true; 1441 case 0: 1442 return false; 1443 default: 1444 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n", 1445 amdgpu_seamless); 1446 return false; 1447 } 1448 1449 if (!(adev->flags & AMD_IS_APU)) 1450 return false; 1451 1452 if (adev->mman.keep_stolen_vga_memory) 1453 return false; 1454 1455 return adev->ip_versions[DCE_HWIP][0] >= IP_VERSION(3, 0, 0); 1456 } 1457 1458 /* 1459 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1460 * don't support dynamic speed switching. Until we have confirmation from Intel 1461 * that a specific host supports it, it's safer that we keep it disabled for all. 1462 * 1463 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1464 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1465 */ 1466 static bool amdgpu_device_pcie_dynamic_switching_supported(void) 1467 { 1468 #if IS_ENABLED(CONFIG_X86) 1469 struct cpuinfo_x86 *c = &cpu_data(0); 1470 1471 if (c->x86_vendor == X86_VENDOR_INTEL) 1472 return false; 1473 #endif 1474 return true; 1475 } 1476 1477 /** 1478 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1479 * 1480 * @adev: amdgpu_device pointer 1481 * 1482 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1483 * be set for this device. 1484 * 1485 * Returns true if it should be used or false if not. 1486 */ 1487 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1488 { 1489 switch (amdgpu_aspm) { 1490 case -1: 1491 break; 1492 case 0: 1493 return false; 1494 case 1: 1495 return true; 1496 default: 1497 return false; 1498 } 1499 if (adev->flags & AMD_IS_APU) 1500 return false; 1501 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) 1502 return false; 1503 return pcie_aspm_enabled(adev->pdev); 1504 } 1505 1506 /* if we get transitioned to only one device, take VGA back */ 1507 /** 1508 * amdgpu_device_vga_set_decode - enable/disable vga decode 1509 * 1510 * @pdev: PCI device pointer 1511 * @state: enable/disable vga decode 1512 * 1513 * Enable/disable vga decode (all asics). 1514 * Returns VGA resource flags. 1515 */ 1516 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1517 bool state) 1518 { 1519 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1520 1521 amdgpu_asic_set_vga_state(adev, state); 1522 if (state) 1523 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1524 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1525 else 1526 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1527 } 1528 1529 /** 1530 * amdgpu_device_check_block_size - validate the vm block size 1531 * 1532 * @adev: amdgpu_device pointer 1533 * 1534 * Validates the vm block size specified via module parameter. 1535 * The vm block size defines number of bits in page table versus page directory, 1536 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1537 * page table and the remaining bits are in the page directory. 
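 *
 * For example, with the minimum block size of 9 bits and 4KB pages, one
 * page-table block covers 2^(12+9) bytes = 2MB of address space.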
1538 */ 1539 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1540 { 1541 /* defines number of bits in page table versus page directory, 1542 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1543 * page table and the remaining bits are in the page directory 1544 */ 1545 if (amdgpu_vm_block_size == -1) 1546 return; 1547 1548 if (amdgpu_vm_block_size < 9) { 1549 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1550 amdgpu_vm_block_size); 1551 amdgpu_vm_block_size = -1; 1552 } 1553 } 1554 1555 /** 1556 * amdgpu_device_check_vm_size - validate the vm size 1557 * 1558 * @adev: amdgpu_device pointer 1559 * 1560 * Validates the vm size in GB specified via module parameter. 1561 * The VM size is the size of the GPU virtual memory space in GB. 1562 */ 1563 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1564 { 1565 /* no need to check the default value */ 1566 if (amdgpu_vm_size == -1) 1567 return; 1568 1569 if (amdgpu_vm_size < 1) { 1570 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1571 amdgpu_vm_size); 1572 amdgpu_vm_size = -1; 1573 } 1574 } 1575 1576 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1577 { 1578 struct sysinfo si; 1579 bool is_os_64 = (sizeof(void *) == 8); 1580 uint64_t total_memory; 1581 uint64_t dram_size_seven_GB = 0x1B8000000; 1582 uint64_t dram_size_three_GB = 0xB8000000; 1583 1584 if (amdgpu_smu_memory_pool_size == 0) 1585 return; 1586 1587 if (!is_os_64) { 1588 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1589 goto def_value; 1590 } 1591 si_meminfo(&si); 1592 total_memory = (uint64_t)si.totalram * si.mem_unit; 1593 1594 if ((amdgpu_smu_memory_pool_size == 1) || 1595 (amdgpu_smu_memory_pool_size == 2)) { 1596 if (total_memory < dram_size_three_GB) 1597 goto def_value1; 1598 } else if ((amdgpu_smu_memory_pool_size == 4) || 1599 (amdgpu_smu_memory_pool_size == 8)) { 1600 if (total_memory < dram_size_seven_GB) 1601 goto def_value1; 1602 } else { 1603 DRM_WARN("Smu memory pool size not supported\n"); 1604 goto def_value; 1605 } 1606 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1607 1608 return; 1609 1610 def_value1: 1611 DRM_WARN("No enough system memory\n"); 1612 def_value: 1613 adev->pm.smu_prv_buffer_size = 0; 1614 } 1615 1616 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1617 { 1618 if (!(adev->flags & AMD_IS_APU) || 1619 adev->asic_type < CHIP_RAVEN) 1620 return 0; 1621 1622 switch (adev->asic_type) { 1623 case CHIP_RAVEN: 1624 if (adev->pdev->device == 0x15dd) 1625 adev->apu_flags |= AMD_APU_IS_RAVEN; 1626 if (adev->pdev->device == 0x15d8) 1627 adev->apu_flags |= AMD_APU_IS_PICASSO; 1628 break; 1629 case CHIP_RENOIR: 1630 if ((adev->pdev->device == 0x1636) || 1631 (adev->pdev->device == 0x164c)) 1632 adev->apu_flags |= AMD_APU_IS_RENOIR; 1633 else 1634 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1635 break; 1636 case CHIP_VANGOGH: 1637 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1638 break; 1639 case CHIP_YELLOW_CARP: 1640 break; 1641 case CHIP_CYAN_SKILLFISH: 1642 if ((adev->pdev->device == 0x13FE) || 1643 (adev->pdev->device == 0x143F)) 1644 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1645 break; 1646 default: 1647 break; 1648 } 1649 1650 return 0; 1651 } 1652 1653 /** 1654 * amdgpu_device_check_arguments - validate module params 1655 * 1656 * @adev: amdgpu_device pointer 1657 * 1658 * Validates certain module parameters and updates 1659 * the associated values used by the driver (all asics). 
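 *
 * For example, an out-of-range amdgpu.sched_jobs value is clamped to a
 * power of two of at least 4 (amdgpu.sched_jobs=5 becomes 8), and invalid
 * gart/gtt sizes fall back to -1 (the driver default).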
1660 */ 1661 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1662 { 1663 if (amdgpu_sched_jobs < 4) { 1664 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1665 amdgpu_sched_jobs); 1666 amdgpu_sched_jobs = 4; 1667 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1668 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1669 amdgpu_sched_jobs); 1670 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1671 } 1672 1673 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1674 /* gart size must be greater or equal to 32M */ 1675 dev_warn(adev->dev, "gart size (%d) too small\n", 1676 amdgpu_gart_size); 1677 amdgpu_gart_size = -1; 1678 } 1679 1680 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1681 /* gtt size must be greater or equal to 32M */ 1682 dev_warn(adev->dev, "gtt size (%d) too small\n", 1683 amdgpu_gtt_size); 1684 amdgpu_gtt_size = -1; 1685 } 1686 1687 /* valid range is between 4 and 9 inclusive */ 1688 if (amdgpu_vm_fragment_size != -1 && 1689 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1690 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1691 amdgpu_vm_fragment_size = -1; 1692 } 1693 1694 if (amdgpu_sched_hw_submission < 2) { 1695 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1696 amdgpu_sched_hw_submission); 1697 amdgpu_sched_hw_submission = 2; 1698 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1699 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1700 amdgpu_sched_hw_submission); 1701 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1702 } 1703 1704 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1705 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1706 amdgpu_reset_method = -1; 1707 } 1708 1709 amdgpu_device_check_smu_prv_buffer_size(adev); 1710 1711 amdgpu_device_check_vm_size(adev); 1712 1713 amdgpu_device_check_block_size(adev); 1714 1715 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1716 1717 return 0; 1718 } 1719 1720 /** 1721 * amdgpu_switcheroo_set_state - set switcheroo state 1722 * 1723 * @pdev: pci dev pointer 1724 * @state: vga_switcheroo state 1725 * 1726 * Callback for the switcheroo driver. Suspends or resumes 1727 * the asics before or after it is powered up using ACPI methods. 
1728 */ 1729 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1730 enum vga_switcheroo_state state) 1731 { 1732 struct drm_device *dev = pci_get_drvdata(pdev); 1733 int r; 1734 1735 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1736 return; 1737 1738 if (state == VGA_SWITCHEROO_ON) { 1739 pr_info("switched on\n"); 1740 /* don't suspend or resume card normally */ 1741 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1742 1743 pci_set_power_state(pdev, PCI_D0); 1744 amdgpu_device_load_pci_state(pdev); 1745 r = pci_enable_device(pdev); 1746 if (r) 1747 DRM_WARN("pci_enable_device failed (%d)\n", r); 1748 amdgpu_device_resume(dev, true); 1749 1750 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1751 } else { 1752 pr_info("switched off\n"); 1753 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1754 amdgpu_device_prepare(dev); 1755 amdgpu_device_suspend(dev, true); 1756 amdgpu_device_cache_pci_state(pdev); 1757 /* Shut down the device */ 1758 pci_disable_device(pdev); 1759 pci_set_power_state(pdev, PCI_D3cold); 1760 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1761 } 1762 } 1763 1764 /** 1765 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1766 * 1767 * @pdev: pci dev pointer 1768 * 1769 * Callback for the switcheroo driver. Check of the switcheroo 1770 * state can be changed. 1771 * Returns true if the state can be changed, false if not. 1772 */ 1773 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1774 { 1775 struct drm_device *dev = pci_get_drvdata(pdev); 1776 1777 /* 1778 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1779 * locking inversion with the driver load path. And the access here is 1780 * completely racy anyway. So don't bother with locking for now. 1781 */ 1782 return atomic_read(&dev->open_count) == 0; 1783 } 1784 1785 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1786 .set_gpu_state = amdgpu_switcheroo_set_state, 1787 .reprobe = NULL, 1788 .can_switch = amdgpu_switcheroo_can_switch, 1789 }; 1790 1791 /** 1792 * amdgpu_device_ip_set_clockgating_state - set the CG state 1793 * 1794 * @dev: amdgpu_device pointer 1795 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1796 * @state: clockgating state (gate or ungate) 1797 * 1798 * Sets the requested clockgating state for all instances of 1799 * the hardware IP specified. 1800 * Returns the error code from the last instance. 1801 */ 1802 int amdgpu_device_ip_set_clockgating_state(void *dev, 1803 enum amd_ip_block_type block_type, 1804 enum amd_clockgating_state state) 1805 { 1806 struct amdgpu_device *adev = dev; 1807 int i, r = 0; 1808 1809 for (i = 0; i < adev->num_ip_blocks; i++) { 1810 if (!adev->ip_blocks[i].status.valid) 1811 continue; 1812 if (adev->ip_blocks[i].version->type != block_type) 1813 continue; 1814 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1815 continue; 1816 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1817 (void *)adev, state); 1818 if (r) 1819 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1820 adev->ip_blocks[i].version->funcs->name, r); 1821 } 1822 return r; 1823 } 1824 1825 /** 1826 * amdgpu_device_ip_set_powergating_state - set the PG state 1827 * 1828 * @dev: amdgpu_device pointer 1829 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1830 * @state: powergating state (gate or ungate) 1831 * 1832 * Sets the requested powergating state for all instances of 1833 * the hardware IP specified. 
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;
}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;
}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
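 *
 * Example (illustrative): a caller that needs the GFX block descriptor can
 * use amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX) and must
 * check the result for NULL before dereferencing it.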
1943 */ 1944 struct amdgpu_ip_block * 1945 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1946 enum amd_ip_block_type type) 1947 { 1948 int i; 1949 1950 for (i = 0; i < adev->num_ip_blocks; i++) 1951 if (adev->ip_blocks[i].version->type == type) 1952 return &adev->ip_blocks[i]; 1953 1954 return NULL; 1955 } 1956 1957 /** 1958 * amdgpu_device_ip_block_version_cmp 1959 * 1960 * @adev: amdgpu_device pointer 1961 * @type: enum amd_ip_block_type 1962 * @major: major version 1963 * @minor: minor version 1964 * 1965 * return 0 if equal or greater 1966 * return 1 if smaller or the ip_block doesn't exist 1967 */ 1968 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1969 enum amd_ip_block_type type, 1970 u32 major, u32 minor) 1971 { 1972 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1973 1974 if (ip_block && ((ip_block->version->major > major) || 1975 ((ip_block->version->major == major) && 1976 (ip_block->version->minor >= minor)))) 1977 return 0; 1978 1979 return 1; 1980 } 1981 1982 /** 1983 * amdgpu_device_ip_block_add 1984 * 1985 * @adev: amdgpu_device pointer 1986 * @ip_block_version: pointer to the IP to add 1987 * 1988 * Adds the IP block driver information to the collection of IPs 1989 * on the asic. 1990 */ 1991 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1992 const struct amdgpu_ip_block_version *ip_block_version) 1993 { 1994 if (!ip_block_version) 1995 return -EINVAL; 1996 1997 switch (ip_block_version->type) { 1998 case AMD_IP_BLOCK_TYPE_VCN: 1999 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2000 return 0; 2001 break; 2002 case AMD_IP_BLOCK_TYPE_JPEG: 2003 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2004 return 0; 2005 break; 2006 default: 2007 break; 2008 } 2009 2010 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 2011 ip_block_version->funcs->name); 2012 2013 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2014 2015 return 0; 2016 } 2017 2018 /** 2019 * amdgpu_device_enable_virtual_display - enable virtual display feature 2020 * 2021 * @adev: amdgpu_device pointer 2022 * 2023 * Enabled the virtual display feature if the user has enabled it via 2024 * the module parameter virtual_display. This feature provides a virtual 2025 * display hardware on headless boards or in virtualized environments. 2026 * This function parses and validates the configuration string specified by 2027 * the user and configues the virtual display configuration (number of 2028 * virtual connectors, crtcs, etc.) specified. 
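 *
 * For example (addresses and counts here are only illustrative), booting with
 *
 *   amdgpu.virtual_display=0000:03:00.0,2
 *
 * enables two virtual crtcs on the device at PCI address 0000:03:00.0, and
 *
 *   amdgpu.virtual_display=all,1
 *
 * enables one virtual crtc on every amdgpu device. Entries are separated by
 * ';' and the crtc count is clamped to the range 1-6 (default 1).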
2029 */ 2030 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2031 { 2032 adev->enable_virtual_display = false; 2033 2034 if (amdgpu_virtual_display) { 2035 const char *pci_address_name = pci_name(adev->pdev); 2036 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2037 2038 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2039 pciaddstr_tmp = pciaddstr; 2040 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2041 pciaddname = strsep(&pciaddname_tmp, ","); 2042 if (!strcmp("all", pciaddname) 2043 || !strcmp(pci_address_name, pciaddname)) { 2044 long num_crtc; 2045 int res = -1; 2046 2047 adev->enable_virtual_display = true; 2048 2049 if (pciaddname_tmp) 2050 res = kstrtol(pciaddname_tmp, 10, 2051 &num_crtc); 2052 2053 if (!res) { 2054 if (num_crtc < 1) 2055 num_crtc = 1; 2056 if (num_crtc > 6) 2057 num_crtc = 6; 2058 adev->mode_info.num_crtc = num_crtc; 2059 } else { 2060 adev->mode_info.num_crtc = 1; 2061 } 2062 break; 2063 } 2064 } 2065 2066 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2067 amdgpu_virtual_display, pci_address_name, 2068 adev->enable_virtual_display, adev->mode_info.num_crtc); 2069 2070 kfree(pciaddstr); 2071 } 2072 } 2073 2074 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2075 { 2076 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2077 adev->mode_info.num_crtc = 1; 2078 adev->enable_virtual_display = true; 2079 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2080 adev->enable_virtual_display, adev->mode_info.num_crtc); 2081 } 2082 } 2083 2084 /** 2085 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2086 * 2087 * @adev: amdgpu_device pointer 2088 * 2089 * Parses the asic configuration parameters specified in the gpu info 2090 * firmware and makes them availale to the driver for use in configuring 2091 * the asic. 2092 * Returns 0 on success, -EINVAL on failure. 2093 */ 2094 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2095 { 2096 const char *chip_name; 2097 char fw_name[40]; 2098 int err; 2099 const struct gpu_info_firmware_header_v1_0 *hdr; 2100 2101 adev->firmware.gpu_info_fw = NULL; 2102 2103 if (adev->mman.discovery_bin) { 2104 /* 2105 * FIXME: The bounding box is still needed by Navi12, so 2106 * temporarily read it from gpu_info firmware. Should be dropped 2107 * when DAL no longer needs it. 
2108 */ 2109 if (adev->asic_type != CHIP_NAVI12) 2110 return 0; 2111 } 2112 2113 switch (adev->asic_type) { 2114 default: 2115 return 0; 2116 case CHIP_VEGA10: 2117 chip_name = "vega10"; 2118 break; 2119 case CHIP_VEGA12: 2120 chip_name = "vega12"; 2121 break; 2122 case CHIP_RAVEN: 2123 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2124 chip_name = "raven2"; 2125 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2126 chip_name = "picasso"; 2127 else 2128 chip_name = "raven"; 2129 break; 2130 case CHIP_ARCTURUS: 2131 chip_name = "arcturus"; 2132 break; 2133 case CHIP_NAVI12: 2134 chip_name = "navi12"; 2135 break; 2136 } 2137 2138 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2139 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 2140 if (err) { 2141 dev_err(adev->dev, 2142 "Failed to get gpu_info firmware \"%s\"\n", 2143 fw_name); 2144 goto out; 2145 } 2146 2147 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2148 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2149 2150 switch (hdr->version_major) { 2151 case 1: 2152 { 2153 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2154 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2155 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2156 2157 /* 2158 * Should be droped when DAL no longer needs it. 2159 */ 2160 if (adev->asic_type == CHIP_NAVI12) 2161 goto parse_soc_bounding_box; 2162 2163 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2164 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2165 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2166 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2167 adev->gfx.config.max_texture_channel_caches = 2168 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2169 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2170 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2171 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2172 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2173 adev->gfx.config.double_offchip_lds_buf = 2174 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2175 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2176 adev->gfx.cu_info.max_waves_per_simd = 2177 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2178 adev->gfx.cu_info.max_scratch_slots_per_cu = 2179 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2180 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2181 if (hdr->version_minor >= 1) { 2182 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2183 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2184 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2185 adev->gfx.config.num_sc_per_sh = 2186 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2187 adev->gfx.config.num_packer_per_sc = 2188 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2189 } 2190 2191 parse_soc_bounding_box: 2192 /* 2193 * soc bounding box info is not integrated in disocovery table, 2194 * we always need to parse it from gpu info firmware if needed. 
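 * (Only a gpu_info header with version_minor == 2 carries the SoC bounding
 * box, hence the minor-version check below.)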
2195 */ 2196 if (hdr->version_minor == 2) { 2197 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2198 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2199 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2200 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2201 } 2202 break; 2203 } 2204 default: 2205 dev_err(adev->dev, 2206 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2207 err = -EINVAL; 2208 goto out; 2209 } 2210 out: 2211 return err; 2212 } 2213 2214 /** 2215 * amdgpu_device_ip_early_init - run early init for hardware IPs 2216 * 2217 * @adev: amdgpu_device pointer 2218 * 2219 * Early initialization pass for hardware IPs. The hardware IPs that make 2220 * up each asic are discovered each IP's early_init callback is run. This 2221 * is the first stage in initializing the asic. 2222 * Returns 0 on success, negative error code on failure. 2223 */ 2224 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2225 { 2226 struct drm_device *dev = adev_to_drm(adev); 2227 struct pci_dev *parent; 2228 int i, r; 2229 bool total; 2230 2231 amdgpu_device_enable_virtual_display(adev); 2232 2233 if (amdgpu_sriov_vf(adev)) { 2234 r = amdgpu_virt_request_full_gpu(adev, true); 2235 if (r) 2236 return r; 2237 } 2238 2239 switch (adev->asic_type) { 2240 #ifdef CONFIG_DRM_AMDGPU_SI 2241 case CHIP_VERDE: 2242 case CHIP_TAHITI: 2243 case CHIP_PITCAIRN: 2244 case CHIP_OLAND: 2245 case CHIP_HAINAN: 2246 adev->family = AMDGPU_FAMILY_SI; 2247 r = si_set_ip_blocks(adev); 2248 if (r) 2249 return r; 2250 break; 2251 #endif 2252 #ifdef CONFIG_DRM_AMDGPU_CIK 2253 case CHIP_BONAIRE: 2254 case CHIP_HAWAII: 2255 case CHIP_KAVERI: 2256 case CHIP_KABINI: 2257 case CHIP_MULLINS: 2258 if (adev->flags & AMD_IS_APU) 2259 adev->family = AMDGPU_FAMILY_KV; 2260 else 2261 adev->family = AMDGPU_FAMILY_CI; 2262 2263 r = cik_set_ip_blocks(adev); 2264 if (r) 2265 return r; 2266 break; 2267 #endif 2268 case CHIP_TOPAZ: 2269 case CHIP_TONGA: 2270 case CHIP_FIJI: 2271 case CHIP_POLARIS10: 2272 case CHIP_POLARIS11: 2273 case CHIP_POLARIS12: 2274 case CHIP_VEGAM: 2275 case CHIP_CARRIZO: 2276 case CHIP_STONEY: 2277 if (adev->flags & AMD_IS_APU) 2278 adev->family = AMDGPU_FAMILY_CZ; 2279 else 2280 adev->family = AMDGPU_FAMILY_VI; 2281 2282 r = vi_set_ip_blocks(adev); 2283 if (r) 2284 return r; 2285 break; 2286 default: 2287 r = amdgpu_discovery_set_ip_blocks(adev); 2288 if (r) 2289 return r; 2290 break; 2291 } 2292 2293 if (amdgpu_has_atpx() && 2294 (amdgpu_is_atpx_hybrid() || 2295 amdgpu_has_atpx_dgpu_power_cntl()) && 2296 ((adev->flags & AMD_IS_APU) == 0) && 2297 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2298 adev->flags |= AMD_IS_PX; 2299 2300 if (!(adev->flags & AMD_IS_APU)) { 2301 parent = pcie_find_root_port(adev->pdev); 2302 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2303 } 2304 2305 2306 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2307 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2308 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2309 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2310 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2311 if (!amdgpu_device_pcie_dynamic_switching_supported()) 2312 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2313 2314 total = true; 2315 for (i = 0; i < adev->num_ip_blocks; i++) { 2316 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2317 DRM_WARN("disabled ip block: %d <%s>\n", 2318 i, adev->ip_blocks[i].version->funcs->name); 2319 adev->ip_blocks[i].status.valid = false; 2320 } else { 2321 if (adev->ip_blocks[i].version->funcs->early_init) { 2322 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2323 if (r == -ENOENT) { 2324 adev->ip_blocks[i].status.valid = false; 2325 } else if (r) { 2326 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2327 adev->ip_blocks[i].version->funcs->name, r); 2328 total = false; 2329 } else { 2330 adev->ip_blocks[i].status.valid = true; 2331 } 2332 } else { 2333 adev->ip_blocks[i].status.valid = true; 2334 } 2335 } 2336 /* get the vbios after the asic_funcs are set up */ 2337 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2338 r = amdgpu_device_parse_gpu_info_fw(adev); 2339 if (r) 2340 return r; 2341 2342 /* Read BIOS */ 2343 if (amdgpu_device_read_bios(adev)) { 2344 if (!amdgpu_get_bios(adev)) 2345 return -EINVAL; 2346 2347 r = amdgpu_atombios_init(adev); 2348 if (r) { 2349 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2350 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2351 return r; 2352 } 2353 } 2354 2355 /*get pf2vf msg info at it's earliest time*/ 2356 if (amdgpu_sriov_vf(adev)) 2357 amdgpu_virt_init_data_exchange(adev); 2358 2359 } 2360 } 2361 if (!total) 2362 return -ENODEV; 2363 2364 amdgpu_amdkfd_device_probe(adev); 2365 adev->cg_flags &= amdgpu_cg_mask; 2366 adev->pg_flags &= amdgpu_pg_mask; 2367 2368 return 0; 2369 } 2370 2371 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2372 { 2373 int i, r; 2374 2375 for (i = 0; i < adev->num_ip_blocks; i++) { 2376 if (!adev->ip_blocks[i].status.sw) 2377 continue; 2378 if (adev->ip_blocks[i].status.hw) 2379 continue; 2380 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2381 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2382 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2383 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2384 if (r) { 2385 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2386 adev->ip_blocks[i].version->funcs->name, r); 2387 return r; 2388 } 2389 adev->ip_blocks[i].status.hw = true; 2390 } 2391 } 2392 2393 return 0; 2394 } 2395 2396 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2397 { 2398 int i, r; 2399 2400 for (i = 0; i < adev->num_ip_blocks; i++) { 2401 if (!adev->ip_blocks[i].status.sw) 2402 continue; 2403 if (adev->ip_blocks[i].status.hw) 2404 continue; 2405 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2406 if (r) { 2407 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2408 adev->ip_blocks[i].version->funcs->name, r); 2409 return r; 2410 } 2411 adev->ip_blocks[i].status.hw = true; 2412 } 2413 2414 return 0; 2415 } 2416 2417 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2418 { 2419 int r = 0; 2420 int i; 2421 uint32_t 
smu_version; 2422 2423 if (adev->asic_type >= CHIP_VEGA10) { 2424 for (i = 0; i < adev->num_ip_blocks; i++) { 2425 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2426 continue; 2427 2428 if (!adev->ip_blocks[i].status.sw) 2429 continue; 2430 2431 /* no need to do the fw loading again if already done*/ 2432 if (adev->ip_blocks[i].status.hw == true) 2433 break; 2434 2435 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2436 r = adev->ip_blocks[i].version->funcs->resume(adev); 2437 if (r) { 2438 DRM_ERROR("resume of IP block <%s> failed %d\n", 2439 adev->ip_blocks[i].version->funcs->name, r); 2440 return r; 2441 } 2442 } else { 2443 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2444 if (r) { 2445 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2446 adev->ip_blocks[i].version->funcs->name, r); 2447 return r; 2448 } 2449 } 2450 2451 adev->ip_blocks[i].status.hw = true; 2452 break; 2453 } 2454 } 2455 2456 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2457 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2458 2459 return r; 2460 } 2461 2462 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2463 { 2464 long timeout; 2465 int r, i; 2466 2467 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2468 struct amdgpu_ring *ring = adev->rings[i]; 2469 2470 /* No need to setup the GPU scheduler for rings that don't need it */ 2471 if (!ring || ring->no_scheduler) 2472 continue; 2473 2474 switch (ring->funcs->type) { 2475 case AMDGPU_RING_TYPE_GFX: 2476 timeout = adev->gfx_timeout; 2477 break; 2478 case AMDGPU_RING_TYPE_COMPUTE: 2479 timeout = adev->compute_timeout; 2480 break; 2481 case AMDGPU_RING_TYPE_SDMA: 2482 timeout = adev->sdma_timeout; 2483 break; 2484 default: 2485 timeout = adev->video_timeout; 2486 break; 2487 } 2488 2489 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2490 DRM_SCHED_PRIORITY_COUNT, 2491 ring->num_hw_submission, 0, 2492 timeout, adev->reset_domain->wq, 2493 ring->sched_score, ring->name, 2494 adev->dev); 2495 if (r) { 2496 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2497 ring->name); 2498 return r; 2499 } 2500 } 2501 2502 amdgpu_xcp_update_partition_sched_list(adev); 2503 2504 return 0; 2505 } 2506 2507 2508 /** 2509 * amdgpu_device_ip_init - run init for hardware IPs 2510 * 2511 * @adev: amdgpu_device pointer 2512 * 2513 * Main initialization pass for hardware IPs. The list of all the hardware 2514 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2515 * are run. sw_init initializes the software state associated with each IP 2516 * and hw_init initializes the hardware associated with each IP. 2517 * Returns 0 on success, negative error code on failure. 
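 *
 * Roughly, the ordering implemented here is:
 *
 *   amdgpu_ras_init()
 *   sw_init for every valid IP block
 *     (COMMON and GMC additionally get an early hw_init, along with
 *      scratch/writeback/CSA setup, and the IB pool and ucode BO are
 *      created once sw_init is complete)
 *   amdgpu_device_ip_hw_init_phase1()  - COMMON, IH (and PSP under SR-IOV)
 *   amdgpu_device_fw_loading()         - PSP resume/init and SMU firmware
 *   amdgpu_device_ip_hw_init_phase2()  - all remaining IP blocks
 *   RAS recovery init, XGMI/reset-domain handling and scheduler init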
2518 */ 2519 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2520 { 2521 int i, r; 2522 2523 r = amdgpu_ras_init(adev); 2524 if (r) 2525 return r; 2526 2527 for (i = 0; i < adev->num_ip_blocks; i++) { 2528 if (!adev->ip_blocks[i].status.valid) 2529 continue; 2530 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2531 if (r) { 2532 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2533 adev->ip_blocks[i].version->funcs->name, r); 2534 goto init_failed; 2535 } 2536 adev->ip_blocks[i].status.sw = true; 2537 2538 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2539 /* need to do common hw init early so everything is set up for gmc */ 2540 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2541 if (r) { 2542 DRM_ERROR("hw_init %d failed %d\n", i, r); 2543 goto init_failed; 2544 } 2545 adev->ip_blocks[i].status.hw = true; 2546 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2547 /* need to do gmc hw init early so we can allocate gpu mem */ 2548 /* Try to reserve bad pages early */ 2549 if (amdgpu_sriov_vf(adev)) 2550 amdgpu_virt_exchange_data(adev); 2551 2552 r = amdgpu_device_mem_scratch_init(adev); 2553 if (r) { 2554 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2555 goto init_failed; 2556 } 2557 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2558 if (r) { 2559 DRM_ERROR("hw_init %d failed %d\n", i, r); 2560 goto init_failed; 2561 } 2562 r = amdgpu_device_wb_init(adev); 2563 if (r) { 2564 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2565 goto init_failed; 2566 } 2567 adev->ip_blocks[i].status.hw = true; 2568 2569 /* right after GMC hw init, we create CSA */ 2570 if (adev->gfx.mcbp) { 2571 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2572 AMDGPU_GEM_DOMAIN_VRAM | 2573 AMDGPU_GEM_DOMAIN_GTT, 2574 AMDGPU_CSA_SIZE); 2575 if (r) { 2576 DRM_ERROR("allocate CSA failed %d\n", r); 2577 goto init_failed; 2578 } 2579 } 2580 } 2581 } 2582 2583 if (amdgpu_sriov_vf(adev)) 2584 amdgpu_virt_init_data_exchange(adev); 2585 2586 r = amdgpu_ib_pool_init(adev); 2587 if (r) { 2588 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2589 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2590 goto init_failed; 2591 } 2592 2593 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2594 if (r) 2595 goto init_failed; 2596 2597 r = amdgpu_device_ip_hw_init_phase1(adev); 2598 if (r) 2599 goto init_failed; 2600 2601 r = amdgpu_device_fw_loading(adev); 2602 if (r) 2603 goto init_failed; 2604 2605 r = amdgpu_device_ip_hw_init_phase2(adev); 2606 if (r) 2607 goto init_failed; 2608 2609 /* 2610 * retired pages will be loaded from eeprom and reserved here, 2611 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2612 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2613 * for I2C communication which only true at this point. 2614 * 2615 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2616 * failure from bad gpu situation and stop amdgpu init process 2617 * accordingly. For other failed cases, it will still release all 2618 * the resource and print error message, rather than returning one 2619 * negative value to upper level. 
2620 * 2621 * Note: theoretically, this should be called before all vram allocations 2622 * to protect retired page from abusing 2623 */ 2624 r = amdgpu_ras_recovery_init(adev); 2625 if (r) 2626 goto init_failed; 2627 2628 /** 2629 * In case of XGMI grab extra reference for reset domain for this device 2630 */ 2631 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2632 if (amdgpu_xgmi_add_device(adev) == 0) { 2633 if (!amdgpu_sriov_vf(adev)) { 2634 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2635 2636 if (WARN_ON(!hive)) { 2637 r = -ENOENT; 2638 goto init_failed; 2639 } 2640 2641 if (!hive->reset_domain || 2642 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2643 r = -ENOENT; 2644 amdgpu_put_xgmi_hive(hive); 2645 goto init_failed; 2646 } 2647 2648 /* Drop the early temporary reset domain we created for device */ 2649 amdgpu_reset_put_reset_domain(adev->reset_domain); 2650 adev->reset_domain = hive->reset_domain; 2651 amdgpu_put_xgmi_hive(hive); 2652 } 2653 } 2654 } 2655 2656 r = amdgpu_device_init_schedulers(adev); 2657 if (r) 2658 goto init_failed; 2659 2660 if (adev->mman.buffer_funcs_ring->sched.ready) 2661 amdgpu_ttm_set_buffer_funcs_status(adev, true); 2662 2663 /* Don't init kfd if whole hive need to be reset during init */ 2664 if (!adev->gmc.xgmi.pending_reset) { 2665 kgd2kfd_init_zone_device(adev); 2666 amdgpu_amdkfd_device_init(adev); 2667 } 2668 2669 amdgpu_fru_get_product_info(adev); 2670 2671 init_failed: 2672 2673 return r; 2674 } 2675 2676 /** 2677 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2678 * 2679 * @adev: amdgpu_device pointer 2680 * 2681 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2682 * this function before a GPU reset. If the value is retained after a 2683 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2684 */ 2685 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2686 { 2687 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2688 } 2689 2690 /** 2691 * amdgpu_device_check_vram_lost - check if vram is valid 2692 * 2693 * @adev: amdgpu_device pointer 2694 * 2695 * Checks the reset magic value written to the gart pointer in VRAM. 2696 * The driver calls this after a GPU reset to see if the contents of 2697 * VRAM is lost or now. 2698 * returns true if vram is lost, false if not. 2699 */ 2700 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2701 { 2702 if (memcmp(adev->gart.ptr, adev->reset_magic, 2703 AMDGPU_RESET_MAGIC_NUM)) 2704 return true; 2705 2706 if (!amdgpu_in_reset(adev)) 2707 return false; 2708 2709 /* 2710 * For all ASICs with baco/mode1 reset, the VRAM is 2711 * always assumed to be lost. 2712 */ 2713 switch (amdgpu_asic_reset_method(adev)) { 2714 case AMD_RESET_METHOD_BACO: 2715 case AMD_RESET_METHOD_MODE1: 2716 return true; 2717 default: 2718 return false; 2719 } 2720 } 2721 2722 /** 2723 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2724 * 2725 * @adev: amdgpu_device pointer 2726 * @state: clockgating state (gate or ungate) 2727 * 2728 * The list of all the hardware IPs that make up the asic is walked and the 2729 * set_clockgating_state callbacks are run. 2730 * Late initialization pass enabling clockgating for hardware IPs. 2731 * Fini or suspend, pass disabling clockgating for hardware IPs. 2732 * Returns 0 on success, negative error code on failure. 
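 *
 * For example, amdgpu_device_ip_late_init() below gates clocks with
 *
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
 *
 * while the early-fini and suspend paths pass AMD_CG_STATE_UNGATE to undo it.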
2733 */ 2734 2735 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2736 enum amd_clockgating_state state) 2737 { 2738 int i, j, r; 2739 2740 if (amdgpu_emu_mode == 1) 2741 return 0; 2742 2743 for (j = 0; j < adev->num_ip_blocks; j++) { 2744 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2745 if (!adev->ip_blocks[i].status.late_initialized) 2746 continue; 2747 /* skip CG for GFX, SDMA on S0ix */ 2748 if (adev->in_s0ix && 2749 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2750 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2751 continue; 2752 /* skip CG for VCE/UVD, it's handled specially */ 2753 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2754 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2755 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2756 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2757 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2758 /* enable clockgating to save power */ 2759 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2760 state); 2761 if (r) { 2762 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2763 adev->ip_blocks[i].version->funcs->name, r); 2764 return r; 2765 } 2766 } 2767 } 2768 2769 return 0; 2770 } 2771 2772 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2773 enum amd_powergating_state state) 2774 { 2775 int i, j, r; 2776 2777 if (amdgpu_emu_mode == 1) 2778 return 0; 2779 2780 for (j = 0; j < adev->num_ip_blocks; j++) { 2781 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2782 if (!adev->ip_blocks[i].status.late_initialized) 2783 continue; 2784 /* skip PG for GFX, SDMA on S0ix */ 2785 if (adev->in_s0ix && 2786 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2787 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2788 continue; 2789 /* skip CG for VCE/UVD, it's handled specially */ 2790 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2791 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2792 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2793 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2794 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2795 /* enable powergating to save power */ 2796 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2797 state); 2798 if (r) { 2799 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2800 adev->ip_blocks[i].version->funcs->name, r); 2801 return r; 2802 } 2803 } 2804 } 2805 return 0; 2806 } 2807 2808 static int amdgpu_device_enable_mgpu_fan_boost(void) 2809 { 2810 struct amdgpu_gpu_instance *gpu_ins; 2811 struct amdgpu_device *adev; 2812 int i, ret = 0; 2813 2814 mutex_lock(&mgpu_info.mutex); 2815 2816 /* 2817 * MGPU fan boost feature should be enabled 2818 * only when there are two or more dGPUs in 2819 * the system 2820 */ 2821 if (mgpu_info.num_dgpu < 2) 2822 goto out; 2823 2824 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2825 gpu_ins = &(mgpu_info.gpu_ins[i]); 2826 adev = gpu_ins->adev; 2827 if (!(adev->flags & AMD_IS_APU) && 2828 !gpu_ins->mgpu_fan_enabled) { 2829 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2830 if (ret) 2831 break; 2832 2833 gpu_ins->mgpu_fan_enabled = 1; 2834 } 2835 } 2836 2837 out: 2838 mutex_unlock(&mgpu_info.mutex); 2839 2840 return ret; 2841 } 2842 2843 /** 2844 * amdgpu_device_ip_late_init - run late init for hardware IPs 2845 * 2846 * @adev: 
amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of the IPs have been initialized, or anything that needs to
 * happen late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	r = amdgpu_ras_late_init(adev);
	if (r) {
		DRM_ERROR("amdgpu_ras_late_init failed %d", r);
		return r;
	}

	amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */
	if (amdgpu_passthrough(adev) &&
	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
	     adev->asic_type == CHIP_ALDEBARAN))
		amdgpu_dpm_handle_passthrough_sbr(adev, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset the device p-state to low, as it was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, the number of devices in a hive is not known in
		 * advance; it is counted one by one as the devices initialize.
		 *
		 * So we wait for all XGMI-interlinked devices to be initialized.
		 * This may bring some delays as those devices may come from
		 * different hives. But that should be OK.
2912 */ 2913 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2914 for (i = 0; i < mgpu_info.num_gpu; i++) { 2915 gpu_instance = &(mgpu_info.gpu_ins[i]); 2916 if (gpu_instance->adev->flags & AMD_IS_APU) 2917 continue; 2918 2919 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2920 AMDGPU_XGMI_PSTATE_MIN); 2921 if (r) { 2922 DRM_ERROR("pstate setting failed (%d).\n", r); 2923 break; 2924 } 2925 } 2926 } 2927 2928 mutex_unlock(&mgpu_info.mutex); 2929 } 2930 2931 return 0; 2932 } 2933 2934 /** 2935 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2936 * 2937 * @adev: amdgpu_device pointer 2938 * 2939 * For ASICs need to disable SMC first 2940 */ 2941 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2942 { 2943 int i, r; 2944 2945 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 2946 return; 2947 2948 for (i = 0; i < adev->num_ip_blocks; i++) { 2949 if (!adev->ip_blocks[i].status.hw) 2950 continue; 2951 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2952 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2953 /* XXX handle errors */ 2954 if (r) { 2955 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2956 adev->ip_blocks[i].version->funcs->name, r); 2957 } 2958 adev->ip_blocks[i].status.hw = false; 2959 break; 2960 } 2961 } 2962 } 2963 2964 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2965 { 2966 int i, r; 2967 2968 for (i = 0; i < adev->num_ip_blocks; i++) { 2969 if (!adev->ip_blocks[i].version->funcs->early_fini) 2970 continue; 2971 2972 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2973 if (r) { 2974 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2975 adev->ip_blocks[i].version->funcs->name, r); 2976 } 2977 } 2978 2979 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2980 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2981 2982 amdgpu_amdkfd_suspend(adev, false); 2983 2984 /* Workaroud for ASICs need to disable SMC first */ 2985 amdgpu_device_smu_fini_early(adev); 2986 2987 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2988 if (!adev->ip_blocks[i].status.hw) 2989 continue; 2990 2991 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2992 /* XXX handle errors */ 2993 if (r) { 2994 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2995 adev->ip_blocks[i].version->funcs->name, r); 2996 } 2997 2998 adev->ip_blocks[i].status.hw = false; 2999 } 3000 3001 if (amdgpu_sriov_vf(adev)) { 3002 if (amdgpu_virt_release_full_gpu(adev, false)) 3003 DRM_ERROR("failed to release exclusive mode on fini\n"); 3004 } 3005 3006 return 0; 3007 } 3008 3009 /** 3010 * amdgpu_device_ip_fini - run fini for hardware IPs 3011 * 3012 * @adev: amdgpu_device pointer 3013 * 3014 * Main teardown pass for hardware IPs. The list of all the hardware 3015 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3016 * are run. hw_fini tears down the hardware associated with each IP 3017 * and sw_fini tears down any software state associated with each IP. 3018 * Returns 0 on success, negative error code on failure. 
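 *
 * The teardown mirrors init: sw_fini runs over the IP blocks in reverse
 * order (the GMC block additionally frees the ucode BO, static CSA,
 * writeback and scratch memory and the IB pool), followed by late_fini in
 * reverse order and finally amdgpu_ras_fini().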
3019 */ 3020 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3021 { 3022 int i, r; 3023 3024 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3025 amdgpu_virt_release_ras_err_handler_data(adev); 3026 3027 if (adev->gmc.xgmi.num_physical_nodes > 1) 3028 amdgpu_xgmi_remove_device(adev); 3029 3030 amdgpu_amdkfd_device_fini_sw(adev); 3031 3032 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3033 if (!adev->ip_blocks[i].status.sw) 3034 continue; 3035 3036 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3037 amdgpu_ucode_free_bo(adev); 3038 amdgpu_free_static_csa(&adev->virt.csa_obj); 3039 amdgpu_device_wb_fini(adev); 3040 amdgpu_device_mem_scratch_fini(adev); 3041 amdgpu_ib_pool_fini(adev); 3042 } 3043 3044 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 3045 /* XXX handle errors */ 3046 if (r) { 3047 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3048 adev->ip_blocks[i].version->funcs->name, r); 3049 } 3050 adev->ip_blocks[i].status.sw = false; 3051 adev->ip_blocks[i].status.valid = false; 3052 } 3053 3054 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3055 if (!adev->ip_blocks[i].status.late_initialized) 3056 continue; 3057 if (adev->ip_blocks[i].version->funcs->late_fini) 3058 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 3059 adev->ip_blocks[i].status.late_initialized = false; 3060 } 3061 3062 amdgpu_ras_fini(adev); 3063 3064 return 0; 3065 } 3066 3067 /** 3068 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3069 * 3070 * @work: work_struct. 3071 */ 3072 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3073 { 3074 struct amdgpu_device *adev = 3075 container_of(work, struct amdgpu_device, delayed_init_work.work); 3076 int r; 3077 3078 r = amdgpu_ib_ring_tests(adev); 3079 if (r) 3080 DRM_ERROR("ib ring test failed (%d).\n", r); 3081 } 3082 3083 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3084 { 3085 struct amdgpu_device *adev = 3086 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3087 3088 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3089 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3090 3091 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3092 adev->gfx.gfx_off_state = true; 3093 } 3094 3095 /** 3096 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3097 * 3098 * @adev: amdgpu_device pointer 3099 * 3100 * Main suspend function for hardware IPs. The list of all the hardware 3101 * IPs that make up the asic is walked, clockgating is disabled and the 3102 * suspend callbacks are run. suspend puts the hardware and software state 3103 * in each IP into a state suitable for suspend. 3104 * Returns 0 on success, negative error code on failure. 3105 */ 3106 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3107 { 3108 int i, r; 3109 3110 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3111 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3112 3113 /* 3114 * Per PMFW team's suggestion, driver needs to handle gfxoff 3115 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3116 * scenario. Add the missing df cstate disablement here. 
3117 */ 3118 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3119 dev_warn(adev->dev, "Failed to disallow df cstate"); 3120 3121 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3122 if (!adev->ip_blocks[i].status.valid) 3123 continue; 3124 3125 /* displays are handled separately */ 3126 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3127 continue; 3128 3129 /* XXX handle errors */ 3130 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3131 /* XXX handle errors */ 3132 if (r) { 3133 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3134 adev->ip_blocks[i].version->funcs->name, r); 3135 return r; 3136 } 3137 3138 adev->ip_blocks[i].status.hw = false; 3139 } 3140 3141 return 0; 3142 } 3143 3144 /** 3145 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3146 * 3147 * @adev: amdgpu_device pointer 3148 * 3149 * Main suspend function for hardware IPs. The list of all the hardware 3150 * IPs that make up the asic is walked, clockgating is disabled and the 3151 * suspend callbacks are run. suspend puts the hardware and software state 3152 * in each IP into a state suitable for suspend. 3153 * Returns 0 on success, negative error code on failure. 3154 */ 3155 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3156 { 3157 int i, r; 3158 3159 if (adev->in_s0ix) 3160 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3161 3162 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3163 if (!adev->ip_blocks[i].status.valid) 3164 continue; 3165 /* displays are handled in phase1 */ 3166 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3167 continue; 3168 /* PSP lost connection when err_event_athub occurs */ 3169 if (amdgpu_ras_intr_triggered() && 3170 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3171 adev->ip_blocks[i].status.hw = false; 3172 continue; 3173 } 3174 3175 /* skip unnecessary suspend if we do not initialize them yet */ 3176 if (adev->gmc.xgmi.pending_reset && 3177 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3178 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3179 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3180 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3181 adev->ip_blocks[i].status.hw = false; 3182 continue; 3183 } 3184 3185 /* skip suspend of gfx/mes and psp for S0ix 3186 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3187 * like at runtime. PSP is also part of the always on hardware 3188 * so no need to suspend it. 3189 */ 3190 if (adev->in_s0ix && 3191 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3192 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3193 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3194 continue; 3195 3196 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3197 if (adev->in_s0ix && 3198 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3199 IP_VERSION(5, 0, 0)) && 3200 (adev->ip_blocks[i].version->type == 3201 AMD_IP_BLOCK_TYPE_SDMA)) 3202 continue; 3203 3204 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3205 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3206 * from this location and RLC Autoload automatically also gets loaded 3207 * from here based on PMFW -> PSP message during re-init sequence. 3208 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3209 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3210 */ 3211 if (amdgpu_in_reset(adev) && 3212 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3213 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3214 continue; 3215 3216 /* XXX handle errors */ 3217 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3218 /* XXX handle errors */ 3219 if (r) { 3220 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3221 adev->ip_blocks[i].version->funcs->name, r); 3222 } 3223 adev->ip_blocks[i].status.hw = false; 3224 /* handle putting the SMC in the appropriate state */ 3225 if (!amdgpu_sriov_vf(adev)) { 3226 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3227 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3228 if (r) { 3229 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3230 adev->mp1_state, r); 3231 return r; 3232 } 3233 } 3234 } 3235 } 3236 3237 return 0; 3238 } 3239 3240 /** 3241 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3242 * 3243 * @adev: amdgpu_device pointer 3244 * 3245 * Main suspend function for hardware IPs. The list of all the hardware 3246 * IPs that make up the asic is walked, clockgating is disabled and the 3247 * suspend callbacks are run. suspend puts the hardware and software state 3248 * in each IP into a state suitable for suspend. 3249 * Returns 0 on success, negative error code on failure. 3250 */ 3251 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3252 { 3253 int r; 3254 3255 if (amdgpu_sriov_vf(adev)) { 3256 amdgpu_virt_fini_data_exchange(adev); 3257 amdgpu_virt_request_full_gpu(adev, false); 3258 } 3259 3260 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3261 3262 r = amdgpu_device_ip_suspend_phase1(adev); 3263 if (r) 3264 return r; 3265 r = amdgpu_device_ip_suspend_phase2(adev); 3266 3267 if (amdgpu_sriov_vf(adev)) 3268 amdgpu_virt_release_full_gpu(adev, false); 3269 3270 return r; 3271 } 3272 3273 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3274 { 3275 int i, r; 3276 3277 static enum amd_ip_block_type ip_order[] = { 3278 AMD_IP_BLOCK_TYPE_COMMON, 3279 AMD_IP_BLOCK_TYPE_GMC, 3280 AMD_IP_BLOCK_TYPE_PSP, 3281 AMD_IP_BLOCK_TYPE_IH, 3282 }; 3283 3284 for (i = 0; i < adev->num_ip_blocks; i++) { 3285 int j; 3286 struct amdgpu_ip_block *block; 3287 3288 block = &adev->ip_blocks[i]; 3289 block->status.hw = false; 3290 3291 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3292 3293 if (block->version->type != ip_order[j] || 3294 !block->status.valid) 3295 continue; 3296 3297 r = block->version->funcs->hw_init(adev); 3298 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3299 if (r) 3300 return r; 3301 block->status.hw = true; 3302 } 3303 } 3304 3305 return 0; 3306 } 3307 3308 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3309 { 3310 int i, r; 3311 3312 static enum amd_ip_block_type ip_order[] = { 3313 AMD_IP_BLOCK_TYPE_SMC, 3314 AMD_IP_BLOCK_TYPE_DCE, 3315 AMD_IP_BLOCK_TYPE_GFX, 3316 AMD_IP_BLOCK_TYPE_SDMA, 3317 AMD_IP_BLOCK_TYPE_MES, 3318 AMD_IP_BLOCK_TYPE_UVD, 3319 AMD_IP_BLOCK_TYPE_VCE, 3320 AMD_IP_BLOCK_TYPE_VCN, 3321 AMD_IP_BLOCK_TYPE_JPEG 3322 }; 3323 3324 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3325 int j; 3326 struct amdgpu_ip_block *block; 3327 3328 for (j = 0; j < adev->num_ip_blocks; j++) { 3329 block = &adev->ip_blocks[j]; 3330 3331 if (block->version->type != ip_order[i] || 3332 !block->status.valid || 3333 block->status.hw) 3334 continue; 3335 3336 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3337 r = block->version->funcs->resume(adev); 3338 else 
3339 r = block->version->funcs->hw_init(adev); 3340 3341 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3342 if (r) 3343 return r; 3344 block->status.hw = true; 3345 } 3346 } 3347 3348 return 0; 3349 } 3350 3351 /** 3352 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3353 * 3354 * @adev: amdgpu_device pointer 3355 * 3356 * First resume function for hardware IPs. The list of all the hardware 3357 * IPs that make up the asic is walked and the resume callbacks are run for 3358 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3359 * after a suspend and updates the software state as necessary. This 3360 * function is also used for restoring the GPU after a GPU reset. 3361 * Returns 0 on success, negative error code on failure. 3362 */ 3363 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3364 { 3365 int i, r; 3366 3367 for (i = 0; i < adev->num_ip_blocks; i++) { 3368 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3369 continue; 3370 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3371 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3372 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3373 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3374 3375 r = adev->ip_blocks[i].version->funcs->resume(adev); 3376 if (r) { 3377 DRM_ERROR("resume of IP block <%s> failed %d\n", 3378 adev->ip_blocks[i].version->funcs->name, r); 3379 return r; 3380 } 3381 adev->ip_blocks[i].status.hw = true; 3382 } 3383 } 3384 3385 return 0; 3386 } 3387 3388 /** 3389 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3390 * 3391 * @adev: amdgpu_device pointer 3392 * 3393 * First resume function for hardware IPs. The list of all the hardware 3394 * IPs that make up the asic is walked and the resume callbacks are run for 3395 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3396 * functional state after a suspend and updates the software state as 3397 * necessary. This function is also used for restoring the GPU after a GPU 3398 * reset. 3399 * Returns 0 on success, negative error code on failure. 3400 */ 3401 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3402 { 3403 int i, r; 3404 3405 for (i = 0; i < adev->num_ip_blocks; i++) { 3406 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3407 continue; 3408 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3409 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3410 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3411 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3412 continue; 3413 r = adev->ip_blocks[i].version->funcs->resume(adev); 3414 if (r) { 3415 DRM_ERROR("resume of IP block <%s> failed %d\n", 3416 adev->ip_blocks[i].version->funcs->name, r); 3417 return r; 3418 } 3419 adev->ip_blocks[i].status.hw = true; 3420 } 3421 3422 return 0; 3423 } 3424 3425 /** 3426 * amdgpu_device_ip_resume - run resume for hardware IPs 3427 * 3428 * @adev: amdgpu_device pointer 3429 * 3430 * Main resume function for hardware IPs. The hardware IPs 3431 * are split into two resume functions because they are 3432 * also used in recovering from a GPU reset and some additional 3433 * steps need to be take between them. In this case (S3/S4) they are 3434 * run sequentially. 3435 * Returns 0 on success, negative error code on failure. 
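 *
 * In the S3/S4 case the sequence below is simply:
 *
 *   amdgpu_device_ip_resume_phase1()  - COMMON, GMC, IH (plus PSP for SR-IOV)
 *   amdgpu_device_fw_loading()
 *   amdgpu_device_ip_resume_phase2()  - all remaining IP blocks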
3436 */ 3437 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3438 { 3439 int r; 3440 3441 r = amdgpu_device_ip_resume_phase1(adev); 3442 if (r) 3443 return r; 3444 3445 r = amdgpu_device_fw_loading(adev); 3446 if (r) 3447 return r; 3448 3449 r = amdgpu_device_ip_resume_phase2(adev); 3450 3451 if (adev->mman.buffer_funcs_ring->sched.ready) 3452 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3453 3454 return r; 3455 } 3456 3457 /** 3458 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3459 * 3460 * @adev: amdgpu_device pointer 3461 * 3462 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3463 */ 3464 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3465 { 3466 if (amdgpu_sriov_vf(adev)) { 3467 if (adev->is_atom_fw) { 3468 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3469 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3470 } else { 3471 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3472 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3473 } 3474 3475 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3476 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3477 } 3478 } 3479 3480 /** 3481 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3482 * 3483 * @asic_type: AMD asic type 3484 * 3485 * Check if there is DC (new modesetting infrastructre) support for an asic. 3486 * returns true if DC has support, false if not. 3487 */ 3488 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3489 { 3490 switch (asic_type) { 3491 #ifdef CONFIG_DRM_AMDGPU_SI 3492 case CHIP_HAINAN: 3493 #endif 3494 case CHIP_TOPAZ: 3495 /* chips with no display hardware */ 3496 return false; 3497 #if defined(CONFIG_DRM_AMD_DC) 3498 case CHIP_TAHITI: 3499 case CHIP_PITCAIRN: 3500 case CHIP_VERDE: 3501 case CHIP_OLAND: 3502 /* 3503 * We have systems in the wild with these ASICs that require 3504 * LVDS and VGA support which is not supported with DC. 3505 * 3506 * Fallback to the non-DC driver here by default so as not to 3507 * cause regressions. 3508 */ 3509 #if defined(CONFIG_DRM_AMD_DC_SI) 3510 return amdgpu_dc > 0; 3511 #else 3512 return false; 3513 #endif 3514 case CHIP_BONAIRE: 3515 case CHIP_KAVERI: 3516 case CHIP_KABINI: 3517 case CHIP_MULLINS: 3518 /* 3519 * We have systems in the wild with these ASICs that require 3520 * VGA support which is not supported with DC. 3521 * 3522 * Fallback to the non-DC driver here by default so as not to 3523 * cause regressions. 
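 *
 * Users can still opt in explicitly, e.g. by booting with
 *
 *   amdgpu.dc=1
 *
 * which makes the amdgpu_dc check below select DC on these ASICs.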
3524 */ 3525 return amdgpu_dc > 0; 3526 default: 3527 return amdgpu_dc != 0; 3528 #else 3529 default: 3530 if (amdgpu_dc > 0) 3531 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3532 return false; 3533 #endif 3534 } 3535 } 3536 3537 /** 3538 * amdgpu_device_has_dc_support - check if dc is supported 3539 * 3540 * @adev: amdgpu_device pointer 3541 * 3542 * Returns true for supported, false for not supported 3543 */ 3544 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3545 { 3546 if (adev->enable_virtual_display || 3547 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3548 return false; 3549 3550 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3551 } 3552 3553 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3554 { 3555 struct amdgpu_device *adev = 3556 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3557 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3558 3559 /* It's a bug to not have a hive within this function */ 3560 if (WARN_ON(!hive)) 3561 return; 3562 3563 /* 3564 * Use task barrier to synchronize all xgmi reset works across the 3565 * hive. task_barrier_enter and task_barrier_exit will block 3566 * until all the threads running the xgmi reset works reach 3567 * those points. task_barrier_full will do both blocks. 3568 */ 3569 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3570 3571 task_barrier_enter(&hive->tb); 3572 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3573 3574 if (adev->asic_reset_res) 3575 goto fail; 3576 3577 task_barrier_exit(&hive->tb); 3578 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3579 3580 if (adev->asic_reset_res) 3581 goto fail; 3582 3583 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3584 } else { 3585 3586 task_barrier_full(&hive->tb); 3587 adev->asic_reset_res = amdgpu_asic_reset(adev); 3588 } 3589 3590 fail: 3591 if (adev->asic_reset_res) 3592 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3593 adev->asic_reset_res, adev_to_drm(adev)->unique); 3594 amdgpu_put_xgmi_hive(hive); 3595 } 3596 3597 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3598 { 3599 char *input = amdgpu_lockup_timeout; 3600 char *timeout_setting = NULL; 3601 int index = 0; 3602 long timeout; 3603 int ret = 0; 3604 3605 /* 3606 * By default timeout for non compute jobs is 10000 3607 * and 60000 for compute jobs. 3608 * In SR-IOV or passthrough mode, timeout for compute 3609 * jobs are 60000 by default. 3610 */ 3611 adev->gfx_timeout = msecs_to_jiffies(10000); 3612 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3613 if (amdgpu_sriov_vf(adev)) 3614 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3615 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3616 else 3617 adev->compute_timeout = msecs_to_jiffies(60000); 3618 3619 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3620 while ((timeout_setting = strsep(&input, ",")) && 3621 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3622 ret = kstrtol(timeout_setting, 0, &timeout); 3623 if (ret) 3624 return ret; 3625 3626 if (timeout == 0) { 3627 index++; 3628 continue; 3629 } else if (timeout < 0) { 3630 timeout = MAX_SCHEDULE_TIMEOUT; 3631 dev_warn(adev->dev, "lockup timeout disabled"); 3632 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3633 } else { 3634 timeout = msecs_to_jiffies(timeout); 3635 } 3636 3637 switch (index++) { 3638 case 0: 3639 adev->gfx_timeout = timeout; 3640 break; 3641 case 1: 3642 adev->compute_timeout = timeout; 3643 break; 3644 case 2: 3645 adev->sdma_timeout = timeout; 3646 break; 3647 case 3: 3648 adev->video_timeout = timeout; 3649 break; 3650 default: 3651 break; 3652 } 3653 } 3654 /* 3655 * There is only one value specified and 3656 * it should apply to all non-compute jobs. 3657 */ 3658 if (index == 1) { 3659 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3660 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3661 adev->compute_timeout = adev->gfx_timeout; 3662 } 3663 } 3664 3665 return ret; 3666 } 3667 3668 /** 3669 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3670 * 3671 * @adev: amdgpu_device pointer 3672 * 3673 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3674 */ 3675 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3676 { 3677 struct iommu_domain *domain; 3678 3679 domain = iommu_get_domain_for_dev(adev->dev); 3680 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3681 adev->ram_is_direct_mapped = true; 3682 } 3683 3684 static const struct attribute *amdgpu_dev_attributes[] = { 3685 &dev_attr_pcie_replay_count.attr, 3686 NULL 3687 }; 3688 3689 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3690 { 3691 if (amdgpu_mcbp == 1) 3692 adev->gfx.mcbp = true; 3693 else if (amdgpu_mcbp == 0) 3694 adev->gfx.mcbp = false; 3695 else if ((amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 0, 0)) && 3696 (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(10, 0, 0)) && 3697 adev->gfx.num_gfx_rings) 3698 adev->gfx.mcbp = true; 3699 3700 if (amdgpu_sriov_vf(adev)) 3701 adev->gfx.mcbp = true; 3702 3703 if (adev->gfx.mcbp) 3704 DRM_INFO("MCBP is enabled\n"); 3705 } 3706 3707 /** 3708 * amdgpu_device_init - initialize the driver 3709 * 3710 * @adev: amdgpu_device pointer 3711 * @flags: driver flags 3712 * 3713 * Initializes the driver info and hw (all asics). 3714 * Returns 0 for success or an error on failure. 3715 * Called at driver startup. 
3716 */ 3717 int amdgpu_device_init(struct amdgpu_device *adev, 3718 uint32_t flags) 3719 { 3720 struct drm_device *ddev = adev_to_drm(adev); 3721 struct pci_dev *pdev = adev->pdev; 3722 int r, i; 3723 bool px = false; 3724 u32 max_MBps; 3725 int tmp; 3726 3727 adev->shutdown = false; 3728 adev->flags = flags; 3729 3730 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3731 adev->asic_type = amdgpu_force_asic_type; 3732 else 3733 adev->asic_type = flags & AMD_ASIC_MASK; 3734 3735 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3736 if (amdgpu_emu_mode == 1) 3737 adev->usec_timeout *= 10; 3738 adev->gmc.gart_size = 512 * 1024 * 1024; 3739 adev->accel_working = false; 3740 adev->num_rings = 0; 3741 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3742 adev->mman.buffer_funcs = NULL; 3743 adev->mman.buffer_funcs_ring = NULL; 3744 adev->vm_manager.vm_pte_funcs = NULL; 3745 adev->vm_manager.vm_pte_num_scheds = 0; 3746 adev->gmc.gmc_funcs = NULL; 3747 adev->harvest_ip_mask = 0x0; 3748 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3749 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3750 3751 adev->smc_rreg = &amdgpu_invalid_rreg; 3752 adev->smc_wreg = &amdgpu_invalid_wreg; 3753 adev->pcie_rreg = &amdgpu_invalid_rreg; 3754 adev->pcie_wreg = &amdgpu_invalid_wreg; 3755 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3756 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3757 adev->pciep_rreg = &amdgpu_invalid_rreg; 3758 adev->pciep_wreg = &amdgpu_invalid_wreg; 3759 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3760 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3761 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 3762 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 3763 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3764 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3765 adev->didt_rreg = &amdgpu_invalid_rreg; 3766 adev->didt_wreg = &amdgpu_invalid_wreg; 3767 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3768 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3769 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3770 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3771 3772 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3773 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3774 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3775 3776 /* mutex initialization are all done here so we 3777 * can recall function without having locking issues 3778 */ 3779 mutex_init(&adev->firmware.mutex); 3780 mutex_init(&adev->pm.mutex); 3781 mutex_init(&adev->gfx.gpu_clock_mutex); 3782 mutex_init(&adev->srbm_mutex); 3783 mutex_init(&adev->gfx.pipe_reserve_mutex); 3784 mutex_init(&adev->gfx.gfx_off_mutex); 3785 mutex_init(&adev->gfx.partition_mutex); 3786 mutex_init(&adev->grbm_idx_mutex); 3787 mutex_init(&adev->mn_lock); 3788 mutex_init(&adev->virt.vf_errors.lock); 3789 hash_init(adev->mn_hash); 3790 mutex_init(&adev->psp.mutex); 3791 mutex_init(&adev->notifier_lock); 3792 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3793 mutex_init(&adev->benchmark_mutex); 3794 3795 amdgpu_device_init_apu_flags(adev); 3796 3797 r = amdgpu_device_check_arguments(adev); 3798 if (r) 3799 return r; 3800 3801 spin_lock_init(&adev->mmio_idx_lock); 3802 spin_lock_init(&adev->smc_idx_lock); 3803 spin_lock_init(&adev->pcie_idx_lock); 3804 spin_lock_init(&adev->uvd_ctx_idx_lock); 3805 spin_lock_init(&adev->didt_idx_lock); 3806 spin_lock_init(&adev->gc_cac_idx_lock); 3807 spin_lock_init(&adev->se_cac_idx_lock); 
3808 spin_lock_init(&adev->audio_endpt_idx_lock); 3809 spin_lock_init(&adev->mm_stats.lock); 3810 3811 INIT_LIST_HEAD(&adev->shadow_list); 3812 mutex_init(&adev->shadow_list_lock); 3813 3814 INIT_LIST_HEAD(&adev->reset_list); 3815 3816 INIT_LIST_HEAD(&adev->ras_list); 3817 3818 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 3819 3820 INIT_DELAYED_WORK(&adev->delayed_init_work, 3821 amdgpu_device_delayed_init_work_handler); 3822 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3823 amdgpu_device_delay_enable_gfx_off); 3824 3825 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3826 3827 adev->gfx.gfx_off_req_count = 1; 3828 adev->gfx.gfx_off_residency = 0; 3829 adev->gfx.gfx_off_entrycount = 0; 3830 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3831 3832 atomic_set(&adev->throttling_logging_enabled, 1); 3833 /* 3834 * If throttling continues, logging will be performed every minute 3835 * to avoid log flooding. "-1" is subtracted since the thermal 3836 * throttling interrupt comes every second. Thus, the total logging 3837 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3838 * for throttling interrupt) = 60 seconds. 3839 */ 3840 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3841 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3842 3843 /* Registers mapping */ 3844 /* TODO: block userspace mapping of io register */ 3845 if (adev->asic_type >= CHIP_BONAIRE) { 3846 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3847 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3848 } else { 3849 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3850 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3851 } 3852 3853 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3854 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3855 3856 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3857 if (!adev->rmmio) 3858 return -ENOMEM; 3859 3860 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3861 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 3862 3863 /* 3864 * Reset domain needs to be present early, before the XGMI hive is discovered 3865 * (if any) and initialized to use reset sem and in_gpu reset flag 3866 * early on during init and before calling to RREG32.
3867 */ 3868 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3869 if (!adev->reset_domain) 3870 return -ENOMEM; 3871 3872 /* detect hw virtualization here */ 3873 amdgpu_detect_virtualization(adev); 3874 3875 amdgpu_device_get_pcie_info(adev); 3876 3877 r = amdgpu_device_get_job_timeout_settings(adev); 3878 if (r) { 3879 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3880 return r; 3881 } 3882 3883 /* early init functions */ 3884 r = amdgpu_device_ip_early_init(adev); 3885 if (r) 3886 return r; 3887 3888 amdgpu_device_set_mcbp(adev); 3889 3890 /* Get rid of things like offb */ 3891 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3892 if (r) 3893 return r; 3894 3895 /* Enable TMZ based on IP_VERSION */ 3896 amdgpu_gmc_tmz_set(adev); 3897 3898 amdgpu_gmc_noretry_set(adev); 3899 /* Need to get xgmi info early to decide the reset behavior */ 3900 if (adev->gmc.xgmi.supported) { 3901 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3902 if (r) 3903 return r; 3904 } 3905 3906 /* enable PCIE atomic ops */ 3907 if (amdgpu_sriov_vf(adev)) { 3908 if (adev->virt.fw_reserve.p_pf2vf) 3909 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3910 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3911 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3912 /* APUs with gfx9 onwards don't rely on PCIe atomics; rather, the 3913 * internal path natively supports atomics, so set have_atomics_support to true. 3914 */ 3915 } else if ((adev->flags & AMD_IS_APU) && 3916 (amdgpu_ip_version(adev, GC_HWIP, 0) > 3917 IP_VERSION(9, 0, 0))) { 3918 adev->have_atomics_support = true; 3919 } else { 3920 adev->have_atomics_support = 3921 !pci_enable_atomic_ops_to_root(adev->pdev, 3922 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3923 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3924 } 3925 3926 if (!adev->have_atomics_support) 3927 dev_info(adev->dev, "PCIE atomic ops are not supported\n"); 3928 3929 /* doorbell bar mapping and doorbell index init */ 3930 amdgpu_doorbell_init(adev); 3931 3932 if (amdgpu_emu_mode == 1) { 3933 /* post the asic on emulation mode */ 3934 emu_soc_asic_init(adev); 3935 goto fence_driver_init; 3936 } 3937 3938 amdgpu_reset_init(adev); 3939 3940 /* detect whether we are running with an SRIOV vbios */ 3941 if (adev->bios) 3942 amdgpu_device_detect_sriov_bios(adev); 3943 3944 /* check if we need to reset the asic 3945 * E.g., driver was not cleanly unloaded previously, etc. 3946 */ 3947 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3948 if (adev->gmc.xgmi.num_physical_nodes) { 3949 dev_info(adev->dev, "Pending hive reset.\n"); 3950 adev->gmc.xgmi.pending_reset = true; 3951 /* Only need to init the blocks necessary for SMU to handle the reset */ 3952 for (i = 0; i < adev->num_ip_blocks; i++) { 3953 if (!adev->ip_blocks[i].status.valid) 3954 continue; 3955 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3956 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3957 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3958 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3959 DRM_DEBUG("IP %s disabled for hw_init.\n", 3960 adev->ip_blocks[i].version->funcs->name); 3961 adev->ip_blocks[i].status.hw = true; 3962 } 3963 } 3964 } else { 3965 tmp = amdgpu_reset_method; 3966 /* A default reset should be performed when loading or reloading the driver, 3967 * regardless of the module parameter reset_method.
3968 */ 3969 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3970 r = amdgpu_asic_reset(adev); 3971 amdgpu_reset_method = tmp; 3972 if (r) { 3973 dev_err(adev->dev, "asic reset on init failed\n"); 3974 goto failed; 3975 } 3976 } 3977 } 3978 3979 /* Post card if necessary */ 3980 if (amdgpu_device_need_post(adev)) { 3981 if (!adev->bios) { 3982 dev_err(adev->dev, "no vBIOS found\n"); 3983 r = -EINVAL; 3984 goto failed; 3985 } 3986 DRM_INFO("GPU posting now...\n"); 3987 r = amdgpu_device_asic_init(adev); 3988 if (r) { 3989 dev_err(adev->dev, "gpu post error!\n"); 3990 goto failed; 3991 } 3992 } 3993 3994 if (adev->bios) { 3995 if (adev->is_atom_fw) { 3996 /* Initialize clocks */ 3997 r = amdgpu_atomfirmware_get_clock_info(adev); 3998 if (r) { 3999 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4000 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4001 goto failed; 4002 } 4003 } else { 4004 /* Initialize clocks */ 4005 r = amdgpu_atombios_get_clock_info(adev); 4006 if (r) { 4007 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4008 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4009 goto failed; 4010 } 4011 /* init i2c buses */ 4012 if (!amdgpu_device_has_dc_support(adev)) 4013 amdgpu_atombios_i2c_init(adev); 4014 } 4015 } 4016 4017 fence_driver_init: 4018 /* Fence driver */ 4019 r = amdgpu_fence_driver_sw_init(adev); 4020 if (r) { 4021 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4022 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4023 goto failed; 4024 } 4025 4026 /* init the mode config */ 4027 drm_mode_config_init(adev_to_drm(adev)); 4028 4029 r = amdgpu_device_ip_init(adev); 4030 if (r) { 4031 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4032 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4033 goto release_ras_con; 4034 } 4035 4036 amdgpu_fence_driver_hw_init(adev); 4037 4038 dev_info(adev->dev, 4039 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4040 adev->gfx.config.max_shader_engines, 4041 adev->gfx.config.max_sh_per_se, 4042 adev->gfx.config.max_cu_per_sh, 4043 adev->gfx.cu_info.number); 4044 4045 adev->accel_working = true; 4046 4047 amdgpu_vm_check_compute_bug(adev); 4048 4049 /* Initialize the buffer migration limit. */ 4050 if (amdgpu_moverate >= 0) 4051 max_MBps = amdgpu_moverate; 4052 else 4053 max_MBps = 8; /* Allow 8 MB/s. */ 4054 /* Get a log2 for easy divisions. */ 4055 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4056 4057 /* 4058 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4059 * Otherwise the mgpu fan boost feature will be skipped due to the 4060 * gpu instance is counted less. 4061 */ 4062 amdgpu_register_gpu_instance(adev); 4063 4064 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4065 * explicit gating rather than handling it automatically. 4066 */ 4067 if (!adev->gmc.xgmi.pending_reset) { 4068 r = amdgpu_device_ip_late_init(adev); 4069 if (r) { 4070 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4071 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4072 goto release_ras_con; 4073 } 4074 /* must succeed. 
*/ 4075 amdgpu_ras_resume(adev); 4076 queue_delayed_work(system_wq, &adev->delayed_init_work, 4077 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4078 } 4079 4080 if (amdgpu_sriov_vf(adev)) { 4081 amdgpu_virt_release_full_gpu(adev, true); 4082 flush_delayed_work(&adev->delayed_init_work); 4083 } 4084 4085 /* 4086 * Register these sysfs interfaces after `late_init`, as some of the 4087 * operations performed in `late_init` might affect the creation of 4088 * the sysfs interfaces. 4089 */ 4090 r = amdgpu_atombios_sysfs_init(adev); 4091 if (r) 4092 drm_err(&adev->ddev, 4093 "registering atombios sysfs failed (%d).\n", r); 4094 4095 r = amdgpu_pm_sysfs_init(adev); 4096 if (r) 4097 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4098 4099 r = amdgpu_ucode_sysfs_init(adev); 4100 if (r) { 4101 adev->ucode_sysfs_en = false; 4102 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4103 } else 4104 adev->ucode_sysfs_en = true; 4105 4106 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4107 if (r) 4108 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4109 4110 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4111 if (r) 4112 dev_err(adev->dev, 4113 "Could not create amdgpu board attributes\n"); 4114 4115 amdgpu_fru_sysfs_init(adev); 4116 4117 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4118 r = amdgpu_pmu_init(adev); 4119 if (r) 4120 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4121 4122 /* Keep the stored PCI config space at hand for restore on a sudden PCI error */ 4123 if (amdgpu_device_cache_pci_state(adev->pdev)) 4124 pci_restore_state(pdev); 4125 4126 /* if we have more than one VGA card, then disable the amdgpu VGA resources */ 4127 /* this will fail for cards that aren't VGA class devices, just 4128 * ignore it 4129 */ 4130 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4131 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4132 4133 px = amdgpu_device_supports_px(ddev); 4134 4135 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4136 apple_gmux_detect(NULL, NULL))) 4137 vga_switcheroo_register_client(adev->pdev, 4138 &amdgpu_switcheroo_ops, px); 4139 4140 if (px) 4141 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4142 4143 if (adev->gmc.xgmi.pending_reset) 4144 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4145 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4146 4147 amdgpu_device_check_iommu_direct_map(adev); 4148 4149 return 0; 4150 4151 release_ras_con: 4152 if (amdgpu_sriov_vf(adev)) 4153 amdgpu_virt_release_full_gpu(adev, true); 4154 4155 /* failed in exclusive mode due to timeout */ 4156 if (amdgpu_sriov_vf(adev) && 4157 !amdgpu_sriov_runtime(adev) && 4158 amdgpu_virt_mmio_blocked(adev) && 4159 !amdgpu_virt_wait_reset(adev)) { 4160 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4161 /* Don't send request since VF is inactive.
*/ 4162 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4163 adev->virt.ops = NULL; 4164 r = -EAGAIN; 4165 } 4166 amdgpu_release_ras_context(adev); 4167 4168 failed: 4169 amdgpu_vf_error_trans_all(adev); 4170 4171 return r; 4172 } 4173 4174 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4175 { 4176 4177 /* Clear all CPU mappings pointing to this device */ 4178 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4179 4180 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4181 amdgpu_doorbell_fini(adev); 4182 4183 iounmap(adev->rmmio); 4184 adev->rmmio = NULL; 4185 if (adev->mman.aper_base_kaddr) 4186 iounmap(adev->mman.aper_base_kaddr); 4187 adev->mman.aper_base_kaddr = NULL; 4188 4189 /* Memory manager related */ 4190 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4191 arch_phys_wc_del(adev->gmc.vram_mtrr); 4192 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4193 } 4194 } 4195 4196 /** 4197 * amdgpu_device_fini_hw - tear down the driver 4198 * 4199 * @adev: amdgpu_device pointer 4200 * 4201 * Tear down the driver info (all asics). 4202 * Called at driver shutdown. 4203 */ 4204 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4205 { 4206 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4207 flush_delayed_work(&adev->delayed_init_work); 4208 adev->shutdown = true; 4209 4210 /* make sure IB test finished before entering exclusive mode 4211 * to avoid preemption on IB test 4212 */ 4213 if (amdgpu_sriov_vf(adev)) { 4214 amdgpu_virt_request_full_gpu(adev, false); 4215 amdgpu_virt_fini_data_exchange(adev); 4216 } 4217 4218 /* disable all interrupts */ 4219 amdgpu_irq_disable_all(adev); 4220 if (adev->mode_info.mode_config_initialized) { 4221 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4222 drm_helper_force_disable_all(adev_to_drm(adev)); 4223 else 4224 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4225 } 4226 amdgpu_fence_driver_hw_fini(adev); 4227 4228 if (adev->mman.initialized) 4229 drain_workqueue(adev->mman.bdev.wq); 4230 4231 if (adev->pm.sysfs_initialized) 4232 amdgpu_pm_sysfs_fini(adev); 4233 if (adev->ucode_sysfs_en) 4234 amdgpu_ucode_sysfs_fini(adev); 4235 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4236 amdgpu_fru_sysfs_fini(adev); 4237 4238 /* disable ras feature must before hw fini */ 4239 amdgpu_ras_pre_fini(adev); 4240 4241 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4242 4243 amdgpu_device_ip_fini_early(adev); 4244 4245 amdgpu_irq_fini_hw(adev); 4246 4247 if (adev->mman.initialized) 4248 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4249 4250 amdgpu_gart_dummy_page_fini(adev); 4251 4252 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4253 amdgpu_device_unmap_mmio(adev); 4254 4255 } 4256 4257 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4258 { 4259 int idx; 4260 bool px; 4261 4262 amdgpu_fence_driver_sw_fini(adev); 4263 amdgpu_device_ip_fini(adev); 4264 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4265 adev->accel_working = false; 4266 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4267 4268 amdgpu_reset_fini(adev); 4269 4270 /* free i2c buses */ 4271 if (!amdgpu_device_has_dc_support(adev)) 4272 amdgpu_i2c_fini(adev); 4273 4274 if (amdgpu_emu_mode != 1) 4275 amdgpu_atombios_fini(adev); 4276 4277 kfree(adev->bios); 4278 adev->bios = NULL; 4279 4280 kfree(adev->fru_info); 4281 adev->fru_info = NULL; 4282 4283 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4284 4285 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4286 
apple_gmux_detect(NULL, NULL))) 4287 vga_switcheroo_unregister_client(adev->pdev); 4288 4289 if (px) 4290 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4291 4292 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4293 vga_client_unregister(adev->pdev); 4294 4295 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4296 4297 iounmap(adev->rmmio); 4298 adev->rmmio = NULL; 4299 amdgpu_doorbell_fini(adev); 4300 drm_dev_exit(idx); 4301 } 4302 4303 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4304 amdgpu_pmu_fini(adev); 4305 if (adev->mman.discovery_bin) 4306 amdgpu_discovery_fini(adev); 4307 4308 amdgpu_reset_put_reset_domain(adev->reset_domain); 4309 adev->reset_domain = NULL; 4310 4311 kfree(adev->pci_state); 4312 4313 } 4314 4315 /** 4316 * amdgpu_device_evict_resources - evict device resources 4317 * @adev: amdgpu device object 4318 * 4319 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4320 * of the vram memory type. Mainly used for evicting device resources 4321 * at suspend time. 4322 * 4323 */ 4324 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4325 { 4326 int ret; 4327 4328 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4329 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4330 return 0; 4331 4332 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4333 if (ret) 4334 DRM_WARN("evicting device resources failed\n"); 4335 return ret; 4336 } 4337 4338 /* 4339 * Suspend & resume. 4340 */ 4341 /** 4342 * amdgpu_device_prepare - prepare for device suspend 4343 * 4344 * @dev: drm dev pointer 4345 * 4346 * Prepare to put the hw in the suspend state (all asics). 4347 * Returns 0 for success or an error on failure. 4348 * Called at driver suspend. 4349 */ 4350 int amdgpu_device_prepare(struct drm_device *dev) 4351 { 4352 struct amdgpu_device *adev = drm_to_adev(dev); 4353 int i, r; 4354 4355 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4356 return 0; 4357 4358 /* Evict the majority of BOs before starting suspend sequence */ 4359 r = amdgpu_device_evict_resources(adev); 4360 if (r) 4361 return r; 4362 4363 for (i = 0; i < adev->num_ip_blocks; i++) { 4364 if (!adev->ip_blocks[i].status.valid) 4365 continue; 4366 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4367 continue; 4368 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev); 4369 if (r) 4370 return r; 4371 } 4372 4373 return 0; 4374 } 4375 4376 /** 4377 * amdgpu_device_suspend - initiate device suspend 4378 * 4379 * @dev: drm dev pointer 4380 * @fbcon : notify the fbdev of suspend 4381 * 4382 * Puts the hw in the suspend state (all asics). 4383 * Returns 0 for success or an error on failure. 4384 * Called at driver suspend. 
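 *
 * Roughly, suspend proceeds as: fbdev suspend notification, cancelling the
 * delayed init work, RAS suspend, disabling the buffer funcs, IP suspend
 * phase 1, KFD suspend (skipped in S0ix), resource eviction, fence driver
 * HW fini and IP suspend phase 2; SR-IOV VFs additionally request and
 * release full GPU access around this sequence.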
4385 */ 4386 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4387 { 4388 struct amdgpu_device *adev = drm_to_adev(dev); 4389 int r = 0; 4390 4391 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4392 return 0; 4393 4394 adev->in_suspend = true; 4395 4396 if (amdgpu_sriov_vf(adev)) { 4397 amdgpu_virt_fini_data_exchange(adev); 4398 r = amdgpu_virt_request_full_gpu(adev, false); 4399 if (r) 4400 return r; 4401 } 4402 4403 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4404 DRM_WARN("smart shift update failed\n"); 4405 4406 if (fbcon) 4407 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4408 4409 cancel_delayed_work_sync(&adev->delayed_init_work); 4410 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4411 4412 amdgpu_ras_suspend(adev); 4413 4414 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4415 4416 amdgpu_device_ip_suspend_phase1(adev); 4417 4418 if (!adev->in_s0ix) 4419 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4420 4421 r = amdgpu_device_evict_resources(adev); 4422 if (r) 4423 return r; 4424 4425 amdgpu_fence_driver_hw_fini(adev); 4426 4427 amdgpu_device_ip_suspend_phase2(adev); 4428 4429 if (amdgpu_sriov_vf(adev)) 4430 amdgpu_virt_release_full_gpu(adev, false); 4431 4432 return 0; 4433 } 4434 4435 /** 4436 * amdgpu_device_resume - initiate device resume 4437 * 4438 * @dev: drm dev pointer 4439 * @fbcon : notify the fbdev of resume 4440 * 4441 * Bring the hw back to operating state (all asics). 4442 * Returns 0 for success or an error on failure. 4443 * Called at driver resume. 4444 */ 4445 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4446 { 4447 struct amdgpu_device *adev = drm_to_adev(dev); 4448 int r = 0; 4449 4450 if (amdgpu_sriov_vf(adev)) { 4451 r = amdgpu_virt_request_full_gpu(adev, true); 4452 if (r) 4453 return r; 4454 } 4455 4456 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4457 return 0; 4458 4459 if (adev->in_s0ix) 4460 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4461 4462 /* post card */ 4463 if (amdgpu_device_need_post(adev)) { 4464 r = amdgpu_device_asic_init(adev); 4465 if (r) 4466 dev_err(adev->dev, "amdgpu asic init failed\n"); 4467 } 4468 4469 r = amdgpu_device_ip_resume(adev); 4470 4471 if (r) { 4472 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4473 goto exit; 4474 } 4475 amdgpu_fence_driver_hw_init(adev); 4476 4477 r = amdgpu_device_ip_late_init(adev); 4478 if (r) 4479 goto exit; 4480 4481 queue_delayed_work(system_wq, &adev->delayed_init_work, 4482 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4483 4484 if (!adev->in_s0ix) { 4485 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4486 if (r) 4487 goto exit; 4488 } 4489 4490 exit: 4491 if (amdgpu_sriov_vf(adev)) { 4492 amdgpu_virt_init_data_exchange(adev); 4493 amdgpu_virt_release_full_gpu(adev, true); 4494 } 4495 4496 if (r) 4497 return r; 4498 4499 /* Make sure IB tests flushed */ 4500 flush_delayed_work(&adev->delayed_init_work); 4501 4502 if (fbcon) 4503 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4504 4505 amdgpu_ras_resume(adev); 4506 4507 if (adev->mode_info.num_crtc) { 4508 /* 4509 * Most of the connector probing functions try to acquire runtime pm 4510 * refs to ensure that the GPU is powered on when connector polling is 4511 * performed. Since we're calling this from a runtime PM callback, 4512 * trying to acquire rpm refs will cause us to deadlock. 
4513 * 4514 * Since we're guaranteed to be holding the rpm lock, it's safe to 4515 * temporarily disable the rpm helpers so this doesn't deadlock us. 4516 */ 4517 #ifdef CONFIG_PM 4518 dev->dev->power.disable_depth++; 4519 #endif 4520 if (!adev->dc_enabled) 4521 drm_helper_hpd_irq_event(dev); 4522 else 4523 drm_kms_helper_hotplug_event(dev); 4524 #ifdef CONFIG_PM 4525 dev->dev->power.disable_depth--; 4526 #endif 4527 } 4528 adev->in_suspend = false; 4529 4530 if (adev->enable_mes) 4531 amdgpu_mes_self_test(adev); 4532 4533 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4534 DRM_WARN("smart shift update failed\n"); 4535 4536 return 0; 4537 } 4538 4539 /** 4540 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4541 * 4542 * @adev: amdgpu_device pointer 4543 * 4544 * The list of all the hardware IPs that make up the asic is walked and 4545 * the check_soft_reset callbacks are run. check_soft_reset determines 4546 * if the asic is still hung or not. 4547 * Returns true if any of the IPs are still in a hung state, false if not. 4548 */ 4549 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4550 { 4551 int i; 4552 bool asic_hang = false; 4553 4554 if (amdgpu_sriov_vf(adev)) 4555 return true; 4556 4557 if (amdgpu_asic_need_full_reset(adev)) 4558 return true; 4559 4560 for (i = 0; i < adev->num_ip_blocks; i++) { 4561 if (!adev->ip_blocks[i].status.valid) 4562 continue; 4563 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4564 adev->ip_blocks[i].status.hang = 4565 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4566 if (adev->ip_blocks[i].status.hang) { 4567 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4568 asic_hang = true; 4569 } 4570 } 4571 return asic_hang; 4572 } 4573 4574 /** 4575 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4576 * 4577 * @adev: amdgpu_device pointer 4578 * 4579 * The list of all the hardware IPs that make up the asic is walked and the 4580 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4581 * handles any IP specific hardware or software state changes that are 4582 * necessary for a soft reset to succeed. 4583 * Returns 0 on success, negative error code on failure. 4584 */ 4585 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4586 { 4587 int i, r = 0; 4588 4589 for (i = 0; i < adev->num_ip_blocks; i++) { 4590 if (!adev->ip_blocks[i].status.valid) 4591 continue; 4592 if (adev->ip_blocks[i].status.hang && 4593 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4594 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4595 if (r) 4596 return r; 4597 } 4598 } 4599 4600 return 0; 4601 } 4602 4603 /** 4604 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4605 * 4606 * @adev: amdgpu_device pointer 4607 * 4608 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4609 * reset is necessary to recover. 4610 * Returns true if a full asic reset is required, false if not. 
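 *
 * In the walk below, a hang reported on a GMC, SMC, ACP, DCE or PSP block
 * (or an ASIC-level need_full_reset indication) is what triggers the
 * full-reset path.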
4611 */ 4612 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4613 { 4614 int i; 4615 4616 if (amdgpu_asic_need_full_reset(adev)) 4617 return true; 4618 4619 for (i = 0; i < adev->num_ip_blocks; i++) { 4620 if (!adev->ip_blocks[i].status.valid) 4621 continue; 4622 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4623 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4624 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4625 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4626 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4627 if (adev->ip_blocks[i].status.hang) { 4628 dev_info(adev->dev, "Some block need full reset!\n"); 4629 return true; 4630 } 4631 } 4632 } 4633 return false; 4634 } 4635 4636 /** 4637 * amdgpu_device_ip_soft_reset - do a soft reset 4638 * 4639 * @adev: amdgpu_device pointer 4640 * 4641 * The list of all the hardware IPs that make up the asic is walked and the 4642 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4643 * IP specific hardware or software state changes that are necessary to soft 4644 * reset the IP. 4645 * Returns 0 on success, negative error code on failure. 4646 */ 4647 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4648 { 4649 int i, r = 0; 4650 4651 for (i = 0; i < adev->num_ip_blocks; i++) { 4652 if (!adev->ip_blocks[i].status.valid) 4653 continue; 4654 if (adev->ip_blocks[i].status.hang && 4655 adev->ip_blocks[i].version->funcs->soft_reset) { 4656 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4657 if (r) 4658 return r; 4659 } 4660 } 4661 4662 return 0; 4663 } 4664 4665 /** 4666 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4667 * 4668 * @adev: amdgpu_device pointer 4669 * 4670 * The list of all the hardware IPs that make up the asic is walked and the 4671 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4672 * handles any IP specific hardware or software state changes that are 4673 * necessary after the IP has been soft reset. 4674 * Returns 0 on success, negative error code on failure. 4675 */ 4676 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4677 { 4678 int i, r = 0; 4679 4680 for (i = 0; i < adev->num_ip_blocks; i++) { 4681 if (!adev->ip_blocks[i].status.valid) 4682 continue; 4683 if (adev->ip_blocks[i].status.hang && 4684 adev->ip_blocks[i].version->funcs->post_soft_reset) 4685 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4686 if (r) 4687 return r; 4688 } 4689 4690 return 0; 4691 } 4692 4693 /** 4694 * amdgpu_device_recover_vram - Recover some VRAM contents 4695 * 4696 * @adev: amdgpu_device pointer 4697 * 4698 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4699 * restore things like GPUVM page tables after a GPU reset where 4700 * the contents of VRAM might be lost. 4701 * 4702 * Returns: 4703 * 0 on success, negative error code on failure. 
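 *
 * Roughly: the shadow list is walked under shadow_list_lock, evicted
 * shadow BOs are skipped, and each restore fence is waited on with a
 * timeout (longer under SR-IOV runtime).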
4704 */ 4705 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4706 { 4707 struct dma_fence *fence = NULL, *next = NULL; 4708 struct amdgpu_bo *shadow; 4709 struct amdgpu_bo_vm *vmbo; 4710 long r = 1, tmo; 4711 4712 if (amdgpu_sriov_runtime(adev)) 4713 tmo = msecs_to_jiffies(8000); 4714 else 4715 tmo = msecs_to_jiffies(100); 4716 4717 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4718 mutex_lock(&adev->shadow_list_lock); 4719 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4720 /* If vm is compute context or adev is APU, shadow will be NULL */ 4721 if (!vmbo->shadow) 4722 continue; 4723 shadow = vmbo->shadow; 4724 4725 /* No need to recover an evicted BO */ 4726 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4727 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4728 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4729 continue; 4730 4731 r = amdgpu_bo_restore_shadow(shadow, &next); 4732 if (r) 4733 break; 4734 4735 if (fence) { 4736 tmo = dma_fence_wait_timeout(fence, false, tmo); 4737 dma_fence_put(fence); 4738 fence = next; 4739 if (tmo == 0) { 4740 r = -ETIMEDOUT; 4741 break; 4742 } else if (tmo < 0) { 4743 r = tmo; 4744 break; 4745 } 4746 } else { 4747 fence = next; 4748 } 4749 } 4750 mutex_unlock(&adev->shadow_list_lock); 4751 4752 if (fence) 4753 tmo = dma_fence_wait_timeout(fence, false, tmo); 4754 dma_fence_put(fence); 4755 4756 if (r < 0 || tmo <= 0) { 4757 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4758 return -EIO; 4759 } 4760 4761 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4762 return 0; 4763 } 4764 4765 4766 /** 4767 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4768 * 4769 * @adev: amdgpu_device pointer 4770 * @from_hypervisor: request from hypervisor 4771 * 4772 * do VF FLR and reinitialize Asic 4773 * return 0 means succeeded otherwise failed 4774 */ 4775 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4776 bool from_hypervisor) 4777 { 4778 int r; 4779 struct amdgpu_hive_info *hive = NULL; 4780 int retry_limit = 0; 4781 4782 retry: 4783 amdgpu_amdkfd_pre_reset(adev); 4784 4785 if (from_hypervisor) 4786 r = amdgpu_virt_request_full_gpu(adev, true); 4787 else 4788 r = amdgpu_virt_reset_gpu(adev); 4789 if (r) 4790 return r; 4791 amdgpu_irq_gpu_reset_resume_helper(adev); 4792 4793 /* some sw clean up VF needs to do before recover */ 4794 amdgpu_virt_post_reset(adev); 4795 4796 /* Resume IP prior to SMC */ 4797 r = amdgpu_device_ip_reinit_early_sriov(adev); 4798 if (r) 4799 goto error; 4800 4801 amdgpu_virt_init_data_exchange(adev); 4802 4803 r = amdgpu_device_fw_loading(adev); 4804 if (r) 4805 return r; 4806 4807 /* now we are okay to resume SMC/CP/SDMA */ 4808 r = amdgpu_device_ip_reinit_late_sriov(adev); 4809 if (r) 4810 goto error; 4811 4812 hive = amdgpu_get_xgmi_hive(adev); 4813 /* Update PSP FW topology after reset */ 4814 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4815 r = amdgpu_xgmi_update_topology(hive, adev); 4816 4817 if (hive) 4818 amdgpu_put_xgmi_hive(hive); 4819 4820 if (!r) { 4821 r = amdgpu_ib_ring_tests(adev); 4822 4823 amdgpu_amdkfd_post_reset(adev); 4824 } 4825 4826 error: 4827 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4828 amdgpu_inc_vram_lost(adev); 4829 r = amdgpu_device_recover_vram(adev); 4830 } 4831 amdgpu_virt_release_full_gpu(adev, true); 4832 4833 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4834 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4835 retry_limit++; 4836 goto 
retry; 4837 } else 4838 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4839 } 4840 4841 return r; 4842 } 4843 4844 /** 4845 * amdgpu_device_has_job_running - check if there is any job in mirror list 4846 * 4847 * @adev: amdgpu_device pointer 4848 * 4849 * check if there is any job in mirror list 4850 */ 4851 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4852 { 4853 int i; 4854 struct drm_sched_job *job; 4855 4856 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4857 struct amdgpu_ring *ring = adev->rings[i]; 4858 4859 if (!ring || !ring->sched.thread) 4860 continue; 4861 4862 spin_lock(&ring->sched.job_list_lock); 4863 job = list_first_entry_or_null(&ring->sched.pending_list, 4864 struct drm_sched_job, list); 4865 spin_unlock(&ring->sched.job_list_lock); 4866 if (job) 4867 return true; 4868 } 4869 return false; 4870 } 4871 4872 /** 4873 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4874 * 4875 * @adev: amdgpu_device pointer 4876 * 4877 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4878 * a hung GPU. 4879 */ 4880 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4881 { 4882 4883 if (amdgpu_gpu_recovery == 0) 4884 goto disabled; 4885 4886 /* Skip soft reset check in fatal error mode */ 4887 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4888 return true; 4889 4890 if (amdgpu_sriov_vf(adev)) 4891 return true; 4892 4893 if (amdgpu_gpu_recovery == -1) { 4894 switch (adev->asic_type) { 4895 #ifdef CONFIG_DRM_AMDGPU_SI 4896 case CHIP_VERDE: 4897 case CHIP_TAHITI: 4898 case CHIP_PITCAIRN: 4899 case CHIP_OLAND: 4900 case CHIP_HAINAN: 4901 #endif 4902 #ifdef CONFIG_DRM_AMDGPU_CIK 4903 case CHIP_KAVERI: 4904 case CHIP_KABINI: 4905 case CHIP_MULLINS: 4906 #endif 4907 case CHIP_CARRIZO: 4908 case CHIP_STONEY: 4909 case CHIP_CYAN_SKILLFISH: 4910 goto disabled; 4911 default: 4912 break; 4913 } 4914 } 4915 4916 return true; 4917 4918 disabled: 4919 dev_info(adev->dev, "GPU recovery disabled.\n"); 4920 return false; 4921 } 4922 4923 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4924 { 4925 u32 i; 4926 int ret = 0; 4927 4928 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4929 4930 dev_info(adev->dev, "GPU mode1 reset\n"); 4931 4932 /* disable BM */ 4933 pci_clear_master(adev->pdev); 4934 4935 amdgpu_device_cache_pci_state(adev->pdev); 4936 4937 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4938 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4939 ret = amdgpu_dpm_mode1_reset(adev); 4940 } else { 4941 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4942 ret = psp_gpu_reset(adev); 4943 } 4944 4945 if (ret) 4946 goto mode1_reset_failed; 4947 4948 amdgpu_device_load_pci_state(adev->pdev); 4949 ret = amdgpu_psp_wait_for_bootloader(adev); 4950 if (ret) 4951 goto mode1_reset_failed; 4952 4953 /* wait for asic to come out of reset */ 4954 for (i = 0; i < adev->usec_timeout; i++) { 4955 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4956 4957 if (memsize != 0xffffffff) 4958 break; 4959 udelay(1); 4960 } 4961 4962 if (i >= adev->usec_timeout) { 4963 ret = -ETIMEDOUT; 4964 goto mode1_reset_failed; 4965 } 4966 4967 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4968 4969 return 0; 4970 4971 mode1_reset_failed: 4972 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4973 return ret; 4974 } 4975 4976 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4977 struct amdgpu_reset_context *reset_context) 4978 { 4979 int i, r = 0; 4980 struct amdgpu_job *job = NULL; 4981 bool need_full_reset = 
4982 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4983 4984 if (reset_context->reset_req_dev == adev) 4985 job = reset_context->job; 4986 4987 if (amdgpu_sriov_vf(adev)) { 4988 /* stop the data exchange thread */ 4989 amdgpu_virt_fini_data_exchange(adev); 4990 } 4991 4992 amdgpu_fence_driver_isr_toggle(adev, true); 4993 4994 /* block all schedulers and reset given job's ring */ 4995 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4996 struct amdgpu_ring *ring = adev->rings[i]; 4997 4998 if (!ring || !ring->sched.thread) 4999 continue; 5000 5001 /* Clear job fence from fence drv to avoid force_completion 5002 * leave NULL and vm flush fence in fence drv 5003 */ 5004 amdgpu_fence_driver_clear_job_fences(ring); 5005 5006 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5007 amdgpu_fence_driver_force_completion(ring); 5008 } 5009 5010 amdgpu_fence_driver_isr_toggle(adev, false); 5011 5012 if (job && job->vm) 5013 drm_sched_increase_karma(&job->base); 5014 5015 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5016 /* If reset handler not implemented, continue; otherwise return */ 5017 if (r == -EOPNOTSUPP) 5018 r = 0; 5019 else 5020 return r; 5021 5022 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5023 if (!amdgpu_sriov_vf(adev)) { 5024 5025 if (!need_full_reset) 5026 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5027 5028 if (!need_full_reset && amdgpu_gpu_recovery && 5029 amdgpu_device_ip_check_soft_reset(adev)) { 5030 amdgpu_device_ip_pre_soft_reset(adev); 5031 r = amdgpu_device_ip_soft_reset(adev); 5032 amdgpu_device_ip_post_soft_reset(adev); 5033 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5034 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5035 need_full_reset = true; 5036 } 5037 } 5038 5039 if (need_full_reset) 5040 r = amdgpu_device_ip_suspend(adev); 5041 if (need_full_reset) 5042 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5043 else 5044 clear_bit(AMDGPU_NEED_FULL_RESET, 5045 &reset_context->flags); 5046 } 5047 5048 return r; 5049 } 5050 5051 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 5052 { 5053 int i; 5054 5055 lockdep_assert_held(&adev->reset_domain->sem); 5056 5057 for (i = 0; i < adev->reset_info.num_regs; i++) { 5058 adev->reset_info.reset_dump_reg_value[i] = 5059 RREG32(adev->reset_info.reset_dump_reg_list[i]); 5060 5061 trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i], 5062 adev->reset_info.reset_dump_reg_value[i]); 5063 } 5064 5065 return 0; 5066 } 5067 5068 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5069 struct amdgpu_reset_context *reset_context) 5070 { 5071 struct amdgpu_device *tmp_adev = NULL; 5072 bool need_full_reset, skip_hw_reset, vram_lost = false; 5073 int r = 0; 5074 bool gpu_reset_for_dev_remove = 0; 5075 5076 /* Try reset handler method first */ 5077 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5078 reset_list); 5079 amdgpu_reset_reg_dumps(tmp_adev); 5080 5081 reset_context->reset_device_list = device_list_handle; 5082 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5083 /* If reset handler not implemented, continue; otherwise return */ 5084 if (r == -EOPNOTSUPP) 5085 r = 0; 5086 else 5087 return r; 5088 5089 /* Reset handler not implemented, use the default method */ 5090 need_full_reset = 5091 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5092 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5093 5094 
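/* need_full_reset and skip_hw_reset come from the caller via
 * reset_context->flags; the loops below honour them when deciding
 * whether to touch the hardware at all.
 */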
gpu_reset_for_dev_remove = 5095 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5096 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5097 5098 /* 5099 * ASIC reset has to be done on all XGMI hive nodes ASAP 5100 * to allow proper links negotiation in FW (within 1 sec) 5101 */ 5102 if (!skip_hw_reset && need_full_reset) { 5103 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5104 /* For XGMI run all resets in parallel to speed up the process */ 5105 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5106 tmp_adev->gmc.xgmi.pending_reset = false; 5107 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5108 r = -EALREADY; 5109 } else 5110 r = amdgpu_asic_reset(tmp_adev); 5111 5112 if (r) { 5113 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5114 r, adev_to_drm(tmp_adev)->unique); 5115 goto out; 5116 } 5117 } 5118 5119 /* For XGMI wait for all resets to complete before proceed */ 5120 if (!r) { 5121 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5122 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5123 flush_work(&tmp_adev->xgmi_reset_work); 5124 r = tmp_adev->asic_reset_res; 5125 if (r) 5126 break; 5127 } 5128 } 5129 } 5130 } 5131 5132 if (!r && amdgpu_ras_intr_triggered()) { 5133 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5134 amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB); 5135 } 5136 5137 amdgpu_ras_intr_cleared(); 5138 } 5139 5140 /* Since the mode1 reset affects base ip blocks, the 5141 * phase1 ip blocks need to be resumed. Otherwise there 5142 * will be a BIOS signature error and the psp bootloader 5143 * can't load kdb on the next amdgpu install. 5144 */ 5145 if (gpu_reset_for_dev_remove) { 5146 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 5147 amdgpu_device_ip_resume_phase1(tmp_adev); 5148 5149 goto end; 5150 } 5151 5152 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5153 if (need_full_reset) { 5154 /* post card */ 5155 r = amdgpu_device_asic_init(tmp_adev); 5156 if (r) { 5157 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5158 } else { 5159 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5160 5161 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5162 if (r) 5163 goto out; 5164 5165 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5166 5167 amdgpu_coredump(tmp_adev, vram_lost, reset_context); 5168 5169 if (vram_lost) { 5170 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5171 amdgpu_inc_vram_lost(tmp_adev); 5172 } 5173 5174 r = amdgpu_device_fw_loading(tmp_adev); 5175 if (r) 5176 return r; 5177 5178 r = amdgpu_xcp_restore_partition_mode( 5179 tmp_adev->xcp_mgr); 5180 if (r) 5181 goto out; 5182 5183 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5184 if (r) 5185 goto out; 5186 5187 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5188 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5189 5190 if (vram_lost) 5191 amdgpu_device_fill_reset_magic(tmp_adev); 5192 5193 /* 5194 * Add this ASIC as tracked as reset was already 5195 * complete successfully. 
5196 */ 5197 amdgpu_register_gpu_instance(tmp_adev); 5198 5199 if (!reset_context->hive && 5200 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5201 amdgpu_xgmi_add_device(tmp_adev); 5202 5203 r = amdgpu_device_ip_late_init(tmp_adev); 5204 if (r) 5205 goto out; 5206 5207 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5208 5209 /* 5210 * The GPU enters bad state once faulty pages 5211 * by ECC has reached the threshold, and ras 5212 * recovery is scheduled next. So add one check 5213 * here to break recovery if it indeed exceeds 5214 * bad page threshold, and remind user to 5215 * retire this GPU or setting one bigger 5216 * bad_page_threshold value to fix this once 5217 * probing driver again. 5218 */ 5219 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5220 /* must succeed. */ 5221 amdgpu_ras_resume(tmp_adev); 5222 } else { 5223 r = -EINVAL; 5224 goto out; 5225 } 5226 5227 /* Update PSP FW topology after reset */ 5228 if (reset_context->hive && 5229 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5230 r = amdgpu_xgmi_update_topology( 5231 reset_context->hive, tmp_adev); 5232 } 5233 } 5234 5235 out: 5236 if (!r) { 5237 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5238 r = amdgpu_ib_ring_tests(tmp_adev); 5239 if (r) { 5240 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5241 need_full_reset = true; 5242 r = -EAGAIN; 5243 goto end; 5244 } 5245 } 5246 5247 if (!r) 5248 r = amdgpu_device_recover_vram(tmp_adev); 5249 else 5250 tmp_adev->asic_reset_res = r; 5251 } 5252 5253 end: 5254 if (need_full_reset) 5255 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5256 else 5257 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5258 return r; 5259 } 5260 5261 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5262 { 5263 5264 switch (amdgpu_asic_reset_method(adev)) { 5265 case AMD_RESET_METHOD_MODE1: 5266 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5267 break; 5268 case AMD_RESET_METHOD_MODE2: 5269 adev->mp1_state = PP_MP1_STATE_RESET; 5270 break; 5271 default: 5272 adev->mp1_state = PP_MP1_STATE_NONE; 5273 break; 5274 } 5275 } 5276 5277 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5278 { 5279 amdgpu_vf_error_trans_all(adev); 5280 adev->mp1_state = PP_MP1_STATE_NONE; 5281 } 5282 5283 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5284 { 5285 struct pci_dev *p = NULL; 5286 5287 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5288 adev->pdev->bus->number, 1); 5289 if (p) { 5290 pm_runtime_enable(&(p->dev)); 5291 pm_runtime_resume(&(p->dev)); 5292 } 5293 5294 pci_dev_put(p); 5295 } 5296 5297 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5298 { 5299 enum amd_reset_method reset_method; 5300 struct pci_dev *p = NULL; 5301 u64 expires; 5302 5303 /* 5304 * For now, only BACO and mode1 reset are confirmed 5305 * to suffer the audio issue without proper suspended. 5306 */ 5307 reset_method = amdgpu_asic_reset_method(adev); 5308 if ((reset_method != AMD_RESET_METHOD_BACO) && 5309 (reset_method != AMD_RESET_METHOD_MODE1)) 5310 return -EINVAL; 5311 5312 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5313 adev->pdev->bus->number, 1); 5314 if (!p) 5315 return -ENODEV; 5316 5317 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5318 if (!expires) 5319 /* 5320 * If we cannot get the audio device autosuspend delay, 5321 * a fixed 4S interval will be used. Considering 3S is 5322 * the audio controller default autosuspend delay setting. 
5323 * 4S used here is guaranteed to cover that. 5324 */ 5325 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5326 5327 while (!pm_runtime_status_suspended(&(p->dev))) { 5328 if (!pm_runtime_suspend(&(p->dev))) 5329 break; 5330 5331 if (expires < ktime_get_mono_fast_ns()) { 5332 dev_warn(adev->dev, "failed to suspend display audio\n"); 5333 pci_dev_put(p); 5334 /* TODO: abort the succeeding gpu reset? */ 5335 return -ETIMEDOUT; 5336 } 5337 } 5338 5339 pm_runtime_disable(&(p->dev)); 5340 5341 pci_dev_put(p); 5342 return 0; 5343 } 5344 5345 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5346 { 5347 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5348 5349 #if defined(CONFIG_DEBUG_FS) 5350 if (!amdgpu_sriov_vf(adev)) 5351 cancel_work(&adev->reset_work); 5352 #endif 5353 5354 if (adev->kfd.dev) 5355 cancel_work(&adev->kfd.reset_work); 5356 5357 if (amdgpu_sriov_vf(adev)) 5358 cancel_work(&adev->virt.flr_work); 5359 5360 if (con && adev->ras_enabled) 5361 cancel_work(&con->recovery_work); 5362 5363 } 5364 5365 /** 5366 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5367 * 5368 * @adev: amdgpu_device pointer 5369 * @job: which job trigger hang 5370 * @reset_context: amdgpu reset context pointer 5371 * 5372 * Attempt to reset the GPU if it has hung (all asics). 5373 * Attempt to do soft-reset or full-reset and reinitialize Asic 5374 * Returns 0 for success or an error on failure. 5375 */ 5376 5377 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5378 struct amdgpu_job *job, 5379 struct amdgpu_reset_context *reset_context) 5380 { 5381 struct list_head device_list, *device_list_handle = NULL; 5382 bool job_signaled = false; 5383 struct amdgpu_hive_info *hive = NULL; 5384 struct amdgpu_device *tmp_adev = NULL; 5385 int i, r = 0; 5386 bool need_emergency_restart = false; 5387 bool audio_suspended = false; 5388 bool gpu_reset_for_dev_remove = false; 5389 5390 gpu_reset_for_dev_remove = 5391 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5392 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5393 5394 /* 5395 * Special case: RAS triggered and full reset isn't supported 5396 */ 5397 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5398 5399 /* 5400 * Flush RAM to disk so that after reboot 5401 * the user can read log and see why the system rebooted. 5402 */ 5403 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5404 amdgpu_ras_get_context(adev)->reboot) { 5405 DRM_WARN("Emergency reboot."); 5406 5407 ksys_sync_helper(); 5408 emergency_restart(); 5409 } 5410 5411 dev_info(adev->dev, "GPU %s begin!\n", 5412 need_emergency_restart ? "jobs stop":"reset"); 5413 5414 if (!amdgpu_sriov_vf(adev)) 5415 hive = amdgpu_get_xgmi_hive(adev); 5416 if (hive) 5417 mutex_lock(&hive->hive_lock); 5418 5419 reset_context->job = job; 5420 reset_context->hive = hive; 5421 /* 5422 * Build list of devices to reset. 5423 * In case we are in XGMI hive mode, resort the device list 5424 * to put adev in the 1st position. 
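 * Putting adev first means the shared reset domain below is locked (and
 * later unlocked) via the first entry, i.e. the device that triggered the
 * recovery, and that device is also the first to go through pre-reset.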
5425 */ 5426 INIT_LIST_HEAD(&device_list); 5427 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5428 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5429 list_add_tail(&tmp_adev->reset_list, &device_list); 5430 if (gpu_reset_for_dev_remove && adev->shutdown) 5431 tmp_adev->shutdown = true; 5432 } 5433 if (!list_is_first(&adev->reset_list, &device_list)) 5434 list_rotate_to_front(&adev->reset_list, &device_list); 5435 device_list_handle = &device_list; 5436 } else { 5437 list_add_tail(&adev->reset_list, &device_list); 5438 device_list_handle = &device_list; 5439 } 5440 5441 /* We need to lock reset domain only once both for XGMI and single device */ 5442 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5443 reset_list); 5444 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5445 5446 /* block all schedulers and reset given job's ring */ 5447 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5448 5449 amdgpu_device_set_mp1_state(tmp_adev); 5450 5451 /* 5452 * Try to put the audio codec into suspend state 5453 * before the gpu reset starts. 5454 * 5455 * Because the power domain of the graphics device 5456 * is shared with the AZ power domain, without this 5457 * we may change the audio hardware from behind 5458 * the audio driver's back and trigger 5459 * audio codec errors. 5460 */ 5461 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5462 audio_suspended = true; 5463 5464 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5465 5466 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5467 5468 if (!amdgpu_sriov_vf(tmp_adev)) 5469 amdgpu_amdkfd_pre_reset(tmp_adev); 5470 5471 /* 5472 * Mark these ASICs as untracked first before they are reset, 5473 * and add them back after the reset completes 5474 */ 5475 amdgpu_unregister_gpu_instance(tmp_adev); 5476 5477 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5478 5479 /* disable ras on ALL IPs */ 5480 if (!need_emergency_restart && 5481 amdgpu_device_ip_need_full_reset(tmp_adev)) 5482 amdgpu_ras_suspend(tmp_adev); 5483 5484 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5485 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5486 5487 if (!ring || !ring->sched.thread) 5488 continue; 5489 5490 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5491 5492 if (need_emergency_restart) 5493 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5494 } 5495 atomic_inc(&tmp_adev->gpu_reset_counter); 5496 } 5497 5498 if (need_emergency_restart) 5499 goto skip_sched_resume; 5500 5501 /* 5502 * Must check guilty signal here since after this point all old 5503 * HW fences are force signaled. 5504 * 5505 * job->base holds a reference to parent fence 5506 */ 5507 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5508 job_signaled = true; 5509 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5510 goto skip_hw_reset; 5511 } 5512 5513 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5514 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5515 if (gpu_reset_for_dev_remove) { 5516 /* Workaround for ASICs that need to disable SMC first */ 5517 amdgpu_device_smu_fini_early(tmp_adev); 5518 } 5519 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5520 /* TODO: Should we stop? */ 5521 if (r) { 5522 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5523 r, adev_to_drm(tmp_adev)->unique); 5524 tmp_adev->asic_reset_res = r; 5525 } 5526 5527 /* 5528 * Drop all pending non-scheduler resets.
Scheduler resets 5529 * were already dropped during drm_sched_stop 5530 */ 5531 amdgpu_device_stop_pending_resets(tmp_adev); 5532 } 5533 5534 /* Actual ASIC resets if needed. */ 5535 /* Host driver will handle XGMI hive reset for SRIOV */ 5536 if (amdgpu_sriov_vf(adev)) { 5537 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5538 if (r) 5539 adev->asic_reset_res = r; 5540 5541 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so we need to resume ras during reset */ 5542 if (amdgpu_ip_version(adev, GC_HWIP, 0) == 5543 IP_VERSION(9, 4, 2) || 5544 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5545 amdgpu_ras_resume(adev); 5546 } else { 5547 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5548 if (r && r == -EAGAIN) 5549 goto retry; 5550 5551 if (!r && gpu_reset_for_dev_remove) 5552 goto recover_end; 5553 } 5554 5555 skip_hw_reset: 5556 5557 /* Post ASIC reset for all devs. */ 5558 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5559 5560 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5561 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5562 5563 if (!ring || !ring->sched.thread) 5564 continue; 5565 5566 drm_sched_start(&ring->sched, true); 5567 } 5568 5569 if (adev->enable_mes && 5570 amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(11, 0, 3)) 5571 amdgpu_mes_self_test(tmp_adev); 5572 5573 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5574 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5575 5576 if (tmp_adev->asic_reset_res) 5577 r = tmp_adev->asic_reset_res; 5578 5579 tmp_adev->asic_reset_res = 0; 5580 5581 if (r) { 5582 /* bad news, how do we tell it to userspace? */ 5583 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5584 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5585 } else { 5586 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5587 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5588 DRM_WARN("smart shift update failed\n"); 5589 } 5590 } 5591 5592 skip_sched_resume: 5593 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5594 /* unlock kfd: SRIOV would do it separately */ 5595 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5596 amdgpu_amdkfd_post_reset(tmp_adev); 5597 5598 /* kfd_post_reset will do nothing if kfd device is not initialized, 5599 * so we need to bring up kfd here if it was not initialized before 5600 */ 5601 if (!adev->kfd.init_complete) 5602 amdgpu_amdkfd_device_init(adev); 5603 5604 if (audio_suspended) 5605 amdgpu_device_resume_display_audio(tmp_adev); 5606 5607 amdgpu_device_unset_mp1_state(tmp_adev); 5608 5609 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5610 } 5611 5612 recover_end: 5613 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5614 reset_list); 5615 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5616 5617 if (hive) { 5618 mutex_unlock(&hive->hive_lock); 5619 amdgpu_put_xgmi_hive(hive); 5620 } 5621 5622 if (r) 5623 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5624 5625 atomic_set(&adev->reset_domain->reset_res, r); 5626 return r; 5627 } 5628 5629 /** 5630 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot 5631 * 5632 * @adev: amdgpu_device pointer 5633 * 5634 * Fetches and stores in the driver the PCIE capabilities (gen speed 5635 * and lanes) of the slot the device is in.
Handles APUs and 5636 * virtualized environments where PCIE config space may not be available. 5637 */ 5638 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5639 { 5640 struct pci_dev *pdev; 5641 enum pci_bus_speed speed_cap, platform_speed_cap; 5642 enum pcie_link_width platform_link_width; 5643 5644 if (amdgpu_pcie_gen_cap) 5645 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5646 5647 if (amdgpu_pcie_lane_cap) 5648 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5649 5650 /* covers APUs as well */ 5651 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5652 if (adev->pm.pcie_gen_mask == 0) 5653 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5654 if (adev->pm.pcie_mlw_mask == 0) 5655 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5656 return; 5657 } 5658 5659 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5660 return; 5661 5662 pcie_bandwidth_available(adev->pdev, NULL, 5663 &platform_speed_cap, &platform_link_width); 5664 5665 if (adev->pm.pcie_gen_mask == 0) { 5666 /* asic caps */ 5667 pdev = adev->pdev; 5668 speed_cap = pcie_get_speed_cap(pdev); 5669 if (speed_cap == PCI_SPEED_UNKNOWN) { 5670 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5671 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5672 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5673 } else { 5674 if (speed_cap == PCIE_SPEED_32_0GT) 5675 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5676 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5677 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5678 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5679 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5680 else if (speed_cap == PCIE_SPEED_16_0GT) 5681 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5682 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5683 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5684 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5685 else if (speed_cap == PCIE_SPEED_8_0GT) 5686 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5687 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5688 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5689 else if (speed_cap == PCIE_SPEED_5_0GT) 5690 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5691 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5692 else 5693 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5694 } 5695 /* platform caps */ 5696 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5697 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5698 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5699 } else { 5700 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5701 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5702 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5703 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5704 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5705 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5706 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5707 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5708 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5709 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5710 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5711 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5712 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5713 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5714 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5715 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5716 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5717 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5718 else 5719 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5720 5721 } 5722 } 
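/* The link-width mask below is cumulative, mirroring the gen mask
 * handling above: a platform link of a given width also advertises
 * every narrower width, e.g. PCIE_LNK_X16 yields X16|X12|X8|X4|X2|X1.
 */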
	if (adev->pm.pcie_mlw_mask == 0) {
		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
		} else {
			switch (platform_link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
	}
}

/**
 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
 *
 * @adev: amdgpu_device pointer
 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
 *
 * Return true if @peer_adev can access (DMA) @adev through the PCIe
 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
 * @peer_adev.
 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
				      struct amdgpu_device *peer_adev)
{
#ifdef CONFIG_HSA_AMD_P2P
	uint64_t address_mask = peer_adev->dev->dma_mask ?
		~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
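	/*
	 * address_mask has a bit set for every address bit the peer cannot
	 * generate; both the aperture base and its last byte must stay clear
	 * of those bits for the peer to reach the whole BAR (checked below).
	 */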
	resource_size_t aper_limit =
		adev->gmc.aper_base + adev->gmc.aper_size - 1;
	bool p2p_access =
		!adev->gmc.xgmi.connected_to_cpu &&
		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);

	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
		adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
		!(adev->gmc.aper_base & address_mask ||
		  aper_limit & address_mask));
#else
	return false;
#endif
}

int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}
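/*
 * Illustrative note (sketch only, not taken from this file): BACO entry and
 * exit are meant to be used as a pair around a low-power window, roughly:
 *
 *	r = amdgpu_device_baco_enter(dev);
 *	if (r)
 *		return r;
 *	// ... device sits in BACO (bus active, chip off) ...
 *	r = amdgpu_device_baco_exit(dev);
 *
 * The runtime-PM suspend/resume callbacks in amdgpu_drv.c are the usual
 * callers when BACO is the chosen runtime power-off method.
 */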
/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}
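/*
 * Illustrative note (sketch only): the PCI error callbacks in this file
 * (error_detected, mmio_enabled, slot_reset below, and resume) are wired
 * into the PCI core through a struct pci_error_handlers table, roughly:
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};
 *
 * which the pci_driver in amdgpu_drv.c references via its .err_handler
 * member. The PCI core then invokes these callbacks in the order
 * error_detected -> (mmio_enabled) -> slot_reset -> resume during recovery.
 */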
/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}
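/*
 * Illustrative note (sketch only): amdgpu_in_reset() is the usual guard for
 * code paths that must not touch the hardware while a reset is in flight,
 * e.g.:
 *
 *	if (amdgpu_in_reset(adev))
 *		return -EPERM;
 */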
/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to maintain the error context when an error occurs.
 * Compared to a simple hang, the system will stay stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 *    clears all CPU mappings to the device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
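/*
 * Illustrative note (sketch only; the register offset and bit below are
 * placeholders): the index/data pair above is typically used for a
 * read-modify-write of a PCIe port register, e.g.:
 *
 *	u32 tmp;
 *
 *	tmp = amdgpu_device_pcie_port_rreg(adev, some_port_reg_offset);
 *	tmp |= SOME_ENABLE_BIT;
 *	amdgpu_device_pcie_port_wreg(adev, some_port_reg_offset, tmp);
 */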
/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
			uint32_t inst, uint32_t reg_addr, char reg_name[],
			uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
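/*
 * Illustrative note (sketch only; the register, name and values below are
 * placeholders): a caller typically polls a status register with the helper
 * above until a ready bit shows up, and treats a non-zero return as a
 * timeout:
 *
 *	if (amdgpu_device_wait_on_rreg(adev, 0, status_reg_offset,
 *				       "STATUS_REG", READY_BIT, READY_BIT))
 *		return -ETIMEDOUT;
 */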