1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/pci-p2pdma.h> 36 #include <linux/apple-gmux.h> 37 38 #include <drm/drm_aperture.h> 39 #include <drm/drm_atomic_helper.h> 40 #include <drm/drm_crtc_helper.h> 41 #include <drm/drm_fb_helper.h> 42 #include <drm/drm_probe_helper.h> 43 #include <drm/amdgpu_drm.h> 44 #include <linux/device.h> 45 #include <linux/vgaarb.h> 46 #include <linux/vga_switcheroo.h> 47 #include <linux/efi.h> 48 #include "amdgpu.h" 49 #include "amdgpu_trace.h" 50 #include "amdgpu_i2c.h" 51 #include "atom.h" 52 #include "amdgpu_atombios.h" 53 #include "amdgpu_atomfirmware.h" 54 #include "amd_pcie.h" 55 #ifdef CONFIG_DRM_AMDGPU_SI 56 #include "si.h" 57 #endif 58 #ifdef CONFIG_DRM_AMDGPU_CIK 59 #include "cik.h" 60 #endif 61 #include "vi.h" 62 #include "soc15.h" 63 #include "nv.h" 64 #include "bif/bif_4_1_d.h" 65 #include <linux/firmware.h> 66 #include "amdgpu_vf_error.h" 67 68 #include "amdgpu_amdkfd.h" 69 #include "amdgpu_pm.h" 70 71 #include "amdgpu_xgmi.h" 72 #include "amdgpu_ras.h" 73 #include "amdgpu_pmu.h" 74 #include "amdgpu_fru_eeprom.h" 75 #include "amdgpu_reset.h" 76 77 #include <linux/suspend.h> 78 #include <drm/task_barrier.h> 79 #include <linux/pm_runtime.h> 80 81 #include <drm/drm_drv.h> 82 83 #if IS_ENABLED(CONFIG_X86) 84 #include <asm/intel-family.h> 85 #endif 86 87 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 88 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 89 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 94 95 #define AMDGPU_RESUME_MS 2000 96 #define AMDGPU_MAX_RETRY_LIMIT 2 97 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 98 99 static const struct drm_driver amdgpu_kms_driver; 100 101 const char *amdgpu_asic_name[] = { 102 "TAHITI", 103 "PITCAIRN", 104 "VERDE", 105 "OLAND", 106 "HAINAN", 107 "BONAIRE", 108 "KAVERI", 109 "KABINI", 110 "HAWAII", 111 "MULLINS", 112 
"TOPAZ", 113 "TONGA", 114 "FIJI", 115 "CARRIZO", 116 "STONEY", 117 "POLARIS10", 118 "POLARIS11", 119 "POLARIS12", 120 "VEGAM", 121 "VEGA10", 122 "VEGA12", 123 "VEGA20", 124 "RAVEN", 125 "ARCTURUS", 126 "RENOIR", 127 "ALDEBARAN", 128 "NAVI10", 129 "CYAN_SKILLFISH", 130 "NAVI14", 131 "NAVI12", 132 "SIENNA_CICHLID", 133 "NAVY_FLOUNDER", 134 "VANGOGH", 135 "DIMGREY_CAVEFISH", 136 "BEIGE_GOBY", 137 "YELLOW_CARP", 138 "IP DISCOVERY", 139 "LAST", 140 }; 141 142 /** 143 * DOC: pcie_replay_count 144 * 145 * The amdgpu driver provides a sysfs API for reporting the total number 146 * of PCIe replays (NAKs) 147 * The file pcie_replay_count is used for this and returns the total 148 * number of replays as a sum of the NAKs generated and NAKs received 149 */ 150 151 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 152 struct device_attribute *attr, char *buf) 153 { 154 struct drm_device *ddev = dev_get_drvdata(dev); 155 struct amdgpu_device *adev = drm_to_adev(ddev); 156 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 157 158 return sysfs_emit(buf, "%llu\n", cnt); 159 } 160 161 static DEVICE_ATTR(pcie_replay_count, 0444, 162 amdgpu_device_get_pcie_replay_count, NULL); 163 164 /** 165 * DOC: board_info 166 * 167 * The amdgpu driver provides a sysfs API for giving board related information. 168 * It provides the form factor information in the format 169 * 170 * type : form factor 171 * 172 * Possible form factor values 173 * 174 * - "cem" - PCIE CEM card 175 * - "oam" - Open Compute Accelerator Module 176 * - "unknown" - Not known 177 * 178 */ 179 180 static ssize_t amdgpu_device_get_board_info(struct device *dev, 181 struct device_attribute *attr, 182 char *buf) 183 { 184 struct drm_device *ddev = dev_get_drvdata(dev); 185 struct amdgpu_device *adev = drm_to_adev(ddev); 186 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 187 const char *pkg; 188 189 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 190 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 191 192 switch (pkg_type) { 193 case AMDGPU_PKG_TYPE_CEM: 194 pkg = "cem"; 195 break; 196 case AMDGPU_PKG_TYPE_OAM: 197 pkg = "oam"; 198 break; 199 default: 200 pkg = "unknown"; 201 break; 202 } 203 204 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 205 } 206 207 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 208 209 static struct attribute *amdgpu_board_attrs[] = { 210 &dev_attr_board_info.attr, 211 NULL, 212 }; 213 214 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 215 struct attribute *attr, int n) 216 { 217 struct device *dev = kobj_to_dev(kobj); 218 struct drm_device *ddev = dev_get_drvdata(dev); 219 struct amdgpu_device *adev = drm_to_adev(ddev); 220 221 if (adev->flags & AMD_IS_APU) 222 return 0; 223 224 return attr->mode; 225 } 226 227 static const struct attribute_group amdgpu_board_attrs_group = { 228 .attrs = amdgpu_board_attrs, 229 .is_visible = amdgpu_board_attrs_is_visible 230 }; 231 232 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 233 234 235 /** 236 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 237 * 238 * @dev: drm_device pointer 239 * 240 * Returns true if the device is a dGPU with ATPX power control, 241 * otherwise return false. 
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
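 * Only the CPU-visible part of VRAM can be reached through the aperture, so
 * the count returned may be smaller than @size; amdgpu_device_vram_access()
 * below falls back to the MM_INDEX/MM_DATA path for the remainder.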
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
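 * Offsets that fall inside the MMIO BAR (@reg * 4 < adev->rmmio_size) are read
 * directly with readl(); under SR-IOV at runtime the read goes through the KIQ
 * unless AMDGPU_REGS_NO_KIQ is set in @acc_flags. Larger offsets use the
 * indirect adev->pcie_rreg() path.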
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with byte offset helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with byte offset helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
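 * Most driver code is expected to go through the RREG32()/WREG32() family of
 * macros rather than calling these helpers directly; the dispatch mirrors
 * amdgpu_device_rreg() above (direct MMIO, KIQ under SR-IOV, or the indirect
 * PCIE path for offsets beyond the MMIO BAR).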
547 */ 548 void amdgpu_device_wreg(struct amdgpu_device *adev, 549 uint32_t reg, uint32_t v, 550 uint32_t acc_flags) 551 { 552 if (amdgpu_device_skip_hw_access(adev)) 553 return; 554 555 if ((reg * 4) < adev->rmmio_size) { 556 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 557 amdgpu_sriov_runtime(adev) && 558 down_read_trylock(&adev->reset_domain->sem)) { 559 amdgpu_kiq_wreg(adev, reg, v); 560 up_read(&adev->reset_domain->sem); 561 } else { 562 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 563 } 564 } else { 565 adev->pcie_wreg(adev, reg * 4, v); 566 } 567 568 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 569 } 570 571 /** 572 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 573 * 574 * @adev: amdgpu_device pointer 575 * @reg: mmio/rlc register 576 * @v: value to write 577 * @xcc_id: xcc accelerated compute core id 578 * 579 * this function is invoked only for the debugfs register access 580 */ 581 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 582 uint32_t reg, uint32_t v, 583 uint32_t xcc_id) 584 { 585 if (amdgpu_device_skip_hw_access(adev)) 586 return; 587 588 if (amdgpu_sriov_fullaccess(adev) && 589 adev->gfx.rlc.funcs && 590 adev->gfx.rlc.funcs->is_rlcg_access_range) { 591 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 592 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 593 } else if ((reg * 4) >= adev->rmmio_size) { 594 adev->pcie_wreg(adev, reg * 4, v); 595 } else { 596 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 597 } 598 } 599 600 /** 601 * amdgpu_device_indirect_rreg - read an indirect register 602 * 603 * @adev: amdgpu_device pointer 604 * @reg_addr: indirect register address to read from 605 * 606 * Returns the value of indirect register @reg_addr 607 */ 608 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 609 u32 reg_addr) 610 { 611 unsigned long flags, pcie_index, pcie_data; 612 void __iomem *pcie_index_offset; 613 void __iomem *pcie_data_offset; 614 u32 r; 615 616 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 617 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 618 619 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 620 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 621 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 622 623 writel(reg_addr, pcie_index_offset); 624 readl(pcie_index_offset); 625 r = readl(pcie_data_offset); 626 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 627 628 return r; 629 } 630 631 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 632 u64 reg_addr) 633 { 634 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 635 u32 r; 636 void __iomem *pcie_index_offset; 637 void __iomem *pcie_index_hi_offset; 638 void __iomem *pcie_data_offset; 639 640 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 641 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 642 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 643 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 644 else 645 pcie_index_hi = 0; 646 647 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 648 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 649 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 650 if (pcie_index_hi != 0) 651 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 652 pcie_index_hi * 4; 653 654 writel(reg_addr, pcie_index_offset); 655 readl(pcie_index_offset); 656 if (pcie_index_hi != 0) { 657 writel((reg_addr >> 
32) & 0xff, pcie_index_hi_offset); 658 readl(pcie_index_hi_offset); 659 } 660 r = readl(pcie_data_offset); 661 662 /* clear the high bits */ 663 if (pcie_index_hi != 0) { 664 writel(0, pcie_index_hi_offset); 665 readl(pcie_index_hi_offset); 666 } 667 668 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 669 670 return r; 671 } 672 673 /** 674 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 675 * 676 * @adev: amdgpu_device pointer 677 * @reg_addr: indirect register address to read from 678 * 679 * Returns the value of indirect register @reg_addr 680 */ 681 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 682 u32 reg_addr) 683 { 684 unsigned long flags, pcie_index, pcie_data; 685 void __iomem *pcie_index_offset; 686 void __iomem *pcie_data_offset; 687 u64 r; 688 689 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 690 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 691 692 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 693 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 694 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 695 696 /* read low 32 bits */ 697 writel(reg_addr, pcie_index_offset); 698 readl(pcie_index_offset); 699 r = readl(pcie_data_offset); 700 /* read high 32 bits */ 701 writel(reg_addr + 4, pcie_index_offset); 702 readl(pcie_index_offset); 703 r |= ((u64)readl(pcie_data_offset) << 32); 704 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 705 706 return r; 707 } 708 709 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 710 u64 reg_addr) 711 { 712 unsigned long flags, pcie_index, pcie_data; 713 unsigned long pcie_index_hi = 0; 714 void __iomem *pcie_index_offset; 715 void __iomem *pcie_index_hi_offset; 716 void __iomem *pcie_data_offset; 717 u64 r; 718 719 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 720 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 721 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 722 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 723 724 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 725 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 726 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 727 if (pcie_index_hi != 0) 728 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 729 pcie_index_hi * 4; 730 731 /* read low 32 bits */ 732 writel(reg_addr, pcie_index_offset); 733 readl(pcie_index_offset); 734 if (pcie_index_hi != 0) { 735 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 736 readl(pcie_index_hi_offset); 737 } 738 r = readl(pcie_data_offset); 739 /* read high 32 bits */ 740 writel(reg_addr + 4, pcie_index_offset); 741 readl(pcie_index_offset); 742 if (pcie_index_hi != 0) { 743 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 744 readl(pcie_index_hi_offset); 745 } 746 r |= ((u64)readl(pcie_data_offset) << 32); 747 748 /* clear the high bits */ 749 if (pcie_index_hi != 0) { 750 writel(0, pcie_index_hi_offset); 751 readl(pcie_index_hi_offset); 752 } 753 754 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 755 756 return r; 757 } 758 759 /** 760 * amdgpu_device_indirect_wreg - write an indirect register address 761 * 762 * @adev: amdgpu_device pointer 763 * @reg_addr: indirect register offset 764 * @reg_data: indirect register data 765 * 766 */ 767 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 768 u32 reg_addr, u32 reg_data) 769 { 770 unsigned long flags, pcie_index, pcie_data; 771 void __iomem *pcie_index_offset; 772 
void __iomem *pcie_data_offset; 773 774 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 775 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 776 777 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 778 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 779 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 780 781 writel(reg_addr, pcie_index_offset); 782 readl(pcie_index_offset); 783 writel(reg_data, pcie_data_offset); 784 readl(pcie_data_offset); 785 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 786 } 787 788 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 789 u64 reg_addr, u32 reg_data) 790 { 791 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 792 void __iomem *pcie_index_offset; 793 void __iomem *pcie_index_hi_offset; 794 void __iomem *pcie_data_offset; 795 796 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 797 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 798 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 799 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 800 else 801 pcie_index_hi = 0; 802 803 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 804 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 805 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 806 if (pcie_index_hi != 0) 807 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 808 pcie_index_hi * 4; 809 810 writel(reg_addr, pcie_index_offset); 811 readl(pcie_index_offset); 812 if (pcie_index_hi != 0) { 813 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 814 readl(pcie_index_hi_offset); 815 } 816 writel(reg_data, pcie_data_offset); 817 readl(pcie_data_offset); 818 819 /* clear the high bits */ 820 if (pcie_index_hi != 0) { 821 writel(0, pcie_index_hi_offset); 822 readl(pcie_index_hi_offset); 823 } 824 825 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 826 } 827 828 /** 829 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 830 * 831 * @adev: amdgpu_device pointer 832 * @reg_addr: indirect register offset 833 * @reg_data: indirect register data 834 * 835 */ 836 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 837 u32 reg_addr, u64 reg_data) 838 { 839 unsigned long flags, pcie_index, pcie_data; 840 void __iomem *pcie_index_offset; 841 void __iomem *pcie_data_offset; 842 843 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 844 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 845 846 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 847 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 848 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 849 850 /* write low 32 bits */ 851 writel(reg_addr, pcie_index_offset); 852 readl(pcie_index_offset); 853 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 854 readl(pcie_data_offset); 855 /* write high 32 bits */ 856 writel(reg_addr + 4, pcie_index_offset); 857 readl(pcie_index_offset); 858 writel((u32)(reg_data >> 32), pcie_data_offset); 859 readl(pcie_data_offset); 860 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 861 } 862 863 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 864 u64 reg_addr, u64 reg_data) 865 { 866 unsigned long flags, pcie_index, pcie_data; 867 unsigned long pcie_index_hi = 0; 868 void __iomem *pcie_index_offset; 869 void __iomem *pcie_index_hi_offset; 870 void __iomem *pcie_data_offset; 871 872 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 873 pcie_data = 
adev->nbio.funcs->get_pcie_data_offset(adev); 874 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 875 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 876 877 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 878 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 879 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 880 if (pcie_index_hi != 0) 881 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 882 pcie_index_hi * 4; 883 884 /* write low 32 bits */ 885 writel(reg_addr, pcie_index_offset); 886 readl(pcie_index_offset); 887 if (pcie_index_hi != 0) { 888 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 889 readl(pcie_index_hi_offset); 890 } 891 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 892 readl(pcie_data_offset); 893 /* write high 32 bits */ 894 writel(reg_addr + 4, pcie_index_offset); 895 readl(pcie_index_offset); 896 if (pcie_index_hi != 0) { 897 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 898 readl(pcie_index_hi_offset); 899 } 900 writel((u32)(reg_data >> 32), pcie_data_offset); 901 readl(pcie_data_offset); 902 903 /* clear the high bits */ 904 if (pcie_index_hi != 0) { 905 writel(0, pcie_index_hi_offset); 906 readl(pcie_index_hi_offset); 907 } 908 909 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 910 } 911 912 /** 913 * amdgpu_device_get_rev_id - query device rev_id 914 * 915 * @adev: amdgpu_device pointer 916 * 917 * Return device rev_id 918 */ 919 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 920 { 921 return adev->nbio.funcs->get_rev_id(adev); 922 } 923 924 /** 925 * amdgpu_invalid_rreg - dummy reg read function 926 * 927 * @adev: amdgpu_device pointer 928 * @reg: offset of register 929 * 930 * Dummy register read function. Used for register blocks 931 * that certain asics don't have (all asics). 932 * Returns the value in the register. 933 */ 934 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 935 { 936 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 937 BUG(); 938 return 0; 939 } 940 941 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 942 { 943 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 944 BUG(); 945 return 0; 946 } 947 948 /** 949 * amdgpu_invalid_wreg - dummy reg write function 950 * 951 * @adev: amdgpu_device pointer 952 * @reg: offset of register 953 * @v: value to write to the register 954 * 955 * Dummy register read function. Used for register blocks 956 * that certain asics don't have (all asics). 957 */ 958 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 959 { 960 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 961 reg, v); 962 BUG(); 963 } 964 965 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 966 { 967 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 968 reg, v); 969 BUG(); 970 } 971 972 /** 973 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 974 * 975 * @adev: amdgpu_device pointer 976 * @reg: offset of register 977 * 978 * Dummy register read function. Used for register blocks 979 * that certain asics don't have (all asics). 980 * Returns the value in the register. 
981 */ 982 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 983 { 984 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 985 BUG(); 986 return 0; 987 } 988 989 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) 990 { 991 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 992 BUG(); 993 return 0; 994 } 995 996 /** 997 * amdgpu_invalid_wreg64 - dummy reg write function 998 * 999 * @adev: amdgpu_device pointer 1000 * @reg: offset of register 1001 * @v: value to write to the register 1002 * 1003 * Dummy register read function. Used for register blocks 1004 * that certain asics don't have (all asics). 1005 */ 1006 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 1007 { 1008 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 1009 reg, v); 1010 BUG(); 1011 } 1012 1013 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) 1014 { 1015 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", 1016 reg, v); 1017 BUG(); 1018 } 1019 1020 /** 1021 * amdgpu_block_invalid_rreg - dummy reg read function 1022 * 1023 * @adev: amdgpu_device pointer 1024 * @block: offset of instance 1025 * @reg: offset of register 1026 * 1027 * Dummy register read function. Used for register blocks 1028 * that certain asics don't have (all asics). 1029 * Returns the value in the register. 1030 */ 1031 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1032 uint32_t block, uint32_t reg) 1033 { 1034 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 1035 reg, block); 1036 BUG(); 1037 return 0; 1038 } 1039 1040 /** 1041 * amdgpu_block_invalid_wreg - dummy reg write function 1042 * 1043 * @adev: amdgpu_device pointer 1044 * @block: offset of instance 1045 * @reg: offset of register 1046 * @v: value to write to the register 1047 * 1048 * Dummy register read function. Used for register blocks 1049 * that certain asics don't have (all asics). 1050 */ 1051 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1052 uint32_t block, 1053 uint32_t reg, uint32_t v) 1054 { 1055 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1056 reg, block, v); 1057 BUG(); 1058 } 1059 1060 /** 1061 * amdgpu_device_asic_init - Wrapper for atom asic_init 1062 * 1063 * @adev: amdgpu_device pointer 1064 * 1065 * Does any asic specific work and then calls atom asic init. 1066 */ 1067 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1068 { 1069 int ret; 1070 1071 amdgpu_asic_pre_asic_init(adev); 1072 1073 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1074 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1075 amdgpu_psp_wait_for_bootloader(adev); 1076 ret = amdgpu_atomfirmware_asic_init(adev, true); 1077 return ret; 1078 } else { 1079 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1080 } 1081 1082 return 0; 1083 } 1084 1085 /** 1086 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1087 * 1088 * @adev: amdgpu_device pointer 1089 * 1090 * Allocates a scratch page of VRAM for use by various things in the 1091 * driver. 
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with and/or masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
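 * The backing buffer holds AMDGPU_MAX_WB slots of 256 bits (8 dwords) each,
 * which is why amdgpu_device_wb_get() below converts a slot index into a
 * dword offset by shifting it left by 3.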
1211 */ 1212 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1213 { 1214 int r; 1215 1216 if (adev->wb.wb_obj == NULL) { 1217 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1218 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1219 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1220 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1221 (void **)&adev->wb.wb); 1222 if (r) { 1223 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1224 return r; 1225 } 1226 1227 adev->wb.num_wb = AMDGPU_MAX_WB; 1228 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1229 1230 /* clear wb memory */ 1231 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1232 } 1233 1234 return 0; 1235 } 1236 1237 /** 1238 * amdgpu_device_wb_get - Allocate a wb entry 1239 * 1240 * @adev: amdgpu_device pointer 1241 * @wb: wb index 1242 * 1243 * Allocate a wb slot for use by the driver (all asics). 1244 * Returns 0 on success or -EINVAL on failure. 1245 */ 1246 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1247 { 1248 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1249 1250 if (offset < adev->wb.num_wb) { 1251 __set_bit(offset, adev->wb.used); 1252 *wb = offset << 3; /* convert to dw offset */ 1253 return 0; 1254 } else { 1255 return -EINVAL; 1256 } 1257 } 1258 1259 /** 1260 * amdgpu_device_wb_free - Free a wb entry 1261 * 1262 * @adev: amdgpu_device pointer 1263 * @wb: wb index 1264 * 1265 * Free a wb slot allocated for use by the driver (all asics) 1266 */ 1267 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1268 { 1269 wb >>= 3; 1270 if (wb < adev->wb.num_wb) 1271 __clear_bit(wb, adev->wb.used); 1272 } 1273 1274 /** 1275 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1276 * 1277 * @adev: amdgpu_device pointer 1278 * 1279 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1280 * to fail, but if any of the BARs is not accessible after the size we abort 1281 * driver loading by returning -ENODEV. 1282 */ 1283 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1284 { 1285 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1286 struct pci_bus *root; 1287 struct resource *res; 1288 unsigned int i; 1289 u16 cmd; 1290 int r; 1291 1292 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1293 return 0; 1294 1295 /* Bypass for VF */ 1296 if (amdgpu_sriov_vf(adev)) 1297 return 0; 1298 1299 /* skip if the bios has already enabled large BAR */ 1300 if (adev->gmc.real_vram_size && 1301 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1302 return 0; 1303 1304 /* Check if the root BUS has 64bit memory resources */ 1305 root = adev->pdev->bus; 1306 while (root->parent) 1307 root = root->parent; 1308 1309 pci_bus_for_each_resource(root, res, i) { 1310 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1311 res->start > 0x100000000ull) 1312 break; 1313 } 1314 1315 /* Trying to resize is pointless without a root hub window above 4GB */ 1316 if (!res) 1317 return 0; 1318 1319 /* Limit the BAR size to what is available */ 1320 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1321 rbar_size); 1322 1323 /* Disable memory decoding while we change the BAR addresses and size */ 1324 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1325 pci_write_config_word(adev->pdev, PCI_COMMAND, 1326 cmd & ~PCI_COMMAND_MEMORY); 1327 1328 /* Free the VRAM and doorbell BAR, we most likely need to move both. 
*/ 1329 amdgpu_doorbell_fini(adev); 1330 if (adev->asic_type >= CHIP_BONAIRE) 1331 pci_release_resource(adev->pdev, 2); 1332 1333 pci_release_resource(adev->pdev, 0); 1334 1335 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1336 if (r == -ENOSPC) 1337 DRM_INFO("Not enough PCI address space for a large BAR."); 1338 else if (r && r != -ENOTSUPP) 1339 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1340 1341 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1342 1343 /* When the doorbell or fb BAR isn't available we have no chance of 1344 * using the device. 1345 */ 1346 r = amdgpu_doorbell_init(adev); 1347 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1348 return -ENODEV; 1349 1350 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1351 1352 return 0; 1353 } 1354 1355 static bool amdgpu_device_read_bios(struct amdgpu_device *adev) 1356 { 1357 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1358 return false; 1359 1360 return true; 1361 } 1362 1363 /* 1364 * GPU helpers function. 1365 */ 1366 /** 1367 * amdgpu_device_need_post - check if the hw need post or not 1368 * 1369 * @adev: amdgpu_device pointer 1370 * 1371 * Check if the asic has been initialized (all asics) at driver startup 1372 * or post is needed if hw reset is performed. 1373 * Returns true if need or false if not. 1374 */ 1375 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1376 { 1377 uint32_t reg; 1378 1379 if (amdgpu_sriov_vf(adev)) 1380 return false; 1381 1382 if (!amdgpu_device_read_bios(adev)) 1383 return false; 1384 1385 if (amdgpu_passthrough(adev)) { 1386 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1387 * some old smc fw still need driver do vPost otherwise gpu hang, while 1388 * those smc fw version above 22.15 doesn't have this flaw, so we force 1389 * vpost executed for smc version below 22.15 1390 */ 1391 if (adev->asic_type == CHIP_FIJI) { 1392 int err; 1393 uint32_t fw_ver; 1394 1395 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1396 /* force vPost if error occured */ 1397 if (err) 1398 return true; 1399 1400 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1401 if (fw_ver < 0x00160e00) 1402 return true; 1403 } 1404 } 1405 1406 /* Don't post if we need to reset whole hive on init */ 1407 if (adev->gmc.xgmi.pending_reset) 1408 return false; 1409 1410 if (adev->has_hw_reset) { 1411 adev->has_hw_reset = false; 1412 return true; 1413 } 1414 1415 /* bios scratch used on CIK+ */ 1416 if (adev->asic_type >= CHIP_BONAIRE) 1417 return amdgpu_atombios_scratch_need_asic_init(adev); 1418 1419 /* check MEM_SIZE for older asics */ 1420 reg = amdgpu_asic_get_config_memsize(adev); 1421 1422 if ((reg != 0) && (reg != 0xffffffff)) 1423 return false; 1424 1425 return true; 1426 } 1427 1428 /* 1429 * Check whether seamless boot is supported. 1430 * 1431 * So far we only support seamless boot on DCE 3.0 or later. 1432 * If users report that it works on older ASICS as well, we may 1433 * loosen this. 
1434 */ 1435 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1436 { 1437 switch (amdgpu_seamless) { 1438 case -1: 1439 break; 1440 case 1: 1441 return true; 1442 case 0: 1443 return false; 1444 default: 1445 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n", 1446 amdgpu_seamless); 1447 return false; 1448 } 1449 1450 if (!(adev->flags & AMD_IS_APU)) 1451 return false; 1452 1453 if (adev->mman.keep_stolen_vga_memory) 1454 return false; 1455 1456 return adev->ip_versions[DCE_HWIP][0] >= IP_VERSION(3, 0, 0); 1457 } 1458 1459 /* 1460 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1461 * don't support dynamic speed switching. Until we have confirmation from Intel 1462 * that a specific host supports it, it's safer that we keep it disabled for all. 1463 * 1464 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1465 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1466 */ 1467 static bool amdgpu_device_pcie_dynamic_switching_supported(void) 1468 { 1469 #if IS_ENABLED(CONFIG_X86) 1470 struct cpuinfo_x86 *c = &cpu_data(0); 1471 1472 if (c->x86_vendor == X86_VENDOR_INTEL) 1473 return false; 1474 #endif 1475 return true; 1476 } 1477 1478 /** 1479 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1480 * 1481 * @adev: amdgpu_device pointer 1482 * 1483 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1484 * be set for this device. 1485 * 1486 * Returns true if it should be used or false if not. 1487 */ 1488 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1489 { 1490 switch (amdgpu_aspm) { 1491 case -1: 1492 break; 1493 case 0: 1494 return false; 1495 case 1: 1496 return true; 1497 default: 1498 return false; 1499 } 1500 if (adev->flags & AMD_IS_APU) 1501 return false; 1502 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) 1503 return false; 1504 return pcie_aspm_enabled(adev->pdev); 1505 } 1506 1507 /* if we get transitioned to only one device, take VGA back */ 1508 /** 1509 * amdgpu_device_vga_set_decode - enable/disable vga decode 1510 * 1511 * @pdev: PCI device pointer 1512 * @state: enable/disable vga decode 1513 * 1514 * Enable/disable vga decode (all asics). 1515 * Returns VGA resource flags. 1516 */ 1517 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1518 bool state) 1519 { 1520 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1521 1522 amdgpu_asic_set_vga_state(adev, state); 1523 if (state) 1524 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1525 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1526 else 1527 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1528 } 1529 1530 /** 1531 * amdgpu_device_check_block_size - validate the vm block size 1532 * 1533 * @adev: amdgpu_device pointer 1534 * 1535 * Validates the vm block size specified via module parameter. 1536 * The vm block size defines number of bits in page table versus page directory, 1537 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1538 * page table and the remaining bits are in the page directory. 
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
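 * Invalid values are either clamped or reset to the default (-1). For
 * example, amdgpu_sched_jobs below is raised to at least 4 and rounded up to
 * a power of two, and out-of-range VM fragment sizes fall back to the default.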
1661 */ 1662 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1663 { 1664 if (amdgpu_sched_jobs < 4) { 1665 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1666 amdgpu_sched_jobs); 1667 amdgpu_sched_jobs = 4; 1668 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1669 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1670 amdgpu_sched_jobs); 1671 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1672 } 1673 1674 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1675 /* gart size must be greater or equal to 32M */ 1676 dev_warn(adev->dev, "gart size (%d) too small\n", 1677 amdgpu_gart_size); 1678 amdgpu_gart_size = -1; 1679 } 1680 1681 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1682 /* gtt size must be greater or equal to 32M */ 1683 dev_warn(adev->dev, "gtt size (%d) too small\n", 1684 amdgpu_gtt_size); 1685 amdgpu_gtt_size = -1; 1686 } 1687 1688 /* valid range is between 4 and 9 inclusive */ 1689 if (amdgpu_vm_fragment_size != -1 && 1690 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1691 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1692 amdgpu_vm_fragment_size = -1; 1693 } 1694 1695 if (amdgpu_sched_hw_submission < 2) { 1696 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1697 amdgpu_sched_hw_submission); 1698 amdgpu_sched_hw_submission = 2; 1699 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1700 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1701 amdgpu_sched_hw_submission); 1702 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1703 } 1704 1705 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1706 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1707 amdgpu_reset_method = -1; 1708 } 1709 1710 amdgpu_device_check_smu_prv_buffer_size(adev); 1711 1712 amdgpu_device_check_vm_size(adev); 1713 1714 amdgpu_device_check_block_size(adev); 1715 1716 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1717 1718 return 0; 1719 } 1720 1721 /** 1722 * amdgpu_switcheroo_set_state - set switcheroo state 1723 * 1724 * @pdev: pci dev pointer 1725 * @state: vga_switcheroo state 1726 * 1727 * Callback for the switcheroo driver. Suspends or resumes 1728 * the asics before or after it is powered up using ACPI methods. 
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_prepare(dev);
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Checks whether the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
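 * IP blocks that do not implement the set_powergating_state hook are simply
 * skipped, as in the clockgating variant above.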
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;
}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;
}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
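 * amdgpu_device_ip_block_version_cmp() below builds on this lookup to check
 * whether an IP block is present at a minimum major/minor version.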
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * Return 0 if the IP block version is equal or greater than the requested
 * version, 1 if it is smaller or the ip_block doesn't exist.
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
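 * For example (illustrative values only), a string such as "0000:26:00.0,2"
 * enables the feature on that device with two virtual crtcs, "all" enables it
 * on every device, and multiple "address,crtc_count" entries can be chained
 * with semicolons, matching the parsing done below.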
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
		adev->mode_info.num_crtc = 1;
		adev->enable_virtual_display = true;
		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
			 adev->enable_virtual_display, adev->mode_info.num_crtc);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
		 */
		if (adev->asic_type != CHIP_NAVI12)
			return 0;
	}

	switch (adev->asic_type) {
	default:
		return 0;
	case CHIP_VEGA10:
		chip_name = "vega10";
		break;
	case CHIP_VEGA12:
		chip_name = "vega12";
		break;
	case CHIP_RAVEN:
		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
			chip_name = "raven2";
		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
			chip_name = "picasso";
		else
			chip_name = "raven";
		break;
	case CHIP_ARCTURUS:
		chip_name = "arcturus";
		break;
	case CHIP_NAVI12:
		chip_name = "navi12";
		break;
	}

	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
	err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
	if (err) {
		dev_err(adev->dev,
			"Failed to get gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}

	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);

	switch (hdr->version_major) {
	case 1:
	{
		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
								le32_to_cpu(hdr->header.ucode_array_offset_bytes));

		/*
		 * Should be dropped when DAL no longer needs it.
		 */
		if (adev->asic_type == CHIP_NAVI12)
			goto parse_soc_bounding_box;

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * soc bounding box info is not integrated into the discovery
		 * table, so we always need to parse it from the gpu info
		 * firmware if needed.
2196 */ 2197 if (hdr->version_minor == 2) { 2198 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2199 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2200 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2201 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2202 } 2203 break; 2204 } 2205 default: 2206 dev_err(adev->dev, 2207 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2208 err = -EINVAL; 2209 goto out; 2210 } 2211 out: 2212 return err; 2213 } 2214 2215 /** 2216 * amdgpu_device_ip_early_init - run early init for hardware IPs 2217 * 2218 * @adev: amdgpu_device pointer 2219 * 2220 * Early initialization pass for hardware IPs. The hardware IPs that make 2221 * up each asic are discovered each IP's early_init callback is run. This 2222 * is the first stage in initializing the asic. 2223 * Returns 0 on success, negative error code on failure. 2224 */ 2225 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2226 { 2227 struct pci_dev *parent; 2228 int i, r; 2229 bool total; 2230 2231 amdgpu_device_enable_virtual_display(adev); 2232 2233 if (amdgpu_sriov_vf(adev)) { 2234 r = amdgpu_virt_request_full_gpu(adev, true); 2235 if (r) 2236 return r; 2237 } 2238 2239 switch (adev->asic_type) { 2240 #ifdef CONFIG_DRM_AMDGPU_SI 2241 case CHIP_VERDE: 2242 case CHIP_TAHITI: 2243 case CHIP_PITCAIRN: 2244 case CHIP_OLAND: 2245 case CHIP_HAINAN: 2246 adev->family = AMDGPU_FAMILY_SI; 2247 r = si_set_ip_blocks(adev); 2248 if (r) 2249 return r; 2250 break; 2251 #endif 2252 #ifdef CONFIG_DRM_AMDGPU_CIK 2253 case CHIP_BONAIRE: 2254 case CHIP_HAWAII: 2255 case CHIP_KAVERI: 2256 case CHIP_KABINI: 2257 case CHIP_MULLINS: 2258 if (adev->flags & AMD_IS_APU) 2259 adev->family = AMDGPU_FAMILY_KV; 2260 else 2261 adev->family = AMDGPU_FAMILY_CI; 2262 2263 r = cik_set_ip_blocks(adev); 2264 if (r) 2265 return r; 2266 break; 2267 #endif 2268 case CHIP_TOPAZ: 2269 case CHIP_TONGA: 2270 case CHIP_FIJI: 2271 case CHIP_POLARIS10: 2272 case CHIP_POLARIS11: 2273 case CHIP_POLARIS12: 2274 case CHIP_VEGAM: 2275 case CHIP_CARRIZO: 2276 case CHIP_STONEY: 2277 if (adev->flags & AMD_IS_APU) 2278 adev->family = AMDGPU_FAMILY_CZ; 2279 else 2280 adev->family = AMDGPU_FAMILY_VI; 2281 2282 r = vi_set_ip_blocks(adev); 2283 if (r) 2284 return r; 2285 break; 2286 default: 2287 r = amdgpu_discovery_set_ip_blocks(adev); 2288 if (r) 2289 return r; 2290 break; 2291 } 2292 2293 if (amdgpu_has_atpx() && 2294 (amdgpu_is_atpx_hybrid() || 2295 amdgpu_has_atpx_dgpu_power_cntl()) && 2296 ((adev->flags & AMD_IS_APU) == 0) && 2297 !dev_is_removable(&adev->pdev->dev)) 2298 adev->flags |= AMD_IS_PX; 2299 2300 if (!(adev->flags & AMD_IS_APU)) { 2301 parent = pcie_find_root_port(adev->pdev); 2302 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2303 } 2304 2305 2306 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2307 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2308 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2309 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2310 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2311 if (!amdgpu_device_pcie_dynamic_switching_supported()) 2312 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2313 2314 total = true; 2315 for (i = 0; i < adev->num_ip_blocks; i++) { 2316 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2317 DRM_WARN("disabled ip block: %d <%s>\n", 2318 i, adev->ip_blocks[i].version->funcs->name); 2319 adev->ip_blocks[i].status.valid = false; 2320 } else { 2321 if (adev->ip_blocks[i].version->funcs->early_init) { 2322 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2323 if (r == -ENOENT) { 2324 adev->ip_blocks[i].status.valid = false; 2325 } else if (r) { 2326 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2327 adev->ip_blocks[i].version->funcs->name, r); 2328 total = false; 2329 } else { 2330 adev->ip_blocks[i].status.valid = true; 2331 } 2332 } else { 2333 adev->ip_blocks[i].status.valid = true; 2334 } 2335 } 2336 /* get the vbios after the asic_funcs are set up */ 2337 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2338 r = amdgpu_device_parse_gpu_info_fw(adev); 2339 if (r) 2340 return r; 2341 2342 /* Read BIOS */ 2343 if (amdgpu_device_read_bios(adev)) { 2344 if (!amdgpu_get_bios(adev)) 2345 return -EINVAL; 2346 2347 r = amdgpu_atombios_init(adev); 2348 if (r) { 2349 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2350 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2351 return r; 2352 } 2353 } 2354 2355 /*get pf2vf msg info at it's earliest time*/ 2356 if (amdgpu_sriov_vf(adev)) 2357 amdgpu_virt_init_data_exchange(adev); 2358 2359 } 2360 } 2361 if (!total) 2362 return -ENODEV; 2363 2364 amdgpu_amdkfd_device_probe(adev); 2365 adev->cg_flags &= amdgpu_cg_mask; 2366 adev->pg_flags &= amdgpu_pg_mask; 2367 2368 return 0; 2369 } 2370 2371 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2372 { 2373 int i, r; 2374 2375 for (i = 0; i < adev->num_ip_blocks; i++) { 2376 if (!adev->ip_blocks[i].status.sw) 2377 continue; 2378 if (adev->ip_blocks[i].status.hw) 2379 continue; 2380 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2381 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2382 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2383 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2384 if (r) { 2385 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2386 adev->ip_blocks[i].version->funcs->name, r); 2387 return r; 2388 } 2389 adev->ip_blocks[i].status.hw = true; 2390 } 2391 } 2392 2393 return 0; 2394 } 2395 2396 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2397 { 2398 int i, r; 2399 2400 for (i = 0; i < adev->num_ip_blocks; i++) { 2401 if (!adev->ip_blocks[i].status.sw) 2402 continue; 2403 if (adev->ip_blocks[i].status.hw) 2404 continue; 2405 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2406 if (r) { 2407 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2408 adev->ip_blocks[i].version->funcs->name, r); 2409 return r; 2410 } 2411 adev->ip_blocks[i].status.hw = true; 2412 } 2413 2414 return 0; 2415 } 2416 2417 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2418 { 2419 int r = 0; 2420 int i; 2421 uint32_t 
smu_version; 2422 2423 if (adev->asic_type >= CHIP_VEGA10) { 2424 for (i = 0; i < adev->num_ip_blocks; i++) { 2425 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2426 continue; 2427 2428 if (!adev->ip_blocks[i].status.sw) 2429 continue; 2430 2431 /* no need to do the fw loading again if already done*/ 2432 if (adev->ip_blocks[i].status.hw == true) 2433 break; 2434 2435 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2436 r = adev->ip_blocks[i].version->funcs->resume(adev); 2437 if (r) { 2438 DRM_ERROR("resume of IP block <%s> failed %d\n", 2439 adev->ip_blocks[i].version->funcs->name, r); 2440 return r; 2441 } 2442 } else { 2443 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2444 if (r) { 2445 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2446 adev->ip_blocks[i].version->funcs->name, r); 2447 return r; 2448 } 2449 } 2450 2451 adev->ip_blocks[i].status.hw = true; 2452 break; 2453 } 2454 } 2455 2456 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2457 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2458 2459 return r; 2460 } 2461 2462 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2463 { 2464 long timeout; 2465 int r, i; 2466 2467 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2468 struct amdgpu_ring *ring = adev->rings[i]; 2469 2470 /* No need to setup the GPU scheduler for rings that don't need it */ 2471 if (!ring || ring->no_scheduler) 2472 continue; 2473 2474 switch (ring->funcs->type) { 2475 case AMDGPU_RING_TYPE_GFX: 2476 timeout = adev->gfx_timeout; 2477 break; 2478 case AMDGPU_RING_TYPE_COMPUTE: 2479 timeout = adev->compute_timeout; 2480 break; 2481 case AMDGPU_RING_TYPE_SDMA: 2482 timeout = adev->sdma_timeout; 2483 break; 2484 default: 2485 timeout = adev->video_timeout; 2486 break; 2487 } 2488 2489 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2490 DRM_SCHED_PRIORITY_COUNT, 2491 ring->num_hw_submission, 0, 2492 timeout, adev->reset_domain->wq, 2493 ring->sched_score, ring->name, 2494 adev->dev); 2495 if (r) { 2496 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2497 ring->name); 2498 return r; 2499 } 2500 } 2501 2502 amdgpu_xcp_update_partition_sched_list(adev); 2503 2504 return 0; 2505 } 2506 2507 2508 /** 2509 * amdgpu_device_ip_init - run init for hardware IPs 2510 * 2511 * @adev: amdgpu_device pointer 2512 * 2513 * Main initialization pass for hardware IPs. The list of all the hardware 2514 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2515 * are run. sw_init initializes the software state associated with each IP 2516 * and hw_init initializes the hardware associated with each IP. 2517 * Returns 0 on success, negative error code on failure. 
2518 */ 2519 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2520 { 2521 int i, r; 2522 2523 r = amdgpu_ras_init(adev); 2524 if (r) 2525 return r; 2526 2527 for (i = 0; i < adev->num_ip_blocks; i++) { 2528 if (!adev->ip_blocks[i].status.valid) 2529 continue; 2530 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2531 if (r) { 2532 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2533 adev->ip_blocks[i].version->funcs->name, r); 2534 goto init_failed; 2535 } 2536 adev->ip_blocks[i].status.sw = true; 2537 2538 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2539 /* need to do common hw init early so everything is set up for gmc */ 2540 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2541 if (r) { 2542 DRM_ERROR("hw_init %d failed %d\n", i, r); 2543 goto init_failed; 2544 } 2545 adev->ip_blocks[i].status.hw = true; 2546 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2547 /* need to do gmc hw init early so we can allocate gpu mem */ 2548 /* Try to reserve bad pages early */ 2549 if (amdgpu_sriov_vf(adev)) 2550 amdgpu_virt_exchange_data(adev); 2551 2552 r = amdgpu_device_mem_scratch_init(adev); 2553 if (r) { 2554 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2555 goto init_failed; 2556 } 2557 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2558 if (r) { 2559 DRM_ERROR("hw_init %d failed %d\n", i, r); 2560 goto init_failed; 2561 } 2562 r = amdgpu_device_wb_init(adev); 2563 if (r) { 2564 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2565 goto init_failed; 2566 } 2567 adev->ip_blocks[i].status.hw = true; 2568 2569 /* right after GMC hw init, we create CSA */ 2570 if (adev->gfx.mcbp) { 2571 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2572 AMDGPU_GEM_DOMAIN_VRAM | 2573 AMDGPU_GEM_DOMAIN_GTT, 2574 AMDGPU_CSA_SIZE); 2575 if (r) { 2576 DRM_ERROR("allocate CSA failed %d\n", r); 2577 goto init_failed; 2578 } 2579 } 2580 } 2581 } 2582 2583 if (amdgpu_sriov_vf(adev)) 2584 amdgpu_virt_init_data_exchange(adev); 2585 2586 r = amdgpu_ib_pool_init(adev); 2587 if (r) { 2588 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2589 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2590 goto init_failed; 2591 } 2592 2593 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2594 if (r) 2595 goto init_failed; 2596 2597 r = amdgpu_device_ip_hw_init_phase1(adev); 2598 if (r) 2599 goto init_failed; 2600 2601 r = amdgpu_device_fw_loading(adev); 2602 if (r) 2603 goto init_failed; 2604 2605 r = amdgpu_device_ip_hw_init_phase2(adev); 2606 if (r) 2607 goto init_failed; 2608 2609 /* 2610 * retired pages will be loaded from eeprom and reserved here, 2611 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2612 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2613 * for I2C communication which only true at this point. 2614 * 2615 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2616 * failure from bad gpu situation and stop amdgpu init process 2617 * accordingly. For other failed cases, it will still release all 2618 * the resource and print error message, rather than returning one 2619 * negative value to upper level. 
	 *
	 * Note: theoretically, this should be called before all vram
	 * allocations to protect retired pages from being abused.
	 */
	r = amdgpu_ras_recovery_init(adev);
	if (r)
		goto init_failed;

	/*
	 * In case of XGMI, grab an extra reference on the reset domain for
	 * this device.
	 */
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (amdgpu_xgmi_add_device(adev) == 0) {
			if (!amdgpu_sriov_vf(adev)) {
				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

				if (WARN_ON(!hive)) {
					r = -ENOENT;
					goto init_failed;
				}

				if (!hive->reset_domain ||
				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
					r = -ENOENT;
					amdgpu_put_xgmi_hive(hive);
					goto init_failed;
				}

				/* Drop the early temporary reset domain we created for device */
				amdgpu_reset_put_reset_domain(adev->reset_domain);
				adev->reset_domain = hive->reset_domain;
				amdgpu_put_xgmi_hive(hive);
			}
		}
	}

	r = amdgpu_device_init_schedulers(adev);
	if (r)
		goto init_failed;

	if (adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);

	/* Don't init kfd if the whole hive needs to be reset during init */
	if (!adev->gmc.xgmi.pending_reset) {
		kgd2kfd_init_zone_device(adev);
		amdgpu_amdkfd_device_init(adev);
	}

	amdgpu_fru_get_product_info(adev);

init_failed:

	return r;
}

/**
 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
 *
 * @adev: amdgpu_device pointer
 *
 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
 * this function before a GPU reset. If the value is retained after a
 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
 */
static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
{
	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
}

/**
 * amdgpu_device_check_vram_lost - check if vram is valid
 *
 * @adev: amdgpu_device pointer
 *
 * Checks the reset magic value written to the gart pointer in VRAM.
 * The driver calls this after a GPU reset to see if the contents of
 * VRAM have been lost or not.
 * Returns true if vram is lost, false if not.
 */
static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
{
	if (memcmp(adev->gart.ptr, adev->reset_magic,
		   AMDGPU_RESET_MAGIC_NUM))
		return true;

	if (!amdgpu_in_reset(adev))
		return false;

	/*
	 * For all ASICs with baco/mode1 reset, the VRAM is
	 * always assumed to be lost.
	 */
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_BACO:
	case AMD_RESET_METHOD_MODE1:
		return true;
	default:
		return false;
	}
}

/**
 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 * @state: clockgating state (gate or ungate)
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * set_clockgating_state callbacks are run. The late init pass enables
 * clockgating for the hardware IPs, while the fini or suspend passes
 * disable it.
 * Returns 0 on success, negative error code on failure.
2733 */ 2734 2735 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2736 enum amd_clockgating_state state) 2737 { 2738 int i, j, r; 2739 2740 if (amdgpu_emu_mode == 1) 2741 return 0; 2742 2743 for (j = 0; j < adev->num_ip_blocks; j++) { 2744 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2745 if (!adev->ip_blocks[i].status.late_initialized) 2746 continue; 2747 /* skip CG for GFX, SDMA on S0ix */ 2748 if (adev->in_s0ix && 2749 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2750 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2751 continue; 2752 /* skip CG for VCE/UVD, it's handled specially */ 2753 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2754 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2755 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2756 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2757 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2758 /* enable clockgating to save power */ 2759 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2760 state); 2761 if (r) { 2762 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2763 adev->ip_blocks[i].version->funcs->name, r); 2764 return r; 2765 } 2766 } 2767 } 2768 2769 return 0; 2770 } 2771 2772 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2773 enum amd_powergating_state state) 2774 { 2775 int i, j, r; 2776 2777 if (amdgpu_emu_mode == 1) 2778 return 0; 2779 2780 for (j = 0; j < adev->num_ip_blocks; j++) { 2781 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2782 if (!adev->ip_blocks[i].status.late_initialized) 2783 continue; 2784 /* skip PG for GFX, SDMA on S0ix */ 2785 if (adev->in_s0ix && 2786 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2787 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2788 continue; 2789 /* skip CG for VCE/UVD, it's handled specially */ 2790 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2791 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2792 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2793 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2794 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2795 /* enable powergating to save power */ 2796 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2797 state); 2798 if (r) { 2799 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2800 adev->ip_blocks[i].version->funcs->name, r); 2801 return r; 2802 } 2803 } 2804 } 2805 return 0; 2806 } 2807 2808 static int amdgpu_device_enable_mgpu_fan_boost(void) 2809 { 2810 struct amdgpu_gpu_instance *gpu_ins; 2811 struct amdgpu_device *adev; 2812 int i, ret = 0; 2813 2814 mutex_lock(&mgpu_info.mutex); 2815 2816 /* 2817 * MGPU fan boost feature should be enabled 2818 * only when there are two or more dGPUs in 2819 * the system 2820 */ 2821 if (mgpu_info.num_dgpu < 2) 2822 goto out; 2823 2824 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2825 gpu_ins = &(mgpu_info.gpu_ins[i]); 2826 adev = gpu_ins->adev; 2827 if (!(adev->flags & AMD_IS_APU) && 2828 !gpu_ins->mgpu_fan_enabled) { 2829 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2830 if (ret) 2831 break; 2832 2833 gpu_ins->mgpu_fan_enabled = 1; 2834 } 2835 } 2836 2837 out: 2838 mutex_unlock(&mgpu_info.mutex); 2839 2840 return ret; 2841 } 2842 2843 /** 2844 * amdgpu_device_ip_late_init - run late init for hardware IPs 2845 * 2846 * @adev: 
amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of the IPs have been initialized, or anything that needs to
 * happen late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	r = amdgpu_ras_late_init(adev);
	if (r) {
		DRM_ERROR("amdgpu_ras_late_init failed %d", r);
		return r;
	}

	amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */
	if (amdgpu_passthrough(adev) &&
	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
	     adev->asic_type == CHIP_ALDEBARAN))
		amdgpu_dpm_handle_passthrough_sbr(adev, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset device p-state to low as this was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, it's not known in advance how many devices are in
		 * the hive, as they are counted one by one during device
		 * initialization.
		 *
		 * So, we wait for all XGMI interlinked devices to be initialized.
		 * This may bring some delays as those devices may come from
		 * different hives. But that should be OK.
2912 */ 2913 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2914 for (i = 0; i < mgpu_info.num_gpu; i++) { 2915 gpu_instance = &(mgpu_info.gpu_ins[i]); 2916 if (gpu_instance->adev->flags & AMD_IS_APU) 2917 continue; 2918 2919 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2920 AMDGPU_XGMI_PSTATE_MIN); 2921 if (r) { 2922 DRM_ERROR("pstate setting failed (%d).\n", r); 2923 break; 2924 } 2925 } 2926 } 2927 2928 mutex_unlock(&mgpu_info.mutex); 2929 } 2930 2931 return 0; 2932 } 2933 2934 /** 2935 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2936 * 2937 * @adev: amdgpu_device pointer 2938 * 2939 * For ASICs need to disable SMC first 2940 */ 2941 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2942 { 2943 int i, r; 2944 2945 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 2946 return; 2947 2948 for (i = 0; i < adev->num_ip_blocks; i++) { 2949 if (!adev->ip_blocks[i].status.hw) 2950 continue; 2951 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2952 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2953 /* XXX handle errors */ 2954 if (r) { 2955 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2956 adev->ip_blocks[i].version->funcs->name, r); 2957 } 2958 adev->ip_blocks[i].status.hw = false; 2959 break; 2960 } 2961 } 2962 } 2963 2964 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2965 { 2966 int i, r; 2967 2968 for (i = 0; i < adev->num_ip_blocks; i++) { 2969 if (!adev->ip_blocks[i].version->funcs->early_fini) 2970 continue; 2971 2972 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2973 if (r) { 2974 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2975 adev->ip_blocks[i].version->funcs->name, r); 2976 } 2977 } 2978 2979 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2980 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2981 2982 amdgpu_amdkfd_suspend(adev, false); 2983 2984 /* Workaroud for ASICs need to disable SMC first */ 2985 amdgpu_device_smu_fini_early(adev); 2986 2987 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2988 if (!adev->ip_blocks[i].status.hw) 2989 continue; 2990 2991 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2992 /* XXX handle errors */ 2993 if (r) { 2994 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2995 adev->ip_blocks[i].version->funcs->name, r); 2996 } 2997 2998 adev->ip_blocks[i].status.hw = false; 2999 } 3000 3001 if (amdgpu_sriov_vf(adev)) { 3002 if (amdgpu_virt_release_full_gpu(adev, false)) 3003 DRM_ERROR("failed to release exclusive mode on fini\n"); 3004 } 3005 3006 return 0; 3007 } 3008 3009 /** 3010 * amdgpu_device_ip_fini - run fini for hardware IPs 3011 * 3012 * @adev: amdgpu_device pointer 3013 * 3014 * Main teardown pass for hardware IPs. The list of all the hardware 3015 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3016 * are run. hw_fini tears down the hardware associated with each IP 3017 * and sw_fini tears down any software state associated with each IP. 3018 * Returns 0 on success, negative error code on failure. 
3019 */ 3020 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3021 { 3022 int i, r; 3023 3024 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3025 amdgpu_virt_release_ras_err_handler_data(adev); 3026 3027 if (adev->gmc.xgmi.num_physical_nodes > 1) 3028 amdgpu_xgmi_remove_device(adev); 3029 3030 amdgpu_amdkfd_device_fini_sw(adev); 3031 3032 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3033 if (!adev->ip_blocks[i].status.sw) 3034 continue; 3035 3036 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3037 amdgpu_ucode_free_bo(adev); 3038 amdgpu_free_static_csa(&adev->virt.csa_obj); 3039 amdgpu_device_wb_fini(adev); 3040 amdgpu_device_mem_scratch_fini(adev); 3041 amdgpu_ib_pool_fini(adev); 3042 } 3043 3044 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 3045 /* XXX handle errors */ 3046 if (r) { 3047 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3048 adev->ip_blocks[i].version->funcs->name, r); 3049 } 3050 adev->ip_blocks[i].status.sw = false; 3051 adev->ip_blocks[i].status.valid = false; 3052 } 3053 3054 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3055 if (!adev->ip_blocks[i].status.late_initialized) 3056 continue; 3057 if (adev->ip_blocks[i].version->funcs->late_fini) 3058 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 3059 adev->ip_blocks[i].status.late_initialized = false; 3060 } 3061 3062 amdgpu_ras_fini(adev); 3063 3064 return 0; 3065 } 3066 3067 /** 3068 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3069 * 3070 * @work: work_struct. 3071 */ 3072 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3073 { 3074 struct amdgpu_device *adev = 3075 container_of(work, struct amdgpu_device, delayed_init_work.work); 3076 int r; 3077 3078 r = amdgpu_ib_ring_tests(adev); 3079 if (r) 3080 DRM_ERROR("ib ring test failed (%d).\n", r); 3081 } 3082 3083 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3084 { 3085 struct amdgpu_device *adev = 3086 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3087 3088 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3089 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3090 3091 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3092 adev->gfx.gfx_off_state = true; 3093 } 3094 3095 /** 3096 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3097 * 3098 * @adev: amdgpu_device pointer 3099 * 3100 * Main suspend function for hardware IPs. The list of all the hardware 3101 * IPs that make up the asic is walked, clockgating is disabled and the 3102 * suspend callbacks are run. suspend puts the hardware and software state 3103 * in each IP into a state suitable for suspend. 3104 * Returns 0 on success, negative error code on failure. 3105 */ 3106 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3107 { 3108 int i, r; 3109 3110 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3111 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3112 3113 /* 3114 * Per PMFW team's suggestion, driver needs to handle gfxoff 3115 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3116 * scenario. Add the missing df cstate disablement here. 
3117 */ 3118 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3119 dev_warn(adev->dev, "Failed to disallow df cstate"); 3120 3121 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3122 if (!adev->ip_blocks[i].status.valid) 3123 continue; 3124 3125 /* displays are handled separately */ 3126 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3127 continue; 3128 3129 /* XXX handle errors */ 3130 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3131 /* XXX handle errors */ 3132 if (r) { 3133 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3134 adev->ip_blocks[i].version->funcs->name, r); 3135 return r; 3136 } 3137 3138 adev->ip_blocks[i].status.hw = false; 3139 } 3140 3141 return 0; 3142 } 3143 3144 /** 3145 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3146 * 3147 * @adev: amdgpu_device pointer 3148 * 3149 * Main suspend function for hardware IPs. The list of all the hardware 3150 * IPs that make up the asic is walked, clockgating is disabled and the 3151 * suspend callbacks are run. suspend puts the hardware and software state 3152 * in each IP into a state suitable for suspend. 3153 * Returns 0 on success, negative error code on failure. 3154 */ 3155 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3156 { 3157 int i, r; 3158 3159 if (adev->in_s0ix) 3160 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3161 3162 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3163 if (!adev->ip_blocks[i].status.valid) 3164 continue; 3165 /* displays are handled in phase1 */ 3166 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3167 continue; 3168 /* PSP lost connection when err_event_athub occurs */ 3169 if (amdgpu_ras_intr_triggered() && 3170 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3171 adev->ip_blocks[i].status.hw = false; 3172 continue; 3173 } 3174 3175 /* skip unnecessary suspend if we do not initialize them yet */ 3176 if (adev->gmc.xgmi.pending_reset && 3177 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3178 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3179 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3180 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3181 adev->ip_blocks[i].status.hw = false; 3182 continue; 3183 } 3184 3185 /* skip suspend of gfx/mes and psp for S0ix 3186 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3187 * like at runtime. PSP is also part of the always on hardware 3188 * so no need to suspend it. 3189 */ 3190 if (adev->in_s0ix && 3191 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3192 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3193 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3194 continue; 3195 3196 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3197 if (adev->in_s0ix && 3198 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3199 IP_VERSION(5, 0, 0)) && 3200 (adev->ip_blocks[i].version->type == 3201 AMD_IP_BLOCK_TYPE_SDMA)) 3202 continue; 3203 3204 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3205 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3206 * from this location and RLC Autoload automatically also gets loaded 3207 * from here based on PMFW -> PSP message during re-init sequence. 3208 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3209 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3210 */ 3211 if (amdgpu_in_reset(adev) && 3212 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3213 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3214 continue; 3215 3216 /* XXX handle errors */ 3217 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3218 /* XXX handle errors */ 3219 if (r) { 3220 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3221 adev->ip_blocks[i].version->funcs->name, r); 3222 } 3223 adev->ip_blocks[i].status.hw = false; 3224 /* handle putting the SMC in the appropriate state */ 3225 if (!amdgpu_sriov_vf(adev)) { 3226 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3227 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3228 if (r) { 3229 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3230 adev->mp1_state, r); 3231 return r; 3232 } 3233 } 3234 } 3235 } 3236 3237 return 0; 3238 } 3239 3240 /** 3241 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3242 * 3243 * @adev: amdgpu_device pointer 3244 * 3245 * Main suspend function for hardware IPs. The list of all the hardware 3246 * IPs that make up the asic is walked, clockgating is disabled and the 3247 * suspend callbacks are run. suspend puts the hardware and software state 3248 * in each IP into a state suitable for suspend. 3249 * Returns 0 on success, negative error code on failure. 3250 */ 3251 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3252 { 3253 int r; 3254 3255 if (amdgpu_sriov_vf(adev)) { 3256 amdgpu_virt_fini_data_exchange(adev); 3257 amdgpu_virt_request_full_gpu(adev, false); 3258 } 3259 3260 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3261 3262 r = amdgpu_device_ip_suspend_phase1(adev); 3263 if (r) 3264 return r; 3265 r = amdgpu_device_ip_suspend_phase2(adev); 3266 3267 if (amdgpu_sriov_vf(adev)) 3268 amdgpu_virt_release_full_gpu(adev, false); 3269 3270 return r; 3271 } 3272 3273 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3274 { 3275 int i, r; 3276 3277 static enum amd_ip_block_type ip_order[] = { 3278 AMD_IP_BLOCK_TYPE_COMMON, 3279 AMD_IP_BLOCK_TYPE_GMC, 3280 AMD_IP_BLOCK_TYPE_PSP, 3281 AMD_IP_BLOCK_TYPE_IH, 3282 }; 3283 3284 for (i = 0; i < adev->num_ip_blocks; i++) { 3285 int j; 3286 struct amdgpu_ip_block *block; 3287 3288 block = &adev->ip_blocks[i]; 3289 block->status.hw = false; 3290 3291 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3292 3293 if (block->version->type != ip_order[j] || 3294 !block->status.valid) 3295 continue; 3296 3297 r = block->version->funcs->hw_init(adev); 3298 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3299 if (r) 3300 return r; 3301 block->status.hw = true; 3302 } 3303 } 3304 3305 return 0; 3306 } 3307 3308 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3309 { 3310 int i, r; 3311 3312 static enum amd_ip_block_type ip_order[] = { 3313 AMD_IP_BLOCK_TYPE_SMC, 3314 AMD_IP_BLOCK_TYPE_DCE, 3315 AMD_IP_BLOCK_TYPE_GFX, 3316 AMD_IP_BLOCK_TYPE_SDMA, 3317 AMD_IP_BLOCK_TYPE_MES, 3318 AMD_IP_BLOCK_TYPE_UVD, 3319 AMD_IP_BLOCK_TYPE_VCE, 3320 AMD_IP_BLOCK_TYPE_VCN, 3321 AMD_IP_BLOCK_TYPE_JPEG 3322 }; 3323 3324 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3325 int j; 3326 struct amdgpu_ip_block *block; 3327 3328 for (j = 0; j < adev->num_ip_blocks; j++) { 3329 block = &adev->ip_blocks[j]; 3330 3331 if (block->version->type != ip_order[i] || 3332 !block->status.valid || 3333 block->status.hw) 3334 continue; 3335 3336 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3337 r = block->version->funcs->resume(adev); 3338 else 
3339 r = block->version->funcs->hw_init(adev); 3340 3341 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3342 if (r) 3343 return r; 3344 block->status.hw = true; 3345 } 3346 } 3347 3348 return 0; 3349 } 3350 3351 /** 3352 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3353 * 3354 * @adev: amdgpu_device pointer 3355 * 3356 * First resume function for hardware IPs. The list of all the hardware 3357 * IPs that make up the asic is walked and the resume callbacks are run for 3358 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3359 * after a suspend and updates the software state as necessary. This 3360 * function is also used for restoring the GPU after a GPU reset. 3361 * Returns 0 on success, negative error code on failure. 3362 */ 3363 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3364 { 3365 int i, r; 3366 3367 for (i = 0; i < adev->num_ip_blocks; i++) { 3368 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3369 continue; 3370 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3371 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3372 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3373 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3374 3375 r = adev->ip_blocks[i].version->funcs->resume(adev); 3376 if (r) { 3377 DRM_ERROR("resume of IP block <%s> failed %d\n", 3378 adev->ip_blocks[i].version->funcs->name, r); 3379 return r; 3380 } 3381 adev->ip_blocks[i].status.hw = true; 3382 } 3383 } 3384 3385 return 0; 3386 } 3387 3388 /** 3389 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3390 * 3391 * @adev: amdgpu_device pointer 3392 * 3393 * First resume function for hardware IPs. The list of all the hardware 3394 * IPs that make up the asic is walked and the resume callbacks are run for 3395 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3396 * functional state after a suspend and updates the software state as 3397 * necessary. This function is also used for restoring the GPU after a GPU 3398 * reset. 3399 * Returns 0 on success, negative error code on failure. 3400 */ 3401 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3402 { 3403 int i, r; 3404 3405 for (i = 0; i < adev->num_ip_blocks; i++) { 3406 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3407 continue; 3408 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3409 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3410 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3411 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3412 continue; 3413 r = adev->ip_blocks[i].version->funcs->resume(adev); 3414 if (r) { 3415 DRM_ERROR("resume of IP block <%s> failed %d\n", 3416 adev->ip_blocks[i].version->funcs->name, r); 3417 return r; 3418 } 3419 adev->ip_blocks[i].status.hw = true; 3420 } 3421 3422 return 0; 3423 } 3424 3425 /** 3426 * amdgpu_device_ip_resume - run resume for hardware IPs 3427 * 3428 * @adev: amdgpu_device pointer 3429 * 3430 * Main resume function for hardware IPs. The hardware IPs 3431 * are split into two resume functions because they are 3432 * also used in recovering from a GPU reset and some additional 3433 * steps need to be take between them. In this case (S3/S4) they are 3434 * run sequentially. 3435 * Returns 0 on success, negative error code on failure. 
3436 */ 3437 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3438 { 3439 int r; 3440 3441 r = amdgpu_device_ip_resume_phase1(adev); 3442 if (r) 3443 return r; 3444 3445 r = amdgpu_device_fw_loading(adev); 3446 if (r) 3447 return r; 3448 3449 r = amdgpu_device_ip_resume_phase2(adev); 3450 3451 if (adev->mman.buffer_funcs_ring->sched.ready) 3452 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3453 3454 return r; 3455 } 3456 3457 /** 3458 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3459 * 3460 * @adev: amdgpu_device pointer 3461 * 3462 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3463 */ 3464 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3465 { 3466 if (amdgpu_sriov_vf(adev)) { 3467 if (adev->is_atom_fw) { 3468 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3469 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3470 } else { 3471 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3472 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3473 } 3474 3475 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3476 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3477 } 3478 } 3479 3480 /** 3481 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3482 * 3483 * @asic_type: AMD asic type 3484 * 3485 * Check if there is DC (new modesetting infrastructre) support for an asic. 3486 * returns true if DC has support, false if not. 3487 */ 3488 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3489 { 3490 switch (asic_type) { 3491 #ifdef CONFIG_DRM_AMDGPU_SI 3492 case CHIP_HAINAN: 3493 #endif 3494 case CHIP_TOPAZ: 3495 /* chips with no display hardware */ 3496 return false; 3497 #if defined(CONFIG_DRM_AMD_DC) 3498 case CHIP_TAHITI: 3499 case CHIP_PITCAIRN: 3500 case CHIP_VERDE: 3501 case CHIP_OLAND: 3502 /* 3503 * We have systems in the wild with these ASICs that require 3504 * LVDS and VGA support which is not supported with DC. 3505 * 3506 * Fallback to the non-DC driver here by default so as not to 3507 * cause regressions. 3508 */ 3509 #if defined(CONFIG_DRM_AMD_DC_SI) 3510 return amdgpu_dc > 0; 3511 #else 3512 return false; 3513 #endif 3514 case CHIP_BONAIRE: 3515 case CHIP_KAVERI: 3516 case CHIP_KABINI: 3517 case CHIP_MULLINS: 3518 /* 3519 * We have systems in the wild with these ASICs that require 3520 * VGA support which is not supported with DC. 3521 * 3522 * Fallback to the non-DC driver here by default so as not to 3523 * cause regressions. 
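		 * (Users can still opt these ASICs into DC explicitly, e.g. via
		 * the amdgpu.dc=1 module parameter, since the check below only
		 * enables DC when the parameter is set to a positive value.)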
3524 */ 3525 return amdgpu_dc > 0; 3526 default: 3527 return amdgpu_dc != 0; 3528 #else 3529 default: 3530 if (amdgpu_dc > 0) 3531 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3532 return false; 3533 #endif 3534 } 3535 } 3536 3537 /** 3538 * amdgpu_device_has_dc_support - check if dc is supported 3539 * 3540 * @adev: amdgpu_device pointer 3541 * 3542 * Returns true for supported, false for not supported 3543 */ 3544 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3545 { 3546 if (adev->enable_virtual_display || 3547 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3548 return false; 3549 3550 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3551 } 3552 3553 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3554 { 3555 struct amdgpu_device *adev = 3556 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3557 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3558 3559 /* It's a bug to not have a hive within this function */ 3560 if (WARN_ON(!hive)) 3561 return; 3562 3563 /* 3564 * Use task barrier to synchronize all xgmi reset works across the 3565 * hive. task_barrier_enter and task_barrier_exit will block 3566 * until all the threads running the xgmi reset works reach 3567 * those points. task_barrier_full will do both blocks. 3568 */ 3569 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3570 3571 task_barrier_enter(&hive->tb); 3572 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3573 3574 if (adev->asic_reset_res) 3575 goto fail; 3576 3577 task_barrier_exit(&hive->tb); 3578 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3579 3580 if (adev->asic_reset_res) 3581 goto fail; 3582 3583 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3584 } else { 3585 3586 task_barrier_full(&hive->tb); 3587 adev->asic_reset_res = amdgpu_asic_reset(adev); 3588 } 3589 3590 fail: 3591 if (adev->asic_reset_res) 3592 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3593 adev->asic_reset_res, adev_to_drm(adev)->unique); 3594 amdgpu_put_xgmi_hive(hive); 3595 } 3596 3597 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3598 { 3599 char *input = amdgpu_lockup_timeout; 3600 char *timeout_setting = NULL; 3601 int index = 0; 3602 long timeout; 3603 int ret = 0; 3604 3605 /* 3606 * By default timeout for non compute jobs is 10000 3607 * and 60000 for compute jobs. 3608 * In SR-IOV or passthrough mode, timeout for compute 3609 * jobs are 60000 by default. 3610 */ 3611 adev->gfx_timeout = msecs_to_jiffies(10000); 3612 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3613 if (amdgpu_sriov_vf(adev)) 3614 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3615 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3616 else 3617 adev->compute_timeout = msecs_to_jiffies(60000); 3618 3619 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3620 while ((timeout_setting = strsep(&input, ",")) && 3621 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3622 ret = kstrtol(timeout_setting, 0, &timeout); 3623 if (ret) 3624 return ret; 3625 3626 if (timeout == 0) { 3627 index++; 3628 continue; 3629 } else if (timeout < 0) { 3630 timeout = MAX_SCHEDULE_TIMEOUT; 3631 dev_warn(adev->dev, "lockup timeout disabled"); 3632 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3633 } else { 3634 timeout = msecs_to_jiffies(timeout); 3635 } 3636 3637 switch (index++) { 3638 case 0: 3639 adev->gfx_timeout = timeout; 3640 break; 3641 case 1: 3642 adev->compute_timeout = timeout; 3643 break; 3644 case 2: 3645 adev->sdma_timeout = timeout; 3646 break; 3647 case 3: 3648 adev->video_timeout = timeout; 3649 break; 3650 default: 3651 break; 3652 } 3653 } 3654 /* 3655 * There is only one value specified and 3656 * it should apply to all non-compute jobs. 3657 */ 3658 if (index == 1) { 3659 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3660 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3661 adev->compute_timeout = adev->gfx_timeout; 3662 } 3663 } 3664 3665 return ret; 3666 } 3667 3668 /** 3669 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3670 * 3671 * @adev: amdgpu_device pointer 3672 * 3673 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3674 */ 3675 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3676 { 3677 struct iommu_domain *domain; 3678 3679 domain = iommu_get_domain_for_dev(adev->dev); 3680 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3681 adev->ram_is_direct_mapped = true; 3682 } 3683 3684 static const struct attribute *amdgpu_dev_attributes[] = { 3685 &dev_attr_pcie_replay_count.attr, 3686 NULL 3687 }; 3688 3689 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3690 { 3691 if (amdgpu_mcbp == 1) 3692 adev->gfx.mcbp = true; 3693 else if (amdgpu_mcbp == 0) 3694 adev->gfx.mcbp = false; 3695 else if ((amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 0, 0)) && 3696 (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(10, 0, 0)) && 3697 adev->gfx.num_gfx_rings) 3698 adev->gfx.mcbp = true; 3699 3700 if (amdgpu_sriov_vf(adev)) 3701 adev->gfx.mcbp = true; 3702 3703 if (adev->gfx.mcbp) 3704 DRM_INFO("MCBP is enabled\n"); 3705 } 3706 3707 /** 3708 * amdgpu_device_init - initialize the driver 3709 * 3710 * @adev: amdgpu_device pointer 3711 * @flags: driver flags 3712 * 3713 * Initializes the driver info and hw (all asics). 3714 * Returns 0 for success or an error on failure. 3715 * Called at driver startup. 
3716 */ 3717 int amdgpu_device_init(struct amdgpu_device *adev, 3718 uint32_t flags) 3719 { 3720 struct drm_device *ddev = adev_to_drm(adev); 3721 struct pci_dev *pdev = adev->pdev; 3722 int r, i; 3723 bool px = false; 3724 u32 max_MBps; 3725 int tmp; 3726 3727 adev->shutdown = false; 3728 adev->flags = flags; 3729 3730 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3731 adev->asic_type = amdgpu_force_asic_type; 3732 else 3733 adev->asic_type = flags & AMD_ASIC_MASK; 3734 3735 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3736 if (amdgpu_emu_mode == 1) 3737 adev->usec_timeout *= 10; 3738 adev->gmc.gart_size = 512 * 1024 * 1024; 3739 adev->accel_working = false; 3740 adev->num_rings = 0; 3741 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3742 adev->mman.buffer_funcs = NULL; 3743 adev->mman.buffer_funcs_ring = NULL; 3744 adev->vm_manager.vm_pte_funcs = NULL; 3745 adev->vm_manager.vm_pte_num_scheds = 0; 3746 adev->gmc.gmc_funcs = NULL; 3747 adev->harvest_ip_mask = 0x0; 3748 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3749 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3750 3751 adev->smc_rreg = &amdgpu_invalid_rreg; 3752 adev->smc_wreg = &amdgpu_invalid_wreg; 3753 adev->pcie_rreg = &amdgpu_invalid_rreg; 3754 adev->pcie_wreg = &amdgpu_invalid_wreg; 3755 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3756 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3757 adev->pciep_rreg = &amdgpu_invalid_rreg; 3758 adev->pciep_wreg = &amdgpu_invalid_wreg; 3759 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3760 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3761 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 3762 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 3763 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3764 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3765 adev->didt_rreg = &amdgpu_invalid_rreg; 3766 adev->didt_wreg = &amdgpu_invalid_wreg; 3767 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3768 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3769 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3770 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3771 3772 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3773 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3774 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3775 3776 /* mutex initialization are all done here so we 3777 * can recall function without having locking issues 3778 */ 3779 mutex_init(&adev->firmware.mutex); 3780 mutex_init(&adev->pm.mutex); 3781 mutex_init(&adev->gfx.gpu_clock_mutex); 3782 mutex_init(&adev->srbm_mutex); 3783 mutex_init(&adev->gfx.pipe_reserve_mutex); 3784 mutex_init(&adev->gfx.gfx_off_mutex); 3785 mutex_init(&adev->gfx.partition_mutex); 3786 mutex_init(&adev->grbm_idx_mutex); 3787 mutex_init(&adev->mn_lock); 3788 mutex_init(&adev->virt.vf_errors.lock); 3789 hash_init(adev->mn_hash); 3790 mutex_init(&adev->psp.mutex); 3791 mutex_init(&adev->notifier_lock); 3792 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3793 mutex_init(&adev->benchmark_mutex); 3794 3795 amdgpu_device_init_apu_flags(adev); 3796 3797 r = amdgpu_device_check_arguments(adev); 3798 if (r) 3799 return r; 3800 3801 spin_lock_init(&adev->mmio_idx_lock); 3802 spin_lock_init(&adev->smc_idx_lock); 3803 spin_lock_init(&adev->pcie_idx_lock); 3804 spin_lock_init(&adev->uvd_ctx_idx_lock); 3805 spin_lock_init(&adev->didt_idx_lock); 3806 spin_lock_init(&adev->gc_cac_idx_lock); 3807 spin_lock_init(&adev->se_cac_idx_lock); 
3808 spin_lock_init(&adev->audio_endpt_idx_lock); 3809 spin_lock_init(&adev->mm_stats.lock); 3810 3811 INIT_LIST_HEAD(&adev->shadow_list); 3812 mutex_init(&adev->shadow_list_lock); 3813 3814 INIT_LIST_HEAD(&adev->reset_list); 3815 3816 INIT_LIST_HEAD(&adev->ras_list); 3817 3818 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 3819 3820 INIT_DELAYED_WORK(&adev->delayed_init_work, 3821 amdgpu_device_delayed_init_work_handler); 3822 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3823 amdgpu_device_delay_enable_gfx_off); 3824 3825 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3826 3827 adev->gfx.gfx_off_req_count = 1; 3828 adev->gfx.gfx_off_residency = 0; 3829 adev->gfx.gfx_off_entrycount = 0; 3830 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3831 3832 atomic_set(&adev->throttling_logging_enabled, 1); 3833 /* 3834 * If throttling continues, logging will be performed every minute 3835 * to avoid log flooding. "-1" is subtracted since the thermal 3836 * throttling interrupt comes every second. Thus, the total logging 3837 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3838 * for the throttling interrupt) = 60 seconds. 3839 */ 3840 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3841 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3842 3843 /* Registers mapping */ 3844 /* TODO: block userspace mapping of io registers */ 3845 if (adev->asic_type >= CHIP_BONAIRE) { 3846 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3847 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3848 } else { 3849 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3850 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3851 } 3852 3853 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3854 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3855 3856 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3857 if (!adev->rmmio) 3858 return -ENOMEM; 3859 3860 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3861 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 3862 3863 /* 3864 * The reset domain needs to be present early, before the XGMI hive is discovered 3865 * (if any) and initialized, so that the reset semaphore and in_gpu_reset flag can be used 3866 * early on during init and before calling RREG32.
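 *
 * Illustrative sketch only (the exact accessor logic lives in the register
 * access helpers and differs per code path): code that must not race an
 * in-flight reset can take the domain's rw-semaphore for read, since the
 * reset path holds it for write:
 *
 *   if (down_read_trylock(&adev->reset_domain->sem)) {
 *           val = RREG32(reg);
 *           up_read(&adev->reset_domain->sem);
 *   }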
3867 */ 3868 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3869 if (!adev->reset_domain) 3870 return -ENOMEM; 3871 3872 /* detect hw virtualization here */ 3873 amdgpu_detect_virtualization(adev); 3874 3875 amdgpu_device_get_pcie_info(adev); 3876 3877 r = amdgpu_device_get_job_timeout_settings(adev); 3878 if (r) { 3879 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3880 return r; 3881 } 3882 3883 /* early init functions */ 3884 r = amdgpu_device_ip_early_init(adev); 3885 if (r) 3886 return r; 3887 3888 amdgpu_device_set_mcbp(adev); 3889 3890 /* Get rid of things like offb */ 3891 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3892 if (r) 3893 return r; 3894 3895 /* Enable TMZ based on IP_VERSION */ 3896 amdgpu_gmc_tmz_set(adev); 3897 3898 amdgpu_gmc_noretry_set(adev); 3899 /* Need to get xgmi info early to decide the reset behavior */ 3900 if (adev->gmc.xgmi.supported) { 3901 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3902 if (r) 3903 return r; 3904 } 3905 3906 /* enable PCIE atomic ops */ 3907 if (amdgpu_sriov_vf(adev)) { 3908 if (adev->virt.fw_reserve.p_pf2vf) 3909 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3910 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3911 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3912 /* APUs with GFX9 and newer don't rely on PCIe atomics; an 3913 * internal path natively supports atomics, so set have_atomics_support to true. 3914 */ 3915 } else if ((adev->flags & AMD_IS_APU) && 3916 (amdgpu_ip_version(adev, GC_HWIP, 0) > 3917 IP_VERSION(9, 0, 0))) { 3918 adev->have_atomics_support = true; 3919 } else { 3920 adev->have_atomics_support = 3921 !pci_enable_atomic_ops_to_root(adev->pdev, 3922 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3923 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3924 } 3925 3926 if (!adev->have_atomics_support) 3927 dev_info(adev->dev, "PCIE atomic ops are not supported\n"); 3928 3929 /* doorbell bar mapping and doorbell index init */ 3930 amdgpu_doorbell_init(adev); 3931 3932 if (amdgpu_emu_mode == 1) { 3933 /* post the asic in emulation mode */ 3934 emu_soc_asic_init(adev); 3935 goto fence_driver_init; 3936 } 3937 3938 amdgpu_reset_init(adev); 3939 3940 /* detect if we are running with an SR-IOV vbios */ 3941 if (adev->bios) 3942 amdgpu_device_detect_sriov_bios(adev); 3943 3944 /* check if we need to reset the asic, 3945 * e.g., the driver was not cleanly unloaded previously, etc.
3946 */ 3947 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3948 if (adev->gmc.xgmi.num_physical_nodes) { 3949 dev_info(adev->dev, "Pending hive reset.\n"); 3950 adev->gmc.xgmi.pending_reset = true; 3951 /* Only need to init necessary block for SMU to handle the reset */ 3952 for (i = 0; i < adev->num_ip_blocks; i++) { 3953 if (!adev->ip_blocks[i].status.valid) 3954 continue; 3955 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3956 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3957 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3958 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3959 DRM_DEBUG("IP %s disabled for hw_init.\n", 3960 adev->ip_blocks[i].version->funcs->name); 3961 adev->ip_blocks[i].status.hw = true; 3962 } 3963 } 3964 } else { 3965 switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { 3966 case IP_VERSION(13, 0, 0): 3967 case IP_VERSION(13, 0, 7): 3968 case IP_VERSION(13, 0, 10): 3969 r = psp_gpu_reset(adev); 3970 break; 3971 default: 3972 tmp = amdgpu_reset_method; 3973 /* It should do a default reset when loading or reloading the driver, 3974 * regardless of the module parameter reset_method. 3975 */ 3976 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3977 r = amdgpu_asic_reset(adev); 3978 amdgpu_reset_method = tmp; 3979 break; 3980 } 3981 3982 if (r) { 3983 dev_err(adev->dev, "asic reset on init failed\n"); 3984 goto failed; 3985 } 3986 } 3987 } 3988 3989 /* Post card if necessary */ 3990 if (amdgpu_device_need_post(adev)) { 3991 if (!adev->bios) { 3992 dev_err(adev->dev, "no vBIOS found\n"); 3993 r = -EINVAL; 3994 goto failed; 3995 } 3996 DRM_INFO("GPU posting now...\n"); 3997 r = amdgpu_device_asic_init(adev); 3998 if (r) { 3999 dev_err(adev->dev, "gpu post error!\n"); 4000 goto failed; 4001 } 4002 } 4003 4004 if (adev->bios) { 4005 if (adev->is_atom_fw) { 4006 /* Initialize clocks */ 4007 r = amdgpu_atomfirmware_get_clock_info(adev); 4008 if (r) { 4009 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4010 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4011 goto failed; 4012 } 4013 } else { 4014 /* Initialize clocks */ 4015 r = amdgpu_atombios_get_clock_info(adev); 4016 if (r) { 4017 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4018 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4019 goto failed; 4020 } 4021 /* init i2c buses */ 4022 if (!amdgpu_device_has_dc_support(adev)) 4023 amdgpu_atombios_i2c_init(adev); 4024 } 4025 } 4026 4027 fence_driver_init: 4028 /* Fence driver */ 4029 r = amdgpu_fence_driver_sw_init(adev); 4030 if (r) { 4031 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4032 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4033 goto failed; 4034 } 4035 4036 /* init the mode config */ 4037 drm_mode_config_init(adev_to_drm(adev)); 4038 4039 r = amdgpu_device_ip_init(adev); 4040 if (r) { 4041 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4042 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4043 goto release_ras_con; 4044 } 4045 4046 amdgpu_fence_driver_hw_init(adev); 4047 4048 dev_info(adev->dev, 4049 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4050 adev->gfx.config.max_shader_engines, 4051 adev->gfx.config.max_sh_per_se, 4052 adev->gfx.config.max_cu_per_sh, 4053 adev->gfx.cu_info.number); 4054 4055 adev->accel_working = true; 4056 4057 amdgpu_vm_check_compute_bug(adev); 4058 4059 /* Initialize the buffer migration 
limit. */ 4060 if (amdgpu_moverate >= 0) 4061 max_MBps = amdgpu_moverate; 4062 else 4063 max_MBps = 8; /* Allow 8 MB/s. */ 4064 /* Get a log2 for easy divisions. */ 4065 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4066 4067 /* 4068 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4069 * Otherwise the mgpu fan boost feature will be skipped due to the 4070 * gpu instance is counted less. 4071 */ 4072 amdgpu_register_gpu_instance(adev); 4073 4074 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4075 * explicit gating rather than handling it automatically. 4076 */ 4077 if (!adev->gmc.xgmi.pending_reset) { 4078 r = amdgpu_device_ip_late_init(adev); 4079 if (r) { 4080 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4081 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4082 goto release_ras_con; 4083 } 4084 /* must succeed. */ 4085 amdgpu_ras_resume(adev); 4086 queue_delayed_work(system_wq, &adev->delayed_init_work, 4087 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4088 } 4089 4090 if (amdgpu_sriov_vf(adev)) { 4091 amdgpu_virt_release_full_gpu(adev, true); 4092 flush_delayed_work(&adev->delayed_init_work); 4093 } 4094 4095 /* 4096 * Place those sysfs registering after `late_init`. As some of those 4097 * operations performed in `late_init` might affect the sysfs 4098 * interfaces creating. 4099 */ 4100 r = amdgpu_atombios_sysfs_init(adev); 4101 if (r) 4102 drm_err(&adev->ddev, 4103 "registering atombios sysfs failed (%d).\n", r); 4104 4105 r = amdgpu_pm_sysfs_init(adev); 4106 if (r) 4107 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4108 4109 r = amdgpu_ucode_sysfs_init(adev); 4110 if (r) { 4111 adev->ucode_sysfs_en = false; 4112 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4113 } else 4114 adev->ucode_sysfs_en = true; 4115 4116 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4117 if (r) 4118 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4119 4120 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4121 if (r) 4122 dev_err(adev->dev, 4123 "Could not create amdgpu board attributes\n"); 4124 4125 amdgpu_fru_sysfs_init(adev); 4126 4127 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4128 r = amdgpu_pmu_init(adev); 4129 if (r) 4130 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4131 4132 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4133 if (amdgpu_device_cache_pci_state(adev->pdev)) 4134 pci_restore_state(pdev); 4135 4136 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4137 /* this will fail for cards that aren't VGA class devices, just 4138 * ignore it 4139 */ 4140 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4141 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4142 4143 px = amdgpu_device_supports_px(ddev); 4144 4145 if (px || (!dev_is_removable(&adev->pdev->dev) && 4146 apple_gmux_detect(NULL, NULL))) 4147 vga_switcheroo_register_client(adev->pdev, 4148 &amdgpu_switcheroo_ops, px); 4149 4150 if (px) 4151 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4152 4153 if (adev->gmc.xgmi.pending_reset) 4154 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4155 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4156 4157 amdgpu_device_check_iommu_direct_map(adev); 4158 4159 return 0; 4160 4161 release_ras_con: 4162 if (amdgpu_sriov_vf(adev)) 4163 amdgpu_virt_release_full_gpu(adev, true); 4164 4165 /* failed in exclusive mode due to timeout */ 4166 if (amdgpu_sriov_vf(adev) && 4167 
!amdgpu_sriov_runtime(adev) && 4168 amdgpu_virt_mmio_blocked(adev) && 4169 !amdgpu_virt_wait_reset(adev)) { 4170 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4171 /* Don't send request since VF is inactive. */ 4172 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4173 adev->virt.ops = NULL; 4174 r = -EAGAIN; 4175 } 4176 amdgpu_release_ras_context(adev); 4177 4178 failed: 4179 amdgpu_vf_error_trans_all(adev); 4180 4181 return r; 4182 } 4183 4184 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4185 { 4186 4187 /* Clear all CPU mappings pointing to this device */ 4188 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4189 4190 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4191 amdgpu_doorbell_fini(adev); 4192 4193 iounmap(adev->rmmio); 4194 adev->rmmio = NULL; 4195 if (adev->mman.aper_base_kaddr) 4196 iounmap(adev->mman.aper_base_kaddr); 4197 adev->mman.aper_base_kaddr = NULL; 4198 4199 /* Memory manager related */ 4200 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4201 arch_phys_wc_del(adev->gmc.vram_mtrr); 4202 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4203 } 4204 } 4205 4206 /** 4207 * amdgpu_device_fini_hw - tear down the driver 4208 * 4209 * @adev: amdgpu_device pointer 4210 * 4211 * Tear down the driver info (all asics). 4212 * Called at driver shutdown. 4213 */ 4214 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4215 { 4216 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4217 flush_delayed_work(&adev->delayed_init_work); 4218 adev->shutdown = true; 4219 4220 /* make sure IB test finished before entering exclusive mode 4221 * to avoid preemption on IB test 4222 */ 4223 if (amdgpu_sriov_vf(adev)) { 4224 amdgpu_virt_request_full_gpu(adev, false); 4225 amdgpu_virt_fini_data_exchange(adev); 4226 } 4227 4228 /* disable all interrupts */ 4229 amdgpu_irq_disable_all(adev); 4230 if (adev->mode_info.mode_config_initialized) { 4231 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4232 drm_helper_force_disable_all(adev_to_drm(adev)); 4233 else 4234 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4235 } 4236 amdgpu_fence_driver_hw_fini(adev); 4237 4238 if (adev->mman.initialized) 4239 drain_workqueue(adev->mman.bdev.wq); 4240 4241 if (adev->pm.sysfs_initialized) 4242 amdgpu_pm_sysfs_fini(adev); 4243 if (adev->ucode_sysfs_en) 4244 amdgpu_ucode_sysfs_fini(adev); 4245 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4246 amdgpu_fru_sysfs_fini(adev); 4247 4248 /* disable ras feature must before hw fini */ 4249 amdgpu_ras_pre_fini(adev); 4250 4251 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4252 4253 amdgpu_device_ip_fini_early(adev); 4254 4255 amdgpu_irq_fini_hw(adev); 4256 4257 if (adev->mman.initialized) 4258 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4259 4260 amdgpu_gart_dummy_page_fini(adev); 4261 4262 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4263 amdgpu_device_unmap_mmio(adev); 4264 4265 } 4266 4267 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4268 { 4269 int idx; 4270 bool px; 4271 4272 amdgpu_fence_driver_sw_fini(adev); 4273 amdgpu_device_ip_fini(adev); 4274 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4275 adev->accel_working = false; 4276 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4277 4278 amdgpu_reset_fini(adev); 4279 4280 /* free i2c buses */ 4281 if (!amdgpu_device_has_dc_support(adev)) 4282 amdgpu_i2c_fini(adev); 4283 4284 if (amdgpu_emu_mode != 1) 4285 amdgpu_atombios_fini(adev); 4286 4287 kfree(adev->bios); 4288 
adev->bios = NULL; 4289 4290 kfree(adev->fru_info); 4291 adev->fru_info = NULL; 4292 4293 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4294 4295 if (px || (!dev_is_removable(&adev->pdev->dev) && 4296 apple_gmux_detect(NULL, NULL))) 4297 vga_switcheroo_unregister_client(adev->pdev); 4298 4299 if (px) 4300 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4301 4302 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4303 vga_client_unregister(adev->pdev); 4304 4305 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4306 4307 iounmap(adev->rmmio); 4308 adev->rmmio = NULL; 4309 amdgpu_doorbell_fini(adev); 4310 drm_dev_exit(idx); 4311 } 4312 4313 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4314 amdgpu_pmu_fini(adev); 4315 if (adev->mman.discovery_bin) 4316 amdgpu_discovery_fini(adev); 4317 4318 amdgpu_reset_put_reset_domain(adev->reset_domain); 4319 adev->reset_domain = NULL; 4320 4321 kfree(adev->pci_state); 4322 4323 } 4324 4325 /** 4326 * amdgpu_device_evict_resources - evict device resources 4327 * @adev: amdgpu device object 4328 * 4329 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4330 * of the vram memory type. Mainly used for evicting device resources 4331 * at suspend time. 4332 * 4333 */ 4334 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4335 { 4336 int ret; 4337 4338 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4339 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4340 return 0; 4341 4342 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4343 if (ret) 4344 DRM_WARN("evicting device resources failed\n"); 4345 return ret; 4346 } 4347 4348 /* 4349 * Suspend & resume. 4350 */ 4351 /** 4352 * amdgpu_device_prepare - prepare for device suspend 4353 * 4354 * @dev: drm dev pointer 4355 * 4356 * Prepare to put the hw in the suspend state (all asics). 4357 * Returns 0 for success or an error on failure. 4358 * Called at driver suspend. 4359 */ 4360 int amdgpu_device_prepare(struct drm_device *dev) 4361 { 4362 struct amdgpu_device *adev = drm_to_adev(dev); 4363 int i, r; 4364 4365 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4366 return 0; 4367 4368 /* Evict the majority of BOs before starting suspend sequence */ 4369 r = amdgpu_device_evict_resources(adev); 4370 if (r) 4371 return r; 4372 4373 for (i = 0; i < adev->num_ip_blocks; i++) { 4374 if (!adev->ip_blocks[i].status.valid) 4375 continue; 4376 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4377 continue; 4378 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev); 4379 if (r) 4380 return r; 4381 } 4382 4383 return 0; 4384 } 4385 4386 /** 4387 * amdgpu_device_suspend - initiate device suspend 4388 * 4389 * @dev: drm dev pointer 4390 * @fbcon : notify the fbdev of suspend 4391 * 4392 * Puts the hw in the suspend state (all asics). 4393 * Returns 0 for success or an error on failure. 4394 * Called at driver suspend. 
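 *
 * Illustrative sketch of how a PM callback is expected to drive this entry
 * point (the real callbacks live in the driver's PM ops; the function name
 * below is hypothetical and error handling is condensed):
 *
 *   static int example_pmops_suspend(struct device *dev)
 *   {
 *           struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *           return amdgpu_device_suspend(drm_dev, true);
 *   }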
4395 */ 4396 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4397 { 4398 struct amdgpu_device *adev = drm_to_adev(dev); 4399 int r = 0; 4400 4401 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4402 return 0; 4403 4404 adev->in_suspend = true; 4405 4406 if (amdgpu_sriov_vf(adev)) { 4407 amdgpu_virt_fini_data_exchange(adev); 4408 r = amdgpu_virt_request_full_gpu(adev, false); 4409 if (r) 4410 return r; 4411 } 4412 4413 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4414 DRM_WARN("smart shift update failed\n"); 4415 4416 if (fbcon) 4417 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4418 4419 cancel_delayed_work_sync(&adev->delayed_init_work); 4420 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4421 4422 amdgpu_ras_suspend(adev); 4423 4424 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4425 4426 amdgpu_device_ip_suspend_phase1(adev); 4427 4428 if (!adev->in_s0ix) 4429 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4430 4431 r = amdgpu_device_evict_resources(adev); 4432 if (r) 4433 return r; 4434 4435 amdgpu_fence_driver_hw_fini(adev); 4436 4437 amdgpu_device_ip_suspend_phase2(adev); 4438 4439 if (amdgpu_sriov_vf(adev)) 4440 amdgpu_virt_release_full_gpu(adev, false); 4441 4442 return 0; 4443 } 4444 4445 /** 4446 * amdgpu_device_resume - initiate device resume 4447 * 4448 * @dev: drm dev pointer 4449 * @fbcon : notify the fbdev of resume 4450 * 4451 * Bring the hw back to operating state (all asics). 4452 * Returns 0 for success or an error on failure. 4453 * Called at driver resume. 4454 */ 4455 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4456 { 4457 struct amdgpu_device *adev = drm_to_adev(dev); 4458 int r = 0; 4459 4460 if (amdgpu_sriov_vf(adev)) { 4461 r = amdgpu_virt_request_full_gpu(adev, true); 4462 if (r) 4463 return r; 4464 } 4465 4466 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4467 return 0; 4468 4469 if (adev->in_s0ix) 4470 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4471 4472 /* post card */ 4473 if (amdgpu_device_need_post(adev)) { 4474 r = amdgpu_device_asic_init(adev); 4475 if (r) 4476 dev_err(adev->dev, "amdgpu asic init failed\n"); 4477 } 4478 4479 r = amdgpu_device_ip_resume(adev); 4480 4481 if (r) { 4482 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4483 goto exit; 4484 } 4485 amdgpu_fence_driver_hw_init(adev); 4486 4487 r = amdgpu_device_ip_late_init(adev); 4488 if (r) 4489 goto exit; 4490 4491 queue_delayed_work(system_wq, &adev->delayed_init_work, 4492 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4493 4494 if (!adev->in_s0ix) { 4495 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4496 if (r) 4497 goto exit; 4498 } 4499 4500 exit: 4501 if (amdgpu_sriov_vf(adev)) { 4502 amdgpu_virt_init_data_exchange(adev); 4503 amdgpu_virt_release_full_gpu(adev, true); 4504 } 4505 4506 if (r) 4507 return r; 4508 4509 /* Make sure IB tests flushed */ 4510 flush_delayed_work(&adev->delayed_init_work); 4511 4512 if (fbcon) 4513 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4514 4515 amdgpu_ras_resume(adev); 4516 4517 if (adev->mode_info.num_crtc) { 4518 /* 4519 * Most of the connector probing functions try to acquire runtime pm 4520 * refs to ensure that the GPU is powered on when connector polling is 4521 * performed. Since we're calling this from a runtime PM callback, 4522 * trying to acquire rpm refs will cause us to deadlock. 
4523 * 4524 * Since we're guaranteed to be holding the rpm lock, it's safe to 4525 * temporarily disable the rpm helpers so this doesn't deadlock us. 4526 */ 4527 #ifdef CONFIG_PM 4528 dev->dev->power.disable_depth++; 4529 #endif 4530 if (!adev->dc_enabled) 4531 drm_helper_hpd_irq_event(dev); 4532 else 4533 drm_kms_helper_hotplug_event(dev); 4534 #ifdef CONFIG_PM 4535 dev->dev->power.disable_depth--; 4536 #endif 4537 } 4538 adev->in_suspend = false; 4539 4540 if (adev->enable_mes) 4541 amdgpu_mes_self_test(adev); 4542 4543 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4544 DRM_WARN("smart shift update failed\n"); 4545 4546 return 0; 4547 } 4548 4549 /** 4550 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4551 * 4552 * @adev: amdgpu_device pointer 4553 * 4554 * The list of all the hardware IPs that make up the asic is walked and 4555 * the check_soft_reset callbacks are run. check_soft_reset determines 4556 * if the asic is still hung or not. 4557 * Returns true if any of the IPs are still in a hung state, false if not. 4558 */ 4559 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4560 { 4561 int i; 4562 bool asic_hang = false; 4563 4564 if (amdgpu_sriov_vf(adev)) 4565 return true; 4566 4567 if (amdgpu_asic_need_full_reset(adev)) 4568 return true; 4569 4570 for (i = 0; i < adev->num_ip_blocks; i++) { 4571 if (!adev->ip_blocks[i].status.valid) 4572 continue; 4573 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4574 adev->ip_blocks[i].status.hang = 4575 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4576 if (adev->ip_blocks[i].status.hang) { 4577 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4578 asic_hang = true; 4579 } 4580 } 4581 return asic_hang; 4582 } 4583 4584 /** 4585 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4586 * 4587 * @adev: amdgpu_device pointer 4588 * 4589 * The list of all the hardware IPs that make up the asic is walked and the 4590 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4591 * handles any IP specific hardware or software state changes that are 4592 * necessary for a soft reset to succeed. 4593 * Returns 0 on success, negative error code on failure. 4594 */ 4595 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4596 { 4597 int i, r = 0; 4598 4599 for (i = 0; i < adev->num_ip_blocks; i++) { 4600 if (!adev->ip_blocks[i].status.valid) 4601 continue; 4602 if (adev->ip_blocks[i].status.hang && 4603 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4604 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4605 if (r) 4606 return r; 4607 } 4608 } 4609 4610 return 0; 4611 } 4612 4613 /** 4614 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4615 * 4616 * @adev: amdgpu_device pointer 4617 * 4618 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4619 * reset is necessary to recover. 4620 * Returns true if a full asic reset is required, false if not. 
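 *
 * For illustration, the recovery path combines this check with the
 * soft-reset helpers roughly as follows (simplified; see
 * amdgpu_device_pre_asic_reset() below for the real sequence):
 *
 *   if (!need_full_reset)
 *           need_full_reset = amdgpu_device_ip_need_full_reset(adev);
 *   if (!need_full_reset && amdgpu_device_ip_check_soft_reset(adev)) {
 *           amdgpu_device_ip_pre_soft_reset(adev);
 *           r = amdgpu_device_ip_soft_reset(adev);
 *           amdgpu_device_ip_post_soft_reset(adev);
 *           if (r || amdgpu_device_ip_check_soft_reset(adev))
 *                   need_full_reset = true;  // soft reset did not help
 *   }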
4621 */ 4622 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4623 { 4624 int i; 4625 4626 if (amdgpu_asic_need_full_reset(adev)) 4627 return true; 4628 4629 for (i = 0; i < adev->num_ip_blocks; i++) { 4630 if (!adev->ip_blocks[i].status.valid) 4631 continue; 4632 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4633 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4634 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4635 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4636 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4637 if (adev->ip_blocks[i].status.hang) { 4638 dev_info(adev->dev, "Some block need full reset!\n"); 4639 return true; 4640 } 4641 } 4642 } 4643 return false; 4644 } 4645 4646 /** 4647 * amdgpu_device_ip_soft_reset - do a soft reset 4648 * 4649 * @adev: amdgpu_device pointer 4650 * 4651 * The list of all the hardware IPs that make up the asic is walked and the 4652 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4653 * IP specific hardware or software state changes that are necessary to soft 4654 * reset the IP. 4655 * Returns 0 on success, negative error code on failure. 4656 */ 4657 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4658 { 4659 int i, r = 0; 4660 4661 for (i = 0; i < adev->num_ip_blocks; i++) { 4662 if (!adev->ip_blocks[i].status.valid) 4663 continue; 4664 if (adev->ip_blocks[i].status.hang && 4665 adev->ip_blocks[i].version->funcs->soft_reset) { 4666 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4667 if (r) 4668 return r; 4669 } 4670 } 4671 4672 return 0; 4673 } 4674 4675 /** 4676 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4677 * 4678 * @adev: amdgpu_device pointer 4679 * 4680 * The list of all the hardware IPs that make up the asic is walked and the 4681 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4682 * handles any IP specific hardware or software state changes that are 4683 * necessary after the IP has been soft reset. 4684 * Returns 0 on success, negative error code on failure. 4685 */ 4686 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4687 { 4688 int i, r = 0; 4689 4690 for (i = 0; i < adev->num_ip_blocks; i++) { 4691 if (!adev->ip_blocks[i].status.valid) 4692 continue; 4693 if (adev->ip_blocks[i].status.hang && 4694 adev->ip_blocks[i].version->funcs->post_soft_reset) 4695 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4696 if (r) 4697 return r; 4698 } 4699 4700 return 0; 4701 } 4702 4703 /** 4704 * amdgpu_device_recover_vram - Recover some VRAM contents 4705 * 4706 * @adev: amdgpu_device pointer 4707 * 4708 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4709 * restore things like GPUVM page tables after a GPU reset where 4710 * the contents of VRAM might be lost. 4711 * 4712 * Returns: 4713 * 0 on success, negative error code on failure. 
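 *
 * The restore is pipelined: each amdgpu_bo_restore_shadow() call returns a
 * fence for its copy, and the previous fence is waited on while the next
 * copy is already queued. A reduced sketch of that pattern (error handling
 * and the eviction checks omitted):
 *
 *   list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
 *           amdgpu_bo_restore_shadow(vmbo->shadow, &next);
 *           if (fence)
 *                   tmo = dma_fence_wait_timeout(fence, false, tmo);
 *           dma_fence_put(fence);
 *           fence = next;          // waited on in the next iteration
 *   }
 *   if (fence)
 *           dma_fence_wait_timeout(fence, false, tmo);
 *   dma_fence_put(fence);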
4714 */ 4715 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4716 { 4717 struct dma_fence *fence = NULL, *next = NULL; 4718 struct amdgpu_bo *shadow; 4719 struct amdgpu_bo_vm *vmbo; 4720 long r = 1, tmo; 4721 4722 if (amdgpu_sriov_runtime(adev)) 4723 tmo = msecs_to_jiffies(8000); 4724 else 4725 tmo = msecs_to_jiffies(100); 4726 4727 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4728 mutex_lock(&adev->shadow_list_lock); 4729 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4730 /* If vm is compute context or adev is APU, shadow will be NULL */ 4731 if (!vmbo->shadow) 4732 continue; 4733 shadow = vmbo->shadow; 4734 4735 /* No need to recover an evicted BO */ 4736 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4737 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4738 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4739 continue; 4740 4741 r = amdgpu_bo_restore_shadow(shadow, &next); 4742 if (r) 4743 break; 4744 4745 if (fence) { 4746 tmo = dma_fence_wait_timeout(fence, false, tmo); 4747 dma_fence_put(fence); 4748 fence = next; 4749 if (tmo == 0) { 4750 r = -ETIMEDOUT; 4751 break; 4752 } else if (tmo < 0) { 4753 r = tmo; 4754 break; 4755 } 4756 } else { 4757 fence = next; 4758 } 4759 } 4760 mutex_unlock(&adev->shadow_list_lock); 4761 4762 if (fence) 4763 tmo = dma_fence_wait_timeout(fence, false, tmo); 4764 dma_fence_put(fence); 4765 4766 if (r < 0 || tmo <= 0) { 4767 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4768 return -EIO; 4769 } 4770 4771 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4772 return 0; 4773 } 4774 4775 4776 /** 4777 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4778 * 4779 * @adev: amdgpu_device pointer 4780 * @from_hypervisor: request from hypervisor 4781 * 4782 * do VF FLR and reinitialize Asic 4783 * return 0 means succeeded otherwise failed 4784 */ 4785 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4786 bool from_hypervisor) 4787 { 4788 int r; 4789 struct amdgpu_hive_info *hive = NULL; 4790 int retry_limit = 0; 4791 4792 retry: 4793 amdgpu_amdkfd_pre_reset(adev); 4794 4795 if (from_hypervisor) 4796 r = amdgpu_virt_request_full_gpu(adev, true); 4797 else 4798 r = amdgpu_virt_reset_gpu(adev); 4799 if (r) 4800 return r; 4801 amdgpu_irq_gpu_reset_resume_helper(adev); 4802 4803 /* some sw clean up VF needs to do before recover */ 4804 amdgpu_virt_post_reset(adev); 4805 4806 /* Resume IP prior to SMC */ 4807 r = amdgpu_device_ip_reinit_early_sriov(adev); 4808 if (r) 4809 goto error; 4810 4811 amdgpu_virt_init_data_exchange(adev); 4812 4813 r = amdgpu_device_fw_loading(adev); 4814 if (r) 4815 return r; 4816 4817 /* now we are okay to resume SMC/CP/SDMA */ 4818 r = amdgpu_device_ip_reinit_late_sriov(adev); 4819 if (r) 4820 goto error; 4821 4822 hive = amdgpu_get_xgmi_hive(adev); 4823 /* Update PSP FW topology after reset */ 4824 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4825 r = amdgpu_xgmi_update_topology(hive, adev); 4826 4827 if (hive) 4828 amdgpu_put_xgmi_hive(hive); 4829 4830 if (!r) { 4831 r = amdgpu_ib_ring_tests(adev); 4832 4833 amdgpu_amdkfd_post_reset(adev); 4834 } 4835 4836 error: 4837 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4838 amdgpu_inc_vram_lost(adev); 4839 r = amdgpu_device_recover_vram(adev); 4840 } 4841 amdgpu_virt_release_full_gpu(adev, true); 4842 4843 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4844 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4845 retry_limit++; 4846 goto 
retry; 4847 } else 4848 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4849 } 4850 4851 return r; 4852 } 4853 4854 /** 4855 * amdgpu_device_has_job_running - check if there is any job in mirror list 4856 * 4857 * @adev: amdgpu_device pointer 4858 * 4859 * check if there is any job in mirror list 4860 */ 4861 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4862 { 4863 int i; 4864 struct drm_sched_job *job; 4865 4866 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4867 struct amdgpu_ring *ring = adev->rings[i]; 4868 4869 if (!ring || !ring->sched.thread) 4870 continue; 4871 4872 spin_lock(&ring->sched.job_list_lock); 4873 job = list_first_entry_or_null(&ring->sched.pending_list, 4874 struct drm_sched_job, list); 4875 spin_unlock(&ring->sched.job_list_lock); 4876 if (job) 4877 return true; 4878 } 4879 return false; 4880 } 4881 4882 /** 4883 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4884 * 4885 * @adev: amdgpu_device pointer 4886 * 4887 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4888 * a hung GPU. 4889 */ 4890 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4891 { 4892 4893 if (amdgpu_gpu_recovery == 0) 4894 goto disabled; 4895 4896 /* Skip soft reset check in fatal error mode */ 4897 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4898 return true; 4899 4900 if (amdgpu_sriov_vf(adev)) 4901 return true; 4902 4903 if (amdgpu_gpu_recovery == -1) { 4904 switch (adev->asic_type) { 4905 #ifdef CONFIG_DRM_AMDGPU_SI 4906 case CHIP_VERDE: 4907 case CHIP_TAHITI: 4908 case CHIP_PITCAIRN: 4909 case CHIP_OLAND: 4910 case CHIP_HAINAN: 4911 #endif 4912 #ifdef CONFIG_DRM_AMDGPU_CIK 4913 case CHIP_KAVERI: 4914 case CHIP_KABINI: 4915 case CHIP_MULLINS: 4916 #endif 4917 case CHIP_CARRIZO: 4918 case CHIP_STONEY: 4919 case CHIP_CYAN_SKILLFISH: 4920 goto disabled; 4921 default: 4922 break; 4923 } 4924 } 4925 4926 return true; 4927 4928 disabled: 4929 dev_info(adev->dev, "GPU recovery disabled.\n"); 4930 return false; 4931 } 4932 4933 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4934 { 4935 u32 i; 4936 int ret = 0; 4937 4938 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4939 4940 dev_info(adev->dev, "GPU mode1 reset\n"); 4941 4942 /* disable BM */ 4943 pci_clear_master(adev->pdev); 4944 4945 amdgpu_device_cache_pci_state(adev->pdev); 4946 4947 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4948 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4949 ret = amdgpu_dpm_mode1_reset(adev); 4950 } else { 4951 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4952 ret = psp_gpu_reset(adev); 4953 } 4954 4955 if (ret) 4956 goto mode1_reset_failed; 4957 4958 amdgpu_device_load_pci_state(adev->pdev); 4959 ret = amdgpu_psp_wait_for_bootloader(adev); 4960 if (ret) 4961 goto mode1_reset_failed; 4962 4963 /* wait for asic to come out of reset */ 4964 for (i = 0; i < adev->usec_timeout; i++) { 4965 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4966 4967 if (memsize != 0xffffffff) 4968 break; 4969 udelay(1); 4970 } 4971 4972 if (i >= adev->usec_timeout) { 4973 ret = -ETIMEDOUT; 4974 goto mode1_reset_failed; 4975 } 4976 4977 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4978 4979 return 0; 4980 4981 mode1_reset_failed: 4982 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4983 return ret; 4984 } 4985 4986 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4987 struct amdgpu_reset_context *reset_context) 4988 { 4989 int i, r = 0; 4990 struct amdgpu_job *job = NULL; 4991 bool need_full_reset = 
4992 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4993 4994 if (reset_context->reset_req_dev == adev) 4995 job = reset_context->job; 4996 4997 if (amdgpu_sriov_vf(adev)) { 4998 /* stop the data exchange thread */ 4999 amdgpu_virt_fini_data_exchange(adev); 5000 } 5001 5002 amdgpu_fence_driver_isr_toggle(adev, true); 5003 5004 /* block all schedulers and reset given job's ring */ 5005 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5006 struct amdgpu_ring *ring = adev->rings[i]; 5007 5008 if (!ring || !ring->sched.thread) 5009 continue; 5010 5011 /* Clear job fence from fence drv to avoid force_completion 5012 * leave NULL and vm flush fence in fence drv 5013 */ 5014 amdgpu_fence_driver_clear_job_fences(ring); 5015 5016 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5017 amdgpu_fence_driver_force_completion(ring); 5018 } 5019 5020 amdgpu_fence_driver_isr_toggle(adev, false); 5021 5022 if (job && job->vm) 5023 drm_sched_increase_karma(&job->base); 5024 5025 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5026 /* If reset handler not implemented, continue; otherwise return */ 5027 if (r == -EOPNOTSUPP) 5028 r = 0; 5029 else 5030 return r; 5031 5032 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5033 if (!amdgpu_sriov_vf(adev)) { 5034 5035 if (!need_full_reset) 5036 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5037 5038 if (!need_full_reset && amdgpu_gpu_recovery && 5039 amdgpu_device_ip_check_soft_reset(adev)) { 5040 amdgpu_device_ip_pre_soft_reset(adev); 5041 r = amdgpu_device_ip_soft_reset(adev); 5042 amdgpu_device_ip_post_soft_reset(adev); 5043 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5044 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5045 need_full_reset = true; 5046 } 5047 } 5048 5049 if (need_full_reset) 5050 r = amdgpu_device_ip_suspend(adev); 5051 if (need_full_reset) 5052 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5053 else 5054 clear_bit(AMDGPU_NEED_FULL_RESET, 5055 &reset_context->flags); 5056 } 5057 5058 return r; 5059 } 5060 5061 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 5062 { 5063 int i; 5064 5065 lockdep_assert_held(&adev->reset_domain->sem); 5066 5067 for (i = 0; i < adev->reset_info.num_regs; i++) { 5068 adev->reset_info.reset_dump_reg_value[i] = 5069 RREG32(adev->reset_info.reset_dump_reg_list[i]); 5070 5071 trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i], 5072 adev->reset_info.reset_dump_reg_value[i]); 5073 } 5074 5075 return 0; 5076 } 5077 5078 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5079 struct amdgpu_reset_context *reset_context) 5080 { 5081 struct amdgpu_device *tmp_adev = NULL; 5082 bool need_full_reset, skip_hw_reset, vram_lost = false; 5083 int r = 0; 5084 bool gpu_reset_for_dev_remove = 0; 5085 5086 /* Try reset handler method first */ 5087 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5088 reset_list); 5089 amdgpu_reset_reg_dumps(tmp_adev); 5090 5091 reset_context->reset_device_list = device_list_handle; 5092 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5093 /* If reset handler not implemented, continue; otherwise return */ 5094 if (r == -EOPNOTSUPP) 5095 r = 0; 5096 else 5097 return r; 5098 5099 /* Reset handler not implemented, use the default method */ 5100 need_full_reset = 5101 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5102 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5103 5104 
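	/*
	 * Illustrative only: a caller typically prepares the reset context and
	 * its flags before invoking amdgpu_do_asic_reset(). The field values
	 * below are an example, not a prescription:
	 *
	 *   struct amdgpu_reset_context reset_context;
	 *
	 *   memset(&reset_context, 0, sizeof(reset_context));
	 *   reset_context.method = AMD_RESET_METHOD_NONE;
	 *   reset_context.reset_req_dev = adev;
	 *   set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	 *   r = amdgpu_do_asic_reset(&device_list, &reset_context);
	 */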
gpu_reset_for_dev_remove = 5105 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5106 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5107 5108 /* 5109 * ASIC reset has to be done on all XGMI hive nodes ASAP 5110 * to allow proper links negotiation in FW (within 1 sec) 5111 */ 5112 if (!skip_hw_reset && need_full_reset) { 5113 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5114 /* For XGMI run all resets in parallel to speed up the process */ 5115 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5116 tmp_adev->gmc.xgmi.pending_reset = false; 5117 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5118 r = -EALREADY; 5119 } else 5120 r = amdgpu_asic_reset(tmp_adev); 5121 5122 if (r) { 5123 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5124 r, adev_to_drm(tmp_adev)->unique); 5125 goto out; 5126 } 5127 } 5128 5129 /* For XGMI wait for all resets to complete before proceed */ 5130 if (!r) { 5131 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5132 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5133 flush_work(&tmp_adev->xgmi_reset_work); 5134 r = tmp_adev->asic_reset_res; 5135 if (r) 5136 break; 5137 } 5138 } 5139 } 5140 } 5141 5142 if (!r && amdgpu_ras_intr_triggered()) { 5143 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5144 amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB); 5145 } 5146 5147 amdgpu_ras_intr_cleared(); 5148 } 5149 5150 /* Since the mode1 reset affects base ip blocks, the 5151 * phase1 ip blocks need to be resumed. Otherwise there 5152 * will be a BIOS signature error and the psp bootloader 5153 * can't load kdb on the next amdgpu install. 5154 */ 5155 if (gpu_reset_for_dev_remove) { 5156 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 5157 amdgpu_device_ip_resume_phase1(tmp_adev); 5158 5159 goto end; 5160 } 5161 5162 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5163 if (need_full_reset) { 5164 /* post card */ 5165 r = amdgpu_device_asic_init(tmp_adev); 5166 if (r) { 5167 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5168 } else { 5169 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5170 5171 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5172 if (r) 5173 goto out; 5174 5175 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5176 5177 amdgpu_coredump(tmp_adev, vram_lost, reset_context); 5178 5179 if (vram_lost) { 5180 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5181 amdgpu_inc_vram_lost(tmp_adev); 5182 } 5183 5184 r = amdgpu_device_fw_loading(tmp_adev); 5185 if (r) 5186 return r; 5187 5188 r = amdgpu_xcp_restore_partition_mode( 5189 tmp_adev->xcp_mgr); 5190 if (r) 5191 goto out; 5192 5193 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5194 if (r) 5195 goto out; 5196 5197 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5198 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5199 5200 if (vram_lost) 5201 amdgpu_device_fill_reset_magic(tmp_adev); 5202 5203 /* 5204 * Add this ASIC as tracked as reset was already 5205 * complete successfully. 
5206 */ 5207 amdgpu_register_gpu_instance(tmp_adev); 5208 5209 if (!reset_context->hive && 5210 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5211 amdgpu_xgmi_add_device(tmp_adev); 5212 5213 r = amdgpu_device_ip_late_init(tmp_adev); 5214 if (r) 5215 goto out; 5216 5217 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5218 5219 /* 5220 * The GPU enters bad state once faulty pages 5221 * by ECC has reached the threshold, and ras 5222 * recovery is scheduled next. So add one check 5223 * here to break recovery if it indeed exceeds 5224 * bad page threshold, and remind user to 5225 * retire this GPU or setting one bigger 5226 * bad_page_threshold value to fix this once 5227 * probing driver again. 5228 */ 5229 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5230 /* must succeed. */ 5231 amdgpu_ras_resume(tmp_adev); 5232 } else { 5233 r = -EINVAL; 5234 goto out; 5235 } 5236 5237 /* Update PSP FW topology after reset */ 5238 if (reset_context->hive && 5239 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5240 r = amdgpu_xgmi_update_topology( 5241 reset_context->hive, tmp_adev); 5242 } 5243 } 5244 5245 out: 5246 if (!r) { 5247 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5248 r = amdgpu_ib_ring_tests(tmp_adev); 5249 if (r) { 5250 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5251 need_full_reset = true; 5252 r = -EAGAIN; 5253 goto end; 5254 } 5255 } 5256 5257 if (!r) 5258 r = amdgpu_device_recover_vram(tmp_adev); 5259 else 5260 tmp_adev->asic_reset_res = r; 5261 } 5262 5263 end: 5264 if (need_full_reset) 5265 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5266 else 5267 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5268 return r; 5269 } 5270 5271 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5272 { 5273 5274 switch (amdgpu_asic_reset_method(adev)) { 5275 case AMD_RESET_METHOD_MODE1: 5276 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5277 break; 5278 case AMD_RESET_METHOD_MODE2: 5279 adev->mp1_state = PP_MP1_STATE_RESET; 5280 break; 5281 default: 5282 adev->mp1_state = PP_MP1_STATE_NONE; 5283 break; 5284 } 5285 } 5286 5287 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5288 { 5289 amdgpu_vf_error_trans_all(adev); 5290 adev->mp1_state = PP_MP1_STATE_NONE; 5291 } 5292 5293 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5294 { 5295 struct pci_dev *p = NULL; 5296 5297 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5298 adev->pdev->bus->number, 1); 5299 if (p) { 5300 pm_runtime_enable(&(p->dev)); 5301 pm_runtime_resume(&(p->dev)); 5302 } 5303 5304 pci_dev_put(p); 5305 } 5306 5307 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5308 { 5309 enum amd_reset_method reset_method; 5310 struct pci_dev *p = NULL; 5311 u64 expires; 5312 5313 /* 5314 * For now, only BACO and mode1 reset are confirmed 5315 * to suffer the audio issue without proper suspended. 5316 */ 5317 reset_method = amdgpu_asic_reset_method(adev); 5318 if ((reset_method != AMD_RESET_METHOD_BACO) && 5319 (reset_method != AMD_RESET_METHOD_MODE1)) 5320 return -EINVAL; 5321 5322 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5323 adev->pdev->bus->number, 1); 5324 if (!p) 5325 return -ENODEV; 5326 5327 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5328 if (!expires) 5329 /* 5330 * If we cannot get the audio device autosuspend delay, 5331 * a fixed 4S interval will be used. Considering 3S is 5332 * the audio controller default autosuspend delay setting. 
5333 * 4S used here is guaranteed to cover that. 5334 */ 5335 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5336 5337 while (!pm_runtime_status_suspended(&(p->dev))) { 5338 if (!pm_runtime_suspend(&(p->dev))) 5339 break; 5340 5341 if (expires < ktime_get_mono_fast_ns()) { 5342 dev_warn(adev->dev, "failed to suspend display audio\n"); 5343 pci_dev_put(p); 5344 /* TODO: abort the succeeding gpu reset? */ 5345 return -ETIMEDOUT; 5346 } 5347 } 5348 5349 pm_runtime_disable(&(p->dev)); 5350 5351 pci_dev_put(p); 5352 return 0; 5353 } 5354 5355 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5356 { 5357 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5358 5359 #if defined(CONFIG_DEBUG_FS) 5360 if (!amdgpu_sriov_vf(adev)) 5361 cancel_work(&adev->reset_work); 5362 #endif 5363 5364 if (adev->kfd.dev) 5365 cancel_work(&adev->kfd.reset_work); 5366 5367 if (amdgpu_sriov_vf(adev)) 5368 cancel_work(&adev->virt.flr_work); 5369 5370 if (con && adev->ras_enabled) 5371 cancel_work(&con->recovery_work); 5372 5373 } 5374 5375 /** 5376 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5377 * 5378 * @adev: amdgpu_device pointer 5379 * @job: which job trigger hang 5380 * @reset_context: amdgpu reset context pointer 5381 * 5382 * Attempt to reset the GPU if it has hung (all asics). 5383 * Attempt to do soft-reset or full-reset and reinitialize Asic 5384 * Returns 0 for success or an error on failure. 5385 */ 5386 5387 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5388 struct amdgpu_job *job, 5389 struct amdgpu_reset_context *reset_context) 5390 { 5391 struct list_head device_list, *device_list_handle = NULL; 5392 bool job_signaled = false; 5393 struct amdgpu_hive_info *hive = NULL; 5394 struct amdgpu_device *tmp_adev = NULL; 5395 int i, r = 0; 5396 bool need_emergency_restart = false; 5397 bool audio_suspended = false; 5398 bool gpu_reset_for_dev_remove = false; 5399 5400 gpu_reset_for_dev_remove = 5401 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5402 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5403 5404 /* 5405 * Special case: RAS triggered and full reset isn't supported 5406 */ 5407 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5408 5409 /* 5410 * Flush RAM to disk so that after reboot 5411 * the user can read log and see why the system rebooted. 5412 */ 5413 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5414 amdgpu_ras_get_context(adev)->reboot) { 5415 DRM_WARN("Emergency reboot."); 5416 5417 ksys_sync_helper(); 5418 emergency_restart(); 5419 } 5420 5421 dev_info(adev->dev, "GPU %s begin!\n", 5422 need_emergency_restart ? "jobs stop":"reset"); 5423 5424 if (!amdgpu_sriov_vf(adev)) 5425 hive = amdgpu_get_xgmi_hive(adev); 5426 if (hive) 5427 mutex_lock(&hive->hive_lock); 5428 5429 reset_context->job = job; 5430 reset_context->hive = hive; 5431 /* 5432 * Build list of devices to reset. 5433 * In case we are in XGMI hive mode, resort the device list 5434 * to put adev in the 1st position. 
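 *
 * For example (illustrative hive only): if the hive enumerates the devices
 * as [A, B, C] and the reset was requested on B, list_rotate_to_front()
 * turns the reset list into [B, C, A], so B is handled first while the
 * relative (cyclic) order of the remaining nodes is preserved.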
5435 */ 5436 INIT_LIST_HEAD(&device_list); 5437 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5438 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5439 list_add_tail(&tmp_adev->reset_list, &device_list); 5440 if (gpu_reset_for_dev_remove && adev->shutdown) 5441 tmp_adev->shutdown = true; 5442 } 5443 if (!list_is_first(&adev->reset_list, &device_list)) 5444 list_rotate_to_front(&adev->reset_list, &device_list); 5445 device_list_handle = &device_list; 5446 } else { 5447 list_add_tail(&adev->reset_list, &device_list); 5448 device_list_handle = &device_list; 5449 } 5450 5451 /* We need to lock reset domain only once both for XGMI and single device */ 5452 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5453 reset_list); 5454 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5455 5456 /* block all schedulers and reset given job's ring */ 5457 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5458 5459 amdgpu_device_set_mp1_state(tmp_adev); 5460 5461 /* 5462 * Try to put the audio codec into suspend state 5463 * before gpu reset started. 5464 * 5465 * Due to the power domain of the graphics device 5466 * is shared with AZ power domain. Without this, 5467 * we may change the audio hardware from behind 5468 * the audio driver's back. That will trigger 5469 * some audio codec errors. 5470 */ 5471 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5472 audio_suspended = true; 5473 5474 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5475 5476 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5477 5478 if (!amdgpu_sriov_vf(tmp_adev)) 5479 amdgpu_amdkfd_pre_reset(tmp_adev); 5480 5481 /* 5482 * Mark these ASICs to be reseted as untracked first 5483 * And add them back after reset completed 5484 */ 5485 amdgpu_unregister_gpu_instance(tmp_adev); 5486 5487 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5488 5489 /* disable ras on ALL IPs */ 5490 if (!need_emergency_restart && 5491 amdgpu_device_ip_need_full_reset(tmp_adev)) 5492 amdgpu_ras_suspend(tmp_adev); 5493 5494 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5495 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5496 5497 if (!ring || !ring->sched.thread) 5498 continue; 5499 5500 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5501 5502 if (need_emergency_restart) 5503 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5504 } 5505 atomic_inc(&tmp_adev->gpu_reset_counter); 5506 } 5507 5508 if (need_emergency_restart) 5509 goto skip_sched_resume; 5510 5511 /* 5512 * Must check guilty signal here since after this point all old 5513 * HW fences are force signaled. 5514 * 5515 * job->base holds a reference to parent fence 5516 */ 5517 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5518 job_signaled = true; 5519 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5520 goto skip_hw_reset; 5521 } 5522 5523 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5524 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5525 if (gpu_reset_for_dev_remove) { 5526 /* Workaroud for ASICs need to disable SMC first */ 5527 amdgpu_device_smu_fini_early(tmp_adev); 5528 } 5529 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5530 /*TODO Should we stop ?*/ 5531 if (r) { 5532 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5533 r, adev_to_drm(tmp_adev)->unique); 5534 tmp_adev->asic_reset_res = r; 5535 } 5536 5537 /* 5538 * Drop all pending non scheduler resets. 
Scheduler resets 5539 * were already dropped during drm_sched_stop 5540 */ 5541 amdgpu_device_stop_pending_resets(tmp_adev); 5542 } 5543 5544 /* Actual ASIC resets if needed.*/ 5545 /* Host driver will handle XGMI hive reset for SRIOV */ 5546 if (amdgpu_sriov_vf(adev)) { 5547 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5548 if (r) 5549 adev->asic_reset_res = r; 5550 5551 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5552 if (amdgpu_ip_version(adev, GC_HWIP, 0) == 5553 IP_VERSION(9, 4, 2) || 5554 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5555 amdgpu_ras_resume(adev); 5556 } else { 5557 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5558 if (r && r == -EAGAIN) 5559 goto retry; 5560 5561 if (!r && gpu_reset_for_dev_remove) 5562 goto recover_end; 5563 } 5564 5565 skip_hw_reset: 5566 5567 /* Post ASIC reset for all devs .*/ 5568 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5569 5570 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5571 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5572 5573 if (!ring || !ring->sched.thread) 5574 continue; 5575 5576 drm_sched_start(&ring->sched, true); 5577 } 5578 5579 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5580 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5581 5582 if (tmp_adev->asic_reset_res) 5583 r = tmp_adev->asic_reset_res; 5584 5585 tmp_adev->asic_reset_res = 0; 5586 5587 if (r) { 5588 /* bad news, how to tell it to userspace ? */ 5589 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5590 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5591 } else { 5592 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5593 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5594 DRM_WARN("smart shift update failed\n"); 5595 } 5596 } 5597 5598 skip_sched_resume: 5599 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5600 /* unlock kfd: SRIOV would do it separately */ 5601 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5602 amdgpu_amdkfd_post_reset(tmp_adev); 5603 5604 /* kfd_post_reset will do nothing if kfd device is not initialized, 5605 * need to bring up kfd here if it's not be initialized before 5606 */ 5607 if (!adev->kfd.init_complete) 5608 amdgpu_amdkfd_device_init(adev); 5609 5610 if (audio_suspended) 5611 amdgpu_device_resume_display_audio(tmp_adev); 5612 5613 amdgpu_device_unset_mp1_state(tmp_adev); 5614 5615 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5616 } 5617 5618 recover_end: 5619 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5620 reset_list); 5621 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5622 5623 if (hive) { 5624 mutex_unlock(&hive->hive_lock); 5625 amdgpu_put_xgmi_hive(hive); 5626 } 5627 5628 if (r) 5629 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5630 5631 atomic_set(&adev->reset_domain->reset_res, r); 5632 return r; 5633 } 5634 5635 /** 5636 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 5637 * 5638 * @adev: amdgpu_device pointer 5639 * 5640 * Fetchs and stores in the driver the PCIE capabilities (gen speed 5641 * and lanes) of the slot the device is in. Handles APUs and 5642 * virtualized environments where PCIE config space may not be available. 
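 *
 * Illustrative sketch of the two inputs combined below (not a usable
 * replacement for this function): the ASIC capability comes from the device
 * itself, the platform capability from the slot/chain above it:
 *
 *   speed_cap = pcie_get_speed_cap(adev->pdev);          // ASIC side
 *   pcie_bandwidth_available(adev->pdev, NULL,
 *                            &platform_speed_cap,
 *                            &platform_link_width);      // platform side
 *
 * Both results are then translated into CAIL_* link speed/width masks, and
 * the amdgpu_pcie_gen_cap / amdgpu_pcie_lane_cap module parameters can
 * override either mask entirely.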
5643 */ 5644 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5645 { 5646 struct pci_dev *pdev; 5647 enum pci_bus_speed speed_cap, platform_speed_cap; 5648 enum pcie_link_width platform_link_width; 5649 5650 if (amdgpu_pcie_gen_cap) 5651 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5652 5653 if (amdgpu_pcie_lane_cap) 5654 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5655 5656 /* covers APUs as well */ 5657 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5658 if (adev->pm.pcie_gen_mask == 0) 5659 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5660 if (adev->pm.pcie_mlw_mask == 0) 5661 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5662 return; 5663 } 5664 5665 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5666 return; 5667 5668 pcie_bandwidth_available(adev->pdev, NULL, 5669 &platform_speed_cap, &platform_link_width); 5670 5671 if (adev->pm.pcie_gen_mask == 0) { 5672 /* asic caps */ 5673 pdev = adev->pdev; 5674 speed_cap = pcie_get_speed_cap(pdev); 5675 if (speed_cap == PCI_SPEED_UNKNOWN) { 5676 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5677 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5678 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5679 } else { 5680 if (speed_cap == PCIE_SPEED_32_0GT) 5681 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5682 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5683 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5684 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5685 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5686 else if (speed_cap == PCIE_SPEED_16_0GT) 5687 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5688 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5689 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5690 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5691 else if (speed_cap == PCIE_SPEED_8_0GT) 5692 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5693 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5694 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5695 else if (speed_cap == PCIE_SPEED_5_0GT) 5696 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5697 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5698 else 5699 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5700 } 5701 /* platform caps */ 5702 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5703 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5704 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5705 } else { 5706 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5707 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5708 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5709 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5710 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5711 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5712 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5713 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5714 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5715 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5716 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5717 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5718 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5719 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5720 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5721 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5722 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5723 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5724 else 5725 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5726 5727 } 5728 } 5729 if (adev->pm.pcie_mlw_mask == 0) { 5730 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) 
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
		} else {
			switch (platform_link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
	}
}
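
/*
 * Illustrative sketch (not part of the driver): one way a caller could
 * interpret the cached mask that amdgpu_device_get_pcie_info() fills in.
 * The helper name below is hypothetical; the CAIL_* platform-speed bits
 * are the same ones ORed into adev->pm.pcie_gen_mask above.
 */
static unsigned int __maybe_unused
amdgpu_example_max_platform_pcie_gen(struct amdgpu_device *adev)
{
	/* Highest platform link-speed bit wins. */
	if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5)
		return 5;
	if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4)
		return 4;
	if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)
		return 3;
	if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2)
		return 2;
	return 1;
}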

/**
 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
 *
 * @adev: amdgpu_device pointer
 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
 *
 * Return true if @peer_adev can access (DMA) @adev through the PCIe
 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
 * @peer_adev.
 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
				      struct amdgpu_device *peer_adev)
{
#ifdef CONFIG_HSA_AMD_P2P
	uint64_t address_mask = peer_adev->dev->dma_mask ?
		~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
	resource_size_t aper_limit =
		adev->gmc.aper_base + adev->gmc.aper_size - 1;
	bool p2p_access =
		!adev->gmc.xgmi.connected_to_cpu &&
		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);

	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
		adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
		!(adev->gmc.aper_base & address_mask ||
		  aper_limit & address_mask));
#else
	return false;
#endif
}

int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}
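
/*
 * Illustrative sketch (not part of the driver): the intended pairing of the
 * BACO helpers above. A caller enters BACO, leaves the device quiescent for
 * some platform-defined low-power window, and then exits BACO, propagating
 * the error from either step. The wrapper name is hypothetical.
 */
static int __maybe_unused amdgpu_example_baco_cycle(struct drm_device *dev)
{
	int r;

	r = amdgpu_device_baco_enter(dev);
	if (r)
		return r;	/* e.g. -ENOTSUPP if BACO is unavailable */

	/* ... device sits in BACO while the platform saves power ... */

	return amdgpu_device_baco_exit(dev);
}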

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */
	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}
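
/*
 * Illustrative sketch (not part of this file): how the four AER callbacks
 * above fit together. The PCI core invokes them through a struct
 * pci_error_handlers attached to the PCI driver; the table below is only an
 * example of that wiring, with a hypothetical variable name.
 */
static const struct pci_error_handlers amdgpu_example_pci_err_handlers __maybe_unused = {
	.error_detected	= amdgpu_pci_error_detected,	/* classify the error */
	.mmio_enabled	= amdgpu_pci_mmio_enabled,	/* MMIO usable again */
	.slot_reset	= amdgpu_pci_slot_reset,	/* re-init after slot reset */
	.resume		= amdgpu_pci_resume,		/* restart scheduling */
};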

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to preserve the error context when an error occurs.
 * Compared to a simple hang, the system remains stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs,
 *    etc), clears all CPU mappings to the device and disallows remappings
 *    through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
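
/*
 * Illustrative sketch (not part of the driver): a read-modify-write of an
 * indexed PCIe port register using the two accessors above. The register
 * offset and bit below are placeholders, not real hardware definitions;
 * the index/data pair and the locking are hidden inside the helpers.
 */
static void __maybe_unused
amdgpu_example_pcie_port_set_bit(struct amdgpu_device *adev)
{
	const u32 example_reg = 0x10;	/* hypothetical dword-indexed offset */
	u32 tmp;

	tmp = amdgpu_device_pcie_port_rreg(adev, example_reg);
	tmp |= BIT(0);			/* hypothetical enable bit */
	amdgpu_device_pcie_port_wreg(adev, example_reg, tmp);
}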

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
		uint32_t inst, uint32_t reg_addr, char reg_name[],
		uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
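
/*
 * Illustrative sketch (not part of the driver): how an IP block could use
 * amdgpu_device_wait_on_rreg() to spin until a status bit comes up. The
 * register offset, mask and name are hypothetical placeholders; a real
 * caller would pass the block's actual status register and expected value.
 */
static int __maybe_unused amdgpu_example_wait_ready(struct amdgpu_device *adev)
{
	const uint32_t example_status_reg = 0x1234;	/* hypothetical offset */
	const uint32_t ready_mask = BIT(0);		/* hypothetical READY bit */

	/* Returns 0 once (reg & ready_mask) == ready_mask, -ETIMEDOUT otherwise. */
	return amdgpu_device_wait_on_rreg(adev, 0, example_status_reg,
					  "EXAMPLE_STATUS", ready_mask,
					  ready_mask);
}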