/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
111 "POLARIS11", 112 "POLARIS12", 113 "VEGAM", 114 "VEGA10", 115 "VEGA12", 116 "VEGA20", 117 "RAVEN", 118 "ARCTURUS", 119 "RENOIR", 120 "ALDEBARAN", 121 "NAVI10", 122 "CYAN_SKILLFISH", 123 "NAVI14", 124 "NAVI12", 125 "SIENNA_CICHLID", 126 "NAVY_FLOUNDER", 127 "VANGOGH", 128 "DIMGREY_CAVEFISH", 129 "BEIGE_GOBY", 130 "YELLOW_CARP", 131 "IP DISCOVERY", 132 "LAST", 133 }; 134 135 /** 136 * DOC: pcie_replay_count 137 * 138 * The amdgpu driver provides a sysfs API for reporting the total number 139 * of PCIe replays (NAKs) 140 * The file pcie_replay_count is used for this and returns the total 141 * number of replays as a sum of the NAKs generated and NAKs received 142 */ 143 144 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 145 struct device_attribute *attr, char *buf) 146 { 147 struct drm_device *ddev = dev_get_drvdata(dev); 148 struct amdgpu_device *adev = drm_to_adev(ddev); 149 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 150 151 return sysfs_emit(buf, "%llu\n", cnt); 152 } 153 154 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 155 amdgpu_device_get_pcie_replay_count, NULL); 156 157 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 158 159 /** 160 * DOC: product_name 161 * 162 * The amdgpu driver provides a sysfs API for reporting the product name 163 * for the device 164 * The file serial_number is used for this and returns the product name 165 * as returned from the FRU. 166 * NOTE: This is only available for certain server cards 167 */ 168 169 static ssize_t amdgpu_device_get_product_name(struct device *dev, 170 struct device_attribute *attr, char *buf) 171 { 172 struct drm_device *ddev = dev_get_drvdata(dev); 173 struct amdgpu_device *adev = drm_to_adev(ddev); 174 175 return sysfs_emit(buf, "%s\n", adev->product_name); 176 } 177 178 static DEVICE_ATTR(product_name, S_IRUGO, 179 amdgpu_device_get_product_name, NULL); 180 181 /** 182 * DOC: product_number 183 * 184 * The amdgpu driver provides a sysfs API for reporting the part number 185 * for the device 186 * The file serial_number is used for this and returns the part number 187 * as returned from the FRU. 188 * NOTE: This is only available for certain server cards 189 */ 190 191 static ssize_t amdgpu_device_get_product_number(struct device *dev, 192 struct device_attribute *attr, char *buf) 193 { 194 struct drm_device *ddev = dev_get_drvdata(dev); 195 struct amdgpu_device *adev = drm_to_adev(ddev); 196 197 return sysfs_emit(buf, "%s\n", adev->product_number); 198 } 199 200 static DEVICE_ATTR(product_number, S_IRUGO, 201 amdgpu_device_get_product_number, NULL); 202 203 /** 204 * DOC: serial_number 205 * 206 * The amdgpu driver provides a sysfs API for reporting the serial number 207 * for the device 208 * The file serial_number is used for this and returns the serial number 209 * as returned from the FRU. 210 * NOTE: This is only available for certain server cards 211 */ 212 213 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 214 struct device_attribute *attr, char *buf) 215 { 216 struct drm_device *ddev = dev_get_drvdata(dev); 217 struct amdgpu_device *adev = drm_to_adev(ddev); 218 219 return sysfs_emit(buf, "%s\n", adev->serial); 220 } 221 222 static DEVICE_ATTR(serial_number, S_IRUGO, 223 amdgpu_device_get_serial_number, NULL); 224 225 /** 226 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 227 * 228 * @dev: drm_device pointer 229 * 230 * Returns true if the device is a dGPU with ATPX power control, 231 * otherwise return false. 
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
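 *
 * Returns 0 when the visible VRAM aperture is not CPU mapped (for example
 * on 32-bit kernels), in which case the caller is expected to fall back to
 * MM_INDEX/MM_DATA access.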
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM access for the rest of the vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_sem))
			up_read(&adev->reset_sem);
		else
			lockdep_assert_held(&adev->reset_sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
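 *
 * Offsets inside the MMIO BAR are read directly (or through the KIQ when
 * running as an SR-IOV guest); offsets beyond the BAR are routed through
 * the indirect PCIe register path.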
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: bytes offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: bytes offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
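 *
 * Like amdgpu_device_rreg(), the write goes through the KIQ for SR-IOV
 * guests when possible, and through the indirect PCIe path for offsets
 * beyond the MMIO BAR.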
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 *
 * this function is invoked only for the debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
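 *
 * The value is written with a single 64-bit store so the hardware never
 * observes a half-updated doorbell.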
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 pcie_index, u32 pcie_data,
				u32 reg_addr)
{
	unsigned long flags;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 pcie_index, u32 pcie_data,
				  u32 reg_addr)
{
	unsigned long flags;
	u64 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 pcie_index, u32 pcie_data,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 pcie_index, u32 pcie_data,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
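 *
 * The reset is triggered by writing AMDGPU_ASIC_RESET_DATA to the PCI
 * config register at offset 0x7c.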
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * GPU doorbell aperture helpers function.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
					     adev->doorbell_index.max_assignment+1);
	if (adev->doorbell.num_doorbells == 0)
		return -EINVAL;

	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
	 * paging queue doorbell use the second page. The
	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
	 * doorbells are in the first page. So with paging queue enabled,
	 * max num_doorbells should be increased by one page (0x400 in dwords).
	 */
	if (adev->asic_type >= CHIP_VEGA10)
		adev->doorbell.num_doorbells += 0x400;

	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
}



/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
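 * Each writeback slot is 256 bits (8 dwords) wide; amdgpu_device_wb_get()
 * hands out slots as dword offsets into this buffer.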
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both.
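	 * Memory decode was disabled just above, so both windows can be
	 * released and re-assigned safely.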
	 */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed, false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old SMC firmware still needs the driver to do a vPost, otherwise
		 * the GPU hangs. SMC firmware versions above 22.15 don't have this flaw,
		 * so we force vPost for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;
			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	amdgpu_gmc_tmz_set(adev);

	amdgpu_gmc_noretry_set(adev);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
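 *
 * When switched off, the device is suspended, its PCI state is cached and
 * it is placed into D3cold; switching it back on restores D0, re-enables
 * the device and resumes it.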
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u32 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
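 *
 * The expected format is a semicolon separated list of
 * "pci_address,num_crtc" entries; the address "all" matches every device
 * and num_crtc, when given, is clamped to the range 1-6.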
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
		amdgpu_discovery_get_gfx_info(adev);

		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
1911 */ 1912 if (adev->asic_type != CHIP_NAVI12) 1913 return 0; 1914 } 1915 1916 switch (adev->asic_type) { 1917 #ifdef CONFIG_DRM_AMDGPU_SI 1918 case CHIP_VERDE: 1919 case CHIP_TAHITI: 1920 case CHIP_PITCAIRN: 1921 case CHIP_OLAND: 1922 case CHIP_HAINAN: 1923 #endif 1924 #ifdef CONFIG_DRM_AMDGPU_CIK 1925 case CHIP_BONAIRE: 1926 case CHIP_HAWAII: 1927 case CHIP_KAVERI: 1928 case CHIP_KABINI: 1929 case CHIP_MULLINS: 1930 #endif 1931 case CHIP_TOPAZ: 1932 case CHIP_TONGA: 1933 case CHIP_FIJI: 1934 case CHIP_POLARIS10: 1935 case CHIP_POLARIS11: 1936 case CHIP_POLARIS12: 1937 case CHIP_VEGAM: 1938 case CHIP_CARRIZO: 1939 case CHIP_STONEY: 1940 case CHIP_VEGA20: 1941 case CHIP_ALDEBARAN: 1942 case CHIP_SIENNA_CICHLID: 1943 case CHIP_NAVY_FLOUNDER: 1944 case CHIP_DIMGREY_CAVEFISH: 1945 case CHIP_BEIGE_GOBY: 1946 default: 1947 return 0; 1948 case CHIP_VEGA10: 1949 chip_name = "vega10"; 1950 break; 1951 case CHIP_VEGA12: 1952 chip_name = "vega12"; 1953 break; 1954 case CHIP_RAVEN: 1955 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1956 chip_name = "raven2"; 1957 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1958 chip_name = "picasso"; 1959 else 1960 chip_name = "raven"; 1961 break; 1962 case CHIP_ARCTURUS: 1963 chip_name = "arcturus"; 1964 break; 1965 case CHIP_RENOIR: 1966 if (adev->apu_flags & AMD_APU_IS_RENOIR) 1967 chip_name = "renoir"; 1968 else 1969 chip_name = "green_sardine"; 1970 break; 1971 case CHIP_NAVI10: 1972 chip_name = "navi10"; 1973 break; 1974 case CHIP_NAVI14: 1975 chip_name = "navi14"; 1976 break; 1977 case CHIP_NAVI12: 1978 chip_name = "navi12"; 1979 break; 1980 case CHIP_VANGOGH: 1981 chip_name = "vangogh"; 1982 break; 1983 case CHIP_YELLOW_CARP: 1984 chip_name = "yellow_carp"; 1985 break; 1986 } 1987 1988 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1989 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 1990 if (err) { 1991 dev_err(adev->dev, 1992 "Failed to load gpu_info firmware \"%s\"\n", 1993 fw_name); 1994 goto out; 1995 } 1996 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 1997 if (err) { 1998 dev_err(adev->dev, 1999 "Failed to validate gpu_info firmware \"%s\"\n", 2000 fw_name); 2001 goto out; 2002 } 2003 2004 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2005 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2006 2007 switch (hdr->version_major) { 2008 case 1: 2009 { 2010 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2011 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2012 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2013 2014 /* 2015 * Should be droped when DAL no longer needs it. 
2016 */ 2017 if (adev->asic_type == CHIP_NAVI12) 2018 goto parse_soc_bounding_box; 2019 2020 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2021 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2022 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2023 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2024 adev->gfx.config.max_texture_channel_caches = 2025 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2026 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2027 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2028 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2029 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2030 adev->gfx.config.double_offchip_lds_buf = 2031 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2032 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2033 adev->gfx.cu_info.max_waves_per_simd = 2034 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2035 adev->gfx.cu_info.max_scratch_slots_per_cu = 2036 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2037 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2038 if (hdr->version_minor >= 1) { 2039 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2040 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2041 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2042 adev->gfx.config.num_sc_per_sh = 2043 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2044 adev->gfx.config.num_packer_per_sc = 2045 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2046 } 2047 2048 parse_soc_bounding_box: 2049 /* 2050 * soc bounding box info is not integrated in disocovery table, 2051 * we always need to parse it from gpu info firmware if needed. 2052 */ 2053 if (hdr->version_minor == 2) { 2054 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2055 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2056 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2057 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2058 } 2059 break; 2060 } 2061 default: 2062 dev_err(adev->dev, 2063 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2064 err = -EINVAL; 2065 goto out; 2066 } 2067 out: 2068 return err; 2069 } 2070 2071 /** 2072 * amdgpu_device_ip_early_init - run early init for hardware IPs 2073 * 2074 * @adev: amdgpu_device pointer 2075 * 2076 * Early initialization pass for hardware IPs. The hardware IPs that make 2077 * up each asic are discovered each IP's early_init callback is run. This 2078 * is the first stage in initializing the asic. 2079 * Returns 0 on success, negative error code on failure. 
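 *
 * Note that besides running each IP's early_init callback, this pass also
 * selects the IP block list for the detected ASIC family, requests full
 * GPU access for SR-IOV VFs, honours the ip_block_mask module parameter,
 * and, once the common block is reached, reads the gpu_info firmware and
 * the VBIOS.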
2080 */ 2081 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2082 { 2083 struct drm_device *dev = adev_to_drm(adev); 2084 struct pci_dev *parent; 2085 int i, r; 2086 2087 amdgpu_device_enable_virtual_display(adev); 2088 2089 if (amdgpu_sriov_vf(adev)) { 2090 r = amdgpu_virt_request_full_gpu(adev, true); 2091 if (r) 2092 return r; 2093 } 2094 2095 switch (adev->asic_type) { 2096 #ifdef CONFIG_DRM_AMDGPU_SI 2097 case CHIP_VERDE: 2098 case CHIP_TAHITI: 2099 case CHIP_PITCAIRN: 2100 case CHIP_OLAND: 2101 case CHIP_HAINAN: 2102 adev->family = AMDGPU_FAMILY_SI; 2103 r = si_set_ip_blocks(adev); 2104 if (r) 2105 return r; 2106 break; 2107 #endif 2108 #ifdef CONFIG_DRM_AMDGPU_CIK 2109 case CHIP_BONAIRE: 2110 case CHIP_HAWAII: 2111 case CHIP_KAVERI: 2112 case CHIP_KABINI: 2113 case CHIP_MULLINS: 2114 if (adev->flags & AMD_IS_APU) 2115 adev->family = AMDGPU_FAMILY_KV; 2116 else 2117 adev->family = AMDGPU_FAMILY_CI; 2118 2119 r = cik_set_ip_blocks(adev); 2120 if (r) 2121 return r; 2122 break; 2123 #endif 2124 case CHIP_TOPAZ: 2125 case CHIP_TONGA: 2126 case CHIP_FIJI: 2127 case CHIP_POLARIS10: 2128 case CHIP_POLARIS11: 2129 case CHIP_POLARIS12: 2130 case CHIP_VEGAM: 2131 case CHIP_CARRIZO: 2132 case CHIP_STONEY: 2133 if (adev->flags & AMD_IS_APU) 2134 adev->family = AMDGPU_FAMILY_CZ; 2135 else 2136 adev->family = AMDGPU_FAMILY_VI; 2137 2138 r = vi_set_ip_blocks(adev); 2139 if (r) 2140 return r; 2141 break; 2142 default: 2143 r = amdgpu_discovery_set_ip_blocks(adev); 2144 if (r) 2145 return r; 2146 break; 2147 } 2148 2149 if (amdgpu_has_atpx() && 2150 (amdgpu_is_atpx_hybrid() || 2151 amdgpu_has_atpx_dgpu_power_cntl()) && 2152 ((adev->flags & AMD_IS_APU) == 0) && 2153 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2154 adev->flags |= AMD_IS_PX; 2155 2156 parent = pci_upstream_bridge(adev->pdev); 2157 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2158 2159 amdgpu_amdkfd_device_probe(adev); 2160 2161 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2162 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2163 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2164 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2165 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2166 2167 for (i = 0; i < adev->num_ip_blocks; i++) { 2168 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2169 DRM_ERROR("disabled ip block: %d <%s>\n", 2170 i, adev->ip_blocks[i].version->funcs->name); 2171 adev->ip_blocks[i].status.valid = false; 2172 } else { 2173 if (adev->ip_blocks[i].version->funcs->early_init) { 2174 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2175 if (r == -ENOENT) { 2176 adev->ip_blocks[i].status.valid = false; 2177 } else if (r) { 2178 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2179 adev->ip_blocks[i].version->funcs->name, r); 2180 return r; 2181 } else { 2182 adev->ip_blocks[i].status.valid = true; 2183 } 2184 } else { 2185 adev->ip_blocks[i].status.valid = true; 2186 } 2187 } 2188 /* get the vbios after the asic_funcs are set up */ 2189 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2190 r = amdgpu_device_parse_gpu_info_fw(adev); 2191 if (r) 2192 return r; 2193 2194 /* Read BIOS */ 2195 if (!amdgpu_get_bios(adev)) 2196 return -EINVAL; 2197 2198 r = amdgpu_atombios_init(adev); 2199 if (r) { 2200 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2201 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2202 return r; 2203 } 2204 2205 /*get pf2vf msg info at it's earliest time*/ 2206 if (amdgpu_sriov_vf(adev)) 2207 amdgpu_virt_init_data_exchange(adev); 2208 2209 } 2210 } 2211 2212 adev->cg_flags &= amdgpu_cg_mask; 2213 adev->pg_flags &= amdgpu_pg_mask; 2214 2215 return 0; 2216 } 2217 2218 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2219 { 2220 int i, r; 2221 2222 for (i = 0; i < adev->num_ip_blocks; i++) { 2223 if (!adev->ip_blocks[i].status.sw) 2224 continue; 2225 if (adev->ip_blocks[i].status.hw) 2226 continue; 2227 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2228 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2229 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2230 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2231 if (r) { 2232 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2233 adev->ip_blocks[i].version->funcs->name, r); 2234 return r; 2235 } 2236 adev->ip_blocks[i].status.hw = true; 2237 } 2238 } 2239 2240 return 0; 2241 } 2242 2243 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2244 { 2245 int i, r; 2246 2247 for (i = 0; i < adev->num_ip_blocks; i++) { 2248 if (!adev->ip_blocks[i].status.sw) 2249 continue; 2250 if (adev->ip_blocks[i].status.hw) 2251 continue; 2252 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2253 if (r) { 2254 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2255 adev->ip_blocks[i].version->funcs->name, r); 2256 return r; 2257 } 2258 adev->ip_blocks[i].status.hw = true; 2259 } 2260 2261 return 0; 2262 } 2263 2264 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2265 { 2266 int r = 0; 2267 int i; 2268 uint32_t smu_version; 2269 2270 if (adev->asic_type >= CHIP_VEGA10) { 2271 for (i = 0; i < adev->num_ip_blocks; i++) { 2272 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2273 continue; 2274 2275 if 
(!adev->ip_blocks[i].status.sw) 2276 continue; 2277 2278 /* no need to do the fw loading again if already done*/ 2279 if (adev->ip_blocks[i].status.hw == true) 2280 break; 2281 2282 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2283 r = adev->ip_blocks[i].version->funcs->resume(adev); 2284 if (r) { 2285 DRM_ERROR("resume of IP block <%s> failed %d\n", 2286 adev->ip_blocks[i].version->funcs->name, r); 2287 return r; 2288 } 2289 } else { 2290 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2291 if (r) { 2292 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2293 adev->ip_blocks[i].version->funcs->name, r); 2294 return r; 2295 } 2296 } 2297 2298 adev->ip_blocks[i].status.hw = true; 2299 break; 2300 } 2301 } 2302 2303 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2304 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2305 2306 return r; 2307 } 2308 2309 /** 2310 * amdgpu_device_ip_init - run init for hardware IPs 2311 * 2312 * @adev: amdgpu_device pointer 2313 * 2314 * Main initialization pass for hardware IPs. The list of all the hardware 2315 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2316 * are run. sw_init initializes the software state associated with each IP 2317 * and hw_init initializes the hardware associated with each IP. 2318 * Returns 0 on success, negative error code on failure. 2319 */ 2320 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2321 { 2322 int i, r; 2323 2324 r = amdgpu_ras_init(adev); 2325 if (r) 2326 return r; 2327 2328 for (i = 0; i < adev->num_ip_blocks; i++) { 2329 if (!adev->ip_blocks[i].status.valid) 2330 continue; 2331 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2332 if (r) { 2333 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2334 adev->ip_blocks[i].version->funcs->name, r); 2335 goto init_failed; 2336 } 2337 adev->ip_blocks[i].status.sw = true; 2338 2339 /* need to do gmc hw init early so we can allocate gpu mem */ 2340 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2341 /* Try to reserve bad pages early */ 2342 if (amdgpu_sriov_vf(adev)) 2343 amdgpu_virt_exchange_data(adev); 2344 2345 r = amdgpu_device_vram_scratch_init(adev); 2346 if (r) { 2347 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2348 goto init_failed; 2349 } 2350 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2351 if (r) { 2352 DRM_ERROR("hw_init %d failed %d\n", i, r); 2353 goto init_failed; 2354 } 2355 r = amdgpu_device_wb_init(adev); 2356 if (r) { 2357 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2358 goto init_failed; 2359 } 2360 adev->ip_blocks[i].status.hw = true; 2361 2362 /* right after GMC hw init, we create CSA */ 2363 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2364 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2365 AMDGPU_GEM_DOMAIN_VRAM, 2366 AMDGPU_CSA_SIZE); 2367 if (r) { 2368 DRM_ERROR("allocate CSA failed %d\n", r); 2369 goto init_failed; 2370 } 2371 } 2372 } 2373 } 2374 2375 if (amdgpu_sriov_vf(adev)) 2376 amdgpu_virt_init_data_exchange(adev); 2377 2378 r = amdgpu_ib_pool_init(adev); 2379 if (r) { 2380 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2381 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2382 goto init_failed; 2383 } 2384 2385 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2386 if (r) 2387 goto init_failed; 2388 2389 r = amdgpu_device_ip_hw_init_phase1(adev); 2390 if (r) 2391 goto init_failed; 2392 2393 r = amdgpu_device_fw_loading(adev); 2394 if (r) 2395 goto 
init_failed; 2396
2397 r = amdgpu_device_ip_hw_init_phase2(adev);
2398 if (r)
2399 goto init_failed;
2400
2401 /*
2402 * retired pages will be loaded from eeprom and reserved here,
2403 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2404 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2405 * for I2C communication, which is only true at this point.
2406 *
2407 * amdgpu_ras_recovery_init may fail, but the upper layers only care
2408 * about failures caused by a bad gpu state and stop the amdgpu init
2409 * process accordingly. For other failures, it still releases all
2410 * its resources and prints an error message rather than returning a
2411 * negative value to the upper level.
2412 *
2413 * Note: theoretically, this should be called before all vram allocations
2414 * to protect retired pages from being abused
2415 */
2416 r = amdgpu_ras_recovery_init(adev);
2417 if (r)
2418 goto init_failed;
2419
2420 if (adev->gmc.xgmi.num_physical_nodes > 1)
2421 amdgpu_xgmi_add_device(adev);
2422
2423 /* Don't init kfd if the whole hive needs to be reset during init */
2424 if (!adev->gmc.xgmi.pending_reset)
2425 amdgpu_amdkfd_device_init(adev);
2426
2427 amdgpu_fru_get_product_info(adev);
2428
2429 init_failed:
2430 if (amdgpu_sriov_vf(adev))
2431 amdgpu_virt_release_full_gpu(adev, true);
2432
2433 return r;
2434 }
2435
2436 /**
2437 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2438 *
2439 * @adev: amdgpu_device pointer
2440 *
2441 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2442 * this function before a GPU reset. If the value is retained after a
2443 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2444 */
2445 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2446 {
2447 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2448 }
2449
2450 /**
2451 * amdgpu_device_check_vram_lost - check if vram is valid
2452 *
2453 * @adev: amdgpu_device pointer
2454 *
2455 * Checks the reset magic value written to the gart pointer in VRAM.
2456 * The driver calls this after a GPU reset to see if the contents of
2457 * VRAM are lost or not.
2458 * Returns true if vram is lost, false if not.
2459 */
2460 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2461 {
2462 if (memcmp(adev->gart.ptr, adev->reset_magic,
2463 AMDGPU_RESET_MAGIC_NUM))
2464 return true;
2465
2466 if (!amdgpu_in_reset(adev))
2467 return false;
2468
2469 /*
2470 * For all ASICs with baco/mode1 reset, the VRAM is
2471 * always assumed to be lost.
2472 */
2473 switch (amdgpu_asic_reset_method(adev)) {
2474 case AMD_RESET_METHOD_BACO:
2475 case AMD_RESET_METHOD_MODE1:
2476 return true;
2477 default:
2478 return false;
2479 }
2480 }
2481
2482 /**
2483 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2484 *
2485 * @adev: amdgpu_device pointer
2486 * @state: clockgating state (gate or ungate)
2487 *
2488 * The list of all the hardware IPs that make up the asic is walked and the
2489 * set_clockgating_state callbacks are run. The late initialization pass
2490 * enables clockgating for hardware IPs, while the fini and suspend passes
2491 * disable it.
2492 * Returns 0 on success, negative error code on failure.
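 *
 * A typical call, as made from the late init and suspend/fini paths
 * below, looks like:
 *
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);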
2493 */ 2494 2495 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2496 enum amd_clockgating_state state) 2497 { 2498 int i, j, r; 2499 2500 if (amdgpu_emu_mode == 1) 2501 return 0; 2502 2503 for (j = 0; j < adev->num_ip_blocks; j++) { 2504 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2505 if (!adev->ip_blocks[i].status.late_initialized) 2506 continue; 2507 /* skip CG for GFX on S0ix */ 2508 if (adev->in_s0ix && 2509 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2510 continue; 2511 /* skip CG for VCE/UVD, it's handled specially */ 2512 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2513 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2514 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2515 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2516 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2517 /* enable clockgating to save power */ 2518 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2519 state); 2520 if (r) { 2521 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2522 adev->ip_blocks[i].version->funcs->name, r); 2523 return r; 2524 } 2525 } 2526 } 2527 2528 return 0; 2529 } 2530 2531 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2532 enum amd_powergating_state state) 2533 { 2534 int i, j, r; 2535 2536 if (amdgpu_emu_mode == 1) 2537 return 0; 2538 2539 for (j = 0; j < adev->num_ip_blocks; j++) { 2540 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2541 if (!adev->ip_blocks[i].status.late_initialized) 2542 continue; 2543 /* skip PG for GFX on S0ix */ 2544 if (adev->in_s0ix && 2545 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2546 continue; 2547 /* skip CG for VCE/UVD, it's handled specially */ 2548 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2549 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2550 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2551 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2552 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2553 /* enable powergating to save power */ 2554 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2555 state); 2556 if (r) { 2557 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2558 adev->ip_blocks[i].version->funcs->name, r); 2559 return r; 2560 } 2561 } 2562 } 2563 return 0; 2564 } 2565 2566 static int amdgpu_device_enable_mgpu_fan_boost(void) 2567 { 2568 struct amdgpu_gpu_instance *gpu_ins; 2569 struct amdgpu_device *adev; 2570 int i, ret = 0; 2571 2572 mutex_lock(&mgpu_info.mutex); 2573 2574 /* 2575 * MGPU fan boost feature should be enabled 2576 * only when there are two or more dGPUs in 2577 * the system 2578 */ 2579 if (mgpu_info.num_dgpu < 2) 2580 goto out; 2581 2582 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2583 gpu_ins = &(mgpu_info.gpu_ins[i]); 2584 adev = gpu_ins->adev; 2585 if (!(adev->flags & AMD_IS_APU) && 2586 !gpu_ins->mgpu_fan_enabled) { 2587 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2588 if (ret) 2589 break; 2590 2591 gpu_ins->mgpu_fan_enabled = 1; 2592 } 2593 } 2594 2595 out: 2596 mutex_unlock(&mgpu_info.mutex); 2597 2598 return ret; 2599 } 2600 2601 /** 2602 * amdgpu_device_ip_late_init - run late init for hardware IPs 2603 * 2604 * @adev: amdgpu_device pointer 2605 * 2606 * Late initialization pass for hardware IPs. 
The list of all the hardware
2607 * IPs that make up the asic is walked and the late_init callbacks are run.
2608 * late_init covers any special initialization that an IP requires
2609 * after all of the IP blocks have been initialized or something that needs
2610 * to happen late in the init process.
2611 * Returns 0 on success, negative error code on failure.
2612 */
2613 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2614 {
2615 struct amdgpu_gpu_instance *gpu_instance;
2616 int i = 0, r;
2617
2618 for (i = 0; i < adev->num_ip_blocks; i++) {
2619 if (!adev->ip_blocks[i].status.hw)
2620 continue;
2621 if (adev->ip_blocks[i].version->funcs->late_init) {
2622 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2623 if (r) {
2624 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2625 adev->ip_blocks[i].version->funcs->name, r);
2626 return r;
2627 }
2628 }
2629 adev->ip_blocks[i].status.late_initialized = true;
2630 }
2631
2632 amdgpu_ras_set_error_query_ready(adev, true);
2633
2634 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2635 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2636
2637 amdgpu_device_fill_reset_magic(adev);
2638
2639 r = amdgpu_device_enable_mgpu_fan_boost();
2640 if (r)
2641 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2642
2643 /* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */
2644 if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2645 adev->asic_type == CHIP_ALDEBARAN))
2646 amdgpu_dpm_handle_passthrough_sbr(adev, true);
2647
2648 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2649 mutex_lock(&mgpu_info.mutex);
2650
2651 /*
2652 * Reset device p-state to low as this was booted with high.
2653 *
2654 * This should be performed only after all devices from the same
2655 * hive get initialized.
2656 *
2657 * However, the number of devices in the hive is not known in
2658 * advance; it is counted one by one as the devices initialize.
2659 *
2660 * So we wait until all XGMI interlinked devices have initialized.
2661 * This may bring some delays as those devices may come from
2662 * different hives. But that should be OK.
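 * (The check below compares mgpu_info.num_dgpu against
 * adev->gmc.xgmi.num_physical_nodes before any p-state change is
 * attempted.)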
2663 */ 2664 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2665 for (i = 0; i < mgpu_info.num_gpu; i++) { 2666 gpu_instance = &(mgpu_info.gpu_ins[i]); 2667 if (gpu_instance->adev->flags & AMD_IS_APU) 2668 continue; 2669 2670 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2671 AMDGPU_XGMI_PSTATE_MIN); 2672 if (r) { 2673 DRM_ERROR("pstate setting failed (%d).\n", r); 2674 break; 2675 } 2676 } 2677 } 2678 2679 mutex_unlock(&mgpu_info.mutex); 2680 } 2681 2682 return 0; 2683 } 2684 2685 /** 2686 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2687 * 2688 * @adev: amdgpu_device pointer 2689 * 2690 * For ASICs need to disable SMC first 2691 */ 2692 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2693 { 2694 int i, r; 2695 2696 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2697 return; 2698 2699 for (i = 0; i < adev->num_ip_blocks; i++) { 2700 if (!adev->ip_blocks[i].status.hw) 2701 continue; 2702 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2703 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2704 /* XXX handle errors */ 2705 if (r) { 2706 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2707 adev->ip_blocks[i].version->funcs->name, r); 2708 } 2709 adev->ip_blocks[i].status.hw = false; 2710 break; 2711 } 2712 } 2713 } 2714 2715 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2716 { 2717 int i, r; 2718 2719 for (i = 0; i < adev->num_ip_blocks; i++) { 2720 if (!adev->ip_blocks[i].version->funcs->early_fini) 2721 continue; 2722 2723 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2724 if (r) { 2725 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2726 adev->ip_blocks[i].version->funcs->name, r); 2727 } 2728 } 2729 2730 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2731 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2732 2733 amdgpu_amdkfd_suspend(adev, false); 2734 2735 /* Workaroud for ASICs need to disable SMC first */ 2736 amdgpu_device_smu_fini_early(adev); 2737 2738 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2739 if (!adev->ip_blocks[i].status.hw) 2740 continue; 2741 2742 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2743 /* XXX handle errors */ 2744 if (r) { 2745 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2746 adev->ip_blocks[i].version->funcs->name, r); 2747 } 2748 2749 adev->ip_blocks[i].status.hw = false; 2750 } 2751 2752 if (amdgpu_sriov_vf(adev)) { 2753 if (amdgpu_virt_release_full_gpu(adev, false)) 2754 DRM_ERROR("failed to release exclusive mode on fini\n"); 2755 } 2756 2757 return 0; 2758 } 2759 2760 /** 2761 * amdgpu_device_ip_fini - run fini for hardware IPs 2762 * 2763 * @adev: amdgpu_device pointer 2764 * 2765 * Main teardown pass for hardware IPs. The list of all the hardware 2766 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2767 * are run. hw_fini tears down the hardware associated with each IP 2768 * and sw_fini tears down any software state associated with each IP. 2769 * Returns 0 on success, negative error code on failure. 
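 *
 * The sw_fini callbacks run in reverse IP order; when the GMC block is
 * reached, the ucode BO, static CSA, writeback slots, VRAM scratch and
 * IB pool allocated during init are released as well.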
2770 */ 2771 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2772 { 2773 int i, r; 2774 2775 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2776 amdgpu_virt_release_ras_err_handler_data(adev); 2777 2778 if (adev->gmc.xgmi.num_physical_nodes > 1) 2779 amdgpu_xgmi_remove_device(adev); 2780 2781 amdgpu_amdkfd_device_fini_sw(adev); 2782 2783 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2784 if (!adev->ip_blocks[i].status.sw) 2785 continue; 2786 2787 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2788 amdgpu_ucode_free_bo(adev); 2789 amdgpu_free_static_csa(&adev->virt.csa_obj); 2790 amdgpu_device_wb_fini(adev); 2791 amdgpu_device_vram_scratch_fini(adev); 2792 amdgpu_ib_pool_fini(adev); 2793 } 2794 2795 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2796 /* XXX handle errors */ 2797 if (r) { 2798 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2799 adev->ip_blocks[i].version->funcs->name, r); 2800 } 2801 adev->ip_blocks[i].status.sw = false; 2802 adev->ip_blocks[i].status.valid = false; 2803 } 2804 2805 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2806 if (!adev->ip_blocks[i].status.late_initialized) 2807 continue; 2808 if (adev->ip_blocks[i].version->funcs->late_fini) 2809 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2810 adev->ip_blocks[i].status.late_initialized = false; 2811 } 2812 2813 amdgpu_ras_fini(adev); 2814 2815 return 0; 2816 } 2817 2818 /** 2819 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2820 * 2821 * @work: work_struct. 2822 */ 2823 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2824 { 2825 struct amdgpu_device *adev = 2826 container_of(work, struct amdgpu_device, delayed_init_work.work); 2827 int r; 2828 2829 r = amdgpu_ib_ring_tests(adev); 2830 if (r) 2831 DRM_ERROR("ib ring test failed (%d).\n", r); 2832 } 2833 2834 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2835 { 2836 struct amdgpu_device *adev = 2837 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2838 2839 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2840 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2841 2842 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2843 adev->gfx.gfx_off_state = true; 2844 } 2845 2846 /** 2847 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2848 * 2849 * @adev: amdgpu_device pointer 2850 * 2851 * Main suspend function for hardware IPs. The list of all the hardware 2852 * IPs that make up the asic is walked, clockgating is disabled and the 2853 * suspend callbacks are run. suspend puts the hardware and software state 2854 * in each IP into a state suitable for suspend. 2855 * Returns 0 on success, negative error code on failure. 
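 *
 * Phase 1 only suspends the display (DCE) hardware; every other IP block
 * is left to amdgpu_device_ip_suspend_phase2().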
2856 */ 2857 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2858 { 2859 int i, r; 2860 2861 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2862 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2863 2864 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2865 if (!adev->ip_blocks[i].status.valid) 2866 continue; 2867 2868 /* displays are handled separately */ 2869 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2870 continue; 2871 2872 /* XXX handle errors */ 2873 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2874 /* XXX handle errors */ 2875 if (r) { 2876 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2877 adev->ip_blocks[i].version->funcs->name, r); 2878 return r; 2879 } 2880 2881 adev->ip_blocks[i].status.hw = false; 2882 } 2883 2884 return 0; 2885 } 2886 2887 /** 2888 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2889 * 2890 * @adev: amdgpu_device pointer 2891 * 2892 * Main suspend function for hardware IPs. The list of all the hardware 2893 * IPs that make up the asic is walked, clockgating is disabled and the 2894 * suspend callbacks are run. suspend puts the hardware and software state 2895 * in each IP into a state suitable for suspend. 2896 * Returns 0 on success, negative error code on failure. 2897 */ 2898 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2899 { 2900 int i, r; 2901 2902 if (adev->in_s0ix) 2903 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 2904 2905 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2906 if (!adev->ip_blocks[i].status.valid) 2907 continue; 2908 /* displays are handled in phase1 */ 2909 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2910 continue; 2911 /* PSP lost connection when err_event_athub occurs */ 2912 if (amdgpu_ras_intr_triggered() && 2913 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2914 adev->ip_blocks[i].status.hw = false; 2915 continue; 2916 } 2917 2918 /* skip unnecessary suspend if we do not initialize them yet */ 2919 if (adev->gmc.xgmi.pending_reset && 2920 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2921 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2922 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2923 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 2924 adev->ip_blocks[i].status.hw = false; 2925 continue; 2926 } 2927 2928 /* skip suspend of gfx and psp for S0ix 2929 * gfx is in gfxoff state, so on resume it will exit gfxoff just 2930 * like at runtime. PSP is also part of the always on hardware 2931 * so no need to suspend it. 
2932 */ 2933 if (adev->in_s0ix && 2934 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 2935 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) 2936 continue; 2937 2938 /* XXX handle errors */ 2939 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2940 /* XXX handle errors */ 2941 if (r) { 2942 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2943 adev->ip_blocks[i].version->funcs->name, r); 2944 } 2945 adev->ip_blocks[i].status.hw = false; 2946 /* handle putting the SMC in the appropriate state */ 2947 if(!amdgpu_sriov_vf(adev)){ 2948 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2949 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2950 if (r) { 2951 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2952 adev->mp1_state, r); 2953 return r; 2954 } 2955 } 2956 } 2957 } 2958 2959 return 0; 2960 } 2961 2962 /** 2963 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2964 * 2965 * @adev: amdgpu_device pointer 2966 * 2967 * Main suspend function for hardware IPs. The list of all the hardware 2968 * IPs that make up the asic is walked, clockgating is disabled and the 2969 * suspend callbacks are run. suspend puts the hardware and software state 2970 * in each IP into a state suitable for suspend. 2971 * Returns 0 on success, negative error code on failure. 2972 */ 2973 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2974 { 2975 int r; 2976 2977 if (amdgpu_sriov_vf(adev)) { 2978 amdgpu_virt_fini_data_exchange(adev); 2979 amdgpu_virt_request_full_gpu(adev, false); 2980 } 2981 2982 r = amdgpu_device_ip_suspend_phase1(adev); 2983 if (r) 2984 return r; 2985 r = amdgpu_device_ip_suspend_phase2(adev); 2986 2987 if (amdgpu_sriov_vf(adev)) 2988 amdgpu_virt_release_full_gpu(adev, false); 2989 2990 return r; 2991 } 2992 2993 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2994 { 2995 int i, r; 2996 2997 static enum amd_ip_block_type ip_order[] = { 2998 AMD_IP_BLOCK_TYPE_GMC, 2999 AMD_IP_BLOCK_TYPE_COMMON, 3000 AMD_IP_BLOCK_TYPE_PSP, 3001 AMD_IP_BLOCK_TYPE_IH, 3002 }; 3003 3004 for (i = 0; i < adev->num_ip_blocks; i++) { 3005 int j; 3006 struct amdgpu_ip_block *block; 3007 3008 block = &adev->ip_blocks[i]; 3009 block->status.hw = false; 3010 3011 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3012 3013 if (block->version->type != ip_order[j] || 3014 !block->status.valid) 3015 continue; 3016 3017 r = block->version->funcs->hw_init(adev); 3018 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3019 if (r) 3020 return r; 3021 block->status.hw = true; 3022 } 3023 } 3024 3025 return 0; 3026 } 3027 3028 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3029 { 3030 int i, r; 3031 3032 static enum amd_ip_block_type ip_order[] = { 3033 AMD_IP_BLOCK_TYPE_SMC, 3034 AMD_IP_BLOCK_TYPE_DCE, 3035 AMD_IP_BLOCK_TYPE_GFX, 3036 AMD_IP_BLOCK_TYPE_SDMA, 3037 AMD_IP_BLOCK_TYPE_UVD, 3038 AMD_IP_BLOCK_TYPE_VCE, 3039 AMD_IP_BLOCK_TYPE_VCN 3040 }; 3041 3042 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3043 int j; 3044 struct amdgpu_ip_block *block; 3045 3046 for (j = 0; j < adev->num_ip_blocks; j++) { 3047 block = &adev->ip_blocks[j]; 3048 3049 if (block->version->type != ip_order[i] || 3050 !block->status.valid || 3051 block->status.hw) 3052 continue; 3053 3054 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3055 r = block->version->funcs->resume(adev); 3056 else 3057 r = block->version->funcs->hw_init(adev); 3058 3059 DRM_INFO("RE-INIT-late: %s %s\n", 
block->version->funcs->name, r?"failed":"succeeded");
3060 if (r)
3061 return r;
3062 block->status.hw = true;
3063 }
3064 }
3065
3066 return 0;
3067 }
3068
3069 /**
3070 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3071 *
3072 * @adev: amdgpu_device pointer
3073 *
3074 * First resume function for hardware IPs. The list of all the hardware
3075 * IPs that make up the asic is walked and the resume callbacks are run for
3076 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3077 * after a suspend and updates the software state as necessary. This
3078 * function is also used for restoring the GPU after a GPU reset.
3079 * Returns 0 on success, negative error code on failure.
3080 */
3081 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3082 {
3083 int i, r;
3084
3085 for (i = 0; i < adev->num_ip_blocks; i++) {
3086 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3087 continue;
3088 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3089 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3090 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
3091
3092 r = adev->ip_blocks[i].version->funcs->resume(adev);
3093 if (r) {
3094 DRM_ERROR("resume of IP block <%s> failed %d\n",
3095 adev->ip_blocks[i].version->funcs->name, r);
3096 return r;
3097 }
3098 adev->ip_blocks[i].status.hw = true;
3099 }
3100 }
3101
3102 return 0;
3103 }
3104
3105 /**
3106 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3107 *
3108 * @adev: amdgpu_device pointer
3109 *
3110 * Second resume function for hardware IPs. The list of all the hardware
3111 * IPs that make up the asic is walked and the resume callbacks are run for
3112 * all blocks except COMMON, GMC, IH and PSP. resume puts the hardware into a
3113 * functional state after a suspend and updates the software state as
3114 * necessary. This function is also used for restoring the GPU after a GPU
3115 * reset.
3116 * Returns 0 on success, negative error code on failure.
3117 */
3118 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3119 {
3120 int i, r;
3121
3122 for (i = 0; i < adev->num_ip_blocks; i++) {
3123 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3124 continue;
3125 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3126 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3127 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3128 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3129 continue;
3130 r = adev->ip_blocks[i].version->funcs->resume(adev);
3131 if (r) {
3132 DRM_ERROR("resume of IP block <%s> failed %d\n",
3133 adev->ip_blocks[i].version->funcs->name, r);
3134 return r;
3135 }
3136 adev->ip_blocks[i].status.hw = true;
3137 }
3138
3139 return 0;
3140 }
3141
3142 /**
3143 * amdgpu_device_ip_resume - run resume for hardware IPs
3144 *
3145 * @adev: amdgpu_device pointer
3146 *
3147 * Main resume function for hardware IPs. The hardware IPs
3148 * are split into two resume functions because they are
3149 * also used in recovering from a GPU reset and some additional
3150 * steps need to be taken between them. In this case (S3/S4) they are
3151 * run sequentially.
3152 * Returns 0 on success, negative error code on failure.
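 *
 * The sequence below is: resume the KFD IOMMU, run phase 1 (COMMON, GMC
 * and IH), reload firmware, then run phase 2 for the remaining blocks.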
3153 */ 3154 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3155 { 3156 int r; 3157 3158 r = amdgpu_amdkfd_resume_iommu(adev); 3159 if (r) 3160 return r; 3161 3162 r = amdgpu_device_ip_resume_phase1(adev); 3163 if (r) 3164 return r; 3165 3166 r = amdgpu_device_fw_loading(adev); 3167 if (r) 3168 return r; 3169 3170 r = amdgpu_device_ip_resume_phase2(adev); 3171 3172 return r; 3173 } 3174 3175 /** 3176 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3177 * 3178 * @adev: amdgpu_device pointer 3179 * 3180 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3181 */ 3182 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3183 { 3184 if (amdgpu_sriov_vf(adev)) { 3185 if (adev->is_atom_fw) { 3186 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3187 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3188 } else { 3189 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3190 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3191 } 3192 3193 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3194 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3195 } 3196 } 3197 3198 /** 3199 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3200 * 3201 * @asic_type: AMD asic type 3202 * 3203 * Check if there is DC (new modesetting infrastructre) support for an asic. 3204 * returns true if DC has support, false if not. 3205 */ 3206 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3207 { 3208 switch (asic_type) { 3209 #ifdef CONFIG_DRM_AMDGPU_SI 3210 case CHIP_HAINAN: 3211 #endif 3212 case CHIP_TOPAZ: 3213 /* chips with no display hardware */ 3214 return false; 3215 #if defined(CONFIG_DRM_AMD_DC) 3216 case CHIP_TAHITI: 3217 case CHIP_PITCAIRN: 3218 case CHIP_VERDE: 3219 case CHIP_OLAND: 3220 /* 3221 * We have systems in the wild with these ASICs that require 3222 * LVDS and VGA support which is not supported with DC. 3223 * 3224 * Fallback to the non-DC driver here by default so as not to 3225 * cause regressions. 3226 */ 3227 #if defined(CONFIG_DRM_AMD_DC_SI) 3228 return amdgpu_dc > 0; 3229 #else 3230 return false; 3231 #endif 3232 case CHIP_BONAIRE: 3233 case CHIP_KAVERI: 3234 case CHIP_KABINI: 3235 case CHIP_MULLINS: 3236 /* 3237 * We have systems in the wild with these ASICs that require 3238 * LVDS and VGA support which is not supported with DC. 3239 * 3240 * Fallback to the non-DC driver here by default so as not to 3241 * cause regressions. 
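 *
 * DC can still be forced on for these chips with the amdgpu.dc=1 module
 * parameter, which is what the "amdgpu_dc > 0" check below honours.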
3242 */ 3243 return amdgpu_dc > 0; 3244 case CHIP_HAWAII: 3245 case CHIP_CARRIZO: 3246 case CHIP_STONEY: 3247 case CHIP_POLARIS10: 3248 case CHIP_POLARIS11: 3249 case CHIP_POLARIS12: 3250 case CHIP_VEGAM: 3251 case CHIP_TONGA: 3252 case CHIP_FIJI: 3253 case CHIP_VEGA10: 3254 case CHIP_VEGA12: 3255 case CHIP_VEGA20: 3256 #if defined(CONFIG_DRM_AMD_DC_DCN) 3257 case CHIP_RAVEN: 3258 case CHIP_NAVI10: 3259 case CHIP_NAVI14: 3260 case CHIP_NAVI12: 3261 case CHIP_RENOIR: 3262 case CHIP_CYAN_SKILLFISH: 3263 case CHIP_SIENNA_CICHLID: 3264 case CHIP_NAVY_FLOUNDER: 3265 case CHIP_DIMGREY_CAVEFISH: 3266 case CHIP_BEIGE_GOBY: 3267 case CHIP_VANGOGH: 3268 case CHIP_YELLOW_CARP: 3269 #endif 3270 default: 3271 return amdgpu_dc != 0; 3272 #else 3273 default: 3274 if (amdgpu_dc > 0) 3275 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3276 "but isn't supported by ASIC, ignoring\n"); 3277 return false; 3278 #endif 3279 } 3280 } 3281 3282 /** 3283 * amdgpu_device_has_dc_support - check if dc is supported 3284 * 3285 * @adev: amdgpu_device pointer 3286 * 3287 * Returns true for supported, false for not supported 3288 */ 3289 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3290 { 3291 if (amdgpu_sriov_vf(adev) || 3292 adev->enable_virtual_display || 3293 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3294 return false; 3295 3296 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3297 } 3298 3299 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3300 { 3301 struct amdgpu_device *adev = 3302 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3303 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3304 3305 /* It's a bug to not have a hive within this function */ 3306 if (WARN_ON(!hive)) 3307 return; 3308 3309 /* 3310 * Use task barrier to synchronize all xgmi reset works across the 3311 * hive. task_barrier_enter and task_barrier_exit will block 3312 * until all the threads running the xgmi reset works reach 3313 * those points. task_barrier_full will do both blocks. 3314 */ 3315 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3316 3317 task_barrier_enter(&hive->tb); 3318 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3319 3320 if (adev->asic_reset_res) 3321 goto fail; 3322 3323 task_barrier_exit(&hive->tb); 3324 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3325 3326 if (adev->asic_reset_res) 3327 goto fail; 3328 3329 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3330 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3331 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3332 } else { 3333 3334 task_barrier_full(&hive->tb); 3335 adev->asic_reset_res = amdgpu_asic_reset(adev); 3336 } 3337 3338 fail: 3339 if (adev->asic_reset_res) 3340 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3341 adev->asic_reset_res, adev_to_drm(adev)->unique); 3342 amdgpu_put_xgmi_hive(hive); 3343 } 3344 3345 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3346 { 3347 char *input = amdgpu_lockup_timeout; 3348 char *timeout_setting = NULL; 3349 int index = 0; 3350 long timeout; 3351 int ret = 0; 3352 3353 /* 3354 * By default timeout for non compute jobs is 10000 3355 * and 60000 for compute jobs. 3356 * In SR-IOV or passthrough mode, timeout for compute 3357 * jobs are 60000 by default. 
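 *
 * The lockup_timeout module parameter is an ordered, comma separated
 * list of up to four values in milliseconds for the gfx, compute, sdma
 * and video queues, e.g. (illustrative values only)
 * amdgpu.lockup_timeout=10000,60000,10000,10000. A value of 0 keeps the
 * default for that queue and a negative value disables the timeout.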
3358 */ 3359 adev->gfx_timeout = msecs_to_jiffies(10000); 3360 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3361 if (amdgpu_sriov_vf(adev)) 3362 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 3363 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3364 else 3365 adev->compute_timeout = msecs_to_jiffies(60000); 3366 3367 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3368 while ((timeout_setting = strsep(&input, ",")) && 3369 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3370 ret = kstrtol(timeout_setting, 0, &timeout); 3371 if (ret) 3372 return ret; 3373 3374 if (timeout == 0) { 3375 index++; 3376 continue; 3377 } else if (timeout < 0) { 3378 timeout = MAX_SCHEDULE_TIMEOUT; 3379 dev_warn(adev->dev, "lockup timeout disabled"); 3380 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3381 } else { 3382 timeout = msecs_to_jiffies(timeout); 3383 } 3384 3385 switch (index++) { 3386 case 0: 3387 adev->gfx_timeout = timeout; 3388 break; 3389 case 1: 3390 adev->compute_timeout = timeout; 3391 break; 3392 case 2: 3393 adev->sdma_timeout = timeout; 3394 break; 3395 case 3: 3396 adev->video_timeout = timeout; 3397 break; 3398 default: 3399 break; 3400 } 3401 } 3402 /* 3403 * There is only one value specified and 3404 * it should apply to all non-compute jobs. 3405 */ 3406 if (index == 1) { 3407 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3408 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3409 adev->compute_timeout = adev->gfx_timeout; 3410 } 3411 } 3412 3413 return ret; 3414 } 3415 3416 /** 3417 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3418 * 3419 * @adev: amdgpu_device pointer 3420 * 3421 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3422 */ 3423 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3424 { 3425 struct iommu_domain *domain; 3426 3427 domain = iommu_get_domain_for_dev(adev->dev); 3428 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3429 adev->ram_is_direct_mapped = true; 3430 } 3431 3432 static const struct attribute *amdgpu_dev_attributes[] = { 3433 &dev_attr_product_name.attr, 3434 &dev_attr_product_number.attr, 3435 &dev_attr_serial_number.attr, 3436 &dev_attr_pcie_replay_count.attr, 3437 NULL 3438 }; 3439 3440 /** 3441 * amdgpu_device_init - initialize the driver 3442 * 3443 * @adev: amdgpu_device pointer 3444 * @flags: driver flags 3445 * 3446 * Initializes the driver info and hw (all asics). 3447 * Returns 0 for success or an error on failure. 3448 * Called at driver startup. 
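 *
 * The rough ordering below is: state and lock setup, MMIO mapping, IP
 * early init, an optional ASIC reset/post, fence driver and mode config
 * init, full IP init, and finally sysfs registration and IP late init.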
3449 */ 3450 int amdgpu_device_init(struct amdgpu_device *adev, 3451 uint32_t flags) 3452 { 3453 struct drm_device *ddev = adev_to_drm(adev); 3454 struct pci_dev *pdev = adev->pdev; 3455 int r, i; 3456 bool px = false; 3457 u32 max_MBps; 3458 3459 adev->shutdown = false; 3460 adev->flags = flags; 3461 3462 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3463 adev->asic_type = amdgpu_force_asic_type; 3464 else 3465 adev->asic_type = flags & AMD_ASIC_MASK; 3466 3467 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3468 if (amdgpu_emu_mode == 1) 3469 adev->usec_timeout *= 10; 3470 adev->gmc.gart_size = 512 * 1024 * 1024; 3471 adev->accel_working = false; 3472 adev->num_rings = 0; 3473 adev->mman.buffer_funcs = NULL; 3474 adev->mman.buffer_funcs_ring = NULL; 3475 adev->vm_manager.vm_pte_funcs = NULL; 3476 adev->vm_manager.vm_pte_num_scheds = 0; 3477 adev->gmc.gmc_funcs = NULL; 3478 adev->harvest_ip_mask = 0x0; 3479 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3480 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3481 3482 adev->smc_rreg = &amdgpu_invalid_rreg; 3483 adev->smc_wreg = &amdgpu_invalid_wreg; 3484 adev->pcie_rreg = &amdgpu_invalid_rreg; 3485 adev->pcie_wreg = &amdgpu_invalid_wreg; 3486 adev->pciep_rreg = &amdgpu_invalid_rreg; 3487 adev->pciep_wreg = &amdgpu_invalid_wreg; 3488 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3489 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3490 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3491 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3492 adev->didt_rreg = &amdgpu_invalid_rreg; 3493 adev->didt_wreg = &amdgpu_invalid_wreg; 3494 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3495 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3496 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3497 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3498 3499 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3500 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3501 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3502 3503 /* mutex initialization are all done here so we 3504 * can recall function without having locking issues */ 3505 mutex_init(&adev->firmware.mutex); 3506 mutex_init(&adev->pm.mutex); 3507 mutex_init(&adev->gfx.gpu_clock_mutex); 3508 mutex_init(&adev->srbm_mutex); 3509 mutex_init(&adev->gfx.pipe_reserve_mutex); 3510 mutex_init(&adev->gfx.gfx_off_mutex); 3511 mutex_init(&adev->grbm_idx_mutex); 3512 mutex_init(&adev->mn_lock); 3513 mutex_init(&adev->virt.vf_errors.lock); 3514 hash_init(adev->mn_hash); 3515 atomic_set(&adev->in_gpu_reset, 0); 3516 init_rwsem(&adev->reset_sem); 3517 mutex_init(&adev->psp.mutex); 3518 mutex_init(&adev->notifier_lock); 3519 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3520 3521 amdgpu_device_init_apu_flags(adev); 3522 3523 r = amdgpu_device_check_arguments(adev); 3524 if (r) 3525 return r; 3526 3527 spin_lock_init(&adev->mmio_idx_lock); 3528 spin_lock_init(&adev->smc_idx_lock); 3529 spin_lock_init(&adev->pcie_idx_lock); 3530 spin_lock_init(&adev->uvd_ctx_idx_lock); 3531 spin_lock_init(&adev->didt_idx_lock); 3532 spin_lock_init(&adev->gc_cac_idx_lock); 3533 spin_lock_init(&adev->se_cac_idx_lock); 3534 spin_lock_init(&adev->audio_endpt_idx_lock); 3535 spin_lock_init(&adev->mm_stats.lock); 3536 3537 INIT_LIST_HEAD(&adev->shadow_list); 3538 mutex_init(&adev->shadow_list_lock); 3539 3540 INIT_LIST_HEAD(&adev->reset_list); 3541 3542 INIT_LIST_HEAD(&adev->ras_list); 3543 3544 
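/* Delayed work: IB ring tests after init and deferred GFXOFF enablement. */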
INIT_DELAYED_WORK(&adev->delayed_init_work, 3545 amdgpu_device_delayed_init_work_handler); 3546 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3547 amdgpu_device_delay_enable_gfx_off); 3548 3549 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3550 3551 adev->gfx.gfx_off_req_count = 1; 3552 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3553 3554 atomic_set(&adev->throttling_logging_enabled, 1); 3555 /* 3556 * If throttling continues, logging will be performed every minute 3557 * to avoid log flooding. "-1" is subtracted since the thermal 3558 * throttling interrupt comes every second. Thus, the total logging 3559 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3560 * for throttling interrupt) = 60 seconds. 3561 */ 3562 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3563 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3564 3565 /* Registers mapping */ 3566 /* TODO: block userspace mapping of io register */ 3567 if (adev->asic_type >= CHIP_BONAIRE) { 3568 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3569 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3570 } else { 3571 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3572 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3573 } 3574 3575 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3576 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3577 3578 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3579 if (adev->rmmio == NULL) { 3580 return -ENOMEM; 3581 } 3582 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3583 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3584 3585 amdgpu_device_get_pcie_info(adev); 3586 3587 if (amdgpu_mcbp) 3588 DRM_INFO("MCBP is enabled\n"); 3589 3590 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3591 adev->enable_mes = true; 3592 3593 /* detect hw virtualization here */ 3594 amdgpu_detect_virtualization(adev); 3595 3596 r = amdgpu_device_get_job_timeout_settings(adev); 3597 if (r) { 3598 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3599 return r; 3600 } 3601 3602 /* early init functions */ 3603 r = amdgpu_device_ip_early_init(adev); 3604 if (r) 3605 return r; 3606 3607 /* Need to get xgmi info early to decide the reset behavior*/ 3608 if (adev->gmc.xgmi.supported) { 3609 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3610 if (r) 3611 return r; 3612 } 3613 3614 /* enable PCIE atomic ops */ 3615 if (amdgpu_sriov_vf(adev)) 3616 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3617 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_enabled_flags == 3618 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3619 else 3620 adev->have_atomics_support = 3621 !pci_enable_atomic_ops_to_root(adev->pdev, 3622 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3623 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3624 if (!adev->have_atomics_support) 3625 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3626 3627 /* doorbell bar mapping and doorbell index init*/ 3628 amdgpu_device_doorbell_init(adev); 3629 3630 if (amdgpu_emu_mode == 1) { 3631 /* post the asic on emulation mode */ 3632 emu_soc_asic_init(adev); 3633 goto fence_driver_init; 3634 } 3635 3636 amdgpu_reset_init(adev); 3637 3638 /* detect if we are with an SRIOV vbios */ 3639 amdgpu_device_detect_sriov_bios(adev); 3640 3641 /* check if we need to reset the asic 3642 * E.g., driver was not cleanly unloaded previously, etc. 
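 * For devices in an XGMI hive, only the minimal set of IP blocks needed
 * by the SMU to handle the reset (GMC, COMMON, IH, SMC) is brought up
 * here and the actual reset is deferred to the delayed hive reset work.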
3643 */ 3644 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3645 if (adev->gmc.xgmi.num_physical_nodes) { 3646 dev_info(adev->dev, "Pending hive reset.\n"); 3647 adev->gmc.xgmi.pending_reset = true; 3648 /* Only need to init necessary block for SMU to handle the reset */ 3649 for (i = 0; i < adev->num_ip_blocks; i++) { 3650 if (!adev->ip_blocks[i].status.valid) 3651 continue; 3652 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3653 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3654 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3655 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3656 DRM_DEBUG("IP %s disabled for hw_init.\n", 3657 adev->ip_blocks[i].version->funcs->name); 3658 adev->ip_blocks[i].status.hw = true; 3659 } 3660 } 3661 } else { 3662 r = amdgpu_asic_reset(adev); 3663 if (r) { 3664 dev_err(adev->dev, "asic reset on init failed\n"); 3665 goto failed; 3666 } 3667 } 3668 } 3669 3670 pci_enable_pcie_error_reporting(adev->pdev); 3671 3672 /* Post card if necessary */ 3673 if (amdgpu_device_need_post(adev)) { 3674 if (!adev->bios) { 3675 dev_err(adev->dev, "no vBIOS found\n"); 3676 r = -EINVAL; 3677 goto failed; 3678 } 3679 DRM_INFO("GPU posting now...\n"); 3680 r = amdgpu_device_asic_init(adev); 3681 if (r) { 3682 dev_err(adev->dev, "gpu post error!\n"); 3683 goto failed; 3684 } 3685 } 3686 3687 if (adev->is_atom_fw) { 3688 /* Initialize clocks */ 3689 r = amdgpu_atomfirmware_get_clock_info(adev); 3690 if (r) { 3691 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3692 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3693 goto failed; 3694 } 3695 } else { 3696 /* Initialize clocks */ 3697 r = amdgpu_atombios_get_clock_info(adev); 3698 if (r) { 3699 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3700 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3701 goto failed; 3702 } 3703 /* init i2c buses */ 3704 if (!amdgpu_device_has_dc_support(adev)) 3705 amdgpu_atombios_i2c_init(adev); 3706 } 3707 3708 fence_driver_init: 3709 /* Fence driver */ 3710 r = amdgpu_fence_driver_sw_init(adev); 3711 if (r) { 3712 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3713 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3714 goto failed; 3715 } 3716 3717 /* init the mode config */ 3718 drm_mode_config_init(adev_to_drm(adev)); 3719 3720 r = amdgpu_device_ip_init(adev); 3721 if (r) { 3722 /* failed in exclusive mode due to timeout */ 3723 if (amdgpu_sriov_vf(adev) && 3724 !amdgpu_sriov_runtime(adev) && 3725 amdgpu_virt_mmio_blocked(adev) && 3726 !amdgpu_virt_wait_reset(adev)) { 3727 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3728 /* Don't send request since VF is inactive. */ 3729 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3730 adev->virt.ops = NULL; 3731 r = -EAGAIN; 3732 goto release_ras_con; 3733 } 3734 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3735 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3736 goto release_ras_con; 3737 } 3738 3739 amdgpu_fence_driver_hw_init(adev); 3740 3741 dev_info(adev->dev, 3742 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3743 adev->gfx.config.max_shader_engines, 3744 adev->gfx.config.max_sh_per_se, 3745 adev->gfx.config.max_cu_per_sh, 3746 adev->gfx.cu_info.number); 3747 3748 adev->accel_working = true; 3749 3750 amdgpu_vm_check_compute_bug(adev); 3751 3752 /* Initialize the buffer migration limit. 
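 * amdgpu_moverate is given in MB/s; the value is stored as a log2
 * (log2_max_MBps) so that later throttling calculations can divide by
 * it cheaply.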
*/ 3753 if (amdgpu_moverate >= 0) 3754 max_MBps = amdgpu_moverate; 3755 else 3756 max_MBps = 8; /* Allow 8 MB/s. */ 3757 /* Get a log2 for easy divisions. */ 3758 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3759 3760 r = amdgpu_pm_sysfs_init(adev); 3761 if (r) { 3762 adev->pm_sysfs_en = false; 3763 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3764 } else 3765 adev->pm_sysfs_en = true; 3766 3767 r = amdgpu_ucode_sysfs_init(adev); 3768 if (r) { 3769 adev->ucode_sysfs_en = false; 3770 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3771 } else 3772 adev->ucode_sysfs_en = true; 3773 3774 if ((amdgpu_testing & 1)) { 3775 if (adev->accel_working) 3776 amdgpu_test_moves(adev); 3777 else 3778 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3779 } 3780 if (amdgpu_benchmarking) { 3781 if (adev->accel_working) 3782 amdgpu_benchmark(adev, amdgpu_benchmarking); 3783 else 3784 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3785 } 3786 3787 /* 3788 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3789 * Otherwise the mgpu fan boost feature will be skipped due to the 3790 * gpu instance is counted less. 3791 */ 3792 amdgpu_register_gpu_instance(adev); 3793 3794 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3795 * explicit gating rather than handling it automatically. 3796 */ 3797 if (!adev->gmc.xgmi.pending_reset) { 3798 r = amdgpu_device_ip_late_init(adev); 3799 if (r) { 3800 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3801 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3802 goto release_ras_con; 3803 } 3804 /* must succeed. */ 3805 amdgpu_ras_resume(adev); 3806 queue_delayed_work(system_wq, &adev->delayed_init_work, 3807 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3808 } 3809 3810 if (amdgpu_sriov_vf(adev)) 3811 flush_delayed_work(&adev->delayed_init_work); 3812 3813 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3814 if (r) 3815 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3816 3817 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3818 r = amdgpu_pmu_init(adev); 3819 if (r) 3820 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3821 3822 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3823 if (amdgpu_device_cache_pci_state(adev->pdev)) 3824 pci_restore_state(pdev); 3825 3826 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3827 /* this will fail for cards that aren't VGA class devices, just 3828 * ignore it */ 3829 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3830 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3831 3832 if (amdgpu_device_supports_px(ddev)) { 3833 px = true; 3834 vga_switcheroo_register_client(adev->pdev, 3835 &amdgpu_switcheroo_ops, px); 3836 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3837 } 3838 3839 if (adev->gmc.xgmi.pending_reset) 3840 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3841 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3842 3843 amdgpu_device_check_iommu_direct_map(adev); 3844 3845 return 0; 3846 3847 release_ras_con: 3848 amdgpu_release_ras_context(adev); 3849 3850 failed: 3851 amdgpu_vf_error_trans_all(adev); 3852 3853 return r; 3854 } 3855 3856 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3857 { 3858 3859 /* Clear all CPU mappings pointing to this device */ 3860 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3861 3862 /* Unmap all mapped bars - Doorbell, registers 
and VRAM */ 3863 amdgpu_device_doorbell_fini(adev); 3864 3865 iounmap(adev->rmmio); 3866 adev->rmmio = NULL; 3867 if (adev->mman.aper_base_kaddr) 3868 iounmap(adev->mman.aper_base_kaddr); 3869 adev->mman.aper_base_kaddr = NULL; 3870 3871 /* Memory manager related */ 3872 if (!adev->gmc.xgmi.connected_to_cpu) { 3873 arch_phys_wc_del(adev->gmc.vram_mtrr); 3874 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3875 } 3876 } 3877 3878 /** 3879 * amdgpu_device_fini_hw - tear down the driver 3880 * 3881 * @adev: amdgpu_device pointer 3882 * 3883 * Tear down the driver info (all asics). 3884 * Called at driver shutdown. 3885 */ 3886 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3887 { 3888 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3889 flush_delayed_work(&adev->delayed_init_work); 3890 if (adev->mman.initialized) { 3891 flush_delayed_work(&adev->mman.bdev.wq); 3892 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 3893 } 3894 adev->shutdown = true; 3895 3896 /* make sure IB test finished before entering exclusive mode 3897 * to avoid preemption on IB test 3898 * */ 3899 if (amdgpu_sriov_vf(adev)) { 3900 amdgpu_virt_request_full_gpu(adev, false); 3901 amdgpu_virt_fini_data_exchange(adev); 3902 } 3903 3904 /* disable all interrupts */ 3905 amdgpu_irq_disable_all(adev); 3906 if (adev->mode_info.mode_config_initialized){ 3907 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 3908 drm_helper_force_disable_all(adev_to_drm(adev)); 3909 else 3910 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3911 } 3912 amdgpu_fence_driver_hw_fini(adev); 3913 3914 if (adev->pm_sysfs_en) 3915 amdgpu_pm_sysfs_fini(adev); 3916 if (adev->ucode_sysfs_en) 3917 amdgpu_ucode_sysfs_fini(adev); 3918 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3919 3920 /* disable ras feature must before hw fini */ 3921 amdgpu_ras_pre_fini(adev); 3922 3923 amdgpu_device_ip_fini_early(adev); 3924 3925 amdgpu_irq_fini_hw(adev); 3926 3927 if (adev->mman.initialized) 3928 ttm_device_clear_dma_mappings(&adev->mman.bdev); 3929 3930 amdgpu_gart_dummy_page_fini(adev); 3931 3932 if (drm_dev_is_unplugged(adev_to_drm(adev))) 3933 amdgpu_device_unmap_mmio(adev); 3934 3935 } 3936 3937 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 3938 { 3939 int idx; 3940 3941 amdgpu_fence_driver_sw_fini(adev); 3942 amdgpu_device_ip_fini(adev); 3943 release_firmware(adev->firmware.gpu_info_fw); 3944 adev->firmware.gpu_info_fw = NULL; 3945 adev->accel_working = false; 3946 3947 amdgpu_reset_fini(adev); 3948 3949 /* free i2c buses */ 3950 if (!amdgpu_device_has_dc_support(adev)) 3951 amdgpu_i2c_fini(adev); 3952 3953 if (amdgpu_emu_mode != 1) 3954 amdgpu_atombios_fini(adev); 3955 3956 kfree(adev->bios); 3957 adev->bios = NULL; 3958 if (amdgpu_device_supports_px(adev_to_drm(adev))) { 3959 vga_switcheroo_unregister_client(adev->pdev); 3960 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3961 } 3962 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3963 vga_client_unregister(adev->pdev); 3964 3965 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 3966 3967 iounmap(adev->rmmio); 3968 adev->rmmio = NULL; 3969 amdgpu_device_doorbell_fini(adev); 3970 drm_dev_exit(idx); 3971 } 3972 3973 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3974 amdgpu_pmu_fini(adev); 3975 if (adev->mman.discovery_bin) 3976 amdgpu_discovery_fini(adev); 3977 3978 kfree(adev->pci_state); 3979 3980 } 3981 3982 /** 3983 * amdgpu_device_evict_resources - evict device resources 3984 * @adev: amdgpu device object 3985 * 3986 * Evicts all ttm device resources(vram BOs, 
gart table) from the lru list 3987 * of the vram memory type. Mainly used for evicting device resources 3988 * at suspend time. 3989 * 3990 */ 3991 static void amdgpu_device_evict_resources(struct amdgpu_device *adev) 3992 { 3993 /* No need to evict vram on APUs for suspend to ram or s2idle */ 3994 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 3995 return; 3996 3997 if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM)) 3998 DRM_WARN("evicting device resources failed\n"); 3999 4000 } 4001 4002 /* 4003 * Suspend & resume. 4004 */ 4005 /** 4006 * amdgpu_device_suspend - initiate device suspend 4007 * 4008 * @dev: drm dev pointer 4009 * @fbcon : notify the fbdev of suspend 4010 * 4011 * Puts the hw in the suspend state (all asics). 4012 * Returns 0 for success or an error on failure. 4013 * Called at driver suspend. 4014 */ 4015 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4016 { 4017 struct amdgpu_device *adev = drm_to_adev(dev); 4018 4019 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4020 return 0; 4021 4022 adev->in_suspend = true; 4023 4024 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4025 DRM_WARN("smart shift update failed\n"); 4026 4027 drm_kms_helper_poll_disable(dev); 4028 4029 if (fbcon) 4030 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4031 4032 cancel_delayed_work_sync(&adev->delayed_init_work); 4033 4034 amdgpu_ras_suspend(adev); 4035 4036 amdgpu_device_ip_suspend_phase1(adev); 4037 4038 if (!adev->in_s0ix) 4039 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4040 4041 amdgpu_device_evict_resources(adev); 4042 4043 amdgpu_fence_driver_hw_fini(adev); 4044 4045 amdgpu_device_ip_suspend_phase2(adev); 4046 4047 return 0; 4048 } 4049 4050 /** 4051 * amdgpu_device_resume - initiate device resume 4052 * 4053 * @dev: drm dev pointer 4054 * @fbcon : notify the fbdev of resume 4055 * 4056 * Bring the hw back to operating state (all asics). 4057 * Returns 0 for success or an error on failure. 4058 * Called at driver resume. 4059 */ 4060 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4061 { 4062 struct amdgpu_device *adev = drm_to_adev(dev); 4063 int r = 0; 4064 4065 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4066 return 0; 4067 4068 if (adev->in_s0ix) 4069 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4070 4071 /* post card */ 4072 if (amdgpu_device_need_post(adev)) { 4073 r = amdgpu_device_asic_init(adev); 4074 if (r) 4075 dev_err(adev->dev, "amdgpu asic init failed\n"); 4076 } 4077 4078 r = amdgpu_device_ip_resume(adev); 4079 if (r) { 4080 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4081 return r; 4082 } 4083 amdgpu_fence_driver_hw_init(adev); 4084 4085 r = amdgpu_device_ip_late_init(adev); 4086 if (r) 4087 return r; 4088 4089 queue_delayed_work(system_wq, &adev->delayed_init_work, 4090 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4091 4092 if (!adev->in_s0ix) { 4093 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4094 if (r) 4095 return r; 4096 } 4097 4098 /* Make sure IB tests flushed */ 4099 flush_delayed_work(&adev->delayed_init_work); 4100 4101 if (fbcon) 4102 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4103 4104 drm_kms_helper_poll_enable(dev); 4105 4106 amdgpu_ras_resume(adev); 4107 4108 /* 4109 * Most of the connector probing functions try to acquire runtime pm 4110 * refs to ensure that the GPU is powered on when connector polling is 4111 * performed. 
Since we're calling this from a runtime PM callback, 4112 * trying to acquire rpm refs will cause us to deadlock. 4113 * 4114 * Since we're guaranteed to be holding the rpm lock, it's safe to 4115 * temporarily disable the rpm helpers so this doesn't deadlock us. 4116 */ 4117 #ifdef CONFIG_PM 4118 dev->dev->power.disable_depth++; 4119 #endif 4120 if (!amdgpu_device_has_dc_support(adev)) 4121 drm_helper_hpd_irq_event(dev); 4122 else 4123 drm_kms_helper_hotplug_event(dev); 4124 #ifdef CONFIG_PM 4125 dev->dev->power.disable_depth--; 4126 #endif 4127 adev->in_suspend = false; 4128 4129 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4130 DRM_WARN("smart shift update failed\n"); 4131 4132 return 0; 4133 } 4134 4135 /** 4136 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4137 * 4138 * @adev: amdgpu_device pointer 4139 * 4140 * The list of all the hardware IPs that make up the asic is walked and 4141 * the check_soft_reset callbacks are run. check_soft_reset determines 4142 * if the asic is still hung or not. 4143 * Returns true if any of the IPs are still in a hung state, false if not. 4144 */ 4145 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4146 { 4147 int i; 4148 bool asic_hang = false; 4149 4150 if (amdgpu_sriov_vf(adev)) 4151 return true; 4152 4153 if (amdgpu_asic_need_full_reset(adev)) 4154 return true; 4155 4156 for (i = 0; i < adev->num_ip_blocks; i++) { 4157 if (!adev->ip_blocks[i].status.valid) 4158 continue; 4159 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4160 adev->ip_blocks[i].status.hang = 4161 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4162 if (adev->ip_blocks[i].status.hang) { 4163 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4164 asic_hang = true; 4165 } 4166 } 4167 return asic_hang; 4168 } 4169 4170 /** 4171 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4172 * 4173 * @adev: amdgpu_device pointer 4174 * 4175 * The list of all the hardware IPs that make up the asic is walked and the 4176 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4177 * handles any IP specific hardware or software state changes that are 4178 * necessary for a soft reset to succeed. 4179 * Returns 0 on success, negative error code on failure. 4180 */ 4181 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4182 { 4183 int i, r = 0; 4184 4185 for (i = 0; i < adev->num_ip_blocks; i++) { 4186 if (!adev->ip_blocks[i].status.valid) 4187 continue; 4188 if (adev->ip_blocks[i].status.hang && 4189 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4190 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4191 if (r) 4192 return r; 4193 } 4194 } 4195 4196 return 0; 4197 } 4198 4199 /** 4200 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4201 * 4202 * @adev: amdgpu_device pointer 4203 * 4204 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4205 * reset is necessary to recover. 4206 * Returns true if a full asic reset is required, false if not. 
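 * In the check below, a hang in the GMC, SMC, ACP, DCE or PSP block is
 * treated as requiring a full ASIC reset.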
4207 */ 4208 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4209 { 4210 int i; 4211 4212 if (amdgpu_asic_need_full_reset(adev)) 4213 return true; 4214 4215 for (i = 0; i < adev->num_ip_blocks; i++) { 4216 if (!adev->ip_blocks[i].status.valid) 4217 continue; 4218 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4219 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4220 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4221 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4222 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4223 if (adev->ip_blocks[i].status.hang) { 4224 dev_info(adev->dev, "Some block need full reset!\n"); 4225 return true; 4226 } 4227 } 4228 } 4229 return false; 4230 } 4231 4232 /** 4233 * amdgpu_device_ip_soft_reset - do a soft reset 4234 * 4235 * @adev: amdgpu_device pointer 4236 * 4237 * The list of all the hardware IPs that make up the asic is walked and the 4238 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4239 * IP specific hardware or software state changes that are necessary to soft 4240 * reset the IP. 4241 * Returns 0 on success, negative error code on failure. 4242 */ 4243 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4244 { 4245 int i, r = 0; 4246 4247 for (i = 0; i < adev->num_ip_blocks; i++) { 4248 if (!adev->ip_blocks[i].status.valid) 4249 continue; 4250 if (adev->ip_blocks[i].status.hang && 4251 adev->ip_blocks[i].version->funcs->soft_reset) { 4252 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4253 if (r) 4254 return r; 4255 } 4256 } 4257 4258 return 0; 4259 } 4260 4261 /** 4262 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4263 * 4264 * @adev: amdgpu_device pointer 4265 * 4266 * The list of all the hardware IPs that make up the asic is walked and the 4267 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4268 * handles any IP specific hardware or software state changes that are 4269 * necessary after the IP has been soft reset. 4270 * Returns 0 on success, negative error code on failure. 4271 */ 4272 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4273 { 4274 int i, r = 0; 4275 4276 for (i = 0; i < adev->num_ip_blocks; i++) { 4277 if (!adev->ip_blocks[i].status.valid) 4278 continue; 4279 if (adev->ip_blocks[i].status.hang && 4280 adev->ip_blocks[i].version->funcs->post_soft_reset) 4281 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4282 if (r) 4283 return r; 4284 } 4285 4286 return 0; 4287 } 4288 4289 /** 4290 * amdgpu_device_recover_vram - Recover some VRAM contents 4291 * 4292 * @adev: amdgpu_device pointer 4293 * 4294 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4295 * restore things like GPUVM page tables after a GPU reset where 4296 * the contents of VRAM might be lost. 4297 * 4298 * Returns: 4299 * 0 on success, negative error code on failure. 
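 * Only shadow BOs that currently reside in GTT and whose parent BO sits
 * in VRAM are restored; evicted shadows are skipped.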
4300 */ 4301 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4302 { 4303 struct dma_fence *fence = NULL, *next = NULL; 4304 struct amdgpu_bo *shadow; 4305 struct amdgpu_bo_vm *vmbo; 4306 long r = 1, tmo; 4307 4308 if (amdgpu_sriov_runtime(adev)) 4309 tmo = msecs_to_jiffies(8000); 4310 else 4311 tmo = msecs_to_jiffies(100); 4312 4313 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4314 mutex_lock(&adev->shadow_list_lock); 4315 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4316 shadow = &vmbo->bo; 4317 /* No need to recover an evicted BO */ 4318 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4319 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4320 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4321 continue; 4322 4323 r = amdgpu_bo_restore_shadow(shadow, &next); 4324 if (r) 4325 break; 4326 4327 if (fence) { 4328 tmo = dma_fence_wait_timeout(fence, false, tmo); 4329 dma_fence_put(fence); 4330 fence = next; 4331 if (tmo == 0) { 4332 r = -ETIMEDOUT; 4333 break; 4334 } else if (tmo < 0) { 4335 r = tmo; 4336 break; 4337 } 4338 } else { 4339 fence = next; 4340 } 4341 } 4342 mutex_unlock(&adev->shadow_list_lock); 4343 4344 if (fence) 4345 tmo = dma_fence_wait_timeout(fence, false, tmo); 4346 dma_fence_put(fence); 4347 4348 if (r < 0 || tmo <= 0) { 4349 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4350 return -EIO; 4351 } 4352 4353 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4354 return 0; 4355 } 4356 4357 4358 /** 4359 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4360 * 4361 * @adev: amdgpu_device pointer 4362 * @from_hypervisor: request from hypervisor 4363 * 4364 * do VF FLR and reinitialize Asic 4365 * return 0 means succeeded otherwise failed 4366 */ 4367 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4368 bool from_hypervisor) 4369 { 4370 int r; 4371 struct amdgpu_hive_info *hive = NULL; 4372 int retry_limit = 0; 4373 4374 retry: 4375 amdgpu_amdkfd_pre_reset(adev); 4376 4377 amdgpu_amdkfd_pre_reset(adev); 4378 4379 if (from_hypervisor) 4380 r = amdgpu_virt_request_full_gpu(adev, true); 4381 else 4382 r = amdgpu_virt_reset_gpu(adev); 4383 if (r) 4384 return r; 4385 4386 /* Resume IP prior to SMC */ 4387 r = amdgpu_device_ip_reinit_early_sriov(adev); 4388 if (r) 4389 goto error; 4390 4391 amdgpu_virt_init_data_exchange(adev); 4392 4393 r = amdgpu_device_fw_loading(adev); 4394 if (r) 4395 return r; 4396 4397 /* now we are okay to resume SMC/CP/SDMA */ 4398 r = amdgpu_device_ip_reinit_late_sriov(adev); 4399 if (r) 4400 goto error; 4401 4402 hive = amdgpu_get_xgmi_hive(adev); 4403 /* Update PSP FW topology after reset */ 4404 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4405 r = amdgpu_xgmi_update_topology(hive, adev); 4406 4407 if (hive) 4408 amdgpu_put_xgmi_hive(hive); 4409 4410 if (!r) { 4411 amdgpu_irq_gpu_reset_resume_helper(adev); 4412 r = amdgpu_ib_ring_tests(adev); 4413 amdgpu_amdkfd_post_reset(adev); 4414 } 4415 4416 error: 4417 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4418 amdgpu_inc_vram_lost(adev); 4419 r = amdgpu_device_recover_vram(adev); 4420 } 4421 amdgpu_virt_release_full_gpu(adev, true); 4422 4423 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4424 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4425 retry_limit++; 4426 goto retry; 4427 } else 4428 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4429 } 4430 4431 return r; 4432 } 4433 4434 /** 4435 * amdgpu_device_has_job_running - check if there 
is any job in mirror list 4436 * 4437 * @adev: amdgpu_device pointer 4438 * 4439 * check if there is any job in mirror list 4440 */ 4441 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4442 { 4443 int i; 4444 struct drm_sched_job *job; 4445 4446 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4447 struct amdgpu_ring *ring = adev->rings[i]; 4448 4449 if (!ring || !ring->sched.thread) 4450 continue; 4451 4452 spin_lock(&ring->sched.job_list_lock); 4453 job = list_first_entry_or_null(&ring->sched.pending_list, 4454 struct drm_sched_job, list); 4455 spin_unlock(&ring->sched.job_list_lock); 4456 if (job) 4457 return true; 4458 } 4459 return false; 4460 } 4461 4462 /** 4463 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4464 * 4465 * @adev: amdgpu_device pointer 4466 * 4467 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4468 * a hung GPU. 4469 */ 4470 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4471 { 4472 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4473 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4474 return false; 4475 } 4476 4477 if (amdgpu_gpu_recovery == 0) 4478 goto disabled; 4479 4480 if (amdgpu_sriov_vf(adev)) 4481 return true; 4482 4483 if (amdgpu_gpu_recovery == -1) { 4484 switch (adev->asic_type) { 4485 #ifdef CONFIG_DRM_AMDGPU_SI 4486 case CHIP_VERDE: 4487 case CHIP_TAHITI: 4488 case CHIP_PITCAIRN: 4489 case CHIP_OLAND: 4490 case CHIP_HAINAN: 4491 #endif 4492 #ifdef CONFIG_DRM_AMDGPU_CIK 4493 case CHIP_KAVERI: 4494 case CHIP_KABINI: 4495 case CHIP_MULLINS: 4496 #endif 4497 case CHIP_CARRIZO: 4498 case CHIP_STONEY: 4499 case CHIP_CYAN_SKILLFISH: 4500 goto disabled; 4501 default: 4502 break; 4503 } 4504 } 4505 4506 return true; 4507 4508 disabled: 4509 dev_info(adev->dev, "GPU recovery disabled.\n"); 4510 return false; 4511 } 4512 4513 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4514 { 4515 u32 i; 4516 int ret = 0; 4517 4518 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4519 4520 dev_info(adev->dev, "GPU mode1 reset\n"); 4521 4522 /* disable BM */ 4523 pci_clear_master(adev->pdev); 4524 4525 amdgpu_device_cache_pci_state(adev->pdev); 4526 4527 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4528 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4529 ret = amdgpu_dpm_mode1_reset(adev); 4530 } else { 4531 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4532 ret = psp_gpu_reset(adev); 4533 } 4534 4535 if (ret) 4536 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4537 4538 amdgpu_device_load_pci_state(adev->pdev); 4539 4540 /* wait for asic to come out of reset */ 4541 for (i = 0; i < adev->usec_timeout; i++) { 4542 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4543 4544 if (memsize != 0xffffffff) 4545 break; 4546 udelay(1); 4547 } 4548 4549 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4550 return ret; 4551 } 4552 4553 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4554 struct amdgpu_reset_context *reset_context) 4555 { 4556 int i, r = 0; 4557 struct amdgpu_job *job = NULL; 4558 bool need_full_reset = 4559 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4560 4561 if (reset_context->reset_req_dev == adev) 4562 job = reset_context->job; 4563 4564 if (amdgpu_sriov_vf(adev)) { 4565 /* stop the data exchange thread */ 4566 amdgpu_virt_fini_data_exchange(adev); 4567 } 4568 4569 /* block all schedulers and reset given job's ring */ 4570 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4571 struct amdgpu_ring *ring = adev->rings[i]; 4572 
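/* Skip rings that were never initialized or whose scheduler thread is not running. */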
4573 if (!ring || !ring->sched.thread) 4574 continue; 4575 4576 /*clear job fence from fence drv to avoid force_completion 4577 *leave NULL and vm flush fence in fence drv */ 4578 amdgpu_fence_driver_clear_job_fences(ring); 4579 4580 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4581 amdgpu_fence_driver_force_completion(ring); 4582 } 4583 4584 if (job && job->vm) 4585 drm_sched_increase_karma(&job->base); 4586 4587 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4588 /* If reset handler not implemented, continue; otherwise return */ 4589 if (r == -ENOSYS) 4590 r = 0; 4591 else 4592 return r; 4593 4594 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4595 if (!amdgpu_sriov_vf(adev)) { 4596 4597 if (!need_full_reset) 4598 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4599 4600 if (!need_full_reset) { 4601 amdgpu_device_ip_pre_soft_reset(adev); 4602 r = amdgpu_device_ip_soft_reset(adev); 4603 amdgpu_device_ip_post_soft_reset(adev); 4604 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4605 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4606 need_full_reset = true; 4607 } 4608 } 4609 4610 if (need_full_reset) 4611 r = amdgpu_device_ip_suspend(adev); 4612 if (need_full_reset) 4613 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4614 else 4615 clear_bit(AMDGPU_NEED_FULL_RESET, 4616 &reset_context->flags); 4617 } 4618 4619 return r; 4620 } 4621 4622 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4623 struct amdgpu_reset_context *reset_context) 4624 { 4625 struct amdgpu_device *tmp_adev = NULL; 4626 bool need_full_reset, skip_hw_reset, vram_lost = false; 4627 int r = 0; 4628 4629 /* Try reset handler method first */ 4630 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4631 reset_list); 4632 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4633 /* If reset handler not implemented, continue; otherwise return */ 4634 if (r == -ENOSYS) 4635 r = 0; 4636 else 4637 return r; 4638 4639 /* Reset handler not implemented, use the default method */ 4640 need_full_reset = 4641 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4642 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4643 4644 /* 4645 * ASIC reset has to be done on all XGMI hive nodes ASAP 4646 * to allow proper links negotiation in FW (within 1 sec) 4647 */ 4648 if (!skip_hw_reset && need_full_reset) { 4649 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4650 /* For XGMI run all resets in parallel to speed up the process */ 4651 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4652 tmp_adev->gmc.xgmi.pending_reset = false; 4653 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4654 r = -EALREADY; 4655 } else 4656 r = amdgpu_asic_reset(tmp_adev); 4657 4658 if (r) { 4659 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4660 r, adev_to_drm(tmp_adev)->unique); 4661 break; 4662 } 4663 } 4664 4665 /* For XGMI wait for all resets to complete before proceed */ 4666 if (!r) { 4667 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4668 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4669 flush_work(&tmp_adev->xgmi_reset_work); 4670 r = tmp_adev->asic_reset_res; 4671 if (r) 4672 break; 4673 } 4674 } 4675 } 4676 } 4677 4678 if (!r && amdgpu_ras_intr_triggered()) { 4679 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4680 if (tmp_adev->mmhub.ras && 
tmp_adev->mmhub.ras->ras_block.hw_ops && 4681 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4682 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4683 } 4684 4685 amdgpu_ras_intr_cleared(); 4686 } 4687 4688 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4689 if (need_full_reset) { 4690 /* post card */ 4691 r = amdgpu_device_asic_init(tmp_adev); 4692 if (r) { 4693 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4694 } else { 4695 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4696 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 4697 if (r) 4698 goto out; 4699 4700 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4701 if (r) 4702 goto out; 4703 4704 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4705 if (vram_lost) { 4706 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4707 amdgpu_inc_vram_lost(tmp_adev); 4708 } 4709 4710 r = amdgpu_device_fw_loading(tmp_adev); 4711 if (r) 4712 return r; 4713 4714 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4715 if (r) 4716 goto out; 4717 4718 if (vram_lost) 4719 amdgpu_device_fill_reset_magic(tmp_adev); 4720 4721 /* 4722 * Add this ASIC back as a tracked GPU instance now that 4723 * the reset has completed successfully. 4724 */ 4725 amdgpu_register_gpu_instance(tmp_adev); 4726 4727 if (!reset_context->hive && 4728 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4729 amdgpu_xgmi_add_device(tmp_adev); 4730 4731 r = amdgpu_device_ip_late_init(tmp_adev); 4732 if (r) 4733 goto out; 4734 4735 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 4736 4737 /* 4738 * The GPU enters a bad state once the number of faulty 4739 * pages retired by ECC reaches the threshold, and RAS 4740 * recovery is scheduled next. So add one check here to 4741 * abort recovery if the bad page threshold has indeed 4742 * been exceeded, and remind the user to either retire 4743 * this GPU or set a bigger bad_page_threshold value to 4744 * get past this check the next time the driver is 4745 * probed. 4746 */ 4747 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 4748 /* must succeed.
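 * RAS features were suspended earlier in the recovery path; resuming
 * them here re-enables error handling.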
*/ 4749 amdgpu_ras_resume(tmp_adev); 4750 } else { 4751 r = -EINVAL; 4752 goto out; 4753 } 4754 4755 /* Update PSP FW topology after reset */ 4756 if (reset_context->hive && 4757 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4758 r = amdgpu_xgmi_update_topology( 4759 reset_context->hive, tmp_adev); 4760 } 4761 } 4762 4763 out: 4764 if (!r) { 4765 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4766 r = amdgpu_ib_ring_tests(tmp_adev); 4767 if (r) { 4768 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4769 need_full_reset = true; 4770 r = -EAGAIN; 4771 goto end; 4772 } 4773 } 4774 4775 if (!r) 4776 r = amdgpu_device_recover_vram(tmp_adev); 4777 else 4778 tmp_adev->asic_reset_res = r; 4779 } 4780 4781 end: 4782 if (need_full_reset) 4783 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4784 else 4785 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4786 return r; 4787 } 4788 4789 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, 4790 struct amdgpu_hive_info *hive) 4791 { 4792 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) 4793 return false; 4794 4795 if (hive) { 4796 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock); 4797 } else { 4798 down_write(&adev->reset_sem); 4799 } 4800 4801 switch (amdgpu_asic_reset_method(adev)) { 4802 case AMD_RESET_METHOD_MODE1: 4803 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4804 break; 4805 case AMD_RESET_METHOD_MODE2: 4806 adev->mp1_state = PP_MP1_STATE_RESET; 4807 break; 4808 default: 4809 adev->mp1_state = PP_MP1_STATE_NONE; 4810 break; 4811 } 4812 4813 return true; 4814 } 4815 4816 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 4817 { 4818 amdgpu_vf_error_trans_all(adev); 4819 adev->mp1_state = PP_MP1_STATE_NONE; 4820 atomic_set(&adev->in_gpu_reset, 0); 4821 up_write(&adev->reset_sem); 4822 } 4823 4824 /* 4825 * Lock a list of amdgpu devices in a hive safely; if this is not a hive 4826 * with multiple nodes, it behaves like amdgpu_device_lock_adev. 4827 * 4828 * Unlock does not require any roll back. 4829 */ 4830 static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive) 4831 { 4832 struct amdgpu_device *tmp_adev = NULL; 4833 4834 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 4835 if (!hive) { 4836 dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes"); 4837 return -ENODEV; 4838 } 4839 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 4840 if (!amdgpu_device_lock_adev(tmp_adev, hive)) 4841 goto roll_back; 4842 } 4843 } else if (!amdgpu_device_lock_adev(adev, hive)) 4844 return -EAGAIN; 4845 4846 return 0; 4847 roll_back: 4848 if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) { 4849 /* 4850 * If the lock iteration breaks in the middle of a hive, 4851 * there may be a race, or a hive device may have locked 4852 * up independently. We may or may not be in trouble, so 4853 * roll back the locks taken so far and give out a 4854 * warning. 4855 */ 4856 dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle.
Rolling back to unlock"); 4857 list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) { 4858 amdgpu_device_unlock_adev(tmp_adev); 4859 } 4860 } 4861 return -EAGAIN; 4862 } 4863 4864 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 4865 { 4866 struct pci_dev *p = NULL; 4867 4868 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4869 adev->pdev->bus->number, 1); 4870 if (p) { 4871 pm_runtime_enable(&(p->dev)); 4872 pm_runtime_resume(&(p->dev)); 4873 } 4874 } 4875 4876 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 4877 { 4878 enum amd_reset_method reset_method; 4879 struct pci_dev *p = NULL; 4880 u64 expires; 4881 4882 /* 4883 * For now, only BACO and mode1 reset are confirmed 4884 * to suffer the audio issue without proper suspended. 4885 */ 4886 reset_method = amdgpu_asic_reset_method(adev); 4887 if ((reset_method != AMD_RESET_METHOD_BACO) && 4888 (reset_method != AMD_RESET_METHOD_MODE1)) 4889 return -EINVAL; 4890 4891 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4892 adev->pdev->bus->number, 1); 4893 if (!p) 4894 return -ENODEV; 4895 4896 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 4897 if (!expires) 4898 /* 4899 * If we cannot get the audio device autosuspend delay, 4900 * a fixed 4S interval will be used. Considering 3S is 4901 * the audio controller default autosuspend delay setting. 4902 * 4S used here is guaranteed to cover that. 4903 */ 4904 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 4905 4906 while (!pm_runtime_status_suspended(&(p->dev))) { 4907 if (!pm_runtime_suspend(&(p->dev))) 4908 break; 4909 4910 if (expires < ktime_get_mono_fast_ns()) { 4911 dev_warn(adev->dev, "failed to suspend display audio\n"); 4912 /* TODO: abort the succeeding gpu reset? */ 4913 return -ETIMEDOUT; 4914 } 4915 } 4916 4917 pm_runtime_disable(&(p->dev)); 4918 4919 return 0; 4920 } 4921 4922 static void amdgpu_device_recheck_guilty_jobs( 4923 struct amdgpu_device *adev, struct list_head *device_list_handle, 4924 struct amdgpu_reset_context *reset_context) 4925 { 4926 int i, r = 0; 4927 4928 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4929 struct amdgpu_ring *ring = adev->rings[i]; 4930 int ret = 0; 4931 struct drm_sched_job *s_job; 4932 4933 if (!ring || !ring->sched.thread) 4934 continue; 4935 4936 s_job = list_first_entry_or_null(&ring->sched.pending_list, 4937 struct drm_sched_job, list); 4938 if (s_job == NULL) 4939 continue; 4940 4941 /* clear job's guilty and depend the folowing step to decide the real one */ 4942 drm_sched_reset_karma(s_job); 4943 /* for the real bad job, it will be resubmitted twice, adding a dma_fence_get 4944 * to make sure fence is balanced */ 4945 dma_fence_get(s_job->s_fence->parent); 4946 drm_sched_resubmit_jobs_ext(&ring->sched, 1); 4947 4948 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); 4949 if (ret == 0) { /* timeout */ 4950 DRM_ERROR("Found the real bad job! 
ring:%s, job_id:%llx\n", 4951 ring->sched.name, s_job->id); 4952 4953 /* set guilty */ 4954 drm_sched_increase_karma(s_job); 4955 retry: 4956 /* do hw reset */ 4957 if (amdgpu_sriov_vf(adev)) { 4958 amdgpu_virt_fini_data_exchange(adev); 4959 r = amdgpu_device_reset_sriov(adev, false); 4960 if (r) 4961 adev->asic_reset_res = r; 4962 } else { 4963 clear_bit(AMDGPU_SKIP_HW_RESET, 4964 &reset_context->flags); 4965 r = amdgpu_do_asic_reset(device_list_handle, 4966 reset_context); 4967 if (r && r == -EAGAIN) 4968 goto retry; 4969 } 4970 4971 /* 4972 * add reset counter so that the following 4973 * resubmitted job could flush vmid 4974 */ 4975 atomic_inc(&adev->gpu_reset_counter); 4976 continue; 4977 } 4978 4979 /* got the hw fence, signal finished fence */ 4980 atomic_dec(ring->sched.score); 4981 dma_fence_put(s_job->s_fence->parent); 4982 dma_fence_get(&s_job->s_fence->finished); 4983 dma_fence_signal(&s_job->s_fence->finished); 4984 dma_fence_put(&s_job->s_fence->finished); 4985 4986 /* remove node from list and free the job */ 4987 spin_lock(&ring->sched.job_list_lock); 4988 list_del_init(&s_job->list); 4989 spin_unlock(&ring->sched.job_list_lock); 4990 ring->sched.ops->free_job(s_job); 4991 } 4992 } 4993 4994 /** 4995 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 4996 * 4997 * @adev: amdgpu_device pointer 4998 * @job: which job trigger hang 4999 * 5000 * Attempt to reset the GPU if it has hung (all asics). 5001 * Attempt to do soft-reset or full-reset and reinitialize Asic 5002 * Returns 0 for success or an error on failure. 5003 */ 5004 5005 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5006 struct amdgpu_job *job) 5007 { 5008 struct list_head device_list, *device_list_handle = NULL; 5009 bool job_signaled = false; 5010 struct amdgpu_hive_info *hive = NULL; 5011 struct amdgpu_device *tmp_adev = NULL; 5012 int i, r = 0; 5013 bool need_emergency_restart = false; 5014 bool audio_suspended = false; 5015 int tmp_vram_lost_counter; 5016 struct amdgpu_reset_context reset_context; 5017 5018 memset(&reset_context, 0, sizeof(reset_context)); 5019 5020 /* 5021 * Special case: RAS triggered and full reset isn't supported 5022 */ 5023 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5024 5025 /* 5026 * Flush RAM to disk so that after reboot 5027 * the user can read log and see why the system rebooted. 5028 */ 5029 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5030 DRM_WARN("Emergency reboot."); 5031 5032 ksys_sync_helper(); 5033 emergency_restart(); 5034 } 5035 5036 dev_info(adev->dev, "GPU %s begin!\n", 5037 need_emergency_restart ? "jobs stop":"reset"); 5038 5039 /* 5040 * Here we trylock to avoid chain of resets executing from 5041 * either trigger by jobs on different adevs in XGMI hive or jobs on 5042 * different schedulers for same device while this TO handler is running. 5043 * We always reset all schedulers for device and all devices for XGMI 5044 * hive so that should take care of them too. 5045 */ 5046 if (!amdgpu_sriov_vf(adev)) 5047 hive = amdgpu_get_xgmi_hive(adev); 5048 if (hive) { 5049 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { 5050 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", 5051 job ? 
job->base.id : -1, hive->hive_id); 5052 amdgpu_put_xgmi_hive(hive); 5053 if (job && job->vm) 5054 drm_sched_increase_karma(&job->base); 5055 return 0; 5056 } 5057 mutex_lock(&hive->hive_lock); 5058 } 5059 5060 reset_context.method = AMD_RESET_METHOD_NONE; 5061 reset_context.reset_req_dev = adev; 5062 reset_context.job = job; 5063 reset_context.hive = hive; 5064 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5065 5066 /* 5067 * lock the device before we try to operate the linked list 5068 * if didn't get the device lock, don't touch the linked list since 5069 * others may iterating it. 5070 */ 5071 r = amdgpu_device_lock_hive_adev(adev, hive); 5072 if (r) { 5073 dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", 5074 job ? job->base.id : -1); 5075 5076 /* even we skipped this reset, still need to set the job to guilty */ 5077 if (job && job->vm) 5078 drm_sched_increase_karma(&job->base); 5079 goto skip_recovery; 5080 } 5081 5082 /* 5083 * Build list of devices to reset. 5084 * In case we are in XGMI hive mode, resort the device list 5085 * to put adev in the 1st position. 5086 */ 5087 INIT_LIST_HEAD(&device_list); 5088 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5089 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) 5090 list_add_tail(&tmp_adev->reset_list, &device_list); 5091 if (!list_is_first(&adev->reset_list, &device_list)) 5092 list_rotate_to_front(&adev->reset_list, &device_list); 5093 device_list_handle = &device_list; 5094 } else { 5095 list_add_tail(&adev->reset_list, &device_list); 5096 device_list_handle = &device_list; 5097 } 5098 5099 /* block all schedulers and reset given job's ring */ 5100 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5101 /* 5102 * Try to put the audio codec into suspend state 5103 * before gpu reset started. 5104 * 5105 * Due to the power domain of the graphics device 5106 * is shared with AZ power domain. Without this, 5107 * we may change the audio hardware from behind 5108 * the audio driver's back. That will trigger 5109 * some audio codec errors. 5110 */ 5111 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5112 audio_suspended = true; 5113 5114 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5115 5116 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5117 5118 if (!amdgpu_sriov_vf(tmp_adev)) 5119 amdgpu_amdkfd_pre_reset(tmp_adev); 5120 5121 /* 5122 * Mark these ASICs to be reseted as untracked first 5123 * And add them back after reset completed 5124 */ 5125 amdgpu_unregister_gpu_instance(tmp_adev); 5126 5127 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 5128 5129 /* disable ras on ALL IPs */ 5130 if (!need_emergency_restart && 5131 amdgpu_device_ip_need_full_reset(tmp_adev)) 5132 amdgpu_ras_suspend(tmp_adev); 5133 5134 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5135 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5136 5137 if (!ring || !ring->sched.thread) 5138 continue; 5139 5140 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5141 5142 if (need_emergency_restart) 5143 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5144 } 5145 atomic_inc(&tmp_adev->gpu_reset_counter); 5146 } 5147 5148 if (need_emergency_restart) 5149 goto skip_sched_resume; 5150 5151 /* 5152 * Must check guilty signal here since after this point all old 5153 * HW fences are force signaled. 
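 * If the offending job's hardware fence has already signaled, the hang
 * resolved on its own and the heavyweight HW reset below can be skipped.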
5154 * 5155 * job->base holds a reference to parent fence 5156 */ 5157 if (job && job->base.s_fence->parent && 5158 dma_fence_is_signaled(job->base.s_fence->parent)) { 5159 job_signaled = true; 5160 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5161 goto skip_hw_reset; 5162 } 5163 5164 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5165 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5166 r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context); 5167 /*TODO Should we stop ?*/ 5168 if (r) { 5169 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5170 r, adev_to_drm(tmp_adev)->unique); 5171 tmp_adev->asic_reset_res = r; 5172 } 5173 } 5174 5175 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); 5176 /* Actual ASIC resets if needed.*/ 5177 /* Host driver will handle XGMI hive reset for SRIOV */ 5178 if (amdgpu_sriov_vf(adev)) { 5179 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5180 if (r) 5181 adev->asic_reset_res = r; 5182 } else { 5183 r = amdgpu_do_asic_reset(device_list_handle, &reset_context); 5184 if (r && r == -EAGAIN) 5185 goto retry; 5186 } 5187 5188 skip_hw_reset: 5189 5190 /* Post ASIC reset for all devs .*/ 5191 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5192 5193 /* 5194 * Sometimes a later bad compute job can block a good gfx job as gfx 5195 * and compute ring share internal GC HW mutually. We add an additional 5196 * guilty jobs recheck step to find the real guilty job, it synchronously 5197 * submits and pends for the first job being signaled. If it gets timeout, 5198 * we identify it as a real guilty job. 5199 */ 5200 if (amdgpu_gpu_recovery == 2 && 5201 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) 5202 amdgpu_device_recheck_guilty_jobs( 5203 tmp_adev, device_list_handle, &reset_context); 5204 5205 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5206 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5207 5208 if (!ring || !ring->sched.thread) 5209 continue; 5210 5211 /* No point to resubmit jobs if we didn't HW reset*/ 5212 if (!tmp_adev->asic_reset_res && !job_signaled) 5213 drm_sched_resubmit_jobs(&ring->sched); 5214 5215 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 5216 } 5217 5218 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5219 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5220 } 5221 5222 if (tmp_adev->asic_reset_res) 5223 r = tmp_adev->asic_reset_res; 5224 5225 tmp_adev->asic_reset_res = 0; 5226 5227 if (r) { 5228 /* bad news, how to tell it to userspace ? 
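 * For now the failure is only reported through dev_info and
 * amdgpu_vf_error_put below; a proper userspace notification
 * remains an open question.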
*/ 5229 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5230 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5231 } else { 5232 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5233 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5234 DRM_WARN("smart shift update failed\n"); 5235 } 5236 } 5237 5238 skip_sched_resume: 5239 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5240 /* unlock kfd: SRIOV would do it separately */ 5241 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5242 amdgpu_amdkfd_post_reset(tmp_adev); 5243 5244 /* kfd_post_reset will do nothing if kfd device is not initialized, 5245 * need to bring up kfd here if it's not be initialized before 5246 */ 5247 if (!adev->kfd.init_complete) 5248 amdgpu_amdkfd_device_init(adev); 5249 5250 if (audio_suspended) 5251 amdgpu_device_resume_display_audio(tmp_adev); 5252 amdgpu_device_unlock_adev(tmp_adev); 5253 } 5254 5255 skip_recovery: 5256 if (hive) { 5257 atomic_set(&hive->in_reset, 0); 5258 mutex_unlock(&hive->hive_lock); 5259 amdgpu_put_xgmi_hive(hive); 5260 } 5261 5262 if (r && r != -EAGAIN) 5263 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5264 return r; 5265 } 5266 5267 /** 5268 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 5269 * 5270 * @adev: amdgpu_device pointer 5271 * 5272 * Fetchs and stores in the driver the PCIE capabilities (gen speed 5273 * and lanes) of the slot the device is in. Handles APUs and 5274 * virtualized environments where PCIE config space may not be available. 5275 */ 5276 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5277 { 5278 struct pci_dev *pdev; 5279 enum pci_bus_speed speed_cap, platform_speed_cap; 5280 enum pcie_link_width platform_link_width; 5281 5282 if (amdgpu_pcie_gen_cap) 5283 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5284 5285 if (amdgpu_pcie_lane_cap) 5286 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5287 5288 /* covers APUs as well */ 5289 if (pci_is_root_bus(adev->pdev->bus)) { 5290 if (adev->pm.pcie_gen_mask == 0) 5291 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5292 if (adev->pm.pcie_mlw_mask == 0) 5293 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5294 return; 5295 } 5296 5297 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5298 return; 5299 5300 pcie_bandwidth_available(adev->pdev, NULL, 5301 &platform_speed_cap, &platform_link_width); 5302 5303 if (adev->pm.pcie_gen_mask == 0) { 5304 /* asic caps */ 5305 pdev = adev->pdev; 5306 speed_cap = pcie_get_speed_cap(pdev); 5307 if (speed_cap == PCI_SPEED_UNKNOWN) { 5308 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5309 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5310 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5311 } else { 5312 if (speed_cap == PCIE_SPEED_32_0GT) 5313 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5314 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5315 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5316 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5317 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5318 else if (speed_cap == PCIE_SPEED_16_0GT) 5319 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5320 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5321 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5322 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5323 else if (speed_cap == PCIE_SPEED_8_0GT) 5324 adev->pm.pcie_gen_mask |= 
(CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5325 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5326 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5327 else if (speed_cap == PCIE_SPEED_5_0GT) 5328 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5329 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5330 else 5331 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5332 } 5333 /* platform caps */ 5334 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5335 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5336 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5337 } else { 5338 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5339 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5340 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5341 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5342 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5343 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5344 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5345 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5346 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5347 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5348 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5349 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5350 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5351 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5352 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5353 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5354 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5355 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5356 else 5357 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5358 5359 } 5360 } 5361 if (adev->pm.pcie_mlw_mask == 0) { 5362 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5363 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5364 } else { 5365 switch (platform_link_width) { 5366 case PCIE_LNK_X32: 5367 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5368 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5369 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5370 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5371 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5372 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5373 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5374 break; 5375 case PCIE_LNK_X16: 5376 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5377 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5378 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5379 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5380 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5381 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5382 break; 5383 case PCIE_LNK_X12: 5384 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5385 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5386 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5387 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5388 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5389 break; 5390 case PCIE_LNK_X8: 5391 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5392 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5393 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5394 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5395 break; 5396 case PCIE_LNK_X4: 5397 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5398 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5399 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5400 break; 5401 case PCIE_LNK_X2: 5402 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5403 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5404 break; 5405 case PCIE_LNK_X1: 5406 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5407 break; 5408 default: 5409 break; 5410 } 5411 } 5412 } 5413 } 5414 5415 int amdgpu_device_baco_enter(struct drm_device *dev) 5416 { 5417 struct amdgpu_device *adev = drm_to_adev(dev); 5418 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5419 
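	/* BACO (Bus Active, Chip Off) powers down the GPU core while keeping the
	 * bus interface alive; bail out early if this board does not support it. */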
5420 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5421 return -ENOTSUPP; 5422 5423 if (ras && adev->ras_enabled && 5424 adev->nbio.funcs->enable_doorbell_interrupt) 5425 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5426 5427 return amdgpu_dpm_baco_enter(adev); 5428 } 5429 5430 int amdgpu_device_baco_exit(struct drm_device *dev) 5431 { 5432 struct amdgpu_device *adev = drm_to_adev(dev); 5433 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5434 int ret = 0; 5435 5436 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5437 return -ENOTSUPP; 5438 5439 ret = amdgpu_dpm_baco_exit(adev); 5440 if (ret) 5441 return ret; 5442 5443 if (ras && adev->ras_enabled && 5444 adev->nbio.funcs->enable_doorbell_interrupt) 5445 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5446 5447 if (amdgpu_passthrough(adev) && 5448 adev->nbio.funcs->clear_doorbell_interrupt) 5449 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5450 5451 return 0; 5452 } 5453 5454 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) 5455 { 5456 int i; 5457 5458 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5459 struct amdgpu_ring *ring = adev->rings[i]; 5460 5461 if (!ring || !ring->sched.thread) 5462 continue; 5463 5464 cancel_delayed_work_sync(&ring->sched.work_tdr); 5465 } 5466 } 5467 5468 /** 5469 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5470 * @pdev: PCI device struct 5471 * @state: PCI channel state 5472 * 5473 * Description: Called when a PCI error is detected. 5474 * 5475 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 5476 */ 5477 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5478 { 5479 struct drm_device *dev = pci_get_drvdata(pdev); 5480 struct amdgpu_device *adev = drm_to_adev(dev); 5481 int i; 5482 5483 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5484 5485 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5486 DRM_WARN("No support for XGMI hive yet..."); 5487 return PCI_ERS_RESULT_DISCONNECT; 5488 } 5489 5490 adev->pci_channel_state = state; 5491 5492 switch (state) { 5493 case pci_channel_io_normal: 5494 return PCI_ERS_RESULT_CAN_RECOVER; 5495 /* Fatal error, prepare for slot reset */ 5496 case pci_channel_io_frozen: 5497 /* 5498 * Cancel and wait for all TDRs in progress if failing to 5499 * set adev->in_gpu_reset in amdgpu_device_lock_adev 5500 * 5501 * Locking adev->reset_sem will prevent any external access 5502 * to GPU during PCI error recovery 5503 */ 5504 while (!amdgpu_device_lock_adev(adev, NULL)) 5505 amdgpu_cancel_all_tdr(adev); 5506 5507 /* 5508 * Block any work scheduling as we do for regular GPU reset 5509 * for the duration of the recovery 5510 */ 5511 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5512 struct amdgpu_ring *ring = adev->rings[i]; 5513 5514 if (!ring || !ring->sched.thread) 5515 continue; 5516 5517 drm_sched_stop(&ring->sched, NULL); 5518 } 5519 atomic_inc(&adev->gpu_reset_counter); 5520 return PCI_ERS_RESULT_NEED_RESET; 5521 case pci_channel_io_perm_failure: 5522 /* Permanent error, prepare for device removal */ 5523 return PCI_ERS_RESULT_DISCONNECT; 5524 } 5525 5526 return PCI_ERS_RESULT_NEED_RESET; 5527 } 5528 5529 /** 5530 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5531 * @pdev: pointer to PCI device 5532 */ 5533 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5534 { 5535 5536 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5537 5538 /* TODO - dump whatever for debugging purposes */ 5539 5540 /* This called only 
if amdgpu_pci_error_detected returns 5541 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5542 * works, no need to reset slot. 5543 */ 5544 5545 return PCI_ERS_RESULT_RECOVERED; 5546 } 5547 5548 /** 5549 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5550 * @pdev: PCI device struct 5551 * 5552 * Description: This routine is called by the pci error recovery 5553 * code after the PCI slot has been reset, just before we 5554 * should resume normal operations. 5555 */ 5556 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5557 { 5558 struct drm_device *dev = pci_get_drvdata(pdev); 5559 struct amdgpu_device *adev = drm_to_adev(dev); 5560 int r, i; 5561 struct amdgpu_reset_context reset_context; 5562 u32 memsize; 5563 struct list_head device_list; 5564 5565 DRM_INFO("PCI error: slot reset callback!!\n"); 5566 5567 memset(&reset_context, 0, sizeof(reset_context)); 5568 5569 INIT_LIST_HEAD(&device_list); 5570 list_add_tail(&adev->reset_list, &device_list); 5571 5572 /* wait for asic to come out of reset */ 5573 msleep(500); 5574 5575 /* Restore PCI confspace */ 5576 amdgpu_device_load_pci_state(pdev); 5577 5578 /* confirm ASIC came out of reset */ 5579 for (i = 0; i < adev->usec_timeout; i++) { 5580 memsize = amdgpu_asic_get_config_memsize(adev); 5581 5582 if (memsize != 0xffffffff) 5583 break; 5584 udelay(1); 5585 } 5586 if (memsize == 0xffffffff) { 5587 r = -ETIME; 5588 goto out; 5589 } 5590 5591 reset_context.method = AMD_RESET_METHOD_NONE; 5592 reset_context.reset_req_dev = adev; 5593 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5594 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5595 5596 adev->no_hw_access = true; 5597 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5598 adev->no_hw_access = false; 5599 if (r) 5600 goto out; 5601 5602 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5603 5604 out: 5605 if (!r) { 5606 if (amdgpu_device_cache_pci_state(adev->pdev)) 5607 pci_restore_state(adev->pdev); 5608 5609 DRM_INFO("PCIe error recovery succeeded\n"); 5610 } else { 5611 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5612 amdgpu_device_unlock_adev(adev); 5613 } 5614 5615 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5616 } 5617 5618 /** 5619 * amdgpu_pci_resume() - resume normal ops after PCI reset 5620 * @pdev: pointer to PCI device 5621 * 5622 * Called when the error recovery driver tells us that its 5623 * OK to resume normal operation. 
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}
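/*
 * HDP (Host Data Path) cache maintenance helpers. The HDP block caches
 * host accesses to VRAM that go through the PCIe BAR, so its cache has
 * to be flushed after the CPU writes data the GPU is about to consume,
 * and invalidated before the CPU reads back data the GPU has produced.
 * Both helpers are skipped on APUs and on ASICs whose memory is
 * CPU-coherent (gmc.xgmi.connected_to_cpu), where no HDP cache sits in
 * the access path.
 */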
void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if (adev->flags & AMD_IS_APU)
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if (adev->flags & AMD_IS_APU)
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to maintain the error context when an error occurs.
 * Compared to a simple hang, the system will keep stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs,
 *    etc.), clears all CPU mappings to the device, and disallows remappings
 *    through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
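/*
 * Usage sketch for the PCIe port helpers above (illustrative only;
 * PCIE_PORT_SOME_REG and SOME_ENABLE_MASK are hypothetical names, not
 * real defines):
 *
 *	u32 val;
 *
 *	val = amdgpu_device_pcie_port_rreg(adev, PCIE_PORT_SOME_REG);
 *	val |= SOME_ENABLE_MASK;
 *	amdgpu_device_pcie_port_wreg(adev, PCIE_PORT_SOME_REG, val);
 *
 * Each helper serializes its index/data access pair under
 * adev->pcie_idx_lock, so concurrent callers cannot corrupt the index
 * register, but a read-modify-write sequence like the above is still
 * not atomic as a whole.
 */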