1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 34 #include <drm/drm_atomic_helper.h> 35 #include <drm/drm_probe_helper.h> 36 #include <drm/amdgpu_drm.h> 37 #include <linux/vgaarb.h> 38 #include <linux/vga_switcheroo.h> 39 #include <linux/efi.h> 40 #include "amdgpu.h" 41 #include "amdgpu_trace.h" 42 #include "amdgpu_i2c.h" 43 #include "atom.h" 44 #include "amdgpu_atombios.h" 45 #include "amdgpu_atomfirmware.h" 46 #include "amd_pcie.h" 47 #ifdef CONFIG_DRM_AMDGPU_SI 48 #include "si.h" 49 #endif 50 #ifdef CONFIG_DRM_AMDGPU_CIK 51 #include "cik.h" 52 #endif 53 #include "vi.h" 54 #include "soc15.h" 55 #include "nv.h" 56 #include "bif/bif_4_1_d.h" 57 #include <linux/pci.h> 58 #include <linux/firmware.h> 59 #include "amdgpu_vf_error.h" 60 61 #include "amdgpu_amdkfd.h" 62 #include "amdgpu_pm.h" 63 64 #include "amdgpu_xgmi.h" 65 #include "amdgpu_ras.h" 66 #include "amdgpu_pmu.h" 67 #include "amdgpu_fru_eeprom.h" 68 #include "amdgpu_reset.h" 69 70 #include <linux/suspend.h> 71 #include <drm/task_barrier.h> 72 #include <linux/pm_runtime.h> 73 74 #include <drm/drm_drv.h> 75 76 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 77 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 78 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); 83 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); 84 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); 85 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 86 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin"); 87 MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin"); 88 89 #define AMDGPU_RESUME_MS 2000 90 91 const char *amdgpu_asic_name[] = { 92 "TAHITI", 93 "PITCAIRN", 94 "VERDE", 95 "OLAND", 96 "HAINAN", 97 "BONAIRE", 98 "KAVERI", 99 "KABINI", 100 "HAWAII", 101 "MULLINS", 102 "TOPAZ", 103 "TONGA", 104 "FIJI", 105 "CARRIZO", 106 "STONEY", 107 "POLARIS10", 108 "POLARIS11", 109 "POLARIS12", 110 "VEGAM", 111 "VEGA10", 112 "VEGA12", 113 "VEGA20", 114 "RAVEN", 115 "ARCTURUS", 116 "RENOIR", 117 "ALDEBARAN", 118 "NAVI10", 
119 "NAVI14", 120 "NAVI12", 121 "SIENNA_CICHLID", 122 "NAVY_FLOUNDER", 123 "VANGOGH", 124 "DIMGREY_CAVEFISH", 125 "BEIGE_GOBY", 126 "YELLOW_CARP", 127 "LAST", 128 }; 129 130 /** 131 * DOC: pcie_replay_count 132 * 133 * The amdgpu driver provides a sysfs API for reporting the total number 134 * of PCIe replays (NAKs) 135 * The file pcie_replay_count is used for this and returns the total 136 * number of replays as a sum of the NAKs generated and NAKs received 137 */ 138 139 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 140 struct device_attribute *attr, char *buf) 141 { 142 struct drm_device *ddev = dev_get_drvdata(dev); 143 struct amdgpu_device *adev = drm_to_adev(ddev); 144 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 145 146 return sysfs_emit(buf, "%llu\n", cnt); 147 } 148 149 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 150 amdgpu_device_get_pcie_replay_count, NULL); 151 152 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 153 154 /** 155 * DOC: product_name 156 * 157 * The amdgpu driver provides a sysfs API for reporting the product name 158 * for the device 159 * The file serial_number is used for this and returns the product name 160 * as returned from the FRU. 161 * NOTE: This is only available for certain server cards 162 */ 163 164 static ssize_t amdgpu_device_get_product_name(struct device *dev, 165 struct device_attribute *attr, char *buf) 166 { 167 struct drm_device *ddev = dev_get_drvdata(dev); 168 struct amdgpu_device *adev = drm_to_adev(ddev); 169 170 return sysfs_emit(buf, "%s\n", adev->product_name); 171 } 172 173 static DEVICE_ATTR(product_name, S_IRUGO, 174 amdgpu_device_get_product_name, NULL); 175 176 /** 177 * DOC: product_number 178 * 179 * The amdgpu driver provides a sysfs API for reporting the part number 180 * for the device 181 * The file serial_number is used for this and returns the part number 182 * as returned from the FRU. 183 * NOTE: This is only available for certain server cards 184 */ 185 186 static ssize_t amdgpu_device_get_product_number(struct device *dev, 187 struct device_attribute *attr, char *buf) 188 { 189 struct drm_device *ddev = dev_get_drvdata(dev); 190 struct amdgpu_device *adev = drm_to_adev(ddev); 191 192 return sysfs_emit(buf, "%s\n", adev->product_number); 193 } 194 195 static DEVICE_ATTR(product_number, S_IRUGO, 196 amdgpu_device_get_product_number, NULL); 197 198 /** 199 * DOC: serial_number 200 * 201 * The amdgpu driver provides a sysfs API for reporting the serial number 202 * for the device 203 * The file serial_number is used for this and returns the serial number 204 * as returned from the FRU. 205 * NOTE: This is only available for certain server cards 206 */ 207 208 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 209 struct device_attribute *attr, char *buf) 210 { 211 struct drm_device *ddev = dev_get_drvdata(dev); 212 struct amdgpu_device *adev = drm_to_adev(ddev); 213 214 return sysfs_emit(buf, "%s\n", adev->serial); 215 } 216 217 static DEVICE_ATTR(serial_number, S_IRUGO, 218 amdgpu_device_get_serial_number, NULL); 219 220 /** 221 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 222 * 223 * @dev: drm_device pointer 224 * 225 * Returns true if the device is a dGPU with ATPX power control, 226 * otherwise return false. 
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       uint32_t *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(&adev->ddev, &idx))
		return;

#ifdef CONFIG_64BIT
	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		void __iomem *addr = adev->mman.aper_base_kaddr + pos;
		size_t count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

		if (count == size)
			goto exit;

		pos += count;
		buf += count / 4;
		size -= count;
	}
#endif

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		uint32_t tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *buf++);
		else
			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
	}
	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);

#ifdef CONFIG_64BIT
exit:
#endif
	drm_dev_exit(idx);
}

/*
 * register access helper functions.
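 *
 * Illustrative sketch only (not from the original file): reading one dword
 * back from VRAM with the helper above could look like
 *
 *   uint32_t val;
 *
 *   amdgpu_device_vram_access(adev, vram_offset, &val, sizeof(val), false);
 *
 * where vram_offset is a caller-chosen, dword-aligned offset into VRAM.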
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_sem))
			up_read(&adev->reset_sem);
		else
			lockdep_assert_held(&adev->reset_sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 *
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 *
 */
/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
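 *
 * Illustrative usage only (not part of the original kernel-doc): a typical
 * read-modify-write through these helpers might be
 *
 *   uint32_t v = amdgpu_device_rreg(adev, reg, 0);
 *
 *   amdgpu_device_wreg(adev, reg, v | 0x1, 0);
 *
 * where reg is a dword-aligned register offset and acc_flags of 0 selects
 * the default access path (KIQ under SR-IOV runtime, plain MMIO otherwise).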
482 */ 483 void amdgpu_device_wreg(struct amdgpu_device *adev, 484 uint32_t reg, uint32_t v, 485 uint32_t acc_flags) 486 { 487 if (amdgpu_device_skip_hw_access(adev)) 488 return; 489 490 if ((reg * 4) < adev->rmmio_size) { 491 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 492 amdgpu_sriov_runtime(adev) && 493 down_read_trylock(&adev->reset_sem)) { 494 amdgpu_kiq_wreg(adev, reg, v); 495 up_read(&adev->reset_sem); 496 } else { 497 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 498 } 499 } else { 500 adev->pcie_wreg(adev, reg * 4, v); 501 } 502 503 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 504 } 505 506 /* 507 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range 508 * 509 * this function is invoked only the debugfs register access 510 * */ 511 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 512 uint32_t reg, uint32_t v) 513 { 514 if (amdgpu_device_skip_hw_access(adev)) 515 return; 516 517 if (amdgpu_sriov_fullaccess(adev) && 518 adev->gfx.rlc.funcs && 519 adev->gfx.rlc.funcs->is_rlcg_access_range) { 520 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 521 return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v, 0, 0); 522 } else { 523 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 524 } 525 } 526 527 /** 528 * amdgpu_mm_rdoorbell - read a doorbell dword 529 * 530 * @adev: amdgpu_device pointer 531 * @index: doorbell index 532 * 533 * Returns the value in the doorbell aperture at the 534 * requested doorbell index (CIK). 535 */ 536 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 537 { 538 if (amdgpu_device_skip_hw_access(adev)) 539 return 0; 540 541 if (index < adev->doorbell.num_doorbells) { 542 return readl(adev->doorbell.ptr + index); 543 } else { 544 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 545 return 0; 546 } 547 } 548 549 /** 550 * amdgpu_mm_wdoorbell - write a doorbell dword 551 * 552 * @adev: amdgpu_device pointer 553 * @index: doorbell index 554 * @v: value to write 555 * 556 * Writes @v to the doorbell aperture at the 557 * requested doorbell index (CIK). 558 */ 559 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 560 { 561 if (amdgpu_device_skip_hw_access(adev)) 562 return; 563 564 if (index < adev->doorbell.num_doorbells) { 565 writel(v, adev->doorbell.ptr + index); 566 } else { 567 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 568 } 569 } 570 571 /** 572 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 573 * 574 * @adev: amdgpu_device pointer 575 * @index: doorbell index 576 * 577 * Returns the value in the doorbell aperture at the 578 * requested doorbell index (VEGA10+). 579 */ 580 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 581 { 582 if (amdgpu_device_skip_hw_access(adev)) 583 return 0; 584 585 if (index < adev->doorbell.num_doorbells) { 586 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 587 } else { 588 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 589 return 0; 590 } 591 } 592 593 /** 594 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 595 * 596 * @adev: amdgpu_device pointer 597 * @index: doorbell index 598 * @v: value to write 599 * 600 * Writes @v to the doorbell aperture at the 601 * requested doorbell index (VEGA10+). 
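 *
 * Illustrative usage only (not part of the original kernel-doc): a ring
 * backend might publish a new 64-bit write pointer with
 *
 *   amdgpu_mm_wdoorbell64(adev, ring_doorbell_index, wptr);
 *
 * where ring_doorbell_index is a hypothetical index below
 * adev->doorbell.num_doorbells and wptr is the new ring write pointer.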
602 */ 603 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 604 { 605 if (amdgpu_device_skip_hw_access(adev)) 606 return; 607 608 if (index < adev->doorbell.num_doorbells) { 609 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 610 } else { 611 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 612 } 613 } 614 615 /** 616 * amdgpu_device_indirect_rreg - read an indirect register 617 * 618 * @adev: amdgpu_device pointer 619 * @pcie_index: mmio register offset 620 * @pcie_data: mmio register offset 621 * @reg_addr: indirect register address to read from 622 * 623 * Returns the value of indirect register @reg_addr 624 */ 625 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 626 u32 pcie_index, u32 pcie_data, 627 u32 reg_addr) 628 { 629 unsigned long flags; 630 u32 r; 631 void __iomem *pcie_index_offset; 632 void __iomem *pcie_data_offset; 633 634 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 635 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 636 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 637 638 writel(reg_addr, pcie_index_offset); 639 readl(pcie_index_offset); 640 r = readl(pcie_data_offset); 641 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 642 643 return r; 644 } 645 646 /** 647 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 648 * 649 * @adev: amdgpu_device pointer 650 * @pcie_index: mmio register offset 651 * @pcie_data: mmio register offset 652 * @reg_addr: indirect register address to read from 653 * 654 * Returns the value of indirect register @reg_addr 655 */ 656 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 657 u32 pcie_index, u32 pcie_data, 658 u32 reg_addr) 659 { 660 unsigned long flags; 661 u64 r; 662 void __iomem *pcie_index_offset; 663 void __iomem *pcie_data_offset; 664 665 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 666 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 667 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 668 669 /* read low 32 bits */ 670 writel(reg_addr, pcie_index_offset); 671 readl(pcie_index_offset); 672 r = readl(pcie_data_offset); 673 /* read high 32 bits */ 674 writel(reg_addr + 4, pcie_index_offset); 675 readl(pcie_index_offset); 676 r |= ((u64)readl(pcie_data_offset) << 32); 677 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 678 679 return r; 680 } 681 682 /** 683 * amdgpu_device_indirect_wreg - write an indirect register address 684 * 685 * @adev: amdgpu_device pointer 686 * @pcie_index: mmio register offset 687 * @pcie_data: mmio register offset 688 * @reg_addr: indirect register offset 689 * @reg_data: indirect register data 690 * 691 */ 692 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 693 u32 pcie_index, u32 pcie_data, 694 u32 reg_addr, u32 reg_data) 695 { 696 unsigned long flags; 697 void __iomem *pcie_index_offset; 698 void __iomem *pcie_data_offset; 699 700 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 701 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 702 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 703 704 writel(reg_addr, pcie_index_offset); 705 readl(pcie_index_offset); 706 writel(reg_data, pcie_data_offset); 707 readl(pcie_data_offset); 708 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 709 } 710 711 /** 712 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 713 * 714 * @adev: amdgpu_device pointer 715 * @pcie_index: mmio register offset 716 * @pcie_data: mmio register 
offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 pcie_index, u32 pcie_data,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
824 */ 825 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 826 uint32_t block, uint32_t reg) 827 { 828 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 829 reg, block); 830 BUG(); 831 return 0; 832 } 833 834 /** 835 * amdgpu_block_invalid_wreg - dummy reg write function 836 * 837 * @adev: amdgpu_device pointer 838 * @block: offset of instance 839 * @reg: offset of register 840 * @v: value to write to the register 841 * 842 * Dummy register read function. Used for register blocks 843 * that certain asics don't have (all asics). 844 */ 845 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 846 uint32_t block, 847 uint32_t reg, uint32_t v) 848 { 849 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 850 reg, block, v); 851 BUG(); 852 } 853 854 /** 855 * amdgpu_device_asic_init - Wrapper for atom asic_init 856 * 857 * @adev: amdgpu_device pointer 858 * 859 * Does any asic specific work and then calls atom asic init. 860 */ 861 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 862 { 863 amdgpu_asic_pre_asic_init(adev); 864 865 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 866 } 867 868 /** 869 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 870 * 871 * @adev: amdgpu_device pointer 872 * 873 * Allocates a scratch page of VRAM for use by various things in the 874 * driver. 875 */ 876 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 877 { 878 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 879 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 880 &adev->vram_scratch.robj, 881 &adev->vram_scratch.gpu_addr, 882 (void **)&adev->vram_scratch.ptr); 883 } 884 885 /** 886 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 887 * 888 * @adev: amdgpu_device pointer 889 * 890 * Frees the VRAM scratch page. 891 */ 892 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 893 { 894 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 895 } 896 897 /** 898 * amdgpu_device_program_register_sequence - program an array of registers. 899 * 900 * @adev: amdgpu_device pointer 901 * @registers: pointer to the register array 902 * @array_size: size of the register array 903 * 904 * Programs an array or registers with and and or masks. 905 * This is a helper for setting golden registers. 906 */ 907 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 908 const u32 *registers, 909 const u32 array_size) 910 { 911 u32 tmp, reg, and_mask, or_mask; 912 int i; 913 914 if (array_size % 3) 915 return; 916 917 for (i = 0; i < array_size; i +=3) { 918 reg = registers[i + 0]; 919 and_mask = registers[i + 1]; 920 or_mask = registers[i + 2]; 921 922 if (and_mask == 0xffffffff) { 923 tmp = or_mask; 924 } else { 925 tmp = RREG32(reg); 926 tmp &= ~and_mask; 927 if (adev->family >= AMDGPU_FAMILY_AI) 928 tmp |= (or_mask & and_mask); 929 else 930 tmp |= or_mask; 931 } 932 WREG32(reg, tmp); 933 } 934 } 935 936 /** 937 * amdgpu_device_pci_config_reset - reset the GPU 938 * 939 * @adev: amdgpu_device pointer 940 * 941 * Resets the GPU using the pci config reset sequence. 942 * Only applicable to asics prior to vega10. 
943 */ 944 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 945 { 946 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 947 } 948 949 /** 950 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 951 * 952 * @adev: amdgpu_device pointer 953 * 954 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 955 */ 956 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 957 { 958 return pci_reset_function(adev->pdev); 959 } 960 961 /* 962 * GPU doorbell aperture helpers function. 963 */ 964 /** 965 * amdgpu_device_doorbell_init - Init doorbell driver information. 966 * 967 * @adev: amdgpu_device pointer 968 * 969 * Init doorbell driver information (CIK) 970 * Returns 0 on success, error on failure. 971 */ 972 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 973 { 974 975 /* No doorbell on SI hardware generation */ 976 if (adev->asic_type < CHIP_BONAIRE) { 977 adev->doorbell.base = 0; 978 adev->doorbell.size = 0; 979 adev->doorbell.num_doorbells = 0; 980 adev->doorbell.ptr = NULL; 981 return 0; 982 } 983 984 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 985 return -EINVAL; 986 987 amdgpu_asic_init_doorbell_index(adev); 988 989 /* doorbell bar mapping */ 990 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 991 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 992 993 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32), 994 adev->doorbell_index.max_assignment+1); 995 if (adev->doorbell.num_doorbells == 0) 996 return -EINVAL; 997 998 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 999 * paging queue doorbell use the second page. The 1000 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 1001 * doorbells are in the first page. So with paging queue enabled, 1002 * the max num_doorbells should + 1 page (0x400 in dword) 1003 */ 1004 if (adev->asic_type >= CHIP_VEGA10) 1005 adev->doorbell.num_doorbells += 0x400; 1006 1007 adev->doorbell.ptr = ioremap(adev->doorbell.base, 1008 adev->doorbell.num_doorbells * 1009 sizeof(u32)); 1010 if (adev->doorbell.ptr == NULL) 1011 return -ENOMEM; 1012 1013 return 0; 1014 } 1015 1016 /** 1017 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 1018 * 1019 * @adev: amdgpu_device pointer 1020 * 1021 * Tear down doorbell driver information (CIK) 1022 */ 1023 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 1024 { 1025 iounmap(adev->doorbell.ptr); 1026 adev->doorbell.ptr = NULL; 1027 } 1028 1029 1030 1031 /* 1032 * amdgpu_device_wb_*() 1033 * Writeback is the method by which the GPU updates special pages in memory 1034 * with the status of certain GPU events (fences, ring pointers,etc.). 1035 */ 1036 1037 /** 1038 * amdgpu_device_wb_fini - Disable Writeback and free memory 1039 * 1040 * @adev: amdgpu_device pointer 1041 * 1042 * Disables Writeback and frees the Writeback memory (all asics). 1043 * Used at driver shutdown. 1044 */ 1045 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1046 { 1047 if (adev->wb.wb_obj) { 1048 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1049 &adev->wb.gpu_addr, 1050 (void **)&adev->wb.wb); 1051 adev->wb.wb_obj = NULL; 1052 } 1053 } 1054 1055 /** 1056 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory 1057 * 1058 * @adev: amdgpu_device pointer 1059 * 1060 * Initializes writeback and allocates writeback memory (all asics). 1061 * Used at driver startup. 1062 * Returns 0 on success or an -error on failure. 
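 *
 * Illustrative usage only (not part of the original kernel-doc): once the
 * writeback pool exists, a consumer typically pairs the helpers below as
 *
 *   u32 wb_idx;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb_idx)) {
 *       u64 wb_gpu_addr = adev->wb.gpu_addr + wb_idx * 4;
 *       volatile u32 *wb_cpu_ptr = &adev->wb.wb[wb_idx];
 *
 *       amdgpu_device_wb_free(adev, wb_idx);
 *   }
 *
 * where wb_gpu_addr is the GPU address the hardware writes status into and
 * wb_cpu_ptr is the CPU-visible mirror of the same slot.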
1063 */ 1064 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1065 { 1066 int r; 1067 1068 if (adev->wb.wb_obj == NULL) { 1069 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1070 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1071 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1072 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1073 (void **)&adev->wb.wb); 1074 if (r) { 1075 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1076 return r; 1077 } 1078 1079 adev->wb.num_wb = AMDGPU_MAX_WB; 1080 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1081 1082 /* clear wb memory */ 1083 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1084 } 1085 1086 return 0; 1087 } 1088 1089 /** 1090 * amdgpu_device_wb_get - Allocate a wb entry 1091 * 1092 * @adev: amdgpu_device pointer 1093 * @wb: wb index 1094 * 1095 * Allocate a wb slot for use by the driver (all asics). 1096 * Returns 0 on success or -EINVAL on failure. 1097 */ 1098 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1099 { 1100 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1101 1102 if (offset < adev->wb.num_wb) { 1103 __set_bit(offset, adev->wb.used); 1104 *wb = offset << 3; /* convert to dw offset */ 1105 return 0; 1106 } else { 1107 return -EINVAL; 1108 } 1109 } 1110 1111 /** 1112 * amdgpu_device_wb_free - Free a wb entry 1113 * 1114 * @adev: amdgpu_device pointer 1115 * @wb: wb index 1116 * 1117 * Free a wb slot allocated for use by the driver (all asics) 1118 */ 1119 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1120 { 1121 wb >>= 3; 1122 if (wb < adev->wb.num_wb) 1123 __clear_bit(wb, adev->wb.used); 1124 } 1125 1126 /** 1127 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1128 * 1129 * @adev: amdgpu_device pointer 1130 * 1131 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1132 * to fail, but if any of the BARs is not accessible after the size we abort 1133 * driver loading by returning -ENODEV. 1134 */ 1135 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1136 { 1137 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1138 struct pci_bus *root; 1139 struct resource *res; 1140 unsigned i; 1141 u16 cmd; 1142 int r; 1143 1144 /* Bypass for VF */ 1145 if (amdgpu_sriov_vf(adev)) 1146 return 0; 1147 1148 /* skip if the bios has already enabled large BAR */ 1149 if (adev->gmc.real_vram_size && 1150 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1151 return 0; 1152 1153 /* Check if the root BUS has 64bit memory resources */ 1154 root = adev->pdev->bus; 1155 while (root->parent) 1156 root = root->parent; 1157 1158 pci_bus_for_each_resource(root, res, i) { 1159 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1160 res->start > 0x100000000ull) 1161 break; 1162 } 1163 1164 /* Trying to resize is pointless without a root hub window above 4GB */ 1165 if (!res) 1166 return 0; 1167 1168 /* Limit the BAR size to what is available */ 1169 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1170 rbar_size); 1171 1172 /* Disable memory decoding while we change the BAR addresses and size */ 1173 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1174 pci_write_config_word(adev->pdev, PCI_COMMAND, 1175 cmd & ~PCI_COMMAND_MEMORY); 1176 1177 /* Free the VRAM and doorbell BAR, we most likely need to move both. 
*/ 1178 amdgpu_device_doorbell_fini(adev); 1179 if (adev->asic_type >= CHIP_BONAIRE) 1180 pci_release_resource(adev->pdev, 2); 1181 1182 pci_release_resource(adev->pdev, 0); 1183 1184 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1185 if (r == -ENOSPC) 1186 DRM_INFO("Not enough PCI address space for a large BAR."); 1187 else if (r && r != -ENOTSUPP) 1188 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1189 1190 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1191 1192 /* When the doorbell or fb BAR isn't available we have no chance of 1193 * using the device. 1194 */ 1195 r = amdgpu_device_doorbell_init(adev); 1196 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1197 return -ENODEV; 1198 1199 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1200 1201 return 0; 1202 } 1203 1204 /* 1205 * GPU helpers function. 1206 */ 1207 /** 1208 * amdgpu_device_need_post - check if the hw need post or not 1209 * 1210 * @adev: amdgpu_device pointer 1211 * 1212 * Check if the asic has been initialized (all asics) at driver startup 1213 * or post is needed if hw reset is performed. 1214 * Returns true if need or false if not. 1215 */ 1216 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1217 { 1218 uint32_t reg; 1219 1220 if (amdgpu_sriov_vf(adev)) 1221 return false; 1222 1223 if (amdgpu_passthrough(adev)) { 1224 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1225 * some old smc fw still need driver do vPost otherwise gpu hang, while 1226 * those smc fw version above 22.15 doesn't have this flaw, so we force 1227 * vpost executed for smc version below 22.15 1228 */ 1229 if (adev->asic_type == CHIP_FIJI) { 1230 int err; 1231 uint32_t fw_ver; 1232 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1233 /* force vPost if error occured */ 1234 if (err) 1235 return true; 1236 1237 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1238 if (fw_ver < 0x00160e00) 1239 return true; 1240 } 1241 } 1242 1243 /* Don't post if we need to reset whole hive on init */ 1244 if (adev->gmc.xgmi.pending_reset) 1245 return false; 1246 1247 if (adev->has_hw_reset) { 1248 adev->has_hw_reset = false; 1249 return true; 1250 } 1251 1252 /* bios scratch used on CIK+ */ 1253 if (adev->asic_type >= CHIP_BONAIRE) 1254 return amdgpu_atombios_scratch_need_asic_init(adev); 1255 1256 /* check MEM_SIZE for older asics */ 1257 reg = amdgpu_asic_get_config_memsize(adev); 1258 1259 if ((reg != 0) && (reg != 0xffffffff)) 1260 return false; 1261 1262 return true; 1263 } 1264 1265 /* if we get transitioned to only one device, take VGA back */ 1266 /** 1267 * amdgpu_device_vga_set_decode - enable/disable vga decode 1268 * 1269 * @pdev: PCI device pointer 1270 * @state: enable/disable vga decode 1271 * 1272 * Enable/disable vga decode (all asics). 1273 * Returns VGA resource flags. 1274 */ 1275 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1276 bool state) 1277 { 1278 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1279 amdgpu_asic_set_vga_state(adev, state); 1280 if (state) 1281 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1282 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1283 else 1284 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1285 } 1286 1287 /** 1288 * amdgpu_device_check_block_size - validate the vm block size 1289 * 1290 * @adev: amdgpu_device pointer 1291 * 1292 * Validates the vm block size specified via module parameter. 
1293 * The vm block size defines number of bits in page table versus page directory, 1294 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1295 * page table and the remaining bits are in the page directory. 1296 */ 1297 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1298 { 1299 /* defines number of bits in page table versus page directory, 1300 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1301 * page table and the remaining bits are in the page directory */ 1302 if (amdgpu_vm_block_size == -1) 1303 return; 1304 1305 if (amdgpu_vm_block_size < 9) { 1306 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1307 amdgpu_vm_block_size); 1308 amdgpu_vm_block_size = -1; 1309 } 1310 } 1311 1312 /** 1313 * amdgpu_device_check_vm_size - validate the vm size 1314 * 1315 * @adev: amdgpu_device pointer 1316 * 1317 * Validates the vm size in GB specified via module parameter. 1318 * The VM size is the size of the GPU virtual memory space in GB. 1319 */ 1320 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1321 { 1322 /* no need to check the default value */ 1323 if (amdgpu_vm_size == -1) 1324 return; 1325 1326 if (amdgpu_vm_size < 1) { 1327 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1328 amdgpu_vm_size); 1329 amdgpu_vm_size = -1; 1330 } 1331 } 1332 1333 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1334 { 1335 struct sysinfo si; 1336 bool is_os_64 = (sizeof(void *) == 8); 1337 uint64_t total_memory; 1338 uint64_t dram_size_seven_GB = 0x1B8000000; 1339 uint64_t dram_size_three_GB = 0xB8000000; 1340 1341 if (amdgpu_smu_memory_pool_size == 0) 1342 return; 1343 1344 if (!is_os_64) { 1345 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1346 goto def_value; 1347 } 1348 si_meminfo(&si); 1349 total_memory = (uint64_t)si.totalram * si.mem_unit; 1350 1351 if ((amdgpu_smu_memory_pool_size == 1) || 1352 (amdgpu_smu_memory_pool_size == 2)) { 1353 if (total_memory < dram_size_three_GB) 1354 goto def_value1; 1355 } else if ((amdgpu_smu_memory_pool_size == 4) || 1356 (amdgpu_smu_memory_pool_size == 8)) { 1357 if (total_memory < dram_size_seven_GB) 1358 goto def_value1; 1359 } else { 1360 DRM_WARN("Smu memory pool size not supported\n"); 1361 goto def_value; 1362 } 1363 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1364 1365 return; 1366 1367 def_value1: 1368 DRM_WARN("No enough system memory\n"); 1369 def_value: 1370 adev->pm.smu_prv_buffer_size = 0; 1371 } 1372 1373 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1374 { 1375 if (!(adev->flags & AMD_IS_APU) || 1376 adev->asic_type < CHIP_RAVEN) 1377 return 0; 1378 1379 switch (adev->asic_type) { 1380 case CHIP_RAVEN: 1381 if (adev->pdev->device == 0x15dd) 1382 adev->apu_flags |= AMD_APU_IS_RAVEN; 1383 if (adev->pdev->device == 0x15d8) 1384 adev->apu_flags |= AMD_APU_IS_PICASSO; 1385 break; 1386 case CHIP_RENOIR: 1387 if ((adev->pdev->device == 0x1636) || 1388 (adev->pdev->device == 0x164c)) 1389 adev->apu_flags |= AMD_APU_IS_RENOIR; 1390 else 1391 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1392 break; 1393 case CHIP_VANGOGH: 1394 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1395 break; 1396 case CHIP_YELLOW_CARP: 1397 break; 1398 default: 1399 return -EINVAL; 1400 } 1401 1402 return 0; 1403 } 1404 1405 /** 1406 * amdgpu_device_check_arguments - validate module params 1407 * 1408 * @adev: amdgpu_device pointer 1409 * 1410 * Validates certain module parameters and updates 1411 * the associated 
values used by the driver (all asics). 1412 */ 1413 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1414 { 1415 if (amdgpu_sched_jobs < 4) { 1416 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1417 amdgpu_sched_jobs); 1418 amdgpu_sched_jobs = 4; 1419 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1420 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1421 amdgpu_sched_jobs); 1422 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1423 } 1424 1425 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1426 /* gart size must be greater or equal to 32M */ 1427 dev_warn(adev->dev, "gart size (%d) too small\n", 1428 amdgpu_gart_size); 1429 amdgpu_gart_size = -1; 1430 } 1431 1432 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1433 /* gtt size must be greater or equal to 32M */ 1434 dev_warn(adev->dev, "gtt size (%d) too small\n", 1435 amdgpu_gtt_size); 1436 amdgpu_gtt_size = -1; 1437 } 1438 1439 /* valid range is between 4 and 9 inclusive */ 1440 if (amdgpu_vm_fragment_size != -1 && 1441 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1442 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1443 amdgpu_vm_fragment_size = -1; 1444 } 1445 1446 if (amdgpu_sched_hw_submission < 2) { 1447 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1448 amdgpu_sched_hw_submission); 1449 amdgpu_sched_hw_submission = 2; 1450 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1451 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1452 amdgpu_sched_hw_submission); 1453 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1454 } 1455 1456 amdgpu_device_check_smu_prv_buffer_size(adev); 1457 1458 amdgpu_device_check_vm_size(adev); 1459 1460 amdgpu_device_check_block_size(adev); 1461 1462 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1463 1464 amdgpu_gmc_tmz_set(adev); 1465 1466 amdgpu_gmc_noretry_set(adev); 1467 1468 return 0; 1469 } 1470 1471 /** 1472 * amdgpu_switcheroo_set_state - set switcheroo state 1473 * 1474 * @pdev: pci dev pointer 1475 * @state: vga_switcheroo state 1476 * 1477 * Callback for the switcheroo driver. Suspends or resumes the 1478 * the asics before or after it is powered up using ACPI methods. 
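 *
 * Illustrative note (not part of the original kernel-doc): this callback is
 * exercised through the amdgpu_switcheroo_ops table defined below, which the
 * driver registers elsewhere (not shown in this excerpt) roughly as
 *
 *   vga_switcheroo_register_client(adev->pdev, &amdgpu_switcheroo_ops, px);
 *
 * where px reflects amdgpu_device_supports_px() for the device.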
1479 */ 1480 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1481 enum vga_switcheroo_state state) 1482 { 1483 struct drm_device *dev = pci_get_drvdata(pdev); 1484 int r; 1485 1486 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1487 return; 1488 1489 if (state == VGA_SWITCHEROO_ON) { 1490 pr_info("switched on\n"); 1491 /* don't suspend or resume card normally */ 1492 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1493 1494 pci_set_power_state(pdev, PCI_D0); 1495 amdgpu_device_load_pci_state(pdev); 1496 r = pci_enable_device(pdev); 1497 if (r) 1498 DRM_WARN("pci_enable_device failed (%d)\n", r); 1499 amdgpu_device_resume(dev, true); 1500 1501 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1502 } else { 1503 pr_info("switched off\n"); 1504 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1505 amdgpu_device_suspend(dev, true); 1506 amdgpu_device_cache_pci_state(pdev); 1507 /* Shut down the device */ 1508 pci_disable_device(pdev); 1509 pci_set_power_state(pdev, PCI_D3cold); 1510 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1511 } 1512 } 1513 1514 /** 1515 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1516 * 1517 * @pdev: pci dev pointer 1518 * 1519 * Callback for the switcheroo driver. Check of the switcheroo 1520 * state can be changed. 1521 * Returns true if the state can be changed, false if not. 1522 */ 1523 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1524 { 1525 struct drm_device *dev = pci_get_drvdata(pdev); 1526 1527 /* 1528 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1529 * locking inversion with the driver load path. And the access here is 1530 * completely racy anyway. So don't bother with locking for now. 1531 */ 1532 return atomic_read(&dev->open_count) == 0; 1533 } 1534 1535 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1536 .set_gpu_state = amdgpu_switcheroo_set_state, 1537 .reprobe = NULL, 1538 .can_switch = amdgpu_switcheroo_can_switch, 1539 }; 1540 1541 /** 1542 * amdgpu_device_ip_set_clockgating_state - set the CG state 1543 * 1544 * @dev: amdgpu_device pointer 1545 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1546 * @state: clockgating state (gate or ungate) 1547 * 1548 * Sets the requested clockgating state for all instances of 1549 * the hardware IP specified. 1550 * Returns the error code from the last instance. 1551 */ 1552 int amdgpu_device_ip_set_clockgating_state(void *dev, 1553 enum amd_ip_block_type block_type, 1554 enum amd_clockgating_state state) 1555 { 1556 struct amdgpu_device *adev = dev; 1557 int i, r = 0; 1558 1559 for (i = 0; i < adev->num_ip_blocks; i++) { 1560 if (!adev->ip_blocks[i].status.valid) 1561 continue; 1562 if (adev->ip_blocks[i].version->type != block_type) 1563 continue; 1564 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1565 continue; 1566 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1567 (void *)adev, state); 1568 if (r) 1569 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1570 adev->ip_blocks[i].version->funcs->name, r); 1571 } 1572 return r; 1573 } 1574 1575 /** 1576 * amdgpu_device_ip_set_powergating_state - set the PG state 1577 * 1578 * @dev: amdgpu_device pointer 1579 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1580 * @state: powergating state (gate or ungate) 1581 * 1582 * Sets the requested powergating state for all instances of 1583 * the hardware IP specified. 1584 * Returns the error code from the last instance. 
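 *
 * Illustrative usage only (not part of the original kernel-doc): a
 * power-management path might gate a block with
 *
 *   amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_UVD,
 *                                          AMD_PG_STATE_GATE);
 *
 * mirroring amdgpu_device_ip_set_clockgating_state() above for clockgating.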
1585 */ 1586 int amdgpu_device_ip_set_powergating_state(void *dev, 1587 enum amd_ip_block_type block_type, 1588 enum amd_powergating_state state) 1589 { 1590 struct amdgpu_device *adev = dev; 1591 int i, r = 0; 1592 1593 for (i = 0; i < adev->num_ip_blocks; i++) { 1594 if (!adev->ip_blocks[i].status.valid) 1595 continue; 1596 if (adev->ip_blocks[i].version->type != block_type) 1597 continue; 1598 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1599 continue; 1600 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1601 (void *)adev, state); 1602 if (r) 1603 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1604 adev->ip_blocks[i].version->funcs->name, r); 1605 } 1606 return r; 1607 } 1608 1609 /** 1610 * amdgpu_device_ip_get_clockgating_state - get the CG state 1611 * 1612 * @adev: amdgpu_device pointer 1613 * @flags: clockgating feature flags 1614 * 1615 * Walks the list of IPs on the device and updates the clockgating 1616 * flags for each IP. 1617 * Updates @flags with the feature flags for each hardware IP where 1618 * clockgating is enabled. 1619 */ 1620 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1621 u32 *flags) 1622 { 1623 int i; 1624 1625 for (i = 0; i < adev->num_ip_blocks; i++) { 1626 if (!adev->ip_blocks[i].status.valid) 1627 continue; 1628 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1629 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1630 } 1631 } 1632 1633 /** 1634 * amdgpu_device_ip_wait_for_idle - wait for idle 1635 * 1636 * @adev: amdgpu_device pointer 1637 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1638 * 1639 * Waits for the request hardware IP to be idle. 1640 * Returns 0 for success or a negative error code on failure. 1641 */ 1642 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1643 enum amd_ip_block_type block_type) 1644 { 1645 int i, r; 1646 1647 for (i = 0; i < adev->num_ip_blocks; i++) { 1648 if (!adev->ip_blocks[i].status.valid) 1649 continue; 1650 if (adev->ip_blocks[i].version->type == block_type) { 1651 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1652 if (r) 1653 return r; 1654 break; 1655 } 1656 } 1657 return 0; 1658 1659 } 1660 1661 /** 1662 * amdgpu_device_ip_is_idle - is the hardware IP idle 1663 * 1664 * @adev: amdgpu_device pointer 1665 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1666 * 1667 * Check if the hardware IP is idle or not. 1668 * Returns true if it the IP is idle, false if not. 1669 */ 1670 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1671 enum amd_ip_block_type block_type) 1672 { 1673 int i; 1674 1675 for (i = 0; i < adev->num_ip_blocks; i++) { 1676 if (!adev->ip_blocks[i].status.valid) 1677 continue; 1678 if (adev->ip_blocks[i].version->type == block_type) 1679 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1680 } 1681 return true; 1682 1683 } 1684 1685 /** 1686 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1687 * 1688 * @adev: amdgpu_device pointer 1689 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1690 * 1691 * Returns a pointer to the hardware IP block structure 1692 * if it exists for the asic, otherwise NULL. 
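 *
 * Illustrative usage only (not part of the original kernel-doc):
 *
 *   struct amdgpu_ip_block *gfx =
 *       amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 * combined with amdgpu_device_ip_block_version_cmp(adev,
 * AMD_IP_BLOCK_TYPE_GFX, 9, 0) == 0 to detect GFX v9.0 or newer.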
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		  ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
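 *
 * Illustrative example only (derived from the parsing code below): the
 * string is a semicolon separated list of "<PCI address>,<crtc count>"
 * entries, or the keyword "all", e.g.
 *
 *   amdgpu.virtual_display=0000:01:00.0,2
 *
 * would enable two virtual crtcs on the device at 0000:01:00.0.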
1779 */ 1780 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1781 { 1782 adev->enable_virtual_display = false; 1783 1784 if (amdgpu_virtual_display) { 1785 const char *pci_address_name = pci_name(adev->pdev); 1786 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1787 1788 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1789 pciaddstr_tmp = pciaddstr; 1790 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1791 pciaddname = strsep(&pciaddname_tmp, ","); 1792 if (!strcmp("all", pciaddname) 1793 || !strcmp(pci_address_name, pciaddname)) { 1794 long num_crtc; 1795 int res = -1; 1796 1797 adev->enable_virtual_display = true; 1798 1799 if (pciaddname_tmp) 1800 res = kstrtol(pciaddname_tmp, 10, 1801 &num_crtc); 1802 1803 if (!res) { 1804 if (num_crtc < 1) 1805 num_crtc = 1; 1806 if (num_crtc > 6) 1807 num_crtc = 6; 1808 adev->mode_info.num_crtc = num_crtc; 1809 } else { 1810 adev->mode_info.num_crtc = 1; 1811 } 1812 break; 1813 } 1814 } 1815 1816 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1817 amdgpu_virtual_display, pci_address_name, 1818 adev->enable_virtual_display, adev->mode_info.num_crtc); 1819 1820 kfree(pciaddstr); 1821 } 1822 } 1823 1824 /** 1825 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1826 * 1827 * @adev: amdgpu_device pointer 1828 * 1829 * Parses the asic configuration parameters specified in the gpu info 1830 * firmware and makes them availale to the driver for use in configuring 1831 * the asic. 1832 * Returns 0 on success, -EINVAL on failure. 1833 */ 1834 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1835 { 1836 const char *chip_name; 1837 char fw_name[40]; 1838 int err; 1839 const struct gpu_info_firmware_header_v1_0 *hdr; 1840 1841 adev->firmware.gpu_info_fw = NULL; 1842 1843 if (adev->mman.discovery_bin) { 1844 amdgpu_discovery_get_gfx_info(adev); 1845 1846 /* 1847 * FIXME: The bounding box is still needed by Navi12, so 1848 * temporarily read it from gpu_info firmware. Should be droped 1849 * when DAL no longer needs it. 
1850 */ 1851 if (adev->asic_type != CHIP_NAVI12) 1852 return 0; 1853 } 1854 1855 switch (adev->asic_type) { 1856 #ifdef CONFIG_DRM_AMDGPU_SI 1857 case CHIP_VERDE: 1858 case CHIP_TAHITI: 1859 case CHIP_PITCAIRN: 1860 case CHIP_OLAND: 1861 case CHIP_HAINAN: 1862 #endif 1863 #ifdef CONFIG_DRM_AMDGPU_CIK 1864 case CHIP_BONAIRE: 1865 case CHIP_HAWAII: 1866 case CHIP_KAVERI: 1867 case CHIP_KABINI: 1868 case CHIP_MULLINS: 1869 #endif 1870 case CHIP_TOPAZ: 1871 case CHIP_TONGA: 1872 case CHIP_FIJI: 1873 case CHIP_POLARIS10: 1874 case CHIP_POLARIS11: 1875 case CHIP_POLARIS12: 1876 case CHIP_VEGAM: 1877 case CHIP_CARRIZO: 1878 case CHIP_STONEY: 1879 case CHIP_VEGA20: 1880 case CHIP_ALDEBARAN: 1881 case CHIP_SIENNA_CICHLID: 1882 case CHIP_NAVY_FLOUNDER: 1883 case CHIP_DIMGREY_CAVEFISH: 1884 case CHIP_BEIGE_GOBY: 1885 default: 1886 return 0; 1887 case CHIP_VEGA10: 1888 chip_name = "vega10"; 1889 break; 1890 case CHIP_VEGA12: 1891 chip_name = "vega12"; 1892 break; 1893 case CHIP_RAVEN: 1894 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1895 chip_name = "raven2"; 1896 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1897 chip_name = "picasso"; 1898 else 1899 chip_name = "raven"; 1900 break; 1901 case CHIP_ARCTURUS: 1902 chip_name = "arcturus"; 1903 break; 1904 case CHIP_RENOIR: 1905 if (adev->apu_flags & AMD_APU_IS_RENOIR) 1906 chip_name = "renoir"; 1907 else 1908 chip_name = "green_sardine"; 1909 break; 1910 case CHIP_NAVI10: 1911 chip_name = "navi10"; 1912 break; 1913 case CHIP_NAVI14: 1914 chip_name = "navi14"; 1915 break; 1916 case CHIP_NAVI12: 1917 chip_name = "navi12"; 1918 break; 1919 case CHIP_VANGOGH: 1920 chip_name = "vangogh"; 1921 break; 1922 case CHIP_YELLOW_CARP: 1923 chip_name = "yellow_carp"; 1924 break; 1925 } 1926 1927 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1928 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 1929 if (err) { 1930 dev_err(adev->dev, 1931 "Failed to load gpu_info firmware \"%s\"\n", 1932 fw_name); 1933 goto out; 1934 } 1935 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 1936 if (err) { 1937 dev_err(adev->dev, 1938 "Failed to validate gpu_info firmware \"%s\"\n", 1939 fw_name); 1940 goto out; 1941 } 1942 1943 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1944 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1945 1946 switch (hdr->version_major) { 1947 case 1: 1948 { 1949 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1950 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1951 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1952 1953 /* 1954 * Should be droped when DAL no longer needs it. 
1955 */ 1956 if (adev->asic_type == CHIP_NAVI12) 1957 goto parse_soc_bounding_box; 1958 1959 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1960 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1961 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1962 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1963 adev->gfx.config.max_texture_channel_caches = 1964 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1965 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1966 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1967 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1968 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1969 adev->gfx.config.double_offchip_lds_buf = 1970 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1971 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1972 adev->gfx.cu_info.max_waves_per_simd = 1973 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 1974 adev->gfx.cu_info.max_scratch_slots_per_cu = 1975 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 1976 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 1977 if (hdr->version_minor >= 1) { 1978 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 1979 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 1980 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1981 adev->gfx.config.num_sc_per_sh = 1982 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 1983 adev->gfx.config.num_packer_per_sc = 1984 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 1985 } 1986 1987 parse_soc_bounding_box: 1988 /* 1989 * soc bounding box info is not integrated into the discovery table, so 1990 * we always need to parse it from the gpu_info firmware when it is needed. 1991 */ 1992 if (hdr->version_minor == 2) { 1993 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 1994 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 1995 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1996 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 1997 } 1998 break; 1999 } 2000 default: 2001 dev_err(adev->dev, 2002 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2003 err = -EINVAL; 2004 goto out; 2005 } 2006 out: 2007 return err; 2008 } 2009 2010 /** 2011 * amdgpu_device_ip_early_init - run early init for hardware IPs 2012 * 2013 * @adev: amdgpu_device pointer 2014 * 2015 * Early initialization pass for hardware IPs. The hardware IPs that make 2016 up each asic are discovered and each IP's early_init callback is run. This 2017 is the first stage in initializing the asic. 2018 Returns 0 on success, negative error code on failure.
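 *
 * Usage sketch (it mirrors the call site in amdgpu_device_init() later in
 * this file and is shown only to illustrate the expected error handling,
 * not as an additional API):
 *
 *   r = amdgpu_device_ip_early_init(adev);
 *   if (r)
 *           goto failed_unmap;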
2019 */ 2020 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2021 { 2022 int i, r; 2023 2024 amdgpu_device_enable_virtual_display(adev); 2025 2026 if (amdgpu_sriov_vf(adev)) { 2027 r = amdgpu_virt_request_full_gpu(adev, true); 2028 if (r) 2029 return r; 2030 } 2031 2032 switch (adev->asic_type) { 2033 #ifdef CONFIG_DRM_AMDGPU_SI 2034 case CHIP_VERDE: 2035 case CHIP_TAHITI: 2036 case CHIP_PITCAIRN: 2037 case CHIP_OLAND: 2038 case CHIP_HAINAN: 2039 adev->family = AMDGPU_FAMILY_SI; 2040 r = si_set_ip_blocks(adev); 2041 if (r) 2042 return r; 2043 break; 2044 #endif 2045 #ifdef CONFIG_DRM_AMDGPU_CIK 2046 case CHIP_BONAIRE: 2047 case CHIP_HAWAII: 2048 case CHIP_KAVERI: 2049 case CHIP_KABINI: 2050 case CHIP_MULLINS: 2051 if (adev->flags & AMD_IS_APU) 2052 adev->family = AMDGPU_FAMILY_KV; 2053 else 2054 adev->family = AMDGPU_FAMILY_CI; 2055 2056 r = cik_set_ip_blocks(adev); 2057 if (r) 2058 return r; 2059 break; 2060 #endif 2061 case CHIP_TOPAZ: 2062 case CHIP_TONGA: 2063 case CHIP_FIJI: 2064 case CHIP_POLARIS10: 2065 case CHIP_POLARIS11: 2066 case CHIP_POLARIS12: 2067 case CHIP_VEGAM: 2068 case CHIP_CARRIZO: 2069 case CHIP_STONEY: 2070 if (adev->flags & AMD_IS_APU) 2071 adev->family = AMDGPU_FAMILY_CZ; 2072 else 2073 adev->family = AMDGPU_FAMILY_VI; 2074 2075 r = vi_set_ip_blocks(adev); 2076 if (r) 2077 return r; 2078 break; 2079 case CHIP_VEGA10: 2080 case CHIP_VEGA12: 2081 case CHIP_VEGA20: 2082 case CHIP_RAVEN: 2083 case CHIP_ARCTURUS: 2084 case CHIP_RENOIR: 2085 case CHIP_ALDEBARAN: 2086 if (adev->flags & AMD_IS_APU) 2087 adev->family = AMDGPU_FAMILY_RV; 2088 else 2089 adev->family = AMDGPU_FAMILY_AI; 2090 2091 r = soc15_set_ip_blocks(adev); 2092 if (r) 2093 return r; 2094 break; 2095 case CHIP_NAVI10: 2096 case CHIP_NAVI14: 2097 case CHIP_NAVI12: 2098 case CHIP_SIENNA_CICHLID: 2099 case CHIP_NAVY_FLOUNDER: 2100 case CHIP_DIMGREY_CAVEFISH: 2101 case CHIP_BEIGE_GOBY: 2102 case CHIP_VANGOGH: 2103 case CHIP_YELLOW_CARP: 2104 if (adev->asic_type == CHIP_VANGOGH) 2105 adev->family = AMDGPU_FAMILY_VGH; 2106 else if (adev->asic_type == CHIP_YELLOW_CARP) 2107 adev->family = AMDGPU_FAMILY_YC; 2108 else 2109 adev->family = AMDGPU_FAMILY_NV; 2110 2111 r = nv_set_ip_blocks(adev); 2112 if (r) 2113 return r; 2114 break; 2115 default: 2116 /* FIXME: not supported yet */ 2117 return -EINVAL; 2118 } 2119 2120 amdgpu_amdkfd_device_probe(adev); 2121 2122 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2123 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2124 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2125 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2126 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2127 2128 for (i = 0; i < adev->num_ip_blocks; i++) { 2129 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2130 DRM_ERROR("disabled ip block: %d <%s>\n", 2131 i, adev->ip_blocks[i].version->funcs->name); 2132 adev->ip_blocks[i].status.valid = false; 2133 } else { 2134 if (adev->ip_blocks[i].version->funcs->early_init) { 2135 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2136 if (r == -ENOENT) { 2137 adev->ip_blocks[i].status.valid = false; 2138 } else if (r) { 2139 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2140 adev->ip_blocks[i].version->funcs->name, r); 2141 return r; 2142 } else { 2143 adev->ip_blocks[i].status.valid = true; 2144 } 2145 } else { 2146 adev->ip_blocks[i].status.valid = true; 2147 } 2148 } 2149 /* get the vbios after the asic_funcs are set up */ 2150 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) 
{ 2151 r = amdgpu_device_parse_gpu_info_fw(adev); 2152 if (r) 2153 return r; 2154 2155 /* Read BIOS */ 2156 if (!amdgpu_get_bios(adev)) 2157 return -EINVAL; 2158 2159 r = amdgpu_atombios_init(adev); 2160 if (r) { 2161 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2162 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2163 return r; 2164 } 2165 2166 /*get pf2vf msg info at it's earliest time*/ 2167 if (amdgpu_sriov_vf(adev)) 2168 amdgpu_virt_init_data_exchange(adev); 2169 2170 } 2171 } 2172 2173 adev->cg_flags &= amdgpu_cg_mask; 2174 adev->pg_flags &= amdgpu_pg_mask; 2175 2176 return 0; 2177 } 2178 2179 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2180 { 2181 int i, r; 2182 2183 for (i = 0; i < adev->num_ip_blocks; i++) { 2184 if (!adev->ip_blocks[i].status.sw) 2185 continue; 2186 if (adev->ip_blocks[i].status.hw) 2187 continue; 2188 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2189 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2190 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2191 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2192 if (r) { 2193 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2194 adev->ip_blocks[i].version->funcs->name, r); 2195 return r; 2196 } 2197 adev->ip_blocks[i].status.hw = true; 2198 } 2199 } 2200 2201 return 0; 2202 } 2203 2204 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2205 { 2206 int i, r; 2207 2208 for (i = 0; i < adev->num_ip_blocks; i++) { 2209 if (!adev->ip_blocks[i].status.sw) 2210 continue; 2211 if (adev->ip_blocks[i].status.hw) 2212 continue; 2213 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2214 if (r) { 2215 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2216 adev->ip_blocks[i].version->funcs->name, r); 2217 return r; 2218 } 2219 adev->ip_blocks[i].status.hw = true; 2220 } 2221 2222 return 0; 2223 } 2224 2225 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2226 { 2227 int r = 0; 2228 int i; 2229 uint32_t smu_version; 2230 2231 if (adev->asic_type >= CHIP_VEGA10) { 2232 for (i = 0; i < adev->num_ip_blocks; i++) { 2233 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2234 continue; 2235 2236 if (!adev->ip_blocks[i].status.sw) 2237 continue; 2238 2239 /* no need to do the fw loading again if already done*/ 2240 if (adev->ip_blocks[i].status.hw == true) 2241 break; 2242 2243 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2244 r = adev->ip_blocks[i].version->funcs->resume(adev); 2245 if (r) { 2246 DRM_ERROR("resume of IP block <%s> failed %d\n", 2247 adev->ip_blocks[i].version->funcs->name, r); 2248 return r; 2249 } 2250 } else { 2251 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2252 if (r) { 2253 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2254 adev->ip_blocks[i].version->funcs->name, r); 2255 return r; 2256 } 2257 } 2258 2259 adev->ip_blocks[i].status.hw = true; 2260 break; 2261 } 2262 } 2263 2264 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2265 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2266 2267 return r; 2268 } 2269 2270 /** 2271 * amdgpu_device_ip_init - run init for hardware IPs 2272 * 2273 * @adev: amdgpu_device pointer 2274 * 2275 * Main initialization pass for hardware IPs. The list of all the hardware 2276 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2277 * are run. 
sw_init initializes the software state associated with each IP 2278 * and hw_init initializes the hardware associated with each IP. 2279 * Returns 0 on success, negative error code on failure. 2280 */ 2281 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2282 { 2283 int i, r; 2284 2285 r = amdgpu_ras_init(adev); 2286 if (r) 2287 return r; 2288 2289 for (i = 0; i < adev->num_ip_blocks; i++) { 2290 if (!adev->ip_blocks[i].status.valid) 2291 continue; 2292 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2293 if (r) { 2294 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2295 adev->ip_blocks[i].version->funcs->name, r); 2296 goto init_failed; 2297 } 2298 adev->ip_blocks[i].status.sw = true; 2299 2300 /* need to do gmc hw init early so we can allocate gpu mem */ 2301 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2302 r = amdgpu_device_vram_scratch_init(adev); 2303 if (r) { 2304 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2305 goto init_failed; 2306 } 2307 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2308 if (r) { 2309 DRM_ERROR("hw_init %d failed %d\n", i, r); 2310 goto init_failed; 2311 } 2312 r = amdgpu_device_wb_init(adev); 2313 if (r) { 2314 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2315 goto init_failed; 2316 } 2317 adev->ip_blocks[i].status.hw = true; 2318 2319 /* right after GMC hw init, we create CSA */ 2320 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2321 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2322 AMDGPU_GEM_DOMAIN_VRAM, 2323 AMDGPU_CSA_SIZE); 2324 if (r) { 2325 DRM_ERROR("allocate CSA failed %d\n", r); 2326 goto init_failed; 2327 } 2328 } 2329 } 2330 } 2331 2332 if (amdgpu_sriov_vf(adev)) 2333 amdgpu_virt_init_data_exchange(adev); 2334 2335 r = amdgpu_ib_pool_init(adev); 2336 if (r) { 2337 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2338 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2339 goto init_failed; 2340 } 2341 2342 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2343 if (r) 2344 goto init_failed; 2345 2346 r = amdgpu_device_ip_hw_init_phase1(adev); 2347 if (r) 2348 goto init_failed; 2349 2350 r = amdgpu_device_fw_loading(adev); 2351 if (r) 2352 goto init_failed; 2353 2354 r = amdgpu_device_ip_hw_init_phase2(adev); 2355 if (r) 2356 goto init_failed; 2357 2358 /* 2359 * retired pages will be loaded from eeprom and reserved here, 2360 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2361 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2362 * for I2C communication which only true at this point. 2363 * 2364 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2365 * failure from bad gpu situation and stop amdgpu init process 2366 * accordingly. For other failed cases, it will still release all 2367 * the resource and print error message, rather than returning one 2368 * negative value to upper level. 
2369 * 2370 * Note: theoretically, this should be called before all vram allocations 2371 * to protect retired page from abusing 2372 */ 2373 r = amdgpu_ras_recovery_init(adev); 2374 if (r) 2375 goto init_failed; 2376 2377 if (adev->gmc.xgmi.num_physical_nodes > 1) 2378 amdgpu_xgmi_add_device(adev); 2379 2380 /* Don't init kfd if whole hive need to be reset during init */ 2381 if (!adev->gmc.xgmi.pending_reset) 2382 amdgpu_amdkfd_device_init(adev); 2383 2384 amdgpu_fru_get_product_info(adev); 2385 2386 init_failed: 2387 if (amdgpu_sriov_vf(adev)) 2388 amdgpu_virt_release_full_gpu(adev, true); 2389 2390 return r; 2391 } 2392 2393 /** 2394 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2395 * 2396 * @adev: amdgpu_device pointer 2397 * 2398 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2399 * this function before a GPU reset. If the value is retained after a 2400 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2401 */ 2402 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2403 { 2404 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2405 } 2406 2407 /** 2408 * amdgpu_device_check_vram_lost - check if vram is valid 2409 * 2410 * @adev: amdgpu_device pointer 2411 * 2412 * Checks the reset magic value written to the gart pointer in VRAM. 2413 * The driver calls this after a GPU reset to see if the contents of 2414 * VRAM is lost or now. 2415 * returns true if vram is lost, false if not. 2416 */ 2417 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2418 { 2419 if (memcmp(adev->gart.ptr, adev->reset_magic, 2420 AMDGPU_RESET_MAGIC_NUM)) 2421 return true; 2422 2423 if (!amdgpu_in_reset(adev)) 2424 return false; 2425 2426 /* 2427 * For all ASICs with baco/mode1 reset, the VRAM is 2428 * always assumed to be lost. 2429 */ 2430 switch (amdgpu_asic_reset_method(adev)) { 2431 case AMD_RESET_METHOD_BACO: 2432 case AMD_RESET_METHOD_MODE1: 2433 return true; 2434 default: 2435 return false; 2436 } 2437 } 2438 2439 /** 2440 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2441 * 2442 * @adev: amdgpu_device pointer 2443 * @state: clockgating state (gate or ungate) 2444 * 2445 * The list of all the hardware IPs that make up the asic is walked and the 2446 * set_clockgating_state callbacks are run. 2447 * Late initialization pass enabling clockgating for hardware IPs. 2448 * Fini or suspend, pass disabling clockgating for hardware IPs. 2449 * Returns 0 on success, negative error code on failure. 2450 */ 2451 2452 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2453 enum amd_clockgating_state state) 2454 { 2455 int i, j, r; 2456 2457 if (amdgpu_emu_mode == 1) 2458 return 0; 2459 2460 for (j = 0; j < adev->num_ip_blocks; j++) { 2461 i = state == AMD_CG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 2462 if (!adev->ip_blocks[i].status.late_initialized) 2463 continue; 2464 /* skip CG for GFX on S0ix */ 2465 if (adev->in_s0ix && 2466 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2467 continue; 2468 /* skip CG for VCE/UVD, it's handled specially */ 2469 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2470 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2471 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2472 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2473 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2474 /* enable clockgating to save power */ 2475 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2476 state); 2477 if (r) { 2478 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2479 adev->ip_blocks[i].version->funcs->name, r); 2480 return r; 2481 } 2482 } 2483 } 2484 2485 return 0; 2486 } 2487 2488 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2489 enum amd_powergating_state state) 2490 { 2491 int i, j, r; 2492 2493 if (amdgpu_emu_mode == 1) 2494 return 0; 2495 2496 for (j = 0; j < adev->num_ip_blocks; j++) { 2497 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2498 if (!adev->ip_blocks[i].status.late_initialized) 2499 continue; 2500 /* skip PG for GFX on S0ix */ 2501 if (adev->in_s0ix && 2502 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2503 continue; 2504 /* skip CG for VCE/UVD, it's handled specially */ 2505 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2506 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2507 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2508 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2509 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2510 /* enable powergating to save power */ 2511 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2512 state); 2513 if (r) { 2514 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2515 adev->ip_blocks[i].version->funcs->name, r); 2516 return r; 2517 } 2518 } 2519 } 2520 return 0; 2521 } 2522 2523 static int amdgpu_device_enable_mgpu_fan_boost(void) 2524 { 2525 struct amdgpu_gpu_instance *gpu_ins; 2526 struct amdgpu_device *adev; 2527 int i, ret = 0; 2528 2529 mutex_lock(&mgpu_info.mutex); 2530 2531 /* 2532 * MGPU fan boost feature should be enabled 2533 * only when there are two or more dGPUs in 2534 * the system 2535 */ 2536 if (mgpu_info.num_dgpu < 2) 2537 goto out; 2538 2539 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2540 gpu_ins = &(mgpu_info.gpu_ins[i]); 2541 adev = gpu_ins->adev; 2542 if (!(adev->flags & AMD_IS_APU) && 2543 !gpu_ins->mgpu_fan_enabled) { 2544 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2545 if (ret) 2546 break; 2547 2548 gpu_ins->mgpu_fan_enabled = 1; 2549 } 2550 } 2551 2552 out: 2553 mutex_unlock(&mgpu_info.mutex); 2554 2555 return ret; 2556 } 2557 2558 /** 2559 * amdgpu_device_ip_late_init - run late init for hardware IPs 2560 * 2561 * @adev: amdgpu_device pointer 2562 * 2563 * Late initialization pass for hardware IPs. The list of all the hardware 2564 * IPs that make up the asic is walked and the late_init callbacks are run. 2565 * late_init covers any special initialization that an IP requires 2566 * after all of the have been initialized or something that needs to happen 2567 * late in the init process. 2568 * Returns 0 on success, negative error code on failure. 
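 *
 * Usage sketch (it mirrors the resume path later in this file; the probe
 * path in amdgpu_device_init() also aborts on failure, so this is
 * illustrative only):
 *
 *   r = amdgpu_device_ip_late_init(adev);
 *   if (r)
 *           return r;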
2569 */ 2570 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2571 { 2572 struct amdgpu_gpu_instance *gpu_instance; 2573 int i = 0, r; 2574 2575 for (i = 0; i < adev->num_ip_blocks; i++) { 2576 if (!adev->ip_blocks[i].status.hw) 2577 continue; 2578 if (adev->ip_blocks[i].version->funcs->late_init) { 2579 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2580 if (r) { 2581 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2582 adev->ip_blocks[i].version->funcs->name, r); 2583 return r; 2584 } 2585 } 2586 adev->ip_blocks[i].status.late_initialized = true; 2587 } 2588 2589 amdgpu_ras_set_error_query_ready(adev, true); 2590 2591 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2592 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2593 2594 amdgpu_device_fill_reset_magic(adev); 2595 2596 r = amdgpu_device_enable_mgpu_fan_boost(); 2597 if (r) 2598 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2599 2600 /* For XGMI + passthrough configuration on arcturus, enable light SBR */ 2601 if (adev->asic_type == CHIP_ARCTURUS && 2602 amdgpu_passthrough(adev) && 2603 adev->gmc.xgmi.num_physical_nodes > 1) 2604 smu_set_light_sbr(&adev->smu, true); 2605 2606 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2607 mutex_lock(&mgpu_info.mutex); 2608 2609 /* 2610 * Reset the device p-state to low, since it was booted with high. 2611 * 2612 * This should be performed only after all devices from the same 2613 * hive have been initialized. 2614 * 2615 * However, the number of devices in a hive is not known in advance; 2616 * it is counted one by one as the devices initialize. 2617 * 2618 * So we wait until all XGMI-interlinked devices have initialized. 2619 * This may add some delay, as those devices may come from 2620 * different hives, but that should be OK.
2621 */ 2622 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2623 for (i = 0; i < mgpu_info.num_gpu; i++) { 2624 gpu_instance = &(mgpu_info.gpu_ins[i]); 2625 if (gpu_instance->adev->flags & AMD_IS_APU) 2626 continue; 2627 2628 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2629 AMDGPU_XGMI_PSTATE_MIN); 2630 if (r) { 2631 DRM_ERROR("pstate setting failed (%d).\n", r); 2632 break; 2633 } 2634 } 2635 } 2636 2637 mutex_unlock(&mgpu_info.mutex); 2638 } 2639 2640 return 0; 2641 } 2642 2643 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2644 { 2645 int i, r; 2646 2647 for (i = 0; i < adev->num_ip_blocks; i++) { 2648 if (!adev->ip_blocks[i].version->funcs->early_fini) 2649 continue; 2650 2651 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2652 if (r) { 2653 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2654 adev->ip_blocks[i].version->funcs->name, r); 2655 } 2656 } 2657 2658 amdgpu_amdkfd_suspend(adev, false); 2659 2660 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2661 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2662 2663 /* need to disable SMC first */ 2664 for (i = 0; i < adev->num_ip_blocks; i++) { 2665 if (!adev->ip_blocks[i].status.hw) 2666 continue; 2667 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2668 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2669 /* XXX handle errors */ 2670 if (r) { 2671 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2672 adev->ip_blocks[i].version->funcs->name, r); 2673 } 2674 adev->ip_blocks[i].status.hw = false; 2675 break; 2676 } 2677 } 2678 2679 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2680 if (!adev->ip_blocks[i].status.hw) 2681 continue; 2682 2683 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2684 /* XXX handle errors */ 2685 if (r) { 2686 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2687 adev->ip_blocks[i].version->funcs->name, r); 2688 } 2689 2690 adev->ip_blocks[i].status.hw = false; 2691 } 2692 2693 return 0; 2694 } 2695 2696 /** 2697 * amdgpu_device_ip_fini - run fini for hardware IPs 2698 * 2699 * @adev: amdgpu_device pointer 2700 * 2701 * Main teardown pass for hardware IPs. The list of all the hardware 2702 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2703 * are run. hw_fini tears down the hardware associated with each IP 2704 * and sw_fini tears down any software state associated with each IP. 2705 * Returns 0 on success, negative error code on failure. 
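 *
 * Usage sketch (this is how amdgpu_device_fini_sw() later in this file
 * consumes it; illustrative only):
 *
 *   amdgpu_device_ip_fini(adev);
 *   amdgpu_fence_driver_fini_sw(adev);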
2706 */ 2707 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2708 { 2709 int i, r; 2710 2711 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2712 amdgpu_virt_release_ras_err_handler_data(adev); 2713 2714 amdgpu_ras_pre_fini(adev); 2715 2716 if (adev->gmc.xgmi.num_physical_nodes > 1) 2717 amdgpu_xgmi_remove_device(adev); 2718 2719 amdgpu_amdkfd_device_fini_sw(adev); 2720 2721 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2722 if (!adev->ip_blocks[i].status.sw) 2723 continue; 2724 2725 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2726 amdgpu_ucode_free_bo(adev); 2727 amdgpu_free_static_csa(&adev->virt.csa_obj); 2728 amdgpu_device_wb_fini(adev); 2729 amdgpu_device_vram_scratch_fini(adev); 2730 amdgpu_ib_pool_fini(adev); 2731 } 2732 2733 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2734 /* XXX handle errors */ 2735 if (r) { 2736 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2737 adev->ip_blocks[i].version->funcs->name, r); 2738 } 2739 adev->ip_blocks[i].status.sw = false; 2740 adev->ip_blocks[i].status.valid = false; 2741 } 2742 2743 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2744 if (!adev->ip_blocks[i].status.late_initialized) 2745 continue; 2746 if (adev->ip_blocks[i].version->funcs->late_fini) 2747 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2748 adev->ip_blocks[i].status.late_initialized = false; 2749 } 2750 2751 amdgpu_ras_fini(adev); 2752 2753 if (amdgpu_sriov_vf(adev)) 2754 if (amdgpu_virt_release_full_gpu(adev, false)) 2755 DRM_ERROR("failed to release exclusive mode on fini\n"); 2756 2757 return 0; 2758 } 2759 2760 /** 2761 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2762 * 2763 * @work: work_struct. 2764 */ 2765 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2766 { 2767 struct amdgpu_device *adev = 2768 container_of(work, struct amdgpu_device, delayed_init_work.work); 2769 int r; 2770 2771 r = amdgpu_ib_ring_tests(adev); 2772 if (r) 2773 DRM_ERROR("ib ring test failed (%d).\n", r); 2774 } 2775 2776 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2777 { 2778 struct amdgpu_device *adev = 2779 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2780 2781 mutex_lock(&adev->gfx.gfx_off_mutex); 2782 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2783 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2784 adev->gfx.gfx_off_state = true; 2785 } 2786 mutex_unlock(&adev->gfx.gfx_off_mutex); 2787 } 2788 2789 /** 2790 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2791 * 2792 * @adev: amdgpu_device pointer 2793 * 2794 * Main suspend function for hardware IPs. The list of all the hardware 2795 * IPs that make up the asic is walked, clockgating is disabled and the 2796 * suspend callbacks are run. suspend puts the hardware and software state 2797 * in each IP into a state suitable for suspend. 2798 * Returns 0 on success, negative error code on failure. 
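 *
 * Abridged ordering sketch, based on amdgpu_device_suspend() later in this
 * file (illustrative only, some steps omitted). Phase 1 only touches the
 * display (DCE) blocks; phase 2 handles everything else:
 *
 *   amdgpu_device_ip_suspend_phase1(adev);
 *   amdgpu_bo_evict_vram(adev);
 *   amdgpu_fence_driver_suspend(adev);
 *   amdgpu_device_ip_suspend_phase2(adev);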
2799 */ 2800 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2801 { 2802 int i, r; 2803 2804 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2805 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2806 2807 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2808 if (!adev->ip_blocks[i].status.valid) 2809 continue; 2810 2811 /* displays are handled separately */ 2812 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2813 continue; 2814 2815 /* XXX handle errors */ 2816 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2817 /* XXX handle errors */ 2818 if (r) { 2819 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2820 adev->ip_blocks[i].version->funcs->name, r); 2821 return r; 2822 } 2823 2824 adev->ip_blocks[i].status.hw = false; 2825 } 2826 2827 return 0; 2828 } 2829 2830 /** 2831 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2832 * 2833 * @adev: amdgpu_device pointer 2834 * 2835 * Main suspend function for hardware IPs. The list of all the hardware 2836 * IPs that make up the asic is walked, clockgating is disabled and the 2837 * suspend callbacks are run. suspend puts the hardware and software state 2838 * in each IP into a state suitable for suspend. 2839 * Returns 0 on success, negative error code on failure. 2840 */ 2841 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2842 { 2843 int i, r; 2844 2845 if (adev->in_s0ix) 2846 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry); 2847 2848 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2849 if (!adev->ip_blocks[i].status.valid) 2850 continue; 2851 /* displays are handled in phase1 */ 2852 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2853 continue; 2854 /* PSP lost connection when err_event_athub occurs */ 2855 if (amdgpu_ras_intr_triggered() && 2856 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2857 adev->ip_blocks[i].status.hw = false; 2858 continue; 2859 } 2860 2861 /* skip unnecessary suspend if we do not initialize them yet */ 2862 if (adev->gmc.xgmi.pending_reset && 2863 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2864 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2865 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2866 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 2867 adev->ip_blocks[i].status.hw = false; 2868 continue; 2869 } 2870 2871 /* skip suspend of gfx and psp for S0ix 2872 * gfx is in gfxoff state, so on resume it will exit gfxoff just 2873 * like at runtime. PSP is also part of the always on hardware 2874 * so no need to suspend it. 
2875 */ 2876 if (adev->in_s0ix && 2877 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 2878 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) 2879 continue; 2880 2881 /* XXX handle errors */ 2882 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2883 /* XXX handle errors */ 2884 if (r) { 2885 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2886 adev->ip_blocks[i].version->funcs->name, r); 2887 } 2888 adev->ip_blocks[i].status.hw = false; 2889 /* handle putting the SMC in the appropriate state */ 2890 if(!amdgpu_sriov_vf(adev)){ 2891 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2892 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2893 if (r) { 2894 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2895 adev->mp1_state, r); 2896 return r; 2897 } 2898 } 2899 } 2900 } 2901 2902 return 0; 2903 } 2904 2905 /** 2906 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2907 * 2908 * @adev: amdgpu_device pointer 2909 * 2910 * Main suspend function for hardware IPs. The list of all the hardware 2911 * IPs that make up the asic is walked, clockgating is disabled and the 2912 * suspend callbacks are run. suspend puts the hardware and software state 2913 * in each IP into a state suitable for suspend. 2914 * Returns 0 on success, negative error code on failure. 2915 */ 2916 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2917 { 2918 int r; 2919 2920 if (amdgpu_sriov_vf(adev)) { 2921 amdgpu_virt_fini_data_exchange(adev); 2922 amdgpu_virt_request_full_gpu(adev, false); 2923 } 2924 2925 r = amdgpu_device_ip_suspend_phase1(adev); 2926 if (r) 2927 return r; 2928 r = amdgpu_device_ip_suspend_phase2(adev); 2929 2930 if (amdgpu_sriov_vf(adev)) 2931 amdgpu_virt_release_full_gpu(adev, false); 2932 2933 return r; 2934 } 2935 2936 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2937 { 2938 int i, r; 2939 2940 static enum amd_ip_block_type ip_order[] = { 2941 AMD_IP_BLOCK_TYPE_GMC, 2942 AMD_IP_BLOCK_TYPE_COMMON, 2943 AMD_IP_BLOCK_TYPE_PSP, 2944 AMD_IP_BLOCK_TYPE_IH, 2945 }; 2946 2947 for (i = 0; i < adev->num_ip_blocks; i++) { 2948 int j; 2949 struct amdgpu_ip_block *block; 2950 2951 block = &adev->ip_blocks[i]; 2952 block->status.hw = false; 2953 2954 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 2955 2956 if (block->version->type != ip_order[j] || 2957 !block->status.valid) 2958 continue; 2959 2960 r = block->version->funcs->hw_init(adev); 2961 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2962 if (r) 2963 return r; 2964 block->status.hw = true; 2965 } 2966 } 2967 2968 return 0; 2969 } 2970 2971 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2972 { 2973 int i, r; 2974 2975 static enum amd_ip_block_type ip_order[] = { 2976 AMD_IP_BLOCK_TYPE_SMC, 2977 AMD_IP_BLOCK_TYPE_DCE, 2978 AMD_IP_BLOCK_TYPE_GFX, 2979 AMD_IP_BLOCK_TYPE_SDMA, 2980 AMD_IP_BLOCK_TYPE_UVD, 2981 AMD_IP_BLOCK_TYPE_VCE, 2982 AMD_IP_BLOCK_TYPE_VCN 2983 }; 2984 2985 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2986 int j; 2987 struct amdgpu_ip_block *block; 2988 2989 for (j = 0; j < adev->num_ip_blocks; j++) { 2990 block = &adev->ip_blocks[j]; 2991 2992 if (block->version->type != ip_order[i] || 2993 !block->status.valid || 2994 block->status.hw) 2995 continue; 2996 2997 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 2998 r = block->version->funcs->resume(adev); 2999 else 3000 r = block->version->funcs->hw_init(adev); 3001 3002 DRM_INFO("RE-INIT-late: %s %s\n", 
block->version->funcs->name, r?"failed":"succeeded"); 3003 if (r) 3004 return r; 3005 block->status.hw = true; 3006 } 3007 } 3008 3009 return 0; 3010 } 3011 3012 /** 3013 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3014 * 3015 * @adev: amdgpu_device pointer 3016 * 3017 * First resume function for hardware IPs. The list of all the hardware 3018 * IPs that make up the asic is walked and the resume callbacks are run for 3019 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3020 * after a suspend and updates the software state as necessary. This 3021 * function is also used for restoring the GPU after a GPU reset. 3022 * Returns 0 on success, negative error code on failure. 3023 */ 3024 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3025 { 3026 int i, r; 3027 3028 for (i = 0; i < adev->num_ip_blocks; i++) { 3029 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3030 continue; 3031 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3032 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3033 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 3034 3035 r = adev->ip_blocks[i].version->funcs->resume(adev); 3036 if (r) { 3037 DRM_ERROR("resume of IP block <%s> failed %d\n", 3038 adev->ip_blocks[i].version->funcs->name, r); 3039 return r; 3040 } 3041 adev->ip_blocks[i].status.hw = true; 3042 } 3043 } 3044 3045 return 0; 3046 } 3047 3048 /** 3049 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3050 * 3051 * @adev: amdgpu_device pointer 3052 * 3053 * Second resume function for hardware IPs. The list of all the hardware 3054 * IPs that make up the asic is walked and the resume callbacks are run for 3055 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3056 * functional state after a suspend and updates the software state as 3057 * necessary. This function is also used for restoring the GPU after a GPU 3058 * reset. 3059 * Returns 0 on success, negative error code on failure. 3060 */ 3061 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3062 { 3063 int i, r; 3064 3065 for (i = 0; i < adev->num_ip_blocks; i++) { 3066 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3067 continue; 3068 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3069 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3070 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3071 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3072 continue; 3073 r = adev->ip_blocks[i].version->funcs->resume(adev); 3074 if (r) { 3075 DRM_ERROR("resume of IP block <%s> failed %d\n", 3076 adev->ip_blocks[i].version->funcs->name, r); 3077 return r; 3078 } 3079 adev->ip_blocks[i].status.hw = true; 3080 } 3081 3082 return 0; 3083 } 3084 3085 /** 3086 * amdgpu_device_ip_resume - run resume for hardware IPs 3087 * 3088 * @adev: amdgpu_device pointer 3089 * 3090 * Main resume function for hardware IPs. The hardware IPs 3091 * are split into two resume functions because they are 3092 * also used in recovering from a GPU reset, and some additional 3093 * steps need to be taken between them. In this case (S3/S4) they are 3094 * run sequentially. 3095 * Returns 0 on success, negative error code on failure.
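 *
 * Usage sketch (it mirrors amdgpu_device_resume() later in this file;
 * illustrative only):
 *
 *   r = amdgpu_device_ip_resume(adev);
 *   if (r) {
 *           dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
 *           return r;
 *   }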
3096 */ 3097 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3098 { 3099 int r; 3100 3101 r = amdgpu_device_ip_resume_phase1(adev); 3102 if (r) 3103 return r; 3104 3105 r = amdgpu_device_fw_loading(adev); 3106 if (r) 3107 return r; 3108 3109 r = amdgpu_device_ip_resume_phase2(adev); 3110 3111 return r; 3112 } 3113 3114 /** 3115 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3116 * 3117 * @adev: amdgpu_device pointer 3118 * 3119 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3120 */ 3121 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3122 { 3123 if (amdgpu_sriov_vf(adev)) { 3124 if (adev->is_atom_fw) { 3125 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3126 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3127 } else { 3128 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3129 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3130 } 3131 3132 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3133 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3134 } 3135 } 3136 3137 /** 3138 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3139 * 3140 * @asic_type: AMD asic type 3141 * 3142 * Check if there is DC (new modesetting infrastructre) support for an asic. 3143 * returns true if DC has support, false if not. 3144 */ 3145 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3146 { 3147 switch (asic_type) { 3148 #if defined(CONFIG_DRM_AMD_DC) 3149 #if defined(CONFIG_DRM_AMD_DC_SI) 3150 case CHIP_TAHITI: 3151 case CHIP_PITCAIRN: 3152 case CHIP_VERDE: 3153 case CHIP_OLAND: 3154 #endif 3155 case CHIP_BONAIRE: 3156 case CHIP_KAVERI: 3157 case CHIP_KABINI: 3158 case CHIP_MULLINS: 3159 /* 3160 * We have systems in the wild with these ASICs that require 3161 * LVDS and VGA support which is not supported with DC. 3162 * 3163 * Fallback to the non-DC driver here by default so as not to 3164 * cause regressions. 
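 *
 * In practice (an interpretation of the check below, not a behaviour
 * change): with the default auto setting these chips stay on the legacy
 * display path, and DC is only used when it is explicitly requested,
 * e.g. by booting with amdgpu.dc=1.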
3165 */ 3166 return amdgpu_dc > 0; 3167 case CHIP_HAWAII: 3168 case CHIP_CARRIZO: 3169 case CHIP_STONEY: 3170 case CHIP_POLARIS10: 3171 case CHIP_POLARIS11: 3172 case CHIP_POLARIS12: 3173 case CHIP_VEGAM: 3174 case CHIP_TONGA: 3175 case CHIP_FIJI: 3176 case CHIP_VEGA10: 3177 case CHIP_VEGA12: 3178 case CHIP_VEGA20: 3179 #if defined(CONFIG_DRM_AMD_DC_DCN) 3180 case CHIP_RAVEN: 3181 case CHIP_NAVI10: 3182 case CHIP_NAVI14: 3183 case CHIP_NAVI12: 3184 case CHIP_RENOIR: 3185 case CHIP_SIENNA_CICHLID: 3186 case CHIP_NAVY_FLOUNDER: 3187 case CHIP_DIMGREY_CAVEFISH: 3188 case CHIP_BEIGE_GOBY: 3189 case CHIP_VANGOGH: 3190 case CHIP_YELLOW_CARP: 3191 #endif 3192 return amdgpu_dc != 0; 3193 #endif 3194 default: 3195 if (amdgpu_dc > 0) 3196 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3197 "but isn't supported by ASIC, ignoring\n"); 3198 return false; 3199 } 3200 } 3201 3202 /** 3203 * amdgpu_device_has_dc_support - check if dc is supported 3204 * 3205 * @adev: amdgpu_device pointer 3206 * 3207 * Returns true for supported, false for not supported 3208 */ 3209 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3210 { 3211 if (amdgpu_sriov_vf(adev) || 3212 adev->enable_virtual_display || 3213 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3214 return false; 3215 3216 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3217 } 3218 3219 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3220 { 3221 struct amdgpu_device *adev = 3222 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3223 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3224 3225 /* It's a bug to not have a hive within this function */ 3226 if (WARN_ON(!hive)) 3227 return; 3228 3229 /* 3230 * Use task barrier to synchronize all xgmi reset works across the 3231 * hive. task_barrier_enter and task_barrier_exit will block 3232 * until all the threads running the xgmi reset works reach 3233 * those points. task_barrier_full will do both blocks. 3234 */ 3235 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3236 3237 task_barrier_enter(&hive->tb); 3238 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3239 3240 if (adev->asic_reset_res) 3241 goto fail; 3242 3243 task_barrier_exit(&hive->tb); 3244 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3245 3246 if (adev->asic_reset_res) 3247 goto fail; 3248 3249 if (adev->mmhub.ras_funcs && 3250 adev->mmhub.ras_funcs->reset_ras_error_count) 3251 adev->mmhub.ras_funcs->reset_ras_error_count(adev); 3252 } else { 3253 3254 task_barrier_full(&hive->tb); 3255 adev->asic_reset_res = amdgpu_asic_reset(adev); 3256 } 3257 3258 fail: 3259 if (adev->asic_reset_res) 3260 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3261 adev->asic_reset_res, adev_to_drm(adev)->unique); 3262 amdgpu_put_xgmi_hive(hive); 3263 } 3264 3265 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3266 { 3267 char *input = amdgpu_lockup_timeout; 3268 char *timeout_setting = NULL; 3269 int index = 0; 3270 long timeout; 3271 int ret = 0; 3272 3273 /* 3274 * By default timeout for non compute jobs is 10000 3275 * and 60000 for compute jobs. 3276 * In SR-IOV or passthrough mode, timeout for compute 3277 * jobs are 60000 by default. 3278 */ 3279 adev->gfx_timeout = msecs_to_jiffies(10000); 3280 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3281 if (amdgpu_sriov_vf(adev)) 3282 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3283 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3284 else 3285 adev->compute_timeout = msecs_to_jiffies(60000); 3286 3287 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3288 while ((timeout_setting = strsep(&input, ",")) && 3289 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3290 ret = kstrtol(timeout_setting, 0, &timeout); 3291 if (ret) 3292 return ret; 3293 3294 if (timeout == 0) { 3295 index++; 3296 continue; 3297 } else if (timeout < 0) { 3298 timeout = MAX_SCHEDULE_TIMEOUT; 3299 } else { 3300 timeout = msecs_to_jiffies(timeout); 3301 } 3302 3303 switch (index++) { 3304 case 0: 3305 adev->gfx_timeout = timeout; 3306 break; 3307 case 1: 3308 adev->compute_timeout = timeout; 3309 break; 3310 case 2: 3311 adev->sdma_timeout = timeout; 3312 break; 3313 case 3: 3314 adev->video_timeout = timeout; 3315 break; 3316 default: 3317 break; 3318 } 3319 } 3320 /* 3321 * There is only one value specified and 3322 * it should apply to all non-compute jobs. 3323 */ 3324 if (index == 1) { 3325 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3326 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3327 adev->compute_timeout = adev->gfx_timeout; 3328 } 3329 } 3330 3331 return ret; 3332 } 3333 3334 static const struct attribute *amdgpu_dev_attributes[] = { 3335 &dev_attr_product_name.attr, 3336 &dev_attr_product_number.attr, 3337 &dev_attr_serial_number.attr, 3338 &dev_attr_pcie_replay_count.attr, 3339 NULL 3340 }; 3341 3342 /** 3343 * amdgpu_device_init - initialize the driver 3344 * 3345 * @adev: amdgpu_device pointer 3346 * @flags: driver flags 3347 * 3348 * Initializes the driver info and hw (all asics). 3349 * Returns 0 for success or an error on failure. 3350 * Called at driver startup. 3351 */ 3352 int amdgpu_device_init(struct amdgpu_device *adev, 3353 uint32_t flags) 3354 { 3355 struct drm_device *ddev = adev_to_drm(adev); 3356 struct pci_dev *pdev = adev->pdev; 3357 int r, i; 3358 bool px = false; 3359 u32 max_MBps; 3360 3361 adev->shutdown = false; 3362 adev->flags = flags; 3363 3364 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3365 adev->asic_type = amdgpu_force_asic_type; 3366 else 3367 adev->asic_type = flags & AMD_ASIC_MASK; 3368 3369 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3370 if (amdgpu_emu_mode == 1) 3371 adev->usec_timeout *= 10; 3372 adev->gmc.gart_size = 512 * 1024 * 1024; 3373 adev->accel_working = false; 3374 adev->num_rings = 0; 3375 adev->mman.buffer_funcs = NULL; 3376 adev->mman.buffer_funcs_ring = NULL; 3377 adev->vm_manager.vm_pte_funcs = NULL; 3378 adev->vm_manager.vm_pte_num_scheds = 0; 3379 adev->gmc.gmc_funcs = NULL; 3380 adev->harvest_ip_mask = 0x0; 3381 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3382 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3383 3384 adev->smc_rreg = &amdgpu_invalid_rreg; 3385 adev->smc_wreg = &amdgpu_invalid_wreg; 3386 adev->pcie_rreg = &amdgpu_invalid_rreg; 3387 adev->pcie_wreg = &amdgpu_invalid_wreg; 3388 adev->pciep_rreg = &amdgpu_invalid_rreg; 3389 adev->pciep_wreg = &amdgpu_invalid_wreg; 3390 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3391 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3392 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3393 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3394 adev->didt_rreg = &amdgpu_invalid_rreg; 3395 adev->didt_wreg = &amdgpu_invalid_wreg; 3396 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3397 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3398 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3399 
adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3400 3401 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3402 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3403 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3404 3405 /* mutex initializations are all done here so we 3406 * can call functions again without locking issues */ 3407 mutex_init(&adev->firmware.mutex); 3408 mutex_init(&adev->pm.mutex); 3409 mutex_init(&adev->gfx.gpu_clock_mutex); 3410 mutex_init(&adev->srbm_mutex); 3411 mutex_init(&adev->gfx.pipe_reserve_mutex); 3412 mutex_init(&adev->gfx.gfx_off_mutex); 3413 mutex_init(&adev->grbm_idx_mutex); 3414 mutex_init(&adev->mn_lock); 3415 mutex_init(&adev->virt.vf_errors.lock); 3416 hash_init(adev->mn_hash); 3417 atomic_set(&adev->in_gpu_reset, 0); 3418 init_rwsem(&adev->reset_sem); 3419 mutex_init(&adev->psp.mutex); 3420 mutex_init(&adev->notifier_lock); 3421 3422 r = amdgpu_device_init_apu_flags(adev); 3423 if (r) 3424 return r; 3425 3426 r = amdgpu_device_check_arguments(adev); 3427 if (r) 3428 return r; 3429 3430 spin_lock_init(&adev->mmio_idx_lock); 3431 spin_lock_init(&adev->smc_idx_lock); 3432 spin_lock_init(&adev->pcie_idx_lock); 3433 spin_lock_init(&adev->uvd_ctx_idx_lock); 3434 spin_lock_init(&adev->didt_idx_lock); 3435 spin_lock_init(&adev->gc_cac_idx_lock); 3436 spin_lock_init(&adev->se_cac_idx_lock); 3437 spin_lock_init(&adev->audio_endpt_idx_lock); 3438 spin_lock_init(&adev->mm_stats.lock); 3439 3440 INIT_LIST_HEAD(&adev->shadow_list); 3441 mutex_init(&adev->shadow_list_lock); 3442 3443 INIT_LIST_HEAD(&adev->reset_list); 3444 3445 INIT_DELAYED_WORK(&adev->delayed_init_work, 3446 amdgpu_device_delayed_init_work_handler); 3447 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3448 amdgpu_device_delay_enable_gfx_off); 3449 3450 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3451 3452 adev->gfx.gfx_off_req_count = 1; 3453 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3454 3455 atomic_set(&adev->throttling_logging_enabled, 1); 3456 /* 3457 * If throttling continues, logging will be performed every minute 3458 * to avoid log flooding. "-1" is subtracted since the thermal 3459 * throttling interrupt comes every second. Thus, the total logging 3460 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3461 for the throttling interrupt) = 60 seconds.
3462 */ 3463 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3464 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3465 3466 /* Registers mapping */ 3467 /* TODO: block userspace mapping of io register */ 3468 if (adev->asic_type >= CHIP_BONAIRE) { 3469 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3470 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3471 } else { 3472 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3473 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3474 } 3475 3476 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3477 if (adev->rmmio == NULL) { 3478 return -ENOMEM; 3479 } 3480 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3481 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3482 3483 /* enable PCIE atomic ops */ 3484 r = pci_enable_atomic_ops_to_root(adev->pdev, 3485 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3486 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3487 if (r) { 3488 adev->have_atomics_support = false; 3489 DRM_INFO("PCIE atomic ops is not supported\n"); 3490 } else { 3491 adev->have_atomics_support = true; 3492 } 3493 3494 amdgpu_device_get_pcie_info(adev); 3495 3496 if (amdgpu_mcbp) 3497 DRM_INFO("MCBP is enabled\n"); 3498 3499 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3500 adev->enable_mes = true; 3501 3502 /* detect hw virtualization here */ 3503 amdgpu_detect_virtualization(adev); 3504 3505 r = amdgpu_device_get_job_timeout_settings(adev); 3506 if (r) { 3507 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3508 goto failed_unmap; 3509 } 3510 3511 /* early init functions */ 3512 r = amdgpu_device_ip_early_init(adev); 3513 if (r) 3514 goto failed_unmap; 3515 3516 /* doorbell bar mapping and doorbell index init*/ 3517 amdgpu_device_doorbell_init(adev); 3518 3519 if (amdgpu_emu_mode == 1) { 3520 /* post the asic on emulation mode */ 3521 emu_soc_asic_init(adev); 3522 goto fence_driver_init; 3523 } 3524 3525 amdgpu_reset_init(adev); 3526 3527 /* detect if we are with an SRIOV vbios */ 3528 amdgpu_device_detect_sriov_bios(adev); 3529 3530 /* check if we need to reset the asic 3531 * E.g., driver was not cleanly unloaded previously, etc. 
3532 */ 3533 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3534 if (adev->gmc.xgmi.num_physical_nodes) { 3535 dev_info(adev->dev, "Pending hive reset.\n"); 3536 adev->gmc.xgmi.pending_reset = true; 3537 /* Only need to init necessary block for SMU to handle the reset */ 3538 for (i = 0; i < adev->num_ip_blocks; i++) { 3539 if (!adev->ip_blocks[i].status.valid) 3540 continue; 3541 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3542 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3543 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3544 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3545 DRM_DEBUG("IP %s disabled for hw_init.\n", 3546 adev->ip_blocks[i].version->funcs->name); 3547 adev->ip_blocks[i].status.hw = true; 3548 } 3549 } 3550 } else { 3551 r = amdgpu_asic_reset(adev); 3552 if (r) { 3553 dev_err(adev->dev, "asic reset on init failed\n"); 3554 goto failed; 3555 } 3556 } 3557 } 3558 3559 pci_enable_pcie_error_reporting(adev->pdev); 3560 3561 /* Post card if necessary */ 3562 if (amdgpu_device_need_post(adev)) { 3563 if (!adev->bios) { 3564 dev_err(adev->dev, "no vBIOS found\n"); 3565 r = -EINVAL; 3566 goto failed; 3567 } 3568 DRM_INFO("GPU posting now...\n"); 3569 r = amdgpu_device_asic_init(adev); 3570 if (r) { 3571 dev_err(adev->dev, "gpu post error!\n"); 3572 goto failed; 3573 } 3574 } 3575 3576 if (adev->is_atom_fw) { 3577 /* Initialize clocks */ 3578 r = amdgpu_atomfirmware_get_clock_info(adev); 3579 if (r) { 3580 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3581 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3582 goto failed; 3583 } 3584 } else { 3585 /* Initialize clocks */ 3586 r = amdgpu_atombios_get_clock_info(adev); 3587 if (r) { 3588 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3589 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3590 goto failed; 3591 } 3592 /* init i2c buses */ 3593 if (!amdgpu_device_has_dc_support(adev)) 3594 amdgpu_atombios_i2c_init(adev); 3595 } 3596 3597 fence_driver_init: 3598 /* Fence driver */ 3599 r = amdgpu_fence_driver_init(adev); 3600 if (r) { 3601 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3602 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3603 goto failed; 3604 } 3605 3606 /* init the mode config */ 3607 drm_mode_config_init(adev_to_drm(adev)); 3608 3609 r = amdgpu_device_ip_init(adev); 3610 if (r) { 3611 /* failed in exclusive mode due to timeout */ 3612 if (amdgpu_sriov_vf(adev) && 3613 !amdgpu_sriov_runtime(adev) && 3614 amdgpu_virt_mmio_blocked(adev) && 3615 !amdgpu_virt_wait_reset(adev)) { 3616 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3617 /* Don't send request since VF is inactive. */ 3618 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3619 adev->virt.ops = NULL; 3620 r = -EAGAIN; 3621 goto release_ras_con; 3622 } 3623 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3624 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3625 goto release_ras_con; 3626 } 3627 3628 dev_info(adev->dev, 3629 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3630 adev->gfx.config.max_shader_engines, 3631 adev->gfx.config.max_sh_per_se, 3632 adev->gfx.config.max_cu_per_sh, 3633 adev->gfx.cu_info.number); 3634 3635 adev->accel_working = true; 3636 3637 amdgpu_vm_check_compute_bug(adev); 3638 3639 /* Initialize the buffer migration limit. 
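 * Worked example, assuming the defaults below: a negative amdgpu_moverate
 * leaves max_MBps at 8, so log2_max_MBps = ilog2(8) = 3 and later
 * accounting can divide by the rate with a cheap shift instead of a
 * division.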
*/ 3640 if (amdgpu_moverate >= 0) 3641 max_MBps = amdgpu_moverate; 3642 else 3643 max_MBps = 8; /* Allow 8 MB/s. */ 3644 /* Get a log2 for easy divisions. */ 3645 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3646 3647 amdgpu_fbdev_init(adev); 3648 3649 r = amdgpu_pm_sysfs_init(adev); 3650 if (r) { 3651 adev->pm_sysfs_en = false; 3652 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3653 } else 3654 adev->pm_sysfs_en = true; 3655 3656 r = amdgpu_ucode_sysfs_init(adev); 3657 if (r) { 3658 adev->ucode_sysfs_en = false; 3659 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3660 } else 3661 adev->ucode_sysfs_en = true; 3662 3663 if ((amdgpu_testing & 1)) { 3664 if (adev->accel_working) 3665 amdgpu_test_moves(adev); 3666 else 3667 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3668 } 3669 if (amdgpu_benchmarking) { 3670 if (adev->accel_working) 3671 amdgpu_benchmark(adev, amdgpu_benchmarking); 3672 else 3673 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3674 } 3675 3676 /* 3677 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3678 * Otherwise the mgpu fan boost feature will be skipped due to the 3679 * gpu instance is counted less. 3680 */ 3681 amdgpu_register_gpu_instance(adev); 3682 3683 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3684 * explicit gating rather than handling it automatically. 3685 */ 3686 if (!adev->gmc.xgmi.pending_reset) { 3687 r = amdgpu_device_ip_late_init(adev); 3688 if (r) { 3689 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3690 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3691 goto release_ras_con; 3692 } 3693 /* must succeed. */ 3694 amdgpu_ras_resume(adev); 3695 queue_delayed_work(system_wq, &adev->delayed_init_work, 3696 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3697 } 3698 3699 if (amdgpu_sriov_vf(adev)) 3700 flush_delayed_work(&adev->delayed_init_work); 3701 3702 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3703 if (r) 3704 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3705 3706 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3707 r = amdgpu_pmu_init(adev); 3708 if (r) 3709 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3710 3711 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3712 if (amdgpu_device_cache_pci_state(adev->pdev)) 3713 pci_restore_state(pdev); 3714 3715 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3716 /* this will fail for cards that aren't VGA class devices, just 3717 * ignore it */ 3718 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3719 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3720 3721 if (amdgpu_device_supports_px(ddev)) { 3722 px = true; 3723 vga_switcheroo_register_client(adev->pdev, 3724 &amdgpu_switcheroo_ops, px); 3725 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3726 } 3727 3728 if (adev->gmc.xgmi.pending_reset) 3729 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3730 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3731 3732 return 0; 3733 3734 release_ras_con: 3735 amdgpu_release_ras_context(adev); 3736 3737 failed: 3738 amdgpu_vf_error_trans_all(adev); 3739 3740 failed_unmap: 3741 iounmap(adev->rmmio); 3742 adev->rmmio = NULL; 3743 3744 return r; 3745 } 3746 3747 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3748 { 3749 /* Clear all CPU mappings pointing to this device */ 3750 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3751 
3752 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3753 amdgpu_device_doorbell_fini(adev); 3754 3755 iounmap(adev->rmmio); 3756 adev->rmmio = NULL; 3757 if (adev->mman.aper_base_kaddr) 3758 iounmap(adev->mman.aper_base_kaddr); 3759 adev->mman.aper_base_kaddr = NULL; 3760 3761 /* Memory manager related */ 3762 if (!adev->gmc.xgmi.connected_to_cpu) { 3763 arch_phys_wc_del(adev->gmc.vram_mtrr); 3764 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3765 } 3766 } 3767 3768 /** 3769 * amdgpu_device_fini - tear down the driver 3770 * 3771 * @adev: amdgpu_device pointer 3772 * 3773 * Tear down the driver info (all asics). 3774 * Called at driver shutdown. 3775 */ 3776 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3777 { 3778 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3779 flush_delayed_work(&adev->delayed_init_work); 3780 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 3781 adev->shutdown = true; 3782 3783 /* make sure IB test finished before entering exclusive mode 3784 * to avoid preemption on IB test 3785 * */ 3786 if (amdgpu_sriov_vf(adev)) { 3787 amdgpu_virt_request_full_gpu(adev, false); 3788 amdgpu_virt_fini_data_exchange(adev); 3789 } 3790 3791 /* disable all interrupts */ 3792 amdgpu_irq_disable_all(adev); 3793 if (adev->mode_info.mode_config_initialized){ 3794 if (!amdgpu_device_has_dc_support(adev)) 3795 drm_helper_force_disable_all(adev_to_drm(adev)); 3796 else 3797 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3798 } 3799 amdgpu_fence_driver_fini_hw(adev); 3800 3801 if (adev->pm_sysfs_en) 3802 amdgpu_pm_sysfs_fini(adev); 3803 if (adev->ucode_sysfs_en) 3804 amdgpu_ucode_sysfs_fini(adev); 3805 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3806 3807 amdgpu_fbdev_fini(adev); 3808 3809 amdgpu_irq_fini_hw(adev); 3810 3811 amdgpu_device_ip_fini_early(adev); 3812 3813 amdgpu_gart_dummy_page_fini(adev); 3814 3815 amdgpu_device_unmap_mmio(adev); 3816 } 3817 3818 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 3819 { 3820 amdgpu_device_ip_fini(adev); 3821 amdgpu_fence_driver_fini_sw(adev); 3822 release_firmware(adev->firmware.gpu_info_fw); 3823 adev->firmware.gpu_info_fw = NULL; 3824 adev->accel_working = false; 3825 3826 amdgpu_reset_fini(adev); 3827 3828 /* free i2c buses */ 3829 if (!amdgpu_device_has_dc_support(adev)) 3830 amdgpu_i2c_fini(adev); 3831 3832 if (amdgpu_emu_mode != 1) 3833 amdgpu_atombios_fini(adev); 3834 3835 kfree(adev->bios); 3836 adev->bios = NULL; 3837 if (amdgpu_device_supports_px(adev_to_drm(adev))) { 3838 vga_switcheroo_unregister_client(adev->pdev); 3839 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3840 } 3841 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3842 vga_client_unregister(adev->pdev); 3843 3844 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3845 amdgpu_pmu_fini(adev); 3846 if (adev->mman.discovery_bin) 3847 amdgpu_discovery_fini(adev); 3848 3849 kfree(adev->pci_state); 3850 3851 } 3852 3853 3854 /* 3855 * Suspend & resume. 3856 */ 3857 /** 3858 * amdgpu_device_suspend - initiate device suspend 3859 * 3860 * @dev: drm dev pointer 3861 * @fbcon : notify the fbdev of suspend 3862 * 3863 * Puts the hw in the suspend state (all asics). 3864 * Returns 0 for success or an error on failure. 3865 * Called at driver suspend. 
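 *
 * Hypothetical PM wiring (a simplified sketch; the real dev_pm_ops hooks
 * live outside this file, and example_pmops_suspend() is not a function
 * in this driver):
 *
 *   static int example_pmops_suspend(struct device *dev)
 *   {
 *           struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *           return amdgpu_device_suspend(drm_dev, true);
 *   }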
3866 */ 3867 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 3868 { 3869 struct amdgpu_device *adev = drm_to_adev(dev); 3870 3871 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3872 return 0; 3873 3874 adev->in_suspend = true; 3875 3876 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 3877 DRM_WARN("smart shift update failed\n"); 3878 3879 drm_kms_helper_poll_disable(dev); 3880 3881 if (fbcon) 3882 amdgpu_fbdev_set_suspend(adev, 1); 3883 3884 cancel_delayed_work_sync(&adev->delayed_init_work); 3885 3886 amdgpu_ras_suspend(adev); 3887 3888 amdgpu_device_ip_suspend_phase1(adev); 3889 3890 if (!adev->in_s0ix) 3891 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 3892 3893 /* evict vram memory */ 3894 amdgpu_bo_evict_vram(adev); 3895 3896 amdgpu_fence_driver_suspend(adev); 3897 3898 amdgpu_device_ip_suspend_phase2(adev); 3899 /* evict remaining vram memory 3900 * This second call to evict vram is to evict the gart page table 3901 * using the CPU. 3902 */ 3903 amdgpu_bo_evict_vram(adev); 3904 3905 return 0; 3906 } 3907 3908 /** 3909 * amdgpu_device_resume - initiate device resume 3910 * 3911 * @dev: drm dev pointer 3912 * @fbcon : notify the fbdev of resume 3913 * 3914 * Bring the hw back to operating state (all asics). 3915 * Returns 0 for success or an error on failure. 3916 * Called at driver resume. 3917 */ 3918 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3919 { 3920 struct amdgpu_device *adev = drm_to_adev(dev); 3921 int r = 0; 3922 3923 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3924 return 0; 3925 3926 if (adev->in_s0ix) 3927 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry); 3928 3929 /* post card */ 3930 if (amdgpu_device_need_post(adev)) { 3931 r = amdgpu_device_asic_init(adev); 3932 if (r) 3933 dev_err(adev->dev, "amdgpu asic init failed\n"); 3934 } 3935 3936 r = amdgpu_device_ip_resume(adev); 3937 if (r) { 3938 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 3939 return r; 3940 } 3941 amdgpu_fence_driver_resume(adev); 3942 3943 3944 r = amdgpu_device_ip_late_init(adev); 3945 if (r) 3946 return r; 3947 3948 queue_delayed_work(system_wq, &adev->delayed_init_work, 3949 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3950 3951 if (!adev->in_s0ix) { 3952 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 3953 if (r) 3954 return r; 3955 } 3956 3957 /* Make sure IB tests flushed */ 3958 flush_delayed_work(&adev->delayed_init_work); 3959 3960 if (fbcon) 3961 amdgpu_fbdev_set_suspend(adev, 0); 3962 3963 drm_kms_helper_poll_enable(dev); 3964 3965 amdgpu_ras_resume(adev); 3966 3967 /* 3968 * Most of the connector probing functions try to acquire runtime pm 3969 * refs to ensure that the GPU is powered on when connector polling is 3970 * performed. Since we're calling this from a runtime PM callback, 3971 * trying to acquire rpm refs will cause us to deadlock. 3972 * 3973 * Since we're guaranteed to be holding the rpm lock, it's safe to 3974 * temporarily disable the rpm helpers so this doesn't deadlock us. 
3975 */ 3976 #ifdef CONFIG_PM 3977 dev->dev->power.disable_depth++; 3978 #endif 3979 if (!amdgpu_device_has_dc_support(adev)) 3980 drm_helper_hpd_irq_event(dev); 3981 else 3982 drm_kms_helper_hotplug_event(dev); 3983 #ifdef CONFIG_PM 3984 dev->dev->power.disable_depth--; 3985 #endif 3986 adev->in_suspend = false; 3987 3988 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 3989 DRM_WARN("smart shift update failed\n"); 3990 3991 return 0; 3992 } 3993 3994 /** 3995 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3996 * 3997 * @adev: amdgpu_device pointer 3998 * 3999 * The list of all the hardware IPs that make up the asic is walked and 4000 * the check_soft_reset callbacks are run. check_soft_reset determines 4001 * if the asic is still hung or not. 4002 * Returns true if any of the IPs are still in a hung state, false if not. 4003 */ 4004 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4005 { 4006 int i; 4007 bool asic_hang = false; 4008 4009 if (amdgpu_sriov_vf(adev)) 4010 return true; 4011 4012 if (amdgpu_asic_need_full_reset(adev)) 4013 return true; 4014 4015 for (i = 0; i < adev->num_ip_blocks; i++) { 4016 if (!adev->ip_blocks[i].status.valid) 4017 continue; 4018 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4019 adev->ip_blocks[i].status.hang = 4020 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4021 if (adev->ip_blocks[i].status.hang) { 4022 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4023 asic_hang = true; 4024 } 4025 } 4026 return asic_hang; 4027 } 4028 4029 /** 4030 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4031 * 4032 * @adev: amdgpu_device pointer 4033 * 4034 * The list of all the hardware IPs that make up the asic is walked and the 4035 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4036 * handles any IP specific hardware or software state changes that are 4037 * necessary for a soft reset to succeed. 4038 * Returns 0 on success, negative error code on failure. 4039 */ 4040 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4041 { 4042 int i, r = 0; 4043 4044 for (i = 0; i < adev->num_ip_blocks; i++) { 4045 if (!adev->ip_blocks[i].status.valid) 4046 continue; 4047 if (adev->ip_blocks[i].status.hang && 4048 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4049 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4050 if (r) 4051 return r; 4052 } 4053 } 4054 4055 return 0; 4056 } 4057 4058 /** 4059 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4060 * 4061 * @adev: amdgpu_device pointer 4062 * 4063 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4064 * reset is necessary to recover. 4065 * Returns true if a full asic reset is required, false if not. 
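 * In practice the GMC, SMC, ACP, DCE and PSP block types are the ones
 * that force a full reset when they are hung.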
4066 */ 4067 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4068 { 4069 int i; 4070 4071 if (amdgpu_asic_need_full_reset(adev)) 4072 return true; 4073 4074 for (i = 0; i < adev->num_ip_blocks; i++) { 4075 if (!adev->ip_blocks[i].status.valid) 4076 continue; 4077 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4078 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4079 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4080 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4081 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4082 if (adev->ip_blocks[i].status.hang) { 4083 dev_info(adev->dev, "Some block need full reset!\n"); 4084 return true; 4085 } 4086 } 4087 } 4088 return false; 4089 } 4090 4091 /** 4092 * amdgpu_device_ip_soft_reset - do a soft reset 4093 * 4094 * @adev: amdgpu_device pointer 4095 * 4096 * The list of all the hardware IPs that make up the asic is walked and the 4097 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4098 * IP specific hardware or software state changes that are necessary to soft 4099 * reset the IP. 4100 * Returns 0 on success, negative error code on failure. 4101 */ 4102 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4103 { 4104 int i, r = 0; 4105 4106 for (i = 0; i < adev->num_ip_blocks; i++) { 4107 if (!adev->ip_blocks[i].status.valid) 4108 continue; 4109 if (adev->ip_blocks[i].status.hang && 4110 adev->ip_blocks[i].version->funcs->soft_reset) { 4111 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4112 if (r) 4113 return r; 4114 } 4115 } 4116 4117 return 0; 4118 } 4119 4120 /** 4121 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4122 * 4123 * @adev: amdgpu_device pointer 4124 * 4125 * The list of all the hardware IPs that make up the asic is walked and the 4126 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4127 * handles any IP specific hardware or software state changes that are 4128 * necessary after the IP has been soft reset. 4129 * Returns 0 on success, negative error code on failure. 4130 */ 4131 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4132 { 4133 int i, r = 0; 4134 4135 for (i = 0; i < adev->num_ip_blocks; i++) { 4136 if (!adev->ip_blocks[i].status.valid) 4137 continue; 4138 if (adev->ip_blocks[i].status.hang && 4139 adev->ip_blocks[i].version->funcs->post_soft_reset) 4140 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4141 if (r) 4142 return r; 4143 } 4144 4145 return 0; 4146 } 4147 4148 /** 4149 * amdgpu_device_recover_vram - Recover some VRAM contents 4150 * 4151 * @adev: amdgpu_device pointer 4152 * 4153 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4154 * restore things like GPUVM page tables after a GPU reset where 4155 * the contents of VRAM might be lost. 4156 * 4157 * Returns: 4158 * 0 on success, negative error code on failure. 
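 * Only shadow BOs that are currently resident in GTT and whose parent
 * BO lives in VRAM are restored; evicted shadows are skipped.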
4159 */ 4160 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4161 { 4162 struct dma_fence *fence = NULL, *next = NULL; 4163 struct amdgpu_bo *shadow; 4164 struct amdgpu_bo_vm *vmbo; 4165 long r = 1, tmo; 4166 4167 if (amdgpu_sriov_runtime(adev)) 4168 tmo = msecs_to_jiffies(8000); 4169 else 4170 tmo = msecs_to_jiffies(100); 4171 4172 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4173 mutex_lock(&adev->shadow_list_lock); 4174 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4175 shadow = &vmbo->bo; 4176 /* No need to recover an evicted BO */ 4177 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4178 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4179 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4180 continue; 4181 4182 r = amdgpu_bo_restore_shadow(shadow, &next); 4183 if (r) 4184 break; 4185 4186 if (fence) { 4187 tmo = dma_fence_wait_timeout(fence, false, tmo); 4188 dma_fence_put(fence); 4189 fence = next; 4190 if (tmo == 0) { 4191 r = -ETIMEDOUT; 4192 break; 4193 } else if (tmo < 0) { 4194 r = tmo; 4195 break; 4196 } 4197 } else { 4198 fence = next; 4199 } 4200 } 4201 mutex_unlock(&adev->shadow_list_lock); 4202 4203 if (fence) 4204 tmo = dma_fence_wait_timeout(fence, false, tmo); 4205 dma_fence_put(fence); 4206 4207 if (r < 0 || tmo <= 0) { 4208 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4209 return -EIO; 4210 } 4211 4212 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4213 return 0; 4214 } 4215 4216 4217 /** 4218 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4219 * 4220 * @adev: amdgpu_device pointer 4221 * @from_hypervisor: request from hypervisor 4222 * 4223 * do VF FLR and reinitialize Asic 4224 * return 0 means succeeded otherwise failed 4225 */ 4226 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4227 bool from_hypervisor) 4228 { 4229 int r; 4230 4231 if (from_hypervisor) 4232 r = amdgpu_virt_request_full_gpu(adev, true); 4233 else 4234 r = amdgpu_virt_reset_gpu(adev); 4235 if (r) 4236 return r; 4237 4238 amdgpu_amdkfd_pre_reset(adev); 4239 4240 /* Resume IP prior to SMC */ 4241 r = amdgpu_device_ip_reinit_early_sriov(adev); 4242 if (r) 4243 goto error; 4244 4245 amdgpu_virt_init_data_exchange(adev); 4246 /* we need recover gart prior to run SMC/CP/SDMA resume */ 4247 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT)); 4248 4249 r = amdgpu_device_fw_loading(adev); 4250 if (r) 4251 return r; 4252 4253 /* now we are okay to resume SMC/CP/SDMA */ 4254 r = amdgpu_device_ip_reinit_late_sriov(adev); 4255 if (r) 4256 goto error; 4257 4258 amdgpu_irq_gpu_reset_resume_helper(adev); 4259 r = amdgpu_ib_ring_tests(adev); 4260 amdgpu_amdkfd_post_reset(adev); 4261 4262 error: 4263 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4264 amdgpu_inc_vram_lost(adev); 4265 r = amdgpu_device_recover_vram(adev); 4266 } 4267 amdgpu_virt_release_full_gpu(adev, true); 4268 4269 return r; 4270 } 4271 4272 /** 4273 * amdgpu_device_has_job_running - check if there is any job in mirror list 4274 * 4275 * @adev: amdgpu_device pointer 4276 * 4277 * check if there is any job in mirror list 4278 */ 4279 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4280 { 4281 int i; 4282 struct drm_sched_job *job; 4283 4284 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4285 struct amdgpu_ring *ring = adev->rings[i]; 4286 4287 if (!ring || !ring->sched.thread) 4288 continue; 4289 4290 spin_lock(&ring->sched.job_list_lock); 4291 job = 
list_first_entry_or_null(&ring->sched.pending_list, 4292 struct drm_sched_job, list); 4293 spin_unlock(&ring->sched.job_list_lock); 4294 if (job) 4295 return true; 4296 } 4297 return false; 4298 } 4299 4300 /** 4301 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4302 * 4303 * @adev: amdgpu_device pointer 4304 * 4305 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4306 * a hung GPU. 4307 */ 4308 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4309 { 4310 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4311 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4312 return false; 4313 } 4314 4315 if (amdgpu_gpu_recovery == 0) 4316 goto disabled; 4317 4318 if (amdgpu_sriov_vf(adev)) 4319 return true; 4320 4321 if (amdgpu_gpu_recovery == -1) { 4322 switch (adev->asic_type) { 4323 case CHIP_BONAIRE: 4324 case CHIP_HAWAII: 4325 case CHIP_TOPAZ: 4326 case CHIP_TONGA: 4327 case CHIP_FIJI: 4328 case CHIP_POLARIS10: 4329 case CHIP_POLARIS11: 4330 case CHIP_POLARIS12: 4331 case CHIP_VEGAM: 4332 case CHIP_VEGA20: 4333 case CHIP_VEGA10: 4334 case CHIP_VEGA12: 4335 case CHIP_RAVEN: 4336 case CHIP_ARCTURUS: 4337 case CHIP_RENOIR: 4338 case CHIP_NAVI10: 4339 case CHIP_NAVI14: 4340 case CHIP_NAVI12: 4341 case CHIP_SIENNA_CICHLID: 4342 case CHIP_NAVY_FLOUNDER: 4343 case CHIP_DIMGREY_CAVEFISH: 4344 case CHIP_BEIGE_GOBY: 4345 case CHIP_VANGOGH: 4346 case CHIP_ALDEBARAN: 4347 break; 4348 default: 4349 goto disabled; 4350 } 4351 } 4352 4353 return true; 4354 4355 disabled: 4356 dev_info(adev->dev, "GPU recovery disabled.\n"); 4357 return false; 4358 } 4359 4360 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4361 { 4362 u32 i; 4363 int ret = 0; 4364 4365 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4366 4367 dev_info(adev->dev, "GPU mode1 reset\n"); 4368 4369 /* disable BM */ 4370 pci_clear_master(adev->pdev); 4371 4372 amdgpu_device_cache_pci_state(adev->pdev); 4373 4374 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4375 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4376 ret = amdgpu_dpm_mode1_reset(adev); 4377 } else { 4378 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4379 ret = psp_gpu_reset(adev); 4380 } 4381 4382 if (ret) 4383 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4384 4385 amdgpu_device_load_pci_state(adev->pdev); 4386 4387 /* wait for asic to come out of reset */ 4388 for (i = 0; i < adev->usec_timeout; i++) { 4389 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4390 4391 if (memsize != 0xffffffff) 4392 break; 4393 udelay(1); 4394 } 4395 4396 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4397 return ret; 4398 } 4399 4400 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4401 struct amdgpu_reset_context *reset_context) 4402 { 4403 int i, r = 0; 4404 struct amdgpu_job *job = NULL; 4405 bool need_full_reset = 4406 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4407 4408 if (reset_context->reset_req_dev == adev) 4409 job = reset_context->job; 4410 4411 /* no need to dump if device is not in good state during probe period */ 4412 if (!adev->gmc.xgmi.pending_reset) 4413 amdgpu_debugfs_wait_dump(adev); 4414 4415 if (amdgpu_sriov_vf(adev)) { 4416 /* stop the data exchange thread */ 4417 amdgpu_virt_fini_data_exchange(adev); 4418 } 4419 4420 /* block all schedulers and reset given job's ring */ 4421 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4422 struct amdgpu_ring *ring = adev->rings[i]; 4423 4424 if (!ring || !ring->sched.thread) 4425 continue; 4426 4427 /* after 
all hw jobs are reset, hw fence is meaningless, so force_completion */ 4428 amdgpu_fence_driver_force_completion(ring); 4429 } 4430 4431 if(job) 4432 drm_sched_increase_karma(&job->base); 4433 4434 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4435 /* If reset handler not implemented, continue; otherwise return */ 4436 if (r == -ENOSYS) 4437 r = 0; 4438 else 4439 return r; 4440 4441 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4442 if (!amdgpu_sriov_vf(adev)) { 4443 4444 if (!need_full_reset) 4445 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4446 4447 if (!need_full_reset) { 4448 amdgpu_device_ip_pre_soft_reset(adev); 4449 r = amdgpu_device_ip_soft_reset(adev); 4450 amdgpu_device_ip_post_soft_reset(adev); 4451 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4452 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4453 need_full_reset = true; 4454 } 4455 } 4456 4457 if (need_full_reset) 4458 r = amdgpu_device_ip_suspend(adev); 4459 if (need_full_reset) 4460 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4461 else 4462 clear_bit(AMDGPU_NEED_FULL_RESET, 4463 &reset_context->flags); 4464 } 4465 4466 return r; 4467 } 4468 4469 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4470 struct amdgpu_reset_context *reset_context) 4471 { 4472 struct amdgpu_device *tmp_adev = NULL; 4473 bool need_full_reset, skip_hw_reset, vram_lost = false; 4474 int r = 0; 4475 4476 /* Try reset handler method first */ 4477 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4478 reset_list); 4479 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4480 /* If reset handler not implemented, continue; otherwise return */ 4481 if (r == -ENOSYS) 4482 r = 0; 4483 else 4484 return r; 4485 4486 /* Reset handler not implemented, use the default method */ 4487 need_full_reset = 4488 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4489 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4490 4491 /* 4492 * ASIC reset has to be done on all XGMI hive nodes ASAP 4493 * to allow proper links negotiation in FW (within 1 sec) 4494 */ 4495 if (!skip_hw_reset && need_full_reset) { 4496 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4497 /* For XGMI run all resets in parallel to speed up the process */ 4498 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4499 tmp_adev->gmc.xgmi.pending_reset = false; 4500 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4501 r = -EALREADY; 4502 } else 4503 r = amdgpu_asic_reset(tmp_adev); 4504 4505 if (r) { 4506 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4507 r, adev_to_drm(tmp_adev)->unique); 4508 break; 4509 } 4510 } 4511 4512 /* For XGMI wait for all resets to complete before proceed */ 4513 if (!r) { 4514 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4515 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4516 flush_work(&tmp_adev->xgmi_reset_work); 4517 r = tmp_adev->asic_reset_res; 4518 if (r) 4519 break; 4520 } 4521 } 4522 } 4523 } 4524 4525 if (!r && amdgpu_ras_intr_triggered()) { 4526 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4527 if (tmp_adev->mmhub.ras_funcs && 4528 tmp_adev->mmhub.ras_funcs->reset_ras_error_count) 4529 tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev); 4530 } 4531 4532 amdgpu_ras_intr_cleared(); 4533 } 4534 4535 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4536 if (need_full_reset) { 4537 /* post 
card */ 4538 r = amdgpu_device_asic_init(tmp_adev); 4539 if (r) { 4540 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4541 } else { 4542 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4543 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4544 if (r) 4545 goto out; 4546 4547 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4548 if (vram_lost) { 4549 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4550 amdgpu_inc_vram_lost(tmp_adev); 4551 } 4552 4553 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT)); 4554 if (r) 4555 goto out; 4556 4557 r = amdgpu_device_fw_loading(tmp_adev); 4558 if (r) 4559 return r; 4560 4561 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4562 if (r) 4563 goto out; 4564 4565 if (vram_lost) 4566 amdgpu_device_fill_reset_magic(tmp_adev); 4567 4568 /* 4569 * Add this ASIC as tracked as reset was already 4570 * complete successfully. 4571 */ 4572 amdgpu_register_gpu_instance(tmp_adev); 4573 4574 if (!reset_context->hive && 4575 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4576 amdgpu_xgmi_add_device(tmp_adev); 4577 4578 r = amdgpu_device_ip_late_init(tmp_adev); 4579 if (r) 4580 goto out; 4581 4582 amdgpu_fbdev_set_suspend(tmp_adev, 0); 4583 4584 /* 4585 * The GPU enters bad state once faulty pages 4586 * by ECC has reached the threshold, and ras 4587 * recovery is scheduled next. So add one check 4588 * here to break recovery if it indeed exceeds 4589 * bad page threshold, and remind user to 4590 * retire this GPU or setting one bigger 4591 * bad_page_threshold value to fix this once 4592 * probing driver again. 4593 */ 4594 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 4595 /* must succeed. */ 4596 amdgpu_ras_resume(tmp_adev); 4597 } else { 4598 r = -EINVAL; 4599 goto out; 4600 } 4601 4602 /* Update PSP FW topology after reset */ 4603 if (reset_context->hive && 4604 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4605 r = amdgpu_xgmi_update_topology( 4606 reset_context->hive, tmp_adev); 4607 } 4608 } 4609 4610 out: 4611 if (!r) { 4612 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4613 r = amdgpu_ib_ring_tests(tmp_adev); 4614 if (r) { 4615 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4616 need_full_reset = true; 4617 r = -EAGAIN; 4618 goto end; 4619 } 4620 } 4621 4622 if (!r) 4623 r = amdgpu_device_recover_vram(tmp_adev); 4624 else 4625 tmp_adev->asic_reset_res = r; 4626 } 4627 4628 end: 4629 if (need_full_reset) 4630 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4631 else 4632 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4633 return r; 4634 } 4635 4636 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, 4637 struct amdgpu_hive_info *hive) 4638 { 4639 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) 4640 return false; 4641 4642 if (hive) { 4643 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock); 4644 } else { 4645 down_write(&adev->reset_sem); 4646 } 4647 4648 switch (amdgpu_asic_reset_method(adev)) { 4649 case AMD_RESET_METHOD_MODE1: 4650 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4651 break; 4652 case AMD_RESET_METHOD_MODE2: 4653 adev->mp1_state = PP_MP1_STATE_RESET; 4654 break; 4655 default: 4656 adev->mp1_state = PP_MP1_STATE_NONE; 4657 break; 4658 } 4659 4660 return true; 4661 } 4662 4663 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 4664 { 4665 amdgpu_vf_error_trans_all(adev); 4666 adev->mp1_state = PP_MP1_STATE_NONE; 4667 atomic_set(&adev->in_gpu_reset, 0); 4668 up_write(&adev->reset_sem); 4669 } 4670 4671 /* 4672 * to lockup a list of amdgpu 
devices in a hive safely. If the device is not part of a hive
4673 * with multiple nodes, this behaves the same as amdgpu_device_lock_adev.
4674 *
4675 * Unlocking does not require a roll back.
4676 */
4677 static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
4678 {
4679 struct amdgpu_device *tmp_adev = NULL;
4680
4681 if (adev->gmc.xgmi.num_physical_nodes > 1) {
4682 if (!hive) {
4683 dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
4684 return -ENODEV;
4685 }
4686 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4687 if (!amdgpu_device_lock_adev(tmp_adev, hive))
4688 goto roll_back;
4689 }
4690 } else if (!amdgpu_device_lock_adev(adev, hive))
4691 return -EAGAIN;
4692
4693 return 0;
4694 roll_back:
4695 if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
4696 /*
4697 * If the lock iteration broke off in the middle of the hive,
4698 * there may be a race, or one of the hive's devices may have
4699 * locked up independently. We may or may not be in trouble,
4700 * so roll back the locks that were already taken and emit
4701 * a warning.
4702 */
4703 dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
4704 list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4705 amdgpu_device_unlock_adev(tmp_adev);
4706 }
4707 }
4708 return -EAGAIN;
4709 }
4710
4711 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4712 {
4713 struct pci_dev *p = NULL;
4714
4715 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4716 adev->pdev->bus->number, 1);
4717 if (p) {
4718 pm_runtime_enable(&(p->dev));
4719 pm_runtime_resume(&(p->dev));
4720 }
4721 }
4722
4723 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4724 {
4725 enum amd_reset_method reset_method;
4726 struct pci_dev *p = NULL;
4727 u64 expires;
4728
4729 /*
4730 * For now, only BACO and mode1 reset are confirmed to suffer
4731 * from the audio issue if the audio device is not properly suspended.
4732 */
4733 reset_method = amdgpu_asic_reset_method(adev);
4734 if ((reset_method != AMD_RESET_METHOD_BACO) &&
4735 (reset_method != AMD_RESET_METHOD_MODE1))
4736 return -EINVAL;
4737
4738 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4739 adev->pdev->bus->number, 1);
4740 if (!p)
4741 return -ENODEV;
4742
4743 expires = pm_runtime_autosuspend_expiration(&(p->dev));
4744 if (!expires)
4745 /*
4746 * If we cannot get the audio device's autosuspend delay,
4747 * use a fixed 4s interval. Since 3s is the audio
4748 * controller's default autosuspend delay, the 4s used
4749 * here is guaranteed to cover it.
4750 */
4751 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4752
4753 while (!pm_runtime_status_suspended(&(p->dev))) {
4754 if (!pm_runtime_suspend(&(p->dev)))
4755 break;
4756
4757 if (expires < ktime_get_mono_fast_ns()) {
4758 dev_warn(adev->dev, "failed to suspend display audio\n");
4759 /* TODO: abort the succeeding gpu reset?
*/ 4760 return -ETIMEDOUT; 4761 } 4762 } 4763 4764 pm_runtime_disable(&(p->dev)); 4765 4766 return 0; 4767 } 4768 4769 static void amdgpu_device_recheck_guilty_jobs( 4770 struct amdgpu_device *adev, struct list_head *device_list_handle, 4771 struct amdgpu_reset_context *reset_context) 4772 { 4773 int i, r = 0; 4774 4775 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4776 struct amdgpu_ring *ring = adev->rings[i]; 4777 int ret = 0; 4778 struct drm_sched_job *s_job; 4779 4780 if (!ring || !ring->sched.thread) 4781 continue; 4782 4783 s_job = list_first_entry_or_null(&ring->sched.pending_list, 4784 struct drm_sched_job, list); 4785 if (s_job == NULL) 4786 continue; 4787 4788 /* clear job's guilty and depend the folowing step to decide the real one */ 4789 drm_sched_reset_karma(s_job); 4790 drm_sched_resubmit_jobs_ext(&ring->sched, 1); 4791 4792 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); 4793 if (ret == 0) { /* timeout */ 4794 DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", 4795 ring->sched.name, s_job->id); 4796 4797 /* set guilty */ 4798 drm_sched_increase_karma(s_job); 4799 retry: 4800 /* do hw reset */ 4801 if (amdgpu_sriov_vf(adev)) { 4802 amdgpu_virt_fini_data_exchange(adev); 4803 r = amdgpu_device_reset_sriov(adev, false); 4804 if (r) 4805 adev->asic_reset_res = r; 4806 } else { 4807 clear_bit(AMDGPU_SKIP_HW_RESET, 4808 &reset_context->flags); 4809 r = amdgpu_do_asic_reset(device_list_handle, 4810 reset_context); 4811 if (r && r == -EAGAIN) 4812 goto retry; 4813 } 4814 4815 /* 4816 * add reset counter so that the following 4817 * resubmitted job could flush vmid 4818 */ 4819 atomic_inc(&adev->gpu_reset_counter); 4820 continue; 4821 } 4822 4823 /* got the hw fence, signal finished fence */ 4824 atomic_dec(ring->sched.score); 4825 dma_fence_get(&s_job->s_fence->finished); 4826 dma_fence_signal(&s_job->s_fence->finished); 4827 dma_fence_put(&s_job->s_fence->finished); 4828 4829 /* remove node from list and free the job */ 4830 spin_lock(&ring->sched.job_list_lock); 4831 list_del_init(&s_job->list); 4832 spin_unlock(&ring->sched.job_list_lock); 4833 ring->sched.ops->free_job(s_job); 4834 } 4835 } 4836 4837 /** 4838 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 4839 * 4840 * @adev: amdgpu_device pointer 4841 * @job: which job trigger hang 4842 * 4843 * Attempt to reset the GPU if it has hung (all asics). 4844 * Attempt to do soft-reset or full-reset and reinitialize Asic 4845 * Returns 0 for success or an error on failure. 4846 */ 4847 4848 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 4849 struct amdgpu_job *job) 4850 { 4851 struct list_head device_list, *device_list_handle = NULL; 4852 bool job_signaled = false; 4853 struct amdgpu_hive_info *hive = NULL; 4854 struct amdgpu_device *tmp_adev = NULL; 4855 int i, r = 0; 4856 bool need_emergency_restart = false; 4857 bool audio_suspended = false; 4858 int tmp_vram_lost_counter; 4859 struct amdgpu_reset_context reset_context; 4860 4861 memset(&reset_context, 0, sizeof(reset_context)); 4862 4863 /* 4864 * Special case: RAS triggered and full reset isn't supported 4865 */ 4866 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 4867 4868 /* 4869 * Flush RAM to disk so that after reboot 4870 * the user can read log and see why the system rebooted. 
4871 */ 4872 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 4873 DRM_WARN("Emergency reboot."); 4874 4875 ksys_sync_helper(); 4876 emergency_restart(); 4877 } 4878 4879 dev_info(adev->dev, "GPU %s begin!\n", 4880 need_emergency_restart ? "jobs stop":"reset"); 4881 4882 /* 4883 * Here we trylock to avoid chain of resets executing from 4884 * either trigger by jobs on different adevs in XGMI hive or jobs on 4885 * different schedulers for same device while this TO handler is running. 4886 * We always reset all schedulers for device and all devices for XGMI 4887 * hive so that should take care of them too. 4888 */ 4889 hive = amdgpu_get_xgmi_hive(adev); 4890 if (hive) { 4891 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { 4892 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", 4893 job ? job->base.id : -1, hive->hive_id); 4894 amdgpu_put_xgmi_hive(hive); 4895 if (job) 4896 drm_sched_increase_karma(&job->base); 4897 return 0; 4898 } 4899 mutex_lock(&hive->hive_lock); 4900 } 4901 4902 reset_context.method = AMD_RESET_METHOD_NONE; 4903 reset_context.reset_req_dev = adev; 4904 reset_context.job = job; 4905 reset_context.hive = hive; 4906 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 4907 4908 /* 4909 * lock the device before we try to operate the linked list 4910 * if didn't get the device lock, don't touch the linked list since 4911 * others may iterating it. 4912 */ 4913 r = amdgpu_device_lock_hive_adev(adev, hive); 4914 if (r) { 4915 dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", 4916 job ? job->base.id : -1); 4917 4918 /* even we skipped this reset, still need to set the job to guilty */ 4919 if (job) 4920 drm_sched_increase_karma(&job->base); 4921 goto skip_recovery; 4922 } 4923 4924 /* 4925 * Build list of devices to reset. 4926 * In case we are in XGMI hive mode, resort the device list 4927 * to put adev in the 1st position. 4928 */ 4929 INIT_LIST_HEAD(&device_list); 4930 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4931 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) 4932 list_add_tail(&tmp_adev->reset_list, &device_list); 4933 if (!list_is_first(&adev->reset_list, &device_list)) 4934 list_rotate_to_front(&adev->reset_list, &device_list); 4935 device_list_handle = &device_list; 4936 } else { 4937 list_add_tail(&adev->reset_list, &device_list); 4938 device_list_handle = &device_list; 4939 } 4940 4941 /* block all schedulers and reset given job's ring */ 4942 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4943 /* 4944 * Try to put the audio codec into suspend state 4945 * before gpu reset started. 4946 * 4947 * Due to the power domain of the graphics device 4948 * is shared with AZ power domain. Without this, 4949 * we may change the audio hardware from behind 4950 * the audio driver's back. That will trigger 4951 * some audio codec errors. 
4952 */ 4953 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 4954 audio_suspended = true; 4955 4956 amdgpu_ras_set_error_query_ready(tmp_adev, false); 4957 4958 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 4959 4960 if (!amdgpu_sriov_vf(tmp_adev)) 4961 amdgpu_amdkfd_pre_reset(tmp_adev); 4962 4963 /* 4964 * Mark these ASICs to be reseted as untracked first 4965 * And add them back after reset completed 4966 */ 4967 amdgpu_unregister_gpu_instance(tmp_adev); 4968 4969 amdgpu_fbdev_set_suspend(tmp_adev, 1); 4970 4971 /* disable ras on ALL IPs */ 4972 if (!need_emergency_restart && 4973 amdgpu_device_ip_need_full_reset(tmp_adev)) 4974 amdgpu_ras_suspend(tmp_adev); 4975 4976 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4977 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4978 4979 if (!ring || !ring->sched.thread) 4980 continue; 4981 4982 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 4983 4984 if (need_emergency_restart) 4985 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 4986 } 4987 atomic_inc(&tmp_adev->gpu_reset_counter); 4988 } 4989 4990 if (need_emergency_restart) 4991 goto skip_sched_resume; 4992 4993 /* 4994 * Must check guilty signal here since after this point all old 4995 * HW fences are force signaled. 4996 * 4997 * job->base holds a reference to parent fence 4998 */ 4999 if (job && job->base.s_fence->parent && 5000 dma_fence_is_signaled(job->base.s_fence->parent)) { 5001 job_signaled = true; 5002 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5003 goto skip_hw_reset; 5004 } 5005 5006 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5007 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5008 r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context); 5009 /*TODO Should we stop ?*/ 5010 if (r) { 5011 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5012 r, adev_to_drm(tmp_adev)->unique); 5013 tmp_adev->asic_reset_res = r; 5014 } 5015 } 5016 5017 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); 5018 /* Actual ASIC resets if needed.*/ 5019 /* TODO Implement XGMI hive reset logic for SRIOV */ 5020 if (amdgpu_sriov_vf(adev)) { 5021 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5022 if (r) 5023 adev->asic_reset_res = r; 5024 } else { 5025 r = amdgpu_do_asic_reset(device_list_handle, &reset_context); 5026 if (r && r == -EAGAIN) 5027 goto retry; 5028 } 5029 5030 skip_hw_reset: 5031 5032 /* Post ASIC reset for all devs .*/ 5033 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5034 5035 /* 5036 * Sometimes a later bad compute job can block a good gfx job as gfx 5037 * and compute ring share internal GC HW mutually. We add an additional 5038 * guilty jobs recheck step to find the real guilty job, it synchronously 5039 * submits and pends for the first job being signaled. If it gets timeout, 5040 * we identify it as a real guilty job. 
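 * The recheck below only runs when amdgpu_gpu_recovery is set to 2
 * and VRAM contents were not lost during the reset.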
5041 */
5042 if (amdgpu_gpu_recovery == 2 &&
5043 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
5044 amdgpu_device_recheck_guilty_jobs(
5045 tmp_adev, device_list_handle, &reset_context);
5046
5047 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5048 struct amdgpu_ring *ring = tmp_adev->rings[i];
5049
5050 if (!ring || !ring->sched.thread)
5051 continue;
5052
5053 /* No point in resubmitting jobs if we didn't do a HW reset */
5054 if (!tmp_adev->asic_reset_res && !job_signaled)
5055 drm_sched_resubmit_jobs(&ring->sched);
5056
5057 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
5058 }
5059
5060 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
5061 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5062 }
5063
5064 tmp_adev->asic_reset_res = 0;
5065
5066 if (r) {
5067 /* bad news, how do we tell this to userspace? */
5068 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5069 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5070 } else {
5071 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5072 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5073 DRM_WARN("smart shift update failed\n");
5074 }
5075 }
5076
5077 skip_sched_resume:
5078 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5079 /* unlock kfd: SRIOV would do it separately */
5080 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5081 amdgpu_amdkfd_post_reset(tmp_adev);
5082
5083 /* kfd_post_reset will do nothing if the kfd device is not initialized,
5084 * so bring up kfd here if it was not initialized before
5085 */
5086 if (!adev->kfd.init_complete)
5087 amdgpu_amdkfd_device_init(adev);
5088
5089 if (audio_suspended)
5090 amdgpu_device_resume_display_audio(tmp_adev);
5091 amdgpu_device_unlock_adev(tmp_adev);
5092 }
5093
5094 skip_recovery:
5095 if (hive) {
5096 atomic_set(&hive->in_reset, 0);
5097 mutex_unlock(&hive->hive_lock);
5098 amdgpu_put_xgmi_hive(hive);
5099 }
5100
5101 if (r && r != -EAGAIN)
5102 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5103 return r;
5104 }
5105
5106 /**
5107 * amdgpu_device_get_pcie_info - fetch PCIE info about the PCIE slot
5108 *
5109 * @adev: amdgpu_device pointer
5110 *
5111 * Fetches and stores in the driver the PCIE capabilities (gen speed
5112 * and lanes) of the slot the device is in. Handles APUs and
5113 * virtualized environments where PCIE config space may not be available.
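 * If amdgpu_pcie_gen_cap or amdgpu_pcie_lane_cap is set, that value is
 * used as-is for the corresponding mask instead of the detected one.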
5114 */ 5115 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5116 { 5117 struct pci_dev *pdev; 5118 enum pci_bus_speed speed_cap, platform_speed_cap; 5119 enum pcie_link_width platform_link_width; 5120 5121 if (amdgpu_pcie_gen_cap) 5122 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5123 5124 if (amdgpu_pcie_lane_cap) 5125 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5126 5127 /* covers APUs as well */ 5128 if (pci_is_root_bus(adev->pdev->bus)) { 5129 if (adev->pm.pcie_gen_mask == 0) 5130 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5131 if (adev->pm.pcie_mlw_mask == 0) 5132 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5133 return; 5134 } 5135 5136 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5137 return; 5138 5139 pcie_bandwidth_available(adev->pdev, NULL, 5140 &platform_speed_cap, &platform_link_width); 5141 5142 if (adev->pm.pcie_gen_mask == 0) { 5143 /* asic caps */ 5144 pdev = adev->pdev; 5145 speed_cap = pcie_get_speed_cap(pdev); 5146 if (speed_cap == PCI_SPEED_UNKNOWN) { 5147 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5148 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5149 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5150 } else { 5151 if (speed_cap == PCIE_SPEED_32_0GT) 5152 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5153 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5154 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5155 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5156 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5157 else if (speed_cap == PCIE_SPEED_16_0GT) 5158 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5159 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5160 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5161 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5162 else if (speed_cap == PCIE_SPEED_8_0GT) 5163 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5164 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5165 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5166 else if (speed_cap == PCIE_SPEED_5_0GT) 5167 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5168 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5169 else 5170 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5171 } 5172 /* platform caps */ 5173 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5174 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5175 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5176 } else { 5177 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5178 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5179 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5180 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5181 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5182 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5183 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5184 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5185 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5186 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5187 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5188 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5189 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5190 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5191 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5192 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5193 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5194 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5195 else 5196 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5197 5198 } 5199 } 5200 if (adev->pm.pcie_mlw_mask == 0) { 5201 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5202 adev->pm.pcie_mlw_mask 
|= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5203 } else { 5204 switch (platform_link_width) { 5205 case PCIE_LNK_X32: 5206 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5207 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5208 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5209 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5210 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5211 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5212 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5213 break; 5214 case PCIE_LNK_X16: 5215 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5216 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5217 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5218 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5219 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5220 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5221 break; 5222 case PCIE_LNK_X12: 5223 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5224 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5225 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5226 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5227 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5228 break; 5229 case PCIE_LNK_X8: 5230 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5231 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5232 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5233 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5234 break; 5235 case PCIE_LNK_X4: 5236 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5237 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5238 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5239 break; 5240 case PCIE_LNK_X2: 5241 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5242 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5243 break; 5244 case PCIE_LNK_X1: 5245 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5246 break; 5247 default: 5248 break; 5249 } 5250 } 5251 } 5252 } 5253 5254 int amdgpu_device_baco_enter(struct drm_device *dev) 5255 { 5256 struct amdgpu_device *adev = drm_to_adev(dev); 5257 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5258 5259 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5260 return -ENOTSUPP; 5261 5262 if (ras && adev->ras_enabled && 5263 adev->nbio.funcs->enable_doorbell_interrupt) 5264 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5265 5266 return amdgpu_dpm_baco_enter(adev); 5267 } 5268 5269 int amdgpu_device_baco_exit(struct drm_device *dev) 5270 { 5271 struct amdgpu_device *adev = drm_to_adev(dev); 5272 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5273 int ret = 0; 5274 5275 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5276 return -ENOTSUPP; 5277 5278 ret = amdgpu_dpm_baco_exit(adev); 5279 if (ret) 5280 return ret; 5281 5282 if (ras && adev->ras_enabled && 5283 adev->nbio.funcs->enable_doorbell_interrupt) 5284 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5285 5286 return 0; 5287 } 5288 5289 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) 5290 { 5291 int i; 5292 5293 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5294 struct amdgpu_ring *ring = adev->rings[i]; 5295 5296 if (!ring || !ring->sched.thread) 5297 continue; 5298 5299 cancel_delayed_work_sync(&ring->sched.work_tdr); 5300 } 5301 } 5302 5303 /** 5304 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5305 * @pdev: PCI device struct 5306 * @state: PCI channel state 5307 * 5308 * Description: Called when a PCI error is detected. 5309 * 5310 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
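 * For a frozen channel the device is locked and all ring schedulers
 * are stopped before a slot reset is requested.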
5311 */ 5312 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5313 { 5314 struct drm_device *dev = pci_get_drvdata(pdev); 5315 struct amdgpu_device *adev = drm_to_adev(dev); 5316 int i; 5317 5318 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5319 5320 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5321 DRM_WARN("No support for XGMI hive yet..."); 5322 return PCI_ERS_RESULT_DISCONNECT; 5323 } 5324 5325 switch (state) { 5326 case pci_channel_io_normal: 5327 return PCI_ERS_RESULT_CAN_RECOVER; 5328 /* Fatal error, prepare for slot reset */ 5329 case pci_channel_io_frozen: 5330 /* 5331 * Cancel and wait for all TDRs in progress if failing to 5332 * set adev->in_gpu_reset in amdgpu_device_lock_adev 5333 * 5334 * Locking adev->reset_sem will prevent any external access 5335 * to GPU during PCI error recovery 5336 */ 5337 while (!amdgpu_device_lock_adev(adev, NULL)) 5338 amdgpu_cancel_all_tdr(adev); 5339 5340 /* 5341 * Block any work scheduling as we do for regular GPU reset 5342 * for the duration of the recovery 5343 */ 5344 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5345 struct amdgpu_ring *ring = adev->rings[i]; 5346 5347 if (!ring || !ring->sched.thread) 5348 continue; 5349 5350 drm_sched_stop(&ring->sched, NULL); 5351 } 5352 atomic_inc(&adev->gpu_reset_counter); 5353 return PCI_ERS_RESULT_NEED_RESET; 5354 case pci_channel_io_perm_failure: 5355 /* Permanent error, prepare for device removal */ 5356 return PCI_ERS_RESULT_DISCONNECT; 5357 } 5358 5359 return PCI_ERS_RESULT_NEED_RESET; 5360 } 5361 5362 /** 5363 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5364 * @pdev: pointer to PCI device 5365 */ 5366 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5367 { 5368 5369 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5370 5371 /* TODO - dump whatever for debugging purposes */ 5372 5373 /* This called only if amdgpu_pci_error_detected returns 5374 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5375 * works, no need to reset slot. 5376 */ 5377 5378 return PCI_ERS_RESULT_RECOVERED; 5379 } 5380 5381 /** 5382 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5383 * @pdev: PCI device struct 5384 * 5385 * Description: This routine is called by the pci error recovery 5386 * code after the PCI slot has been reset, just before we 5387 * should resume normal operations. 
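 * A full ASIC re-init is performed here with the HW reset step itself
 * skipped, since the slot reset has already reset the device.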
5388 */ 5389 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5390 { 5391 struct drm_device *dev = pci_get_drvdata(pdev); 5392 struct amdgpu_device *adev = drm_to_adev(dev); 5393 int r, i; 5394 struct amdgpu_reset_context reset_context; 5395 u32 memsize; 5396 struct list_head device_list; 5397 5398 DRM_INFO("PCI error: slot reset callback!!\n"); 5399 5400 memset(&reset_context, 0, sizeof(reset_context)); 5401 5402 INIT_LIST_HEAD(&device_list); 5403 list_add_tail(&adev->reset_list, &device_list); 5404 5405 /* wait for asic to come out of reset */ 5406 msleep(500); 5407 5408 /* Restore PCI confspace */ 5409 amdgpu_device_load_pci_state(pdev); 5410 5411 /* confirm ASIC came out of reset */ 5412 for (i = 0; i < adev->usec_timeout; i++) { 5413 memsize = amdgpu_asic_get_config_memsize(adev); 5414 5415 if (memsize != 0xffffffff) 5416 break; 5417 udelay(1); 5418 } 5419 if (memsize == 0xffffffff) { 5420 r = -ETIME; 5421 goto out; 5422 } 5423 5424 reset_context.method = AMD_RESET_METHOD_NONE; 5425 reset_context.reset_req_dev = adev; 5426 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5427 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5428 5429 adev->no_hw_access = true; 5430 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5431 adev->no_hw_access = false; 5432 if (r) 5433 goto out; 5434 5435 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5436 5437 out: 5438 if (!r) { 5439 if (amdgpu_device_cache_pci_state(adev->pdev)) 5440 pci_restore_state(adev->pdev); 5441 5442 DRM_INFO("PCIe error recovery succeeded\n"); 5443 } else { 5444 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5445 amdgpu_device_unlock_adev(adev); 5446 } 5447 5448 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5449 } 5450 5451 /** 5452 * amdgpu_pci_resume() - resume normal ops after PCI reset 5453 * @pdev: pointer to PCI device 5454 * 5455 * Called when the error recovery driver tells us that its 5456 * OK to resume normal operation. 
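 * Resubmits the pending jobs, restarts all ring schedulers and drops
 * the reset lock taken in amdgpu_pci_error_detected().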
5457 */ 5458 void amdgpu_pci_resume(struct pci_dev *pdev) 5459 { 5460 struct drm_device *dev = pci_get_drvdata(pdev); 5461 struct amdgpu_device *adev = drm_to_adev(dev); 5462 int i; 5463 5464 5465 DRM_INFO("PCI error: resume callback!!\n"); 5466 5467 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5468 struct amdgpu_ring *ring = adev->rings[i]; 5469 5470 if (!ring || !ring->sched.thread) 5471 continue; 5472 5473 5474 drm_sched_resubmit_jobs(&ring->sched); 5475 drm_sched_start(&ring->sched, true); 5476 } 5477 5478 amdgpu_device_unlock_adev(adev); 5479 } 5480 5481 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5482 { 5483 struct drm_device *dev = pci_get_drvdata(pdev); 5484 struct amdgpu_device *adev = drm_to_adev(dev); 5485 int r; 5486 5487 r = pci_save_state(pdev); 5488 if (!r) { 5489 kfree(adev->pci_state); 5490 5491 adev->pci_state = pci_store_saved_state(pdev); 5492 5493 if (!adev->pci_state) { 5494 DRM_ERROR("Failed to store PCI saved state"); 5495 return false; 5496 } 5497 } else { 5498 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5499 return false; 5500 } 5501 5502 return true; 5503 } 5504 5505 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5506 { 5507 struct drm_device *dev = pci_get_drvdata(pdev); 5508 struct amdgpu_device *adev = drm_to_adev(dev); 5509 int r; 5510 5511 if (!adev->pci_state) 5512 return false; 5513 5514 r = pci_load_saved_state(pdev, adev->pci_state); 5515 5516 if (!r) { 5517 pci_restore_state(pdev); 5518 } else { 5519 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5520 return false; 5521 } 5522 5523 return true; 5524 } 5525 5526 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 5527 struct amdgpu_ring *ring) 5528 { 5529 #ifdef CONFIG_X86_64 5530 if (adev->flags & AMD_IS_APU) 5531 return; 5532 #endif 5533 if (adev->gmc.xgmi.connected_to_cpu) 5534 return; 5535 5536 if (ring && ring->funcs->emit_hdp_flush) 5537 amdgpu_ring_emit_hdp_flush(ring); 5538 else 5539 amdgpu_asic_flush_hdp(adev, ring); 5540 } 5541 5542 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 5543 struct amdgpu_ring *ring) 5544 { 5545 #ifdef CONFIG_X86_64 5546 if (adev->flags & AMD_IS_APU) 5547 return; 5548 #endif 5549 if (adev->gmc.xgmi.connected_to_cpu) 5550 return; 5551 5552 amdgpu_asic_invalidate_hdp(adev, ring); 5553 } 5554