1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 
23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 34 #include <drm/drm_atomic_helper.h> 35 #include <drm/drm_probe_helper.h> 36 #include <drm/amdgpu_drm.h> 37 #include <linux/vgaarb.h> 38 #include <linux/vga_switcheroo.h> 39 #include <linux/efi.h> 40 #include "amdgpu.h" 41 #include "amdgpu_trace.h" 42 #include "amdgpu_i2c.h" 43 #include "atom.h" 44 #include "amdgpu_atombios.h" 45 #include "amdgpu_atomfirmware.h" 46 #include "amd_pcie.h" 47 #ifdef CONFIG_DRM_AMDGPU_SI 48 #include "si.h" 49 #endif 50 #ifdef CONFIG_DRM_AMDGPU_CIK 51 #include "cik.h" 52 #endif 53 #include "vi.h" 54 #include "soc15.h" 55 #include "nv.h" 56 #include "bif/bif_4_1_d.h" 57 #include <linux/pci.h> 58 #include <linux/firmware.h> 59 #include "amdgpu_vf_error.h" 60 61 #include "amdgpu_amdkfd.h" 62 #include "amdgpu_pm.h" 63 64 #include "amdgpu_xgmi.h" 65 #include "amdgpu_ras.h" 66 #include "amdgpu_pmu.h" 67 #include "amdgpu_fru_eeprom.h" 68 #include "amdgpu_reset.h" 69 70 #include <linux/suspend.h> 71 #include <drm/task_barrier.h> 72 #include <linux/pm_runtime.h> 73 74 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 75 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 76 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 77 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 78 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); 83 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 84 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin"); 85 86 #define AMDGPU_RESUME_MS 2000 87 88 const char *amdgpu_asic_name[] = { 89 "TAHITI", 90 "PITCAIRN", 91 "VERDE", 92 "OLAND", 93 "HAINAN", 94 "BONAIRE", 95 "KAVERI", 96 "KABINI", 97 "HAWAII", 98 "MULLINS", 99 
"TOPAZ", 100 "TONGA", 101 "FIJI", 102 "CARRIZO", 103 "STONEY", 104 "POLARIS10", 105 "POLARIS11", 106 "POLARIS12", 107 "VEGAM", 108 "VEGA10", 109 "VEGA12", 110 "VEGA20", 111 "RAVEN", 112 "ARCTURUS", 113 "RENOIR", 114 "ALDEBARAN", 115 "NAVI10", 116 "NAVI14", 117 "NAVI12", 118 "SIENNA_CICHLID", 119 "NAVY_FLOUNDER", 120 "VANGOGH", 121 "DIMGREY_CAVEFISH", 122 "LAST", 123 }; 124 125 /** 126 * DOC: pcie_replay_count 127 * 128 * The amdgpu driver provides a sysfs API for reporting the total number 129 * of PCIe replays (NAKs) 130 * The file pcie_replay_count is used for this and returns the total 131 * number of replays as a sum of the NAKs generated and NAKs received 132 */ 133 134 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 135 struct device_attribute *attr, char *buf) 136 { 137 struct drm_device *ddev = dev_get_drvdata(dev); 138 struct amdgpu_device *adev = drm_to_adev(ddev); 139 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 140 141 return sysfs_emit(buf, "%llu\n", cnt); 142 } 143 144 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 145 amdgpu_device_get_pcie_replay_count, NULL); 146 147 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 148 149 /** 150 * DOC: product_name 151 * 152 * The amdgpu driver provides a sysfs API for reporting the product name 153 * for the device 154 * The file serial_number is used for this and returns the product name 155 * as returned from the FRU. 
156 * NOTE: This is only available for certain server cards 157 */ 158 159 static ssize_t amdgpu_device_get_product_name(struct device *dev, 160 struct device_attribute *attr, char *buf) 161 { 162 struct drm_device *ddev = dev_get_drvdata(dev); 163 struct amdgpu_device *adev = drm_to_adev(ddev); 164 165 return sysfs_emit(buf, "%s\n", adev->product_name); 166 } 167 168 static DEVICE_ATTR(product_name, S_IRUGO, 169 amdgpu_device_get_product_name, NULL); 170 171 /** 172 * DOC: product_number 173 * 174 * The amdgpu driver provides a sysfs API for reporting the part number 175 * for the device 176 * The file serial_number is used for this and returns the part number 177 * as returned from the FRU. 178 * NOTE: This is only available for certain server cards 179 */ 180 181 static ssize_t amdgpu_device_get_product_number(struct device *dev, 182 struct device_attribute *attr, char *buf) 183 { 184 struct drm_device *ddev = dev_get_drvdata(dev); 185 struct amdgpu_device *adev = drm_to_adev(ddev); 186 187 return sysfs_emit(buf, "%s\n", adev->product_number); 188 } 189 190 static DEVICE_ATTR(product_number, S_IRUGO, 191 amdgpu_device_get_product_number, NULL); 192 193 /** 194 * DOC: serial_number 195 * 196 * The amdgpu driver provides a sysfs API for reporting the serial number 197 * for the device 198 * The file serial_number is used for this and returns the serial number 199 * as returned from the FRU. 
200 * NOTE: This is only available for certain server cards 201 */ 202 203 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 204 struct device_attribute *attr, char *buf) 205 { 206 struct drm_device *ddev = dev_get_drvdata(dev); 207 struct amdgpu_device *adev = drm_to_adev(ddev); 208 209 return sysfs_emit(buf, "%s\n", adev->serial); 210 } 211 212 static DEVICE_ATTR(serial_number, S_IRUGO, 213 amdgpu_device_get_serial_number, NULL); 214 215 /** 216 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 217 * 218 * @dev: drm_device pointer 219 * 220 * Returns true if the device is a dGPU with ATPX power control, 221 * otherwise return false. 222 */ 223 bool amdgpu_device_supports_px(struct drm_device *dev) 224 { 225 struct amdgpu_device *adev = drm_to_adev(dev); 226 227 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 228 return true; 229 return false; 230 } 231 232 /** 233 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 234 * 235 * @dev: drm_device pointer 236 * 237 * Returns true if the device is a dGPU with ACPI power control, 238 * otherwise return false. 239 */ 240 bool amdgpu_device_supports_boco(struct drm_device *dev) 241 { 242 struct amdgpu_device *adev = drm_to_adev(dev); 243 244 if (adev->has_pr3 || 245 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 246 return true; 247 return false; 248 } 249 250 /** 251 * amdgpu_device_supports_baco - Does the device support BACO 252 * 253 * @dev: drm_device pointer 254 * 255 * Returns true if the device supporte BACO, 256 * otherwise return false. 
257 */ 258 bool amdgpu_device_supports_baco(struct drm_device *dev) 259 { 260 struct amdgpu_device *adev = drm_to_adev(dev); 261 262 return amdgpu_asic_supports_baco(adev); 263 } 264 265 /* 266 * VRAM access helper functions 267 */ 268 269 /** 270 * amdgpu_device_vram_access - read/write a buffer in vram 271 * 272 * @adev: amdgpu_device pointer 273 * @pos: offset of the buffer in vram 274 * @buf: virtual address of the buffer in system memory 275 * @size: read/write size, sizeof(@buf) must > @size 276 * @write: true - write to vram, otherwise - read from vram 277 */ 278 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 279 uint32_t *buf, size_t size, bool write) 280 { 281 unsigned long flags; 282 uint32_t hi = ~0; 283 uint64_t last; 284 285 286 #ifdef CONFIG_64BIT 287 last = min(pos + size, adev->gmc.visible_vram_size); 288 if (last > pos) { 289 void __iomem *addr = adev->mman.aper_base_kaddr + pos; 290 size_t count = last - pos; 291 292 if (write) { 293 memcpy_toio(addr, buf, count); 294 mb(); 295 amdgpu_asic_flush_hdp(adev, NULL); 296 } else { 297 amdgpu_asic_invalidate_hdp(adev, NULL); 298 mb(); 299 memcpy_fromio(buf, addr, count); 300 } 301 302 if (count == size) 303 return; 304 305 pos += count; 306 buf += count / 4; 307 size -= count; 308 } 309 #endif 310 311 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 312 for (last = pos + size; pos < last; pos += 4) { 313 uint32_t tmp = pos >> 31; 314 315 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 316 if (tmp != hi) { 317 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 318 hi = tmp; 319 } 320 if (write) 321 WREG32_NO_KIQ(mmMM_DATA, *buf++); 322 else 323 *buf++ = RREG32_NO_KIQ(mmMM_DATA); 324 } 325 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 326 } 327 328 /* 329 * register access helper functions. 
330 */ 331 332 /* Check if hw access should be skipped because of hotplug or device error */ 333 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) 334 { 335 if (adev->in_pci_err_recovery) 336 return true; 337 338 #ifdef CONFIG_LOCKDEP 339 /* 340 * This is a bit complicated to understand, so worth a comment. What we assert 341 * here is that the GPU reset is not running on another thread in parallel. 342 * 343 * For this we trylock the read side of the reset semaphore, if that succeeds 344 * we know that the reset is not running in paralell. 345 * 346 * If the trylock fails we assert that we are either already holding the read 347 * side of the lock or are the reset thread itself and hold the write side of 348 * the lock. 349 */ 350 if (in_task()) { 351 if (down_read_trylock(&adev->reset_sem)) 352 up_read(&adev->reset_sem); 353 else 354 lockdep_assert_held(&adev->reset_sem); 355 } 356 #endif 357 return false; 358 } 359 360 /** 361 * amdgpu_device_rreg - read a memory mapped IO or indirect register 362 * 363 * @adev: amdgpu_device pointer 364 * @reg: dword aligned register offset 365 * @acc_flags: access flags which require special behavior 366 * 367 * Returns the 32 bit value from the offset specified. 
368 */ 369 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 370 uint32_t reg, uint32_t acc_flags) 371 { 372 uint32_t ret; 373 374 if (amdgpu_device_skip_hw_access(adev)) 375 return 0; 376 377 if ((reg * 4) < adev->rmmio_size) { 378 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 379 amdgpu_sriov_runtime(adev) && 380 down_read_trylock(&adev->reset_sem)) { 381 ret = amdgpu_kiq_rreg(adev, reg); 382 up_read(&adev->reset_sem); 383 } else { 384 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 385 } 386 } else { 387 ret = adev->pcie_rreg(adev, reg * 4); 388 } 389 390 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 391 392 return ret; 393 } 394 395 /* 396 * MMIO register read with bytes helper functions 397 * @offset:bytes offset from MMIO start 398 * 399 */ 400 401 /** 402 * amdgpu_mm_rreg8 - read a memory mapped IO register 403 * 404 * @adev: amdgpu_device pointer 405 * @offset: byte aligned register offset 406 * 407 * Returns the 8 bit value from the offset specified. 408 */ 409 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 410 { 411 if (amdgpu_device_skip_hw_access(adev)) 412 return 0; 413 414 if (offset < adev->rmmio_size) 415 return (readb(adev->rmmio + offset)); 416 BUG(); 417 } 418 419 /* 420 * MMIO register write with bytes helper functions 421 * @offset:bytes offset from MMIO start 422 * @value: the value want to be written to the register 423 * 424 */ 425 /** 426 * amdgpu_mm_wreg8 - read a memory mapped IO register 427 * 428 * @adev: amdgpu_device pointer 429 * @offset: byte aligned register offset 430 * @value: 8 bit value to write 431 * 432 * Writes the value specified to the offset specified. 
433 */ 434 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 435 { 436 if (amdgpu_device_skip_hw_access(adev)) 437 return; 438 439 if (offset < adev->rmmio_size) 440 writeb(value, adev->rmmio + offset); 441 else 442 BUG(); 443 } 444 445 /** 446 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 447 * 448 * @adev: amdgpu_device pointer 449 * @reg: dword aligned register offset 450 * @v: 32 bit value to write to the register 451 * @acc_flags: access flags which require special behavior 452 * 453 * Writes the value specified to the offset specified. 454 */ 455 void amdgpu_device_wreg(struct amdgpu_device *adev, 456 uint32_t reg, uint32_t v, 457 uint32_t acc_flags) 458 { 459 if (amdgpu_device_skip_hw_access(adev)) 460 return; 461 462 if ((reg * 4) < adev->rmmio_size) { 463 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 464 amdgpu_sriov_runtime(adev) && 465 down_read_trylock(&adev->reset_sem)) { 466 amdgpu_kiq_wreg(adev, reg, v); 467 up_read(&adev->reset_sem); 468 } else { 469 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 470 } 471 } else { 472 adev->pcie_wreg(adev, reg * 4, v); 473 } 474 475 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 476 } 477 478 /* 479 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range 480 * 481 * this function is invoked only the debugfs register access 482 * */ 483 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 484 uint32_t reg, uint32_t v) 485 { 486 if (amdgpu_device_skip_hw_access(adev)) 487 return; 488 489 if (amdgpu_sriov_fullaccess(adev) && 490 adev->gfx.rlc.funcs && 491 adev->gfx.rlc.funcs->is_rlcg_access_range) { 492 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 493 return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v, 0); 494 } else { 495 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 496 } 497 } 498 499 /** 500 * amdgpu_mm_rdoorbell - read a doorbell dword 501 * 502 * @adev: amdgpu_device pointer 503 * @index: 
doorbell index 504 * 505 * Returns the value in the doorbell aperture at the 506 * requested doorbell index (CIK). 507 */ 508 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 509 { 510 if (amdgpu_device_skip_hw_access(adev)) 511 return 0; 512 513 if (index < adev->doorbell.num_doorbells) { 514 return readl(adev->doorbell.ptr + index); 515 } else { 516 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 517 return 0; 518 } 519 } 520 521 /** 522 * amdgpu_mm_wdoorbell - write a doorbell dword 523 * 524 * @adev: amdgpu_device pointer 525 * @index: doorbell index 526 * @v: value to write 527 * 528 * Writes @v to the doorbell aperture at the 529 * requested doorbell index (CIK). 530 */ 531 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 532 { 533 if (amdgpu_device_skip_hw_access(adev)) 534 return; 535 536 if (index < adev->doorbell.num_doorbells) { 537 writel(v, adev->doorbell.ptr + index); 538 } else { 539 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 540 } 541 } 542 543 /** 544 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 545 * 546 * @adev: amdgpu_device pointer 547 * @index: doorbell index 548 * 549 * Returns the value in the doorbell aperture at the 550 * requested doorbell index (VEGA10+). 551 */ 552 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 553 { 554 if (amdgpu_device_skip_hw_access(adev)) 555 return 0; 556 557 if (index < adev->doorbell.num_doorbells) { 558 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 559 } else { 560 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 561 return 0; 562 } 563 } 564 565 /** 566 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 567 * 568 * @adev: amdgpu_device pointer 569 * @index: doorbell index 570 * @v: value to write 571 * 572 * Writes @v to the doorbell aperture at the 573 * requested doorbell index (VEGA10+). 
574 */ 575 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 576 { 577 if (amdgpu_device_skip_hw_access(adev)) 578 return; 579 580 if (index < adev->doorbell.num_doorbells) { 581 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 582 } else { 583 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 584 } 585 } 586 587 /** 588 * amdgpu_device_indirect_rreg - read an indirect register 589 * 590 * @adev: amdgpu_device pointer 591 * @pcie_index: mmio register offset 592 * @pcie_data: mmio register offset 593 * @reg_addr: indirect register address to read from 594 * 595 * Returns the value of indirect register @reg_addr 596 */ 597 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 598 u32 pcie_index, u32 pcie_data, 599 u32 reg_addr) 600 { 601 unsigned long flags; 602 u32 r; 603 void __iomem *pcie_index_offset; 604 void __iomem *pcie_data_offset; 605 606 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 607 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 608 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 609 610 writel(reg_addr, pcie_index_offset); 611 readl(pcie_index_offset); 612 r = readl(pcie_data_offset); 613 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 614 615 return r; 616 } 617 618 /** 619 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 620 * 621 * @adev: amdgpu_device pointer 622 * @pcie_index: mmio register offset 623 * @pcie_data: mmio register offset 624 * @reg_addr: indirect register address to read from 625 * 626 * Returns the value of indirect register @reg_addr 627 */ 628 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 629 u32 pcie_index, u32 pcie_data, 630 u32 reg_addr) 631 { 632 unsigned long flags; 633 u64 r; 634 void __iomem *pcie_index_offset; 635 void __iomem *pcie_data_offset; 636 637 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 638 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 639 pcie_data_offset = 
(void __iomem *)adev->rmmio + pcie_data * 4; 640 641 /* read low 32 bits */ 642 writel(reg_addr, pcie_index_offset); 643 readl(pcie_index_offset); 644 r = readl(pcie_data_offset); 645 /* read high 32 bits */ 646 writel(reg_addr + 4, pcie_index_offset); 647 readl(pcie_index_offset); 648 r |= ((u64)readl(pcie_data_offset) << 32); 649 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 650 651 return r; 652 } 653 654 /** 655 * amdgpu_device_indirect_wreg - write an indirect register address 656 * 657 * @adev: amdgpu_device pointer 658 * @pcie_index: mmio register offset 659 * @pcie_data: mmio register offset 660 * @reg_addr: indirect register offset 661 * @reg_data: indirect register data 662 * 663 */ 664 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 665 u32 pcie_index, u32 pcie_data, 666 u32 reg_addr, u32 reg_data) 667 { 668 unsigned long flags; 669 void __iomem *pcie_index_offset; 670 void __iomem *pcie_data_offset; 671 672 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 673 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 674 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 675 676 writel(reg_addr, pcie_index_offset); 677 readl(pcie_index_offset); 678 writel(reg_data, pcie_data_offset); 679 readl(pcie_data_offset); 680 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 681 } 682 683 /** 684 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 685 * 686 * @adev: amdgpu_device pointer 687 * @pcie_index: mmio register offset 688 * @pcie_data: mmio register offset 689 * @reg_addr: indirect register offset 690 * @reg_data: indirect register data 691 * 692 */ 693 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 694 u32 pcie_index, u32 pcie_data, 695 u32 reg_addr, u64 reg_data) 696 { 697 unsigned long flags; 698 void __iomem *pcie_index_offset; 699 void __iomem *pcie_data_offset; 700 701 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 702 pcie_index_offset = (void __iomem *)adev->rmmio + 
pcie_index * 4; 703 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 704 705 /* write low 32 bits */ 706 writel(reg_addr, pcie_index_offset); 707 readl(pcie_index_offset); 708 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 709 readl(pcie_data_offset); 710 /* write high 32 bits */ 711 writel(reg_addr + 4, pcie_index_offset); 712 readl(pcie_index_offset); 713 writel((u32)(reg_data >> 32), pcie_data_offset); 714 readl(pcie_data_offset); 715 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 716 } 717 718 /** 719 * amdgpu_invalid_rreg - dummy reg read function 720 * 721 * @adev: amdgpu_device pointer 722 * @reg: offset of register 723 * 724 * Dummy register read function. Used for register blocks 725 * that certain asics don't have (all asics). 726 * Returns the value in the register. 727 */ 728 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 729 { 730 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 731 BUG(); 732 return 0; 733 } 734 735 /** 736 * amdgpu_invalid_wreg - dummy reg write function 737 * 738 * @adev: amdgpu_device pointer 739 * @reg: offset of register 740 * @v: value to write to the register 741 * 742 * Dummy register read function. Used for register blocks 743 * that certain asics don't have (all asics). 744 */ 745 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 746 { 747 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 748 reg, v); 749 BUG(); 750 } 751 752 /** 753 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 754 * 755 * @adev: amdgpu_device pointer 756 * @reg: offset of register 757 * 758 * Dummy register read function. Used for register blocks 759 * that certain asics don't have (all asics). 760 * Returns the value in the register. 
761 */ 762 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 763 { 764 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 765 BUG(); 766 return 0; 767 } 768 769 /** 770 * amdgpu_invalid_wreg64 - dummy reg write function 771 * 772 * @adev: amdgpu_device pointer 773 * @reg: offset of register 774 * @v: value to write to the register 775 * 776 * Dummy register read function. Used for register blocks 777 * that certain asics don't have (all asics). 778 */ 779 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 780 { 781 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 782 reg, v); 783 BUG(); 784 } 785 786 /** 787 * amdgpu_block_invalid_rreg - dummy reg read function 788 * 789 * @adev: amdgpu_device pointer 790 * @block: offset of instance 791 * @reg: offset of register 792 * 793 * Dummy register read function. Used for register blocks 794 * that certain asics don't have (all asics). 795 * Returns the value in the register. 796 */ 797 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 798 uint32_t block, uint32_t reg) 799 { 800 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 801 reg, block); 802 BUG(); 803 return 0; 804 } 805 806 /** 807 * amdgpu_block_invalid_wreg - dummy reg write function 808 * 809 * @adev: amdgpu_device pointer 810 * @block: offset of instance 811 * @reg: offset of register 812 * @v: value to write to the register 813 * 814 * Dummy register read function. Used for register blocks 815 * that certain asics don't have (all asics). 
816 */ 817 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 818 uint32_t block, 819 uint32_t reg, uint32_t v) 820 { 821 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 822 reg, block, v); 823 BUG(); 824 } 825 826 /** 827 * amdgpu_device_asic_init - Wrapper for atom asic_init 828 * 829 * @adev: amdgpu_device pointer 830 * 831 * Does any asic specific work and then calls atom asic init. 832 */ 833 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 834 { 835 amdgpu_asic_pre_asic_init(adev); 836 837 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 838 } 839 840 /** 841 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 842 * 843 * @adev: amdgpu_device pointer 844 * 845 * Allocates a scratch page of VRAM for use by various things in the 846 * driver. 847 */ 848 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 849 { 850 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 851 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 852 &adev->vram_scratch.robj, 853 &adev->vram_scratch.gpu_addr, 854 (void **)&adev->vram_scratch.ptr); 855 } 856 857 /** 858 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 859 * 860 * @adev: amdgpu_device pointer 861 * 862 * Frees the VRAM scratch page. 863 */ 864 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 865 { 866 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 867 } 868 869 /** 870 * amdgpu_device_program_register_sequence - program an array of registers. 871 * 872 * @adev: amdgpu_device pointer 873 * @registers: pointer to the register array 874 * @array_size: size of the register array 875 * 876 * Programs an array or registers with and and or masks. 877 * This is a helper for setting golden registers. 
878 */ 879 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 880 const u32 *registers, 881 const u32 array_size) 882 { 883 u32 tmp, reg, and_mask, or_mask; 884 int i; 885 886 if (array_size % 3) 887 return; 888 889 for (i = 0; i < array_size; i +=3) { 890 reg = registers[i + 0]; 891 and_mask = registers[i + 1]; 892 or_mask = registers[i + 2]; 893 894 if (and_mask == 0xffffffff) { 895 tmp = or_mask; 896 } else { 897 tmp = RREG32(reg); 898 tmp &= ~and_mask; 899 if (adev->family >= AMDGPU_FAMILY_AI) 900 tmp |= (or_mask & and_mask); 901 else 902 tmp |= or_mask; 903 } 904 WREG32(reg, tmp); 905 } 906 } 907 908 /** 909 * amdgpu_device_pci_config_reset - reset the GPU 910 * 911 * @adev: amdgpu_device pointer 912 * 913 * Resets the GPU using the pci config reset sequence. 914 * Only applicable to asics prior to vega10. 915 */ 916 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 917 { 918 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 919 } 920 921 /** 922 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 923 * 924 * @adev: amdgpu_device pointer 925 * 926 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 927 */ 928 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 929 { 930 return pci_reset_function(adev->pdev); 931 } 932 933 /* 934 * GPU doorbell aperture helpers function. 935 */ 936 /** 937 * amdgpu_device_doorbell_init - Init doorbell driver information. 938 * 939 * @adev: amdgpu_device pointer 940 * 941 * Init doorbell driver information (CIK) 942 * Returns 0 on success, error on failure. 
943 */ 944 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 945 { 946 947 /* No doorbell on SI hardware generation */ 948 if (adev->asic_type < CHIP_BONAIRE) { 949 adev->doorbell.base = 0; 950 adev->doorbell.size = 0; 951 adev->doorbell.num_doorbells = 0; 952 adev->doorbell.ptr = NULL; 953 return 0; 954 } 955 956 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 957 return -EINVAL; 958 959 amdgpu_asic_init_doorbell_index(adev); 960 961 /* doorbell bar mapping */ 962 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 963 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 964 965 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32), 966 adev->doorbell_index.max_assignment+1); 967 if (adev->doorbell.num_doorbells == 0) 968 return -EINVAL; 969 970 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 971 * paging queue doorbell use the second page. The 972 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 973 * doorbells are in the first page. So with paging queue enabled, 974 * the max num_doorbells should + 1 page (0x400 in dword) 975 */ 976 if (adev->asic_type >= CHIP_VEGA10) 977 adev->doorbell.num_doorbells += 0x400; 978 979 adev->doorbell.ptr = ioremap(adev->doorbell.base, 980 adev->doorbell.num_doorbells * 981 sizeof(u32)); 982 if (adev->doorbell.ptr == NULL) 983 return -ENOMEM; 984 985 return 0; 986 } 987 988 /** 989 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 990 * 991 * @adev: amdgpu_device pointer 992 * 993 * Tear down doorbell driver information (CIK) 994 */ 995 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 996 { 997 iounmap(adev->doorbell.ptr); 998 adev->doorbell.ptr = NULL; 999 } 1000 1001 1002 1003 /* 1004 * amdgpu_device_wb_*() 1005 * Writeback is the method by which the GPU updates special pages in memory 1006 * with the status of certain GPU events (fences, ring pointers,etc.). 
1007 */ 1008 1009 /** 1010 * amdgpu_device_wb_fini - Disable Writeback and free memory 1011 * 1012 * @adev: amdgpu_device pointer 1013 * 1014 * Disables Writeback and frees the Writeback memory (all asics). 1015 * Used at driver shutdown. 1016 */ 1017 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1018 { 1019 if (adev->wb.wb_obj) { 1020 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1021 &adev->wb.gpu_addr, 1022 (void **)&adev->wb.wb); 1023 adev->wb.wb_obj = NULL; 1024 } 1025 } 1026 1027 /** 1028 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory 1029 * 1030 * @adev: amdgpu_device pointer 1031 * 1032 * Initializes writeback and allocates writeback memory (all asics). 1033 * Used at driver startup. 1034 * Returns 0 on success or an -error on failure. 1035 */ 1036 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1037 { 1038 int r; 1039 1040 if (adev->wb.wb_obj == NULL) { 1041 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1042 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1043 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1044 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1045 (void **)&adev->wb.wb); 1046 if (r) { 1047 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1048 return r; 1049 } 1050 1051 adev->wb.num_wb = AMDGPU_MAX_WB; 1052 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1053 1054 /* clear wb memory */ 1055 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1056 } 1057 1058 return 0; 1059 } 1060 1061 /** 1062 * amdgpu_device_wb_get - Allocate a wb entry 1063 * 1064 * @adev: amdgpu_device pointer 1065 * @wb: wb index 1066 * 1067 * Allocate a wb slot for use by the driver (all asics). 1068 * Returns 0 on success or -EINVAL on failure. 
1069 */ 1070 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1071 { 1072 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1073 1074 if (offset < adev->wb.num_wb) { 1075 __set_bit(offset, adev->wb.used); 1076 *wb = offset << 3; /* convert to dw offset */ 1077 return 0; 1078 } else { 1079 return -EINVAL; 1080 } 1081 } 1082 1083 /** 1084 * amdgpu_device_wb_free - Free a wb entry 1085 * 1086 * @adev: amdgpu_device pointer 1087 * @wb: wb index 1088 * 1089 * Free a wb slot allocated for use by the driver (all asics) 1090 */ 1091 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1092 { 1093 wb >>= 3; 1094 if (wb < adev->wb.num_wb) 1095 __clear_bit(wb, adev->wb.used); 1096 } 1097 1098 /** 1099 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1100 * 1101 * @adev: amdgpu_device pointer 1102 * 1103 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1104 * to fail, but if any of the BARs is not accessible after the size we abort 1105 * driver loading by returning -ENODEV. 
1106 */ 1107 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1108 { 1109 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1110 struct pci_bus *root; 1111 struct resource *res; 1112 unsigned i; 1113 u16 cmd; 1114 int r; 1115 1116 /* Bypass for VF */ 1117 if (amdgpu_sriov_vf(adev)) 1118 return 0; 1119 1120 /* skip if the bios has already enabled large BAR */ 1121 if (adev->gmc.real_vram_size && 1122 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1123 return 0; 1124 1125 /* Check if the root BUS has 64bit memory resources */ 1126 root = adev->pdev->bus; 1127 while (root->parent) 1128 root = root->parent; 1129 1130 pci_bus_for_each_resource(root, res, i) { 1131 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1132 res->start > 0x100000000ull) 1133 break; 1134 } 1135 1136 /* Trying to resize is pointless without a root hub window above 4GB */ 1137 if (!res) 1138 return 0; 1139 1140 /* Limit the BAR size to what is available */ 1141 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1142 rbar_size); 1143 1144 /* Disable memory decoding while we change the BAR addresses and size */ 1145 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1146 pci_write_config_word(adev->pdev, PCI_COMMAND, 1147 cmd & ~PCI_COMMAND_MEMORY); 1148 1149 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1150 amdgpu_device_doorbell_fini(adev); 1151 if (adev->asic_type >= CHIP_BONAIRE) 1152 pci_release_resource(adev->pdev, 2); 1153 1154 pci_release_resource(adev->pdev, 0); 1155 1156 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1157 if (r == -ENOSPC) 1158 DRM_INFO("Not enough PCI address space for a large BAR."); 1159 else if (r && r != -ENOTSUPP) 1160 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1161 1162 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1163 1164 /* When the doorbell or fb BAR isn't available we have no chance of 1165 * using the device. 
1166 */ 1167 r = amdgpu_device_doorbell_init(adev); 1168 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1169 return -ENODEV; 1170 1171 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1172 1173 return 0; 1174 } 1175 1176 /* 1177 * GPU helpers function. 1178 */ 1179 /** 1180 * amdgpu_device_need_post - check if the hw need post or not 1181 * 1182 * @adev: amdgpu_device pointer 1183 * 1184 * Check if the asic has been initialized (all asics) at driver startup 1185 * or post is needed if hw reset is performed. 1186 * Returns true if need or false if not. 1187 */ 1188 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1189 { 1190 uint32_t reg; 1191 1192 if (amdgpu_sriov_vf(adev)) 1193 return false; 1194 1195 if (amdgpu_passthrough(adev)) { 1196 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1197 * some old smc fw still need driver do vPost otherwise gpu hang, while 1198 * those smc fw version above 22.15 doesn't have this flaw, so we force 1199 * vpost executed for smc version below 22.15 1200 */ 1201 if (adev->asic_type == CHIP_FIJI) { 1202 int err; 1203 uint32_t fw_ver; 1204 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1205 /* force vPost if error occured */ 1206 if (err) 1207 return true; 1208 1209 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1210 if (fw_ver < 0x00160e00) 1211 return true; 1212 } 1213 } 1214 1215 /* Don't post if we need to reset whole hive on init */ 1216 if (adev->gmc.xgmi.pending_reset) 1217 return false; 1218 1219 if (adev->has_hw_reset) { 1220 adev->has_hw_reset = false; 1221 return true; 1222 } 1223 1224 /* bios scratch used on CIK+ */ 1225 if (adev->asic_type >= CHIP_BONAIRE) 1226 return amdgpu_atombios_scratch_need_asic_init(adev); 1227 1228 /* check MEM_SIZE for older asics */ 1229 reg = amdgpu_asic_get_config_memsize(adev); 1230 1231 if ((reg != 0) && (reg != 0xffffffff)) 1232 return false; 1233 1234 return true; 1235 } 1236 1237 /* if we get 
transitioned to only one device, take VGA back */ 1238 /** 1239 * amdgpu_device_vga_set_decode - enable/disable vga decode 1240 * 1241 * @cookie: amdgpu_device pointer 1242 * @state: enable/disable vga decode 1243 * 1244 * Enable/disable vga decode (all asics). 1245 * Returns VGA resource flags. 1246 */ 1247 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state) 1248 { 1249 struct amdgpu_device *adev = cookie; 1250 amdgpu_asic_set_vga_state(adev, state); 1251 if (state) 1252 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1253 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1254 else 1255 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1256 } 1257 1258 /** 1259 * amdgpu_device_check_block_size - validate the vm block size 1260 * 1261 * @adev: amdgpu_device pointer 1262 * 1263 * Validates the vm block size specified via module parameter. 1264 * The vm block size defines number of bits in page table versus page directory, 1265 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1266 * page table and the remaining bits are in the page directory. 1267 */ 1268 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1269 { 1270 /* defines number of bits in page table versus page directory, 1271 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1272 * page table and the remaining bits are in the page directory */ 1273 if (amdgpu_vm_block_size == -1) 1274 return; 1275 1276 if (amdgpu_vm_block_size < 9) { 1277 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1278 amdgpu_vm_block_size); 1279 amdgpu_vm_block_size = -1; 1280 } 1281 } 1282 1283 /** 1284 * amdgpu_device_check_vm_size - validate the vm size 1285 * 1286 * @adev: amdgpu_device pointer 1287 * 1288 * Validates the vm size in GB specified via module parameter. 1289 * The VM size is the size of the GPU virtual memory space in GB. 
1290 */ 1291 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1292 { 1293 /* no need to check the default value */ 1294 if (amdgpu_vm_size == -1) 1295 return; 1296 1297 if (amdgpu_vm_size < 1) { 1298 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1299 amdgpu_vm_size); 1300 amdgpu_vm_size = -1; 1301 } 1302 } 1303 1304 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1305 { 1306 struct sysinfo si; 1307 bool is_os_64 = (sizeof(void *) == 8); 1308 uint64_t total_memory; 1309 uint64_t dram_size_seven_GB = 0x1B8000000; 1310 uint64_t dram_size_three_GB = 0xB8000000; 1311 1312 if (amdgpu_smu_memory_pool_size == 0) 1313 return; 1314 1315 if (!is_os_64) { 1316 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1317 goto def_value; 1318 } 1319 si_meminfo(&si); 1320 total_memory = (uint64_t)si.totalram * si.mem_unit; 1321 1322 if ((amdgpu_smu_memory_pool_size == 1) || 1323 (amdgpu_smu_memory_pool_size == 2)) { 1324 if (total_memory < dram_size_three_GB) 1325 goto def_value1; 1326 } else if ((amdgpu_smu_memory_pool_size == 4) || 1327 (amdgpu_smu_memory_pool_size == 8)) { 1328 if (total_memory < dram_size_seven_GB) 1329 goto def_value1; 1330 } else { 1331 DRM_WARN("Smu memory pool size not supported\n"); 1332 goto def_value; 1333 } 1334 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1335 1336 return; 1337 1338 def_value1: 1339 DRM_WARN("No enough system memory\n"); 1340 def_value: 1341 adev->pm.smu_prv_buffer_size = 0; 1342 } 1343 1344 /** 1345 * amdgpu_device_check_arguments - validate module params 1346 * 1347 * @adev: amdgpu_device pointer 1348 * 1349 * Validates certain module parameters and updates 1350 * the associated values used by the driver (all asics). 
1351 */ 1352 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1353 { 1354 if (amdgpu_sched_jobs < 4) { 1355 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1356 amdgpu_sched_jobs); 1357 amdgpu_sched_jobs = 4; 1358 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1359 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1360 amdgpu_sched_jobs); 1361 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1362 } 1363 1364 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1365 /* gart size must be greater or equal to 32M */ 1366 dev_warn(adev->dev, "gart size (%d) too small\n", 1367 amdgpu_gart_size); 1368 amdgpu_gart_size = -1; 1369 } 1370 1371 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1372 /* gtt size must be greater or equal to 32M */ 1373 dev_warn(adev->dev, "gtt size (%d) too small\n", 1374 amdgpu_gtt_size); 1375 amdgpu_gtt_size = -1; 1376 } 1377 1378 /* valid range is between 4 and 9 inclusive */ 1379 if (amdgpu_vm_fragment_size != -1 && 1380 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1381 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1382 amdgpu_vm_fragment_size = -1; 1383 } 1384 1385 if (amdgpu_sched_hw_submission < 2) { 1386 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1387 amdgpu_sched_hw_submission); 1388 amdgpu_sched_hw_submission = 2; 1389 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1390 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1391 amdgpu_sched_hw_submission); 1392 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1393 } 1394 1395 amdgpu_device_check_smu_prv_buffer_size(adev); 1396 1397 amdgpu_device_check_vm_size(adev); 1398 1399 amdgpu_device_check_block_size(adev); 1400 1401 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1402 1403 amdgpu_gmc_tmz_set(adev); 1404 1405 amdgpu_gmc_noretry_set(adev); 1406 1407 return 0; 1408 } 1409 
1410 /** 1411 * amdgpu_switcheroo_set_state - set switcheroo state 1412 * 1413 * @pdev: pci dev pointer 1414 * @state: vga_switcheroo state 1415 * 1416 * Callback for the switcheroo driver. Suspends or resumes the 1417 * the asics before or after it is powered up using ACPI methods. 1418 */ 1419 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1420 enum vga_switcheroo_state state) 1421 { 1422 struct drm_device *dev = pci_get_drvdata(pdev); 1423 int r; 1424 1425 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1426 return; 1427 1428 if (state == VGA_SWITCHEROO_ON) { 1429 pr_info("switched on\n"); 1430 /* don't suspend or resume card normally */ 1431 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1432 1433 pci_set_power_state(pdev, PCI_D0); 1434 amdgpu_device_load_pci_state(pdev); 1435 r = pci_enable_device(pdev); 1436 if (r) 1437 DRM_WARN("pci_enable_device failed (%d)\n", r); 1438 amdgpu_device_resume(dev, true); 1439 1440 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1441 } else { 1442 pr_info("switched off\n"); 1443 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1444 amdgpu_device_suspend(dev, true); 1445 amdgpu_device_cache_pci_state(pdev); 1446 /* Shut down the device */ 1447 pci_disable_device(pdev); 1448 pci_set_power_state(pdev, PCI_D3cold); 1449 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1450 } 1451 } 1452 1453 /** 1454 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1455 * 1456 * @pdev: pci dev pointer 1457 * 1458 * Callback for the switcheroo driver. Check of the switcheroo 1459 * state can be changed. 1460 * Returns true if the state can be changed, false if not. 1461 */ 1462 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1463 { 1464 struct drm_device *dev = pci_get_drvdata(pdev); 1465 1466 /* 1467 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1468 * locking inversion with the driver load path. 
And the access here is 1469 * completely racy anyway. So don't bother with locking for now. 1470 */ 1471 return atomic_read(&dev->open_count) == 0; 1472 } 1473 1474 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1475 .set_gpu_state = amdgpu_switcheroo_set_state, 1476 .reprobe = NULL, 1477 .can_switch = amdgpu_switcheroo_can_switch, 1478 }; 1479 1480 /** 1481 * amdgpu_device_ip_set_clockgating_state - set the CG state 1482 * 1483 * @dev: amdgpu_device pointer 1484 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1485 * @state: clockgating state (gate or ungate) 1486 * 1487 * Sets the requested clockgating state for all instances of 1488 * the hardware IP specified. 1489 * Returns the error code from the last instance. 1490 */ 1491 int amdgpu_device_ip_set_clockgating_state(void *dev, 1492 enum amd_ip_block_type block_type, 1493 enum amd_clockgating_state state) 1494 { 1495 struct amdgpu_device *adev = dev; 1496 int i, r = 0; 1497 1498 for (i = 0; i < adev->num_ip_blocks; i++) { 1499 if (!adev->ip_blocks[i].status.valid) 1500 continue; 1501 if (adev->ip_blocks[i].version->type != block_type) 1502 continue; 1503 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1504 continue; 1505 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1506 (void *)adev, state); 1507 if (r) 1508 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1509 adev->ip_blocks[i].version->funcs->name, r); 1510 } 1511 return r; 1512 } 1513 1514 /** 1515 * amdgpu_device_ip_set_powergating_state - set the PG state 1516 * 1517 * @dev: amdgpu_device pointer 1518 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1519 * @state: powergating state (gate or ungate) 1520 * 1521 * Sets the requested powergating state for all instances of 1522 * the hardware IP specified. 1523 * Returns the error code from the last instance. 
1524 */ 1525 int amdgpu_device_ip_set_powergating_state(void *dev, 1526 enum amd_ip_block_type block_type, 1527 enum amd_powergating_state state) 1528 { 1529 struct amdgpu_device *adev = dev; 1530 int i, r = 0; 1531 1532 for (i = 0; i < adev->num_ip_blocks; i++) { 1533 if (!adev->ip_blocks[i].status.valid) 1534 continue; 1535 if (adev->ip_blocks[i].version->type != block_type) 1536 continue; 1537 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1538 continue; 1539 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1540 (void *)adev, state); 1541 if (r) 1542 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1543 adev->ip_blocks[i].version->funcs->name, r); 1544 } 1545 return r; 1546 } 1547 1548 /** 1549 * amdgpu_device_ip_get_clockgating_state - get the CG state 1550 * 1551 * @adev: amdgpu_device pointer 1552 * @flags: clockgating feature flags 1553 * 1554 * Walks the list of IPs on the device and updates the clockgating 1555 * flags for each IP. 1556 * Updates @flags with the feature flags for each hardware IP where 1557 * clockgating is enabled. 1558 */ 1559 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1560 u32 *flags) 1561 { 1562 int i; 1563 1564 for (i = 0; i < adev->num_ip_blocks; i++) { 1565 if (!adev->ip_blocks[i].status.valid) 1566 continue; 1567 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1568 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1569 } 1570 } 1571 1572 /** 1573 * amdgpu_device_ip_wait_for_idle - wait for idle 1574 * 1575 * @adev: amdgpu_device pointer 1576 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1577 * 1578 * Waits for the request hardware IP to be idle. 1579 * Returns 0 for success or a negative error code on failure. 
1580 */ 1581 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1582 enum amd_ip_block_type block_type) 1583 { 1584 int i, r; 1585 1586 for (i = 0; i < adev->num_ip_blocks; i++) { 1587 if (!adev->ip_blocks[i].status.valid) 1588 continue; 1589 if (adev->ip_blocks[i].version->type == block_type) { 1590 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1591 if (r) 1592 return r; 1593 break; 1594 } 1595 } 1596 return 0; 1597 1598 } 1599 1600 /** 1601 * amdgpu_device_ip_is_idle - is the hardware IP idle 1602 * 1603 * @adev: amdgpu_device pointer 1604 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1605 * 1606 * Check if the hardware IP is idle or not. 1607 * Returns true if it the IP is idle, false if not. 1608 */ 1609 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1610 enum amd_ip_block_type block_type) 1611 { 1612 int i; 1613 1614 for (i = 0; i < adev->num_ip_blocks; i++) { 1615 if (!adev->ip_blocks[i].status.valid) 1616 continue; 1617 if (adev->ip_blocks[i].version->type == block_type) 1618 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1619 } 1620 return true; 1621 1622 } 1623 1624 /** 1625 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1626 * 1627 * @adev: amdgpu_device pointer 1628 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1629 * 1630 * Returns a pointer to the hardware IP block structure 1631 * if it exists for the asic, otherwise NULL. 
1632 */ 1633 struct amdgpu_ip_block * 1634 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1635 enum amd_ip_block_type type) 1636 { 1637 int i; 1638 1639 for (i = 0; i < adev->num_ip_blocks; i++) 1640 if (adev->ip_blocks[i].version->type == type) 1641 return &adev->ip_blocks[i]; 1642 1643 return NULL; 1644 } 1645 1646 /** 1647 * amdgpu_device_ip_block_version_cmp 1648 * 1649 * @adev: amdgpu_device pointer 1650 * @type: enum amd_ip_block_type 1651 * @major: major version 1652 * @minor: minor version 1653 * 1654 * return 0 if equal or greater 1655 * return 1 if smaller or the ip_block doesn't exist 1656 */ 1657 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1658 enum amd_ip_block_type type, 1659 u32 major, u32 minor) 1660 { 1661 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1662 1663 if (ip_block && ((ip_block->version->major > major) || 1664 ((ip_block->version->major == major) && 1665 (ip_block->version->minor >= minor)))) 1666 return 0; 1667 1668 return 1; 1669 } 1670 1671 /** 1672 * amdgpu_device_ip_block_add 1673 * 1674 * @adev: amdgpu_device pointer 1675 * @ip_block_version: pointer to the IP to add 1676 * 1677 * Adds the IP block driver information to the collection of IPs 1678 * on the asic. 1679 */ 1680 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1681 const struct amdgpu_ip_block_version *ip_block_version) 1682 { 1683 if (!ip_block_version) 1684 return -EINVAL; 1685 1686 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1687 ip_block_version->funcs->name); 1688 1689 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1690 1691 return 0; 1692 } 1693 1694 /** 1695 * amdgpu_device_enable_virtual_display - enable virtual display feature 1696 * 1697 * @adev: amdgpu_device pointer 1698 * 1699 * Enabled the virtual display feature if the user has enabled it via 1700 * the module parameter virtual_display. 
This feature provides a virtual 1701 * display hardware on headless boards or in virtualized environments. 1702 * This function parses and validates the configuration string specified by 1703 * the user and configues the virtual display configuration (number of 1704 * virtual connectors, crtcs, etc.) specified. 1705 */ 1706 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1707 { 1708 adev->enable_virtual_display = false; 1709 1710 if (amdgpu_virtual_display) { 1711 const char *pci_address_name = pci_name(adev->pdev); 1712 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1713 1714 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1715 pciaddstr_tmp = pciaddstr; 1716 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1717 pciaddname = strsep(&pciaddname_tmp, ","); 1718 if (!strcmp("all", pciaddname) 1719 || !strcmp(pci_address_name, pciaddname)) { 1720 long num_crtc; 1721 int res = -1; 1722 1723 adev->enable_virtual_display = true; 1724 1725 if (pciaddname_tmp) 1726 res = kstrtol(pciaddname_tmp, 10, 1727 &num_crtc); 1728 1729 if (!res) { 1730 if (num_crtc < 1) 1731 num_crtc = 1; 1732 if (num_crtc > 6) 1733 num_crtc = 6; 1734 adev->mode_info.num_crtc = num_crtc; 1735 } else { 1736 adev->mode_info.num_crtc = 1; 1737 } 1738 break; 1739 } 1740 } 1741 1742 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1743 amdgpu_virtual_display, pci_address_name, 1744 adev->enable_virtual_display, adev->mode_info.num_crtc); 1745 1746 kfree(pciaddstr); 1747 } 1748 } 1749 1750 /** 1751 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1752 * 1753 * @adev: amdgpu_device pointer 1754 * 1755 * Parses the asic configuration parameters specified in the gpu info 1756 * firmware and makes them availale to the driver for use in configuring 1757 * the asic. 1758 * Returns 0 on success, -EINVAL on failure. 
1759 */ 1760 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1761 { 1762 const char *chip_name; 1763 char fw_name[40]; 1764 int err; 1765 const struct gpu_info_firmware_header_v1_0 *hdr; 1766 1767 adev->firmware.gpu_info_fw = NULL; 1768 1769 if (adev->mman.discovery_bin) { 1770 amdgpu_discovery_get_gfx_info(adev); 1771 1772 /* 1773 * FIXME: The bounding box is still needed by Navi12, so 1774 * temporarily read it from gpu_info firmware. Should be droped 1775 * when DAL no longer needs it. 1776 */ 1777 if (adev->asic_type != CHIP_NAVI12) 1778 return 0; 1779 } 1780 1781 switch (adev->asic_type) { 1782 #ifdef CONFIG_DRM_AMDGPU_SI 1783 case CHIP_VERDE: 1784 case CHIP_TAHITI: 1785 case CHIP_PITCAIRN: 1786 case CHIP_OLAND: 1787 case CHIP_HAINAN: 1788 #endif 1789 #ifdef CONFIG_DRM_AMDGPU_CIK 1790 case CHIP_BONAIRE: 1791 case CHIP_HAWAII: 1792 case CHIP_KAVERI: 1793 case CHIP_KABINI: 1794 case CHIP_MULLINS: 1795 #endif 1796 case CHIP_TOPAZ: 1797 case CHIP_TONGA: 1798 case CHIP_FIJI: 1799 case CHIP_POLARIS10: 1800 case CHIP_POLARIS11: 1801 case CHIP_POLARIS12: 1802 case CHIP_VEGAM: 1803 case CHIP_CARRIZO: 1804 case CHIP_STONEY: 1805 case CHIP_VEGA20: 1806 case CHIP_ALDEBARAN: 1807 case CHIP_SIENNA_CICHLID: 1808 case CHIP_NAVY_FLOUNDER: 1809 case CHIP_DIMGREY_CAVEFISH: 1810 default: 1811 return 0; 1812 case CHIP_VEGA10: 1813 chip_name = "vega10"; 1814 break; 1815 case CHIP_VEGA12: 1816 chip_name = "vega12"; 1817 break; 1818 case CHIP_RAVEN: 1819 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1820 chip_name = "raven2"; 1821 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1822 chip_name = "picasso"; 1823 else 1824 chip_name = "raven"; 1825 break; 1826 case CHIP_ARCTURUS: 1827 chip_name = "arcturus"; 1828 break; 1829 case CHIP_RENOIR: 1830 if (adev->apu_flags & AMD_APU_IS_RENOIR) 1831 chip_name = "renoir"; 1832 else 1833 chip_name = "green_sardine"; 1834 break; 1835 case CHIP_NAVI10: 1836 chip_name = "navi10"; 1837 break; 1838 case CHIP_NAVI14: 1839 chip_name = 
"navi14"; 1840 break; 1841 case CHIP_NAVI12: 1842 chip_name = "navi12"; 1843 break; 1844 case CHIP_VANGOGH: 1845 chip_name = "vangogh"; 1846 break; 1847 } 1848 1849 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1850 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 1851 if (err) { 1852 dev_err(adev->dev, 1853 "Failed to load gpu_info firmware \"%s\"\n", 1854 fw_name); 1855 goto out; 1856 } 1857 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 1858 if (err) { 1859 dev_err(adev->dev, 1860 "Failed to validate gpu_info firmware \"%s\"\n", 1861 fw_name); 1862 goto out; 1863 } 1864 1865 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1866 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1867 1868 switch (hdr->version_major) { 1869 case 1: 1870 { 1871 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1872 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1873 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1874 1875 /* 1876 * Should be droped when DAL no longer needs it. 
1877 */ 1878 if (adev->asic_type == CHIP_NAVI12) 1879 goto parse_soc_bounding_box; 1880 1881 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1882 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1883 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1884 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1885 adev->gfx.config.max_texture_channel_caches = 1886 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1887 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1888 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1889 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1890 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1891 adev->gfx.config.double_offchip_lds_buf = 1892 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1893 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1894 adev->gfx.cu_info.max_waves_per_simd = 1895 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 1896 adev->gfx.cu_info.max_scratch_slots_per_cu = 1897 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 1898 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 1899 if (hdr->version_minor >= 1) { 1900 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 1901 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 1902 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1903 adev->gfx.config.num_sc_per_sh = 1904 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 1905 adev->gfx.config.num_packer_per_sc = 1906 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 1907 } 1908 1909 parse_soc_bounding_box: 1910 /* 1911 * soc bounding box info is not integrated in disocovery table, 1912 * we always need to parse it from gpu info firmware if needed. 
1913 */ 1914 if (hdr->version_minor == 2) { 1915 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 1916 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 1917 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1918 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 1919 } 1920 break; 1921 } 1922 default: 1923 dev_err(adev->dev, 1924 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 1925 err = -EINVAL; 1926 goto out; 1927 } 1928 out: 1929 return err; 1930 } 1931 1932 /** 1933 * amdgpu_device_ip_early_init - run early init for hardware IPs 1934 * 1935 * @adev: amdgpu_device pointer 1936 * 1937 * Early initialization pass for hardware IPs. The hardware IPs that make 1938 * up each asic are discovered each IP's early_init callback is run. This 1939 * is the first stage in initializing the asic. 1940 * Returns 0 on success, negative error code on failure. 1941 */ 1942 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 1943 { 1944 int i, r; 1945 1946 amdgpu_device_enable_virtual_display(adev); 1947 1948 if (amdgpu_sriov_vf(adev)) { 1949 r = amdgpu_virt_request_full_gpu(adev, true); 1950 if (r) 1951 return r; 1952 } 1953 1954 switch (adev->asic_type) { 1955 #ifdef CONFIG_DRM_AMDGPU_SI 1956 case CHIP_VERDE: 1957 case CHIP_TAHITI: 1958 case CHIP_PITCAIRN: 1959 case CHIP_OLAND: 1960 case CHIP_HAINAN: 1961 adev->family = AMDGPU_FAMILY_SI; 1962 r = si_set_ip_blocks(adev); 1963 if (r) 1964 return r; 1965 break; 1966 #endif 1967 #ifdef CONFIG_DRM_AMDGPU_CIK 1968 case CHIP_BONAIRE: 1969 case CHIP_HAWAII: 1970 case CHIP_KAVERI: 1971 case CHIP_KABINI: 1972 case CHIP_MULLINS: 1973 if (adev->flags & AMD_IS_APU) 1974 adev->family = AMDGPU_FAMILY_KV; 1975 else 1976 adev->family = AMDGPU_FAMILY_CI; 1977 1978 r = cik_set_ip_blocks(adev); 1979 if (r) 1980 return r; 1981 break; 1982 #endif 1983 case CHIP_TOPAZ: 1984 case CHIP_TONGA: 1985 case CHIP_FIJI: 1986 case CHIP_POLARIS10: 1987 case CHIP_POLARIS11: 1988 case 
CHIP_POLARIS12: 1989 case CHIP_VEGAM: 1990 case CHIP_CARRIZO: 1991 case CHIP_STONEY: 1992 if (adev->flags & AMD_IS_APU) 1993 adev->family = AMDGPU_FAMILY_CZ; 1994 else 1995 adev->family = AMDGPU_FAMILY_VI; 1996 1997 r = vi_set_ip_blocks(adev); 1998 if (r) 1999 return r; 2000 break; 2001 case CHIP_VEGA10: 2002 case CHIP_VEGA12: 2003 case CHIP_VEGA20: 2004 case CHIP_RAVEN: 2005 case CHIP_ARCTURUS: 2006 case CHIP_RENOIR: 2007 case CHIP_ALDEBARAN: 2008 if (adev->flags & AMD_IS_APU) 2009 adev->family = AMDGPU_FAMILY_RV; 2010 else 2011 adev->family = AMDGPU_FAMILY_AI; 2012 2013 r = soc15_set_ip_blocks(adev); 2014 if (r) 2015 return r; 2016 break; 2017 case CHIP_NAVI10: 2018 case CHIP_NAVI14: 2019 case CHIP_NAVI12: 2020 case CHIP_SIENNA_CICHLID: 2021 case CHIP_NAVY_FLOUNDER: 2022 case CHIP_DIMGREY_CAVEFISH: 2023 case CHIP_VANGOGH: 2024 if (adev->asic_type == CHIP_VANGOGH) 2025 adev->family = AMDGPU_FAMILY_VGH; 2026 else 2027 adev->family = AMDGPU_FAMILY_NV; 2028 2029 r = nv_set_ip_blocks(adev); 2030 if (r) 2031 return r; 2032 break; 2033 default: 2034 /* FIXME: not supported yet */ 2035 return -EINVAL; 2036 } 2037 2038 amdgpu_amdkfd_device_probe(adev); 2039 2040 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2041 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2042 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2043 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2044 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2045 2046 for (i = 0; i < adev->num_ip_blocks; i++) { 2047 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2048 DRM_ERROR("disabled ip block: %d <%s>\n", 2049 i, adev->ip_blocks[i].version->funcs->name); 2050 adev->ip_blocks[i].status.valid = false; 2051 } else { 2052 if (adev->ip_blocks[i].version->funcs->early_init) { 2053 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2054 if (r == -ENOENT) { 2055 adev->ip_blocks[i].status.valid = false; 2056 } else if (r) { 2057 DRM_ERROR("early_init of IP block <%s> failed 
%d\n", 2058 adev->ip_blocks[i].version->funcs->name, r); 2059 return r; 2060 } else { 2061 adev->ip_blocks[i].status.valid = true; 2062 } 2063 } else { 2064 adev->ip_blocks[i].status.valid = true; 2065 } 2066 } 2067 /* get the vbios after the asic_funcs are set up */ 2068 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2069 r = amdgpu_device_parse_gpu_info_fw(adev); 2070 if (r) 2071 return r; 2072 2073 /* Read BIOS */ 2074 if (!amdgpu_get_bios(adev)) 2075 return -EINVAL; 2076 2077 r = amdgpu_atombios_init(adev); 2078 if (r) { 2079 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2080 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2081 return r; 2082 } 2083 2084 /*get pf2vf msg info at it's earliest time*/ 2085 if (amdgpu_sriov_vf(adev)) 2086 amdgpu_virt_init_data_exchange(adev); 2087 2088 } 2089 } 2090 2091 adev->cg_flags &= amdgpu_cg_mask; 2092 adev->pg_flags &= amdgpu_pg_mask; 2093 2094 return 0; 2095 } 2096 2097 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2098 { 2099 int i, r; 2100 2101 for (i = 0; i < adev->num_ip_blocks; i++) { 2102 if (!adev->ip_blocks[i].status.sw) 2103 continue; 2104 if (adev->ip_blocks[i].status.hw) 2105 continue; 2106 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2107 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2108 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2109 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2110 if (r) { 2111 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2112 adev->ip_blocks[i].version->funcs->name, r); 2113 return r; 2114 } 2115 adev->ip_blocks[i].status.hw = true; 2116 } 2117 } 2118 2119 return 0; 2120 } 2121 2122 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2123 { 2124 int i, r; 2125 2126 for (i = 0; i < adev->num_ip_blocks; i++) { 2127 if (!adev->ip_blocks[i].status.sw) 2128 continue; 2129 if (adev->ip_blocks[i].status.hw) 
2130 continue; 2131 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2132 if (r) { 2133 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2134 adev->ip_blocks[i].version->funcs->name, r); 2135 return r; 2136 } 2137 adev->ip_blocks[i].status.hw = true; 2138 } 2139 2140 return 0; 2141 } 2142 2143 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2144 { 2145 int r = 0; 2146 int i; 2147 uint32_t smu_version; 2148 2149 if (adev->asic_type >= CHIP_VEGA10) { 2150 for (i = 0; i < adev->num_ip_blocks; i++) { 2151 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2152 continue; 2153 2154 if (!adev->ip_blocks[i].status.sw) 2155 continue; 2156 2157 /* no need to do the fw loading again if already done*/ 2158 if (adev->ip_blocks[i].status.hw == true) 2159 break; 2160 2161 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2162 r = adev->ip_blocks[i].version->funcs->resume(adev); 2163 if (r) { 2164 DRM_ERROR("resume of IP block <%s> failed %d\n", 2165 adev->ip_blocks[i].version->funcs->name, r); 2166 return r; 2167 } 2168 } else { 2169 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2170 if (r) { 2171 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2172 adev->ip_blocks[i].version->funcs->name, r); 2173 return r; 2174 } 2175 } 2176 2177 adev->ip_blocks[i].status.hw = true; 2178 break; 2179 } 2180 } 2181 2182 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2183 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2184 2185 return r; 2186 } 2187 2188 /** 2189 * amdgpu_device_ip_init - run init for hardware IPs 2190 * 2191 * @adev: amdgpu_device pointer 2192 * 2193 * Main initialization pass for hardware IPs. The list of all the hardware 2194 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2195 * are run. sw_init initializes the software state associated with each IP 2196 * and hw_init initializes the hardware associated with each IP. 2197 * Returns 0 on success, negative error code on failure. 
 */
static int amdgpu_device_ip_init(struct amdgpu_device *adev)
{
	int i, r;

	/*
	 * RAS setup comes first.  A failure here returns directly and does
	 * not go through the init_failed label below.
	 */
	r = amdgpu_ras_init(adev);
	if (r)
		return r;

	/* Run sw_init for every valid IP block, in list order. */
	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
		if (r) {
			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
			goto init_failed;
		}
		adev->ip_blocks[i].status.sw = true;

		/* need to do gmc hw init early so we can allocate gpu mem */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
			r = amdgpu_device_vram_scratch_init(adev);
			if (r) {
				DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
				goto init_failed;
			}
			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
			if (r) {
				DRM_ERROR("hw_init %d failed %d\n", i, r);
				goto init_failed;
			}
			r = amdgpu_device_wb_init(adev);
			if (r) {
				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
				goto init_failed;
			}
			/* GMC is marked hw-initialized only after wb init succeeded. */
			adev->ip_blocks[i].status.hw = true;

			/* right after GMC hw init, we create CSA */
			if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
								AMDGPU_GEM_DOMAIN_VRAM,
								AMDGPU_CSA_SIZE);
				if (r) {
					DRM_ERROR("allocate CSA failed %d\n", r);
					goto init_failed;
				}
			}
		}
	}

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_init_data_exchange(adev);

	r = amdgpu_ib_pool_init(adev);
	if (r) {
		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
		goto init_failed;
	}

	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
	if (r)
		goto init_failed;

	/* hw init is split in two phases with firmware loading in between. */
	r = amdgpu_device_ip_hw_init_phase1(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_ip_hw_init_phase2(adev);
	if (r)
		goto init_failed;

	/*
	 * retired pages will be loaded from eeprom and reserved here,
	 * it should be called after amdgpu_device_ip_hw_init_phase2  since
	 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
	 * for I2C communication which only true at this point.
	 *
	 * amdgpu_ras_recovery_init may fail, but the upper only cares the
	 * failure from bad gpu situation and stop amdgpu init process
	 * accordingly. For other failed cases, it will still release all
	 * the resource and print error message, rather than returning one
	 * negative value to upper level.
	 *
	 * Note: theoretically, this should be called before all vram allocations
	 * to protect retired page from abusing
	 */
	r = amdgpu_ras_recovery_init(adev);
	if (r)
		goto init_failed;

	if (adev->gmc.xgmi.num_physical_nodes > 1)
		amdgpu_xgmi_add_device(adev);

	/* Don't init kfd if whole hive need to be reset during init */
	if (!adev->gmc.xgmi.pending_reset)
		amdgpu_amdkfd_device_init(adev);

	amdgpu_fru_get_product_info(adev);

	/*
	 * Reached on success as well as on failure: r holds the result of the
	 * last step.  Nothing set up above is unwound here.
	 * NOTE(review): presumably the caller tears down on error - confirm.
	 */
init_failed:
	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, true);

	return r;
}

/**
 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
 *
 * @adev: amdgpu_device pointer
 *
 * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
 * this function before a GPU reset.  If the value is retained after a
 * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2319 */ 2320 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2321 { 2322 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2323 } 2324 2325 /** 2326 * amdgpu_device_check_vram_lost - check if vram is valid 2327 * 2328 * @adev: amdgpu_device pointer 2329 * 2330 * Checks the reset magic value written to the gart pointer in VRAM. 2331 * The driver calls this after a GPU reset to see if the contents of 2332 * VRAM is lost or now. 2333 * returns true if vram is lost, false if not. 2334 */ 2335 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2336 { 2337 if (memcmp(adev->gart.ptr, adev->reset_magic, 2338 AMDGPU_RESET_MAGIC_NUM)) 2339 return true; 2340 2341 if (!amdgpu_in_reset(adev)) 2342 return false; 2343 2344 /* 2345 * For all ASICs with baco/mode1 reset, the VRAM is 2346 * always assumed to be lost. 2347 */ 2348 switch (amdgpu_asic_reset_method(adev)) { 2349 case AMD_RESET_METHOD_BACO: 2350 case AMD_RESET_METHOD_MODE1: 2351 return true; 2352 default: 2353 return false; 2354 } 2355 } 2356 2357 /** 2358 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2359 * 2360 * @adev: amdgpu_device pointer 2361 * @state: clockgating state (gate or ungate) 2362 * 2363 * The list of all the hardware IPs that make up the asic is walked and the 2364 * set_clockgating_state callbacks are run. 2365 * Late initialization pass enabling clockgating for hardware IPs. 2366 * Fini or suspend, pass disabling clockgating for hardware IPs. 2367 * Returns 0 on success, negative error code on failure. 2368 */ 2369 2370 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2371 enum amd_clockgating_state state) 2372 { 2373 int i, j, r; 2374 2375 if (amdgpu_emu_mode == 1) 2376 return 0; 2377 2378 for (j = 0; j < adev->num_ip_blocks; j++) { 2379 i = state == AMD_CG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 2380 if (!adev->ip_blocks[i].status.late_initialized) 2381 continue; 2382 /* skip CG for GFX on S0ix */ 2383 if (adev->in_s0ix && 2384 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2385 continue; 2386 /* skip CG for VCE/UVD, it's handled specially */ 2387 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2388 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2389 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2390 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2391 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2392 /* enable clockgating to save power */ 2393 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2394 state); 2395 if (r) { 2396 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2397 adev->ip_blocks[i].version->funcs->name, r); 2398 return r; 2399 } 2400 } 2401 } 2402 2403 return 0; 2404 } 2405 2406 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2407 enum amd_powergating_state state) 2408 { 2409 int i, j, r; 2410 2411 if (amdgpu_emu_mode == 1) 2412 return 0; 2413 2414 for (j = 0; j < adev->num_ip_blocks; j++) { 2415 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 2416 if (!adev->ip_blocks[i].status.late_initialized) 2417 continue; 2418 /* skip PG for GFX on S0ix */ 2419 if (adev->in_s0ix && 2420 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2421 continue; 2422 /* skip CG for VCE/UVD, it's handled specially */ 2423 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2424 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2425 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2426 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2427 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2428 /* enable powergating to save power */ 2429 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2430 state); 2431 if (r) { 2432 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2433 adev->ip_blocks[i].version->funcs->name, r); 2434 return r; 2435 } 2436 } 2437 } 2438 return 0; 2439 } 2440 2441 static int amdgpu_device_enable_mgpu_fan_boost(void) 2442 { 2443 struct amdgpu_gpu_instance *gpu_ins; 2444 struct amdgpu_device *adev; 2445 int i, ret = 0; 2446 2447 mutex_lock(&mgpu_info.mutex); 2448 2449 /* 2450 * MGPU fan boost feature should be enabled 2451 * only when there are two or more dGPUs in 2452 * the system 2453 */ 2454 if (mgpu_info.num_dgpu < 2) 2455 goto out; 2456 2457 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2458 gpu_ins = &(mgpu_info.gpu_ins[i]); 2459 adev = gpu_ins->adev; 2460 if (!(adev->flags & AMD_IS_APU) && 2461 !gpu_ins->mgpu_fan_enabled) { 2462 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2463 if (ret) 2464 break; 2465 2466 gpu_ins->mgpu_fan_enabled = 1; 2467 } 2468 } 2469 2470 out: 2471 mutex_unlock(&mgpu_info.mutex); 2472 2473 return ret; 2474 } 2475 2476 /** 2477 * amdgpu_device_ip_late_init - run late init for hardware IPs 2478 * 2479 * @adev: amdgpu_device pointer 2480 * 2481 * Late initialization pass for hardware IPs. 
The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of them have been initialized or something that needs to happen
 * late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	/*
	 * Only blocks whose hardware is up are considered.  late_init is
	 * optional; a block without the callback is still marked
	 * late_initialized.  The first failing callback aborts the function.
	 */
	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	amdgpu_ras_set_error_query_ready(adev, true);

	/* Return values of the gating calls are not checked here. */
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	/* A fan boost failure is logged but does not fail late init. */
	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* For XGMI + passthrough configuration on arcturus, enable light SBR */
	if (adev->asic_type == CHIP_ARCTURUS &&
	    amdgpu_passthrough(adev) &&
	    adev->gmc.xgmi.num_physical_nodes > 1)
		smu_set_light_sbr(&adev->smu, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset device p-state to low as this was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, it's unknown how many device in the hive in advance.
		 * As this is counted one by one during devices initializations.
		 *
		 * So, we wait for all XGMI interlinked devices initialized.
		 * This may bring some delays as those devices may come from
		 * different hives. But that should be OK.
		 */
		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
			/* Every registered non-APU instance is lowered, not just this one. */
			for (i = 0; i < mgpu_info.num_gpu; i++) {
				gpu_instance = &(mgpu_info.gpu_ins[i]);
				if (gpu_instance->adev->flags & AMD_IS_APU)
					continue;

				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
						AMDGPU_XGMI_PSTATE_MIN);
				if (r) {
					/* logged only; the function still returns 0 below */
					DRM_ERROR("pstate setting failed (%d).\n", r);
					break;
				}
			}
		}

		mutex_unlock(&mgpu_info.mutex);
	}

	return 0;
}

/**
 * amdgpu_device_ip_fini - run fini for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main teardown pass for hardware IPs.  The list of all the hardware
 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
 * are run.  hw_fini tears down the hardware associated with each IP
 * and sw_fini tears down any software state associated with each IP.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
{
	int i, r;

	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
		amdgpu_virt_release_ras_err_handler_data(adev);

	amdgpu_ras_pre_fini(adev);

	if (adev->gmc.xgmi.num_physical_nodes > 1)
		amdgpu_xgmi_remove_device(adev);

	/* Ungate power first, then clocks, before any hardware is torn down. */
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);

	amdgpu_amdkfd_device_fini(adev);

	/* need to disable SMC first */
	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
			r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
			/* XXX handle errors */
			if (r) {
				DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
			}
			/* Clearing status.hw makes the reverse pass below skip this block. */
			adev->ip_blocks[i].status.hw = false;
			break;
		}
	}

	/* hw_fini for all remaining blocks, in reverse list order. */
	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.hw)
			continue;

		r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
		/* XXX handle errors */
		if (r) {
			DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
		}

		adev->ip_blocks[i].status.hw = false;
	}


	/* sw_fini in reverse list order. */
	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.sw)
			continue;

		/* Device-wide helpers are released just before GMC's own sw_fini. */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
			amdgpu_ucode_free_bo(adev);
			amdgpu_free_static_csa(&adev->virt.csa_obj);
			amdgpu_device_wb_fini(adev);
			amdgpu_device_vram_scratch_fini(adev);
			amdgpu_ib_pool_fini(adev);
		}

		r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
		/* XXX handle errors */
		if (r) {
			DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
		}
		adev->ip_blocks[i].status.sw = false;
		adev->ip_blocks[i].status.valid = false;
	}

	/* late_fini (optional callback) in reverse list order. */
	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_fini)
			adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
		adev->ip_blocks[i].status.late_initialized = false;
	}

	amdgpu_ras_fini(adev);

	if (amdgpu_sriov_vf(adev))
		if (amdgpu_virt_release_full_gpu(adev, false))
			DRM_ERROR("failed to release exclusive mode on fini\n");

	/* Callback failures above are only logged; this function always returns 0. */
	return 0;
}

/**
 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
 *
 * @work: work_struct.
 *
 * Runs the IB ring tests for the device that embeds @work as
 * delayed_init_work and logs an error if they fail.
 */
static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
{
	struct amdgpu_device *adev =
		container_of(work, struct amdgpu_device, delayed_init_work.work);
	int r;

	r = amdgpu_ib_ring_tests(adev);
	if (r)
		DRM_ERROR("ib ring test failed (%d).\n", r);
}

/*
 * Delayed work handler for gfx.gfx_off_delay_work.  Under gfx_off_mutex,
 * if GFX off is not already active and nobody holds a request count, ask
 * the SMU to powergate GFX and record gfx_off_state on success.
 */
static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
{
	struct amdgpu_device *adev =
		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);

	mutex_lock(&adev->gfx.gfx_off_mutex);
	if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
		/* a zero return from the SMU call is treated as success */
		if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
			adev->gfx.gfx_off_state = true;
	}
	mutex_unlock(&adev->gfx.gfx_off_mutex);
}

/**
 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
 *
 * @adev: amdgpu_device pointer
 *
 * Main suspend function for hardware IPs.  The list of all the hardware
 * IPs that make up the asic is walked, clockgating is disabled and the
 * suspend callbacks are run.
suspend puts the hardware and software state
 * in each IP into a state suitable for suspend.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
{
	struct amdgpu_ip_block *block;
	int idx, ret;

	/* Drop powergating, then clockgating, before suspending anything. */
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);

	/* Walk the IP list from the last block down to the first. */
	idx = adev->num_ip_blocks;
	while (idx-- > 0) {
		block = &adev->ip_blocks[idx];

		/*
		 * Phase 1 only deals with valid display (DCE) blocks; all
		 * other blocks are left to phase 2.
		 */
		if (!block->status.valid ||
		    block->version->type != AMD_IP_BLOCK_TYPE_DCE)
			continue;

		/* XXX handle errors */
		ret = block->version->funcs->suspend(adev);
		if (ret) {
			DRM_ERROR("suspend of IP block <%s> failed %d\n",
				  block->version->funcs->name, ret);
			return ret;
		}

		block->status.hw = false;
	}

	return 0;
}

/**
 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
 *
 * @adev: amdgpu_device pointer
 *
 * Main suspend function for hardware IPs.  The list of all the hardware
 * IPs that make up the asic is walked, clockgating is disabled and the
 * suspend callbacks are run.  suspend puts the hardware and software state
 * in each IP into a state suitable for suspend.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
{
	int i, r;

	if (adev->in_s0ix)
		amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry);

	/* Walk the IP list in reverse order. */
	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		/* displays are handled in phase1 */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
			continue;
		/* PSP lost connection when err_event_athub occurs */
		if (amdgpu_ras_intr_triggered() &&
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
			/* marked as down without calling its suspend callback */
			adev->ip_blocks[i].status.hw = false;
			continue;
		}

		/* skip unnecessary suspend if we do not initialize them yet */
		if (adev->gmc.xgmi.pending_reset &&
		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
			adev->ip_blocks[i].status.hw = false;
			continue;
		}

		/* skip suspend of gfx and psp for S0ix
		 * gfx is in gfxoff state, so on resume it will exit gfxoff just
		 * like at runtime. PSP is also part of the always on hardware
		 * so no need to suspend it.
		 */
		if (adev->in_s0ix &&
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX))
			continue;

		/* XXX handle errors */
		r = adev->ip_blocks[i].version->funcs->suspend(adev);
		/* XXX handle errors */
		if (r) {
			/* unlike phase 1, a failing suspend is logged and the walk continues */
			DRM_ERROR("suspend of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
		}
		adev->ip_blocks[i].status.hw = false;
		/* handle putting the SMC in the appropriate state */
		if(!amdgpu_sriov_vf(adev)){
			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
				if (r) {
					/* the only error this function returns */
					DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
							adev->mp1_state, r);
					return r;
				}
			}
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_suspend - run suspend for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main suspend function for hardware IPs.  The list of all the hardware
 * IPs that make up the asic is walked, clockgating is disabled and the
 * suspend callbacks are run.  suspend puts the hardware and software state
 * in each IP into a state suitable for suspend.
 * Returns 0 on success, negative error code on failure.
2814 */ 2815 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2816 { 2817 int r; 2818 2819 if (amdgpu_sriov_vf(adev)) { 2820 amdgpu_virt_fini_data_exchange(adev); 2821 amdgpu_virt_request_full_gpu(adev, false); 2822 } 2823 2824 r = amdgpu_device_ip_suspend_phase1(adev); 2825 if (r) 2826 return r; 2827 r = amdgpu_device_ip_suspend_phase2(adev); 2828 2829 if (amdgpu_sriov_vf(adev)) 2830 amdgpu_virt_release_full_gpu(adev, false); 2831 2832 return r; 2833 } 2834 2835 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2836 { 2837 int i, r; 2838 2839 static enum amd_ip_block_type ip_order[] = { 2840 AMD_IP_BLOCK_TYPE_GMC, 2841 AMD_IP_BLOCK_TYPE_COMMON, 2842 AMD_IP_BLOCK_TYPE_PSP, 2843 AMD_IP_BLOCK_TYPE_IH, 2844 }; 2845 2846 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2847 int j; 2848 struct amdgpu_ip_block *block; 2849 2850 block = &adev->ip_blocks[i]; 2851 block->status.hw = false; 2852 2853 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 2854 2855 if (block->version->type != ip_order[j] || 2856 !block->status.valid) 2857 continue; 2858 2859 r = block->version->funcs->hw_init(adev); 2860 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2861 if (r) 2862 return r; 2863 block->status.hw = true; 2864 } 2865 } 2866 2867 return 0; 2868 } 2869 2870 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2871 { 2872 int i, r; 2873 2874 static enum amd_ip_block_type ip_order[] = { 2875 AMD_IP_BLOCK_TYPE_SMC, 2876 AMD_IP_BLOCK_TYPE_DCE, 2877 AMD_IP_BLOCK_TYPE_GFX, 2878 AMD_IP_BLOCK_TYPE_SDMA, 2879 AMD_IP_BLOCK_TYPE_UVD, 2880 AMD_IP_BLOCK_TYPE_VCE, 2881 AMD_IP_BLOCK_TYPE_VCN 2882 }; 2883 2884 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2885 int j; 2886 struct amdgpu_ip_block *block; 2887 2888 for (j = 0; j < adev->num_ip_blocks; j++) { 2889 block = &adev->ip_blocks[j]; 2890 2891 if (block->version->type != ip_order[i] || 2892 !block->status.valid || 2893 block->status.hw) 2894 continue; 2895 
2896 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 2897 r = block->version->funcs->resume(adev); 2898 else 2899 r = block->version->funcs->hw_init(adev); 2900 2901 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2902 if (r) 2903 return r; 2904 block->status.hw = true; 2905 } 2906 } 2907 2908 return 0; 2909 } 2910 2911 /** 2912 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 2913 * 2914 * @adev: amdgpu_device pointer 2915 * 2916 * First resume function for hardware IPs. The list of all the hardware 2917 * IPs that make up the asic is walked and the resume callbacks are run for 2918 * COMMON, GMC, and IH. resume puts the hardware into a functional state 2919 * after a suspend and updates the software state as necessary. This 2920 * function is also used for restoring the GPU after a GPU reset. 2921 * Returns 0 on success, negative error code on failure. 2922 */ 2923 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 2924 { 2925 int i, r; 2926 2927 for (i = 0; i < adev->num_ip_blocks; i++) { 2928 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2929 continue; 2930 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2931 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2932 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2933 2934 r = adev->ip_blocks[i].version->funcs->resume(adev); 2935 if (r) { 2936 DRM_ERROR("resume of IP block <%s> failed %d\n", 2937 adev->ip_blocks[i].version->funcs->name, r); 2938 return r; 2939 } 2940 adev->ip_blocks[i].status.hw = true; 2941 } 2942 } 2943 2944 return 0; 2945 } 2946 2947 /** 2948 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 2949 * 2950 * @adev: amdgpu_device pointer 2951 * 2952 * First resume function for hardware IPs. The list of all the hardware 2953 * IPs that make up the asic is walked and the resume callbacks are run for 2954 * all blocks except COMMON, GMC, and IH. 
resume puts the hardware into a
 * functional state after a suspend and updates the software state as
 * necessary.  This function is also used for restoring the GPU after a GPU
 * reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
{
	struct amdgpu_ip_block *block;
	int idx, ret;

	for (idx = 0; idx < adev->num_ip_blocks; idx++) {
		block = &adev->ip_blocks[idx];

		/* nothing to do for invalid blocks or hardware that is already up */
		if (!block->status.valid || block->status.hw)
			continue;

		/* these block types are not resumed by this phase */
		switch (block->version->type) {
		case AMD_IP_BLOCK_TYPE_COMMON:
		case AMD_IP_BLOCK_TYPE_GMC:
		case AMD_IP_BLOCK_TYPE_IH:
		case AMD_IP_BLOCK_TYPE_PSP:
			continue;
		default:
			break;
		}

		ret = block->version->funcs->resume(adev);
		if (ret) {
			DRM_ERROR("resume of IP block <%s> failed %d\n",
				  block->version->funcs->name, ret);
			return ret;
		}

		block->status.hw = true;
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs.  The hardware IPs
 * are split into two resume functions because they
 * are also used in recovering from a GPU reset and some additional
 * steps need to be taken between them.  In this case (S3/S4) they are
 * run sequentially.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_device_ip_resume_phase1(adev);
	if (r)
		return r;

	/* firmware is loaded between the two resume phases */
	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	r = amdgpu_device_ip_resume_phase2(adev);

	return r;
}

/**
 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
 *
 * @adev: amdgpu_device pointer
 *
 * Query the VBIOS data tables to determine if the board supports SR-IOV.
 * Only acts when running as an SR-IOV VF: sets AMDGPU_SRIOV_CAPS_SRIOV_VBIOS
 * in adev->virt.caps when the VBIOS reports support, otherwise records an
 * AMDGIM_ERROR_VF_NO_VBIOS VF error.
 */
static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev)) {
		/* atomfirmware and atombios VBIOSes are queried through different helpers */
		if (adev->is_atom_fw) {
			if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		} else {
			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		}

		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
	}
}

/**
 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
 *
 * @asic_type: AMD asic type
 *
 * Check if there is DC (new modesetting infrastructure) support for an asic.
 * The result also depends on the amdgpu_dc setting: the first group of
 * ASICs below needs amdgpu_dc > 0, the second only needs amdgpu_dc != 0.
 * returns true if DC has support, false if not.
 */
bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
{
	switch (asic_type) {
	/* without CONFIG_DRM_AMD_DC only the default case is compiled in */
#if defined(CONFIG_DRM_AMD_DC)
#if defined(CONFIG_DRM_AMD_DC_SI)
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
	case CHIP_BONAIRE:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		/*
		 * We have systems in the wild with these ASICs that require
		 * LVDS and VGA support which is not supported with DC.
		 *
		 * Fallback to the non-DC driver here by default so as not to
		 * cause regressions.
		 */
		return amdgpu_dc > 0;
	case CHIP_HAWAII:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_VEGA10:
	case CHIP_VEGA12:
	case CHIP_VEGA20:
#if defined(CONFIG_DRM_AMD_DC_DCN)
	case CHIP_RAVEN:
	case CHIP_NAVI10:
	case CHIP_NAVI14:
	case CHIP_NAVI12:
	case CHIP_RENOIR:
	case CHIP_SIENNA_CICHLID:
	case CHIP_NAVY_FLOUNDER:
	case CHIP_DIMGREY_CAVEFISH:
	case CHIP_VANGOGH:
#endif
		return amdgpu_dc != 0;
#endif
	default:
		if (amdgpu_dc > 0)
			DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
					 "but isn't supported by ASIC, ignoring\n");
		return false;
	}
}

/**
 * amdgpu_device_has_dc_support - check if dc is supported
 *
 * @adev: amdgpu_device pointer
 *
 * Always false for SR-IOV VFs and when virtual display is enabled;
 * otherwise decided by amdgpu_device_asic_has_dc_support().
 * Returns true for supported, false for not supported
 */
bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
		return false;

	return amdgpu_device_asic_has_dc_support(adev->asic_type);
}


/*
 * Work handler for adev->xgmi_reset_work: resets this device in step with
 * the other devices of its XGMI hive and stores the result in
 * adev->asic_reset_res.
 */
static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
{
	struct amdgpu_device *adev =
		container_of(__work, struct amdgpu_device, xgmi_reset_work);
	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

	/* It's a bug to not have a hive within this function */
	if (WARN_ON(!hive))
		return;

	/*
	 * Use task barrier to synchronize all xgmi reset works across the
	 * hive. task_barrier_enter and task_barrier_exit will block
	 * until all the threads running the xgmi reset works reach
	 * those points. task_barrier_full will do both blocks.
	 */
	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {

		task_barrier_enter(&hive->tb);
		adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));

		if (adev->asic_reset_res)
			goto fail;

		task_barrier_exit(&hive->tb);
		adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));

		if (adev->asic_reset_res)
			goto fail;

		/* optional hook, called only after a successful BACO exit */
		if (adev->mmhub.ras_funcs &&
		    adev->mmhub.ras_funcs->reset_ras_error_count)
			adev->mmhub.ras_funcs->reset_ras_error_count(adev);
	} else {

		task_barrier_full(&hive->tb);
		adev->asic_reset_res = amdgpu_asic_reset(adev);
	}

	/* also reached on success; the warning is printed only for a non-zero result */
fail:
	if (adev->asic_reset_res)
		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
			 adev->asic_reset_res, adev_to_drm(adev)->unique);
	/* drop the hive reference taken by amdgpu_get_xgmi_hive() above */
	amdgpu_put_xgmi_hive(hive);
}

/*
 * Set the per-engine job timeouts (gfx, compute, sdma, video) on @adev.
 * Defaults are applied first and then overridden from the comma-separated
 * amdgpu_lockup_timeout string, whose entries map in order to gfx,
 * compute, sdma and video.
 * Returns 0 on success or the kstrtol() error for an unparsable entry.
 */
static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
{
	char *input = amdgpu_lockup_timeout;
	char *timeout_setting = NULL;
	int index = 0;
	long timeout;
	int ret = 0;

	/*
	 * By default timeout for non compute jobs is 10000.
	 * And there is no timeout enforced on compute jobs.
	 * In SR-IOV or passthrough mode, timeout for compute
	 * jobs are 60000 by default.
	 */
	adev->gfx_timeout = msecs_to_jiffies(10000);
	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
	if (amdgpu_sriov_vf(adev))
		adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
					msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
	else if (amdgpu_passthrough(adev))
		adev->compute_timeout =  msecs_to_jiffies(60000);
	else
		adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;

	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
		/*
		 * NOTE(review): strsep() writes terminators into the buffer
		 * behind amdgpu_lockup_timeout, so a later call would
		 * presumably see only the first entry - confirm.
		 */
		while ((timeout_setting = strsep(&input, ",")) &&
				strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
			ret = kstrtol(timeout_setting, 0, &timeout);
			if (ret)
				return ret;

			/* 0 keeps the default for this slot, negative disables the timeout */
			if (timeout == 0) {
				index++;
				continue;
			} else if (timeout < 0) {
				timeout = MAX_SCHEDULE_TIMEOUT;
			} else {
				timeout = msecs_to_jiffies(timeout);
			}

			/* entries beyond the fourth are parsed but ignored */
			switch (index++) {
			case 0:
				adev->gfx_timeout = timeout;
				break;
			case 1:
				adev->compute_timeout = timeout;
				break;
			case 2:
				adev->sdma_timeout = timeout;
				break;
			case 3:
				adev->video_timeout = timeout;
				break;
			default:
				break;
			}
		}
		/*
		 * There is only one value specified and
		 * it should apply to all non-compute jobs.
		 */
		if (index == 1) {
			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
			/* under SR-IOV or passthrough the single value covers compute too */
			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
				adev->compute_timeout = adev->gfx_timeout;
		}
	}

	return ret;
}

/* NULL-terminated list of device attributes defined elsewhere in the driver. */
static const struct attribute *amdgpu_dev_attributes[] = {
	&dev_attr_product_name.attr,
	&dev_attr_product_number.attr,
	&dev_attr_serial_number.attr,
	&dev_attr_pcie_replay_count.attr,
	NULL
};


/**
 * amdgpu_device_init - initialize the driver
 *
 * @adev: amdgpu_device pointer
 * @flags: driver flags
 *
 * Initializes the driver info and hw (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver startup.
3250 */ 3251 int amdgpu_device_init(struct amdgpu_device *adev, 3252 uint32_t flags) 3253 { 3254 struct drm_device *ddev = adev_to_drm(adev); 3255 struct pci_dev *pdev = adev->pdev; 3256 int r, i; 3257 bool px = false; 3258 u32 max_MBps; 3259 3260 adev->shutdown = false; 3261 adev->flags = flags; 3262 3263 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3264 adev->asic_type = amdgpu_force_asic_type; 3265 else 3266 adev->asic_type = flags & AMD_ASIC_MASK; 3267 3268 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3269 if (amdgpu_emu_mode == 1) 3270 adev->usec_timeout *= 10; 3271 adev->gmc.gart_size = 512 * 1024 * 1024; 3272 adev->accel_working = false; 3273 adev->num_rings = 0; 3274 adev->mman.buffer_funcs = NULL; 3275 adev->mman.buffer_funcs_ring = NULL; 3276 adev->vm_manager.vm_pte_funcs = NULL; 3277 adev->vm_manager.vm_pte_num_scheds = 0; 3278 adev->gmc.gmc_funcs = NULL; 3279 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3280 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3281 3282 adev->smc_rreg = &amdgpu_invalid_rreg; 3283 adev->smc_wreg = &amdgpu_invalid_wreg; 3284 adev->pcie_rreg = &amdgpu_invalid_rreg; 3285 adev->pcie_wreg = &amdgpu_invalid_wreg; 3286 adev->pciep_rreg = &amdgpu_invalid_rreg; 3287 adev->pciep_wreg = &amdgpu_invalid_wreg; 3288 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3289 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3290 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3291 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3292 adev->didt_rreg = &amdgpu_invalid_rreg; 3293 adev->didt_wreg = &amdgpu_invalid_wreg; 3294 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3295 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3296 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3297 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3298 3299 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3300 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3301 
pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3302 3303 /* mutex initialization are all done here so we 3304 * can recall function without having locking issues */ 3305 mutex_init(&adev->firmware.mutex); 3306 mutex_init(&adev->pm.mutex); 3307 mutex_init(&adev->gfx.gpu_clock_mutex); 3308 mutex_init(&adev->srbm_mutex); 3309 mutex_init(&adev->gfx.pipe_reserve_mutex); 3310 mutex_init(&adev->gfx.gfx_off_mutex); 3311 mutex_init(&adev->grbm_idx_mutex); 3312 mutex_init(&adev->mn_lock); 3313 mutex_init(&adev->virt.vf_errors.lock); 3314 hash_init(adev->mn_hash); 3315 atomic_set(&adev->in_gpu_reset, 0); 3316 init_rwsem(&adev->reset_sem); 3317 mutex_init(&adev->psp.mutex); 3318 mutex_init(&adev->notifier_lock); 3319 3320 r = amdgpu_device_check_arguments(adev); 3321 if (r) 3322 return r; 3323 3324 spin_lock_init(&adev->mmio_idx_lock); 3325 spin_lock_init(&adev->smc_idx_lock); 3326 spin_lock_init(&adev->pcie_idx_lock); 3327 spin_lock_init(&adev->uvd_ctx_idx_lock); 3328 spin_lock_init(&adev->didt_idx_lock); 3329 spin_lock_init(&adev->gc_cac_idx_lock); 3330 spin_lock_init(&adev->se_cac_idx_lock); 3331 spin_lock_init(&adev->audio_endpt_idx_lock); 3332 spin_lock_init(&adev->mm_stats.lock); 3333 3334 INIT_LIST_HEAD(&adev->shadow_list); 3335 mutex_init(&adev->shadow_list_lock); 3336 3337 INIT_LIST_HEAD(&adev->reset_list); 3338 3339 INIT_DELAYED_WORK(&adev->delayed_init_work, 3340 amdgpu_device_delayed_init_work_handler); 3341 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3342 amdgpu_device_delay_enable_gfx_off); 3343 3344 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3345 3346 adev->gfx.gfx_off_req_count = 1; 3347 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3348 3349 atomic_set(&adev->throttling_logging_enabled, 1); 3350 /* 3351 * If throttling continues, logging will be performed every minute 3352 * to avoid log flooding. "-1" is subtracted since the thermal 3353 * throttling interrupt comes every second. 
Thus, the total logging 3354 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3355 * for throttling interrupt) = 60 seconds. 3356 */ 3357 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3358 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3359 3360 /* Registers mapping */ 3361 /* TODO: block userspace mapping of io register */ 3362 if (adev->asic_type >= CHIP_BONAIRE) { 3363 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3364 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3365 } else { 3366 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3367 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3368 } 3369 3370 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3371 if (adev->rmmio == NULL) { 3372 return -ENOMEM; 3373 } 3374 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3375 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3376 3377 /* enable PCIE atomic ops */ 3378 r = pci_enable_atomic_ops_to_root(adev->pdev, 3379 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3380 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3381 if (r) { 3382 adev->have_atomics_support = false; 3383 DRM_INFO("PCIE atomic ops is not supported\n"); 3384 } else { 3385 adev->have_atomics_support = true; 3386 } 3387 3388 amdgpu_device_get_pcie_info(adev); 3389 3390 if (amdgpu_mcbp) 3391 DRM_INFO("MCBP is enabled\n"); 3392 3393 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3394 adev->enable_mes = true; 3395 3396 /* detect hw virtualization here */ 3397 amdgpu_detect_virtualization(adev); 3398 3399 r = amdgpu_device_get_job_timeout_settings(adev); 3400 if (r) { 3401 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3402 goto failed_unmap; 3403 } 3404 3405 /* early init functions */ 3406 r = amdgpu_device_ip_early_init(adev); 3407 if (r) 3408 goto failed_unmap; 3409 3410 /* doorbell bar mapping and doorbell index init*/ 3411 amdgpu_device_doorbell_init(adev); 3412 3413 /* if we 
have > 1 VGA cards, then disable the amdgpu VGA resources */ 3414 /* this will fail for cards that aren't VGA class devices, just 3415 * ignore it */ 3416 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3417 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); 3418 3419 if (amdgpu_device_supports_px(ddev)) { 3420 px = true; 3421 vga_switcheroo_register_client(adev->pdev, 3422 &amdgpu_switcheroo_ops, px); 3423 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3424 } 3425 3426 if (amdgpu_emu_mode == 1) { 3427 /* post the asic on emulation mode */ 3428 emu_soc_asic_init(adev); 3429 goto fence_driver_init; 3430 } 3431 3432 amdgpu_reset_init(adev); 3433 3434 /* detect if we are with an SRIOV vbios */ 3435 amdgpu_device_detect_sriov_bios(adev); 3436 3437 /* check if we need to reset the asic 3438 * E.g., driver was not cleanly unloaded previously, etc. 3439 */ 3440 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3441 if (adev->gmc.xgmi.num_physical_nodes) { 3442 dev_info(adev->dev, "Pending hive reset.\n"); 3443 adev->gmc.xgmi.pending_reset = true; 3444 /* Only need to init necessary block for SMU to handle the reset */ 3445 for (i = 0; i < adev->num_ip_blocks; i++) { 3446 if (!adev->ip_blocks[i].status.valid) 3447 continue; 3448 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3449 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3450 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3451 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3452 DRM_DEBUG("IP %s disabled for hw_init.\n", 3453 adev->ip_blocks[i].version->funcs->name); 3454 adev->ip_blocks[i].status.hw = true; 3455 } 3456 } 3457 } else { 3458 r = amdgpu_asic_reset(adev); 3459 if (r) { 3460 dev_err(adev->dev, "asic reset on init failed\n"); 3461 goto failed; 3462 } 3463 } 3464 } 3465 3466 pci_enable_pcie_error_reporting(adev->pdev); 3467 3468 /* Post card if necessary */ 3469 if 
(amdgpu_device_need_post(adev)) { 3470 if (!adev->bios) { 3471 dev_err(adev->dev, "no vBIOS found\n"); 3472 r = -EINVAL; 3473 goto failed; 3474 } 3475 DRM_INFO("GPU posting now...\n"); 3476 r = amdgpu_device_asic_init(adev); 3477 if (r) { 3478 dev_err(adev->dev, "gpu post error!\n"); 3479 goto failed; 3480 } 3481 } 3482 3483 if (adev->is_atom_fw) { 3484 /* Initialize clocks */ 3485 r = amdgpu_atomfirmware_get_clock_info(adev); 3486 if (r) { 3487 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3488 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3489 goto failed; 3490 } 3491 } else { 3492 /* Initialize clocks */ 3493 r = amdgpu_atombios_get_clock_info(adev); 3494 if (r) { 3495 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3496 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3497 goto failed; 3498 } 3499 /* init i2c buses */ 3500 if (!amdgpu_device_has_dc_support(adev)) 3501 amdgpu_atombios_i2c_init(adev); 3502 } 3503 3504 fence_driver_init: 3505 /* Fence driver */ 3506 r = amdgpu_fence_driver_init(adev); 3507 if (r) { 3508 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3509 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3510 goto failed; 3511 } 3512 3513 /* init the mode config */ 3514 drm_mode_config_init(adev_to_drm(adev)); 3515 3516 r = amdgpu_device_ip_init(adev); 3517 if (r) { 3518 /* failed in exclusive mode due to timeout */ 3519 if (amdgpu_sriov_vf(adev) && 3520 !amdgpu_sriov_runtime(adev) && 3521 amdgpu_virt_mmio_blocked(adev) && 3522 !amdgpu_virt_wait_reset(adev)) { 3523 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3524 /* Don't send request since VF is inactive. 
*/ 3525 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3526 adev->virt.ops = NULL; 3527 r = -EAGAIN; 3528 goto release_ras_con; 3529 } 3530 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3531 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3532 goto release_ras_con; 3533 } 3534 3535 dev_info(adev->dev, 3536 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3537 adev->gfx.config.max_shader_engines, 3538 adev->gfx.config.max_sh_per_se, 3539 adev->gfx.config.max_cu_per_sh, 3540 adev->gfx.cu_info.number); 3541 3542 adev->accel_working = true; 3543 3544 amdgpu_vm_check_compute_bug(adev); 3545 3546 /* Initialize the buffer migration limit. */ 3547 if (amdgpu_moverate >= 0) 3548 max_MBps = amdgpu_moverate; 3549 else 3550 max_MBps = 8; /* Allow 8 MB/s. */ 3551 /* Get a log2 for easy divisions. */ 3552 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3553 3554 amdgpu_fbdev_init(adev); 3555 3556 r = amdgpu_pm_sysfs_init(adev); 3557 if (r) { 3558 adev->pm_sysfs_en = false; 3559 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3560 } else 3561 adev->pm_sysfs_en = true; 3562 3563 r = amdgpu_ucode_sysfs_init(adev); 3564 if (r) { 3565 adev->ucode_sysfs_en = false; 3566 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3567 } else 3568 adev->ucode_sysfs_en = true; 3569 3570 if ((amdgpu_testing & 1)) { 3571 if (adev->accel_working) 3572 amdgpu_test_moves(adev); 3573 else 3574 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3575 } 3576 if (amdgpu_benchmarking) { 3577 if (adev->accel_working) 3578 amdgpu_benchmark(adev, amdgpu_benchmarking); 3579 else 3580 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3581 } 3582 3583 /* 3584 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3585 * Otherwise the mgpu fan boost feature will be skipped due to the 3586 * gpu instance is counted less. 3587 */ 3588 amdgpu_register_gpu_instance(adev); 3589 3590 /* enable clockgating, etc. 
after ib tests, etc. since some blocks require 3591 * explicit gating rather than handling it automatically. 3592 */ 3593 if (!adev->gmc.xgmi.pending_reset) { 3594 r = amdgpu_device_ip_late_init(adev); 3595 if (r) { 3596 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3597 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3598 goto release_ras_con; 3599 } 3600 /* must succeed. */ 3601 amdgpu_ras_resume(adev); 3602 queue_delayed_work(system_wq, &adev->delayed_init_work, 3603 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3604 } 3605 3606 if (amdgpu_sriov_vf(adev)) 3607 flush_delayed_work(&adev->delayed_init_work); 3608 3609 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3610 if (r) 3611 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3612 3613 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3614 r = amdgpu_pmu_init(adev); 3615 if (r) 3616 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3617 3618 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3619 if (amdgpu_device_cache_pci_state(adev->pdev)) 3620 pci_restore_state(pdev); 3621 3622 if (adev->gmc.xgmi.pending_reset) 3623 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3624 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3625 3626 return 0; 3627 3628 release_ras_con: 3629 amdgpu_release_ras_context(adev); 3630 3631 failed: 3632 amdgpu_vf_error_trans_all(adev); 3633 if (px) 3634 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3635 3636 failed_unmap: 3637 iounmap(adev->rmmio); 3638 adev->rmmio = NULL; 3639 3640 return r; 3641 } 3642 3643 /** 3644 * amdgpu_device_fini - tear down the driver 3645 * 3646 * @adev: amdgpu_device pointer 3647 * 3648 * Tear down the driver info (all asics). 3649 * Called at driver shutdown. 
3650 */ 3651 void amdgpu_device_fini(struct amdgpu_device *adev) 3652 { 3653 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3654 flush_delayed_work(&adev->delayed_init_work); 3655 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 3656 adev->shutdown = true; 3657 3658 kfree(adev->pci_state); 3659 3660 /* make sure IB test finished before entering exclusive mode 3661 * to avoid preemption on IB test 3662 * */ 3663 if (amdgpu_sriov_vf(adev)) { 3664 amdgpu_virt_request_full_gpu(adev, false); 3665 amdgpu_virt_fini_data_exchange(adev); 3666 } 3667 3668 /* disable all interrupts */ 3669 amdgpu_irq_disable_all(adev); 3670 if (adev->mode_info.mode_config_initialized){ 3671 if (!amdgpu_device_has_dc_support(adev)) 3672 drm_helper_force_disable_all(adev_to_drm(adev)); 3673 else 3674 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3675 } 3676 amdgpu_fence_driver_fini(adev); 3677 if (adev->pm_sysfs_en) 3678 amdgpu_pm_sysfs_fini(adev); 3679 amdgpu_fbdev_fini(adev); 3680 amdgpu_device_ip_fini(adev); 3681 release_firmware(adev->firmware.gpu_info_fw); 3682 adev->firmware.gpu_info_fw = NULL; 3683 adev->accel_working = false; 3684 3685 amdgpu_reset_fini(adev); 3686 3687 /* free i2c buses */ 3688 if (!amdgpu_device_has_dc_support(adev)) 3689 amdgpu_i2c_fini(adev); 3690 3691 if (amdgpu_emu_mode != 1) 3692 amdgpu_atombios_fini(adev); 3693 3694 kfree(adev->bios); 3695 adev->bios = NULL; 3696 if (amdgpu_device_supports_px(adev_to_drm(adev))) { 3697 vga_switcheroo_unregister_client(adev->pdev); 3698 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3699 } 3700 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3701 vga_client_register(adev->pdev, NULL, NULL, NULL); 3702 iounmap(adev->rmmio); 3703 adev->rmmio = NULL; 3704 amdgpu_device_doorbell_fini(adev); 3705 3706 if (adev->ucode_sysfs_en) 3707 amdgpu_ucode_sysfs_fini(adev); 3708 3709 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3710 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3711 amdgpu_pmu_fini(adev); 3712 if 
(adev->mman.discovery_bin) 3713 amdgpu_discovery_fini(adev); 3714 } 3715 3716 3717 /* 3718 * Suspend & resume. 3719 */ 3720 /** 3721 * amdgpu_device_suspend - initiate device suspend 3722 * 3723 * @dev: drm dev pointer 3724 * @fbcon : notify the fbdev of suspend 3725 * 3726 * Puts the hw in the suspend state (all asics). 3727 * Returns 0 for success or an error on failure. 3728 * Called at driver suspend. 3729 */ 3730 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 3731 { 3732 struct amdgpu_device *adev = drm_to_adev(dev); 3733 int r; 3734 3735 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3736 return 0; 3737 3738 adev->in_suspend = true; 3739 drm_kms_helper_poll_disable(dev); 3740 3741 if (fbcon) 3742 amdgpu_fbdev_set_suspend(adev, 1); 3743 3744 cancel_delayed_work_sync(&adev->delayed_init_work); 3745 3746 amdgpu_ras_suspend(adev); 3747 3748 r = amdgpu_device_ip_suspend_phase1(adev); 3749 3750 if (!adev->in_s0ix) 3751 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 3752 3753 /* evict vram memory */ 3754 amdgpu_bo_evict_vram(adev); 3755 3756 amdgpu_fence_driver_suspend(adev); 3757 3758 r = amdgpu_device_ip_suspend_phase2(adev); 3759 /* evict remaining vram memory 3760 * This second call to evict vram is to evict the gart page table 3761 * using the CPU. 3762 */ 3763 amdgpu_bo_evict_vram(adev); 3764 3765 return 0; 3766 } 3767 3768 /** 3769 * amdgpu_device_resume - initiate device resume 3770 * 3771 * @dev: drm dev pointer 3772 * @fbcon : notify the fbdev of resume 3773 * 3774 * Bring the hw back to operating state (all asics). 3775 * Returns 0 for success or an error on failure. 3776 * Called at driver resume. 
3777 */ 3778 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3779 { 3780 struct amdgpu_device *adev = drm_to_adev(dev); 3781 int r = 0; 3782 3783 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3784 return 0; 3785 3786 if (adev->in_s0ix) 3787 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry); 3788 3789 /* post card */ 3790 if (amdgpu_device_need_post(adev)) { 3791 r = amdgpu_device_asic_init(adev); 3792 if (r) 3793 dev_err(adev->dev, "amdgpu asic init failed\n"); 3794 } 3795 3796 r = amdgpu_device_ip_resume(adev); 3797 if (r) { 3798 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 3799 return r; 3800 } 3801 amdgpu_fence_driver_resume(adev); 3802 3803 3804 r = amdgpu_device_ip_late_init(adev); 3805 if (r) 3806 return r; 3807 3808 queue_delayed_work(system_wq, &adev->delayed_init_work, 3809 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3810 3811 if (!adev->in_s0ix) { 3812 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 3813 if (r) 3814 return r; 3815 } 3816 3817 /* Make sure IB tests flushed */ 3818 flush_delayed_work(&adev->delayed_init_work); 3819 3820 if (fbcon) 3821 amdgpu_fbdev_set_suspend(adev, 0); 3822 3823 drm_kms_helper_poll_enable(dev); 3824 3825 amdgpu_ras_resume(adev); 3826 3827 /* 3828 * Most of the connector probing functions try to acquire runtime pm 3829 * refs to ensure that the GPU is powered on when connector polling is 3830 * performed. Since we're calling this from a runtime PM callback, 3831 * trying to acquire rpm refs will cause us to deadlock. 3832 * 3833 * Since we're guaranteed to be holding the rpm lock, it's safe to 3834 * temporarily disable the rpm helpers so this doesn't deadlock us. 
3835 */ 3836 #ifdef CONFIG_PM 3837 dev->dev->power.disable_depth++; 3838 #endif 3839 if (!amdgpu_device_has_dc_support(adev)) 3840 drm_helper_hpd_irq_event(dev); 3841 else 3842 drm_kms_helper_hotplug_event(dev); 3843 #ifdef CONFIG_PM 3844 dev->dev->power.disable_depth--; 3845 #endif 3846 adev->in_suspend = false; 3847 3848 return 0; 3849 } 3850 3851 /** 3852 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3853 * 3854 * @adev: amdgpu_device pointer 3855 * 3856 * The list of all the hardware IPs that make up the asic is walked and 3857 * the check_soft_reset callbacks are run. check_soft_reset determines 3858 * if the asic is still hung or not. 3859 * Returns true if any of the IPs are still in a hung state, false if not. 3860 */ 3861 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3862 { 3863 int i; 3864 bool asic_hang = false; 3865 3866 if (amdgpu_sriov_vf(adev)) 3867 return true; 3868 3869 if (amdgpu_asic_need_full_reset(adev)) 3870 return true; 3871 3872 for (i = 0; i < adev->num_ip_blocks; i++) { 3873 if (!adev->ip_blocks[i].status.valid) 3874 continue; 3875 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3876 adev->ip_blocks[i].status.hang = 3877 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3878 if (adev->ip_blocks[i].status.hang) { 3879 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3880 asic_hang = true; 3881 } 3882 } 3883 return asic_hang; 3884 } 3885 3886 /** 3887 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3888 * 3889 * @adev: amdgpu_device pointer 3890 * 3891 * The list of all the hardware IPs that make up the asic is walked and the 3892 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 3893 * handles any IP specific hardware or software state changes that are 3894 * necessary for a soft reset to succeed. 3895 * Returns 0 on success, negative error code on failure. 
3896 */ 3897 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 3898 { 3899 int i, r = 0; 3900 3901 for (i = 0; i < adev->num_ip_blocks; i++) { 3902 if (!adev->ip_blocks[i].status.valid) 3903 continue; 3904 if (adev->ip_blocks[i].status.hang && 3905 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 3906 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 3907 if (r) 3908 return r; 3909 } 3910 } 3911 3912 return 0; 3913 } 3914 3915 /** 3916 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 3917 * 3918 * @adev: amdgpu_device pointer 3919 * 3920 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 3921 * reset is necessary to recover. 3922 * Returns true if a full asic reset is required, false if not. 3923 */ 3924 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 3925 { 3926 int i; 3927 3928 if (amdgpu_asic_need_full_reset(adev)) 3929 return true; 3930 3931 for (i = 0; i < adev->num_ip_blocks; i++) { 3932 if (!adev->ip_blocks[i].status.valid) 3933 continue; 3934 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 3935 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 3936 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 3937 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 3938 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3939 if (adev->ip_blocks[i].status.hang) { 3940 dev_info(adev->dev, "Some block need full reset!\n"); 3941 return true; 3942 } 3943 } 3944 } 3945 return false; 3946 } 3947 3948 /** 3949 * amdgpu_device_ip_soft_reset - do a soft reset 3950 * 3951 * @adev: amdgpu_device pointer 3952 * 3953 * The list of all the hardware IPs that make up the asic is walked and the 3954 * soft_reset callbacks are run if the block is hung. soft_reset handles any 3955 * IP specific hardware or software state changes that are necessary to soft 3956 * reset the IP. 
3957 * Returns 0 on success, negative error code on failure. 3958 */ 3959 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 3960 { 3961 int i, r = 0; 3962 3963 for (i = 0; i < adev->num_ip_blocks; i++) { 3964 if (!adev->ip_blocks[i].status.valid) 3965 continue; 3966 if (adev->ip_blocks[i].status.hang && 3967 adev->ip_blocks[i].version->funcs->soft_reset) { 3968 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 3969 if (r) 3970 return r; 3971 } 3972 } 3973 3974 return 0; 3975 } 3976 3977 /** 3978 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 3979 * 3980 * @adev: amdgpu_device pointer 3981 * 3982 * The list of all the hardware IPs that make up the asic is walked and the 3983 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 3984 * handles any IP specific hardware or software state changes that are 3985 * necessary after the IP has been soft reset. 3986 * Returns 0 on success, negative error code on failure. 3987 */ 3988 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 3989 { 3990 int i, r = 0; 3991 3992 for (i = 0; i < adev->num_ip_blocks; i++) { 3993 if (!adev->ip_blocks[i].status.valid) 3994 continue; 3995 if (adev->ip_blocks[i].status.hang && 3996 adev->ip_blocks[i].version->funcs->post_soft_reset) 3997 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 3998 if (r) 3999 return r; 4000 } 4001 4002 return 0; 4003 } 4004 4005 /** 4006 * amdgpu_device_recover_vram - Recover some VRAM contents 4007 * 4008 * @adev: amdgpu_device pointer 4009 * 4010 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4011 * restore things like GPUVM page tables after a GPU reset where 4012 * the contents of VRAM might be lost. 4013 * 4014 * Returns: 4015 * 0 on success, negative error code on failure. 
4016 */ 4017 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4018 { 4019 struct dma_fence *fence = NULL, *next = NULL; 4020 struct amdgpu_bo *shadow; 4021 long r = 1, tmo; 4022 4023 if (amdgpu_sriov_runtime(adev)) 4024 tmo = msecs_to_jiffies(8000); 4025 else 4026 tmo = msecs_to_jiffies(100); 4027 4028 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4029 mutex_lock(&adev->shadow_list_lock); 4030 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { 4031 4032 /* No need to recover an evicted BO */ 4033 if (shadow->tbo.mem.mem_type != TTM_PL_TT || 4034 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET || 4035 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM) 4036 continue; 4037 4038 r = amdgpu_bo_restore_shadow(shadow, &next); 4039 if (r) 4040 break; 4041 4042 if (fence) { 4043 tmo = dma_fence_wait_timeout(fence, false, tmo); 4044 dma_fence_put(fence); 4045 fence = next; 4046 if (tmo == 0) { 4047 r = -ETIMEDOUT; 4048 break; 4049 } else if (tmo < 0) { 4050 r = tmo; 4051 break; 4052 } 4053 } else { 4054 fence = next; 4055 } 4056 } 4057 mutex_unlock(&adev->shadow_list_lock); 4058 4059 if (fence) 4060 tmo = dma_fence_wait_timeout(fence, false, tmo); 4061 dma_fence_put(fence); 4062 4063 if (r < 0 || tmo <= 0) { 4064 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4065 return -EIO; 4066 } 4067 4068 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4069 return 0; 4070 } 4071 4072 4073 /** 4074 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4075 * 4076 * @adev: amdgpu_device pointer 4077 * @from_hypervisor: request from hypervisor 4078 * 4079 * do VF FLR and reinitialize Asic 4080 * return 0 means succeeded otherwise failed 4081 */ 4082 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4083 bool from_hypervisor) 4084 { 4085 int r; 4086 4087 if (from_hypervisor) 4088 r = amdgpu_virt_request_full_gpu(adev, true); 4089 else 4090 r = amdgpu_virt_reset_gpu(adev); 4091 if 
(r) 4092 return r; 4093 4094 amdgpu_amdkfd_pre_reset(adev); 4095 4096 /* Resume IP prior to SMC */ 4097 r = amdgpu_device_ip_reinit_early_sriov(adev); 4098 if (r) 4099 goto error; 4100 4101 amdgpu_virt_init_data_exchange(adev); 4102 /* we need recover gart prior to run SMC/CP/SDMA resume */ 4103 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT)); 4104 4105 r = amdgpu_device_fw_loading(adev); 4106 if (r) 4107 return r; 4108 4109 /* now we are okay to resume SMC/CP/SDMA */ 4110 r = amdgpu_device_ip_reinit_late_sriov(adev); 4111 if (r) 4112 goto error; 4113 4114 amdgpu_irq_gpu_reset_resume_helper(adev); 4115 r = amdgpu_ib_ring_tests(adev); 4116 amdgpu_amdkfd_post_reset(adev); 4117 4118 error: 4119 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4120 amdgpu_inc_vram_lost(adev); 4121 r = amdgpu_device_recover_vram(adev); 4122 } 4123 amdgpu_virt_release_full_gpu(adev, true); 4124 4125 return r; 4126 } 4127 4128 /** 4129 * amdgpu_device_has_job_running - check if there is any job in mirror list 4130 * 4131 * @adev: amdgpu_device pointer 4132 * 4133 * check if there is any job in mirror list 4134 */ 4135 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4136 { 4137 int i; 4138 struct drm_sched_job *job; 4139 4140 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4141 struct amdgpu_ring *ring = adev->rings[i]; 4142 4143 if (!ring || !ring->sched.thread) 4144 continue; 4145 4146 spin_lock(&ring->sched.job_list_lock); 4147 job = list_first_entry_or_null(&ring->sched.pending_list, 4148 struct drm_sched_job, list); 4149 spin_unlock(&ring->sched.job_list_lock); 4150 if (job) 4151 return true; 4152 } 4153 return false; 4154 } 4155 4156 /** 4157 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4158 * 4159 * @adev: amdgpu_device pointer 4160 * 4161 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4162 * a hung GPU. 
4163 */ 4164 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4165 { 4166 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4167 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4168 return false; 4169 } 4170 4171 if (amdgpu_gpu_recovery == 0) 4172 goto disabled; 4173 4174 if (amdgpu_sriov_vf(adev)) 4175 return true; 4176 4177 if (amdgpu_gpu_recovery == -1) { 4178 switch (adev->asic_type) { 4179 case CHIP_BONAIRE: 4180 case CHIP_HAWAII: 4181 case CHIP_TOPAZ: 4182 case CHIP_TONGA: 4183 case CHIP_FIJI: 4184 case CHIP_POLARIS10: 4185 case CHIP_POLARIS11: 4186 case CHIP_POLARIS12: 4187 case CHIP_VEGAM: 4188 case CHIP_VEGA20: 4189 case CHIP_VEGA10: 4190 case CHIP_VEGA12: 4191 case CHIP_RAVEN: 4192 case CHIP_ARCTURUS: 4193 case CHIP_RENOIR: 4194 case CHIP_NAVI10: 4195 case CHIP_NAVI14: 4196 case CHIP_NAVI12: 4197 case CHIP_SIENNA_CICHLID: 4198 case CHIP_NAVY_FLOUNDER: 4199 case CHIP_DIMGREY_CAVEFISH: 4200 case CHIP_VANGOGH: 4201 case CHIP_ALDEBARAN: 4202 break; 4203 default: 4204 goto disabled; 4205 } 4206 } 4207 4208 return true; 4209 4210 disabled: 4211 dev_info(adev->dev, "GPU recovery disabled.\n"); 4212 return false; 4213 } 4214 4215 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4216 { 4217 u32 i; 4218 int ret = 0; 4219 4220 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4221 4222 dev_info(adev->dev, "GPU mode1 reset\n"); 4223 4224 /* disable BM */ 4225 pci_clear_master(adev->pdev); 4226 4227 amdgpu_device_cache_pci_state(adev->pdev); 4228 4229 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4230 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4231 ret = amdgpu_dpm_mode1_reset(adev); 4232 } else { 4233 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4234 ret = psp_gpu_reset(adev); 4235 } 4236 4237 if (ret) 4238 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4239 4240 amdgpu_device_load_pci_state(adev->pdev); 4241 4242 /* wait for asic to come out of reset */ 4243 for (i = 0; i < adev->usec_timeout; i++) { 4244 u32 
memsize = adev->nbio.funcs->get_memsize(adev); 4245 4246 if (memsize != 0xffffffff) 4247 break; 4248 udelay(1); 4249 } 4250 4251 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4252 return ret; 4253 } 4254 4255 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4256 struct amdgpu_reset_context *reset_context) 4257 { 4258 int i, r = 0; 4259 struct amdgpu_job *job = NULL; 4260 bool need_full_reset = 4261 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4262 4263 if (reset_context->reset_req_dev == adev) 4264 job = reset_context->job; 4265 4266 /* no need to dump if device is not in good state during probe period */ 4267 if (!adev->gmc.xgmi.pending_reset) 4268 amdgpu_debugfs_wait_dump(adev); 4269 4270 if (amdgpu_sriov_vf(adev)) { 4271 /* stop the data exchange thread */ 4272 amdgpu_virt_fini_data_exchange(adev); 4273 } 4274 4275 /* block all schedulers and reset given job's ring */ 4276 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4277 struct amdgpu_ring *ring = adev->rings[i]; 4278 4279 if (!ring || !ring->sched.thread) 4280 continue; 4281 4282 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4283 amdgpu_fence_driver_force_completion(ring); 4284 } 4285 4286 if(job) 4287 drm_sched_increase_karma(&job->base); 4288 4289 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4290 /* If reset handler not implemented, continue; otherwise return */ 4291 if (r == -ENOSYS) 4292 r = 0; 4293 else 4294 return r; 4295 4296 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4297 if (!amdgpu_sriov_vf(adev)) { 4298 4299 if (!need_full_reset) 4300 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4301 4302 if (!need_full_reset) { 4303 amdgpu_device_ip_pre_soft_reset(adev); 4304 r = amdgpu_device_ip_soft_reset(adev); 4305 amdgpu_device_ip_post_soft_reset(adev); 4306 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4307 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4308 
need_full_reset = true; 4309 } 4310 } 4311 4312 if (need_full_reset) 4313 r = amdgpu_device_ip_suspend(adev); 4314 if (need_full_reset) 4315 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4316 else 4317 clear_bit(AMDGPU_NEED_FULL_RESET, 4318 &reset_context->flags); 4319 } 4320 4321 return r; 4322 } 4323 4324 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4325 struct amdgpu_reset_context *reset_context) 4326 { 4327 struct amdgpu_device *tmp_adev = NULL; 4328 bool need_full_reset, skip_hw_reset, vram_lost = false; 4329 int r = 0; 4330 4331 /* Try reset handler method first */ 4332 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4333 reset_list); 4334 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4335 /* If reset handler not implemented, continue; otherwise return */ 4336 if (r == -ENOSYS) 4337 r = 0; 4338 else 4339 return r; 4340 4341 /* Reset handler not implemented, use the default method */ 4342 need_full_reset = 4343 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4344 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4345 4346 /* 4347 * ASIC reset has to be done on all XGMI hive nodes ASAP 4348 * to allow proper links negotiation in FW (within 1 sec) 4349 */ 4350 if (!skip_hw_reset && need_full_reset) { 4351 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4352 /* For XGMI run all resets in parallel to speed up the process */ 4353 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4354 tmp_adev->gmc.xgmi.pending_reset = false; 4355 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4356 r = -EALREADY; 4357 } else 4358 r = amdgpu_asic_reset(tmp_adev); 4359 4360 if (r) { 4361 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4362 r, adev_to_drm(tmp_adev)->unique); 4363 break; 4364 } 4365 } 4366 4367 /* For XGMI wait for all resets to complete before proceed */ 4368 if (!r) { 4369 list_for_each_entry(tmp_adev, 
device_list_handle, reset_list) { 4370 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4371 flush_work(&tmp_adev->xgmi_reset_work); 4372 r = tmp_adev->asic_reset_res; 4373 if (r) 4374 break; 4375 } 4376 } 4377 } 4378 } 4379 4380 if (!r && amdgpu_ras_intr_triggered()) { 4381 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4382 if (tmp_adev->mmhub.ras_funcs && 4383 tmp_adev->mmhub.ras_funcs->reset_ras_error_count) 4384 tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev); 4385 } 4386 4387 amdgpu_ras_intr_cleared(); 4388 } 4389 4390 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4391 if (need_full_reset) { 4392 /* post card */ 4393 r = amdgpu_device_asic_init(tmp_adev); 4394 if (r) { 4395 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4396 } else { 4397 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4398 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4399 if (r) 4400 goto out; 4401 4402 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4403 if (vram_lost) { 4404 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4405 amdgpu_inc_vram_lost(tmp_adev); 4406 } 4407 4408 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT)); 4409 if (r) 4410 goto out; 4411 4412 r = amdgpu_device_fw_loading(tmp_adev); 4413 if (r) 4414 return r; 4415 4416 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4417 if (r) 4418 goto out; 4419 4420 if (vram_lost) 4421 amdgpu_device_fill_reset_magic(tmp_adev); 4422 4423 /* 4424 * Add this ASIC as tracked as reset was already 4425 * complete successfully. 
4426 */ 4427 amdgpu_register_gpu_instance(tmp_adev); 4428 4429 if (!reset_context->hive && 4430 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4431 amdgpu_xgmi_add_device(tmp_adev); 4432 4433 r = amdgpu_device_ip_late_init(tmp_adev); 4434 if (r) 4435 goto out; 4436 4437 amdgpu_fbdev_set_suspend(tmp_adev, 0); 4438 4439 /* 4440 * The GPU enters bad state once faulty pages 4441 * by ECC has reached the threshold, and ras 4442 * recovery is scheduled next. So add one check 4443 * here to break recovery if it indeed exceeds 4444 * bad page threshold, and remind user to 4445 * retire this GPU or setting one bigger 4446 * bad_page_threshold value to fix this once 4447 * probing driver again. 4448 */ 4449 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 4450 /* must succeed. */ 4451 amdgpu_ras_resume(tmp_adev); 4452 } else { 4453 r = -EINVAL; 4454 goto out; 4455 } 4456 4457 /* Update PSP FW topology after reset */ 4458 if (reset_context->hive && 4459 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4460 r = amdgpu_xgmi_update_topology( 4461 reset_context->hive, tmp_adev); 4462 } 4463 } 4464 4465 out: 4466 if (!r) { 4467 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4468 r = amdgpu_ib_ring_tests(tmp_adev); 4469 if (r) { 4470 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4471 r = amdgpu_device_ip_suspend(tmp_adev); 4472 need_full_reset = true; 4473 r = -EAGAIN; 4474 goto end; 4475 } 4476 } 4477 4478 if (!r) 4479 r = amdgpu_device_recover_vram(tmp_adev); 4480 else 4481 tmp_adev->asic_reset_res = r; 4482 } 4483 4484 end: 4485 if (need_full_reset) 4486 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4487 else 4488 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4489 return r; 4490 } 4491 4492 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, 4493 struct amdgpu_hive_info *hive) 4494 { 4495 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) 4496 return false; 4497 4498 if (hive) { 4499 down_write_nest_lock(&adev->reset_sem, 
&hive->hive_lock); 4500 } else { 4501 down_write(&adev->reset_sem); 4502 } 4503 4504 switch (amdgpu_asic_reset_method(adev)) { 4505 case AMD_RESET_METHOD_MODE1: 4506 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4507 break; 4508 case AMD_RESET_METHOD_MODE2: 4509 adev->mp1_state = PP_MP1_STATE_RESET; 4510 break; 4511 default: 4512 adev->mp1_state = PP_MP1_STATE_NONE; 4513 break; 4514 } 4515 4516 return true; 4517 } 4518 4519 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 4520 { 4521 amdgpu_vf_error_trans_all(adev); 4522 adev->mp1_state = PP_MP1_STATE_NONE; 4523 atomic_set(&adev->in_gpu_reset, 0); 4524 up_write(&adev->reset_sem); 4525 } 4526 4527 /* 4528 * to lockup a list of amdgpu devices in a hive safely, if not a hive 4529 * with multiple nodes, it will be similar as amdgpu_device_lock_adev. 4530 * 4531 * unlock won't require roll back. 4532 */ 4533 static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive) 4534 { 4535 struct amdgpu_device *tmp_adev = NULL; 4536 4537 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4538 if (!hive) { 4539 dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes"); 4540 return -ENODEV; 4541 } 4542 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 4543 if (!amdgpu_device_lock_adev(tmp_adev, hive)) 4544 goto roll_back; 4545 } 4546 } else if (!amdgpu_device_lock_adev(adev, hive)) 4547 return -EAGAIN; 4548 4549 return 0; 4550 roll_back: 4551 if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) { 4552 /* 4553 * if the lockup iteration break in the middle of a hive, 4554 * it may means there may has a race issue, 4555 * or a hive device locked up independently. 4556 * we may be in trouble and may not, so will try to roll back 4557 * the lock and give out a warnning. 4558 */ 4559 dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. 
Rolling back to unlock"); 4560 list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) { 4561 amdgpu_device_unlock_adev(tmp_adev); 4562 } 4563 } 4564 return -EAGAIN; 4565 } 4566 4567 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 4568 { 4569 struct pci_dev *p = NULL; 4570 4571 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4572 adev->pdev->bus->number, 1); 4573 if (p) { 4574 pm_runtime_enable(&(p->dev)); 4575 pm_runtime_resume(&(p->dev)); 4576 } 4577 } 4578 4579 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 4580 { 4581 enum amd_reset_method reset_method; 4582 struct pci_dev *p = NULL; 4583 u64 expires; 4584 4585 /* 4586 * For now, only BACO and mode1 reset are confirmed 4587 * to suffer the audio issue without proper suspended. 4588 */ 4589 reset_method = amdgpu_asic_reset_method(adev); 4590 if ((reset_method != AMD_RESET_METHOD_BACO) && 4591 (reset_method != AMD_RESET_METHOD_MODE1)) 4592 return -EINVAL; 4593 4594 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4595 adev->pdev->bus->number, 1); 4596 if (!p) 4597 return -ENODEV; 4598 4599 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 4600 if (!expires) 4601 /* 4602 * If we cannot get the audio device autosuspend delay, 4603 * a fixed 4S interval will be used. Considering 3S is 4604 * the audio controller default autosuspend delay setting. 4605 * 4S used here is guaranteed to cover that. 4606 */ 4607 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 4608 4609 while (!pm_runtime_status_suspended(&(p->dev))) { 4610 if (!pm_runtime_suspend(&(p->dev))) 4611 break; 4612 4613 if (expires < ktime_get_mono_fast_ns()) { 4614 dev_warn(adev->dev, "failed to suspend display audio\n"); 4615 /* TODO: abort the succeeding gpu reset? 
*/ 4616 return -ETIMEDOUT; 4617 } 4618 } 4619 4620 pm_runtime_disable(&(p->dev)); 4621 4622 return 0; 4623 } 4624 4625 void amdgpu_device_recheck_guilty_jobs( 4626 struct amdgpu_device *adev, struct list_head *device_list_handle, 4627 struct amdgpu_reset_context *reset_context) 4628 { 4629 int i, r = 0; 4630 4631 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4632 struct amdgpu_ring *ring = adev->rings[i]; 4633 int ret = 0; 4634 struct drm_sched_job *s_job; 4635 4636 if (!ring || !ring->sched.thread) 4637 continue; 4638 4639 s_job = list_first_entry_or_null(&ring->sched.pending_list, 4640 struct drm_sched_job, list); 4641 if (s_job == NULL) 4642 continue; 4643 4644 /* clear job's guilty and depend the folowing step to decide the real one */ 4645 drm_sched_reset_karma(s_job); 4646 drm_sched_resubmit_jobs_ext(&ring->sched, 1); 4647 4648 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); 4649 if (ret == 0) { /* timeout */ 4650 DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", 4651 ring->sched.name, s_job->id); 4652 4653 /* set guilty */ 4654 drm_sched_increase_karma(s_job); 4655 retry: 4656 /* do hw reset */ 4657 if (amdgpu_sriov_vf(adev)) { 4658 amdgpu_virt_fini_data_exchange(adev); 4659 r = amdgpu_device_reset_sriov(adev, false); 4660 if (r) 4661 adev->asic_reset_res = r; 4662 } else { 4663 clear_bit(AMDGPU_SKIP_HW_RESET, 4664 &reset_context->flags); 4665 r = amdgpu_do_asic_reset(device_list_handle, 4666 reset_context); 4667 if (r && r == -EAGAIN) 4668 goto retry; 4669 } 4670 4671 /* 4672 * add reset counter so that the following 4673 * resubmitted job could flush vmid 4674 */ 4675 atomic_inc(&adev->gpu_reset_counter); 4676 continue; 4677 } 4678 4679 /* got the hw fence, signal finished fence */ 4680 atomic_dec(ring->sched.score); 4681 dma_fence_get(&s_job->s_fence->finished); 4682 dma_fence_signal(&s_job->s_fence->finished); 4683 dma_fence_put(&s_job->s_fence->finished); 4684 4685 /* remove node from list and free the job */ 4686 
spin_lock(&ring->sched.job_list_lock); 4687 list_del_init(&s_job->list); 4688 spin_unlock(&ring->sched.job_list_lock); 4689 ring->sched.ops->free_job(s_job); 4690 } 4691 } 4692 4693 /** 4694 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 4695 * 4696 * @adev: amdgpu_device pointer 4697 * @job: which job trigger hang 4698 * 4699 * Attempt to reset the GPU if it has hung (all asics). 4700 * Attempt to do soft-reset or full-reset and reinitialize Asic 4701 * Returns 0 for success or an error on failure. 4702 */ 4703 4704 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 4705 struct amdgpu_job *job) 4706 { 4707 struct list_head device_list, *device_list_handle = NULL; 4708 bool job_signaled = false; 4709 struct amdgpu_hive_info *hive = NULL; 4710 struct amdgpu_device *tmp_adev = NULL; 4711 int i, r = 0; 4712 bool need_emergency_restart = false; 4713 bool audio_suspended = false; 4714 int tmp_vram_lost_counter; 4715 struct amdgpu_reset_context reset_context; 4716 4717 memset(&reset_context, 0, sizeof(reset_context)); 4718 4719 /* 4720 * Special case: RAS triggered and full reset isn't supported 4721 */ 4722 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 4723 4724 /* 4725 * Flush RAM to disk so that after reboot 4726 * the user can read log and see why the system rebooted. 4727 */ 4728 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 4729 DRM_WARN("Emergency reboot."); 4730 4731 ksys_sync_helper(); 4732 emergency_restart(); 4733 } 4734 4735 dev_info(adev->dev, "GPU %s begin!\n", 4736 need_emergency_restart ? "jobs stop":"reset"); 4737 4738 /* 4739 * Here we trylock to avoid chain of resets executing from 4740 * either trigger by jobs on different adevs in XGMI hive or jobs on 4741 * different schedulers for same device while this TO handler is running. 4742 * We always reset all schedulers for device and all devices for XGMI 4743 * hive so that should take care of them too. 
4744 */ 4745 hive = amdgpu_get_xgmi_hive(adev); 4746 if (hive) { 4747 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { 4748 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", 4749 job ? job->base.id : -1, hive->hive_id); 4750 amdgpu_put_xgmi_hive(hive); 4751 if (job) 4752 drm_sched_increase_karma(&job->base); 4753 return 0; 4754 } 4755 mutex_lock(&hive->hive_lock); 4756 } 4757 4758 reset_context.method = AMD_RESET_METHOD_NONE; 4759 reset_context.reset_req_dev = adev; 4760 reset_context.job = job; 4761 reset_context.hive = hive; 4762 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 4763 4764 /* 4765 * lock the device before we try to operate the linked list 4766 * if didn't get the device lock, don't touch the linked list since 4767 * others may iterating it. 4768 */ 4769 r = amdgpu_device_lock_hive_adev(adev, hive); 4770 if (r) { 4771 dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", 4772 job ? job->base.id : -1); 4773 4774 /* even we skipped this reset, still need to set the job to guilty */ 4775 if (job) 4776 drm_sched_increase_karma(&job->base); 4777 goto skip_recovery; 4778 } 4779 4780 /* 4781 * Build list of devices to reset. 4782 * In case we are in XGMI hive mode, resort the device list 4783 * to put adev in the 1st position. 
4784 */ 4785 INIT_LIST_HEAD(&device_list); 4786 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4787 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) 4788 list_add_tail(&tmp_adev->reset_list, &device_list); 4789 if (!list_is_first(&adev->reset_list, &device_list)) 4790 list_rotate_to_front(&adev->reset_list, &device_list); 4791 device_list_handle = &device_list; 4792 } else { 4793 list_add_tail(&adev->reset_list, &device_list); 4794 device_list_handle = &device_list; 4795 } 4796 4797 /* block all schedulers and reset given job's ring */ 4798 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4799 /* 4800 * Try to put the audio codec into suspend state 4801 * before gpu reset started. 4802 * 4803 * Due to the power domain of the graphics device 4804 * is shared with AZ power domain. Without this, 4805 * we may change the audio hardware from behind 4806 * the audio driver's back. That will trigger 4807 * some audio codec errors. 4808 */ 4809 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 4810 audio_suspended = true; 4811 4812 amdgpu_ras_set_error_query_ready(tmp_adev, false); 4813 4814 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 4815 4816 if (!amdgpu_sriov_vf(tmp_adev)) 4817 amdgpu_amdkfd_pre_reset(tmp_adev); 4818 4819 /* 4820 * Mark these ASICs to be reseted as untracked first 4821 * And add them back after reset completed 4822 */ 4823 amdgpu_unregister_gpu_instance(tmp_adev); 4824 4825 amdgpu_fbdev_set_suspend(tmp_adev, 1); 4826 4827 /* disable ras on ALL IPs */ 4828 if (!need_emergency_restart && 4829 amdgpu_device_ip_need_full_reset(tmp_adev)) 4830 amdgpu_ras_suspend(tmp_adev); 4831 4832 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4833 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4834 4835 if (!ring || !ring->sched.thread) 4836 continue; 4837 4838 drm_sched_stop(&ring->sched, job ? 
&job->base : NULL); 4839 4840 if (need_emergency_restart) 4841 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 4842 } 4843 atomic_inc(&tmp_adev->gpu_reset_counter); 4844 } 4845 4846 if (need_emergency_restart) 4847 goto skip_sched_resume; 4848 4849 /* 4850 * Must check guilty signal here since after this point all old 4851 * HW fences are force signaled. 4852 * 4853 * job->base holds a reference to parent fence 4854 */ 4855 if (job && job->base.s_fence->parent && 4856 dma_fence_is_signaled(job->base.s_fence->parent)) { 4857 job_signaled = true; 4858 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 4859 goto skip_hw_reset; 4860 } 4861 4862 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 4863 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4864 r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context); 4865 /*TODO Should we stop ?*/ 4866 if (r) { 4867 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 4868 r, adev_to_drm(tmp_adev)->unique); 4869 tmp_adev->asic_reset_res = r; 4870 } 4871 } 4872 4873 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); 4874 /* Actual ASIC resets if needed.*/ 4875 /* TODO Implement XGMI hive reset logic for SRIOV */ 4876 if (amdgpu_sriov_vf(adev)) { 4877 r = amdgpu_device_reset_sriov(adev, job ? false : true); 4878 if (r) 4879 adev->asic_reset_res = r; 4880 } else { 4881 r = amdgpu_do_asic_reset(device_list_handle, &reset_context); 4882 if (r && r == -EAGAIN) 4883 goto retry; 4884 } 4885 4886 skip_hw_reset: 4887 4888 /* Post ASIC reset for all devs .*/ 4889 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4890 4891 /* 4892 * Sometimes a later bad compute job can block a good gfx job as gfx 4893 * and compute ring share internal GC HW mutually. We add an additional 4894 * guilty jobs recheck step to find the real guilty job, it synchronously 4895 * submits and pends for the first job being signaled. 
If it gets timeout, 4896 * we identify it as a real guilty job. 4897 */ 4898 if (amdgpu_gpu_recovery == 2 && 4899 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) 4900 amdgpu_device_recheck_guilty_jobs( 4901 tmp_adev, device_list_handle, &reset_context); 4902 4903 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4904 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4905 4906 if (!ring || !ring->sched.thread) 4907 continue; 4908 4909 /* No point to resubmit jobs if we didn't HW reset*/ 4910 if (!tmp_adev->asic_reset_res && !job_signaled) 4911 drm_sched_resubmit_jobs(&ring->sched); 4912 4913 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 4914 } 4915 4916 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { 4917 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 4918 } 4919 4920 tmp_adev->asic_reset_res = 0; 4921 4922 if (r) { 4923 /* bad news, how to tell it to userspace ? */ 4924 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 4925 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 4926 } else { 4927 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 4928 } 4929 } 4930 4931 skip_sched_resume: 4932 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4933 /* unlock kfd: SRIOV would do it separately */ 4934 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 4935 amdgpu_amdkfd_post_reset(tmp_adev); 4936 4937 /* kfd_post_reset will do nothing if kfd device is not initialized, 4938 * need to bring up kfd here if it's not be initialized before 4939 */ 4940 if (!adev->kfd.init_complete) 4941 amdgpu_amdkfd_device_init(adev); 4942 4943 if (audio_suspended) 4944 amdgpu_device_resume_display_audio(tmp_adev); 4945 amdgpu_device_unlock_adev(tmp_adev); 4946 } 4947 4948 skip_recovery: 4949 if (hive) { 4950 atomic_set(&hive->in_reset, 0); 4951 mutex_unlock(&hive->hive_lock); 4952 amdgpu_put_xgmi_hive(hive); 4953 } 4954 4955 
if (r && r != -EAGAIN) 4956 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 4957 return r; 4958 } 4959 4960 /** 4961 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 4962 * 4963 * @adev: amdgpu_device pointer 4964 * 4965 * Fetchs and stores in the driver the PCIE capabilities (gen speed 4966 * and lanes) of the slot the device is in. Handles APUs and 4967 * virtualized environments where PCIE config space may not be available. 4968 */ 4969 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 4970 { 4971 struct pci_dev *pdev; 4972 enum pci_bus_speed speed_cap, platform_speed_cap; 4973 enum pcie_link_width platform_link_width; 4974 4975 if (amdgpu_pcie_gen_cap) 4976 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 4977 4978 if (amdgpu_pcie_lane_cap) 4979 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 4980 4981 /* covers APUs as well */ 4982 if (pci_is_root_bus(adev->pdev->bus)) { 4983 if (adev->pm.pcie_gen_mask == 0) 4984 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 4985 if (adev->pm.pcie_mlw_mask == 0) 4986 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 4987 return; 4988 } 4989 4990 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 4991 return; 4992 4993 pcie_bandwidth_available(adev->pdev, NULL, 4994 &platform_speed_cap, &platform_link_width); 4995 4996 if (adev->pm.pcie_gen_mask == 0) { 4997 /* asic caps */ 4998 pdev = adev->pdev; 4999 speed_cap = pcie_get_speed_cap(pdev); 5000 if (speed_cap == PCI_SPEED_UNKNOWN) { 5001 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5002 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5003 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5004 } else { 5005 if (speed_cap == PCIE_SPEED_32_0GT) 5006 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5007 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5008 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5009 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5010 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5011 else if 
(speed_cap == PCIE_SPEED_16_0GT) 5012 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5013 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5014 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5015 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5016 else if (speed_cap == PCIE_SPEED_8_0GT) 5017 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5018 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5019 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5020 else if (speed_cap == PCIE_SPEED_5_0GT) 5021 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5022 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5023 else 5024 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5025 } 5026 /* platform caps */ 5027 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5028 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5029 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5030 } else { 5031 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5032 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5033 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5034 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5035 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5036 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5037 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5038 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5039 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5040 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5041 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5042 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5043 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5044 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5045 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5046 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5047 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5048 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5049 else 5050 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5051 5052 } 5053 } 5054 if (adev->pm.pcie_mlw_mask == 0) { 5055 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5056 
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5057 } else { 5058 switch (platform_link_width) { 5059 case PCIE_LNK_X32: 5060 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5061 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5062 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5063 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5064 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5065 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5066 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5067 break; 5068 case PCIE_LNK_X16: 5069 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5070 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5071 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5072 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5073 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5074 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5075 break; 5076 case PCIE_LNK_X12: 5077 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5078 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5079 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5080 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5081 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5082 break; 5083 case PCIE_LNK_X8: 5084 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5085 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5086 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5087 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5088 break; 5089 case PCIE_LNK_X4: 5090 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5091 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5092 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5093 break; 5094 case PCIE_LNK_X2: 5095 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5096 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5097 break; 5098 case PCIE_LNK_X1: 5099 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5100 break; 5101 default: 5102 break; 5103 } 5104 } 5105 } 5106 } 5107 5108 int amdgpu_device_baco_enter(struct drm_device *dev) 5109 { 5110 struct amdgpu_device *adev = drm_to_adev(dev); 5111 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5112 5113 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5114 return -ENOTSUPP; 5115 5116 if (ras && ras->supported && 
adev->nbio.funcs->enable_doorbell_interrupt) 5117 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5118 5119 return amdgpu_dpm_baco_enter(adev); 5120 } 5121 5122 int amdgpu_device_baco_exit(struct drm_device *dev) 5123 { 5124 struct amdgpu_device *adev = drm_to_adev(dev); 5125 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5126 int ret = 0; 5127 5128 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5129 return -ENOTSUPP; 5130 5131 ret = amdgpu_dpm_baco_exit(adev); 5132 if (ret) 5133 return ret; 5134 5135 if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt) 5136 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5137 5138 return 0; 5139 } 5140 5141 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) 5142 { 5143 int i; 5144 5145 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5146 struct amdgpu_ring *ring = adev->rings[i]; 5147 5148 if (!ring || !ring->sched.thread) 5149 continue; 5150 5151 cancel_delayed_work_sync(&ring->sched.work_tdr); 5152 } 5153 } 5154 5155 /** 5156 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5157 * @pdev: PCI device struct 5158 * @state: PCI channel state 5159 * 5160 * Description: Called when a PCI error is detected. 5161 * 5162 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
5163 */ 5164 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5165 { 5166 struct drm_device *dev = pci_get_drvdata(pdev); 5167 struct amdgpu_device *adev = drm_to_adev(dev); 5168 int i; 5169 5170 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5171 5172 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5173 DRM_WARN("No support for XGMI hive yet..."); 5174 return PCI_ERS_RESULT_DISCONNECT; 5175 } 5176 5177 switch (state) { 5178 case pci_channel_io_normal: 5179 return PCI_ERS_RESULT_CAN_RECOVER; 5180 /* Fatal error, prepare for slot reset */ 5181 case pci_channel_io_frozen: 5182 /* 5183 * Cancel and wait for all TDRs in progress if failing to 5184 * set adev->in_gpu_reset in amdgpu_device_lock_adev 5185 * 5186 * Locking adev->reset_sem will prevent any external access 5187 * to GPU during PCI error recovery 5188 */ 5189 while (!amdgpu_device_lock_adev(adev, NULL)) 5190 amdgpu_cancel_all_tdr(adev); 5191 5192 /* 5193 * Block any work scheduling as we do for regular GPU reset 5194 * for the duration of the recovery 5195 */ 5196 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5197 struct amdgpu_ring *ring = adev->rings[i]; 5198 5199 if (!ring || !ring->sched.thread) 5200 continue; 5201 5202 drm_sched_stop(&ring->sched, NULL); 5203 } 5204 atomic_inc(&adev->gpu_reset_counter); 5205 return PCI_ERS_RESULT_NEED_RESET; 5206 case pci_channel_io_perm_failure: 5207 /* Permanent error, prepare for device removal */ 5208 return PCI_ERS_RESULT_DISCONNECT; 5209 } 5210 5211 return PCI_ERS_RESULT_NEED_RESET; 5212 } 5213 5214 /** 5215 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5216 * @pdev: pointer to PCI device 5217 */ 5218 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5219 { 5220 5221 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5222 5223 /* TODO - dump whatever for debugging purposes */ 5224 5225 /* This called only if amdgpu_pci_error_detected returns 5226 * PCI_ERS_RESULT_CAN_RECOVER. 
Read/write to the device still 5227 * works, no need to reset slot. 5228 */ 5229 5230 return PCI_ERS_RESULT_RECOVERED; 5231 } 5232 5233 /** 5234 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5235 * @pdev: PCI device struct 5236 * 5237 * Description: This routine is called by the pci error recovery 5238 * code after the PCI slot has been reset, just before we 5239 * should resume normal operations. 5240 */ 5241 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5242 { 5243 struct drm_device *dev = pci_get_drvdata(pdev); 5244 struct amdgpu_device *adev = drm_to_adev(dev); 5245 int r, i; 5246 struct amdgpu_reset_context reset_context; 5247 u32 memsize; 5248 struct list_head device_list; 5249 5250 DRM_INFO("PCI error: slot reset callback!!\n"); 5251 5252 memset(&reset_context, 0, sizeof(reset_context)); 5253 5254 INIT_LIST_HEAD(&device_list); 5255 list_add_tail(&adev->reset_list, &device_list); 5256 5257 /* wait for asic to come out of reset */ 5258 msleep(500); 5259 5260 /* Restore PCI confspace */ 5261 amdgpu_device_load_pci_state(pdev); 5262 5263 /* confirm ASIC came out of reset */ 5264 for (i = 0; i < adev->usec_timeout; i++) { 5265 memsize = amdgpu_asic_get_config_memsize(adev); 5266 5267 if (memsize != 0xffffffff) 5268 break; 5269 udelay(1); 5270 } 5271 if (memsize == 0xffffffff) { 5272 r = -ETIME; 5273 goto out; 5274 } 5275 5276 reset_context.method = AMD_RESET_METHOD_NONE; 5277 reset_context.reset_req_dev = adev; 5278 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5279 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5280 5281 adev->in_pci_err_recovery = true; 5282 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5283 adev->in_pci_err_recovery = false; 5284 if (r) 5285 goto out; 5286 5287 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5288 5289 out: 5290 if (!r) { 5291 if (amdgpu_device_cache_pci_state(adev->pdev)) 5292 pci_restore_state(adev->pdev); 5293 5294 DRM_INFO("PCIe error recovery succeeded\n"); 
5295 } else { 5296 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5297 amdgpu_device_unlock_adev(adev); 5298 } 5299 5300 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5301 } 5302 5303 /** 5304 * amdgpu_pci_resume() - resume normal ops after PCI reset 5305 * @pdev: pointer to PCI device 5306 * 5307 * Called when the error recovery driver tells us that its 5308 * OK to resume normal operation. 5309 */ 5310 void amdgpu_pci_resume(struct pci_dev *pdev) 5311 { 5312 struct drm_device *dev = pci_get_drvdata(pdev); 5313 struct amdgpu_device *adev = drm_to_adev(dev); 5314 int i; 5315 5316 5317 DRM_INFO("PCI error: resume callback!!\n"); 5318 5319 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5320 struct amdgpu_ring *ring = adev->rings[i]; 5321 5322 if (!ring || !ring->sched.thread) 5323 continue; 5324 5325 5326 drm_sched_resubmit_jobs(&ring->sched); 5327 drm_sched_start(&ring->sched, true); 5328 } 5329 5330 amdgpu_device_unlock_adev(adev); 5331 } 5332 5333 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5334 { 5335 struct drm_device *dev = pci_get_drvdata(pdev); 5336 struct amdgpu_device *adev = drm_to_adev(dev); 5337 int r; 5338 5339 r = pci_save_state(pdev); 5340 if (!r) { 5341 kfree(adev->pci_state); 5342 5343 adev->pci_state = pci_store_saved_state(pdev); 5344 5345 if (!adev->pci_state) { 5346 DRM_ERROR("Failed to store PCI saved state"); 5347 return false; 5348 } 5349 } else { 5350 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5351 return false; 5352 } 5353 5354 return true; 5355 } 5356 5357 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5358 { 5359 struct drm_device *dev = pci_get_drvdata(pdev); 5360 struct amdgpu_device *adev = drm_to_adev(dev); 5361 int r; 5362 5363 if (!adev->pci_state) 5364 return false; 5365 5366 r = pci_load_saved_state(pdev, adev->pci_state); 5367 5368 if (!r) { 5369 pci_restore_state(pdev); 5370 } else { 5371 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5372 return false; 5373 } 5374 5375 return 
true; 5376 } 5377 5378 5379