/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/pci.h>
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS	2000

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"NAVI10",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"LAST",
};
/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
		amdgpu_device_get_pcie_replay_count, NULL);

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_name(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
}

static DEVICE_ATTR(product_name, S_IRUGO,
		amdgpu_device_get_product_name, NULL);

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
}

static DEVICE_ATTR(product_number, S_IRUGO,
		amdgpu_device_get_product_number, NULL);

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);
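/*
 * Example (illustrative only, not part of the driver): the attributes above
 * are registered on the PCI device, so userspace can read them through
 * sysfs, e.g. via the drm card device symlink (paths assumed here for
 * illustration):
 *
 *	cat /sys/class/drm/card0/device/pcie_replay_count
 *	cat /sys/class/drm/card0/device/serial_number
 */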
/**
 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with HG/PX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->flags & AMD_IS_PX)
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       uint32_t *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0;
	uint64_t last;


#ifdef CONFIG_64BIT
	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		void __iomem *addr = adev->mman.aper_base_kaddr + pos;
		size_t count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_asic_flush_hdp(adev, NULL);
		} else {
			amdgpu_asic_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

		if (count == size)
			return;

		pos += count;
		buf += count / 4;
		size -= count;
	}
#endif

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		uint32_t tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *buf++);
		else
			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
	}
	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
}
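/*
 * Usage sketch (illustrative only, not part of the driver): read one dword
 * from a VRAM offset into a local buffer. The offset used here is an
 * arbitrary placeholder for the example.
 *
 *	uint32_t val;
 *
 *	amdgpu_device_vram_access(adev, 0x1000, &val, sizeof(val), false);
 */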
/*
 * register access helper functions.
 */
/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (adev->in_pci_err_recovery)
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */
/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (adev->in_pci_err_recovery)
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (adev->in_pci_err_recovery)
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}
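/*
 * Usage sketch (illustrative only): a read-modify-write of a dword register
 * through the helpers above. The register offset and bit mask are
 * placeholders; driver code normally goes through the RREG32()/WREG32()
 * style macros rather than calling these helpers directly.
 *
 *	uint32_t tmp = amdgpu_device_rreg(adev, reg_offset, 0);
 *
 *	tmp |= enable_bit_mask;
 *	amdgpu_device_wreg(adev, reg_offset, tmp, 0);
 */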
/*
 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
 *
 * this function is invoked only for the debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (adev->in_pci_err_recovery)
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_io_rreg - read an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 *
 * Returns the 32 bit value from the offset specified.
 */
u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if ((reg * 4) < adev->rio_mem_size)
		return ioread32(adev->rio_mem + (reg * 4));
	else {
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		return ioread32(adev->rio_mem + (mmMM_DATA * 4));
	}
}

/**
 * amdgpu_io_wreg - write to an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
{
	if (adev->in_pci_err_recovery)
		return;

	if ((reg * 4) < adev->rio_mem_size)
		iowrite32(v, adev->rio_mem + (reg * 4));
	else {
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (adev->in_pci_err_recovery)
		return;

	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}
/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (adev->in_pci_err_recovery)
		return;

	if (index < adev->doorbell.num_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 pcie_index, u32 pcie_data,
				u32 reg_addr)
{
	unsigned long flags;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 pcie_index, u32 pcie_data,
				  u32 reg_addr)
{
	unsigned long flags;
	u64 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 pcie_index, u32 pcie_data,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
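/*
 * Usage sketch (illustrative only): the indirect helpers implement the
 * classic index/data register pair. A SOC file can wire its pcie_rreg
 * callback to them using whatever index/data offsets its NBIO block
 * exposes; the wrapper below is a sketch, not driver code.
 *
 *	static u32 example_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *	{
 *		u32 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 *		u32 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 *
 *		return amdgpu_device_indirect_rreg(adev, pcie_index,
 *						   pcie_data, reg);
 *	}
 */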
/**
 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 pcie_index, u32 pcie_data,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}
/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
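/*
 * Usage sketch (illustrative only): golden register programming takes
 * (offset, and_mask, or_mask) triples. The register names and values below
 * are placeholders, not real golden settings.
 *
 *	static const u32 example_golden_settings[] = {
 *		mmEXAMPLE_REG_A, 0xffffffff, 0x00000100,
 *		mmEXAMPLE_REG_B, 0x0000000f, 0x00000002,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *						ARRAY_SIZE(example_golden_settings));
 */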
/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
					     adev->doorbell_index.max_assignment+1);
	if (adev->doorbell.num_doorbells == 0)
		return -EINVAL;

	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
	 * paging queue doorbell use the second page. The
	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
	 * doorbells are in the first page. So with paging queue enabled,
	 * the max num_doorbells should be increased by one page (0x400 in dwords).
	 */
	if (adev->asic_type >= CHIP_VEGA10)
		adev->doorbell.num_doorbells += 0x400;

	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
}



/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}
/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}
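/*
 * Usage sketch (illustrative only): allocate a writeback slot, derive its
 * CPU and GPU addresses from the returned dword offset, and release it
 * again. Error handling is omitted for brevity.
 *
 *	u32 wb;
 *	u64 wb_gpu_addr;
 *	volatile uint32_t *wb_cpu_addr;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		wb_gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *		wb_cpu_addr = &adev->wb.wb[wb];
 *		...
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */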
/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
	u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup,
 * or if a post is needed because a hw reset was performed.
 * Returns true if posting is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In the whole GPU pass-through virtualization case,
		 * after a VM reboot some old smc fw still needs the driver to do
		 * a vPost, otherwise the gpu hangs. smc fw versions above 22.15
		 * don't have this flaw, so we force a vPost for smc versions
		 * below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;
			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @cookie: amdgpu_device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
{
	struct amdgpu_device *adev = cookie;
	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}
/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}
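/*
 * Worked example (illustrative only): with 4KB pages there is a 12 bit
 * in-page offset. For the minimum block size of 9 bits, one page table
 * covers 2^9 pages, i.e. 2^(12 + 9) bytes = 2MB of virtual address space;
 * the remaining address bits are translated by the page directory.
 */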
/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_num_kcq == -1) {
		amdgpu_num_kcq = 8;
	} else if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
		amdgpu_num_kcq = 8;
		dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
	}

	amdgpu_gmc_noretry_set(adev);

	return 0;
}
/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes the
 * asic before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(dev->pdev, PCI_D0);
		amdgpu_device_load_pci_state(dev->pdev);
		r = pci_enable_device(dev->pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
		drm_kms_helper_poll_enable(dev);
	} else {
		pr_info("switched off\n");
		drm_kms_helper_poll_disable(dev);
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(dev->pdev);
		/* Shut down the device */
		pci_disable_device(dev->pdev);
		pci_set_power_state(dev->pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}
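/*
 * Usage sketch (illustrative only): request clockgating for all GFX IP
 * instances on the device. Error handling is omitted for brevity.
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_CG_STATE_GATE);
 */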
/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u32 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}
/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}
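/*
 * Usage sketch (illustrative only): check whether the GFX IP on this asic
 * is at least version 9.0 before taking a version-specific path. The
 * handler name is a placeholder.
 *
 *	if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX, 9, 0))
 *		handle_gfx9_or_newer(adev);
 */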
/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		struct drm_device *ddev = adev_to_drm(adev);
		const char *pci_address_name = pci_name(ddev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}
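/*
 * Example (illustrative only): per the parsing above, the virtual_display
 * module parameter is a semicolon separated list of
 * "<pci address>,<number of crtcs>" entries, or "all" to match every
 * device. The PCI address below is a placeholder.
 *
 *	modprobe amdgpu virtual_display=0000:26:00.0,2
 */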
/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
		amdgpu_discovery_get_gfx_info(adev);

		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
		 */
		if (adev->asic_type != CHIP_NAVI12)
			return 0;
	}

	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_VERDE:
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_OLAND:
	case CHIP_HAINAN:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TOPAZ:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
	case CHIP_VEGA20:
	case CHIP_SIENNA_CICHLID:
	case CHIP_NAVY_FLOUNDER:
	default:
		return 0;
	case CHIP_VEGA10:
		chip_name = "vega10";
		break;
	case CHIP_VEGA12:
		chip_name = "vega12";
		break;
	case CHIP_RAVEN:
		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
			chip_name = "raven2";
		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
			chip_name = "picasso";
		else
			chip_name = "raven";
		break;
	case CHIP_ARCTURUS:
		chip_name = "arcturus";
		break;
	case CHIP_RENOIR:
		chip_name = "renoir";
		break;
	case CHIP_NAVI10:
		chip_name = "navi10";
		break;
	case CHIP_NAVI14:
		chip_name = "navi14";
		break;
	case CHIP_NAVI12:
		chip_name = "navi12";
		break;
	}

	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
	if (err) {
		dev_err(adev->dev,
			"Failed to load gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}
	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
	if (err) {
		dev_err(adev->dev,
			"Failed to validate gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}

	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);

	switch (hdr->version_major) {
	case 1:
	{
		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
								le32_to_cpu(hdr->header.ucode_array_offset_bytes));
		/*
		 * Should be dropped when DAL no longer needs it.
		 */
		if (adev->asic_type == CHIP_NAVI12)
			goto parse_soc_bounding_box;

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * soc bounding box info is not integrated in discovery table,
		 * we always need to parse it from gpu info firmware if needed.
		 */
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}
1913 */ 1914 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 1915 { 1916 int i, r; 1917 1918 amdgpu_device_enable_virtual_display(adev); 1919 1920 if (amdgpu_sriov_vf(adev)) { 1921 r = amdgpu_virt_request_full_gpu(adev, true); 1922 if (r) 1923 return r; 1924 } 1925 1926 switch (adev->asic_type) { 1927 #ifdef CONFIG_DRM_AMDGPU_SI 1928 case CHIP_VERDE: 1929 case CHIP_TAHITI: 1930 case CHIP_PITCAIRN: 1931 case CHIP_OLAND: 1932 case CHIP_HAINAN: 1933 adev->family = AMDGPU_FAMILY_SI; 1934 r = si_set_ip_blocks(adev); 1935 if (r) 1936 return r; 1937 break; 1938 #endif 1939 #ifdef CONFIG_DRM_AMDGPU_CIK 1940 case CHIP_BONAIRE: 1941 case CHIP_HAWAII: 1942 case CHIP_KAVERI: 1943 case CHIP_KABINI: 1944 case CHIP_MULLINS: 1945 if (adev->flags & AMD_IS_APU) 1946 adev->family = AMDGPU_FAMILY_KV; 1947 else 1948 adev->family = AMDGPU_FAMILY_CI; 1949 1950 r = cik_set_ip_blocks(adev); 1951 if (r) 1952 return r; 1953 break; 1954 #endif 1955 case CHIP_TOPAZ: 1956 case CHIP_TONGA: 1957 case CHIP_FIJI: 1958 case CHIP_POLARIS10: 1959 case CHIP_POLARIS11: 1960 case CHIP_POLARIS12: 1961 case CHIP_VEGAM: 1962 case CHIP_CARRIZO: 1963 case CHIP_STONEY: 1964 if (adev->flags & AMD_IS_APU) 1965 adev->family = AMDGPU_FAMILY_CZ; 1966 else 1967 adev->family = AMDGPU_FAMILY_VI; 1968 1969 r = vi_set_ip_blocks(adev); 1970 if (r) 1971 return r; 1972 break; 1973 case CHIP_VEGA10: 1974 case CHIP_VEGA12: 1975 case CHIP_VEGA20: 1976 case CHIP_RAVEN: 1977 case CHIP_ARCTURUS: 1978 case CHIP_RENOIR: 1979 if (adev->flags & AMD_IS_APU) 1980 adev->family = AMDGPU_FAMILY_RV; 1981 else 1982 adev->family = AMDGPU_FAMILY_AI; 1983 1984 r = soc15_set_ip_blocks(adev); 1985 if (r) 1986 return r; 1987 break; 1988 case CHIP_NAVI10: 1989 case CHIP_NAVI14: 1990 case CHIP_NAVI12: 1991 case CHIP_SIENNA_CICHLID: 1992 case CHIP_NAVY_FLOUNDER: 1993 adev->family = AMDGPU_FAMILY_NV; 1994 1995 r = nv_set_ip_blocks(adev); 1996 if (r) 1997 return r; 1998 break; 1999 default: 2000 /* FIXME: not supported yet */ 2001 return -EINVAL; 2002 } 2003 2004 amdgpu_amdkfd_device_probe(adev); 2005 2006 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2007 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2008 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2009 2010 for (i = 0; i < adev->num_ip_blocks; i++) { 2011 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2012 DRM_ERROR("disabled ip block: %d <%s>\n", 2013 i, adev->ip_blocks[i].version->funcs->name); 2014 adev->ip_blocks[i].status.valid = false; 2015 } else { 2016 if (adev->ip_blocks[i].version->funcs->early_init) { 2017 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2018 if (r == -ENOENT) { 2019 adev->ip_blocks[i].status.valid = false; 2020 } else if (r) { 2021 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2022 adev->ip_blocks[i].version->funcs->name, r); 2023 return r; 2024 } else { 2025 adev->ip_blocks[i].status.valid = true; 2026 } 2027 } else { 2028 adev->ip_blocks[i].status.valid = true; 2029 } 2030 } 2031 /* get the vbios after the asic_funcs are set up */ 2032 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2033 r = amdgpu_device_parse_gpu_info_fw(adev); 2034 if (r) 2035 return r; 2036 2037 /* Read BIOS */ 2038 if (!amdgpu_get_bios(adev)) 2039 return -EINVAL; 2040 2041 r = amdgpu_atombios_init(adev); 2042 if (r) { 2043 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2044 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2045 return r; 2046 } 2047 } 2048 } 2049 2050 adev->cg_flags &= amdgpu_cg_mask; 2051 
adev->pg_flags &= amdgpu_pg_mask; 2052 2053 return 0; 2054 } 2055 2056 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2057 { 2058 int i, r; 2059 2060 for (i = 0; i < adev->num_ip_blocks; i++) { 2061 if (!adev->ip_blocks[i].status.sw) 2062 continue; 2063 if (adev->ip_blocks[i].status.hw) 2064 continue; 2065 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2066 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2067 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2068 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2069 if (r) { 2070 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2071 adev->ip_blocks[i].version->funcs->name, r); 2072 return r; 2073 } 2074 adev->ip_blocks[i].status.hw = true; 2075 } 2076 } 2077 2078 return 0; 2079 } 2080 2081 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2082 { 2083 int i, r; 2084 2085 for (i = 0; i < adev->num_ip_blocks; i++) { 2086 if (!adev->ip_blocks[i].status.sw) 2087 continue; 2088 if (adev->ip_blocks[i].status.hw) 2089 continue; 2090 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2091 if (r) { 2092 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2093 adev->ip_blocks[i].version->funcs->name, r); 2094 return r; 2095 } 2096 adev->ip_blocks[i].status.hw = true; 2097 } 2098 2099 return 0; 2100 } 2101 2102 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2103 { 2104 int r = 0; 2105 int i; 2106 uint32_t smu_version; 2107 2108 if (adev->asic_type >= CHIP_VEGA10) { 2109 for (i = 0; i < adev->num_ip_blocks; i++) { 2110 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2111 continue; 2112 2113 /* no need to do the fw loading again if already done*/ 2114 if (adev->ip_blocks[i].status.hw == true) 2115 break; 2116 2117 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2118 r = adev->ip_blocks[i].version->funcs->resume(adev); 2119 if (r) { 2120 DRM_ERROR("resume of IP block <%s> failed %d\n", 2121 adev->ip_blocks[i].version->funcs->name, r); 2122 return r; 2123 } 2124 } else { 2125 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2126 if (r) { 2127 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2128 adev->ip_blocks[i].version->funcs->name, r); 2129 return r; 2130 } 2131 } 2132 2133 adev->ip_blocks[i].status.hw = true; 2134 break; 2135 } 2136 } 2137 2138 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2139 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2140 2141 return r; 2142 } 2143 2144 /** 2145 * amdgpu_device_ip_init - run init for hardware IPs 2146 * 2147 * @adev: amdgpu_device pointer 2148 * 2149 * Main initialization pass for hardware IPs. The list of all the hardware 2150 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2151 * are run. sw_init initializes the software state associated with each IP 2152 * and hw_init initializes the hardware associated with each IP. 2153 * Returns 0 on success, negative error code on failure. 
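 * The GMC block additionally gets its hw_init here, ahead of the other
 * blocks, so that VRAM scratch, writeback and (for MCBP/SR-IOV) the static
 * CSA can be allocated; the remaining hw_init work is split across the
 * phase1/phase2 helpers around firmware loading.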
2154 */ 2155 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2156 { 2157 int i, r; 2158 2159 r = amdgpu_ras_init(adev); 2160 if (r) 2161 return r; 2162 2163 for (i = 0; i < adev->num_ip_blocks; i++) { 2164 if (!adev->ip_blocks[i].status.valid) 2165 continue; 2166 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2167 if (r) { 2168 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2169 adev->ip_blocks[i].version->funcs->name, r); 2170 goto init_failed; 2171 } 2172 adev->ip_blocks[i].status.sw = true; 2173 2174 /* need to do gmc hw init early so we can allocate gpu mem */ 2175 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2176 r = amdgpu_device_vram_scratch_init(adev); 2177 if (r) { 2178 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2179 goto init_failed; 2180 } 2181 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2182 if (r) { 2183 DRM_ERROR("hw_init %d failed %d\n", i, r); 2184 goto init_failed; 2185 } 2186 r = amdgpu_device_wb_init(adev); 2187 if (r) { 2188 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2189 goto init_failed; 2190 } 2191 adev->ip_blocks[i].status.hw = true; 2192 2193 /* right after GMC hw init, we create CSA */ 2194 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2195 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2196 AMDGPU_GEM_DOMAIN_VRAM, 2197 AMDGPU_CSA_SIZE); 2198 if (r) { 2199 DRM_ERROR("allocate CSA failed %d\n", r); 2200 goto init_failed; 2201 } 2202 } 2203 } 2204 } 2205 2206 if (amdgpu_sriov_vf(adev)) 2207 amdgpu_virt_init_data_exchange(adev); 2208 2209 r = amdgpu_ib_pool_init(adev); 2210 if (r) { 2211 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2212 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2213 goto init_failed; 2214 } 2215 2216 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2217 if (r) 2218 goto init_failed; 2219 2220 r = amdgpu_device_ip_hw_init_phase1(adev); 2221 if (r) 2222 goto init_failed; 2223 2224 r = amdgpu_device_fw_loading(adev); 2225 if (r) 2226 goto init_failed; 2227 2228 r = amdgpu_device_ip_hw_init_phase2(adev); 2229 if (r) 2230 goto init_failed; 2231 2232 /* 2233 * retired pages will be loaded from eeprom and reserved here, 2234 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2235 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2236 * for I2C communication which only true at this point. 2237 * 2238 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2239 * failure from bad gpu situation and stop amdgpu init process 2240 * accordingly. For other failed cases, it will still release all 2241 * the resource and print error message, rather than returning one 2242 * negative value to upper level. 2243 * 2244 * Note: theoretically, this should be called before all vram allocations 2245 * to protect retired page from abusing 2246 */ 2247 r = amdgpu_ras_recovery_init(adev); 2248 if (r) 2249 goto init_failed; 2250 2251 if (adev->gmc.xgmi.num_physical_nodes > 1) 2252 amdgpu_xgmi_add_device(adev); 2253 amdgpu_amdkfd_device_init(adev); 2254 2255 amdgpu_fru_get_product_info(adev); 2256 2257 init_failed: 2258 if (amdgpu_sriov_vf(adev)) 2259 amdgpu_virt_release_full_gpu(adev, true); 2260 2261 return r; 2262 } 2263 2264 /** 2265 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2266 * 2267 * @adev: amdgpu_device pointer 2268 * 2269 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2270 * this function before a GPU reset. 
If the value is retained after a 2271 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2272 */ 2273 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2274 { 2275 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2276 } 2277 2278 /** 2279 * amdgpu_device_check_vram_lost - check if vram is valid 2280 * 2281 * @adev: amdgpu_device pointer 2282 * 2283 * Checks the reset magic value written to the gart pointer in VRAM. 2284 * The driver calls this after a GPU reset to see if the contents of 2285 * VRAM are lost or not. 2286 * returns true if vram is lost, false if not. 2287 */ 2288 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2289 { 2290 if (memcmp(adev->gart.ptr, adev->reset_magic, 2291 AMDGPU_RESET_MAGIC_NUM)) 2292 return true; 2293 2294 if (!amdgpu_in_reset(adev)) 2295 return false; 2296 2297 /* 2298 * For all ASICs with baco/mode1 reset, the VRAM is 2299 * always assumed to be lost. 2300 */ 2301 switch (amdgpu_asic_reset_method(adev)) { 2302 case AMD_RESET_METHOD_BACO: 2303 case AMD_RESET_METHOD_MODE1: 2304 return true; 2305 default: 2306 return false; 2307 } 2308 } 2309 2310 /** 2311 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2312 * 2313 * @adev: amdgpu_device pointer 2314 * @state: clockgating state (gate or ungate) 2315 * 2316 * The list of all the hardware IPs that make up the asic is walked and the 2317 * set_clockgating_state callbacks are run. 2318 * The late initialization pass enables clockgating for hardware IPs, while 2319 * the fini/suspend pass disables it again. 2320 * Returns 0 on success, negative error code on failure. 2321 */ 2322 2323 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2324 enum amd_clockgating_state state) 2325 { 2326 int i, j, r; 2327 2328 if (amdgpu_emu_mode == 1) 2329 return 0; 2330 /* blocks are gated in IP order and ungated in reverse order */ 2331 for (j = 0; j < adev->num_ip_blocks; j++) { 2332 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2333 if (!adev->ip_blocks[i].status.late_initialized) 2334 continue; 2335 /* skip CG for VCE/UVD, it's handled specially */ 2336 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2337 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2338 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2339 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2340 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2341 /* enable clockgating to save power */ 2342 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2343 state); 2344 if (r) { 2345 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2346 adev->ip_blocks[i].version->funcs->name, r); 2347 return r; 2348 } 2349 } 2350 } 2351 2352 return 0; 2353 } 2354 2355 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state) 2356 { 2357 int i, j, r; 2358 2359 if (amdgpu_emu_mode == 1) 2360 return 0; 2361 2362 for (j = 0; j < adev->num_ip_blocks; j++) { 2363 i = state == AMD_PG_STATE_GATE ?
j : adev->num_ip_blocks - j - 1; 2364 if (!adev->ip_blocks[i].status.late_initialized) 2365 continue; 2366 /* skip PG for VCE/UVD, it's handled specially */ 2367 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2368 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2369 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2370 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2371 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2372 /* enable powergating to save power */ 2373 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2374 state); 2375 if (r) { 2376 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2377 adev->ip_blocks[i].version->funcs->name, r); 2378 return r; 2379 } 2380 } 2381 } 2382 return 0; 2383 } 2384 2385 static int amdgpu_device_enable_mgpu_fan_boost(void) 2386 { 2387 struct amdgpu_gpu_instance *gpu_ins; 2388 struct amdgpu_device *adev; 2389 int i, ret = 0; 2390 2391 mutex_lock(&mgpu_info.mutex); 2392 2393 /* 2394 * MGPU fan boost feature should be enabled 2395 * only when there are two or more dGPUs in 2396 * the system 2397 */ 2398 if (mgpu_info.num_dgpu < 2) 2399 goto out; 2400 2401 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2402 gpu_ins = &(mgpu_info.gpu_ins[i]); 2403 adev = gpu_ins->adev; 2404 if (!(adev->flags & AMD_IS_APU) && 2405 !gpu_ins->mgpu_fan_enabled) { 2406 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2407 if (ret) 2408 break; 2409 2410 gpu_ins->mgpu_fan_enabled = 1; 2411 } 2412 } 2413 2414 out: 2415 mutex_unlock(&mgpu_info.mutex); 2416 2417 return ret; 2418 } 2419 2420 /** 2421 * amdgpu_device_ip_late_init - run late init for hardware IPs 2422 * 2423 * @adev: amdgpu_device pointer 2424 * 2425 * Late initialization pass for hardware IPs. The list of all the hardware 2426 * IPs that make up the asic is walked and the late_init callbacks are run. 2427 * late_init covers any special initialization that an IP requires 2428 * after all of them have been initialized or something that needs to happen 2429 * late in the init process. 2430 * Returns 0 on success, negative error code on failure. 2431 */ 2432 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2433 { 2434 struct amdgpu_gpu_instance *gpu_instance; 2435 int i = 0, r; 2436 2437 for (i = 0; i < adev->num_ip_blocks; i++) { 2438 if (!adev->ip_blocks[i].status.hw) 2439 continue; 2440 if (adev->ip_blocks[i].version->funcs->late_init) { 2441 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2442 if (r) { 2443 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2444 adev->ip_blocks[i].version->funcs->name, r); 2445 return r; 2446 } 2447 } 2448 adev->ip_blocks[i].status.late_initialized = true; 2449 } 2450 2451 amdgpu_ras_set_error_query_ready(adev, true); 2452 2453 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2454 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2455 2456 amdgpu_device_fill_reset_magic(adev); 2457 2458 r = amdgpu_device_enable_mgpu_fan_boost(); 2459 if (r) 2460 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2461 2462 2463 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2464 mutex_lock(&mgpu_info.mutex); 2465 2466 /* 2467 * Reset device p-state to low as this was booted with high. 2468 * 2469 * This should be performed only after all devices from the same 2470 * hive get initialized. 2471 * 2472 * However, it's not known in advance how many devices are in the hive, 2473 * as this is counted one by one during device initialization.
2474 * 2475 * So, we wait for all XGMI interlinked devices initialized. 2476 * This may bring some delays as those devices may come from 2477 * different hives. But that should be OK. 2478 */ 2479 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2480 for (i = 0; i < mgpu_info.num_gpu; i++) { 2481 gpu_instance = &(mgpu_info.gpu_ins[i]); 2482 if (gpu_instance->adev->flags & AMD_IS_APU) 2483 continue; 2484 2485 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2486 AMDGPU_XGMI_PSTATE_MIN); 2487 if (r) { 2488 DRM_ERROR("pstate setting failed (%d).\n", r); 2489 break; 2490 } 2491 } 2492 } 2493 2494 mutex_unlock(&mgpu_info.mutex); 2495 } 2496 2497 return 0; 2498 } 2499 2500 /** 2501 * amdgpu_device_ip_fini - run fini for hardware IPs 2502 * 2503 * @adev: amdgpu_device pointer 2504 * 2505 * Main teardown pass for hardware IPs. The list of all the hardware 2506 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2507 * are run. hw_fini tears down the hardware associated with each IP 2508 * and sw_fini tears down any software state associated with each IP. 2509 * Returns 0 on success, negative error code on failure. 2510 */ 2511 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2512 { 2513 int i, r; 2514 2515 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2516 amdgpu_virt_release_ras_err_handler_data(adev); 2517 2518 amdgpu_ras_pre_fini(adev); 2519 2520 if (adev->gmc.xgmi.num_physical_nodes > 1) 2521 amdgpu_xgmi_remove_device(adev); 2522 2523 amdgpu_amdkfd_device_fini(adev); 2524 2525 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2526 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2527 2528 /* need to disable SMC first */ 2529 for (i = 0; i < adev->num_ip_blocks; i++) { 2530 if (!adev->ip_blocks[i].status.hw) 2531 continue; 2532 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2533 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2534 /* XXX handle errors */ 2535 if (r) { 2536 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2537 adev->ip_blocks[i].version->funcs->name, r); 2538 } 2539 adev->ip_blocks[i].status.hw = false; 2540 break; 2541 } 2542 } 2543 2544 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2545 if (!adev->ip_blocks[i].status.hw) 2546 continue; 2547 2548 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2549 /* XXX handle errors */ 2550 if (r) { 2551 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2552 adev->ip_blocks[i].version->funcs->name, r); 2553 } 2554 2555 adev->ip_blocks[i].status.hw = false; 2556 } 2557 2558 2559 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2560 if (!adev->ip_blocks[i].status.sw) 2561 continue; 2562 2563 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2564 amdgpu_ucode_free_bo(adev); 2565 amdgpu_free_static_csa(&adev->virt.csa_obj); 2566 amdgpu_device_wb_fini(adev); 2567 amdgpu_device_vram_scratch_fini(adev); 2568 amdgpu_ib_pool_fini(adev); 2569 } 2570 2571 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2572 /* XXX handle errors */ 2573 if (r) { 2574 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2575 adev->ip_blocks[i].version->funcs->name, r); 2576 } 2577 adev->ip_blocks[i].status.sw = false; 2578 adev->ip_blocks[i].status.valid = false; 2579 } 2580 2581 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2582 if (!adev->ip_blocks[i].status.late_initialized) 2583 continue; 2584 if (adev->ip_blocks[i].version->funcs->late_fini) 2585 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2586 
adev->ip_blocks[i].status.late_initialized = false; 2587 } 2588 2589 amdgpu_ras_fini(adev); 2590 2591 if (amdgpu_sriov_vf(adev)) 2592 if (amdgpu_virt_release_full_gpu(adev, false)) 2593 DRM_ERROR("failed to release exclusive mode on fini\n"); 2594 2595 return 0; 2596 } 2597 2598 /** 2599 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2600 * 2601 * @work: work_struct. 2602 */ 2603 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2604 { 2605 struct amdgpu_device *adev = 2606 container_of(work, struct amdgpu_device, delayed_init_work.work); 2607 int r; 2608 2609 r = amdgpu_ib_ring_tests(adev); 2610 if (r) 2611 DRM_ERROR("ib ring test failed (%d).\n", r); 2612 } 2613 2614 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2615 { 2616 struct amdgpu_device *adev = 2617 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2618 2619 mutex_lock(&adev->gfx.gfx_off_mutex); 2620 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2621 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2622 adev->gfx.gfx_off_state = true; 2623 } 2624 mutex_unlock(&adev->gfx.gfx_off_mutex); 2625 } 2626 2627 /** 2628 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2629 * 2630 * @adev: amdgpu_device pointer 2631 * 2632 * Main suspend function for hardware IPs. The list of all the hardware 2633 * IPs that make up the asic is walked, clockgating is disabled and the 2634 * suspend callbacks are run. suspend puts the hardware and software state 2635 * in each IP into a state suitable for suspend. 2636 * Returns 0 on success, negative error code on failure. 2637 */ 2638 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2639 { 2640 int i, r; 2641 2642 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2643 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2644 2645 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2646 if (!adev->ip_blocks[i].status.valid) 2647 continue; 2648 2649 /* displays are handled separately */ 2650 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2651 continue; 2652 2653 /* XXX handle errors */ 2654 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2655 /* XXX handle errors */ 2656 if (r) { 2657 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2658 adev->ip_blocks[i].version->funcs->name, r); 2659 return r; 2660 } 2661 2662 adev->ip_blocks[i].status.hw = false; 2663 } 2664 2665 return 0; 2666 } 2667 2668 /** 2669 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2670 * 2671 * @adev: amdgpu_device pointer 2672 * 2673 * Main suspend function for hardware IPs. The list of all the hardware 2674 * IPs that make up the asic is walked, clockgating is disabled and the 2675 * suspend callbacks are run. suspend puts the hardware and software state 2676 * in each IP into a state suitable for suspend. 2677 * Returns 0 on success, negative error code on failure. 
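 * Phase 2 covers every block except the display (DCE) blocks, which were
 * already suspended in phase 1.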
2678 */ 2679 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2680 { 2681 int i, r; 2682 2683 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2684 if (!adev->ip_blocks[i].status.valid) 2685 continue; 2686 /* displays are handled in phase1 */ 2687 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2688 continue; 2689 /* PSP lost connection when err_event_athub occurs */ 2690 if (amdgpu_ras_intr_triggered() && 2691 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2692 adev->ip_blocks[i].status.hw = false; 2693 continue; 2694 } 2695 /* XXX handle errors */ 2696 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2697 /* XXX handle errors */ 2698 if (r) { 2699 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2700 adev->ip_blocks[i].version->funcs->name, r); 2701 } 2702 adev->ip_blocks[i].status.hw = false; 2703 /* handle putting the SMC in the appropriate state */ 2704 if(!amdgpu_sriov_vf(adev)){ 2705 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2706 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2707 if (r) { 2708 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2709 adev->mp1_state, r); 2710 return r; 2711 } 2712 } 2713 } 2714 adev->ip_blocks[i].status.hw = false; 2715 } 2716 2717 return 0; 2718 } 2719 2720 /** 2721 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2722 * 2723 * @adev: amdgpu_device pointer 2724 * 2725 * Main suspend function for hardware IPs. The list of all the hardware 2726 * IPs that make up the asic is walked, clockgating is disabled and the 2727 * suspend callbacks are run. suspend puts the hardware and software state 2728 * in each IP into a state suitable for suspend. 2729 * Returns 0 on success, negative error code on failure. 2730 */ 2731 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2732 { 2733 int r; 2734 2735 if (amdgpu_sriov_vf(adev)) 2736 amdgpu_virt_request_full_gpu(adev, false); 2737 2738 r = amdgpu_device_ip_suspend_phase1(adev); 2739 if (r) 2740 return r; 2741 r = amdgpu_device_ip_suspend_phase2(adev); 2742 2743 if (amdgpu_sriov_vf(adev)) 2744 amdgpu_virt_release_full_gpu(adev, false); 2745 2746 return r; 2747 } 2748 2749 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2750 { 2751 int i, r; 2752 2753 static enum amd_ip_block_type ip_order[] = { 2754 AMD_IP_BLOCK_TYPE_GMC, 2755 AMD_IP_BLOCK_TYPE_COMMON, 2756 AMD_IP_BLOCK_TYPE_PSP, 2757 AMD_IP_BLOCK_TYPE_IH, 2758 }; 2759 2760 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2761 int j; 2762 struct amdgpu_ip_block *block; 2763 2764 block = &adev->ip_blocks[i]; 2765 block->status.hw = false; 2766 2767 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 2768 2769 if (block->version->type != ip_order[j] || 2770 !block->status.valid) 2771 continue; 2772 2773 r = block->version->funcs->hw_init(adev); 2774 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2775 if (r) 2776 return r; 2777 block->status.hw = true; 2778 } 2779 } 2780 2781 return 0; 2782 } 2783 2784 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2785 { 2786 int i, r; 2787 2788 static enum amd_ip_block_type ip_order[] = { 2789 AMD_IP_BLOCK_TYPE_SMC, 2790 AMD_IP_BLOCK_TYPE_DCE, 2791 AMD_IP_BLOCK_TYPE_GFX, 2792 AMD_IP_BLOCK_TYPE_SDMA, 2793 AMD_IP_BLOCK_TYPE_UVD, 2794 AMD_IP_BLOCK_TYPE_VCE, 2795 AMD_IP_BLOCK_TYPE_VCN 2796 }; 2797 2798 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2799 int j; 2800 struct amdgpu_ip_block *block; 2801 2802 for (j = 0; j < adev->num_ip_blocks; j++) { 
2803 block = &adev->ip_blocks[j]; 2804 2805 if (block->version->type != ip_order[i] || 2806 !block->status.valid || 2807 block->status.hw) 2808 continue; 2809 2810 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 2811 r = block->version->funcs->resume(adev); 2812 else 2813 r = block->version->funcs->hw_init(adev); 2814 2815 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2816 if (r) 2817 return r; 2818 block->status.hw = true; 2819 } 2820 } 2821 2822 return 0; 2823 } 2824 2825 /** 2826 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 2827 * 2828 * @adev: amdgpu_device pointer 2829 * 2830 * First resume function for hardware IPs. The list of all the hardware 2831 * IPs that make up the asic is walked and the resume callbacks are run for 2832 * COMMON, GMC, and IH. resume puts the hardware into a functional state 2833 * after a suspend and updates the software state as necessary. This 2834 * function is also used for restoring the GPU after a GPU reset. 2835 * Returns 0 on success, negative error code on failure. 2836 */ 2837 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 2838 { 2839 int i, r; 2840 2841 for (i = 0; i < adev->num_ip_blocks; i++) { 2842 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2843 continue; 2844 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2845 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2846 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2847 2848 r = adev->ip_blocks[i].version->funcs->resume(adev); 2849 if (r) { 2850 DRM_ERROR("resume of IP block <%s> failed %d\n", 2851 adev->ip_blocks[i].version->funcs->name, r); 2852 return r; 2853 } 2854 adev->ip_blocks[i].status.hw = true; 2855 } 2856 } 2857 2858 return 0; 2859 } 2860 2861 /** 2862 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 2863 * 2864 * @adev: amdgpu_device pointer 2865 * 2866 * First resume function for hardware IPs. The list of all the hardware 2867 * IPs that make up the asic is walked and the resume callbacks are run for 2868 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 2869 * functional state after a suspend and updates the software state as 2870 * necessary. This function is also used for restoring the GPU after a GPU 2871 * reset. 2872 * Returns 0 on success, negative error code on failure. 2873 */ 2874 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 2875 { 2876 int i, r; 2877 2878 for (i = 0; i < adev->num_ip_blocks; i++) { 2879 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2880 continue; 2881 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2882 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2883 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 2884 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 2885 continue; 2886 r = adev->ip_blocks[i].version->funcs->resume(adev); 2887 if (r) { 2888 DRM_ERROR("resume of IP block <%s> failed %d\n", 2889 adev->ip_blocks[i].version->funcs->name, r); 2890 return r; 2891 } 2892 adev->ip_blocks[i].status.hw = true; 2893 } 2894 2895 return 0; 2896 } 2897 2898 /** 2899 * amdgpu_device_ip_resume - run resume for hardware IPs 2900 * 2901 * @adev: amdgpu_device pointer 2902 * 2903 * Main resume function for hardware IPs. 
The hardware IPs 2904 * are split into two resume functions because they are 2905 * also used in recovering from a GPU reset, and some additional 2906 * steps need to be taken between them. In this case (S3/S4) they are 2907 * run sequentially. 2908 * Returns 0 on success, negative error code on failure. 2909 */ 2910 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 2911 { 2912 int r; 2913 2914 r = amdgpu_device_ip_resume_phase1(adev); 2915 if (r) 2916 return r; 2917 2918 r = amdgpu_device_fw_loading(adev); 2919 if (r) 2920 return r; 2921 2922 r = amdgpu_device_ip_resume_phase2(adev); 2923 2924 return r; 2925 } 2926 2927 /** 2928 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 2929 * 2930 * @adev: amdgpu_device pointer 2931 * 2932 * Query the VBIOS data tables to determine if the board supports SR-IOV. 2933 */ 2934 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 2935 { 2936 if (amdgpu_sriov_vf(adev)) { 2937 if (adev->is_atom_fw) { 2938 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev)) 2939 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2940 } else { 2941 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 2942 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2943 } 2944 2945 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 2946 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 2947 } 2948 } 2949 2950 /** 2951 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 2952 * 2953 * @asic_type: AMD asic type 2954 * 2955 * Check if there is DC (new modesetting infrastructure) support for an asic. 2956 * Returns true if DC has support, false if not. 2957 */ 2958 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 2959 { 2960 switch (asic_type) { 2961 #if defined(CONFIG_DRM_AMD_DC) 2962 #if defined(CONFIG_DRM_AMD_DC_SI) 2963 case CHIP_TAHITI: 2964 case CHIP_PITCAIRN: 2965 case CHIP_VERDE: 2966 case CHIP_OLAND: 2967 #endif 2968 case CHIP_BONAIRE: 2969 case CHIP_KAVERI: 2970 case CHIP_KABINI: 2971 case CHIP_MULLINS: 2972 /* 2973 * We have systems in the wild with these ASICs that require 2974 * LVDS and VGA support which is not supported with DC. 2975 * 2976 * Fall back to the non-DC driver here by default so as not to 2977 * cause regressions.
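 * (DC can still be requested explicitly on these ASICs with the amdgpu_dc
 * module parameter, i.e. amdgpu.dc=1, which is what the amdgpu_dc > 0 check
 * below honors.)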
2978 */ 2979 return amdgpu_dc > 0; 2980 case CHIP_HAWAII: 2981 case CHIP_CARRIZO: 2982 case CHIP_STONEY: 2983 case CHIP_POLARIS10: 2984 case CHIP_POLARIS11: 2985 case CHIP_POLARIS12: 2986 case CHIP_VEGAM: 2987 case CHIP_TONGA: 2988 case CHIP_FIJI: 2989 case CHIP_VEGA10: 2990 case CHIP_VEGA12: 2991 case CHIP_VEGA20: 2992 #if defined(CONFIG_DRM_AMD_DC_DCN) 2993 case CHIP_RAVEN: 2994 case CHIP_NAVI10: 2995 case CHIP_NAVI14: 2996 case CHIP_NAVI12: 2997 case CHIP_RENOIR: 2998 #endif 2999 #if defined(CONFIG_DRM_AMD_DC_DCN3_0) 3000 case CHIP_SIENNA_CICHLID: 3001 case CHIP_NAVY_FLOUNDER: 3002 #endif 3003 return amdgpu_dc != 0; 3004 #endif 3005 default: 3006 if (amdgpu_dc > 0) 3007 DRM_INFO("Display Core has been requested via kernel parameter " 3008 "but isn't supported by ASIC, ignoring\n"); 3009 return false; 3010 } 3011 } 3012 3013 /** 3014 * amdgpu_device_has_dc_support - check if dc is supported 3015 * 3016 * @adev: amdgpu_device pointer 3017 * 3018 * Returns true for supported, false for not supported 3019 */ 3020 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3021 { 3022 if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display) 3023 return false; 3024 3025 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3026 } 3027 3028 3029 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3030 { 3031 struct amdgpu_device *adev = 3032 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3033 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3034 3035 /* It's a bug to not have a hive within this function */ 3036 if (WARN_ON(!hive)) 3037 return; 3038 3039 /* 3040 * Use task barrier to synchronize all xgmi reset works across the 3041 * hive. task_barrier_enter and task_barrier_exit will block 3042 * until all the threads running the xgmi reset works reach 3043 * those points. task_barrier_full will do both blocks. 3044 */ 3045 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3046 3047 task_barrier_enter(&hive->tb); 3048 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3049 3050 if (adev->asic_reset_res) 3051 goto fail; 3052 3053 task_barrier_exit(&hive->tb); 3054 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3055 3056 if (adev->asic_reset_res) 3057 goto fail; 3058 3059 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count) 3060 adev->mmhub.funcs->reset_ras_error_count(adev); 3061 } else { 3062 3063 task_barrier_full(&hive->tb); 3064 adev->asic_reset_res = amdgpu_asic_reset(adev); 3065 } 3066 3067 fail: 3068 if (adev->asic_reset_res) 3069 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3070 adev->asic_reset_res, adev_to_drm(adev)->unique); 3071 amdgpu_put_xgmi_hive(hive); 3072 } 3073 3074 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3075 { 3076 char *input = amdgpu_lockup_timeout; 3077 char *timeout_setting = NULL; 3078 int index = 0; 3079 long timeout; 3080 int ret = 0; 3081 3082 /* 3083 * By default timeout for non compute jobs is 10000. 3084 * And there is no timeout enforced on compute jobs. 3085 * In SR-IOV or passthrough mode, timeout for compute 3086 * jobs are 60000 by default. 
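 * These defaults can be overridden with the amdgpu_lockup_timeout module
 * parameter, a comma separated list applied in the order gfx, compute,
 * sdma, video (values in ms); e.g. lockup_timeout=10000,60000,10000,10000
 * sets all four, a 0 entry keeps that default and a negative entry disables
 * the timeout. A single value applies to all non-compute jobs.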
3087 */ 3088 adev->gfx_timeout = msecs_to_jiffies(10000); 3089 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3090 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3091 adev->compute_timeout = msecs_to_jiffies(60000); 3092 else 3093 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; 3094 3095 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3096 while ((timeout_setting = strsep(&input, ",")) && 3097 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3098 ret = kstrtol(timeout_setting, 0, &timeout); 3099 if (ret) 3100 return ret; 3101 3102 if (timeout == 0) { 3103 index++; 3104 continue; 3105 } else if (timeout < 0) { 3106 timeout = MAX_SCHEDULE_TIMEOUT; 3107 } else { 3108 timeout = msecs_to_jiffies(timeout); 3109 } 3110 3111 switch (index++) { 3112 case 0: 3113 adev->gfx_timeout = timeout; 3114 break; 3115 case 1: 3116 adev->compute_timeout = timeout; 3117 break; 3118 case 2: 3119 adev->sdma_timeout = timeout; 3120 break; 3121 case 3: 3122 adev->video_timeout = timeout; 3123 break; 3124 default: 3125 break; 3126 } 3127 } 3128 /* 3129 * There is only one value specified and 3130 * it should apply to all non-compute jobs. 3131 */ 3132 if (index == 1) { 3133 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3134 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3135 adev->compute_timeout = adev->gfx_timeout; 3136 } 3137 } 3138 3139 return ret; 3140 } 3141 3142 static const struct attribute *amdgpu_dev_attributes[] = { 3143 &dev_attr_product_name.attr, 3144 &dev_attr_product_number.attr, 3145 &dev_attr_serial_number.attr, 3146 &dev_attr_pcie_replay_count.attr, 3147 NULL 3148 }; 3149 3150 3151 /** 3152 * amdgpu_device_init - initialize the driver 3153 * 3154 * @adev: amdgpu_device pointer 3155 * @flags: driver flags 3156 * 3157 * Initializes the driver info and hw (all asics). 3158 * Returns 0 for success or an error on failure. 3159 * Called at driver startup. 
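 * Maps the MMIO and doorbell BARs, runs early/sw/hw init for the IP blocks,
 * posts the card if necessary and finally schedules delayed_init_work so the
 * IB ring tests run AMDGPU_RESUME_MS after init completes.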
3160 */ 3161 int amdgpu_device_init(struct amdgpu_device *adev, 3162 uint32_t flags) 3163 { 3164 struct drm_device *ddev = adev_to_drm(adev); 3165 struct pci_dev *pdev = adev->pdev; 3166 int r, i; 3167 bool boco = false; 3168 u32 max_MBps; 3169 3170 adev->shutdown = false; 3171 adev->flags = flags; 3172 3173 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3174 adev->asic_type = amdgpu_force_asic_type; 3175 else 3176 adev->asic_type = flags & AMD_ASIC_MASK; 3177 3178 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3179 if (amdgpu_emu_mode == 1) 3180 adev->usec_timeout *= 10; 3181 adev->gmc.gart_size = 512 * 1024 * 1024; 3182 adev->accel_working = false; 3183 adev->num_rings = 0; 3184 adev->mman.buffer_funcs = NULL; 3185 adev->mman.buffer_funcs_ring = NULL; 3186 adev->vm_manager.vm_pte_funcs = NULL; 3187 adev->vm_manager.vm_pte_num_scheds = 0; 3188 adev->gmc.gmc_funcs = NULL; 3189 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3190 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3191 3192 adev->smc_rreg = &amdgpu_invalid_rreg; 3193 adev->smc_wreg = &amdgpu_invalid_wreg; 3194 adev->pcie_rreg = &amdgpu_invalid_rreg; 3195 adev->pcie_wreg = &amdgpu_invalid_wreg; 3196 adev->pciep_rreg = &amdgpu_invalid_rreg; 3197 adev->pciep_wreg = &amdgpu_invalid_wreg; 3198 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3199 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3200 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3201 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3202 adev->didt_rreg = &amdgpu_invalid_rreg; 3203 adev->didt_wreg = &amdgpu_invalid_wreg; 3204 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3205 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3206 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3207 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3208 3209 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3210 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3211 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3212 3213 /* mutex initialization are all done here so we 3214 * can recall function without having locking issues */ 3215 atomic_set(&adev->irq.ih.lock, 0); 3216 mutex_init(&adev->firmware.mutex); 3217 mutex_init(&adev->pm.mutex); 3218 mutex_init(&adev->gfx.gpu_clock_mutex); 3219 mutex_init(&adev->srbm_mutex); 3220 mutex_init(&adev->gfx.pipe_reserve_mutex); 3221 mutex_init(&adev->gfx.gfx_off_mutex); 3222 mutex_init(&adev->grbm_idx_mutex); 3223 mutex_init(&adev->mn_lock); 3224 mutex_init(&adev->virt.vf_errors.lock); 3225 hash_init(adev->mn_hash); 3226 atomic_set(&adev->in_gpu_reset, 0); 3227 init_rwsem(&adev->reset_sem); 3228 mutex_init(&adev->psp.mutex); 3229 mutex_init(&adev->notifier_lock); 3230 3231 r = amdgpu_device_check_arguments(adev); 3232 if (r) 3233 return r; 3234 3235 spin_lock_init(&adev->mmio_idx_lock); 3236 spin_lock_init(&adev->smc_idx_lock); 3237 spin_lock_init(&adev->pcie_idx_lock); 3238 spin_lock_init(&adev->uvd_ctx_idx_lock); 3239 spin_lock_init(&adev->didt_idx_lock); 3240 spin_lock_init(&adev->gc_cac_idx_lock); 3241 spin_lock_init(&adev->se_cac_idx_lock); 3242 spin_lock_init(&adev->audio_endpt_idx_lock); 3243 spin_lock_init(&adev->mm_stats.lock); 3244 3245 INIT_LIST_HEAD(&adev->shadow_list); 3246 mutex_init(&adev->shadow_list_lock); 3247 3248 INIT_DELAYED_WORK(&adev->delayed_init_work, 3249 amdgpu_device_delayed_init_work_handler); 3250 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3251 amdgpu_device_delay_enable_gfx_off); 3252 3253 
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3254 3255 adev->gfx.gfx_off_req_count = 1; 3256 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3257 3258 atomic_set(&adev->throttling_logging_enabled, 1); 3259 /* 3260 * If throttling continues, logging will be performed every minute 3261 * to avoid log flooding. "-1" is subtracted since the thermal 3262 * throttling interrupt comes every second. Thus, the total logging 3263 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3264 * for throttling interrupt) = 60 seconds. 3265 */ 3266 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3267 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3268 3269 /* Registers mapping */ 3270 /* TODO: block userspace mapping of io register */ 3271 if (adev->asic_type >= CHIP_BONAIRE) { 3272 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3273 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3274 } else { 3275 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3276 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3277 } 3278 3279 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3280 if (adev->rmmio == NULL) { 3281 return -ENOMEM; 3282 } 3283 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3284 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3285 3286 /* io port mapping */ 3287 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { 3288 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) { 3289 adev->rio_mem_size = pci_resource_len(adev->pdev, i); 3290 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size); 3291 break; 3292 } 3293 } 3294 if (adev->rio_mem == NULL) 3295 DRM_INFO("PCI I/O BAR is not found.\n"); 3296 3297 /* enable PCIE atomic ops */ 3298 r = pci_enable_atomic_ops_to_root(adev->pdev, 3299 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3300 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3301 if (r) { 3302 adev->have_atomics_support = false; 3303 DRM_INFO("PCIE atomic ops is not supported\n"); 3304 } else { 3305 adev->have_atomics_support = true; 3306 } 3307 3308 amdgpu_device_get_pcie_info(adev); 3309 3310 if (amdgpu_mcbp) 3311 DRM_INFO("MCBP is enabled\n"); 3312 3313 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3314 adev->enable_mes = true; 3315 3316 /* detect hw virtualization here */ 3317 amdgpu_detect_virtualization(adev); 3318 3319 r = amdgpu_device_get_job_timeout_settings(adev); 3320 if (r) { 3321 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3322 goto failed_unmap; 3323 } 3324 3325 /* early init functions */ 3326 r = amdgpu_device_ip_early_init(adev); 3327 if (r) 3328 goto failed_unmap; 3329 3330 /* doorbell bar mapping and doorbell index init*/ 3331 amdgpu_device_doorbell_init(adev); 3332 3333 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3334 /* this will fail for cards that aren't VGA class devices, just 3335 * ignore it */ 3336 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); 3337 3338 if (amdgpu_device_supports_boco(ddev)) 3339 boco = true; 3340 if (amdgpu_has_atpx() && 3341 (amdgpu_is_atpx_hybrid() || 3342 amdgpu_has_atpx_dgpu_power_cntl()) && 3343 !pci_is_thunderbolt_attached(adev->pdev)) 3344 vga_switcheroo_register_client(adev->pdev, 3345 &amdgpu_switcheroo_ops, boco); 3346 if (boco) 3347 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3348 3349 if (amdgpu_emu_mode == 1) { 3350 /* post the asic on emulation mode */ 3351 emu_soc_asic_init(adev); 3352 goto 
fence_driver_init; 3353 } 3354 3355 /* detect if we are with an SRIOV vbios */ 3356 amdgpu_device_detect_sriov_bios(adev); 3357 3358 /* check if we need to reset the asic 3359 * E.g., driver was not cleanly unloaded previously, etc. 3360 */ 3361 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3362 r = amdgpu_asic_reset(adev); 3363 if (r) { 3364 dev_err(adev->dev, "asic reset on init failed\n"); 3365 goto failed; 3366 } 3367 } 3368 3369 pci_enable_pcie_error_reporting(adev->ddev.pdev); 3370 3371 /* Post card if necessary */ 3372 if (amdgpu_device_need_post(adev)) { 3373 if (!adev->bios) { 3374 dev_err(adev->dev, "no vBIOS found\n"); 3375 r = -EINVAL; 3376 goto failed; 3377 } 3378 DRM_INFO("GPU posting now...\n"); 3379 r = amdgpu_device_asic_init(adev); 3380 if (r) { 3381 dev_err(adev->dev, "gpu post error!\n"); 3382 goto failed; 3383 } 3384 } 3385 3386 if (adev->is_atom_fw) { 3387 /* Initialize clocks */ 3388 r = amdgpu_atomfirmware_get_clock_info(adev); 3389 if (r) { 3390 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3391 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3392 goto failed; 3393 } 3394 } else { 3395 /* Initialize clocks */ 3396 r = amdgpu_atombios_get_clock_info(adev); 3397 if (r) { 3398 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3399 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3400 goto failed; 3401 } 3402 /* init i2c buses */ 3403 if (!amdgpu_device_has_dc_support(adev)) 3404 amdgpu_atombios_i2c_init(adev); 3405 } 3406 3407 fence_driver_init: 3408 /* Fence driver */ 3409 r = amdgpu_fence_driver_init(adev); 3410 if (r) { 3411 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3412 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3413 goto failed; 3414 } 3415 3416 /* init the mode config */ 3417 drm_mode_config_init(adev_to_drm(adev)); 3418 3419 r = amdgpu_device_ip_init(adev); 3420 if (r) { 3421 /* failed in exclusive mode due to timeout */ 3422 if (amdgpu_sriov_vf(adev) && 3423 !amdgpu_sriov_runtime(adev) && 3424 amdgpu_virt_mmio_blocked(adev) && 3425 !amdgpu_virt_wait_reset(adev)) { 3426 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3427 /* Don't send request since VF is inactive. */ 3428 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3429 adev->virt.ops = NULL; 3430 r = -EAGAIN; 3431 goto failed; 3432 } 3433 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3434 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3435 goto failed; 3436 } 3437 3438 dev_info(adev->dev, 3439 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3440 adev->gfx.config.max_shader_engines, 3441 adev->gfx.config.max_sh_per_se, 3442 adev->gfx.config.max_cu_per_sh, 3443 adev->gfx.cu_info.number); 3444 3445 adev->accel_working = true; 3446 3447 amdgpu_vm_check_compute_bug(adev); 3448 3449 /* Initialize the buffer migration limit. */ 3450 if (amdgpu_moverate >= 0) 3451 max_MBps = amdgpu_moverate; 3452 else 3453 max_MBps = 8; /* Allow 8 MB/s. */ 3454 /* Get a log2 for easy divisions. 
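With the default of 8 MB/s this gives log2_max_MBps = ilog2(8) = 3.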
*/ 3455 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3456 3457 amdgpu_fbdev_init(adev); 3458 3459 r = amdgpu_pm_sysfs_init(adev); 3460 if (r) { 3461 adev->pm_sysfs_en = false; 3462 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3463 } else 3464 adev->pm_sysfs_en = true; 3465 3466 r = amdgpu_ucode_sysfs_init(adev); 3467 if (r) { 3468 adev->ucode_sysfs_en = false; 3469 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3470 } else 3471 adev->ucode_sysfs_en = true; 3472 3473 if ((amdgpu_testing & 1)) { 3474 if (adev->accel_working) 3475 amdgpu_test_moves(adev); 3476 else 3477 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3478 } 3479 if (amdgpu_benchmarking) { 3480 if (adev->accel_working) 3481 amdgpu_benchmark(adev, amdgpu_benchmarking); 3482 else 3483 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3484 } 3485 3486 /* 3487 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3488 * Otherwise the mgpu fan boost feature will be skipped due to the 3489 * gpu instance is counted less. 3490 */ 3491 amdgpu_register_gpu_instance(adev); 3492 3493 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3494 * explicit gating rather than handling it automatically. 3495 */ 3496 r = amdgpu_device_ip_late_init(adev); 3497 if (r) { 3498 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3499 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3500 goto failed; 3501 } 3502 3503 /* must succeed. */ 3504 amdgpu_ras_resume(adev); 3505 3506 queue_delayed_work(system_wq, &adev->delayed_init_work, 3507 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3508 3509 if (amdgpu_sriov_vf(adev)) 3510 flush_delayed_work(&adev->delayed_init_work); 3511 3512 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3513 if (r) 3514 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3515 3516 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3517 r = amdgpu_pmu_init(adev); 3518 if (r) 3519 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3520 3521 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3522 if (amdgpu_device_cache_pci_state(adev->pdev)) 3523 pci_restore_state(pdev); 3524 3525 return 0; 3526 3527 failed: 3528 amdgpu_vf_error_trans_all(adev); 3529 if (boco) 3530 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3531 3532 failed_unmap: 3533 iounmap(adev->rmmio); 3534 adev->rmmio = NULL; 3535 3536 return r; 3537 } 3538 3539 /** 3540 * amdgpu_device_fini - tear down the driver 3541 * 3542 * @adev: amdgpu_device pointer 3543 * 3544 * Tear down the driver info (all asics). 3545 * Called at driver shutdown. 
3546 */ 3547 void amdgpu_device_fini(struct amdgpu_device *adev) 3548 { 3549 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3550 flush_delayed_work(&adev->delayed_init_work); 3551 adev->shutdown = true; 3552 3553 kfree(adev->pci_state); 3554 3555 /* make sure IB test finished before entering exclusive mode 3556 * to avoid preemption on IB test 3557 * */ 3558 if (amdgpu_sriov_vf(adev)) { 3559 amdgpu_virt_request_full_gpu(adev, false); 3560 amdgpu_virt_fini_data_exchange(adev); 3561 } 3562 3563 /* disable all interrupts */ 3564 amdgpu_irq_disable_all(adev); 3565 if (adev->mode_info.mode_config_initialized){ 3566 if (!amdgpu_device_has_dc_support(adev)) 3567 drm_helper_force_disable_all(adev_to_drm(adev)); 3568 else 3569 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3570 } 3571 amdgpu_fence_driver_fini(adev); 3572 if (adev->pm_sysfs_en) 3573 amdgpu_pm_sysfs_fini(adev); 3574 amdgpu_fbdev_fini(adev); 3575 amdgpu_device_ip_fini(adev); 3576 release_firmware(adev->firmware.gpu_info_fw); 3577 adev->firmware.gpu_info_fw = NULL; 3578 adev->accel_working = false; 3579 /* free i2c buses */ 3580 if (!amdgpu_device_has_dc_support(adev)) 3581 amdgpu_i2c_fini(adev); 3582 3583 if (amdgpu_emu_mode != 1) 3584 amdgpu_atombios_fini(adev); 3585 3586 kfree(adev->bios); 3587 adev->bios = NULL; 3588 if (amdgpu_has_atpx() && 3589 (amdgpu_is_atpx_hybrid() || 3590 amdgpu_has_atpx_dgpu_power_cntl()) && 3591 !pci_is_thunderbolt_attached(adev->pdev)) 3592 vga_switcheroo_unregister_client(adev->pdev); 3593 if (amdgpu_device_supports_boco(adev_to_drm(adev))) 3594 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3595 vga_client_register(adev->pdev, NULL, NULL, NULL); 3596 if (adev->rio_mem) 3597 pci_iounmap(adev->pdev, adev->rio_mem); 3598 adev->rio_mem = NULL; 3599 iounmap(adev->rmmio); 3600 adev->rmmio = NULL; 3601 amdgpu_device_doorbell_fini(adev); 3602 3603 if (adev->ucode_sysfs_en) 3604 amdgpu_ucode_sysfs_fini(adev); 3605 3606 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3607 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3608 amdgpu_pmu_fini(adev); 3609 if (adev->mman.discovery_bin) 3610 amdgpu_discovery_fini(adev); 3611 } 3612 3613 3614 /* 3615 * Suspend & resume. 3616 */ 3617 /** 3618 * amdgpu_device_suspend - initiate device suspend 3619 * 3620 * @dev: drm dev pointer 3621 * @fbcon : notify the fbdev of suspend 3622 * 3623 * Puts the hw in the suspend state (all asics). 3624 * Returns 0 for success or an error on failure. 3625 * Called at driver suspend. 
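 * For non-DC asics the displays are turned off and the framebuffers and
 * cursors are unpinned first; VRAM is then evicted twice, once after the
 * phase 1 IP suspend and once after phase 2, the second pass pushing out
 * the GART page table with the CPU.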
3626 */ 3627 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 3628 { 3629 struct amdgpu_device *adev; 3630 struct drm_crtc *crtc; 3631 struct drm_connector *connector; 3632 struct drm_connector_list_iter iter; 3633 int r; 3634 3635 adev = drm_to_adev(dev); 3636 3637 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3638 return 0; 3639 3640 adev->in_suspend = true; 3641 drm_kms_helper_poll_disable(dev); 3642 3643 if (fbcon) 3644 amdgpu_fbdev_set_suspend(adev, 1); 3645 3646 cancel_delayed_work_sync(&adev->delayed_init_work); 3647 3648 if (!amdgpu_device_has_dc_support(adev)) { 3649 /* turn off display hw */ 3650 drm_modeset_lock_all(dev); 3651 drm_connector_list_iter_begin(dev, &iter); 3652 drm_for_each_connector_iter(connector, &iter) 3653 drm_helper_connector_dpms(connector, 3654 DRM_MODE_DPMS_OFF); 3655 drm_connector_list_iter_end(&iter); 3656 drm_modeset_unlock_all(dev); 3657 /* unpin the front buffers and cursors */ 3658 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3659 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3660 struct drm_framebuffer *fb = crtc->primary->fb; 3661 struct amdgpu_bo *robj; 3662 3663 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3664 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3665 r = amdgpu_bo_reserve(aobj, true); 3666 if (r == 0) { 3667 amdgpu_bo_unpin(aobj); 3668 amdgpu_bo_unreserve(aobj); 3669 } 3670 } 3671 3672 if (fb == NULL || fb->obj[0] == NULL) { 3673 continue; 3674 } 3675 robj = gem_to_amdgpu_bo(fb->obj[0]); 3676 /* don't unpin kernel fb objects */ 3677 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) { 3678 r = amdgpu_bo_reserve(robj, true); 3679 if (r == 0) { 3680 amdgpu_bo_unpin(robj); 3681 amdgpu_bo_unreserve(robj); 3682 } 3683 } 3684 } 3685 } 3686 3687 amdgpu_ras_suspend(adev); 3688 3689 r = amdgpu_device_ip_suspend_phase1(adev); 3690 3691 amdgpu_amdkfd_suspend(adev, !fbcon); 3692 3693 /* evict vram memory */ 3694 amdgpu_bo_evict_vram(adev); 3695 3696 amdgpu_fence_driver_suspend(adev); 3697 3698 r = amdgpu_device_ip_suspend_phase2(adev); 3699 3700 /* evict remaining vram memory 3701 * This second call to evict vram is to evict the gart page table 3702 * using the CPU. 3703 */ 3704 amdgpu_bo_evict_vram(adev); 3705 3706 return 0; 3707 } 3708 3709 /** 3710 * amdgpu_device_resume - initiate device resume 3711 * 3712 * @dev: drm dev pointer 3713 * @fbcon : notify the fbdev of resume 3714 * 3715 * Bring the hw back to operating state (all asics). 3716 * Returns 0 for success or an error on failure. 3717 * Called at driver resume. 
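 * Re-posts the card if required, resumes the IP blocks, re-pins the cursor
 * BOs for non-DC asics and waits for the IB ring tests to finish before
 * restoring the display state and the fbdev.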
3718 */ 3719 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3720 { 3721 struct drm_connector *connector; 3722 struct drm_connector_list_iter iter; 3723 struct amdgpu_device *adev = drm_to_adev(dev); 3724 struct drm_crtc *crtc; 3725 int r = 0; 3726 3727 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3728 return 0; 3729 3730 /* post card */ 3731 if (amdgpu_device_need_post(adev)) { 3732 r = amdgpu_device_asic_init(adev); 3733 if (r) 3734 dev_err(adev->dev, "amdgpu asic init failed\n"); 3735 } 3736 3737 r = amdgpu_device_ip_resume(adev); 3738 if (r) { 3739 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 3740 return r; 3741 } 3742 amdgpu_fence_driver_resume(adev); 3743 3744 3745 r = amdgpu_device_ip_late_init(adev); 3746 if (r) 3747 return r; 3748 3749 queue_delayed_work(system_wq, &adev->delayed_init_work, 3750 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3751 3752 if (!amdgpu_device_has_dc_support(adev)) { 3753 /* pin cursors */ 3754 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3755 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3756 3757 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3758 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3759 r = amdgpu_bo_reserve(aobj, true); 3760 if (r == 0) { 3761 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); 3762 if (r != 0) 3763 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r); 3764 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); 3765 amdgpu_bo_unreserve(aobj); 3766 } 3767 } 3768 } 3769 } 3770 r = amdgpu_amdkfd_resume(adev, !fbcon); 3771 if (r) 3772 return r; 3773 3774 /* Make sure IB tests flushed */ 3775 flush_delayed_work(&adev->delayed_init_work); 3776 3777 /* blat the mode back in */ 3778 if (fbcon) { 3779 if (!amdgpu_device_has_dc_support(adev)) { 3780 /* pre DCE11 */ 3781 drm_helper_resume_force_mode(dev); 3782 3783 /* turn on display hw */ 3784 drm_modeset_lock_all(dev); 3785 3786 drm_connector_list_iter_begin(dev, &iter); 3787 drm_for_each_connector_iter(connector, &iter) 3788 drm_helper_connector_dpms(connector, 3789 DRM_MODE_DPMS_ON); 3790 drm_connector_list_iter_end(&iter); 3791 3792 drm_modeset_unlock_all(dev); 3793 } 3794 amdgpu_fbdev_set_suspend(adev, 0); 3795 } 3796 3797 drm_kms_helper_poll_enable(dev); 3798 3799 amdgpu_ras_resume(adev); 3800 3801 /* 3802 * Most of the connector probing functions try to acquire runtime pm 3803 * refs to ensure that the GPU is powered on when connector polling is 3804 * performed. Since we're calling this from a runtime PM callback, 3805 * trying to acquire rpm refs will cause us to deadlock. 3806 * 3807 * Since we're guaranteed to be holding the rpm lock, it's safe to 3808 * temporarily disable the rpm helpers so this doesn't deadlock us. 3809 */ 3810 #ifdef CONFIG_PM 3811 dev->dev->power.disable_depth++; 3812 #endif 3813 if (!amdgpu_device_has_dc_support(adev)) 3814 drm_helper_hpd_irq_event(dev); 3815 else 3816 drm_kms_helper_hotplug_event(dev); 3817 #ifdef CONFIG_PM 3818 dev->dev->power.disable_depth--; 3819 #endif 3820 adev->in_suspend = false; 3821 3822 return 0; 3823 } 3824 3825 /** 3826 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3827 * 3828 * @adev: amdgpu_device pointer 3829 * 3830 * The list of all the hardware IPs that make up the asic is walked and 3831 * the check_soft_reset callbacks are run. check_soft_reset determines 3832 * if the asic is still hung or not. 3833 * Returns true if any of the IPs are still in a hung state, false if not. 
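 * Under SR-IOV, or when the ASIC reports that it needs a full reset, this
 * always returns true so that the full recovery path is taken.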
3834 */ 3835 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3836 { 3837 int i; 3838 bool asic_hang = false; 3839 3840 if (amdgpu_sriov_vf(adev)) 3841 return true; 3842 3843 if (amdgpu_asic_need_full_reset(adev)) 3844 return true; 3845 3846 for (i = 0; i < adev->num_ip_blocks; i++) { 3847 if (!adev->ip_blocks[i].status.valid) 3848 continue; 3849 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3850 adev->ip_blocks[i].status.hang = 3851 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3852 if (adev->ip_blocks[i].status.hang) { 3853 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3854 asic_hang = true; 3855 } 3856 } 3857 return asic_hang; 3858 } 3859 3860 /** 3861 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3862 * 3863 * @adev: amdgpu_device pointer 3864 * 3865 * The list of all the hardware IPs that make up the asic is walked and the 3866 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 3867 * handles any IP specific hardware or software state changes that are 3868 * necessary for a soft reset to succeed. 3869 * Returns 0 on success, negative error code on failure. 3870 */ 3871 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 3872 { 3873 int i, r = 0; 3874 3875 for (i = 0; i < adev->num_ip_blocks; i++) { 3876 if (!adev->ip_blocks[i].status.valid) 3877 continue; 3878 if (adev->ip_blocks[i].status.hang && 3879 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 3880 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 3881 if (r) 3882 return r; 3883 } 3884 } 3885 3886 return 0; 3887 } 3888 3889 /** 3890 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 3891 * 3892 * @adev: amdgpu_device pointer 3893 * 3894 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 3895 * reset is necessary to recover. 3896 * Returns true if a full asic reset is required, false if not. 3897 */ 3898 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 3899 { 3900 int i; 3901 3902 if (amdgpu_asic_need_full_reset(adev)) 3903 return true; 3904 3905 for (i = 0; i < adev->num_ip_blocks; i++) { 3906 if (!adev->ip_blocks[i].status.valid) 3907 continue; 3908 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 3909 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 3910 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 3911 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 3912 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3913 if (adev->ip_blocks[i].status.hang) { 3914 dev_info(adev->dev, "Some block need full reset!\n"); 3915 return true; 3916 } 3917 } 3918 } 3919 return false; 3920 } 3921 3922 /** 3923 * amdgpu_device_ip_soft_reset - do a soft reset 3924 * 3925 * @adev: amdgpu_device pointer 3926 * 3927 * The list of all the hardware IPs that make up the asic is walked and the 3928 * soft_reset callbacks are run if the block is hung. soft_reset handles any 3929 * IP specific hardware or software state changes that are necessary to soft 3930 * reset the IP. 3931 * Returns 0 on success, negative error code on failure. 
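 * Only IP blocks flagged as hung by amdgpu_device_ip_check_soft_reset() are
 * soft reset here.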
3932 */ 3933 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 3934 { 3935 int i, r = 0; 3936 3937 for (i = 0; i < adev->num_ip_blocks; i++) { 3938 if (!adev->ip_blocks[i].status.valid) 3939 continue; 3940 if (adev->ip_blocks[i].status.hang && 3941 adev->ip_blocks[i].version->funcs->soft_reset) { 3942 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 3943 if (r) 3944 return r; 3945 } 3946 } 3947 3948 return 0; 3949 } 3950 3951 /** 3952 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 3953 * 3954 * @adev: amdgpu_device pointer 3955 * 3956 * The list of all the hardware IPs that make up the asic is walked and the 3957 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 3958 * handles any IP specific hardware or software state changes that are 3959 * necessary after the IP has been soft reset. 3960 * Returns 0 on success, negative error code on failure. 3961 */ 3962 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 3963 { 3964 int i, r = 0; 3965 3966 for (i = 0; i < adev->num_ip_blocks; i++) { 3967 if (!adev->ip_blocks[i].status.valid) 3968 continue; 3969 if (adev->ip_blocks[i].status.hang && 3970 adev->ip_blocks[i].version->funcs->post_soft_reset) 3971 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 3972 if (r) 3973 return r; 3974 } 3975 3976 return 0; 3977 } 3978 3979 /** 3980 * amdgpu_device_recover_vram - Recover some VRAM contents 3981 * 3982 * @adev: amdgpu_device pointer 3983 * 3984 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 3985 * restore things like GPUVM page tables after a GPU reset where 3986 * the contents of VRAM might be lost. 3987 * 3988 * Returns: 3989 * 0 on success, negative error code on failure. 3990 */ 3991 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 3992 { 3993 struct dma_fence *fence = NULL, *next = NULL; 3994 struct amdgpu_bo *shadow; 3995 long r = 1, tmo; 3996 3997 if (amdgpu_sriov_runtime(adev)) 3998 tmo = msecs_to_jiffies(8000); 3999 else 4000 tmo = msecs_to_jiffies(100); 4001 4002 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4003 mutex_lock(&adev->shadow_list_lock); 4004 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { 4005 4006 /* No need to recover an evicted BO */ 4007 if (shadow->tbo.mem.mem_type != TTM_PL_TT || 4008 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET || 4009 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM) 4010 continue; 4011 4012 r = amdgpu_bo_restore_shadow(shadow, &next); 4013 if (r) 4014 break; 4015 4016 if (fence) { 4017 tmo = dma_fence_wait_timeout(fence, false, tmo); 4018 dma_fence_put(fence); 4019 fence = next; 4020 if (tmo == 0) { 4021 r = -ETIMEDOUT; 4022 break; 4023 } else if (tmo < 0) { 4024 r = tmo; 4025 break; 4026 } 4027 } else { 4028 fence = next; 4029 } 4030 } 4031 mutex_unlock(&adev->shadow_list_lock); 4032 4033 if (fence) 4034 tmo = dma_fence_wait_timeout(fence, false, tmo); 4035 dma_fence_put(fence); 4036 4037 if (r < 0 || tmo <= 0) { 4038 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4039 return -EIO; 4040 } 4041 4042 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4043 return 0; 4044 } 4045 4046 4047 /** 4048 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4049 * 4050 * @adev: amdgpu_device pointer 4051 * @from_hypervisor: request from hypervisor 4052 * 4053 * do VF FLR and reinitialize Asic 4054 * return 0 means succeeded otherwise failed 4055 */ 4056 static int 
amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4057 bool from_hypervisor) 4058 { 4059 int r; 4060 4061 if (from_hypervisor) 4062 r = amdgpu_virt_request_full_gpu(adev, true); 4063 else 4064 r = amdgpu_virt_reset_gpu(adev); 4065 if (r) 4066 return r; 4067 4068 amdgpu_amdkfd_pre_reset(adev); 4069 4070 /* Resume IP prior to SMC */ 4071 r = amdgpu_device_ip_reinit_early_sriov(adev); 4072 if (r) 4073 goto error; 4074 4075 amdgpu_virt_init_data_exchange(adev); 4076 /* we need recover gart prior to run SMC/CP/SDMA resume */ 4077 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT)); 4078 4079 r = amdgpu_device_fw_loading(adev); 4080 if (r) 4081 return r; 4082 4083 /* now we are okay to resume SMC/CP/SDMA */ 4084 r = amdgpu_device_ip_reinit_late_sriov(adev); 4085 if (r) 4086 goto error; 4087 4088 amdgpu_irq_gpu_reset_resume_helper(adev); 4089 r = amdgpu_ib_ring_tests(adev); 4090 amdgpu_amdkfd_post_reset(adev); 4091 4092 error: 4093 amdgpu_virt_release_full_gpu(adev, true); 4094 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4095 amdgpu_inc_vram_lost(adev); 4096 r = amdgpu_device_recover_vram(adev); 4097 } 4098 4099 return r; 4100 } 4101 4102 /** 4103 * amdgpu_device_has_job_running - check if there is any job in mirror list 4104 * 4105 * @adev: amdgpu_device pointer 4106 * 4107 * check if there is any job in mirror list 4108 */ 4109 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4110 { 4111 int i; 4112 struct drm_sched_job *job; 4113 4114 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4115 struct amdgpu_ring *ring = adev->rings[i]; 4116 4117 if (!ring || !ring->sched.thread) 4118 continue; 4119 4120 spin_lock(&ring->sched.job_list_lock); 4121 job = list_first_entry_or_null(&ring->sched.ring_mirror_list, 4122 struct drm_sched_job, node); 4123 spin_unlock(&ring->sched.job_list_lock); 4124 if (job) 4125 return true; 4126 } 4127 return false; 4128 } 4129 4130 /** 4131 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4132 * 4133 * @adev: amdgpu_device pointer 4134 * 4135 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4136 * a hung GPU. 
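 * The amdgpu_gpu_recovery module parameter overrides the decision: 0 disables
 * recovery, 1 always allows it, and -1 (auto) allows it only for the asics
 * listed in the function body.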
4137 */ 4138 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4139 { 4140 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4141 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4142 return false; 4143 } 4144 4145 if (amdgpu_gpu_recovery == 0) 4146 goto disabled; 4147 4148 if (amdgpu_sriov_vf(adev)) 4149 return true; 4150 4151 if (amdgpu_gpu_recovery == -1) { 4152 switch (adev->asic_type) { 4153 case CHIP_BONAIRE: 4154 case CHIP_HAWAII: 4155 case CHIP_TOPAZ: 4156 case CHIP_TONGA: 4157 case CHIP_FIJI: 4158 case CHIP_POLARIS10: 4159 case CHIP_POLARIS11: 4160 case CHIP_POLARIS12: 4161 case CHIP_VEGAM: 4162 case CHIP_VEGA20: 4163 case CHIP_VEGA10: 4164 case CHIP_VEGA12: 4165 case CHIP_RAVEN: 4166 case CHIP_ARCTURUS: 4167 case CHIP_RENOIR: 4168 case CHIP_NAVI10: 4169 case CHIP_NAVI14: 4170 case CHIP_NAVI12: 4171 case CHIP_SIENNA_CICHLID: 4172 break; 4173 default: 4174 goto disabled; 4175 } 4176 } 4177 4178 return true; 4179 4180 disabled: 4181 dev_info(adev->dev, "GPU recovery disabled.\n"); 4182 return false; 4183 } 4184 4185 4186 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4187 struct amdgpu_job *job, 4188 bool *need_full_reset_arg) 4189 { 4190 int i, r = 0; 4191 bool need_full_reset = *need_full_reset_arg; 4192 4193 amdgpu_debugfs_wait_dump(adev); 4194 4195 if (amdgpu_sriov_vf(adev)) { 4196 /* stop the data exchange thread */ 4197 amdgpu_virt_fini_data_exchange(adev); 4198 } 4199 4200 /* block all schedulers and reset given job's ring */ 4201 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4202 struct amdgpu_ring *ring = adev->rings[i]; 4203 4204 if (!ring || !ring->sched.thread) 4205 continue; 4206 4207 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4208 amdgpu_fence_driver_force_completion(ring); 4209 } 4210 4211 if(job) 4212 drm_sched_increase_karma(&job->base); 4213 4214 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4215 if (!amdgpu_sriov_vf(adev)) { 4216 4217 if (!need_full_reset) 4218 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4219 4220 if (!need_full_reset) { 4221 amdgpu_device_ip_pre_soft_reset(adev); 4222 r = amdgpu_device_ip_soft_reset(adev); 4223 amdgpu_device_ip_post_soft_reset(adev); 4224 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4225 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4226 need_full_reset = true; 4227 } 4228 } 4229 4230 if (need_full_reset) 4231 r = amdgpu_device_ip_suspend(adev); 4232 4233 *need_full_reset_arg = need_full_reset; 4234 } 4235 4236 return r; 4237 } 4238 4239 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, 4240 struct list_head *device_list_handle, 4241 bool *need_full_reset_arg, 4242 bool skip_hw_reset) 4243 { 4244 struct amdgpu_device *tmp_adev = NULL; 4245 bool need_full_reset = *need_full_reset_arg, vram_lost = false; 4246 int r = 0; 4247 4248 /* 4249 * ASIC reset has to be done on all HGMI hive nodes ASAP 4250 * to allow proper links negotiation in FW (within 1 sec) 4251 */ 4252 if (!skip_hw_reset && need_full_reset) { 4253 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4254 /* For XGMI run all resets in parallel to speed up the process */ 4255 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4256 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4257 r = -EALREADY; 4258 } else 4259 r = amdgpu_asic_reset(tmp_adev); 4260 4261 if (r) { 4262 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4263 r, 
adev_to_drm(tmp_adev)->unique); 4264 break; 4265 } 4266 } 4267 4268 /* For XGMI wait for all resets to complete before proceed */ 4269 if (!r) { 4270 list_for_each_entry(tmp_adev, device_list_handle, 4271 gmc.xgmi.head) { 4272 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4273 flush_work(&tmp_adev->xgmi_reset_work); 4274 r = tmp_adev->asic_reset_res; 4275 if (r) 4276 break; 4277 } 4278 } 4279 } 4280 } 4281 4282 if (!r && amdgpu_ras_intr_triggered()) { 4283 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4284 if (tmp_adev->mmhub.funcs && 4285 tmp_adev->mmhub.funcs->reset_ras_error_count) 4286 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev); 4287 } 4288 4289 amdgpu_ras_intr_cleared(); 4290 } 4291 4292 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4293 if (need_full_reset) { 4294 /* post card */ 4295 if (amdgpu_device_asic_init(tmp_adev)) 4296 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4297 4298 if (!r) { 4299 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4300 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4301 if (r) 4302 goto out; 4303 4304 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4305 if (vram_lost) { 4306 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4307 amdgpu_inc_vram_lost(tmp_adev); 4308 } 4309 4310 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT)); 4311 if (r) 4312 goto out; 4313 4314 r = amdgpu_device_fw_loading(tmp_adev); 4315 if (r) 4316 return r; 4317 4318 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4319 if (r) 4320 goto out; 4321 4322 if (vram_lost) 4323 amdgpu_device_fill_reset_magic(tmp_adev); 4324 4325 /* 4326 * Add this ASIC as tracked as reset was already 4327 * complete successfully. 4328 */ 4329 amdgpu_register_gpu_instance(tmp_adev); 4330 4331 r = amdgpu_device_ip_late_init(tmp_adev); 4332 if (r) 4333 goto out; 4334 4335 amdgpu_fbdev_set_suspend(tmp_adev, 0); 4336 4337 /* 4338 * The GPU enters bad state once faulty pages 4339 * by ECC has reached the threshold, and ras 4340 * recovery is scheduled next. So add one check 4341 * here to break recovery if it indeed exceeds 4342 * bad page threshold, and remind user to 4343 * retire this GPU or setting one bigger 4344 * bad_page_threshold value to fix this once 4345 * probing driver again. 4346 */ 4347 if (!amdgpu_ras_check_err_threshold(tmp_adev)) { 4348 /* must succeed. 
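 * Re-enables the RAS features that were suspended before the reset.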
*/ 4349 amdgpu_ras_resume(tmp_adev); 4350 } else { 4351 r = -EINVAL; 4352 goto out; 4353 } 4354 4355 /* Update PSP FW topology after reset */ 4356 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4357 r = amdgpu_xgmi_update_topology(hive, tmp_adev); 4358 } 4359 } 4360 4361 out: 4362 if (!r) { 4363 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4364 r = amdgpu_ib_ring_tests(tmp_adev); 4365 if (r) { 4366 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4367 r = amdgpu_device_ip_suspend(tmp_adev); 4368 need_full_reset = true; 4369 r = -EAGAIN; 4370 goto end; 4371 } 4372 } 4373 4374 if (!r) 4375 r = amdgpu_device_recover_vram(tmp_adev); 4376 else 4377 tmp_adev->asic_reset_res = r; 4378 } 4379 4380 end: 4381 *need_full_reset_arg = need_full_reset; 4382 return r; 4383 } 4384 4385 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, 4386 struct amdgpu_hive_info *hive) 4387 { 4388 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) 4389 return false; 4390 4391 if (hive) { 4392 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock); 4393 } else { 4394 down_write(&adev->reset_sem); 4395 } 4396 4397 atomic_inc(&adev->gpu_reset_counter); 4398 switch (amdgpu_asic_reset_method(adev)) { 4399 case AMD_RESET_METHOD_MODE1: 4400 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4401 break; 4402 case AMD_RESET_METHOD_MODE2: 4403 adev->mp1_state = PP_MP1_STATE_RESET; 4404 break; 4405 default: 4406 adev->mp1_state = PP_MP1_STATE_NONE; 4407 break; 4408 } 4409 4410 return true; 4411 } 4412 4413 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 4414 { 4415 amdgpu_vf_error_trans_all(adev); 4416 adev->mp1_state = PP_MP1_STATE_NONE; 4417 atomic_set(&adev->in_gpu_reset, 0); 4418 up_write(&adev->reset_sem); 4419 } 4420 4421 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 4422 { 4423 struct pci_dev *p = NULL; 4424 4425 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4426 adev->pdev->bus->number, 1); 4427 if (p) { 4428 pm_runtime_enable(&(p->dev)); 4429 pm_runtime_resume(&(p->dev)); 4430 } 4431 } 4432 4433 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 4434 { 4435 enum amd_reset_method reset_method; 4436 struct pci_dev *p = NULL; 4437 u64 expires; 4438 4439 /* 4440 * For now, only BACO and mode1 reset are confirmed 4441 * to suffer the audio issue without proper suspended. 4442 */ 4443 reset_method = amdgpu_asic_reset_method(adev); 4444 if ((reset_method != AMD_RESET_METHOD_BACO) && 4445 (reset_method != AMD_RESET_METHOD_MODE1)) 4446 return -EINVAL; 4447 4448 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4449 adev->pdev->bus->number, 1); 4450 if (!p) 4451 return -ENODEV; 4452 4453 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 4454 if (!expires) 4455 /* 4456 * If we cannot get the audio device autosuspend delay, 4457 * a fixed 4S interval will be used. Considering 3S is 4458 * the audio controller default autosuspend delay setting. 4459 * 4S used here is guaranteed to cover that. 4460 */ 4461 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 4462 4463 while (!pm_runtime_status_suspended(&(p->dev))) { 4464 if (!pm_runtime_suspend(&(p->dev))) 4465 break; 4466 4467 if (expires < ktime_get_mono_fast_ns()) { 4468 dev_warn(adev->dev, "failed to suspend display audio\n"); 4469 /* TODO: abort the succeeding gpu reset? 
*/ 4470 return -ETIMEDOUT; 4471 } 4472 } 4473 4474 pm_runtime_disable(&(p->dev)); 4475 4476 return 0; 4477 } 4478 4479 /** 4480 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 4481 * 4482 * @adev: amdgpu_device pointer 4483 * @job: which job trigger hang 4484 * 4485 * Attempt to reset the GPU if it has hung (all asics). 4486 * Attempt to do soft-reset or full-reset and reinitialize Asic 4487 * Returns 0 for success or an error on failure. 4488 */ 4489 4490 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 4491 struct amdgpu_job *job) 4492 { 4493 struct list_head device_list, *device_list_handle = NULL; 4494 bool need_full_reset = false; 4495 bool job_signaled = false; 4496 struct amdgpu_hive_info *hive = NULL; 4497 struct amdgpu_device *tmp_adev = NULL; 4498 int i, r = 0; 4499 bool need_emergency_restart = false; 4500 bool audio_suspended = false; 4501 4502 /* 4503 * Special case: RAS triggered and full reset isn't supported 4504 */ 4505 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 4506 4507 /* 4508 * Flush RAM to disk so that after reboot 4509 * the user can read log and see why the system rebooted. 4510 */ 4511 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 4512 DRM_WARN("Emergency reboot."); 4513 4514 ksys_sync_helper(); 4515 emergency_restart(); 4516 } 4517 4518 dev_info(adev->dev, "GPU %s begin!\n", 4519 need_emergency_restart ? "jobs stop":"reset"); 4520 4521 /* 4522 * Here we trylock to avoid chain of resets executing from 4523 * either trigger by jobs on different adevs in XGMI hive or jobs on 4524 * different schedulers for same device while this TO handler is running. 4525 * We always reset all schedulers for device and all devices for XGMI 4526 * hive so that should take care of them too. 4527 */ 4528 hive = amdgpu_get_xgmi_hive(adev); 4529 if (hive) { 4530 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { 4531 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", 4532 job ? job->base.id : -1, hive->hive_id); 4533 amdgpu_put_xgmi_hive(hive); 4534 return 0; 4535 } 4536 mutex_lock(&hive->hive_lock); 4537 } 4538 4539 /* 4540 * Build list of devices to reset. 4541 * In case we are in XGMI hive mode, resort the device list 4542 * to put adev in the 1st position. 4543 */ 4544 INIT_LIST_HEAD(&device_list); 4545 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4546 if (!hive) 4547 return -ENODEV; 4548 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list)) 4549 list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list); 4550 device_list_handle = &hive->device_list; 4551 } else { 4552 list_add_tail(&adev->gmc.xgmi.head, &device_list); 4553 device_list_handle = &device_list; 4554 } 4555 4556 /* block all schedulers and reset given job's ring */ 4557 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4558 if (!amdgpu_device_lock_adev(tmp_adev, hive)) { 4559 dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", 4560 job ? job->base.id : -1); 4561 r = 0; 4562 goto skip_recovery; 4563 } 4564 4565 /* 4566 * Try to put the audio codec into suspend state 4567 * before gpu reset started. 4568 * 4569 * Due to the power domain of the graphics device 4570 * is shared with AZ power domain. Without this, 4571 * we may change the audio hardware from behind 4572 * the audio driver's back. That will trigger 4573 * some audio codec errors. 
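		 * The audio device is resumed by amdgpu_device_resume_display_audio()
		 * once recovery completes.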
		 */
		if (!amdgpu_device_suspend_display_audio(tmp_adev))
			audio_suspended = true;

		amdgpu_ras_set_error_query_ready(tmp_adev, false);

		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

		if (!amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_pre_reset(tmp_adev);

		/*
		 * Mark these ASICs to be reset as untracked first,
		 * and add them back after the reset completes.
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		amdgpu_fbdev_set_suspend(tmp_adev, 1);

		/* disable ras on ALL IPs */
		if (!need_emergency_restart &&
		      amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			if (need_emergency_restart)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
	}

	if (need_emergency_restart)
		goto skip_sched_resume;

	/*
	 * Must check guilty signal here since after this point all old
	 * HW fences are force signaled.
	 *
	 * job->base holds a reference to parent fence
	 */
	if (job && job->base.s_fence->parent &&
	    dma_fence_is_signaled(job->base.s_fence->parent)) {
		job_signaled = true;
		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
		goto skip_hw_reset;
	}

retry:	/* Rest of adevs pre asic reset from XGMI hive. */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		r = amdgpu_device_pre_asic_reset(tmp_adev,
						 (tmp_adev == adev) ? job : NULL,
						 &need_full_reset);
		/* TODO: Should we stop? */
		if (r) {
			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
				  r, adev_to_drm(tmp_adev)->unique);
			tmp_adev->asic_reset_res = r;
		}
	}

	/* Actual ASIC resets if needed. */
	/* TODO Implement XGMI hive reset logic for SRIOV */
	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_reset_sriov(adev, job ? false : true);
		if (r)
			adev->asic_reset_res = r;
	} else {
		r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
		if (r && r == -EAGAIN)
			goto retry;
	}

skip_hw_reset:

	/* Post ASIC reset for all devs. */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			/* No point to resubmit jobs if we didn't HW reset */
			if (!tmp_adev->asic_reset_res && !job_signaled)
				drm_sched_resubmit_jobs(&ring->sched);

			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
		}

		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
		}

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/* bad news, how to tell it to userspace ? */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);
		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);
		amdgpu_device_unlock_adev(tmp_adev);
	}

skip_recovery:
	if (hive) {
		atomic_set(&hive->in_reset, 0);
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
	return r;
}

/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	pcie_bandwidth_available(adev->pdev, NULL,
				 &platform_speed_cap, &platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4774 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4775 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 4776 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 4777 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 4778 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4779 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4780 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 4781 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 4782 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4783 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 4784 else 4785 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 4786 4787 } 4788 } 4789 if (adev->pm.pcie_mlw_mask == 0) { 4790 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 4791 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 4792 } else { 4793 switch (platform_link_width) { 4794 case PCIE_LNK_X32: 4795 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 4796 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 4797 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4798 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4799 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4800 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4801 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4802 break; 4803 case PCIE_LNK_X16: 4804 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 4805 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4806 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4807 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4808 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4809 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4810 break; 4811 case PCIE_LNK_X12: 4812 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4813 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4814 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4815 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4816 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4817 break; 4818 case PCIE_LNK_X8: 4819 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4820 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4821 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4822 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4823 break; 4824 case PCIE_LNK_X4: 4825 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4826 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4827 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4828 break; 4829 case PCIE_LNK_X2: 4830 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4831 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4832 break; 4833 case PCIE_LNK_X1: 4834 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 4835 break; 4836 default: 4837 break; 4838 } 4839 } 4840 } 4841 } 4842 4843 int amdgpu_device_baco_enter(struct drm_device *dev) 4844 { 4845 struct amdgpu_device *adev = drm_to_adev(dev); 4846 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4847 4848 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 4849 return -ENOTSUPP; 4850 4851 if (ras && ras->supported) 4852 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 4853 4854 return amdgpu_dpm_baco_enter(adev); 4855 } 4856 4857 int amdgpu_device_baco_exit(struct drm_device *dev) 4858 { 4859 struct amdgpu_device *adev = drm_to_adev(dev); 4860 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4861 int ret = 0; 4862 4863 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 4864 return -ENOTSUPP; 4865 4866 ret = amdgpu_dpm_baco_exit(adev); 4867 if (ret) 4868 return ret; 4869 4870 if (ras && ras->supported) 4871 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 4872 4873 return 0; 4874 } 4875 4876 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) 4877 { 4878 int i; 4879 4880 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4881 struct amdgpu_ring *ring = adev->rings[i]; 4882 4883 if (!ring || !ring->sched.thread) 4884 continue; 4885 4886 
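		/* Cancel the scheduler's pending timeout (TDR) work for this ring. */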
cancel_delayed_work_sync(&ring->sched.work_tdr); 4887 } 4888 } 4889 4890 /** 4891 * amdgpu_pci_error_detected - Called when a PCI error is detected. 4892 * @pdev: PCI device struct 4893 * @state: PCI channel state 4894 * 4895 * Description: Called when a PCI error is detected. 4896 * 4897 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 4898 */ 4899 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 4900 { 4901 struct drm_device *dev = pci_get_drvdata(pdev); 4902 struct amdgpu_device *adev = drm_to_adev(dev); 4903 int i; 4904 4905 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 4906 4907 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4908 DRM_WARN("No support for XGMI hive yet..."); 4909 return PCI_ERS_RESULT_DISCONNECT; 4910 } 4911 4912 switch (state) { 4913 case pci_channel_io_normal: 4914 return PCI_ERS_RESULT_CAN_RECOVER; 4915 /* Fatal error, prepare for slot reset */ 4916 case pci_channel_io_frozen: 4917 /* 4918 * Cancel and wait for all TDRs in progress if failing to 4919 * set adev->in_gpu_reset in amdgpu_device_lock_adev 4920 * 4921 * Locking adev->reset_sem will prevent any external access 4922 * to GPU during PCI error recovery 4923 */ 4924 while (!amdgpu_device_lock_adev(adev, NULL)) 4925 amdgpu_cancel_all_tdr(adev); 4926 4927 /* 4928 * Block any work scheduling as we do for regular GPU reset 4929 * for the duration of the recovery 4930 */ 4931 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4932 struct amdgpu_ring *ring = adev->rings[i]; 4933 4934 if (!ring || !ring->sched.thread) 4935 continue; 4936 4937 drm_sched_stop(&ring->sched, NULL); 4938 } 4939 return PCI_ERS_RESULT_NEED_RESET; 4940 case pci_channel_io_perm_failure: 4941 /* Permanent error, prepare for device removal */ 4942 return PCI_ERS_RESULT_DISCONNECT; 4943 } 4944 4945 return PCI_ERS_RESULT_NEED_RESET; 4946 } 4947 4948 /** 4949 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 4950 * @pdev: pointer to PCI device 4951 */ 4952 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 4953 { 4954 4955 DRM_INFO("PCI error: mmio enabled callback!!\n"); 4956 4957 /* TODO - dump whatever for debugging purposes */ 4958 4959 /* This called only if amdgpu_pci_error_detected returns 4960 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 4961 * works, no need to reset slot. 4962 */ 4963 4964 return PCI_ERS_RESULT_RECOVERED; 4965 } 4966 4967 /** 4968 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 4969 * @pdev: PCI device struct 4970 * 4971 * Description: This routine is called by the pci error recovery 4972 * code after the PCI slot has been reset, just before we 4973 * should resume normal operations. 
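 * Restores the saved PCI config space, waits for the asic to become
 * accessible again and then performs a full asic reset and re-init.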
4974 */ 4975 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 4976 { 4977 struct drm_device *dev = pci_get_drvdata(pdev); 4978 struct amdgpu_device *adev = drm_to_adev(dev); 4979 int r, i; 4980 bool need_full_reset = true; 4981 u32 memsize; 4982 struct list_head device_list; 4983 4984 DRM_INFO("PCI error: slot reset callback!!\n"); 4985 4986 INIT_LIST_HEAD(&device_list); 4987 list_add_tail(&adev->gmc.xgmi.head, &device_list); 4988 4989 /* wait for asic to come out of reset */ 4990 msleep(500); 4991 4992 /* Restore PCI confspace */ 4993 amdgpu_device_load_pci_state(pdev); 4994 4995 /* confirm ASIC came out of reset */ 4996 for (i = 0; i < adev->usec_timeout; i++) { 4997 memsize = amdgpu_asic_get_config_memsize(adev); 4998 4999 if (memsize != 0xffffffff) 5000 break; 5001 udelay(1); 5002 } 5003 if (memsize == 0xffffffff) { 5004 r = -ETIME; 5005 goto out; 5006 } 5007 5008 adev->in_pci_err_recovery = true; 5009 r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset); 5010 adev->in_pci_err_recovery = false; 5011 if (r) 5012 goto out; 5013 5014 r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true); 5015 5016 out: 5017 if (!r) { 5018 if (amdgpu_device_cache_pci_state(adev->pdev)) 5019 pci_restore_state(adev->pdev); 5020 5021 DRM_INFO("PCIe error recovery succeeded\n"); 5022 } else { 5023 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5024 amdgpu_device_unlock_adev(adev); 5025 } 5026 5027 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5028 } 5029 5030 /** 5031 * amdgpu_pci_resume() - resume normal ops after PCI reset 5032 * @pdev: pointer to PCI device 5033 * 5034 * Called when the error recovery driver tells us that its 5035 * OK to resume normal operation. Use completion to allow 5036 * halted scsi ops to resume. 5037 */ 5038 void amdgpu_pci_resume(struct pci_dev *pdev) 5039 { 5040 struct drm_device *dev = pci_get_drvdata(pdev); 5041 struct amdgpu_device *adev = drm_to_adev(dev); 5042 int i; 5043 5044 5045 DRM_INFO("PCI error: resume callback!!\n"); 5046 5047 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5048 struct amdgpu_ring *ring = adev->rings[i]; 5049 5050 if (!ring || !ring->sched.thread) 5051 continue; 5052 5053 5054 drm_sched_resubmit_jobs(&ring->sched); 5055 drm_sched_start(&ring->sched, true); 5056 } 5057 5058 amdgpu_device_unlock_adev(adev); 5059 } 5060 5061 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5062 { 5063 struct drm_device *dev = pci_get_drvdata(pdev); 5064 struct amdgpu_device *adev = drm_to_adev(dev); 5065 int r; 5066 5067 r = pci_save_state(pdev); 5068 if (!r) { 5069 kfree(adev->pci_state); 5070 5071 adev->pci_state = pci_store_saved_state(pdev); 5072 5073 if (!adev->pci_state) { 5074 DRM_ERROR("Failed to store PCI saved state"); 5075 return false; 5076 } 5077 } else { 5078 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5079 return false; 5080 } 5081 5082 return true; 5083 } 5084 5085 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5086 { 5087 struct drm_device *dev = pci_get_drvdata(pdev); 5088 struct amdgpu_device *adev = drm_to_adev(dev); 5089 int r; 5090 5091 if (!adev->pci_state) 5092 return false; 5093 5094 r = pci_load_saved_state(pdev, adev->pci_state); 5095 5096 if (!r) { 5097 pci_restore_state(pdev); 5098 } else { 5099 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5100 return false; 5101 } 5102 5103 return true; 5104 } 5105 5106 5107