1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 
23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 29 #include <linux/aperture.h> 30 #include <linux/power_supply.h> 31 #include <linux/kthread.h> 32 #include <linux/module.h> 33 #include <linux/console.h> 34 #include <linux/slab.h> 35 #include <linux/iommu.h> 36 #include <linux/pci.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 #include <linux/nospec.h> 40 41 #include <drm/drm_atomic_helper.h> 42 #include <drm/drm_client_event.h> 43 #include <drm/drm_crtc_helper.h> 44 #include <drm/drm_probe_helper.h> 45 #include <drm/amdgpu_drm.h> 46 #include <linux/device.h> 47 #include <linux/vgaarb.h> 48 #include <linux/vga_switcheroo.h> 49 #include <linux/efi.h> 50 #include "amdgpu.h" 51 #include "amdgpu_trace.h" 52 #include "amdgpu_i2c.h" 53 #include "atom.h" 54 #include "amdgpu_atombios.h" 55 #include "amdgpu_atomfirmware.h" 56 #include "amd_pcie.h" 57 #ifdef CONFIG_DRM_AMDGPU_SI 58 #include "si.h" 59 #endif 60 #ifdef CONFIG_DRM_AMDGPU_CIK 61 #include "cik.h" 62 #endif 63 #include "vi.h" 64 #include "soc15.h" 65 #include "nv.h" 66 #include "bif/bif_4_1_d.h" 67 #include <linux/firmware.h> 68 #include "amdgpu_vf_error.h" 69 70 #include "amdgpu_amdkfd.h" 71 #include "amdgpu_pm.h" 72 73 #include "amdgpu_xgmi.h" 74 #include "amdgpu_ras.h" 75 #include "amdgpu_ras_mgr.h" 76 #include "amdgpu_pmu.h" 77 #include "amdgpu_fru_eeprom.h" 78 #include "amdgpu_reset.h" 79 #include "amdgpu_virt.h" 80 #include "amdgpu_dev_coredump.h" 81 82 #include <linux/suspend.h> 83 #include <drm/task_barrier.h> 84 #include <linux/pm_runtime.h> 85 86 #include <drm/drm_drv.h> 87 88 #if IS_ENABLED(CONFIG_X86) 89 #include <asm/intel-family.h> 90 #include <asm/cpu_device_id.h> 91 #endif 92 93 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 96 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 97 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 98 
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/cyan_skillfish_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
/* Hard-coded PCIE index/data register pairs (dword offsets) used by the
 * indirect access helpers when adev->nbio.funcs is not yet populated. */
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

#define AMDGPU_VBIOS_SKIP (1U << 0)
#define AMDGPU_VBIOS_OPTIONAL (1U << 1)

static const struct drm_driver amdgpu_kms_driver;

/* Human-readable ASIC names, indexed by asic_type. */
const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

/* Mask with one bit set for every IP block type. */
#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
/*
 * Default init level where all blocks are expected to be initialized. This is
 * the level of initialization expected by default and also after a full reset
 * of the device.
 */
struct amdgpu_init_level amdgpu_init_default = {
	.level = AMDGPU_INIT_LEVEL_DEFAULT,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

struct amdgpu_init_level amdgpu_init_recovery = {
	.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This
 * is used for cases like reset on initialization where the entire hive needs to
 * be reset before first use.
 */
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
	.hwini_ip_block_mask =
		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};

static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev);
static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev);
static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev);

static void amdgpu_device_load_switch_state(struct amdgpu_device *adev);

/* Return true if @block is part of the HW-init mask of the current init level. */
static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
					     enum amd_ip_block_type block)
{
	return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
}

/* Select which of the static init-level tables above subsequent HW init uses;
 * unknown levels fall back to the full default level. */
void amdgpu_set_init_level(struct amdgpu_device *adev,
			   enum amdgpu_init_lvl_id lvl)
{
	switch (lvl) {
	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
		adev->init_lvl = &amdgpu_init_minimal_xgmi;
		break;
	case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
		adev->init_lvl = &amdgpu_init_recovery;
		break;
	case AMDGPU_INIT_LEVEL_DEFAULT:
		fallthrough;
	default:
		adev->init_lvl = &amdgpu_init_default;
		break;
	}
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
				     void *data);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		amdgpu_device_get_pcie_replay_count, NULL);

/* Create the pcie_replay_count sysfs file when the NBIO block supports
 * replay counts; returns 0 on success or when the file is not created. */
static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
{
	int ret = 0;

	if (amdgpu_nbio_is_replay_cnt_supported(adev))
		ret = sysfs_create_file(&adev->dev->kobj,
					&dev_attr_pcie_replay_count.attr);

	return ret;
}

static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
{
	if (amdgpu_nbio_is_replay_cnt_supported(adev))
		sysfs_remove_file(&adev->dev->kobj,
				  &dev_attr_pcie_replay_count.attr);
}

/* Binary sysfs read handler for reg_state: @ppos selects which register
 * state type is queried from the ASIC callback. */
static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  const struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
		      AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for giving board related information.
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	/* default to CEM when the SMUIO block cannot report a package type */
	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

/* Hide the board attributes on APUs (mode 0); dGPUs keep the attr mode. */
static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

/**
 * DOC: uma/carveout_options
 *
 * This is a read-only file that lists all available UMA allocation
 * options and their corresponding indices. Example output::
 *
 *  $ cat uma/carveout_options
 *  0: Minimum (512 MB)
 *  1: (1 GB)
 *  2: (2 GB)
 *  3: (4 GB)
 *  4: (6 GB)
 *  5: (8 GB)
 *  6: (12 GB)
 *  7: Medium (16 GB)
 *  8: (24 GB)
 *  9: High (32 GB)
 */
static ssize_t carveout_options_show(struct device *dev,
				     struct device_attribute *attr,
				     char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	struct amdgpu_uma_carveout_info *uma_info = &adev->uma_info;
	uint32_t memory_carved;
	ssize_t size = 0;

	if (!uma_info || !uma_info->num_entries)
		return -ENODEV;

	for (int i = 0; i < uma_info->num_entries; i++) {
		memory_carved = uma_info->entries[i].memory_carved_mb;
		/* sizes of a gigabyte or more are printed in GB, else MB */
		if (memory_carved >= SZ_1G/SZ_1M) {
			size += sysfs_emit_at(buf, size, "%d: %s (%u GB)\n",
					      i,
					      uma_info->entries[i].name,
					      memory_carved >> 10);
		} else {
			size += sysfs_emit_at(buf, size, "%d: %s (%u MB)\n",
					      i,
					      uma_info->entries[i].name,
					      memory_carved);
		}
	}

	return size;
}
static DEVICE_ATTR_RO(carveout_options);

/**
 * DOC: uma/carveout
 *
 * This file is both readable and writable. When read, it shows the
 * index of the current setting.
 * Writing a valid index to this file
 * allows users to change the UMA carveout size to the selected option
 * on the next boot.
 *
 * The available options and their corresponding indices can be read
 * from the uma/carveout_options file.
 */
static ssize_t carveout_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%u\n", adev->uma_info.uma_option_index);
}

static ssize_t carveout_store(struct device *dev,
			      struct device_attribute *attr,
			      const char *buf, size_t count)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	struct amdgpu_uma_carveout_info *uma_info = &adev->uma_info;
	struct amdgpu_uma_carveout_option *opt;
	unsigned long val;
	uint8_t flags;
	int r;

	r = kstrtoul(buf, 10, &val);
	if (r)
		return r;

	if (val >= uma_info->num_entries)
		return -EINVAL;

	/* clamp speculation on the user-controlled table index */
	val = array_index_nospec(val, uma_info->num_entries);
	opt = &uma_info->entries[val];

	if (!(opt->flags & AMDGPU_UMA_FLAG_AUTO) &&
	    !(opt->flags & AMDGPU_UMA_FLAG_CUSTOM)) {
		drm_err_once(ddev, "Option %lu not supported due to lack of Custom/Auto flag", val);
		return -EINVAL;
	}

	flags = opt->flags;
	/* NOTE(review): clears the flag one bit below AUTO whenever AUTO is
	 * set — confirm against the ACPI flag encoding. */
	flags &= ~((flags & AMDGPU_UMA_FLAG_AUTO) >> 1);

	guard(mutex)(&uma_info->update_lock);

	r = amdgpu_acpi_set_uma_allocation_size(adev, val, flags);
	if (r)
		return r;

	uma_info->uma_option_index = val;

	return count;
}
static DEVICE_ATTR_RW(carveout);

static struct attribute *amdgpu_uma_attrs[] = {
	&dev_attr_carveout.attr,
	&dev_attr_carveout_options.attr,
	NULL
};

const struct attribute_group amdgpu_uma_attr_group = {
	.name = "uma",
	.attrs = amdgpu_uma_attrs
};

/* Expose the "uma" sysfs group on APUs whose platform supports the ACPI
 * set-UMA-allocation-size call; silently does nothing otherwise. */
static void amdgpu_uma_sysfs_init(struct amdgpu_device *adev)
{
	int rc;

	if (!(adev->flags & AMD_IS_APU))
		return;

	if (!amdgpu_acpi_is_set_uma_allocation_size_supported())
		return;

	rc = amdgpu_atomfirmware_get_uma_carveout_info(adev, &adev->uma_info);
	if (rc) {
		drm_dbg(adev_to_drm(adev),
			"Failed to parse UMA carveout info from VBIOS: %d\n", rc);
		goto out_info;
	}

	mutex_init(&adev->uma_info.update_lock);

	rc = devm_device_add_group(adev->dev, &amdgpu_uma_attr_group);
	if (rc) {
		drm_dbg(adev_to_drm(adev), "Failed to add UMA carveout sysfs interfaces %d\n", rc);
		goto out_attr;
	}

	return;

out_attr:
	mutex_destroy(&adev->uma_info.update_lock);
out_info:
	return;
}

static void amdgpu_uma_sysfs_fini(struct amdgpu_device *adev)
{
	struct amdgpu_uma_carveout_info *uma_info = &adev->uma_info;

	if (!amdgpu_acpi_is_set_uma_allocation_size_supported())
		return;

	mutex_destroy(&uma_info->update_lock);
	uma_info->num_entries = 0;
}

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct amdgpu_device *adev)
{
	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct amdgpu_device *adev)
{
	if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
		return false;

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @adev: amdgpu device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported)
 * otherwise return 0.
 */
int amdgpu_device_supports_baco(struct amdgpu_device *adev)
{
	return amdgpu_asic_supports_baco(adev);
}

/* Select the runtime PM mode (PX/BOCO/BACO/BAMACO/none) from the
 * amdgpu_runtime_pm module parameter and the platform capabilities
 * reported by the helpers above. */
void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	int bamaco_support;

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(adev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available,fallback to use BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(adev)) {
			/* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(adev)) {
			/* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO are not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry && !amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				if (!amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			/* upgrade BACO to BAMACO when MACO is also supported */
			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct amdgpu_device *adev)
{
	return (amdgpu_device_supports_boco(adev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must > @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	/* the MM_INDEX/MM_DATA window only supports dword-aligned access */
	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		/* only reprogram MM_INDEX_HI when the high bits change */
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must > @size
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	/* only the part inside the visible VRAM window can use the aperture */
	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must > @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		/* offsets beyond the MMIO BAR go through the indirect path */
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset:bytes offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset:bytes offset from MMIO start
 * @value: the value want to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
985 */ 986 void amdgpu_device_wreg(struct amdgpu_device *adev, 987 uint32_t reg, uint32_t v, 988 uint32_t acc_flags) 989 { 990 if (amdgpu_device_skip_hw_access(adev)) 991 return; 992 993 if ((reg * 4) < adev->rmmio_size) { 994 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 995 amdgpu_sriov_runtime(adev) && 996 down_read_trylock(&adev->reset_domain->sem)) { 997 amdgpu_kiq_wreg(adev, reg, v, 0); 998 up_read(&adev->reset_domain->sem); 999 } else { 1000 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 1001 } 1002 } else { 1003 adev->pcie_wreg(adev, reg * 4, v); 1004 } 1005 1006 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 1007 } 1008 1009 /** 1010 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 1011 * 1012 * @adev: amdgpu_device pointer 1013 * @reg: mmio/rlc register 1014 * @v: value to write 1015 * @xcc_id: xcc accelerated compute core id 1016 * 1017 * this function is invoked only for the debugfs register access 1018 */ 1019 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 1020 uint32_t reg, uint32_t v, 1021 uint32_t xcc_id) 1022 { 1023 if (amdgpu_device_skip_hw_access(adev)) 1024 return; 1025 1026 if (amdgpu_sriov_fullaccess(adev) && 1027 adev->gfx.rlc.funcs && 1028 adev->gfx.rlc.funcs->is_rlcg_access_range) { 1029 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 1030 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 1031 } else if ((reg * 4) >= adev->rmmio_size) { 1032 adev->pcie_wreg(adev, reg * 4, v); 1033 } else { 1034 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 1035 } 1036 } 1037 1038 /** 1039 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 1040 * 1041 * @adev: amdgpu_device pointer 1042 * @reg: dword aligned register offset 1043 * @v: 32 bit value to write to the register 1044 * @acc_flags: access flags which require special behavior 1045 * @xcc_id: xcc accelerated compute core id 1046 * 1047 * Writes the value specified 
to the offset specified. 1048 */ 1049 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 1050 uint32_t reg, uint32_t v, 1051 uint32_t acc_flags, uint32_t xcc_id) 1052 { 1053 uint32_t rlcg_flag; 1054 1055 if (amdgpu_device_skip_hw_access(adev)) 1056 return; 1057 1058 if ((reg * 4) < adev->rmmio_size) { 1059 if (amdgpu_sriov_vf(adev) && 1060 !amdgpu_sriov_runtime(adev) && 1061 adev->gfx.rlc.rlcg_reg_access_supported && 1062 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 1063 GC_HWIP, true, 1064 &rlcg_flag)) { 1065 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 1066 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 1067 amdgpu_sriov_runtime(adev) && 1068 down_read_trylock(&adev->reset_domain->sem)) { 1069 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 1070 up_read(&adev->reset_domain->sem); 1071 } else { 1072 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 1073 } 1074 } else { 1075 adev->pcie_wreg(adev, reg * 4, v); 1076 } 1077 } 1078 1079 /** 1080 * amdgpu_device_indirect_rreg - read an indirect register 1081 * 1082 * @adev: amdgpu_device pointer 1083 * @reg_addr: indirect register address to read from 1084 * 1085 * Returns the value of indirect register @reg_addr 1086 */ 1087 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 1088 u32 reg_addr) 1089 { 1090 unsigned long flags, pcie_index, pcie_data; 1091 void __iomem *pcie_index_offset; 1092 void __iomem *pcie_data_offset; 1093 u32 r; 1094 1095 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1096 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1097 1098 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1099 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1100 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1101 1102 writel(reg_addr, pcie_index_offset); 1103 readl(pcie_index_offset); 1104 r = readl(pcie_data_offset); 1105 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1106 1107 return r; 1108 } 1109 1110 u32 
amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	/* nbio callbacks may not be installed yet; fall back to the
	 * default index/data register locations in that case.
	 */
	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	/* a high index register is only needed for addresses above 4GB */
	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		/* only bits 39:32 of the address fit in the high index */
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	/* hold the lock across both halves so the 64-bit read is atomic
	 * with respect to other users of the index/data pair
	 */
	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				  u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	/* high index only when the address needs it and the asic has one */
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
			pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits
	 */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	/* the shared index/data register pair must be used atomically */
	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	/* read back to flush the posted index write before the data write */
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) &&
	    (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		/* only bits 39:32 of the address fit in the high index */
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	/* hold the lock across both halves so the 64-bit write is atomic */
	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				   u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	/* high index only when the address needs it and the asic has one */
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		/* only bits 39:32 of the address fit in the high index */
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/* 64-bit-address flavour of amdgpu_invalid_rreg */
static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%04X with 0x%08X\n", reg,
		v);
	BUG();
}

/* 64-bit-address flavour of amdgpu_invalid_wreg */
static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%llX with 0x%08X\n", reg,
		v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function.
Used for register blocks 1470 * that certain asics don't have (all asics). 1471 * Returns the value in the register. 1472 */ 1473 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 1474 { 1475 dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n", 1476 reg); 1477 BUG(); 1478 return 0; 1479 } 1480 1481 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) 1482 { 1483 dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg); 1484 BUG(); 1485 return 0; 1486 } 1487 1488 /** 1489 * amdgpu_invalid_wreg64 - dummy reg write function 1490 * 1491 * @adev: amdgpu_device pointer 1492 * @reg: offset of register 1493 * @v: value to write to the register 1494 * 1495 * Dummy register read function. Used for register blocks 1496 * that certain asics don't have (all asics). 1497 */ 1498 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 1499 { 1500 dev_err(adev->dev, 1501 "Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 1502 reg, v); 1503 BUG(); 1504 } 1505 1506 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) 1507 { 1508 dev_err(adev->dev, 1509 "Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", 1510 reg, v); 1511 BUG(); 1512 } 1513 1514 /** 1515 * amdgpu_block_invalid_rreg - dummy reg read function 1516 * 1517 * @adev: amdgpu_device pointer 1518 * @block: offset of instance 1519 * @reg: offset of register 1520 * 1521 * Dummy register read function. Used for register blocks 1522 * that certain asics don't have (all asics). 1523 * Returns the value in the register. 
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	dev_err(adev->dev,
		"Invalid callback to read register 0x%04X in block 0x%04X\n",
		reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		reg, block, v);
	BUG();
}

/*
 * amdgpu_device_get_vbios_flags - report whether a vbios is required
 *
 * On parts with an AID mask set, the vbios is skipped entirely on APUs
 * and is merely optional under passthrough virtualization; otherwise the
 * vbios is mandatory (return 0).
 */
static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return AMDGPU_VBIOS_SKIP;

	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
		return AMDGPU_VBIOS_OPTIONAL;

	return 0;
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
1573 */ 1574 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1575 { 1576 uint32_t flags; 1577 bool optional; 1578 int ret; 1579 1580 amdgpu_asic_pre_asic_init(adev); 1581 flags = amdgpu_device_get_vbios_flags(adev); 1582 optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP)); 1583 1584 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1585 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 1586 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 1587 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1588 amdgpu_psp_wait_for_bootloader(adev); 1589 if (optional && !adev->bios) 1590 return 0; 1591 1592 ret = amdgpu_atomfirmware_asic_init(adev, true); 1593 return ret; 1594 } else { 1595 if (optional && !adev->bios) 1596 return 0; 1597 1598 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1599 } 1600 1601 return 0; 1602 } 1603 1604 /** 1605 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1606 * 1607 * @adev: amdgpu_device pointer 1608 * 1609 * Allocates a scratch page of VRAM for use by various things in the 1610 * driver. 1611 */ 1612 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 1613 { 1614 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 1615 AMDGPU_GEM_DOMAIN_VRAM | 1616 AMDGPU_GEM_DOMAIN_GTT, 1617 &adev->mem_scratch.robj, 1618 &adev->mem_scratch.gpu_addr, 1619 (void **)&adev->mem_scratch.ptr); 1620 } 1621 1622 /** 1623 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 1624 * 1625 * @adev: amdgpu_device pointer 1626 * 1627 * Frees the VRAM scratch page. 1628 */ 1629 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 1630 { 1631 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 1632 } 1633 1634 /** 1635 * amdgpu_device_program_register_sequence - program an array of registers. 
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array or registers with and or masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	/* the array is (reg, and_mask, or_mask) triplets */
	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			/* full mask: write the or value directly, no read */
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			/* AI+ parts only set bits covered by the and mask */
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers,etc.).
1702 */ 1703 1704 /** 1705 * amdgpu_device_wb_fini - Disable Writeback and free memory 1706 * 1707 * @adev: amdgpu_device pointer 1708 * 1709 * Disables Writeback and frees the Writeback memory (all asics). 1710 * Used at driver shutdown. 1711 */ 1712 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1713 { 1714 if (adev->wb.wb_obj) { 1715 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1716 &adev->wb.gpu_addr, 1717 (void **)&adev->wb.wb); 1718 adev->wb.wb_obj = NULL; 1719 } 1720 } 1721 1722 /** 1723 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1724 * 1725 * @adev: amdgpu_device pointer 1726 * 1727 * Initializes writeback and allocates writeback memory (all asics). 1728 * Used at driver startup. 1729 * Returns 0 on success or an -error on failure. 1730 */ 1731 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1732 { 1733 int r; 1734 1735 if (adev->wb.wb_obj == NULL) { 1736 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1737 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1738 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1739 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1740 (void **)&adev->wb.wb); 1741 if (r) { 1742 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1743 return r; 1744 } 1745 1746 adev->wb.num_wb = AMDGPU_MAX_WB; 1747 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1748 1749 /* clear wb memory */ 1750 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1751 } 1752 1753 return 0; 1754 } 1755 1756 /** 1757 * amdgpu_device_wb_get - Allocate a wb entry 1758 * 1759 * @adev: amdgpu_device pointer 1760 * @wb: wb index 1761 * 1762 * Allocate a wb slot for use by the driver (all asics). 1763 * Returns 0 on success or -EINVAL on failure. 
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&adev->wb.lock, flags);
	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long flags;

	/* convert the dw offset handed out by wb_get back to a slot index */
	wb >>= 3;
	spin_lock_irqsave(&adev->wb.lock, flags);
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
	spin_unlock_irqrestore(&adev->wb.lock, flags);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the size we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	int max_size, r;
	unsigned int i;
	u16 cmd;

	/* a >4GB BAR needs 64-bit physical addressing */
	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	if (!amdgpu_rebar)
		return 0;

	/* resizing on Dell G5 SE platforms causes problems with runtime pm */
	if ((amdgpu_runtime_pm != 0) &&
	    adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
	    adev->pdev->device == 0x731f &&
	    adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		dev_warn(
			adev->dev,
			"System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	max_size = pci_rebar_get_max_size(adev->pdev, 0);
	if (max_size < 0)
		return 0;
	rbar_size = min(max_size, rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd &
			      ~PCI_COMMAND_MEMORY);

	/* Tear down doorbell as resizing will release BARs */
	amdgpu_doorbell_fini(adev);

	/* NOTE(review): the 4th argument looks like a mask of sibling BARs
	 * affected by the resize (bit 5 on CIK+, bit 2 otherwise) — confirm
	 * against this tree's pci_resize_resource() prototype.
	 */
	r = pci_resize_resource(adev->pdev, 0, rbar_size,
				(adev->asic_type >= CHIP_BONAIRE) ? 1 << 5
								  : 1 << 2);
	if (r == -ENOSPC)
		dev_info(adev->dev,
			 "Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		dev_err(adev->dev, "Problem resizing BAR0 (%d).", r);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	/* re-enable memory decoding with the original command bits */
	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if need or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg, flags;

	/* the host is responsible for posting under SR-IOV */
	if (amdgpu_sriov_vf(adev))
		return false;

	flags = amdgpu_device_get_vbios_flags(adev);
	if (flags & AMDGPU_VBIOS_SKIP)
		return false;
	if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still need driver do vPost otherwise gpu hang, while
		 * those smc fw version above 22.15 doesn't have this flaw, so we force
		 * vpost executed for smc version below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			/* SMC firmware version lives at dword 69 of the image */
			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		/* one-shot flag: consume it and request a post */
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	/* 0 or all-ones means the asic has not been initialized yet */
	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		/* auto: decided by the hardware checks below */
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n",
			amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPU change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/* Returns true when ASPM must stay disabled on this platform/GPU combo */
static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
{
	/* Enabling ASPM causes randoms hangs on Tahiti and Oland on Zen4.
	 * It's unclear if this is a platform-specific or GPU-specific issue.
	 * Disable ASPM on SI for the time being.
	 */
	if (adev->family == AMDGPU_FAMILY_SI)
		return true;

#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* quirk below only applies to GC 12.0.x parts */
	if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) ||
	      amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1)))
		return false;

	if (c->x86 == 6 &&
	    adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) {
		switch (c->x86_model) {
		case VFM_MODEL(INTEL_ALDERLAKE):
		case VFM_MODEL(INTEL_ALDERLAKE_L):
		case VFM_MODEL(INTEL_RAPTORLAKE):
		case VFM_MODEL(INTEL_RAPTORLAKE_P):
		case VFM_MODEL(INTEL_RAPTORLAKE_S):
			return true;
		default:
			return false;
		}
	} else {
		return false;
	}
#else
	return false;
#endif
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		/* auto: decided by the checks below */
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (amdgpu_device_aspm_support_quirk(adev))
		return false;
	/* defer to whatever the pcie bridge has enabled */
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
						 bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset,  minimum 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		/* -1 selects the driver default */
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
2141 */ 2142 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 2143 { 2144 /* no need to check the default value */ 2145 if (amdgpu_vm_size == -1) 2146 return; 2147 2148 if (amdgpu_vm_size < 1) { 2149 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 2150 amdgpu_vm_size); 2151 amdgpu_vm_size = -1; 2152 } 2153 } 2154 2155 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 2156 { 2157 struct sysinfo si; 2158 bool is_os_64 = (sizeof(void *) == 8); 2159 uint64_t total_memory; 2160 uint64_t dram_size_seven_GB = 0x1B8000000; 2161 uint64_t dram_size_three_GB = 0xB8000000; 2162 2163 if (amdgpu_smu_memory_pool_size == 0) 2164 return; 2165 2166 if (!is_os_64) { 2167 dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n"); 2168 goto def_value; 2169 } 2170 si_meminfo(&si); 2171 total_memory = (uint64_t)si.totalram * si.mem_unit; 2172 2173 if ((amdgpu_smu_memory_pool_size == 1) || 2174 (amdgpu_smu_memory_pool_size == 2)) { 2175 if (total_memory < dram_size_three_GB) 2176 goto def_value1; 2177 } else if ((amdgpu_smu_memory_pool_size == 4) || 2178 (amdgpu_smu_memory_pool_size == 8)) { 2179 if (total_memory < dram_size_seven_GB) 2180 goto def_value1; 2181 } else { 2182 dev_warn(adev->dev, "Smu memory pool size not supported\n"); 2183 goto def_value; 2184 } 2185 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2186 2187 return; 2188 2189 def_value1: 2190 dev_warn(adev->dev, "No enough system memory\n"); 2191 def_value: 2192 adev->pm.smu_prv_buffer_size = 0; 2193 } 2194 2195 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2196 { 2197 if (!(adev->flags & AMD_IS_APU) || 2198 adev->asic_type < CHIP_RAVEN) 2199 return 0; 2200 2201 switch (adev->asic_type) { 2202 case CHIP_RAVEN: 2203 if (adev->pdev->device == 0x15dd) 2204 adev->apu_flags |= AMD_APU_IS_RAVEN; 2205 if (adev->pdev->device == 0x15d8) 2206 adev->apu_flags |= AMD_APU_IS_PICASSO; 2207 break; 2208 case CHIP_RENOIR: 2209 if 
((adev->pdev->device == 0x1636) || 2210 (adev->pdev->device == 0x164c)) 2211 adev->apu_flags |= AMD_APU_IS_RENOIR; 2212 else 2213 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2214 break; 2215 case CHIP_VANGOGH: 2216 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2217 break; 2218 case CHIP_YELLOW_CARP: 2219 break; 2220 case CHIP_CYAN_SKILLFISH: 2221 if ((adev->pdev->device == 0x13FE) || 2222 (adev->pdev->device == 0x143F)) 2223 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2224 break; 2225 default: 2226 break; 2227 } 2228 2229 return 0; 2230 } 2231 2232 /** 2233 * amdgpu_device_check_arguments - validate module params 2234 * 2235 * @adev: amdgpu_device pointer 2236 * 2237 * Validates certain module parameters and updates 2238 * the associated values used by the driver (all asics). 2239 */ 2240 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2241 { 2242 int i; 2243 2244 if (amdgpu_sched_jobs < 4) { 2245 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2246 amdgpu_sched_jobs); 2247 amdgpu_sched_jobs = 4; 2248 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2249 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2250 amdgpu_sched_jobs); 2251 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2252 } 2253 2254 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2255 /* gart size must be greater or equal to 32M */ 2256 dev_warn(adev->dev, "gart size (%d) too small\n", 2257 amdgpu_gart_size); 2258 amdgpu_gart_size = -1; 2259 } 2260 2261 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2262 /* gtt size must be greater or equal to 32M */ 2263 dev_warn(adev->dev, "gtt size (%d) too small\n", 2264 amdgpu_gtt_size); 2265 amdgpu_gtt_size = -1; 2266 } 2267 2268 /* valid range is between 4 and 9 inclusive */ 2269 if (amdgpu_vm_fragment_size != -1 && 2270 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2271 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2272 amdgpu_vm_fragment_size = -1; 2273 } 2274 2275 
	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	/* apply the same isolation mode to every partition (XCP) */
	for (i = 0; i < MAX_XCP; i++) {
		switch (amdgpu_enforce_isolation) {
		case -1:
		case 0:
		default:
			/* disable */
			adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE;
			break;
		case 1:
			/* enable */
			adev->enforce_isolation[i] =
				AMDGPU_ENFORCE_ISOLATION_ENABLE;
			break;
		case 2:
			/* enable legacy mode */
			adev->enforce_isolation[i] =
				AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY;
			break;
		case 3:
			/* enable only process isolation without submitting cleaner shader */
			adev->enforce_isolation[i] =
				AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER;
			break;
		}
	}

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	/* PX-capable devices do not handle the OFF request here */
	if (amdgpu_device_supports_px(drm_to_adev(dev)) &&
	    state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n",
				 r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		dev_info(&pdev->dev, "switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_prepare(dev);
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check of the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enabled the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		/* param format: "<pci addr>[,crtcs];<pci addr>[,crtcs];..." or "all" */
		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				/* clamp crtc count to [1, 6]; default to 1 on parse failure */
				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		dev_info(
			adev->dev,
			"virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			amdgpu_virtual_display,
			pci_address_name,
			adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
{
	/* under SR-IOV, force a single virtual crtc if not already configured */
	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
		adev->mode_info.num_crtc = 1;
		adev->enable_virtual_display = true;
		dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n",
			 adev->enable_virtual_display,
			 adev->mode_info.num_crtc);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	/* only a fixed set of asics carry a gpu_info firmware */
	switch (adev->asic_type) {
	default:
		return 0;
	case CHIP_VEGA10:
		chip_name = "vega10";
		break;
	case CHIP_VEGA12:
		chip_name = "vega12";
		break;
	case CHIP_RAVEN:
		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
			chip_name = "raven2";
		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
			chip_name = "picasso";
		else
			chip_name = "raven";
		break;
	case CHIP_ARCTURUS:
		chip_name = "arcturus";
		break;
	case CHIP_NAVI12:
		if (adev->discovery.bin)
			return 0;
		chip_name = "navi12";
		break;
	case CHIP_CYAN_SKILLFISH:
		if (adev->discovery.bin)
			return 0;
		chip_name = "cyan_skillfish";
		break;
	}

	err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
				   AMDGPU_UCODE_OPTIONAL,
				   "amdgpu/%s_gpu_info.bin", chip_name);
	if (err) {
		dev_err(adev->dev,
			"Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n",
			chip_name);
		goto out;
	}

	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);

	switch (hdr->version_major) {
	case 1:
	{
		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
								le32_to_cpu(hdr->header.ucode_array_offset_bytes));

		/*
		 * Should be dropped when DAL no longer needs it.
		 */
		if (adev->asic_type == CHIP_NAVI12)
			goto parse_soc_bounding_box;

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * soc bounding box info is not integrated in discovery table,
		 * we always need to parse it from gpu info firmware if needed.
		 */
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}

/* Allocate and attach the per-device UID bookkeeping structure;
 * allocation failure is non-fatal (adev->uid_info stays NULL).
 */
static void amdgpu_uid_init(struct amdgpu_device *adev)
{
	/* Initialize the UID for the device */
	adev->uid_info = kzalloc_obj(struct amdgpu_uid);
	if (!adev->uid_info) {
		dev_warn(adev->dev, "Failed to allocate memory for UID\n");
		return;
	}
	adev->uid_info->adev = adev;
}

static void amdgpu_uid_fini(struct amdgpu_device *adev)
{
	/* Free the UID memory */
	kfree(adev->uid_info);
	adev->uid_info = NULL;
}

/**
 * amdgpu_device_ip_early_init - run early init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered each IP's early_init callback is run. This
 * is the first stage in initializing the asic.
 * Returns 0 on success, negative error code on failure.
2623 */ 2624 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2625 { 2626 struct amdgpu_ip_block *ip_block; 2627 struct pci_dev *parent; 2628 bool total, skip_bios; 2629 uint32_t bios_flags; 2630 int i, r; 2631 2632 amdgpu_device_enable_virtual_display(adev); 2633 2634 if (amdgpu_sriov_vf(adev)) { 2635 r = amdgpu_virt_request_full_gpu(adev, true); 2636 if (r) 2637 return r; 2638 2639 r = amdgpu_virt_init_critical_region(adev); 2640 if (r) 2641 return r; 2642 } 2643 2644 switch (adev->asic_type) { 2645 #ifdef CONFIG_DRM_AMDGPU_SI 2646 case CHIP_VERDE: 2647 case CHIP_TAHITI: 2648 case CHIP_PITCAIRN: 2649 case CHIP_OLAND: 2650 case CHIP_HAINAN: 2651 adev->family = AMDGPU_FAMILY_SI; 2652 r = si_set_ip_blocks(adev); 2653 if (r) 2654 return r; 2655 break; 2656 #endif 2657 #ifdef CONFIG_DRM_AMDGPU_CIK 2658 case CHIP_BONAIRE: 2659 case CHIP_HAWAII: 2660 case CHIP_KAVERI: 2661 case CHIP_KABINI: 2662 case CHIP_MULLINS: 2663 if (adev->flags & AMD_IS_APU) 2664 adev->family = AMDGPU_FAMILY_KV; 2665 else 2666 adev->family = AMDGPU_FAMILY_CI; 2667 2668 r = cik_set_ip_blocks(adev); 2669 if (r) 2670 return r; 2671 break; 2672 #endif 2673 case CHIP_TOPAZ: 2674 case CHIP_TONGA: 2675 case CHIP_FIJI: 2676 case CHIP_POLARIS10: 2677 case CHIP_POLARIS11: 2678 case CHIP_POLARIS12: 2679 case CHIP_VEGAM: 2680 case CHIP_CARRIZO: 2681 case CHIP_STONEY: 2682 if (adev->flags & AMD_IS_APU) 2683 adev->family = AMDGPU_FAMILY_CZ; 2684 else 2685 adev->family = AMDGPU_FAMILY_VI; 2686 2687 r = vi_set_ip_blocks(adev); 2688 if (r) 2689 return r; 2690 break; 2691 default: 2692 r = amdgpu_discovery_set_ip_blocks(adev); 2693 if (r) { 2694 adev->num_ip_blocks = 0; 2695 return r; 2696 } 2697 break; 2698 } 2699 2700 /* Check for IP version 9.4.3 with A0 hardware */ 2701 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2702 !amdgpu_device_get_rev_id(adev)) { 2703 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2704 return -ENODEV; /* device unsupported - no device error */ 
2705 } 2706 2707 if (amdgpu_has_atpx() && 2708 (amdgpu_is_atpx_hybrid() || 2709 amdgpu_has_atpx_dgpu_power_cntl()) && 2710 ((adev->flags & AMD_IS_APU) == 0) && 2711 !dev_is_removable(&adev->pdev->dev)) 2712 adev->flags |= AMD_IS_PX; 2713 2714 if (!(adev->flags & AMD_IS_APU)) { 2715 parent = pcie_find_root_port(adev->pdev); 2716 adev->has_pr3 = parent ? pci_pr3_present(parent) : false; 2717 } 2718 2719 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2720 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2721 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2722 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2723 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2724 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2725 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2726 2727 adev->virt.is_xgmi_node_migrate_enabled = false; 2728 if (amdgpu_sriov_vf(adev)) { 2729 adev->virt.is_xgmi_node_migrate_enabled = 2730 amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4); 2731 } 2732 2733 total = true; 2734 for (i = 0; i < adev->num_ip_blocks; i++) { 2735 ip_block = &adev->ip_blocks[i]; 2736 2737 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2738 dev_warn(adev->dev, "disabled ip block: %d <%s>\n", i, 2739 adev->ip_blocks[i].version->funcs->name); 2740 adev->ip_blocks[i].status.valid = false; 2741 } else if (ip_block->version->funcs->early_init) { 2742 r = ip_block->version->funcs->early_init(ip_block); 2743 if (r == -ENOENT) { 2744 adev->ip_blocks[i].status.valid = false; 2745 } else if (r) { 2746 dev_err(adev->dev, 2747 "early_init of IP block <%s> failed %d\n", 2748 adev->ip_blocks[i].version->funcs->name, 2749 r); 2750 total = false; 2751 } else { 2752 adev->ip_blocks[i].status.valid = true; 2753 } 2754 } else { 2755 adev->ip_blocks[i].status.valid = true; 2756 } 2757 /* get the vbios after the asic_funcs are set up */ 2758 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2759 r = amdgpu_device_parse_gpu_info_fw(adev); 
2760 if (r) 2761 return r; 2762 2763 bios_flags = amdgpu_device_get_vbios_flags(adev); 2764 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2765 /* Read BIOS */ 2766 if (!skip_bios) { 2767 bool optional = 2768 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2769 if (!amdgpu_get_bios(adev) && !optional) 2770 return -EINVAL; 2771 2772 if (optional && !adev->bios) 2773 dev_info( 2774 adev->dev, 2775 "VBIOS image optional, proceeding without VBIOS image"); 2776 2777 if (adev->bios) { 2778 r = amdgpu_atombios_init(adev); 2779 if (r) { 2780 dev_err(adev->dev, 2781 "amdgpu_atombios_init failed\n"); 2782 amdgpu_vf_error_put( 2783 adev, 2784 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2785 0, 0); 2786 return r; 2787 } 2788 } 2789 } 2790 2791 /*get pf2vf msg info at it's earliest time*/ 2792 if (amdgpu_sriov_vf(adev)) 2793 amdgpu_virt_init_data_exchange(adev); 2794 2795 } 2796 } 2797 if (!total) 2798 return -ENODEV; 2799 2800 if (adev->gmc.xgmi.supported) 2801 amdgpu_xgmi_early_init(adev); 2802 2803 if (amdgpu_is_multi_aid(adev)) 2804 amdgpu_uid_init(adev); 2805 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2806 if (ip_block->status.valid != false) 2807 amdgpu_amdkfd_device_probe(adev); 2808 2809 adev->cg_flags &= amdgpu_cg_mask; 2810 adev->pg_flags &= amdgpu_pg_mask; 2811 2812 return 0; 2813 } 2814 2815 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2816 { 2817 int i, r; 2818 2819 for (i = 0; i < adev->num_ip_blocks; i++) { 2820 if (!adev->ip_blocks[i].status.sw) 2821 continue; 2822 if (adev->ip_blocks[i].status.hw) 2823 continue; 2824 if (!amdgpu_ip_member_of_hwini( 2825 adev, adev->ip_blocks[i].version->type)) 2826 continue; 2827 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2828 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2829 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2830 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2831 if (r) { 
				dev_err(adev->dev,
					"hw_init of IP block <%s> failed %d\n",
					adev->ip_blocks[i].version->funcs->name,
					r);
				return r;
			}
			adev->ip_blocks[i].status.hw = true;
		}
	}

	return 0;
}

/* Phase 2: hw_init every remaining IP block that phase 1 did not bring up */
static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.sw)
			continue;
		if (adev->ip_blocks[i].status.hw)
			continue;
		if (!amdgpu_ip_member_of_hwini(
			    adev, adev->ip_blocks[i].version->type))
			continue;
		r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
		if (r) {
			dev_err(adev->dev,
				"hw_init of IP block <%s> failed %d\n",
				adev->ip_blocks[i].version->funcs->name, r);
			return r;
		}
		adev->ip_blocks[i].status.hw = true;
	}

	return 0;
}

/* Bring up the PSP block (hw_init, or resume when in reset/suspend) on
 * VEGA10 and newer, then load the SMU firmware where applicable.
 */
static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
{
	int r = 0;
	int i;
	uint32_t smu_version;

	if (adev->asic_type >= CHIP_VEGA10) {
		for (i = 0; i < adev->num_ip_blocks; i++) {
			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
				continue;

			if (!amdgpu_ip_member_of_hwini(adev,
						       AMD_IP_BLOCK_TYPE_PSP))
				break;

			if (!adev->ip_blocks[i].status.sw)
				continue;

			/* no need to do the fw loading again if already done*/
			if (adev->ip_blocks[i].status.hw == true)
				break;

			if (amdgpu_in_reset(adev) || adev->in_suspend) {
				r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
				if (r)
					return r;
			} else {
				r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
				if (r) {
					dev_err(adev->dev,
						"hw_init of IP block <%s> failed %d\n",
						adev->ip_blocks[i]
							.version->funcs->name,
						r);
					return r;
				}
				adev->ip_blocks[i].status.hw = true;
			}
			break;
		}
	}

	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);

	return r;
}

/* Create a DRM GPU scheduler (plus UVD/VCE entities) for every ring that
 * wants one; timeouts are chosen per ring type.
 */
static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
{
	struct drm_sched_init_args args = {
		.ops = &amdgpu_sched_ops,
		.num_rqs = DRM_SCHED_PRIORITY_COUNT,
		.timeout_wq = adev->reset_domain->wq,
		.dev = adev->dev,
	};
	long timeout;
	int r, i;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		/* No need to setup the GPU scheduler for rings that don't need it */
		if (!ring || ring->no_scheduler)
			continue;

		switch (ring->funcs->type) {
		case AMDGPU_RING_TYPE_GFX:
			timeout = adev->gfx_timeout;
			break;
		case AMDGPU_RING_TYPE_COMPUTE:
			timeout = adev->compute_timeout;
			break;
		case AMDGPU_RING_TYPE_SDMA:
			timeout = adev->sdma_timeout;
			break;
		default:
			timeout = adev->video_timeout;
			break;
		}

		args.timeout = timeout;
		args.credit_limit = ring->num_hw_submission;
		args.score = ring->sched_score;
		args.name = ring->name;

		r = drm_sched_init(&ring->sched, &args);
		if (r) {
			dev_err(adev->dev,
				"Failed to create scheduler on ring %s.\n",
				ring->name);
			return r;
		}
		r = amdgpu_uvd_entity_init(adev, ring);
		if (r) {
			dev_err(adev->dev,
				"Failed to create UVD scheduling entity on ring %s.\n",
				ring->name);
			return r;
		}
		r = amdgpu_vce_entity_init(adev, ring);
		if (r) {
			dev_err(adev->dev,
				"Failed to create VCE scheduling entity on ring %s.\n",
				ring->name);
			return r;
		}
	}

	if (adev->xcp_mgr)
		amdgpu_xcp_update_partition_sched_list(adev);

	return 0;
}


/**
 * amdgpu_device_ip_init - run init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main initialization pass for hardware IPs.
The list of all the hardware
 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
 * are run. sw_init initializes the software state associated with each IP
 * and hw_init initializes the hardware associated with each IP.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_init(struct amdgpu_device *adev)
{
	bool init_badpage;
	int i, r;

	r = amdgpu_ras_init(adev);
	if (r)
		return r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->sw_init) {
			r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]);
			if (r) {
				dev_err(adev->dev,
					"sw_init of IP block <%s> failed %d\n",
					adev->ip_blocks[i].version->funcs->name,
					r);
				goto init_failed;
			}
		}
		adev->ip_blocks[i].status.sw = true;

		if (!amdgpu_ip_member_of_hwini(
			    adev, adev->ip_blocks[i].version->type))
			continue;

		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
			/* need to do common hw init early so everything is set up for gmc */
			r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
			if (r) {
				dev_err(adev->dev, "hw_init %d failed %d\n", i,
					r);
				goto init_failed;
			}
			adev->ip_blocks[i].status.hw = true;
		} else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
			/* need to do gmc hw init early so we can allocate gpu mem */
			/* Try to reserve bad pages early */
			if (amdgpu_sriov_vf(adev))
				amdgpu_virt_exchange_data(adev);

			r = amdgpu_device_mem_scratch_init(adev);
			if (r) {
				dev_err(adev->dev,
					"amdgpu_mem_scratch_init failed %d\n",
					r);
				goto init_failed;
			}
			r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
			if (r) {
				dev_err(adev->dev, "hw_init %d failed %d\n", i,
					r);
				goto init_failed;
			}
			r = amdgpu_device_wb_init(adev);
			if (r) {
				dev_err(adev->dev,
					"amdgpu_device_wb_init failed %d\n", r);
				goto init_failed;
			}
			adev->ip_blocks[i].status.hw = true;

			/* right after GMC hw init, we create CSA */
			if (adev->gfx.mcbp) {
				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
							       AMDGPU_GEM_DOMAIN_VRAM |
							       AMDGPU_GEM_DOMAIN_GTT,
							       AMDGPU_CSA_SIZE);
				if (r) {
					dev_err(adev->dev,
						"allocate CSA failed %d\n", r);
					goto init_failed;
				}
			}

			r = amdgpu_seq64_init(adev);
			if (r) {
				dev_err(adev->dev, "allocate seq64 failed %d\n",
					r);
				goto init_failed;
			}
		}
	}

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_init_data_exchange(adev);

	r = amdgpu_ib_pool_init(adev);
	if (r) {
		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
		goto init_failed;
	}

	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
	if (r)
		goto init_failed;

	r = amdgpu_device_ip_hw_init_phase1(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_ip_hw_init_phase2(adev);
	if (r)
		goto init_failed;

	/*
	 * retired pages will be loaded from eeprom and reserved here,
	 * it should be called after amdgpu_device_ip_hw_init_phase2 since
	 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
	 * for I2C communication which only true at this point.
	 *
	 * amdgpu_ras_recovery_init may fail, but the upper only cares the
	 * failure from bad gpu situation and stop amdgpu init process
	 * accordingly. For other failed cases, it will still release all
	 * the resource and print error message, rather than returning one
	 * negative value to upper level.
	 *
	 * Note: theoretically, this should be called before all vram allocations
	 * to protect retired page from abusing
	 */
	init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
	r = amdgpu_ras_recovery_init(adev, init_badpage);
	if (r)
		goto init_failed;

	/**
	 * In case of XGMI grab extra reference for reset domain for this device
	 */
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (amdgpu_xgmi_add_device(adev) == 0) {
			if (!amdgpu_sriov_vf(adev)) {
				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

				if (WARN_ON(!hive)) {
					r = -ENOENT;
					goto init_failed;
				}

				if (!hive->reset_domain ||
				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
					r = -ENOENT;
					amdgpu_put_xgmi_hive(hive);
					goto init_failed;
				}

				/* Drop the early temporary reset domain we created for device */
				amdgpu_reset_put_reset_domain(adev->reset_domain);
				adev->reset_domain = hive->reset_domain;
				amdgpu_put_xgmi_hive(hive);
			}
		}
	}

	r = amdgpu_device_init_schedulers(adev);
	if (r)
		goto init_failed;

	if (adev->mman.buffer_funcs_ring &&
	    adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);

	/* Don't init kfd if whole hive need to be reset during init */
	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
		amdgpu_amdkfd_device_init(adev);
	}

	amdgpu_fru_get_product_info(adev);

	r = amdgpu_cper_init(adev);

init_failed:

	return r;
}

/**
 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
 *
 * @adev: amdgpu_device pointer
 *
 * Writes a reset magic value to the gart pointer in
VRAM. The driver calls 3185 * this function before a GPU reset. If the value is retained after a 3186 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 3187 */ 3188 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3189 { 3190 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3191 } 3192 3193 /** 3194 * amdgpu_device_check_vram_lost - check if vram is valid 3195 * 3196 * @adev: amdgpu_device pointer 3197 * 3198 * Checks the reset magic value written to the gart pointer in VRAM. 3199 * The driver calls this after a GPU reset to see if the contents of 3200 * VRAM is lost or now. 3201 * returns true if vram is lost, false if not. 3202 */ 3203 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3204 { 3205 if (memcmp(adev->gart.ptr, adev->reset_magic, 3206 AMDGPU_RESET_MAGIC_NUM)) 3207 return true; 3208 3209 if (!amdgpu_in_reset(adev)) 3210 return false; 3211 3212 /* 3213 * For all ASICs with baco/mode1 reset, the VRAM is 3214 * always assumed to be lost. 3215 */ 3216 switch (amdgpu_asic_reset_method(adev)) { 3217 case AMD_RESET_METHOD_LEGACY: 3218 case AMD_RESET_METHOD_LINK: 3219 case AMD_RESET_METHOD_BACO: 3220 case AMD_RESET_METHOD_MODE1: 3221 return true; 3222 default: 3223 return false; 3224 } 3225 } 3226 3227 /** 3228 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3229 * 3230 * @adev: amdgpu_device pointer 3231 * @state: clockgating state (gate or ungate) 3232 * 3233 * The list of all the hardware IPs that make up the asic is walked and the 3234 * set_clockgating_state callbacks are run. 3235 * Late initialization pass enabling clockgating for hardware IPs. 3236 * Fini or suspend, pass disabling clockgating for hardware IPs. 3237 * Returns 0 on success, negative error code on failure. 
 */

int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
			       enum amd_clockgating_state state)
{
	int i, j, r;

	/* Emulation has no real clock tree to gate. */
	if (amdgpu_emu_mode == 1)
		return 0;

	for (j = 0; j < adev->num_ip_blocks; j++) {
		/* gate in forward IP order, ungate in reverse order */
		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		if (!adev->ip_blocks[i].version)
			continue;
		/* skip CG for GFX, SDMA on S0ix */
		if (adev->in_s0ix &&
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
			continue;
		/* skip CG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
			/* enable clockgating to save power */
			r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i],
										     state);
			if (r) {
				/* NOTE(review): message hardcodes "(gate)" even
				 * when @state requests ungating.
				 */
				dev_err(adev->dev,
					"set_clockgating_state(gate) of IP block <%s> failed %d\n",
					adev->ip_blocks[i].version->funcs->name,
					r);
				return r;
			}
		}
	}

	return 0;
}

/* Counterpart of amdgpu_device_set_cg_state() for powergating; same walk
 * order and S0ix/multimedia skip rules.
 * Returns 0 on success, negative error code on failure.
 */
int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
			       enum amd_powergating_state state)
{
	int i, j, r;

	if (amdgpu_emu_mode == 1)
		return 0;

	for (j = 0; j < adev->num_ip_blocks; j++) {
		/* gate in forward IP order, ungate in reverse order */
		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		if (!adev->ip_blocks[i].version)
			continue;
		/* skip PG for GFX, SDMA on S0ix */
		if (adev->in_s0ix &&
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
			continue;
		/* skip PG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
			/* enable powergating to save power */
			r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i],
										     state);
			if (r) {
				/* NOTE(review): message hardcodes "(gate)" even
				 * when @state requests ungating.
				 */
				dev_err(adev->dev,
					"set_powergating_state(gate) of IP block <%s> failed %d\n",
					adev->ip_blocks[i].version->funcs->name,
					r);
				return r;
			}
		}
	}
	return 0;
}

/* Enable the MGPU fan boost feature once per dGPU instance; APUs and
 * multi-VF SR-IOV instances are excluded.  Returns 0 on success or the
 * first DPM error encountered.
 */
static int amdgpu_device_enable_mgpu_fan_boost(void)
{
	struct amdgpu_gpu_instance *gpu_ins;
	struct amdgpu_device *adev;
	int i, ret = 0;

	mutex_lock(&mgpu_info.mutex);

	/*
	 * MGPU fan boost feature should be enabled
	 * only when there are two or more dGPUs in
	 * the system
	 */
	if (mgpu_info.num_dgpu < 2)
		goto out;

	for (i = 0; i < mgpu_info.num_dgpu; i++) {
		gpu_ins = &(mgpu_info.gpu_ins[i]);
		adev = gpu_ins->adev;
		if (!(adev->flags & AMD_IS_APU || amdgpu_sriov_multi_vf_mode(adev)) &&
		    !gpu_ins->mgpu_fan_enabled) {
			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
			if (ret)
				break;

			gpu_ins->mgpu_fan_enabled = 1;
		}
	}

out:
	mutex_unlock(&mgpu_info.mutex);

	return ret;
}

/**
 * amdgpu_device_ip_late_init
- run late init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of them have been initialized or something that needs to happen
 * late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
			if (r) {
				dev_err(adev->dev,
					"late_init of IP block <%s> failed %d\n",
					adev->ip_blocks[i].version->funcs->name,
					r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	r = amdgpu_ras_late_init(adev);
	if (r) {
		dev_err(adev->dev, "amdgpu_ras_late_init failed %d", r);
		return r;
	}

	if (!amdgpu_reset_in_recovery(adev))
		amdgpu_ras_set_error_query_ready(adev, true);

	/* With all IPs late-initialized, gate clocks and power to save power. */
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		dev_err(adev->dev, "enable mgpu fan boost failed (%d).\n", r);

	/* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
	if (amdgpu_passthrough(adev) &&
	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
	     adev->asic_type == CHIP_ALDEBARAN))
		amdgpu_dpm_handle_passthrough_sbr(adev, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset device p-state to low as this was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, it's unknown how many devices are in the hive in advance.
		 * As this is counted one by one during devices initializations.
		 *
		 * So, we wait for all XGMI interlinked devices initialized.
		 * This may bring some delays as those devices may come from
		 * different hives. But that should be OK.
		 */
		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
			for (i = 0; i < mgpu_info.num_gpu; i++) {
				gpu_instance = &(mgpu_info.gpu_ins[i]);
				if (gpu_instance->adev->flags & AMD_IS_APU)
					continue;

				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
							   AMDGPU_XGMI_PSTATE_MIN);
				if (r) {
					dev_err(adev->dev,
						"pstate setting failed (%d).\n",
						r);
					break;
				}
			}
		}

		mutex_unlock(&mgpu_info.mutex);
	}

	return 0;
}

/* Run an IP block's hw_fini callback (if defined) and mark the block's
 * hardware as down; callback errors are logged but not propagated.
 */
static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block)
{
	struct amdgpu_device *adev = ip_block->adev;
	int r;

	if (!ip_block->version->funcs->hw_fini) {
		dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n",
			ip_block->version->funcs->name);
	} else {
		r = ip_block->version->funcs->hw_fini(ip_block);
		/* XXX handle errors */
		if (r) {
			dev_dbg(adev->dev,
				"hw_fini of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
		}
	}

	ip_block->status.hw = false;
}

/**
 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
 *
 * @adev: amdgpu_device pointer
 *
 * For ASICs that need to disable the SMC first (only applies to GC IP
 * versions up to 9.0.0; newer parts return early).
 */
static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
{
	int i;

	if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
		return;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
			amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
			break;
		}
	}
}

/* Early teardown: run early_fini callbacks, quiesce KFD/user queues, then
 * hw_fini all IP blocks in reverse order.  Always returns 0.
 */
static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].version)
			continue;
		if (!adev->ip_blocks[i].version->funcs->early_fini)
			continue;

		r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
		if (r) {
			dev_dbg(adev->dev,
				"early_fini of IP block <%s> failed %d\n",
				adev->ip_blocks[i].version->funcs->name, r);
		}
	}

	amdgpu_amdkfd_suspend(adev, true);
	amdgpu_amdkfd_teardown_processes(adev);
	amdgpu_userq_suspend(adev);

	/* Workaround for ASICs need to disable SMC first */
	amdgpu_device_smu_fini_early(adev);

	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.hw)
			continue;

		amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
	}

	if (amdgpu_sriov_vf(adev)) {
		if (amdgpu_virt_release_full_gpu(adev, false))
			dev_err(adev->dev,
				"failed to release exclusive mode on fini\n");
	}

	/*
	 * Driver reload on the APU can fail due to firmware validation because
	 * the PSP is always running, as it is shared across the whole SoC.
	 * This same issue does not occur on dGPU because it has a mechanism
	 * that checks whether the PSP is running. A solution for those issues
	 * in the APU is to trigger a GPU reset, but this should be done during
	 * the unload phase to avoid adding boot latency and screen flicker.
	 */
	if ((adev->flags & AMD_IS_APU) && !adev->gmc.is_app_apu) {
		r = amdgpu_asic_reset(adev);
		if (r)
			dev_err(adev->dev, "asic reset on %s failed\n", __func__);
	}

	return 0;
}

/**
 * amdgpu_device_ip_fini - run fini for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main teardown pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
 * are run. hw_fini tears down the hardware associated with each IP
 * and sw_fini tears down any software state associated with each IP.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
{
	int i, r;

	amdgpu_cper_fini(adev);

	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
		amdgpu_virt_release_ras_err_handler_data(adev);

	if (adev->gmc.xgmi.num_physical_nodes > 1)
		amdgpu_xgmi_remove_device(adev);

	amdgpu_amdkfd_device_fini_sw(adev);

	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.sw)
			continue;

		if (!adev->ip_blocks[i].version)
			continue;
		/* Shared buffers piggyback on the GMC block: free them just
		 * before the GMC's own sw_fini runs.
		 */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
			amdgpu_ucode_free_bo(adev);
			amdgpu_free_static_csa(&adev->virt.csa_obj);
			amdgpu_device_wb_fini(adev);
			amdgpu_device_mem_scratch_fini(adev);
			amdgpu_ib_pool_fini(adev);
			amdgpu_seq64_fini(adev);
			amdgpu_doorbell_fini(adev);
		}
		if (adev->ip_blocks[i].version->funcs->sw_fini) {
			r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]);
			/* XXX handle errors */
			if (r) {
				dev_dbg(adev->dev,
					"sw_fini of IP block <%s> failed %d\n",
					adev->ip_blocks[i].version->funcs->name,
					r);
			}
		}
		adev->ip_blocks[i].status.sw = false;
		adev->ip_blocks[i].status.valid = false;
	}

	/* Second reverse pass: undo whatever late_init set up. */
	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		if (!adev->ip_blocks[i].version)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_fini)
			adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]);
		adev->ip_blocks[i].status.late_initialized = false;
	}

	amdgpu_ras_fini(adev);
	amdgpu_uid_fini(adev);

	return 0;
}

/**
 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
 *
 * @work: work_struct.
 */
static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
{
	struct amdgpu_device *adev =
		container_of(work, struct amdgpu_device, delayed_init_work.work);
	int r;

	r = amdgpu_ib_ring_tests(adev);
	if (r)
		dev_err(adev->dev, "ib ring test failed (%d).\n", r);
}

/* Delayed work: ask the SMU to enter GFXOFF once the delay expires; both
 * WARNs assert that no GFXOFF request is still pending at this point.
 */
static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
{
	struct amdgpu_device *adev =
		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);

	WARN_ON_ONCE(adev->gfx.gfx_off_state);
	WARN_ON_ONCE(adev->gfx.gfx_off_req_count);

	if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0))
		adev->gfx.gfx_off_state = true;
}

/**
 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
 *
 * @adev: amdgpu_device pointer
 *
 * Main suspend function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked, clockgating is disabled and the
 * suspend callbacks are run. suspend puts the hardware and software state
 * in each IP into a state suitable for suspend.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
{
	int i, r, rec;

	/* Ungate everything so IPs are fully powered/clocked for suspend. */
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);

	/*
	 * Per PMFW team's suggestion, driver needs to handle gfxoff
	 * and df cstate features disablement for gpu reset(e.g. Mode1Reset)
	 * scenario. Add the missing df cstate disablement here.
	 */
	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
		dev_warn(adev->dev, "Failed to disallow df cstate");

	/* Phase 1 only suspends display (DCE) blocks. */
	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.valid)
			continue;

		/* displays are handled separately */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
			continue;

		r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
		if (r)
			goto unwind;
	}

	return 0;
unwind:
	/* Resume any displays already suspended, then restore the cstate and
	 * gating changes made above; the original error is still returned.
	 */
	rec = amdgpu_device_ip_resume_phase3(adev);
	if (rec)
		dev_err(adev->dev,
			"amdgpu_device_ip_resume_phase3 failed during unwind: %d\n",
			rec);

	amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW);

	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);

	return r;
}

/**
 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
 *
 * @adev: amdgpu_device pointer
 *
 * Main suspend function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked, clockgating is disabled and the
 * suspend callbacks are run. suspend puts the hardware and software state
 * in each IP into a state suitable for suspend.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
{
	int i, r, rec;

	if (adev->in_s0ix)
		amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);

	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		/* displays are handled in phase1 */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
			continue;
		/* PSP lost connection when err_event_athub occurs */
		if (amdgpu_ras_intr_triggered() &&
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
			adev->ip_blocks[i].status.hw = false;
			continue;
		}

		/* skip unnecessary suspend if we do not initialize them yet */
		if (!amdgpu_ip_member_of_hwini(
			    adev, adev->ip_blocks[i].version->type))
			continue;

		/* Since we skip suspend for S0i3, we need to cancel the delayed
		 * idle work here as the suspend callback never gets called.
		 */
		if (adev->in_s0ix &&
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX &&
		    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0))
			cancel_delayed_work_sync(&adev->gfx.idle_work);
		/* skip suspend of gfx/mes and psp for S0ix
		 * gfx is in gfxoff state, so on resume it will exit gfxoff just
		 * like at runtime. PSP is also part of the always on hardware
		 * so no need to suspend it.
		 */
		if (adev->in_s0ix &&
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
			continue;

		/* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
		if (adev->in_s0ix &&
		    (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >=
		     IP_VERSION(5, 0, 0)) &&
		    (adev->ip_blocks[i].version->type ==
		     AMD_IP_BLOCK_TYPE_SDMA))
			continue;

		/* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot.
		 * These are in TMR, hence are expected to be reused by PSP-TOS to reload
		 * from this location and RLC Autoload automatically also gets loaded
		 * from here based on PMFW -> PSP message during re-init sequence.
		 * Therefore, the psp suspend & resume should be skipped to avoid destroy
		 * the TMR and reload FWs again for IMU enabled APU ASICs.
		 */
		if (amdgpu_in_reset(adev) &&
		    (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
			continue;

		r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
		if (r)
			goto unwind;

		/* handle putting the SMC in the appropriate state */
		if (!amdgpu_sriov_vf(adev)) {
			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
				if (r) {
					dev_err(adev->dev,
						"SMC failed to set mp1 state %d, %d\n",
						adev->mp1_state, r);
					goto unwind;
				}
			}
		}
	}

	return 0;
unwind:
	/* suspend phase 2 = resume phase 1 + resume phase 2 */
	rec = amdgpu_device_ip_resume_phase1(adev);
	if (rec) {
		dev_err(adev->dev,
			"amdgpu_device_ip_resume_phase1 failed during unwind: %d\n",
			rec);
		return r;
	}

	rec = amdgpu_device_fw_loading(adev);
	if (rec) {
		dev_err(adev->dev,
			"amdgpu_device_fw_loading failed during unwind: %d\n",
			rec);
		return r;
	}

	rec = amdgpu_device_ip_resume_phase2(adev);
	if (rec) {
		dev_err(adev->dev,
			"amdgpu_device_ip_resume_phase2 failed during unwind: %d\n",
			rec);
		return r;
	}

	return r;
}

/**
 * amdgpu_device_ip_suspend - run suspend for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main suspend function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked, clockgating is disabled and the
 * suspend callbacks are run. suspend puts the hardware and software state
 * in each IP into a state suitable for suspend.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
{
	int r;

	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_fini_data_exchange(adev);
		amdgpu_virt_request_full_gpu(adev, false);
	}

	amdgpu_ttm_set_buffer_funcs_status(adev, false);

	r = amdgpu_device_ip_suspend_phase1(adev);
	if (r)
		return r;
	r = amdgpu_device_ip_suspend_phase2(adev);

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, false);

	return r;
}

/* SR-IOV re-init (early pass): hw_init COMMON, GMC, PSP and IH in that
 * fixed order; all blocks' hw status is cleared first.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
{
	int i, r;

	static enum amd_ip_block_type ip_order[] = {
		AMD_IP_BLOCK_TYPE_COMMON,
		AMD_IP_BLOCK_TYPE_GMC,
		AMD_IP_BLOCK_TYPE_PSP,
		AMD_IP_BLOCK_TYPE_IH,
	};

	for (i = 0; i < adev->num_ip_blocks; i++) {
		int j;
		struct amdgpu_ip_block *block;

		block = &adev->ip_blocks[i];
		block->status.hw = false;

		for (j = 0; j < ARRAY_SIZE(ip_order); j++) {

			if (block->version->type != ip_order[j] ||
			    !block->status.valid)
				continue;

			r = block->version->funcs->hw_init(&adev->ip_blocks[i]);
			if (r) {
				dev_err(adev->dev, "RE-INIT-early: %s failed\n",
					block->version->funcs->name);
				return r;
			}
			block->status.hw = true;
		}
	}

	return 0;
}

/* SR-IOV re-init (late pass): bring up the remaining blocks in a fixed
 * order; the SMC block goes through the resume path, all others through
 * hw_init.  Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
{
	struct amdgpu_ip_block *block;
	int i, r = 0;

	static enum amd_ip_block_type ip_order[] = {
		AMD_IP_BLOCK_TYPE_SMC,
		AMD_IP_BLOCK_TYPE_DCE,
		AMD_IP_BLOCK_TYPE_GFX,
		AMD_IP_BLOCK_TYPE_SDMA,
		AMD_IP_BLOCK_TYPE_MES,
		AMD_IP_BLOCK_TYPE_UVD,
		AMD_IP_BLOCK_TYPE_VCE,
		AMD_IP_BLOCK_TYPE_VCN,
		AMD_IP_BLOCK_TYPE_JPEG
	};

	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
		block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]);

		if (!block)
			continue;

		if (block->status.valid && !block->status.hw) {
			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) {
				r = amdgpu_ip_block_resume(block);
			} else {
				r = block->version->funcs->hw_init(block);
			}

			if (r) {
				dev_err(adev->dev, "RE-INIT-late: %s failed\n",
					block->version->funcs->name);
				break;
			}
			block->status.hw = true;
		}
	}

	return r;
}

/**
 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * First resume function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the resume callbacks are run for
 * COMMON, GMC, and IH. resume puts the hardware into a functional state
 * after a suspend and updates the software state as necessary. This
 * function is also used for restoring the GPU after a GPU reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		/* PSP is included here only on SR-IOV VFs. */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {

			r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Second resume function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the resume callbacks are run for
 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
 * functional state after a suspend and updates the software state as
 * necessary. This function is also used for restoring the GPU after a GPU
 * reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		/* Blocks handled by phase1 (and DCE/PSP, handled elsewhere)
		 * are skipped here.
		 */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
			continue;
		r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
		if (r)
			return r;
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Third resume function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the resume callbacks are run for
 * all DCE. resume puts the hardware into a functional state after a suspend
 * and updates the software state as necessary. This function is also used
 * for restoring the GPU after a GPU reset.
 *
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
			r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs.
The hardware IPs
 * are split into two resume functions because they are
 * also used in recovering from a GPU reset and some additional
 * steps need to be taken between them. In this case (S3/S4) they are
 * run sequentially.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_device_ip_resume_phase1(adev);
	if (r)
		return r;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	r = amdgpu_device_ip_resume_phase2(adev);

	/* NOTE(review): buffer_funcs_ring is dereferenced without the NULL
	 * check used on the init path — confirm it is always non-NULL here.
	 */
	if (adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);

	if (r)
		return r;

	amdgpu_fence_driver_hw_init(adev);

	r = amdgpu_device_ip_resume_phase3(adev);

	return r;
}

/**
 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
 *
 * @adev: amdgpu_device pointer
 *
 * Query the VBIOS data tables to determine if the board supports SR-IOV.
 */
static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev)) {
		if (adev->is_atom_fw) {
			if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		} else {
			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		}

		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
	}
}

/**
 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
 *
 * @pdev : pci device context
 * @asic_type: AMD asic type
 *
 * Check if there is DC (new modesetting infrastructure) support for an asic.
 * returns true if DC has support, false if not.
 */
bool amdgpu_device_asic_has_dc_support(struct pci_dev *pdev,
				       enum amd_asic_type asic_type)
{
	switch (asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#if defined(CONFIG_DRM_AMD_DC)
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
		/* SI ASICs additionally need the DC_SI config option. */
		return amdgpu_dc != 0 && IS_ENABLED(CONFIG_DRM_AMD_DC_SI);
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		/*
		 * We have systems in the wild with these ASICs that require
		 * TRAVIS and NUTMEG support which is not supported with DC.
		 *
		 * Fallback to the non-DC driver here by default so as not to
		 * cause regressions.
		 */
		return amdgpu_dc > 0;
	default:
		return amdgpu_dc != 0;
#else
	default:
		if (amdgpu_dc > 0)
			dev_info_once(
				&pdev->dev,
				"Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
		return false;
#endif
	}
}

/**
 * amdgpu_device_has_dc_support - check if dc is supported
 *
 * @adev: amdgpu_device pointer
 *
 * Returns true for supported, false for not supported
 */
bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
{
	/* Virtual display and harvested DMU both rule out DC. */
	if (adev->enable_virtual_display ||
	    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
		return false;

	return amdgpu_device_asic_has_dc_support(adev->pdev, adev->asic_type);
}

/* Per-device work item performing a hive-synchronized XGMI ASIC reset. */
static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
{
	struct amdgpu_device *adev =
		container_of(__work, struct amdgpu_device, xgmi_reset_work);
	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

	/* It's a bug to not have a hive within this function */
	if (WARN_ON(!hive))
		return;

	/*
	 * Use task barrier to synchronize all xgmi reset
works across the
	 * hive. task_barrier_enter and task_barrier_exit will block
	 * until all the threads running the xgmi reset works reach
	 * those points. task_barrier_full will do both blocks.
	 */
	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {

		task_barrier_enter(&hive->tb);
		adev->asic_reset_res = amdgpu_device_baco_enter(adev);

		if (adev->asic_reset_res)
			goto fail;

		task_barrier_exit(&hive->tb);
		adev->asic_reset_res = amdgpu_device_baco_exit(adev);

		if (adev->asic_reset_res)
			goto fail;

		amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
	} else {

		task_barrier_full(&hive->tb);
		adev->asic_reset_res = amdgpu_asic_reset(adev);
	}

fail:
	if (adev->asic_reset_res)
		dev_warn(adev->dev,
			 "ASIC reset failed with error, %d for drm dev, %s",
			 adev->asic_reset_res, adev_to_drm(adev)->unique);
	amdgpu_put_xgmi_hive(hive);
}

/* Parse the amdgpu_lockup_timeout module parameter (comma-separated list of
 * gfx, compute, sdma, video timeouts in ms; 0 keeps the default, negative
 * disables the timeout) into the per-queue jiffies values on @adev.
 * Returns 0 on success or the kstrtol() error for malformed input.
 */
static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
{
	char buf[AMDGPU_MAX_TIMEOUT_PARAM_LENGTH];
	char *input = buf;
	char *timeout_setting = NULL;
	int index = 0;
	long timeout;
	int ret = 0;

	/* By default timeout for all queues is 2 sec */
	adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
		adev->video_timeout = msecs_to_jiffies(2000);

	if (!strnlen(amdgpu_lockup_timeout, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
		return 0;

	/*
	 * strsep() destructively modifies its input by replacing delimiters
	 * with '\0'. Use a stack copy so the global module parameter buffer
	 * remains intact for multi-GPU systems where this function is called
	 * once per device.
	 */
	strscpy(buf, amdgpu_lockup_timeout, sizeof(buf));

	while ((timeout_setting = strsep(&input, ",")) &&
	       strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
		ret = kstrtol(timeout_setting, 0, &timeout);
		if (ret)
			return ret;

		if (timeout == 0) {
			index++;
			continue;
		} else if (timeout < 0) {
			timeout = MAX_SCHEDULE_TIMEOUT;
			dev_warn(adev->dev, "lockup timeout disabled");
			add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
		} else {
			timeout = msecs_to_jiffies(timeout);
		}

		switch (index++) {
		case 0:
			adev->gfx_timeout = timeout;
			break;
		case 1:
			adev->compute_timeout = timeout;
			break;
		case 2:
			adev->sdma_timeout = timeout;
			break;
		case 3:
			adev->video_timeout = timeout;
			break;
		default:
			break;
		}
	}

	/* When only one value specified apply it to all queues. */
	if (index == 1)
		adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
			adev->video_timeout = timeout;

	return ret;
}

/**
 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
 *
 * @adev: amdgpu_device pointer
 *
 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode
 */
static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
{
	struct iommu_domain *domain;

	domain = iommu_get_domain_for_dev(adev->dev);
	if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
		adev->ram_is_direct_mapped = true;
}

#if defined(CONFIG_HSA_AMD_P2P)
/**
 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
 *
 * @adev: amdgpu_device pointer
 *
 * return if IOMMU remapping bar address
 */
static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
{
	struct iommu_domain *domain;

	domain = iommu_get_domain_for_dev(adev->dev);
	/* A (flush-queued) DMA domain means addresses are remapped by the IOMMU */
	if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
		       domain->type == IOMMU_DOMAIN_DMA_FQ))
		return true;

	return false;
}
#endif

/*
 * Resolve whether Mid Command Buffer Preemption (MCBP) is enabled:
 * honour the module parameter when it was set explicitly (1/0), and
 * force it on under SR-IOV where it is mandatory.
 */
static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
{
	if (amdgpu_mcbp == 1)
		adev->gfx.mcbp = true;
	else if (amdgpu_mcbp == 0)
		adev->gfx.mcbp = false;

	/* SR-IOV VFs always need MCBP, regardless of the module parameter */
	if (amdgpu_sriov_vf(adev))
		adev->gfx.mcbp = true;

	if (adev->gfx.mcbp)
		dev_info(adev->dev, "MCBP is enabled\n");
}

/*
 * Register the device's sysfs interfaces. Individual registration
 * failures are logged but deliberately non-fatal — the device keeps
 * working without the affected interface. Note the return value only
 * reflects the last checked registration (board attributes).
 */
static int amdgpu_device_sys_interface_init(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_atombios_sysfs_init(adev);
	if (r)
		drm_err(&adev->ddev,
			"registering atombios sysfs failed (%d).\n", r);

	r = amdgpu_pm_sysfs_init(adev);
	if (r)
		dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r);

	r = amdgpu_ucode_sysfs_init(adev);
	if (r) {
		/* remember the outcome so fini only tears down what exists */
		adev->ucode_sysfs_en = false;
		dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r);
	} else
		adev->ucode_sysfs_en = true;

	r = amdgpu_device_attr_sysfs_init(adev);
	if (r)
		dev_err(adev->dev, "Could not create amdgpu device attr\n");

	/* devm-managed: automatically removed when the device goes away */
	r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
	if (r)
		dev_err(adev->dev,
			"Could not create amdgpu board attributes\n");

	amdgpu_fru_sysfs_init(adev);
	amdgpu_reg_state_sysfs_init(adev);
	amdgpu_xcp_sysfs_init(adev);
	amdgpu_uma_sysfs_init(adev);

	return r;
}

/* Tear down the sysfs interfaces created by amdgpu_device_sys_interface_init() */
static void amdgpu_device_sys_interface_fini(struct amdgpu_device *adev)
{
	if (adev->pm.sysfs_initialized)
		amdgpu_pm_sysfs_fini(adev);
	if
(adev->ucode_sysfs_en)
		amdgpu_ucode_sysfs_fini(adev);
	amdgpu_device_attr_sysfs_fini(adev);
	amdgpu_fru_sysfs_fini(adev);

	amdgpu_reg_state_sysfs_fini(adev);
	amdgpu_xcp_sysfs_fini(adev);
	amdgpu_uma_sysfs_fini(adev);
}

/**
 * amdgpu_device_init - initialize the driver
 *
 * @adev: amdgpu_device pointer
 * @flags: driver flags
 *
 * Initializes the driver info and hw (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver startup.
 */
int amdgpu_device_init(struct amdgpu_device *adev,
		       uint32_t flags)
{
	struct pci_dev *pdev = adev->pdev;
	int r, i;
	bool px = false;
	u32 max_MBps;
	int tmp;

	adev->shutdown = false;
	adev->flags = flags;

	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
		adev->asic_type = amdgpu_force_asic_type;
	else
		adev->asic_type = flags & AMD_ASIC_MASK;

	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
	if (amdgpu_emu_mode == 1)
		adev->usec_timeout *= 10;
	adev->gmc.gart_size = 512 * 1024 * 1024;
	adev->accel_working = false;
	adev->num_rings = 0;
	RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
	adev->mman.buffer_funcs = NULL;
	adev->mman.buffer_funcs_ring = NULL;
	adev->vm_manager.vm_pte_funcs = NULL;
	adev->vm_manager.vm_pte_num_scheds = 0;
	adev->gmc.gmc_funcs = NULL;
	adev->harvest_ip_mask = 0x0;
	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);

	/*
	 * Point every register accessor at an "invalid" stub; the real
	 * per-ASIC callbacks are installed later during IP early init.
	 */
	adev->smc_rreg = &amdgpu_invalid_rreg;
	adev->smc_wreg = &amdgpu_invalid_wreg;
	adev->pcie_rreg = &amdgpu_invalid_rreg;
	adev->pcie_wreg = &amdgpu_invalid_wreg;
	adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
	adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
	adev->pciep_rreg = &amdgpu_invalid_rreg;
	adev->pciep_wreg = &amdgpu_invalid_wreg;
	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
	adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext;
	adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;
	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
	adev->didt_rreg = &amdgpu_invalid_rreg;
	adev->didt_wreg = &amdgpu_invalid_wreg;
	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;

	dev_info(
		adev->dev,
		"initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
		amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
		pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);

	/* mutex initialization are all done here so we
	 * can recall function without having locking issues
	 */
	mutex_init(&adev->firmware.mutex);
	mutex_init(&adev->pm.mutex);
	mutex_init(&adev->gfx.gpu_clock_mutex);
	mutex_init(&adev->srbm_mutex);
	mutex_init(&adev->gfx.pipe_reserve_mutex);
	mutex_init(&adev->gfx.gfx_off_mutex);
	mutex_init(&adev->gfx.partition_mutex);
	mutex_init(&adev->grbm_idx_mutex);
	mutex_init(&adev->mn_lock);
	mutex_init(&adev->virt.vf_errors.lock);
	hash_init(adev->mn_hash);
	mutex_init(&adev->psp.mutex);
	mutex_init(&adev->notifier_lock);
	mutex_init(&adev->pm.stable_pstate_ctx_lock);
	mutex_init(&adev->benchmark_mutex);
	mutex_init(&adev->gfx.reset_sem_mutex);
	/* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
	mutex_init(&adev->enforce_isolation_mutex);
	for (i = 0; i < MAX_XCP; ++i) {
		adev->isolation[i].spearhead = dma_fence_get_stub();
		amdgpu_sync_create(&adev->isolation[i].active);
		amdgpu_sync_create(&adev->isolation[i].prev);
	}
	mutex_init(&adev->gfx.userq_sch_mutex);
	mutex_init(&adev->gfx.workload_profile_mutex);
	mutex_init(&adev->vcn.workload_profile_mutex);

	amdgpu_device_init_apu_flags(adev);

	r = amdgpu_device_check_arguments(adev);
	if (r)
		return r;

	spin_lock_init(&adev->mmio_idx_lock);
	spin_lock_init(&adev->smc_idx_lock);
	spin_lock_init(&adev->pcie_idx_lock);
	spin_lock_init(&adev->uvd_ctx_idx_lock);
	spin_lock_init(&adev->didt_idx_lock);
	spin_lock_init(&adev->gc_cac_idx_lock);
	spin_lock_init(&adev->se_cac_idx_lock);
	spin_lock_init(&adev->audio_endpt_idx_lock);
	spin_lock_init(&adev->mm_stats.lock);
	spin_lock_init(&adev->virt.rlcg_reg_lock);
	spin_lock_init(&adev->wb.lock);

	xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ);

	INIT_LIST_HEAD(&adev->reset_list);

	INIT_LIST_HEAD(&adev->ras_list);

	INIT_LIST_HEAD(&adev->pm.od_kobj_list);

	xa_init(&adev->userq_doorbell_xa);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);
	/*
	 * Initialize the enforce_isolation work structures for each XCP
	 * partition. This work handler is responsible for enforcing shader
	 * isolation on AMD GPUs. It counts the number of emitted fences for
	 * each GFX and compute ring. If there are any fences, it schedules
	 * the `enforce_isolation_work` to be run after a delay. If there are
	 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
	 * runqueue.
	 */
	for (i = 0; i < MAX_XCP; i++) {
		INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
				  amdgpu_gfx_enforce_isolation_handler);
		adev->gfx.enforce_isolation[i].adev = adev;
		adev->gfx.enforce_isolation[i].xcp_id = i;
	}

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
	INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);

	adev->gfx.gfx_off_req_count = 1;
	adev->gfx.gfx_off_residency = 0;
	adev->gfx.gfx_off_entrycount = 0;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds(retelimited printk interval) + 1(waiting
	 * for throttling interrupt) = 60 seconds.
	 */
	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);

	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);

	/* Registers mapping */
	/* TODO: block userspace mapping of io register */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (!adev->rmmio)
		return -ENOMEM;

	dev_info(adev->dev, "register mmio base: 0x%08X\n",
		 (uint32_t)adev->rmmio_base);
	dev_info(adev->dev, "register mmio size: %u\n",
		 (unsigned int)adev->rmmio_size);

	/*
	 * Reset domain needs to be present early, before XGMI hive discovered
	 * (if any) and initialized to use reset sem and in_gpu reset flag
	 * early on during init and before calling to RREG32.
	 */
	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
	if (!adev->reset_domain)
		return -ENOMEM;

	/* detect hw virtualization here */
	amdgpu_virt_init(adev);

	amdgpu_device_get_pcie_info(adev);

	r = amdgpu_device_get_job_timeout_settings(adev);
	if (r) {
		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
		return r;
	}

	amdgpu_device_set_mcbp(adev);

	/*
	 * By default, use default mode where all blocks are expected to be
	 * initialized. At present a 'swinit' of blocks is required to be
	 * completed before the need for a different level is detected.
	 */
	amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
	/* early init functions */
	r = amdgpu_device_ip_early_init(adev);
	if (r)
		return r;

	/*
	 * No need to remove conflicting FBs for non-display class devices.
	 * This prevents the sysfb from being freed accidently.
	 */
	if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA ||
	    (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) {
		/* Get rid of things like offb */
		r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
		if (r)
			return r;
	}

	/* Enable TMZ based on IP_VERSION */
	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_sriov_vf(adev) &&
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
		/* VF MMIO access (except mailbox range) from CPU
		 * will be blocked during sriov runtime
		 */
		adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;

	amdgpu_gmc_noretry_set(adev);
	/* Need to get xgmi info early to decide the reset behavior*/
	if (adev->gmc.xgmi.supported) {
		r = adev->gfxhub.funcs->get_xgmi_info(adev);
		if (r)
			return r;
	}

	/* enable PCIE atomic ops */
	if (amdgpu_sriov_vf(adev)) {
		if (adev->virt.fw_reserve.p_pf2vf)
			adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
						      adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
				(PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	/* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a
	 * internal path natively support atomics, set have_atomics_support to true.
	 */
	} else if ((adev->flags & AMD_IS_APU &&
		    amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) ||
		   (adev->gmc.xgmi.connected_to_cpu &&
		    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 1, 0))) {
		adev->have_atomics_support = true;
	} else {
		adev->have_atomics_support =
			!pci_enable_atomic_ops_to_root(adev->pdev,
						       PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
						       PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	}

	if (!adev->have_atomics_support)
		dev_info(adev->dev, "PCIE atomic ops is not supported\n");

	/* doorbell bar mapping and doorbell index init*/
	amdgpu_doorbell_init(adev);

	if (amdgpu_emu_mode == 1) {
		/* post the asic on emulation mode */
		emu_soc_asic_init(adev);
		goto fence_driver_init;
	}

	amdgpu_reset_init(adev);

	/* detect if we are with an SRIOV vbios */
	if (adev->bios)
		amdgpu_device_detect_sriov_bios(adev);

	/* check if we need to reset the asic
	 *  E.g., driver was not cleanly unloaded previously, etc.
	 */
	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
		if (adev->gmc.xgmi.num_physical_nodes) {
			dev_info(adev->dev, "Pending hive reset.\n");
			amdgpu_set_init_level(adev,
					      AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
		} else {
			tmp = amdgpu_reset_method;
			/* It should do a default reset when loading or reloading the driver,
			 * regardless of the module parameter reset_method.
			 */
			amdgpu_reset_method = AMD_RESET_METHOD_NONE;
			r = amdgpu_asic_reset(adev);
			amdgpu_reset_method = tmp;
		}

		if (r) {
			dev_err(adev->dev, "asic reset on init failed\n");
			goto failed;
		}
	}

	/* Post card if necessary */
	if (amdgpu_device_need_post(adev)) {
		if (!adev->bios) {
			dev_err(adev->dev, "no vBIOS found\n");
			r = -EINVAL;
			goto failed;
		}
		dev_info(adev->dev, "GPU posting now...\n");
		r = amdgpu_device_asic_init(adev);
		if (r) {
			dev_err(adev->dev, "gpu post error!\n");
			goto failed;
		}
	}

	if (adev->bios) {
		if (adev->is_atom_fw) {
			/* Initialize clocks */
			r = amdgpu_atomfirmware_get_clock_info(adev);
			if (r) {
				dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
				goto failed;
			}
		} else {
			/* Initialize clocks */
			r = amdgpu_atombios_get_clock_info(adev);
			if (r) {
				dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
				goto failed;
			}
			/* init i2c buses */
			amdgpu_i2c_init(adev);
		}
	}

fence_driver_init:
	/* Fence driver */
	r = amdgpu_fence_driver_sw_init(adev);
	if (r) {
		dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
		goto failed;
	}

	/* init the mode config */
	drm_mode_config_init(adev_to_drm(adev));

	r = amdgpu_device_ip_init(adev);
	if (r) {
		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
		goto release_ras_con;
	}

	amdgpu_fence_driver_hw_init(adev);

	dev_info(adev->dev,
		 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
		 adev->gfx.config.max_shader_engines,
		 adev->gfx.config.max_sh_per_se,
		 adev->gfx.config.max_cu_per_sh,
		 adev->gfx.cu_info.number);

	adev->accel_working = true;

	amdgpu_vm_check_compute_bug(adev);

	/* Initialize the buffer migration limit. */
	if (amdgpu_moverate >= 0)
		max_MBps = amdgpu_moverate;
	else
		max_MBps = 8; /* Allow 8 MB/s. */
	/* Get a log2 for easy divisions. */
	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));

	/*
	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
	 * Otherwise the mgpu fan boost feature will be skipped due to the
	 * gpu instance is counted less.
	 */
	amdgpu_register_gpu_instance(adev);

	/* enable clockgating, etc. after ib tests, etc. since some blocks require
	 * explicit gating rather than handling it automatically.
	 */
	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
		r = amdgpu_device_ip_late_init(adev);
		if (r) {
			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
			goto release_ras_con;
		}
		/* must succeed. */
		amdgpu_ras_resume(adev);
		queue_delayed_work(system_wq, &adev->delayed_init_work,
				   msecs_to_jiffies(AMDGPU_RESUME_MS));
	}

	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_release_full_gpu(adev, true);
		flush_delayed_work(&adev->delayed_init_work);
	}

	/* Don't init kfd if whole hive need to be reset during init */
	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
		kgd2kfd_init_zone_device(adev);
		kfd_update_svm_support_properties(adev);
	}

	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		amdgpu_xgmi_reset_on_init(adev);

	/*
	 * Place those sysfs registering after `late_init`. As some of those
	 * operations performed in `late_init` might affect the sysfs
	 * interfaces creating.
	 */
	r = amdgpu_device_sys_interface_init(adev);

	if (IS_ENABLED(CONFIG_PERF_EVENTS))
		r = amdgpu_pmu_init(adev);
	if (r)
		dev_err(adev->dev, "amdgpu_pmu_init failed\n");

	/* Have stored pci confspace at hand for restore in sudden PCI error */
	if (amdgpu_device_cache_pci_state(adev->pdev))
		pci_restore_state(pdev);

	/* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
	/* this will fail for cards that aren't VGA class devices, just
	 * ignore it
	 */
	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
		vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);

	px = amdgpu_device_supports_px(adev);

	if (px || (!dev_is_removable(&adev->pdev->dev) &&
		   apple_gmux_detect(NULL, NULL)))
		vga_switcheroo_register_client(adev->pdev,
					       &amdgpu_switcheroo_ops, px);

	if (px)
		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);

	amdgpu_device_check_iommu_direct_map(adev);

	adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
	r = register_pm_notifier(&adev->pm_nb);
	if (r)
		goto failed;

	return 0;

release_ras_con:
	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, true);

	/* failed in exclusive mode due to timeout */
	if (amdgpu_sriov_vf(adev) &&
	    !amdgpu_sriov_runtime(adev) &&
	    amdgpu_virt_mmio_blocked(adev) &&
	    !amdgpu_virt_wait_reset(adev)) {
		dev_err(adev->dev, "VF exclusive mode timeout\n");
		/* Don't send request since VF is inactive. */
		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
		adev->virt.ops = NULL;
		r = -EAGAIN;
	}
	amdgpu_release_ras_context(adev);

failed:
	amdgpu_vf_error_trans_all(adev);

	return r;
}

/* Tear down all CPU-visible mappings of the device (doorbells, MMIO
 * registers, VRAM aperture). Used on surprise hotplug removal.
 */
static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
{

	/* Clear all CPU mappings pointing to this device */
	unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);

	/* Unmap all mapped bars - Doorbell, registers and VRAM */
	amdgpu_doorbell_fini(adev);

	iounmap(adev->rmmio);
	adev->rmmio = NULL;
	if (adev->mman.aper_base_kaddr)
		iounmap(adev->mman.aper_base_kaddr);
	adev->mman.aper_base_kaddr = NULL;

	/* Memory manager related */
	if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
		arch_phys_wc_del(adev->gmc.vram_mtrr);
		arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
	}
}

/**
 * amdgpu_device_fini_hw - tear down the driver
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down the driver info (all asics).
 * Called at driver shutdown.
 */
void amdgpu_device_fini_hw(struct amdgpu_device *adev)
{
	dev_info(adev->dev, "finishing device.\n");
	flush_delayed_work(&adev->delayed_init_work);

	if (adev->mman.initialized)
		drain_workqueue(adev->mman.bdev.wq);
	adev->shutdown = true;

	unregister_pm_notifier(&adev->pm_nb);

	/* make sure IB test finished before entering exclusive mode
	 * to avoid preemption on IB test
	 */
	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_request_full_gpu(adev, false);
		amdgpu_virt_fini_data_exchange(adev);
	}

	/* ungate power/clock gating before touching the hardware for teardown */
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);

	/* disable all interrupts */
	amdgpu_irq_disable_all(adev);
	if (adev->mode_info.mode_config_initialized) {
		if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
			drm_helper_force_disable_all(adev_to_drm(adev));
		else
			drm_atomic_helper_shutdown(adev_to_drm(adev));
	}
	amdgpu_fence_driver_hw_fini(adev);

	amdgpu_device_sys_interface_fini(adev);

	/* disable ras feature must before hw fini */
	amdgpu_ras_pre_fini(adev);

	amdgpu_ttm_set_buffer_funcs_status(adev, false);

	/*
	 * device went through surprise hotplug; we need to destroy topology
	 * before ip_fini_early to prevent kfd locking refcount issues by calling
	 * amdgpu_amdkfd_suspend()
	 */
	if (pci_dev_is_disconnected(adev->pdev))
		amdgpu_amdkfd_device_fini_sw(adev);

	amdgpu_device_ip_fini_early(adev);

	amdgpu_irq_fini_hw(adev);

	if (adev->mman.initialized)
		ttm_device_clear_dma_mappings(&adev->mman.bdev);

	amdgpu_gart_dummy_page_fini(adev);

	/* on surprise removal the BARs are gone; drop the CPU mappings too */
	if (pci_dev_is_disconnected(adev->pdev))
		amdgpu_device_unmap_mmio(adev);

}

/* Software-side teardown; counterpart of amdgpu_device_fini_hw(), called
 * after all hardware access has stopped.
 */
void amdgpu_device_fini_sw(struct amdgpu_device *adev)
{
	int i, idx;
	bool px;

	amdgpu_device_ip_fini(adev);
	amdgpu_fence_driver_sw_fini(adev);
	amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
	adev->accel_working = false;
	dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
	for (i = 0; i < MAX_XCP; ++i) {
		dma_fence_put(adev->isolation[i].spearhead);
		amdgpu_sync_free(&adev->isolation[i].active);
		amdgpu_sync_free(&adev->isolation[i].prev);
	}

	amdgpu_reset_fini(adev);

	/* free i2c buses */
	amdgpu_i2c_fini(adev);

	if (adev->bios) {
		if (amdgpu_emu_mode != 1)
			amdgpu_atombios_fini(adev);
		amdgpu_bios_release(adev);
	}

	kfree(adev->fru_info);
	adev->fru_info = NULL;

	kfree(adev->xcp_mgr);
	adev->xcp_mgr = NULL;

	px = amdgpu_device_supports_px(adev);

	/* must mirror the registration conditions used in amdgpu_device_init() */
	if (px || (!dev_is_removable(&adev->pdev->dev) &&
		   apple_gmux_detect(NULL, NULL)))
		vga_switcheroo_unregister_client(adev->pdev);

	if (px)
		vga_switcheroo_fini_domain_pm_ops(adev->dev);

	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
		vga_client_unregister(adev->pdev);

	if (drm_dev_enter(adev_to_drm(adev), &idx)) {

		iounmap(adev->rmmio);
		adev->rmmio = NULL;
		drm_dev_exit(idx);
	}

	if (IS_ENABLED(CONFIG_PERF_EVENTS))
		amdgpu_pmu_fini(adev);
	if (adev->discovery.bin)
		amdgpu_discovery_fini(adev);

	amdgpu_reset_put_reset_domain(adev->reset_domain);
	adev->reset_domain = NULL;

	kfree(adev->pci_state);
	kfree(adev->pcie_reset_ctx.swds_pcistate);
	kfree(adev->pcie_reset_ctx.swus_pcistate);
}

/**
 * amdgpu_device_evict_resources - evict device resources
 * @adev: amdgpu device object
 *
 * Evicts all ttm device resources(vram BOs, gart table) from the lru list
 * of the vram memory type. Mainly used for evicting device resources
 * at suspend time.
5035 * 5036 */ 5037 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 5038 { 5039 int ret; 5040 5041 /* No need to evict vram on APUs unless going to S4 */ 5042 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 5043 return 0; 5044 5045 /* No need to evict when going to S5 through S4 callbacks */ 5046 if (system_state == SYSTEM_POWER_OFF) 5047 return 0; 5048 5049 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 5050 if (ret) { 5051 dev_warn(adev->dev, "evicting device resources failed\n"); 5052 return ret; 5053 } 5054 5055 if (adev->in_s4) { 5056 ret = ttm_device_prepare_hibernation(&adev->mman.bdev); 5057 if (ret) 5058 dev_err(adev->dev, "prepare hibernation failed, %d\n", ret); 5059 } 5060 return ret; 5061 } 5062 5063 /* 5064 * Suspend & resume. 5065 */ 5066 /** 5067 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 5068 * @nb: notifier block 5069 * @mode: suspend mode 5070 * @data: data 5071 * 5072 * This function is called when the system is about to suspend or hibernate. 5073 * It is used to set the appropriate flags so that eviction can be optimized 5074 * in the pm prepare callback. 5075 */ 5076 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 5077 void *data) 5078 { 5079 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 5080 5081 switch (mode) { 5082 case PM_HIBERNATION_PREPARE: 5083 adev->in_s4 = true; 5084 break; 5085 case PM_POST_HIBERNATION: 5086 adev->in_s4 = false; 5087 break; 5088 } 5089 5090 return NOTIFY_DONE; 5091 } 5092 5093 /** 5094 * amdgpu_device_prepare - prepare for device suspend 5095 * 5096 * @dev: drm dev pointer 5097 * 5098 * Prepare to put the hw in the suspend state (all asics). 5099 * Returns 0 for success or an error on failure. 5100 * Called at driver suspend. 
5101 */ 5102 int amdgpu_device_prepare(struct drm_device *dev) 5103 { 5104 struct amdgpu_device *adev = drm_to_adev(dev); 5105 int i, r; 5106 5107 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5108 return 0; 5109 5110 /* Evict the majority of BOs before starting suspend sequence */ 5111 r = amdgpu_device_evict_resources(adev); 5112 if (r) 5113 return r; 5114 5115 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 5116 5117 for (i = 0; i < adev->num_ip_blocks; i++) { 5118 if (!adev->ip_blocks[i].status.valid) 5119 continue; 5120 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 5121 continue; 5122 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 5123 if (r) 5124 return r; 5125 } 5126 5127 return 0; 5128 } 5129 5130 /** 5131 * amdgpu_device_complete - complete power state transition 5132 * 5133 * @dev: drm dev pointer 5134 * 5135 * Undo the changes from amdgpu_device_prepare. This will be 5136 * called on all resume transitions, including those that failed. 5137 */ 5138 void amdgpu_device_complete(struct drm_device *dev) 5139 { 5140 struct amdgpu_device *adev = drm_to_adev(dev); 5141 int i; 5142 5143 for (i = 0; i < adev->num_ip_blocks; i++) { 5144 if (!adev->ip_blocks[i].status.valid) 5145 continue; 5146 if (!adev->ip_blocks[i].version->funcs->complete) 5147 continue; 5148 adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]); 5149 } 5150 } 5151 5152 /** 5153 * amdgpu_device_suspend - initiate device suspend 5154 * 5155 * @dev: drm dev pointer 5156 * @notify_clients: notify in-kernel DRM clients 5157 * 5158 * Puts the hw in the suspend state (all asics). 5159 * Returns 0 for success or an error on failure. 5160 * Called at driver suspend. 
 */
int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, rec;

	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
		return 0;

	adev->in_suspend = true;

	if (amdgpu_sriov_vf(adev)) {
		if (!adev->in_runpm)
			amdgpu_amdkfd_suspend_process(adev);
		amdgpu_virt_fini_data_exchange(adev);
		r = amdgpu_virt_request_full_gpu(adev, false);
		if (r)
			return r;
	}

	r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3);
	if (r)
		goto unwind_sriov;

	if (notify_clients)
		drm_client_dev_suspend(adev_to_drm(adev));

	cancel_delayed_work_sync(&adev->delayed_init_work);

	amdgpu_ras_suspend(adev);

	r = amdgpu_device_ip_suspend_phase1(adev);
	if (r)
		goto unwind_smartshift;

	amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
	r = amdgpu_userq_suspend(adev);
	if (r)
		goto unwind_ip_phase1;

	r = amdgpu_device_evict_resources(adev);
	if (r)
		goto unwind_userq;

	amdgpu_ttm_set_buffer_funcs_status(adev, false);

	amdgpu_fence_driver_hw_fini(adev);

	r = amdgpu_device_ip_suspend_phase2(adev);
	if (r)
		goto unwind_evict;

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, false);

	return 0;

	/*
	 * NOTE: the unwind labels below intentionally fall through into each
	 * other — each stage undoes one suspend step in reverse order. 'r'
	 * keeps the original failure code; 'rec' only tracks whether the
	 * recovery step itself failed (in which case we bail out early).
	 */
unwind_evict:
	if (adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);
	amdgpu_fence_driver_hw_init(adev);

unwind_userq:
	rec = amdgpu_userq_resume(adev);
	if (rec) {
		dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec);
		return r;
	}
	rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
	if (rec) {
		dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec);
		return r;
	}

unwind_ip_phase1:
	/* suspend phase 1 = resume phase 3 */
	rec = amdgpu_device_ip_resume_phase3(adev);
	if (rec) {
		dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec);
		return r;
	}

unwind_smartshift:
	rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0);
	if (rec) {
		dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec);
		return r;
	}

	if (notify_clients)
		drm_client_dev_resume(adev_to_drm(adev));

	amdgpu_ras_resume(adev);

unwind_sriov:
	if (amdgpu_sriov_vf(adev)) {
		rec = amdgpu_virt_request_full_gpu(adev, true);
		if (rec) {
			dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec);
			return r;
		}
	}

	adev->in_suspend = adev->in_s0ix = adev->in_s3 = false;

	return r;
}

/* Re-establish VF state after a VM migration style resume; see comment below */
static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
{
	int r;
	unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id;

	/* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO)
	 * may not work. The access could be blocked by nBIF protection as VF isn't in
	 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX
	 * so that QEMU reprograms MSIX table.
	 */
	amdgpu_restore_msix(adev);

	/* the XGMI node id may have changed after migration — re-query it */
	r = adev->gfxhub.funcs->get_xgmi_info(adev);
	if (r)
		return r;

	dev_info(adev->dev, "xgmi node, old id %d, new id %d\n",
		 prev_physical_node_id, adev->gmc.xgmi.physical_node_id);

	adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev);
	adev->vm_manager.vram_base_offset +=
		adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size;

	return 0;
}

/**
 * amdgpu_device_resume - initiate device resume
 *
 * @dev: drm dev pointer
 * @notify_clients: notify in-kernel DRM clients
 *
 * Bring the hw back to operating state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver resume.
 */
int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r = 0;

	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_virt_request_full_gpu(adev, true);
		if (r)
			return r;
	}

	if (amdgpu_virt_xgmi_migrate_enabled(adev)) {
		r = amdgpu_virt_resume(adev);
		if (r)
			goto exit;
	}

	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
		return 0;

	if (adev->in_s0ix)
		amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);

	/* post card */
	if (amdgpu_device_need_post(adev)) {
		r = amdgpu_device_asic_init(adev);
		if (r)
			dev_err(adev->dev, "amdgpu asic init failed\n");
	}

	r = amdgpu_device_ip_resume(adev);

	if (r) {
		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
		goto exit;
	}

	r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
	if (r)
		goto exit;

	r = amdgpu_userq_resume(adev);
	if (r)
		goto exit;

	r = amdgpu_device_ip_late_init(adev);
	if (r)
		goto exit;

	queue_delayed_work(system_wq, &adev->delayed_init_work,
			   msecs_to_jiffies(AMDGPU_RESUME_MS));
exit:
	if (amdgpu_sriov_vf(adev)) {
		/* release the exclusive access acquired at function entry */
		amdgpu_virt_init_data_exchange(adev);
		amdgpu_virt_release_full_gpu(adev, true);

		if (!r && !adev->in_runpm)
			r = amdgpu_amdkfd_resume_process(adev);
	}

	if (r)
		return r;

	/* Make sure IB tests flushed */
	flush_delayed_work(&adev->delayed_init_work);

	if (notify_clients)
		drm_client_dev_resume(adev_to_drm(adev));

	amdgpu_ras_resume(adev);

	if (adev->mode_info.num_crtc) {
		/*
		 * Most of the connector probing functions try to acquire runtime pm
		 * refs to ensure that the GPU is powered on when connector polling is
		 * performed. Since we're calling this from a runtime PM callback,
		 * trying to acquire rpm refs will cause us to deadlock.
		 *
		 * Since we're guaranteed to be holding the rpm lock, it's safe to
		 * temporarily disable the rpm helpers so this doesn't deadlock us.
		 */
#ifdef CONFIG_PM
		dev->dev->power.disable_depth++;
#endif
		if (!adev->dc_enabled)
			drm_helper_hpd_irq_event(dev);
		else
			drm_kms_helper_hotplug_event(dev);
#ifdef CONFIG_PM
		dev->dev->power.disable_depth--;
#endif
	}

	amdgpu_vram_mgr_clear_reset_blocks(adev);
	adev->in_suspend = false;

	if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0))
		dev_warn(adev->dev, "smart shift update failed\n");

	return 0;
}

/**
 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and
 * the check_soft_reset callbacks are run. check_soft_reset determines
 * if the asic is still hung or not.
 * Returns true if any of the IPs are still in a hung state, false if not.
 */
static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
{
	int i;
	bool asic_hang = false;

	/* VF always reports hung here so recovery proceeds */
	if (amdgpu_sriov_vf(adev))
		return true;

	if (amdgpu_asic_need_full_reset(adev))
		return true;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		/* refresh each block's hang status via its own callback */
		if (adev->ip_blocks[i].version->funcs->check_soft_reset)
			adev->ip_blocks[i].status.hang =
				adev->ip_blocks[i].version->funcs->check_soft_reset(
					&adev->ip_blocks[i]);
		if (adev->ip_blocks[i].status.hang) {
			dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
			asic_hang = true;
		}
	}
	return asic_hang;
}

/**
 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
 * handles any IP specific hardware or software state changes that are
 * necessary for a soft reset to succeed.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		/* only blocks that were flagged hung get prepared */
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->pre_soft_reset) {
			r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
 *
 * @adev: amdgpu_device pointer
 *
 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
 * reset is necessary to recover.
 * Returns true if a full asic reset is required, false if not.
 */
static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
{
	int i;

	if (amdgpu_asic_need_full_reset(adev))
		return true;

	/* these block types cannot be soft reset; if any is hung,
	 * only a full asic reset can recover
	 */
	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
			if (adev->ip_blocks[i].status.hang) {
				dev_info(adev->dev, "Some block need full reset!\n");
				return true;
			}
		}
	}
	return false;
}

/**
 * amdgpu_device_ip_soft_reset - do a soft reset
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * soft_reset callbacks are run if the block is hung. soft_reset handles any
 * IP specific hardware or software state changes that are necessary to soft
 * reset the IP.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		/* only hung blocks are soft reset */
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->soft_reset) {
			r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
 * handles any IP specific hardware or software state changes that are
 * necessary after the IP has been soft reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->post_soft_reset)
			r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]);
		if (r)
			return r;
	}

	return 0;
}

/**
 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
 *
 * @adev: amdgpu_device pointer
 * @reset_context: amdgpu reset context pointer
 *
 * do VF FLR and reinitialize Asic
 * return 0 means succeeded otherwise failed
 */
static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
				     struct amdgpu_reset_context *reset_context)
{
	int r;
	struct amdgpu_hive_info *hive = NULL;

	if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
		/* host-initiated FLR: wait for the host to finish, then
		 * request full GPU access again
		 */
		if (!amdgpu_ras_get_fed_status(adev))
			amdgpu_virt_ready_to_reset(adev);
		amdgpu_virt_wait_reset(adev);
		clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
		r = amdgpu_virt_request_full_gpu(adev, true);
	} else {
		r = amdgpu_virt_reset_gpu(adev);
	}
	if (r)
		return r;

	amdgpu_ras_clear_err_state(adev);
	amdgpu_irq_gpu_reset_resume_helper(adev);

	/* some sw clean up VF needs to do before recover */
	amdgpu_virt_post_reset(adev);

	/* Resume IP prior to SMC */
	r = amdgpu_device_ip_reinit_early_sriov(adev);
	if (r)
		return r;

	amdgpu_virt_init_data_exchange(adev);

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	/* now we are okay to resume SMC/CP/SDMA */
	r = amdgpu_device_ip_reinit_late_sriov(adev);
	if (r)
		return r;

	hive = amdgpu_get_xgmi_hive(adev);
	/* Update PSP FW topology after reset */
	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
		r = amdgpu_xgmi_update_topology(hive, adev);
	if (hive)
		amdgpu_put_xgmi_hive(hive);
	if (r)
		return r;

	r = amdgpu_ib_ring_tests(adev);
	if (r)
		return r;

	if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
		amdgpu_inc_vram_lost(adev);

	/* need to be called during full access so we can't do it later like
	 * bare-metal does.
	 */
	amdgpu_amdkfd_post_reset(adev);
	amdgpu_virt_release_full_gpu(adev, true);

	/* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
		amdgpu_ras_resume(adev);

	amdgpu_virt_ras_telemetry_post_reset(adev);

	return 0;
}

/**
 * amdgpu_device_has_job_running - check if there is any unfinished job
 *
 * @adev: amdgpu_device pointer
 *
 * check if there is any job running on the device when guest driver receives
 * FLR notification from host driver. If there are still jobs running, then
 * the guest driver will not respond the FLR reset. Instead, let the job hit
 * the timeout and guest driver then issue the reset request.
 */
bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
{
	int i;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!amdgpu_ring_sched_ready(ring))
			continue;

		/* any emitted-but-unsignaled fence means a job is in flight */
		if (amdgpu_fence_count_emitted(ring))
			return true;
	}
	return false;
}

/**
 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
 *
 * @adev: amdgpu_device pointer
 *
 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
 * a hung GPU.
 */
bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
{

	if (amdgpu_gpu_recovery == 0)
		goto disabled;

	/* Skip soft reset check in fatal error mode */
	if (!amdgpu_ras_is_poison_mode_supported(adev))
		return true;

	if (amdgpu_sriov_vf(adev))
		return true;

	if (amdgpu_gpu_recovery == -1) {
		/* "auto" policy: recovery is disabled on these older asics */
		switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
		case CHIP_VERDE:
		case CHIP_TAHITI:
		case CHIP_PITCAIRN:
		case CHIP_OLAND:
		case CHIP_HAINAN:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
		case CHIP_KAVERI:
		case CHIP_KABINI:
		case CHIP_MULLINS:
#endif
		case CHIP_CARRIZO:
		case CHIP_STONEY:
		case CHIP_CYAN_SKILLFISH:
			goto disabled;
		default:
			break;
		}
	}

	return true;

disabled:
	dev_info(adev->dev, "GPU recovery disabled.\n");
	return false;
}

/**
 * amdgpu_device_mode1_reset - perform a mode-1 (whole asic) reset
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the whole asic via SMU (preferred when supported) or PSP, then
 * restores the cached PCI config space and waits for the asic to come back.
 * Returns 0 on success, negative error code on failure.
 */
int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
{
	u32 i;
	int ret = 0;

	if (adev->bios)
		amdgpu_atombios_scratch_regs_engine_hung(adev, true);

	dev_info(adev->dev, "GPU mode1 reset\n");

	/* Cache the state before bus master disable. The saved config space
	 * values are used in other cases like restore after mode-2 reset.
	 */
	amdgpu_device_cache_pci_state(adev->pdev);

	/* disable BM */
	pci_clear_master(adev->pdev);

	if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
		dev_info(adev->dev, "GPU smu mode1 reset\n");
		ret = amdgpu_dpm_mode1_reset(adev);
	} else {
		dev_info(adev->dev, "GPU psp mode1 reset\n");
		ret = psp_gpu_reset(adev);
	}

	if (ret)
		goto mode1_reset_failed;

	/* enable mmio access after mode 1 reset completed */
	adev->no_hw_access = false;

	/* ensure no_hw_access is updated before we access hw */
	smp_mb();

	amdgpu_device_load_pci_state(adev->pdev);
	ret = amdgpu_psp_wait_for_bootloader(adev);
	if (ret)
		goto mode1_reset_failed;

	/* wait for asic to come out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		u32 memsize = adev->nbio.funcs->get_memsize(adev);

		/* all-ones memsize means mmio is still dead */
		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}

	if (i >= adev->usec_timeout) {
		ret = -ETIMEDOUT;
		goto mode1_reset_failed;
	}

	if (adev->bios)
		amdgpu_atombios_scratch_regs_engine_hung(adev, false);

	return 0;

mode1_reset_failed:
	dev_err(adev->dev, "GPU mode1 reset failed\n");
	return ret;
}

/**
 * amdgpu_device_link_reset - perform a link reset
 *
 * @adev: amdgpu_device pointer
 *
 * Issues a DPM link reset (skipped while in DPC recovery) and waits for the
 * PSP bootloader.  Returns 0 on success, negative error code on failure.
 */
int amdgpu_device_link_reset(struct amdgpu_device *adev)
{
	int ret = 0;

	dev_info(adev->dev, "GPU link reset\n");

	if (!amdgpu_reset_in_dpc(adev))
		ret = amdgpu_dpm_link_reset(adev);

	if (ret)
		goto link_reset_failed;

	ret = amdgpu_psp_wait_for_bootloader(adev);
	if (ret)
		goto link_reset_failed;

	return 0;

link_reset_failed:
	dev_err(adev->dev, "GPU link reset failed\n");
	return ret;
}

/**
 * amdgpu_device_pre_asic_reset - prepare one device for an asic reset
 *
 * @adev: amdgpu_device pointer
 * @reset_context: amdgpu reset context pointer
 *
 * Force-completes hw fences, bumps the guilty job's karma, tries a
 * soft reset when possible, dumps IP state for the coredump, and suspends
 * the IP blocks when a full reset is required.
 * Returns 0 on success, negative error code on failure.
 */
int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
				 struct amdgpu_reset_context *reset_context)
{
	int i, r = 0;
	struct amdgpu_job *job = NULL;
	struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
	bool need_full_reset =
		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

	/* only the device that requested the reset owns the guilty job */
	if (reset_context->reset_req_dev == adev)
		job = reset_context->job;

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_pre_reset(adev);

	amdgpu_fence_driver_isr_toggle(adev, true);

	/* block all schedulers and reset given job's ring */
	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!amdgpu_ring_sched_ready(ring))
			continue;

		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
		amdgpu_fence_driver_force_completion(ring);
	}

	amdgpu_fence_driver_isr_toggle(adev, false);

	if (job && job->vm)
		drm_sched_increase_karma(&job->base);

	r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
	/* If reset handler not implemented, continue; otherwise return */
	if (r == -EOPNOTSUPP)
		r = 0;
	else
		return r;

	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
	if (!amdgpu_sriov_vf(adev)) {

		if (!need_full_reset)
			need_full_reset = amdgpu_device_ip_need_full_reset(adev);

		if (!need_full_reset && amdgpu_gpu_recovery &&
		    amdgpu_device_ip_check_soft_reset(adev)) {
			amdgpu_device_ip_pre_soft_reset(adev);
			r = amdgpu_device_ip_soft_reset(adev);
			amdgpu_device_ip_post_soft_reset(adev);
			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
				need_full_reset = true;
			}
		}

		if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
			dev_info(tmp_adev->dev, "Dumping IP State\n");
			/* Trigger ip dump before we reset the asic */
			for (i = 0; i < tmp_adev->num_ip_blocks; i++)
				if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
					tmp_adev->ip_blocks[i].version->funcs
						->dump_ip_state((void *)&tmp_adev->ip_blocks[i]);
			dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
		}

		if (need_full_reset)
			r = amdgpu_device_ip_suspend(adev);
		if (need_full_reset)
			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
		else
			clear_bit(AMDGPU_NEED_FULL_RESET,
				  &reset_context->flags);
	}

	return r;
}

/**
 * amdgpu_device_reinit_after_reset - bring devices back up after an asic reset
 *
 * @reset_context: amdgpu reset context pointer (carries the device list)
 *
 * Walks the reset device list; after a full reset each device is re-posted,
 * its IP blocks resumed in phases, firmware reloaded, RAS resumed and IB
 * ring tests run.  Returns 0 on success, -EAGAIN if the IB tests failed
 * (caller retries with a full reset), other negative codes on failure.
 */
int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
{
	struct list_head *device_list_handle;
	bool full_reset, vram_lost = false;
	struct amdgpu_device *tmp_adev;
	int r, init_level;

	device_list_handle = reset_context->reset_device_list;

	if (!device_list_handle)
		return -EINVAL;

	full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

	/**
	 * If it's reset on init, it's default init level, otherwise keep level
	 * as recovery level.
	 */
	if (reset_context->method == AMD_RESET_METHOD_ON_INIT)
		init_level = AMDGPU_INIT_LEVEL_DEFAULT;
	else
		init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY;

	r = 0;
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		amdgpu_set_init_level(tmp_adev, init_level);
		if (full_reset) {
			/* post card */
			amdgpu_reset_set_dpc_status(tmp_adev, false);
			amdgpu_ras_clear_err_state(tmp_adev);
			r = amdgpu_device_asic_init(tmp_adev);
			if (r) {
				dev_warn(tmp_adev->dev, "asic atom init failed!");
			} else {
				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");

				r = amdgpu_device_ip_resume_phase1(tmp_adev);
				if (r)
					goto out;

				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

				if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
					amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);

				if (vram_lost) {
					dev_info(
						tmp_adev->dev,
						"VRAM is lost due to GPU reset!\n");
					amdgpu_inc_vram_lost(tmp_adev);
				}

				r = amdgpu_device_fw_loading(tmp_adev);
				if (r)
					return r;

				r = amdgpu_xcp_restore_partition_mode(
					tmp_adev->xcp_mgr);
				if (r)
					goto out;

				r = amdgpu_device_ip_resume_phase2(tmp_adev);
				if (r)
					goto out;

				if (tmp_adev->mman.buffer_funcs_ring->sched.ready)
					amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);

				r = amdgpu_device_ip_resume_phase3(tmp_adev);
				if (r)
					goto out;

				if (vram_lost)
					amdgpu_device_fill_reset_magic(tmp_adev);

				/*
				 * Add this ASIC as tracked as reset was already
				 * complete successfully.
				 */
				amdgpu_register_gpu_instance(tmp_adev);

				if (!reset_context->hive &&
				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					amdgpu_xgmi_add_device(tmp_adev);

				r = amdgpu_device_ip_late_init(tmp_adev);
				if (r)
					goto out;

				r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
				if (r)
					goto out;

				drm_client_dev_resume(adev_to_drm(tmp_adev));

				/*
				 * The GPU enters bad state once faulty pages
				 * by ECC has reached the threshold, and ras
				 * recovery is scheduled next. So add one check
				 * here to break recovery if it indeed exceeds
				 * bad page threshold, and remind user to
				 * retire this GPU or setting one bigger
				 * bad_page_threshold value to fix this once
				 * probing driver again.
				 */
				if (!amdgpu_ras_is_rma(tmp_adev)) {
					/* must succeed. */
					amdgpu_ras_resume(tmp_adev);
				} else {
					r = -EINVAL;
					goto out;
				}

				/* Update PSP FW topology after reset */
				if (reset_context->hive &&
				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					r = amdgpu_xgmi_update_topology(
						reset_context->hive, tmp_adev);
			}
		}

out:
		if (!r) {
			/* IP init is complete now, set level as default */
			amdgpu_set_init_level(tmp_adev,
					      AMDGPU_INIT_LEVEL_DEFAULT);
			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
			r = amdgpu_ib_ring_tests(tmp_adev);
			if (r) {
				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
				r = -EAGAIN;
				goto end;
			}
		}

		if (r)
			tmp_adev->asic_reset_res = r;
	}

end:
	return r;
}

/**
 * amdgpu_do_asic_reset - perform the actual asic reset for a device list
 *
 * @device_list_handle: list of devices (XGMI hive or single device)
 * @reset_context: amdgpu reset context pointer
 *
 * Tries a reset-handler first; otherwise performs full asic resets (in
 * parallel for XGMI hives) and re-initializes the devices afterwards.
 * Returns 0 on success, negative error code on failure.
 */
int amdgpu_do_asic_reset(struct list_head *device_list_handle,
			 struct amdgpu_reset_context *reset_context)
{
	struct amdgpu_device *tmp_adev = NULL;
	bool need_full_reset, skip_hw_reset;
	int r = 0;

	/* Try reset handler method first */
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);

	reset_context->reset_device_list = device_list_handle;
	r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
	/* If reset handler not implemented, continue; otherwise return */
	if (r == -EOPNOTSUPP)
		r = 0;
	else
		return r;

	/* Reset handler not implemented, use the default method */
	need_full_reset =
		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);

	/*
	 * ASIC reset has to be done on all XGMI hive nodes ASAP
	 * to allow proper links negotiation in FW (within 1 sec)
	 */
	if (!skip_hw_reset && need_full_reset) {
		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
			/* For XGMI run all resets in parallel to speed up the process */
			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
				if (!queue_work(system_unbound_wq,
						&tmp_adev->xgmi_reset_work))
					r = -EALREADY;
			} else
				r = amdgpu_asic_reset(tmp_adev);

			if (r) {
				dev_err(tmp_adev->dev,
					"ASIC reset failed with error, %d for drm dev, %s",
					r, adev_to_drm(tmp_adev)->unique);
				goto out;
			}
		}

		/* For XGMI wait for all resets to complete before proceed */
		if (!r) {
			list_for_each_entry(tmp_adev, device_list_handle,
					    reset_list) {
				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
					flush_work(&tmp_adev->xgmi_reset_work);
					r = tmp_adev->asic_reset_res;
					if (r)
						break;
				}
			}
		}
	}

	if (!r && amdgpu_ras_intr_triggered()) {
		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
			amdgpu_ras_reset_error_count(tmp_adev,
						     AMDGPU_RAS_BLOCK__MMHUB);
		}

		amdgpu_ras_intr_cleared();
	}

	r = amdgpu_device_reinit_after_reset(reset_context);
	if (r == -EAGAIN)
		set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	else
		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

out:
	return r;
}

/* Select the MP1 state matching the chosen reset method. */
static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
{

	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_MODE1:
	case AMD_RESET_METHOD_LINK:
		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
		break;
	case AMD_RESET_METHOD_MODE2:
		adev->mp1_state = PP_MP1_STATE_RESET;
		break;
	default:
		adev->mp1_state = PP_MP1_STATE_NONE;
		break;
	}
}

/* Flush pending VF errors and restore the default MP1 state. */
static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
{
	amdgpu_vf_error_trans_all(adev);
	adev->mp1_state = PP_MP1_STATE_NONE;
}

/* Re-enable runtime PM on the HDA function (PCI function 1) after reset. */
static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
{
	struct pci_dev *p = NULL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
					adev->pdev->bus->number, 1);
	if (p) {
		pm_runtime_enable(&(p->dev));
		pm_runtime_resume(&(p->dev));
	}

	pci_dev_put(p);
}

/*
 * Suspend the HDA audio function sharing the GPU's power domain before a
 * reset, so the audio driver isn't surprised by the hardware going away.
 * Returns 0 on success, -EINVAL when not needed for the reset method,
 * -ENODEV when no audio function exists, -ETIMEDOUT on timeout.
 */
static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
{
	enum amd_reset_method reset_method;
	struct pci_dev *p = NULL;
	u64 expires;

	/*
	 * For now, only BACO and mode1 reset are confirmed
	 * to suffer the audio issue without proper suspended.
	 */
	reset_method = amdgpu_asic_reset_method(adev);
	if ((reset_method != AMD_RESET_METHOD_BACO) &&
	    (reset_method != AMD_RESET_METHOD_MODE1))
		return -EINVAL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
					adev->pdev->bus->number, 1);
	if (!p)
		return -ENODEV;

	expires = pm_runtime_autosuspend_expiration(&(p->dev));
	if (!expires)
		/*
		 * If we cannot get the audio device autosuspend delay,
		 * a fixed 4S interval will be used. Considering 3S is
		 * the audio controller default autosuspend delay setting.
		 * 4S used here is guaranteed to cover that.
		 */
		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;

	while (!pm_runtime_status_suspended(&(p->dev))) {
		if (!pm_runtime_suspend(&(p->dev)))
			break;

		if (expires < ktime_get_mono_fast_ns()) {
			dev_warn(adev->dev, "failed to suspend display audio\n");
			pci_dev_put(p);
			/* TODO: abort the succeeding gpu reset? */
			return -ETIMEDOUT;
		}
	}

	pm_runtime_disable(&(p->dev));

	pci_dev_put(p);
	return 0;
}

/* Cancel any reset work queued before this reset completed; later resets
 * remain valid (see comment at the call site in amdgpu_device_asic_reset).
 */
static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

#if defined(CONFIG_DEBUG_FS)
	if (!amdgpu_sriov_vf(adev))
		cancel_work(&adev->reset_work);
#endif
	cancel_work(&adev->userq_reset_work);

	if (adev->kfd.dev)
		cancel_work(&adev->kfd.reset_work);

	if (amdgpu_sriov_vf(adev))
		cancel_work(&adev->virt.flr_work);

	if (con && adev->ras_enabled)
		cancel_work(&con->recovery_work);

}

/* Check bus status of every device in the list; non-zero if any failed. */
static int amdgpu_device_health_check(struct list_head *device_list_handle)
{
	struct amdgpu_device *tmp_adev;
	int ret = 0;

	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		ret |= amdgpu_device_bus_status_check(tmp_adev);
	}

	return ret;
}

static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
					   struct list_head *device_list,
					   struct amdgpu_hive_info *hive)
{
	struct amdgpu_device *tmp_adev = NULL;

	/*
	 * Build list of devices to reset.
	 * In case we are in XGMI hive mode, resort the device list
	 * to put adev in the 1st position.
	 */
	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			list_add_tail(&tmp_adev->reset_list, device_list);
			if (adev->shutdown)
				tmp_adev->shutdown = true;
			if (amdgpu_reset_in_dpc(adev))
				tmp_adev->pcie_reset_ctx.in_link_reset = true;
		}
		if (!list_is_first(&adev->reset_list, device_list))
			list_rotate_to_front(&adev->reset_list, device_list);
	} else {
		list_add_tail(&adev->reset_list, device_list);
	}
}

/* Lock the reset domain of the first device in the list (shared by hive). */
static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev,
						  struct list_head *device_list)
{
	struct amdgpu_device *tmp_adev = NULL;

	if (list_empty(device_list))
		return;
	tmp_adev =
		list_first_entry(device_list, struct amdgpu_device, reset_list);
	amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
}

/* Unlock the reset domain taken by amdgpu_device_recovery_get_reset_lock. */
static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev,
						  struct list_head *device_list)
{
	struct amdgpu_device *tmp_adev = NULL;

	if (list_empty(device_list))
		return;
	tmp_adev =
		list_first_entry(device_list, struct amdgpu_device, reset_list);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
}

/*
 * Quiesce every device in the reset list: suspend audio, cancel delayed
 * init work, pause KFD, suspend DRM clients and stop all ring schedulers.
 */
static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
					  struct amdgpu_job *job,
					  struct amdgpu_reset_context *reset_context,
					  struct list_head *device_list,
					  struct amdgpu_hive_info *hive,
					  bool need_emergency_restart)
{
	struct amdgpu_device *tmp_adev = NULL;
	int i;

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list, reset_list) {
		amdgpu_device_set_mp1_state(tmp_adev);

		/*
		 * Try to put the audio codec into suspend state
		 * before gpu reset started.
		 *
		 * Due to the power domain of the graphics device
		 * is shared with AZ power domain. Without this,
		 * we may change the audio hardware from behind
		 * the audio driver's back. That will trigger
		 * some audio codec errors.
		 */
		if (!amdgpu_device_suspend_display_audio(tmp_adev))
			tmp_adev->pcie_reset_ctx.audio_suspended = true;

		amdgpu_ras_set_error_query_ready(tmp_adev, false);

		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

		amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);

		/*
		 * Mark these ASICs to be reset as untracked first
		 * And add them back after reset completed
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		drm_client_dev_suspend(adev_to_drm(tmp_adev));

		/* disable ras on ALL IPs */
		if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) &&
		    amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		amdgpu_userq_pre_reset(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!amdgpu_ring_sched_ready(ring))
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			if (need_emergency_restart)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
		atomic_inc(&tmp_adev->gpu_reset_counter);
	}
}

/*
 * Run pre-asic-reset on each device, then the actual reset: SR-IOV FLR
 * (with retries) for VFs, amdgpu_do_asic_reset otherwise.  Finally drop
 * stale queued reset work.  Returns 0 on success, negative code on failure.
 */
static int amdgpu_device_asic_reset(struct amdgpu_device *adev,
				    struct list_head *device_list,
				    struct amdgpu_reset_context *reset_context)
{
	struct amdgpu_device *tmp_adev = NULL;
	int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
	int r = 0;

retry:	/* Rest of adevs pre asic reset from XGMI hive. */
	list_for_each_entry(tmp_adev, device_list, reset_list) {
		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
		/*TODO Should we stop ?*/
		if (r) {
			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
				r, adev_to_drm(tmp_adev)->unique);
			tmp_adev->asic_reset_res = r;
		}
	}

	/* Actual ASIC resets if needed.*/
	/* Host driver will handle XGMI hive reset for SRIOV */
	if (amdgpu_sriov_vf(adev)) {

		/* Bail out of reset early */
		if (amdgpu_ras_is_rma(adev))
			return -ENODEV;

		if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
			dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
			amdgpu_ras_set_fed(adev, true);
			set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
		}

		r = amdgpu_device_reset_sriov(adev, reset_context);
		if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
			amdgpu_virt_release_full_gpu(adev, true);
			goto retry;
		}
		if (r)
			adev->asic_reset_res = r;
	} else {
		r = amdgpu_do_asic_reset(device_list, reset_context);
		if (r && r == -EAGAIN)
			goto retry;
	}

	list_for_each_entry(tmp_adev, device_list, reset_list) {
		/*
		 * Drop any pending non scheduler resets queued before reset is done.
		 * Any reset scheduled after this point would be valid. Scheduler resets
		 * were already dropped during drm_sched_stop and no new ones can come
		 * in before drm_sched_start.
		 */
		amdgpu_device_stop_pending_resets(tmp_adev);
	}

	return r;
}

/*
 * Restart the ring schedulers on every device after the reset and report
 * per-device success/failure.  Returns the first non-zero asic_reset_res.
 */
static int amdgpu_device_sched_resume(struct list_head *device_list,
				      struct amdgpu_reset_context *reset_context,
				      bool job_signaled)
{
	struct amdgpu_device *tmp_adev = NULL;
	int i, r = 0;

	/* Post ASIC reset for all devs .*/
	list_for_each_entry(tmp_adev, device_list, reset_list) {

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!amdgpu_ring_sched_ready(ring))
				continue;

			drm_sched_start(&ring->sched, 0);
		}

		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));

		if (tmp_adev->asic_reset_res) {
			/* bad news, how to tell it to userspace ?
			 * for ras error, we should report GPU bad status instead of
			 * reset failure
			 */
			if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
			    !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
				dev_info(
					tmp_adev->dev,
					"GPU reset(%d) failed with error %d\n",
					atomic_read(
						&tmp_adev->gpu_reset_counter),
					tmp_adev->asic_reset_res);
			amdgpu_vf_error_put(tmp_adev,
					    AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0,
					    tmp_adev->asic_reset_res);
			if (!r)
				r = tmp_adev->asic_reset_res;
			tmp_adev->asic_reset_res = 0;
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n",
				 atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(tmp_adev,
							   AMDGPU_SS_DEV_D0))
				dev_warn(tmp_adev->dev,
					 "smart shift update failed\n");
		}
	}

	return r;
}

/*
 * Final post-reset step for each device: resume KFD and display audio,
 * restore MP1 state and re-enable RAS error queries.
 */
static void amdgpu_device_gpu_resume(struct amdgpu_device *adev,
				     struct list_head *device_list,
				     bool need_emergency_restart)
{
	struct amdgpu_device *tmp_adev = NULL;

	list_for_each_entry(tmp_adev, device_list, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/* kfd_post_reset will do nothing if kfd device is not initialized,
		 * need to bring up kfd here if it's not be initialized before
		 */
		/* NOTE(review): this branch checks/initializes adev rather than
		 * tmp_adev inside the per-device loop — confirm this is intended
		 * for the XGMI multi-device case.
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (tmp_adev->pcie_reset_ctx.audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);

		amdgpu_device_unset_mp1_state(tmp_adev);

		amdgpu_ras_set_error_query_ready(tmp_adev, true);

	}
}


/**
 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
 * @adev: amdgpu_device pointer
 * @job: which job trigger hang
 * @reset_context: amdgpu reset context pointer
 *
 * Attempt to reset the GPU if it has hung (all asics).
 * Attempt to do soft-reset or full-reset and reinitialize Asic
 * Returns 0 for success or an error on failure.
 */

int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			      struct amdgpu_job *job,
			      struct amdgpu_reset_context *reset_context)
{
	struct list_head device_list;
	bool job_signaled = false;
	struct amdgpu_hive_info *hive = NULL;
	int r = 0;
	bool need_emergency_restart = false;
	/* save the pasid here as the job may be freed before the end of the reset */
	int pasid = job ? job->pasid : -EINVAL;

	/*
	 * If it reaches here because of hang/timeout and a RAS error is
	 * detected at the same time, let RAS recovery take care of it.
	 */
	if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) &&
	    !amdgpu_sriov_vf(adev) &&
	    reset_context->src != AMDGPU_RESET_SRC_RAS) {
		dev_dbg(adev->dev,
			"Gpu recovery from source: %d yielding to RAS error recovery handling",
			reset_context->src);
		return 0;
	}

	/*
	 * Special case: RAS triggered and full reset isn't supported
	 */
	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);

	/*
	 * Flush RAM to disk so that after reboot
	 * the user can read log and see why the system rebooted.
	 */
	if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
		amdgpu_ras_get_context(adev)->reboot) {
		dev_warn(adev->dev, "Emergency reboot.");

		ksys_sync_helper();
		emergency_restart();
	}

	dev_info(adev->dev, "GPU %s begin!. Source: %d\n",
		 need_emergency_restart ? "jobs stop" : "reset",
		 reset_context->src);

	/* XGMI hive: serialize recovery for all devices in the hive */
	if (!amdgpu_sriov_vf(adev))
		hive = amdgpu_get_xgmi_hive(adev);
	if (hive)
		mutex_lock(&hive->hive_lock);

	reset_context->job = job;
	reset_context->hive = hive;
	INIT_LIST_HEAD(&device_list);

	amdgpu_device_recovery_prepare(adev, &device_list, hive);

	if (!amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_health_check(&device_list);
		if (r)
			goto end_reset;
	}

	/* Cannot be called after locking reset domain */
	amdgpu_ras_pre_reset(adev, &device_list);

	/* We need to lock reset domain only once both for XGMI and single device */
	amdgpu_device_recovery_get_reset_lock(adev, &device_list);

	amdgpu_device_halt_activities(adev, job, reset_context, &device_list,
				      hive, need_emergency_restart);
	if (need_emergency_restart)
		goto skip_sched_resume;
	/*
	 * Must check guilty signal here since after this point all old
	 * HW fences are force signaled.
	 *
	 * job->base holds a reference to parent fence
	 */
	if (job && (dma_fence_get_status(&job->hw_fence->base) > 0)) {
		job_signaled = true;
		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
		goto skip_hw_reset;
	}

	r = amdgpu_device_asic_reset(adev, &device_list, reset_context);
	if (r)
		goto reset_unlock;
skip_hw_reset:
	r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled);
	if (r)
		goto reset_unlock;
skip_sched_resume:
	amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart);
reset_unlock:
	amdgpu_device_recovery_put_reset_lock(adev, &device_list);
	amdgpu_ras_post_reset(adev, &device_list);
end_reset:
	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);

	atomic_set(&adev->reset_domain->reset_res, r);

	if (!r) {
		struct amdgpu_task_info *ti = NULL;

		/*
		 * The job may already be freed at this point via the sched tdr workqueue so
		 * use the cached pasid.
		 */
		if (pasid >= 0)
			ti = amdgpu_vm_get_task_info_pasid(adev, pasid);

		/* report the recovery to userspace, with task info if we still have it */
		drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE,
				     ti ? &ti->task : NULL);

		amdgpu_vm_put_task_info(ti);
	}

	return r;
}

/**
 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * first physical partner to an AMD dGPU.
 * This will exclude any virtual switches and links.
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
		/* walk up to the first bridge that is not part of the dGPU package */
		while ((parent = pci_upstream_bridge(parent))) {
			/* skip upstream/downstream switches internal to dGPU*/
			if (parent->vendor == PCI_VENDOR_ID_ATI)
				continue;
			*speed = pcie_get_speed_cap(parent);
			*width = pcie_get_width_cap(parent);
			break;
		}
	} else {
		/* use the current speeds rather than max if switching is not supported */
		pcie_bandwidth_available(adev->pdev, NULL, speed, width);
	}
}

/**
 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * AMD dGPU which may be a virtual upstream bridge.
 */
static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
					enum pci_bus_speed *speed,
					enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	parent = pci_upstream_bridge(parent);
	if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
		/* use the upstream/downstream switches internal to dGPU */
		*speed = pcie_get_speed_cap(parent);
		*width = pcie_get_width_cap(parent);
		/* keep walking up; the topmost internal switch wins */
		while ((parent = pci_upstream_bridge(parent))) {
			if (parent->vendor == PCI_VENDOR_ID_ATI) {
				/* use the upstream/downstream switches internal to dGPU */
				*speed = pcie_get_speed_cap(parent);
				*width = pcie_get_width_cap(parent);
			}
		}
	} else {
		/* use the device itself */
		*speed = pcie_get_speed_cap(adev->pdev);
		*width = pcie_get_width_cap(adev->pdev);
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width, link_width;

	/* module parameters override anything detected below */
	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	/* both masks already known - nothing left to probe */
	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
					&platform_link_width);
	amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			/* each supported speed implies all the lower speeds */
			if (speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;

		}
	}
	if (adev->pm.pcie_mlw_mask == 0) {
		/* asic caps */
		if (link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK;
		} else {
			/* each supported width implies all the narrower widths */
			switch (link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
		/* platform caps */
		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
		} else {
			switch (platform_link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
	}
}

/**
 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
 *
 * @adev: amdgpu_device pointer
 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
 *
 * Return true if @peer_adev can access (DMA) @adev through the PCIe
 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
 * @peer_adev.
 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
				      struct amdgpu_device *peer_adev)
{
#ifdef CONFIG_HSA_AMD_P2P
	bool p2p_access =
		!adev->gmc.xgmi.connected_to_cpu &&
		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
	if (!p2p_access)
		dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n",
			pci_name(peer_adev->pdev));

	/* "large BAR": all of VRAM is visible through the aperture */
	bool is_large_bar = adev->gmc.visible_vram_size &&
			    adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
	bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev);

	if (!p2p_addressable) {
		/* Without IOMMU remapping the whole aperture must fall inside
		 * the peer's DMA mask (assume 32 bit when no mask is set).
		 */
		uint64_t address_mask = peer_adev->dev->dma_mask ?
			~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
		resource_size_t aper_limit =
			adev->gmc.aper_base + adev->gmc.aper_size - 1;

		p2p_addressable = !(adev->gmc.aper_base & address_mask ||
				     aper_limit & address_mask);
	}
	return pcie_p2p && is_large_bar && p2p_access && p2p_addressable;
#else
	return false;
#endif
}

/**
 * amdgpu_device_baco_enter - enter the BACO power state
 * @adev: amdgpu_device pointer
 *
 * Returns 0 on success, -ENOTSUPP if the device does not support BACO,
 * or the error from amdgpu_dpm_baco_enter().
 */
int amdgpu_device_baco_enter(struct amdgpu_device *adev)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(adev))
		return -ENOTSUPP;

	/* gate doorbell interrupts while in BACO when RAS is active */
	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

/**
 * amdgpu_device_baco_exit - leave the BACO power state
 * @adev: amdgpu_device pointer
 *
 * Re-enables doorbell interrupts gated by amdgpu_device_baco_enter().
 * Returns 0 on success, -ENOTSUPP if the device does not support BACO,
 * or the error from amdgpu_dpm_baco_exit().
 */
int amdgpu_device_baco_exit(struct amdgpu_device *adev)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(adev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	/* hive reference dropped automatically at every return path */
	struct amdgpu_hive_info *hive __free(xgmi_put_hive) =
		amdgpu_get_xgmi_hive(adev);
	struct amdgpu_reset_context reset_context;
	struct list_head device_list;

	dev_info(adev->dev, "PCI error: detected callback!!\n");

	/* remembered so amdgpu_pci_resume() only acts on frozen-channel errors */
	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state);
		return PCI_ERS_RESULT_CAN_RECOVER;
	case pci_channel_io_frozen:
		/* Fatal error, prepare for slot reset */
		dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state);
		if (hive) {
			/* Hive devices should be able to support FW based
			 * link reset on other devices, if not return.
			 */
			if (!amdgpu_dpm_is_link_reset_supported(adev)) {
				dev_warn(adev->dev,
					 "No support for XGMI hive yet...\n");
				return PCI_ERS_RESULT_DISCONNECT;
			}
			/* Set dpc status only if device is part of hive
			 * Non-hive devices should be able to recover after
			 * link reset.
			 */
			amdgpu_reset_set_dpc_status(adev, true);

			mutex_lock(&hive->hive_lock);
		}
		memset(&reset_context, 0, sizeof(reset_context));
		INIT_LIST_HEAD(&device_list);

		amdgpu_device_recovery_prepare(adev, &device_list, hive);
		/* NOTE: the reset lock taken here is released later, either in
		 * amdgpu_pci_slot_reset() on failure or in amdgpu_pci_resume().
		 */
		amdgpu_device_recovery_get_reset_lock(adev, &device_list);
		amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list,
					      hive, false);
		if (hive)
			mutex_unlock(&hive->hive_lock);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state);
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);

	dev_info(adev->dev, "PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_reset_context reset_context;
	struct amdgpu_device *tmp_adev;
	struct amdgpu_hive_info *hive;
	struct list_head device_list;
	struct pci_dev *link_dev;
	int r = 0, i, timeout;
	u32 memsize;
	u16 status;

	dev_info(adev->dev, "PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));
	INIT_LIST_HEAD(&device_list);
	hive = amdgpu_get_xgmi_hive(adev);
	if (hive) {
		mutex_lock(&hive->hive_lock);
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
			list_add_tail(&tmp_adev->reset_list, &device_list);
	} else {
		list_add_tail(&adev->reset_list, &device_list);
	}

	/* poll the switch upstream port if we have one, else the GPU itself */
	if (adev->pcie_reset_ctx.swus)
		link_dev = adev->pcie_reset_ctx.swus;
	else
		link_dev = adev->pdev;
	/* wait for asic to come out of reset, timeout = 10s */
	timeout = 10000;
	do {
		usleep_range(10000, 10500);
		/* r is overwritten below; a valid vendor ID read is the real check */
		r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status);
		timeout -= 10;
	} while (timeout > 0 && (status != PCI_VENDOR_ID_ATI) &&
		 (status != PCI_VENDOR_ID_AMD));

	if ((status != PCI_VENDOR_ID_ATI) && (status != PCI_VENDOR_ID_AMD)) {
		r = -ETIME;
		goto out;
	}

	amdgpu_device_load_switch_state(adev);
	/* Restore PCI confspace */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);

	if (hive) {
		reset_context.hive = hive;
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
			tmp_adev->pcie_reset_ctx.in_link_reset = true;
	} else {
		set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
	}

	r = amdgpu_device_asic_reset(adev, &device_list, &reset_context);
out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);
		dev_info(adev->dev, "PCIe error recovery succeeded\n");
	} else {
		dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r);
		if (hive) {
			list_for_each_entry(tmp_adev, &device_list, reset_list)
				amdgpu_device_unset_mp1_state(tmp_adev);
		}
		/* on success the lock is released in amdgpu_pci_resume() */
		amdgpu_device_recovery_put_reset_lock(adev, &device_list);
	}

	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that its
 * OK to resume normal operation.
7170 */ 7171 void amdgpu_pci_resume(struct pci_dev *pdev) 7172 { 7173 struct drm_device *dev = pci_get_drvdata(pdev); 7174 struct amdgpu_device *adev = drm_to_adev(dev); 7175 struct list_head device_list; 7176 struct amdgpu_hive_info *hive = NULL; 7177 struct amdgpu_device *tmp_adev = NULL; 7178 7179 dev_info(adev->dev, "PCI error: resume callback!!\n"); 7180 7181 /* Only continue execution for the case of pci_channel_io_frozen */ 7182 if (adev->pci_channel_state != pci_channel_io_frozen) 7183 return; 7184 7185 INIT_LIST_HEAD(&device_list); 7186 7187 hive = amdgpu_get_xgmi_hive(adev); 7188 if (hive) { 7189 mutex_lock(&hive->hive_lock); 7190 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7191 tmp_adev->pcie_reset_ctx.in_link_reset = false; 7192 list_add_tail(&tmp_adev->reset_list, &device_list); 7193 } 7194 } else 7195 list_add_tail(&adev->reset_list, &device_list); 7196 7197 amdgpu_device_sched_resume(&device_list, NULL, NULL); 7198 amdgpu_device_gpu_resume(adev, &device_list, false); 7199 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7200 7201 if (hive) { 7202 mutex_unlock(&hive->hive_lock); 7203 amdgpu_put_xgmi_hive(hive); 7204 } 7205 } 7206 7207 static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev) 7208 { 7209 struct pci_dev *swus, *swds; 7210 int r; 7211 7212 swds = pci_upstream_bridge(adev->pdev); 7213 if (!swds || swds->vendor != PCI_VENDOR_ID_ATI || 7214 pci_pcie_type(swds) != PCI_EXP_TYPE_DOWNSTREAM) 7215 return; 7216 swus = pci_upstream_bridge(swds); 7217 if (!swus || 7218 (swus->vendor != PCI_VENDOR_ID_ATI && 7219 swus->vendor != PCI_VENDOR_ID_AMD) || 7220 pci_pcie_type(swus) != PCI_EXP_TYPE_UPSTREAM) 7221 return; 7222 7223 /* If already saved, return */ 7224 if (adev->pcie_reset_ctx.swus) 7225 return; 7226 /* Upstream bridge is ATI, assume it's SWUS/DS architecture */ 7227 r = pci_save_state(swds); 7228 if (r) 7229 return; 7230 adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(swds); 7231 7232 
r = pci_save_state(swus); 7233 if (r) 7234 return; 7235 adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(swus); 7236 7237 adev->pcie_reset_ctx.swus = swus; 7238 } 7239 7240 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev) 7241 { 7242 struct pci_dev *pdev; 7243 int r; 7244 7245 if (!adev->pcie_reset_ctx.swds_pcistate || 7246 !adev->pcie_reset_ctx.swus_pcistate) 7247 return; 7248 7249 pdev = adev->pcie_reset_ctx.swus; 7250 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate); 7251 if (!r) { 7252 pci_restore_state(pdev); 7253 } else { 7254 dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r); 7255 return; 7256 } 7257 7258 pdev = pci_upstream_bridge(adev->pdev); 7259 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate); 7260 if (!r) 7261 pci_restore_state(pdev); 7262 else 7263 dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r); 7264 } 7265 7266 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 7267 { 7268 struct drm_device *dev = pci_get_drvdata(pdev); 7269 struct amdgpu_device *adev = drm_to_adev(dev); 7270 int r; 7271 7272 if (amdgpu_sriov_vf(adev)) 7273 return false; 7274 7275 r = pci_save_state(pdev); 7276 if (!r) { 7277 kfree(adev->pci_state); 7278 7279 adev->pci_state = pci_store_saved_state(pdev); 7280 7281 if (!adev->pci_state) { 7282 dev_err(adev->dev, "Failed to store PCI saved state"); 7283 return false; 7284 } 7285 } else { 7286 dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r); 7287 return false; 7288 } 7289 7290 amdgpu_device_cache_switch_state(adev); 7291 7292 return true; 7293 } 7294 7295 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 7296 { 7297 struct drm_device *dev = pci_get_drvdata(pdev); 7298 struct amdgpu_device *adev = drm_to_adev(dev); 7299 int r; 7300 7301 if (!adev->pci_state) 7302 return false; 7303 7304 r = pci_load_saved_state(pdev, adev->pci_state); 7305 7306 if (!r) { 7307 pci_restore_state(pdev); 7308 } else { 7309 
		dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

/*
 * amdgpu_device_flush_hdp - flush the HDP (Host Data Path) write queue.
 *
 * Skipped on APUs (bare metal) and devices connected to the CPU via XGMI.
 * If @ring can emit the flush itself that is preferred; SRIOV runtime may
 * use the KIQ, otherwise fall back to a direct HDP flush.
 */
void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush) {
		amdgpu_ring_emit_hdp_flush(ring);
		return;
	}

	if (!ring && amdgpu_sriov_runtime(adev)) {
		if (!amdgpu_kiq_hdp_flush(adev))
			return;
	}

	amdgpu_hdp_flush(adev, ring);
}

/*
 * amdgpu_device_invalidate_hdp - invalidate the HDP read cache.
 * Same APU/XGMI short-circuits as amdgpu_device_flush_hdp().
 */
void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_hdp_invalidate(adev, ring);
}

/* Non-zero while a GPU reset is in progress on this device's reset domain. */
int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It will help to maintain error context when error occurred.
 * Compare to a simple hang, the system will keep stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as following:
 *
 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc),
 * clears all CPU mappings to device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 * flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

/*
 * amdgpu_device_pcie_port_rreg - read a PCIe port register through the NBIO
 * index/data pair.  The lock serializes the index write against the data
 * read; the dummy RREG32(address) posts the index write first.
 */
u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

/*
 * amdgpu_device_pcie_port_wreg - write a PCIe port register through the NBIO
 * index/data pair; the trailing read posts the data write.
 */
void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_gang - return a reference to the current gang
 * @adev: amdgpu_device pointer
 *
 * Returns: A new
 * reference to the current gang leader.
 */
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
{
	struct dma_fence *fence;

	/* RCU protects gang_submit against a concurrent switch_gang() */
	rcu_read_lock();
	fence = dma_fence_get_rcu_safe(&adev->gang_submit);
	rcu_read_unlock();
	return fence;
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	/* reference which ends up stored in adev->gang_submit on success */
	dma_fence_get(gang);
	do {
		dma_fence_put(old);
		old = amdgpu_device_get_gang(adev);
		/* already the current gang, nothing to exchange */
		if (old == gang)
			break;

		/*
		 * Current gang leader hasn't signaled yet: give up, return it
		 * to the caller and drop the reference taken for adev above.
		 */
		if (!dma_fence_is_signaled(old)) {
			dma_fence_put(gang);
			return old;
		}

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	/*
	 * Drop it once for the exchanged reference in adev and once for the
	 * thread local reference acquired in amdgpu_device_get_gang().
	 */
	dma_fence_put(old);
	dma_fence_put(old);
	return NULL;
}

/**
 * amdgpu_device_enforce_isolation - enforce HW isolation
 * @adev: the amdgpu device pointer
 * @ring: the HW ring the job is supposed to run on
 * @job: the job which is about to be pushed to the HW ring
 *
 * Makes sure that only one client at a time can use the GFX block.
 * Returns: The dependency to wait on before the job can be pushed to the HW.
 * The function is called multiple times until NULL is returned.
 */
struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
						  struct amdgpu_ring *ring,
						  struct amdgpu_job *job)
{
	/* per-XCP isolation state this ring belongs to */
	struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id];
	struct drm_sched_fence *f = job->base.s_fence;
	struct dma_fence *dep;
	void *owner;
	int r;

	/*
	 * For now enforce isolation only for the GFX block since we only need
	 * the cleaner shader on those rings.
	 */
	if (ring->funcs->type != AMDGPU_RING_TYPE_GFX &&
	    ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
		return NULL;

	/*
	 * All submissions where enforce isolation is false are handled as if
	 * they come from a single client. Use ~0l as the owner to distinguish
	 * it from kernel submissions where the owner is NULL.
	 */
	owner = job->enforce_isolation ? f->owner : (void *)~0l;

	mutex_lock(&adev->enforce_isolation_mutex);

	/*
	 * The "spearhead" submission is the first one which changes the
	 * ownership to its client. We always need to wait for it to be
	 * pushed to the HW before proceeding with anything.
	 */
	if (&f->scheduled != isolation->spearhead &&
	    !dma_fence_is_signaled(isolation->spearhead)) {
		dep = isolation->spearhead;
		goto out_grab_ref;
	}

	if (isolation->owner != owner) {

		/*
		 * Wait for any gang to be assembled before switching to a
		 * different owner or otherwise we could deadlock the
		 * submissions.
		 */
		if (!job->gang_submit) {
			dep = amdgpu_device_get_gang(adev);
			if (!dma_fence_is_signaled(dep))
				goto out_return_dep;
			dma_fence_put(dep);
		}

		/* this job becomes the new spearhead for the new owner */
		dma_fence_put(isolation->spearhead);
		isolation->spearhead = dma_fence_get(&f->scheduled);
		amdgpu_sync_move(&isolation->active, &isolation->prev);
		trace_amdgpu_isolation(isolation->owner, owner);
		isolation->owner = owner;
	}

	/*
	 * Specifying the ring here helps to pipeline submissions even when
	 * isolation is enabled. If that is not desired for testing NULL can be
	 * used instead of the ring to enforce a CPU round trip while switching
	 * between clients.
	 */
	dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
	r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
	if (r)
		dev_warn(adev->dev, "OOM tracking isolation\n");

out_grab_ref:
	dma_fence_get(dep);
out_return_dep:
	mutex_unlock(&adev->enforce_isolation_mutex);
	return dep;
}

/**
 * amdgpu_device_has_display_hardware - check for display hardware
 * @adev: amdgpu_device pointer
 *
 * Returns: true if the ASIC has display hardware, false otherwise.  Older
 * ASICs are listed explicitly; everything else is decided by IP discovery
 * (DCE HWIP present and the DMU block not harvested).
 */
bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask &
AMD_HARVEST_IP_DMU_MASK)) 7610 return false; 7611 return true; 7612 } 7613 } 7614 7615 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, 7616 uint32_t inst, uint32_t reg_addr, char reg_name[], 7617 uint32_t expected_value, uint32_t mask) 7618 { 7619 uint32_t ret = 0; 7620 uint32_t old_ = 0; 7621 uint32_t tmp_ = RREG32(reg_addr); 7622 uint32_t loop = adev->usec_timeout; 7623 7624 while ((tmp_ & (mask)) != (expected_value)) { 7625 if (old_ != tmp_) { 7626 loop = adev->usec_timeout; 7627 old_ = tmp_; 7628 } else 7629 udelay(1); 7630 tmp_ = RREG32(reg_addr); 7631 loop--; 7632 if (!loop) { 7633 dev_warn( 7634 adev->dev, 7635 "Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn", 7636 inst, reg_name, (uint32_t)expected_value, 7637 (uint32_t)(tmp_ & (mask))); 7638 ret = -ETIMEDOUT; 7639 break; 7640 } 7641 } 7642 return ret; 7643 } 7644 7645 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) 7646 { 7647 ssize_t size = 0; 7648 7649 if (!ring || !ring->adev) 7650 return size; 7651 7652 if (amdgpu_device_should_recover_gpu(ring->adev)) 7653 size |= AMDGPU_RESET_TYPE_FULL; 7654 7655 if (unlikely(!ring->adev->debug_disable_soft_recovery) && 7656 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery) 7657 size |= AMDGPU_RESET_TYPE_SOFT_RESET; 7658 7659 return size; 7660 } 7661 7662 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) 7663 { 7664 ssize_t size = 0; 7665 7666 if (supported_reset == 0) { 7667 size += sysfs_emit_at(buf, size, "unsupported"); 7668 size += sysfs_emit_at(buf, size, "\n"); 7669 return size; 7670 7671 } 7672 7673 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET) 7674 size += sysfs_emit_at(buf, size, "soft "); 7675 7676 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) 7677 size += sysfs_emit_at(buf, size, "queue "); 7678 7679 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE) 7680 size += sysfs_emit_at(buf, size, "pipe "); 7681 7682 if (supported_reset & AMDGPU_RESET_TYPE_FULL) 7683 size += 
sysfs_emit_at(buf, size, "full "); 7684 7685 size += sysfs_emit_at(buf, size, "\n"); 7686 return size; 7687 } 7688 7689 void amdgpu_device_set_uid(struct amdgpu_uid *uid_info, 7690 enum amdgpu_uid_type type, uint8_t inst, 7691 uint64_t uid) 7692 { 7693 if (!uid_info) 7694 return; 7695 7696 if (type >= AMDGPU_UID_TYPE_MAX) { 7697 dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n", 7698 type); 7699 return; 7700 } 7701 7702 if (inst >= AMDGPU_UID_INST_MAX) { 7703 dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n", 7704 inst); 7705 return; 7706 } 7707 7708 if (uid_info->uid[type][inst] != 0) { 7709 dev_warn_once( 7710 uid_info->adev->dev, 7711 "Overwriting existing UID %llu for type %d instance %d\n", 7712 uid_info->uid[type][inst], type, inst); 7713 } 7714 7715 uid_info->uid[type][inst] = uid; 7716 } 7717 7718 u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info, 7719 enum amdgpu_uid_type type, uint8_t inst) 7720 { 7721 if (!uid_info) 7722 return 0; 7723 7724 if (type >= AMDGPU_UID_TYPE_MAX) { 7725 dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n", 7726 type); 7727 return 0; 7728 } 7729 7730 if (inst >= AMDGPU_UID_INST_MAX) { 7731 dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n", 7732 inst); 7733 return 0; 7734 } 7735 7736 return uid_info->uid[type][inst]; 7737 } 7738