/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */

#include <linux/aperture.h>
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>
#include <linux/nospec.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_client_event.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_ras_mgr.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/cyan_skillfish_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

#define AMDGPU_VBIOS_SKIP (1U << 0)
#define AMDGPU_VBIOS_OPTIONAL (1U << 1)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
/*
 * Default init level where all blocks are expected to be initialized. This is
 * the level of initialization expected by default and also after a full reset
 * of the device.
 */
struct amdgpu_init_level amdgpu_init_default = {
	.level = AMDGPU_INIT_LEVEL_DEFAULT,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

struct amdgpu_init_level amdgpu_init_recovery = {
	.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This
 * is used for cases like reset on initialization where the entire hive needs to
 * be reset before first use.
 */
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
	.hwini_ip_block_mask =
		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};

static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev);
static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev);
static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev);

static void amdgpu_device_load_switch_state(struct amdgpu_device *adev);

static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
					     enum amd_ip_block_type block)
{
	return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
}

void amdgpu_set_init_level(struct amdgpu_device *adev,
			   enum amdgpu_init_lvl_id lvl)
{
	switch (lvl) {
	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
		adev->init_lvl = &amdgpu_init_minimal_xgmi;
		break;
	case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
		adev->init_lvl = &amdgpu_init_recovery;
		break;
	case AMDGPU_INIT_LEVEL_DEFAULT:
		fallthrough;
	default:
		adev->init_lvl = &amdgpu_init_default;
		break;
	}
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
				     void *data);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		   amdgpu_device_get_pcie_replay_count, NULL);
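
/*
 * Example (illustrative only; the card index and sysfs path vary by
 * system): the count can be read back with a plain sysfs read, e.g.
 *
 *   $ cat /sys/class/drm/card0/device/pcie_replay_count
 *   0
 */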
static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
{
	int ret = 0;

	if (amdgpu_nbio_is_replay_cnt_supported(adev))
		ret = sysfs_create_file(&adev->dev->kobj,
					&dev_attr_pcie_replay_count.attr);

	return ret;
}

static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
{
	if (amdgpu_nbio_is_replay_cnt_supported(adev))
		sysfs_remove_file(&adev->dev->kobj,
				  &dev_attr_pcie_replay_count.attr);
}

static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  const struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
		      AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}
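
/*
 * Usage sketch (illustrative only): reg_state is a binary sysfs file and
 * the file offset selects the register state type, so a userspace reader
 * would seek to one of the AMDGPU_SYS_REG_STATE_* offsets before reading,
 * along the lines of
 *
 *   int fd = open(".../reg_state", O_RDONLY);
 *   pread(fd, buf, sizeof(buf), AMDGPU_SYS_REG_STATE_XGMI);
 */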
/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for providing board-related
 * information. It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};
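
/*
 * Example (illustrative only): reading the attribute returns the form
 * factor in "type : <form factor>" format, e.g.
 *
 *   $ cat /sys/class/drm/card0/device/board_info
 *   type : oam
 */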
/**
 * DOC: uma/carveout_options
 *
 * This is a read-only file that lists all available UMA allocation
 * options and their corresponding indices. Example output::
 *
 * $ cat uma/carveout_options
 * 0: Minimum (512 MB)
 * 1: (1 GB)
 * 2: (2 GB)
 * 3: (4 GB)
 * 4: (6 GB)
 * 5: (8 GB)
 * 6: (12 GB)
 * 7: Medium (16 GB)
 * 8: (24 GB)
 * 9: High (32 GB)
 */
static ssize_t carveout_options_show(struct device *dev,
				     struct device_attribute *attr,
				     char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	struct amdgpu_uma_carveout_info *uma_info = &adev->uma_info;
	uint32_t memory_carved;
	ssize_t size = 0;

	if (!uma_info || !uma_info->num_entries)
		return -ENODEV;

	for (int i = 0; i < uma_info->num_entries; i++) {
		memory_carved = uma_info->entries[i].memory_carved_mb;
		if (memory_carved >= SZ_1G / SZ_1M) {
			size += sysfs_emit_at(buf, size, "%d: %s (%u GB)\n",
					      i,
					      uma_info->entries[i].name,
					      memory_carved >> 10);
		} else {
			size += sysfs_emit_at(buf, size, "%d: %s (%u MB)\n",
					      i,
					      uma_info->entries[i].name,
					      memory_carved);
		}
	}

	return size;
}
static DEVICE_ATTR_RO(carveout_options);

/**
 * DOC: uma/carveout
 *
 * This file is both readable and writable. When read, it shows the
 * index of the current setting. Writing a valid index to this file
 * allows users to change the UMA carveout size to the selected option
 * on the next boot.
 *
 * The available options and their corresponding indices can be read
 * from the uma/carveout_options file.
 */
static ssize_t carveout_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%u\n", adev->uma_info.uma_option_index);
}

static ssize_t carveout_store(struct device *dev,
			      struct device_attribute *attr,
			      const char *buf, size_t count)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	struct amdgpu_uma_carveout_info *uma_info = &adev->uma_info;
	struct amdgpu_uma_carveout_option *opt;
	unsigned long val;
	uint8_t flags;
	int r;

	r = kstrtoul(buf, 10, &val);
	if (r)
		return r;

	if (val >= uma_info->num_entries)
		return -EINVAL;

	val = array_index_nospec(val, uma_info->num_entries);
	opt = &uma_info->entries[val];

	if (!(opt->flags & AMDGPU_UMA_FLAG_AUTO) &&
	    !(opt->flags & AMDGPU_UMA_FLAG_CUSTOM)) {
		drm_err_once(ddev, "Option %lu not supported due to lack of Custom/Auto flag", val);
		return -EINVAL;
	}

	flags = opt->flags;
	flags &= ~((flags & AMDGPU_UMA_FLAG_AUTO) >> 1);

	guard(mutex)(&uma_info->update_lock);

	r = amdgpu_acpi_set_uma_allocation_size(adev, val, flags);
	if (r)
		return r;

	uma_info->uma_option_index = val;

	return count;
}
static DEVICE_ATTR_RW(carveout);
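
/*
 * Usage sketch (illustrative only): selecting option 7 from the
 * uma/carveout_options list; the new carveout size takes effect on the
 * next boot.
 *
 *   # echo 7 > /sys/class/drm/card0/device/uma/carveout
 */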
static struct attribute *amdgpu_uma_attrs[] = {
	&dev_attr_carveout.attr,
	&dev_attr_carveout_options.attr,
	NULL
};

const struct attribute_group amdgpu_uma_attr_group = {
	.name = "uma",
	.attrs = amdgpu_uma_attrs
};

static void amdgpu_uma_sysfs_init(struct amdgpu_device *adev)
{
	int rc;

	if (!(adev->flags & AMD_IS_APU))
		return;

	if (!amdgpu_acpi_is_set_uma_allocation_size_supported())
		return;

	rc = amdgpu_atomfirmware_get_uma_carveout_info(adev, &adev->uma_info);
	if (rc) {
		drm_dbg(adev_to_drm(adev),
			"Failed to parse UMA carveout info from VBIOS: %d\n", rc);
		goto out_info;
	}

	mutex_init(&adev->uma_info.update_lock);

	rc = devm_device_add_group(adev->dev, &amdgpu_uma_attr_group);
	if (rc) {
		drm_dbg(adev_to_drm(adev), "Failed to add UMA carveout sysfs interfaces %d\n", rc);
		goto out_attr;
	}

	return;

out_attr:
	mutex_destroy(&adev->uma_info.update_lock);
out_info:
	return;
}

static void amdgpu_uma_sysfs_fini(struct amdgpu_device *adev)
{
	struct amdgpu_uma_carveout_info *uma_info = &adev->uma_info;

	if (!amdgpu_acpi_is_set_uma_allocation_size_supported())
		return;

	mutex_destroy(&uma_info->update_lock);
	uma_info->num_entries = 0;
}

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct amdgpu_device *adev)
{
	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct amdgpu_device *adev)
{
	if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
		return false;

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @adev: amdgpu device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported)
 * otherwise return 0.
 */
int amdgpu_device_supports_baco(struct amdgpu_device *adev)
{
	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	int bamaco_support;

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(adev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(adev)) {
			/* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(adev)) {
			/* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry && !amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				if (!amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
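
/*
 * Summary of the amdgpu_runtime_pm handling above: 2 forces BAMACO
 * (falling back to BACO when MACO is unsupported), 1 forces BACO, 0
 * disables runtime pm entirely, and -1/-2 (auto) pick PX, BOCO, BAMACO
 * or BACO based on the platform and ASIC support detected above.
 */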
/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct amdgpu_device *adev)
{
	return (amdgpu_device_supports_boco(adev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}
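
/*
 * Usage sketch (illustrative only): read the first dword of VRAM into a
 * local buffer via the MM_INDEX/MM_DATA path.
 *
 *   u32 val;
 *
 *   amdgpu_device_mm_access(adev, 0, &val, sizeof(val), false);
 */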
/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}
/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register byte read helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}
/*
 * MMIO register byte write helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for the debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}
/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}
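
/*
 * The index/data pair protocol used above, as a sketch: the register
 * address is posted to the index register, the readl() of the index
 * register flushes the posted write, and the value is then transferred
 * through the data register:
 *
 *   writel(reg_addr, pcie_index_offset);
 *   readl(pcie_index_offset);    // flush the posted index write
 *   r = readl(pcie_data_offset); // read the selected register
 */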
u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
			pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}
/**
 * amdgpu_device_indirect_wreg - write to an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write to a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Returns the device rev_id.
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%04X with 0x%08X\n", reg,
		v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%llX with 0x%08X\n", reg,
		v);
	BUG();
}
/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n",
		reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	dev_err(adev->dev,
		"Invalid callback to read register 0x%04X in block 0x%04X\n",
		reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		reg, block, v);
	BUG();
}

static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return AMDGPU_VBIOS_SKIP;

	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
		return AMDGPU_VBIOS_OPTIONAL;

	return 0;
}
/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	uint32_t flags;
	bool optional;
	int ret;

	amdgpu_asic_pre_asic_init(adev);
	flags = amdgpu_device_get_vbios_flags(adev);
	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		if (optional && !adev->bios)
			return 0;

		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		if (optional && !adev->bios)
			return 0;

		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND/OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
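
/*
 * Usage sketch (illustrative register and mask values): entries are
 * {offset, and_mask, or_mask} triplets; an and_mask of 0xffffffff
 * overwrites the whole register.
 *
 *   static const u32 golden_settings[] = {
 *           0x1234, 0xffffffff, 0x00000001,
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, golden_settings,
 *                                           ARRAY_SIZE(golden_settings));
 */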
/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&adev->wb.lock, flags);
	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics).
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long flags;

	wb >>= 3;
	spin_lock_irqsave(&adev->wb.lock, flags);
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
	spin_unlock_irqrestore(&adev->wb.lock, flags);
}
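
/*
 * Usage sketch (illustrative only): allocate a writeback slot, let the
 * GPU update it, then release it. The returned index is a dword offset
 * into adev->wb.wb.
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           // hardware writes status to adev->wb.gpu_addr + wb * 4,
 *           // the CPU reads it back from adev->wb.wb[wb]
 *           amdgpu_device_wb_free(adev, wb);
 *   }
 */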
/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	int max_size, r;
	unsigned int i;
	u16 cmd;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	if (!amdgpu_rebar)
		return 0;

	/* resizing on Dell G5 SE platforms causes problems with runtime pm */
	if ((amdgpu_runtime_pm != 0) &&
	    adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
	    adev->pdev->device == 0x731f &&
	    adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		dev_warn(
			adev->dev,
			"System can't access extended configuration space, please check!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root bus has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	max_size = pci_rebar_get_max_size(adev->pdev, 0);
	if (max_size < 0)
		return 0;
	rbar_size = min(max_size, rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Tear down doorbell as resizing will release BARs */
	amdgpu_doorbell_fini(adev);

	r = pci_resize_resource(adev->pdev, 0, rbar_size,
				(adev->asic_type >= CHIP_BONAIRE) ? 1 << 5
								  : 1 << 2);
	if (r == -ENOSPC)
		dev_info(adev->dev,
			 "Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		dev_err(adev->dev, "Problem resizing BAR0 (%d).", r);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}
1908 */ 1909 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1910 { 1911 uint32_t reg, flags; 1912 1913 if (amdgpu_sriov_vf(adev)) 1914 return false; 1915 1916 flags = amdgpu_device_get_vbios_flags(adev); 1917 if (flags & AMDGPU_VBIOS_SKIP) 1918 return false; 1919 if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios) 1920 return false; 1921 1922 if (amdgpu_passthrough(adev)) { 1923 /* for FIJI: in the whole-GPU pass-through virtualization case, after a VM 1924 * reboot some old SMC firmware still needs the driver to perform a vPost, 1925 * otherwise the GPU hangs. SMC firmware versions 22.15 and above do not 1926 * have this flaw, so we force a vPost only for versions below 22.15 1927 */ 1928 if (adev->asic_type == CHIP_FIJI) { 1929 int err; 1930 uint32_t fw_ver; 1931 1932 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1933 /* force vPost if error occurred */ 1934 if (err) 1935 return true; 1936 1937 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1938 release_firmware(adev->pm.fw); 1939 if (fw_ver < 0x00160e00) 1940 return true; 1941 } 1942 } 1943 1944 /* Don't post if we need to reset whole hive on init */ 1945 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 1946 return false; 1947 1948 if (adev->has_hw_reset) { 1949 adev->has_hw_reset = false; 1950 return true; 1951 } 1952 1953 /* bios scratch used on CIK+ */ 1954 if (adev->asic_type >= CHIP_BONAIRE) 1955 return amdgpu_atombios_scratch_need_asic_init(adev); 1956 1957 /* check MEM_SIZE for older asics */ 1958 reg = amdgpu_asic_get_config_memsize(adev); 1959 1960 if ((reg != 0) && (reg != 0xffffffff)) 1961 return false; 1962 1963 return true; 1964 } 1965 1966 /* 1967 * Check whether seamless boot is supported. 1968 * 1969 * So far we only support seamless boot on DCE 3.0 or later. 1970 * If users report that it works on older ASICs as well, we may 1971 * loosen this. 1972 */ 1973 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1974 { 1975 switch (amdgpu_seamless) { 1976 case -1: 1977 break; 1978 case 1: 1979 return true; 1980 case 0: 1981 return false; 1982 default: 1983 dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n", 1984 amdgpu_seamless); 1985 return false; 1986 } 1987 1988 if (!(adev->flags & AMD_IS_APU)) 1989 return false; 1990 1991 if (adev->mman.keep_stolen_vga_memory) 1992 return false; 1993 1994 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1995 } 1996 1997 /* 1998 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1999 * don't support dynamic speed switching. Until we have confirmation from Intel 2000 * that a specific host supports it, it's safer that we keep it disabled for all. 2001 * 2002 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 2003 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 2004 */ 2005 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 2006 { 2007 #if IS_ENABLED(CONFIG_X86) 2008 struct cpuinfo_x86 *c = &cpu_data(0); 2009 2010 /* eGPUs change speeds based on USB4 fabric conditions */ 2011 if (dev_is_removable(adev->dev)) 2012 return true; 2013 2014 if (c->x86_vendor == X86_VENDOR_INTEL) 2015 return false; 2016 #endif 2017 return true; 2018 } 2019 2020 static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev) 2021 { 2022 /* Enabling ASPM causes random hangs on Tahiti and Oland on Zen4.
2023 * It's unclear if this is a platform-specific or GPU-specific issue. 2024 * Disable ASPM on SI for the time being. 2025 */ 2026 if (adev->family == AMDGPU_FAMILY_SI) 2027 return true; 2028 2029 #if IS_ENABLED(CONFIG_X86) 2030 struct cpuinfo_x86 *c = &cpu_data(0); 2031 2032 if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) || 2033 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1))) 2034 return false; 2035 2036 if (c->x86 == 6 && 2037 adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) { 2038 switch (c->x86_model) { 2039 case VFM_MODEL(INTEL_ALDERLAKE): 2040 case VFM_MODEL(INTEL_ALDERLAKE_L): 2041 case VFM_MODEL(INTEL_RAPTORLAKE): 2042 case VFM_MODEL(INTEL_RAPTORLAKE_P): 2043 case VFM_MODEL(INTEL_RAPTORLAKE_S): 2044 return true; 2045 default: 2046 return false; 2047 } 2048 } else { 2049 return false; 2050 } 2051 #else 2052 return false; 2053 #endif 2054 } 2055 2056 /** 2057 * amdgpu_device_should_use_aspm - check if the device should program ASPM 2058 * 2059 * @adev: amdgpu_device pointer 2060 * 2061 * Confirm whether the module parameter and pcie bridge agree that ASPM should 2062 * be set for this device. 2063 * 2064 * Returns true if it should be used or false if not. 2065 */ 2066 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 2067 { 2068 switch (amdgpu_aspm) { 2069 case -1: 2070 break; 2071 case 0: 2072 return false; 2073 case 1: 2074 return true; 2075 default: 2076 return false; 2077 } 2078 if (adev->flags & AMD_IS_APU) 2079 return false; 2080 if (amdgpu_device_aspm_support_quirk(adev)) 2081 return false; 2082 return pcie_aspm_enabled(adev->pdev); 2083 } 2084 2085 /* if we get transitioned to only one device, take VGA back */ 2086 /** 2087 * amdgpu_device_vga_set_decode - enable/disable vga decode 2088 * 2089 * @pdev: PCI device pointer 2090 * @state: enable/disable vga decode 2091 * 2092 * Enable/disable vga decode (all asics). 2093 * Returns VGA resource flags. 2094 */ 2095 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 2096 bool state) 2097 { 2098 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 2099 2100 amdgpu_asic_set_vga_state(adev, state); 2101 if (state) 2102 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 2103 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 2104 else 2105 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 2106 } 2107 2108 /** 2109 * amdgpu_device_check_block_size - validate the vm block size 2110 * 2111 * @adev: amdgpu_device pointer 2112 * 2113 * Validates the vm block size specified via module parameter. 2114 * The vm block size defines number of bits in page table versus page directory, 2115 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 2116 * page table and the remaining bits are in the page directory. 2117 */ 2118 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 2119 { 2120 /* defines number of bits in page table versus page directory, 2121 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 2122 * page table and the remaining bits are in the page directory 2123 */ 2124 if (amdgpu_vm_block_size == -1) 2125 return; 2126 2127 if (amdgpu_vm_block_size < 9) { 2128 dev_warn(adev->dev, "VM page table size (%d) too small\n", 2129 amdgpu_vm_block_size); 2130 amdgpu_vm_block_size = -1; 2131 } 2132 } 2133 2134 /** 2135 * amdgpu_device_check_vm_size - validate the vm size 2136 * 2137 * @adev: amdgpu_device pointer 2138 * 2139 * Validates the vm size in GB specified via module parameter. 
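 * (Illustrative example: amdgpu.vm_size=16 requests a 16 GB GPU VM space,
 * while amdgpu.vm_size=0 is rejected as too small and reset to the
 * default of -1, i.e. automatic sizing.)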
2140 * The VM size is the size of the GPU virtual memory space in GB. 2141 */ 2142 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 2143 { 2144 /* no need to check the default value */ 2145 if (amdgpu_vm_size == -1) 2146 return; 2147 2148 if (amdgpu_vm_size < 1) { 2149 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 2150 amdgpu_vm_size); 2151 amdgpu_vm_size = -1; 2152 } 2153 } 2154 2155 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 2156 { 2157 struct sysinfo si; 2158 bool is_os_64 = (sizeof(void *) == 8); 2159 uint64_t total_memory; 2160 uint64_t dram_size_seven_GB = 0x1B8000000; 2161 uint64_t dram_size_three_GB = 0xB8000000; 2162 2163 if (amdgpu_smu_memory_pool_size == 0) 2164 return; 2165 2166 if (!is_os_64) { 2167 dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n"); 2168 goto def_value; 2169 } 2170 si_meminfo(&si); 2171 total_memory = (uint64_t)si.totalram * si.mem_unit; 2172 2173 if ((amdgpu_smu_memory_pool_size == 1) || 2174 (amdgpu_smu_memory_pool_size == 2)) { 2175 if (total_memory < dram_size_three_GB) 2176 goto def_value1; 2177 } else if ((amdgpu_smu_memory_pool_size == 4) || 2178 (amdgpu_smu_memory_pool_size == 8)) { 2179 if (total_memory < dram_size_seven_GB) 2180 goto def_value1; 2181 } else { 2182 dev_warn(adev->dev, "SMU memory pool size not supported\n"); 2183 goto def_value; 2184 } 2185 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2186 2187 return; 2188 2189 def_value1: 2190 dev_warn(adev->dev, "Not enough system memory\n"); 2191 def_value: 2192 adev->pm.smu_prv_buffer_size = 0; 2193 } 2194 2195 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2196 { 2197 if (!(adev->flags & AMD_IS_APU) || 2198 adev->asic_type < CHIP_RAVEN) 2199 return 0; 2200 2201 switch (adev->asic_type) { 2202 case CHIP_RAVEN: 2203 if (adev->pdev->device == 0x15dd) 2204 adev->apu_flags |= AMD_APU_IS_RAVEN; 2205 if (adev->pdev->device == 0x15d8) 2206 adev->apu_flags |= AMD_APU_IS_PICASSO; 2207 break; 2208 case CHIP_RENOIR: 2209 if ((adev->pdev->device == 0x1636) || 2210 (adev->pdev->device == 0x164c)) 2211 adev->apu_flags |= AMD_APU_IS_RENOIR; 2212 else 2213 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2214 break; 2215 case CHIP_VANGOGH: 2216 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2217 break; 2218 case CHIP_YELLOW_CARP: 2219 break; 2220 case CHIP_CYAN_SKILLFISH: 2221 if ((adev->pdev->device == 0x13FE) || 2222 (adev->pdev->device == 0x143F)) 2223 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2224 break; 2225 default: 2226 break; 2227 } 2228 2229 return 0; 2230 } 2231 2232 /** 2233 * amdgpu_device_check_arguments - validate module params 2234 * 2235 * @adev: amdgpu_device pointer 2236 * 2237 * Validates certain module parameters and updates 2238 * the associated values used by the driver (all asics).
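 *
 * Illustrative examples of the clamping below: amdgpu.sched_jobs=2 is
 * raised to the minimum of 4, amdgpu.sched_jobs=6 is rounded up to the
 * next power of two (8), and amdgpu.gart_size=16 is rejected as below
 * the 32M minimum and reset to -1 (auto).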
2239 */ 2240 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2241 { 2242 int i; 2243 2244 if (amdgpu_sched_jobs < 4) { 2245 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2246 amdgpu_sched_jobs); 2247 amdgpu_sched_jobs = 4; 2248 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2249 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2250 amdgpu_sched_jobs); 2251 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2252 } 2253 2254 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2255 /* gart size must be greater or equal to 32M */ 2256 dev_warn(adev->dev, "gart size (%d) too small\n", 2257 amdgpu_gart_size); 2258 amdgpu_gart_size = -1; 2259 } 2260 2261 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2262 /* gtt size must be greater or equal to 32M */ 2263 dev_warn(adev->dev, "gtt size (%d) too small\n", 2264 amdgpu_gtt_size); 2265 amdgpu_gtt_size = -1; 2266 } 2267 2268 /* valid range is between 4 and 9 inclusive */ 2269 if (amdgpu_vm_fragment_size != -1 && 2270 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2271 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2272 amdgpu_vm_fragment_size = -1; 2273 } 2274 2275 if (amdgpu_sched_hw_submission < 2) { 2276 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2277 amdgpu_sched_hw_submission); 2278 amdgpu_sched_hw_submission = 2; 2279 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2280 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2281 amdgpu_sched_hw_submission); 2282 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2283 } 2284 2285 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2286 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2287 amdgpu_reset_method = -1; 2288 } 2289 2290 amdgpu_device_check_smu_prv_buffer_size(adev); 2291 2292 amdgpu_device_check_vm_size(adev); 2293 2294 amdgpu_device_check_block_size(adev); 2295 2296 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2297 2298 for (i = 0; i < MAX_XCP; i++) { 2299 switch (amdgpu_enforce_isolation) { 2300 case -1: 2301 case 0: 2302 default: 2303 /* disable */ 2304 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; 2305 break; 2306 case 1: 2307 /* enable */ 2308 adev->enforce_isolation[i] = 2309 AMDGPU_ENFORCE_ISOLATION_ENABLE; 2310 break; 2311 case 2: 2312 /* enable legacy mode */ 2313 adev->enforce_isolation[i] = 2314 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; 2315 break; 2316 case 3: 2317 /* enable only process isolation without submitting cleaner shader */ 2318 adev->enforce_isolation[i] = 2319 AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER; 2320 break; 2321 } 2322 } 2323 2324 return 0; 2325 } 2326 2327 /** 2328 * amdgpu_switcheroo_set_state - set switcheroo state 2329 * 2330 * @pdev: pci dev pointer 2331 * @state: vga_switcheroo state 2332 * 2333 * Callback for the switcheroo driver. Suspends or resumes 2334 * the asics before or after it is powered up using ACPI methods. 
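 *
 * On VGA_SWITCHEROO_ON the device is put into D0, its cached PCI state
 * is restored and amdgpu_device_resume() is called; on the off path the
 * device is prepared and suspended, its PCI state is cached and the
 * device is placed into D3cold.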
2335 */ 2336 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2337 enum vga_switcheroo_state state) 2338 { 2339 struct drm_device *dev = pci_get_drvdata(pdev); 2340 int r; 2341 2342 if (amdgpu_device_supports_px(drm_to_adev(dev)) && 2343 state == VGA_SWITCHEROO_OFF) 2344 return; 2345 2346 if (state == VGA_SWITCHEROO_ON) { 2347 pr_info("switched on\n"); 2348 /* don't suspend or resume card normally */ 2349 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2350 2351 pci_set_power_state(pdev, PCI_D0); 2352 amdgpu_device_load_pci_state(pdev); 2353 r = pci_enable_device(pdev); 2354 if (r) 2355 dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n", 2356 r); 2357 amdgpu_device_resume(dev, true); 2358 2359 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2360 } else { 2361 dev_info(&pdev->dev, "switched off\n"); 2362 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2363 amdgpu_device_prepare(dev); 2364 amdgpu_device_suspend(dev, true); 2365 amdgpu_device_cache_pci_state(pdev); 2366 /* Shut down the device */ 2367 pci_disable_device(pdev); 2368 pci_set_power_state(pdev, PCI_D3cold); 2369 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2370 } 2371 } 2372 2373 /** 2374 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2375 * 2376 * @pdev: pci dev pointer 2377 * 2378 * Callback for the switcheroo driver. Checks if the switcheroo 2379 * state can be changed. 2380 * Returns true if the state can be changed, false if not. 2381 */ 2382 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2383 { 2384 struct drm_device *dev = pci_get_drvdata(pdev); 2385 2386 /* 2387 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2388 * locking inversion with the driver load path. And the access here is 2389 * completely racy anyway. So don't bother with locking for now. 2390 */ 2391 return atomic_read(&dev->open_count) == 0; 2392 } 2393 2394 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2395 .set_gpu_state = amdgpu_switcheroo_set_state, 2396 .reprobe = NULL, 2397 .can_switch = amdgpu_switcheroo_can_switch, 2398 }; 2399 2400 /** 2401 * amdgpu_device_enable_virtual_display - enable virtual display feature 2402 * 2403 * @adev: amdgpu_device pointer 2404 * 2405 * Enables the virtual display feature if the user has enabled it via 2406 * the module parameter virtual_display. This feature provides a virtual 2407 * display hardware on headless boards or in virtualized environments. 2408 * This function parses and validates the configuration string specified by 2409 * the user and configures the virtual display configuration (number of 2410 * virtual connectors, crtcs, etc.) specified.
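 *
 * Example (illustrative, matching the parsing below): booting with
 *
 *   amdgpu.virtual_display=0000:03:00.0,2
 *
 * enables virtual display with two CRTCs on the device at PCI address
 * 0000:03:00.0, while "all,1" enables one virtual CRTC on every device;
 * entries are separated by ';'. The CRTC count is clamped to the range
 * [1, 6] and defaults to 1 when omitted.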
2411 */ 2412 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2413 { 2414 adev->enable_virtual_display = false; 2415 2416 if (amdgpu_virtual_display) { 2417 const char *pci_address_name = pci_name(adev->pdev); 2418 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2419 2420 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2421 pciaddstr_tmp = pciaddstr; 2422 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2423 pciaddname = strsep(&pciaddname_tmp, ","); 2424 if (!strcmp("all", pciaddname) 2425 || !strcmp(pci_address_name, pciaddname)) { 2426 long num_crtc; 2427 int res = -1; 2428 2429 adev->enable_virtual_display = true; 2430 2431 if (pciaddname_tmp) 2432 res = kstrtol(pciaddname_tmp, 10, 2433 &num_crtc); 2434 2435 if (!res) { 2436 if (num_crtc < 1) 2437 num_crtc = 1; 2438 if (num_crtc > 6) 2439 num_crtc = 6; 2440 adev->mode_info.num_crtc = num_crtc; 2441 } else { 2442 adev->mode_info.num_crtc = 1; 2443 } 2444 break; 2445 } 2446 } 2447 2448 dev_info( 2449 adev->dev, 2450 "virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2451 amdgpu_virtual_display, pci_address_name, 2452 adev->enable_virtual_display, adev->mode_info.num_crtc); 2453 2454 kfree(pciaddstr); 2455 } 2456 } 2457 2458 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2459 { 2460 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2461 adev->mode_info.num_crtc = 1; 2462 adev->enable_virtual_display = true; 2463 dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n", 2464 adev->enable_virtual_display, 2465 adev->mode_info.num_crtc); 2466 } 2467 } 2468 2469 /** 2470 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2471 * 2472 * @adev: amdgpu_device pointer 2473 * 2474 * Parses the asic configuration parameters specified in the gpu info 2475 * firmware and makes them available to the driver for use in configuring 2476 * the asic. 2477 * Returns 0 on success, -EINVAL on failure. 
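 *
 * The firmware file is named "amdgpu/<chip>_gpu_info.bin", e.g.
 * "amdgpu/vega10_gpu_info.bin" for VEGA10; ASICs that carry an IP
 * discovery blob (adev->discovery.bin) generally skip this file.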
2478 */ 2479 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2480 { 2481 const char *chip_name; 2482 int err; 2483 const struct gpu_info_firmware_header_v1_0 *hdr; 2484 2485 adev->firmware.gpu_info_fw = NULL; 2486 2487 switch (adev->asic_type) { 2488 default: 2489 return 0; 2490 case CHIP_VEGA10: 2491 chip_name = "vega10"; 2492 break; 2493 case CHIP_VEGA12: 2494 chip_name = "vega12"; 2495 break; 2496 case CHIP_RAVEN: 2497 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2498 chip_name = "raven2"; 2499 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2500 chip_name = "picasso"; 2501 else 2502 chip_name = "raven"; 2503 break; 2504 case CHIP_ARCTURUS: 2505 chip_name = "arcturus"; 2506 break; 2507 case CHIP_NAVI12: 2508 if (adev->discovery.bin) 2509 return 0; 2510 chip_name = "navi12"; 2511 break; 2512 case CHIP_CYAN_SKILLFISH: 2513 if (adev->discovery.bin) 2514 return 0; 2515 chip_name = "cyan_skillfish"; 2516 break; 2517 } 2518 2519 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2520 AMDGPU_UCODE_OPTIONAL, 2521 "amdgpu/%s_gpu_info.bin", chip_name); 2522 if (err) { 2523 dev_err(adev->dev, 2524 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2525 chip_name); 2526 goto out; 2527 } 2528 2529 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2530 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2531 2532 switch (hdr->version_major) { 2533 case 1: 2534 { 2535 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2536 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2537 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2538 2539 /* 2540 * Should be dropped when DAL no longer needs it. 2541 */ 2542 if (adev->asic_type == CHIP_NAVI12) 2543 goto parse_soc_bounding_box; 2544 2545 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2546 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2547 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2548 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2549 adev->gfx.config.max_texture_channel_caches = 2550 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2551 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2552 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2553 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2554 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2555 adev->gfx.config.double_offchip_lds_buf = 2556 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2557 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2558 adev->gfx.cu_info.max_waves_per_simd = 2559 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2560 adev->gfx.cu_info.max_scratch_slots_per_cu = 2561 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2562 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2563 if (hdr->version_minor >= 1) { 2564 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2565 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2566 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2567 adev->gfx.config.num_sc_per_sh = 2568 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2569 adev->gfx.config.num_packer_per_sc = 2570 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2571 } 2572 2573 parse_soc_bounding_box: 2574 /* 2575 * soc bounding box info is not integrated in the discovery table, 2576 * we
always need to parse it from the gpu info firmware. 2577 */ 2578 if (hdr->version_minor == 2) { 2579 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2580 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2581 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2582 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2583 } 2584 break; 2585 } 2586 default: 2587 dev_err(adev->dev, 2588 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2589 err = -EINVAL; 2590 goto out; 2591 } 2592 out: 2593 return err; 2594 } 2595 2596 static void amdgpu_uid_init(struct amdgpu_device *adev) 2597 { 2598 /* Initialize the UID for the device */ 2599 adev->uid_info = kzalloc(sizeof(struct amdgpu_uid), GFP_KERNEL); 2600 if (!adev->uid_info) { 2601 dev_warn(adev->dev, "Failed to allocate memory for UID\n"); 2602 return; 2603 } 2604 adev->uid_info->adev = adev; 2605 } 2606 2607 static void amdgpu_uid_fini(struct amdgpu_device *adev) 2608 { 2609 /* Free the UID memory */ 2610 kfree(adev->uid_info); 2611 adev->uid_info = NULL; 2612 } 2613 2614 /** 2615 * amdgpu_device_ip_early_init - run early init for hardware IPs 2616 * 2617 * @adev: amdgpu_device pointer 2618 * 2619 * Early initialization pass for hardware IPs. The hardware IPs that make 2620 * up each asic are discovered and each IP's early_init callback is run. This 2621 * is the first stage in initializing the asic. 2622 * Returns 0 on success, negative error code on failure. 2623 */ 2624 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2625 { 2626 struct amdgpu_ip_block *ip_block; 2627 struct pci_dev *parent; 2628 bool total, skip_bios; 2629 uint32_t bios_flags; 2630 int i, r; 2631 2632 amdgpu_device_enable_virtual_display(adev); 2633 2634 if (amdgpu_sriov_vf(adev)) { 2635 r = amdgpu_virt_request_full_gpu(adev, true); 2636 if (r) 2637 return r; 2638 2639 r = amdgpu_virt_init_critical_region(adev); 2640 if (r) 2641 return r; 2642 } 2643 2644 switch (adev->asic_type) { 2645 #ifdef CONFIG_DRM_AMDGPU_SI 2646 case CHIP_VERDE: 2647 case CHIP_TAHITI: 2648 case CHIP_PITCAIRN: 2649 case CHIP_OLAND: 2650 case CHIP_HAINAN: 2651 adev->family = AMDGPU_FAMILY_SI; 2652 r = si_set_ip_blocks(adev); 2653 if (r) 2654 return r; 2655 break; 2656 #endif 2657 #ifdef CONFIG_DRM_AMDGPU_CIK 2658 case CHIP_BONAIRE: 2659 case CHIP_HAWAII: 2660 case CHIP_KAVERI: 2661 case CHIP_KABINI: 2662 case CHIP_MULLINS: 2663 if (adev->flags & AMD_IS_APU) 2664 adev->family = AMDGPU_FAMILY_KV; 2665 else 2666 adev->family = AMDGPU_FAMILY_CI; 2667 2668 r = cik_set_ip_blocks(adev); 2669 if (r) 2670 return r; 2671 break; 2672 #endif 2673 case CHIP_TOPAZ: 2674 case CHIP_TONGA: 2675 case CHIP_FIJI: 2676 case CHIP_POLARIS10: 2677 case CHIP_POLARIS11: 2678 case CHIP_POLARIS12: 2679 case CHIP_VEGAM: 2680 case CHIP_CARRIZO: 2681 case CHIP_STONEY: 2682 if (adev->flags & AMD_IS_APU) 2683 adev->family = AMDGPU_FAMILY_CZ; 2684 else 2685 adev->family = AMDGPU_FAMILY_VI; 2686 2687 r = vi_set_ip_blocks(adev); 2688 if (r) 2689 return r; 2690 break; 2691 default: 2692 r = amdgpu_discovery_set_ip_blocks(adev); 2693 if (r) 2694 return r; 2695 break; 2696 } 2697 2698 /* Check for IP version 9.4.3 with A0 hardware */ 2699 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2700 !amdgpu_device_get_rev_id(adev)) { 2701 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2702 return -ENODEV; /* device unsupported - no device error */ 2703 } 2704 2705 if (amdgpu_has_atpx() && 2706 (amdgpu_is_atpx_hybrid() || 2707 amdgpu_has_atpx_dgpu_power_cntl()) &&
2708 ((adev->flags & AMD_IS_APU) == 0) && 2709 !dev_is_removable(&adev->pdev->dev)) 2710 adev->flags |= AMD_IS_PX; 2711 2712 if (!(adev->flags & AMD_IS_APU)) { 2713 parent = pcie_find_root_port(adev->pdev); 2714 adev->has_pr3 = parent ? pci_pr3_present(parent) : false; 2715 } 2716 2717 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2718 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2719 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2720 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2721 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2722 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2723 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2724 2725 adev->virt.is_xgmi_node_migrate_enabled = false; 2726 if (amdgpu_sriov_vf(adev)) { 2727 adev->virt.is_xgmi_node_migrate_enabled = 2728 amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4); 2729 } 2730 2731 total = true; 2732 for (i = 0; i < adev->num_ip_blocks; i++) { 2733 ip_block = &adev->ip_blocks[i]; 2734 2735 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2736 dev_warn(adev->dev, "disabled ip block: %d <%s>\n", i, 2737 adev->ip_blocks[i].version->funcs->name); 2738 adev->ip_blocks[i].status.valid = false; 2739 } else if (ip_block->version->funcs->early_init) { 2740 r = ip_block->version->funcs->early_init(ip_block); 2741 if (r == -ENOENT) { 2742 adev->ip_blocks[i].status.valid = false; 2743 } else if (r) { 2744 dev_err(adev->dev, 2745 "early_init of IP block <%s> failed %d\n", 2746 adev->ip_blocks[i].version->funcs->name, 2747 r); 2748 total = false; 2749 } else { 2750 adev->ip_blocks[i].status.valid = true; 2751 } 2752 } else { 2753 adev->ip_blocks[i].status.valid = true; 2754 } 2755 /* get the vbios after the asic_funcs are set up */ 2756 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2757 r = amdgpu_device_parse_gpu_info_fw(adev); 2758 if (r) 2759 return r; 2760 2761 bios_flags = amdgpu_device_get_vbios_flags(adev); 2762 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2763 /* Read BIOS */ 2764 if (!skip_bios) { 2765 bool optional = 2766 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2767 if (!amdgpu_get_bios(adev) && !optional) 2768 return -EINVAL; 2769 2770 if (optional && !adev->bios) 2771 dev_info( 2772 adev->dev, 2773 "VBIOS image optional, proceeding without VBIOS image"); 2774 2775 if (adev->bios) { 2776 r = amdgpu_atombios_init(adev); 2777 if (r) { 2778 dev_err(adev->dev, 2779 "amdgpu_atombios_init failed\n"); 2780 amdgpu_vf_error_put( 2781 adev, 2782 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2783 0, 0); 2784 return r; 2785 } 2786 } 2787 } 2788 2789 /*get pf2vf msg info at it's earliest time*/ 2790 if (amdgpu_sriov_vf(adev)) 2791 amdgpu_virt_init_data_exchange(adev); 2792 2793 } 2794 } 2795 if (!total) 2796 return -ENODEV; 2797 2798 if (adev->gmc.xgmi.supported) 2799 amdgpu_xgmi_early_init(adev); 2800 2801 if (amdgpu_is_multi_aid(adev)) 2802 amdgpu_uid_init(adev); 2803 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2804 if (ip_block->status.valid != false) 2805 amdgpu_amdkfd_device_probe(adev); 2806 2807 adev->cg_flags &= amdgpu_cg_mask; 2808 adev->pg_flags &= amdgpu_pg_mask; 2809 2810 return 0; 2811 } 2812 2813 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2814 { 2815 int i, r; 2816 2817 for (i = 0; i < adev->num_ip_blocks; i++) { 2818 if (!adev->ip_blocks[i].status.sw) 2819 continue; 2820 if (adev->ip_blocks[i].status.hw) 2821 continue; 2822 if (!amdgpu_ip_member_of_hwini( 2823 adev, adev->ip_blocks[i].version->type)) 2824 
continue; 2825 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2826 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2827 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2828 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2829 if (r) { 2830 dev_err(adev->dev, 2831 "hw_init of IP block <%s> failed %d\n", 2832 adev->ip_blocks[i].version->funcs->name, 2833 r); 2834 return r; 2835 } 2836 adev->ip_blocks[i].status.hw = true; 2837 } 2838 } 2839 2840 return 0; 2841 } 2842 2843 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2844 { 2845 int i, r; 2846 2847 for (i = 0; i < adev->num_ip_blocks; i++) { 2848 if (!adev->ip_blocks[i].status.sw) 2849 continue; 2850 if (adev->ip_blocks[i].status.hw) 2851 continue; 2852 if (!amdgpu_ip_member_of_hwini( 2853 adev, adev->ip_blocks[i].version->type)) 2854 continue; 2855 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2856 if (r) { 2857 dev_err(adev->dev, 2858 "hw_init of IP block <%s> failed %d\n", 2859 adev->ip_blocks[i].version->funcs->name, r); 2860 return r; 2861 } 2862 adev->ip_blocks[i].status.hw = true; 2863 } 2864 2865 return 0; 2866 } 2867 2868 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2869 { 2870 int r = 0; 2871 int i; 2872 uint32_t smu_version; 2873 2874 if (adev->asic_type >= CHIP_VEGA10) { 2875 for (i = 0; i < adev->num_ip_blocks; i++) { 2876 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2877 continue; 2878 2879 if (!amdgpu_ip_member_of_hwini(adev, 2880 AMD_IP_BLOCK_TYPE_PSP)) 2881 break; 2882 2883 if (!adev->ip_blocks[i].status.sw) 2884 continue; 2885 2886 /* no need to do the fw loading again if already done*/ 2887 if (adev->ip_blocks[i].status.hw == true) 2888 break; 2889 2890 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2891 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2892 if (r) 2893 return r; 2894 } else { 2895 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2896 if (r) { 2897 dev_err(adev->dev, 2898 "hw_init of IP block <%s> failed %d\n", 2899 adev->ip_blocks[i] 2900 .version->funcs->name, 2901 r); 2902 return r; 2903 } 2904 adev->ip_blocks[i].status.hw = true; 2905 } 2906 break; 2907 } 2908 } 2909 2910 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2911 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2912 2913 return r; 2914 } 2915 2916 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2917 { 2918 struct drm_sched_init_args args = { 2919 .ops = &amdgpu_sched_ops, 2920 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 2921 .timeout_wq = adev->reset_domain->wq, 2922 .dev = adev->dev, 2923 }; 2924 long timeout; 2925 int r, i; 2926 2927 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2928 struct amdgpu_ring *ring = adev->rings[i]; 2929 2930 /* No need to setup the GPU scheduler for rings that don't need it */ 2931 if (!ring || ring->no_scheduler) 2932 continue; 2933 2934 switch (ring->funcs->type) { 2935 case AMDGPU_RING_TYPE_GFX: 2936 timeout = adev->gfx_timeout; 2937 break; 2938 case AMDGPU_RING_TYPE_COMPUTE: 2939 timeout = adev->compute_timeout; 2940 break; 2941 case AMDGPU_RING_TYPE_SDMA: 2942 timeout = adev->sdma_timeout; 2943 break; 2944 default: 2945 timeout = adev->video_timeout; 2946 break; 2947 } 2948 2949 args.timeout = timeout; 2950 args.credit_limit = ring->num_hw_submission; 2951 args.score = ring->sched_score; 2952 args.name = ring->name; 2953 2954 r = drm_sched_init(&ring->sched, &args); 2955 if (r) { 2956 
dev_err(adev->dev, 2957 "Failed to create scheduler on ring %s.\n", 2958 ring->name); 2959 return r; 2960 } 2961 r = amdgpu_uvd_entity_init(adev, ring); 2962 if (r) { 2963 dev_err(adev->dev, 2964 "Failed to create UVD scheduling entity on ring %s.\n", 2965 ring->name); 2966 return r; 2967 } 2968 r = amdgpu_vce_entity_init(adev, ring); 2969 if (r) { 2970 dev_err(adev->dev, 2971 "Failed to create VCE scheduling entity on ring %s.\n", 2972 ring->name); 2973 return r; 2974 } 2975 } 2976 2977 if (adev->xcp_mgr) 2978 amdgpu_xcp_update_partition_sched_list(adev); 2979 2980 return 0; 2981 } 2982 2983 2984 /** 2985 * amdgpu_device_ip_init - run init for hardware IPs 2986 * 2987 * @adev: amdgpu_device pointer 2988 * 2989 * Main initialization pass for hardware IPs. The list of all the hardware 2990 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2991 * are run. sw_init initializes the software state associated with each IP 2992 * and hw_init initializes the hardware associated with each IP. 2993 * Returns 0 on success, negative error code on failure. 2994 */ 2995 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2996 { 2997 bool init_badpage; 2998 int i, r; 2999 3000 r = amdgpu_ras_init(adev); 3001 if (r) 3002 return r; 3003 3004 for (i = 0; i < adev->num_ip_blocks; i++) { 3005 if (!adev->ip_blocks[i].status.valid) 3006 continue; 3007 if (adev->ip_blocks[i].version->funcs->sw_init) { 3008 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 3009 if (r) { 3010 dev_err(adev->dev, 3011 "sw_init of IP block <%s> failed %d\n", 3012 adev->ip_blocks[i].version->funcs->name, 3013 r); 3014 goto init_failed; 3015 } 3016 } 3017 adev->ip_blocks[i].status.sw = true; 3018 3019 if (!amdgpu_ip_member_of_hwini( 3020 adev, adev->ip_blocks[i].version->type)) 3021 continue; 3022 3023 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 3024 /* need to do common hw init early so everything is set up for gmc */ 3025 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3026 if (r) { 3027 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3028 r); 3029 goto init_failed; 3030 } 3031 adev->ip_blocks[i].status.hw = true; 3032 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3033 /* need to do gmc hw init early so we can allocate gpu mem */ 3034 /* Try to reserve bad pages early */ 3035 if (amdgpu_sriov_vf(adev)) 3036 amdgpu_virt_exchange_data(adev); 3037 3038 r = amdgpu_device_mem_scratch_init(adev); 3039 if (r) { 3040 dev_err(adev->dev, 3041 "amdgpu_mem_scratch_init failed %d\n", 3042 r); 3043 goto init_failed; 3044 } 3045 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3046 if (r) { 3047 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3048 r); 3049 goto init_failed; 3050 } 3051 r = amdgpu_device_wb_init(adev); 3052 if (r) { 3053 dev_err(adev->dev, 3054 "amdgpu_device_wb_init failed %d\n", r); 3055 goto init_failed; 3056 } 3057 adev->ip_blocks[i].status.hw = true; 3058 3059 /* right after GMC hw init, we create CSA */ 3060 if (adev->gfx.mcbp) { 3061 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3062 AMDGPU_GEM_DOMAIN_VRAM | 3063 AMDGPU_GEM_DOMAIN_GTT, 3064 AMDGPU_CSA_SIZE); 3065 if (r) { 3066 dev_err(adev->dev, 3067 "allocate CSA failed %d\n", r); 3068 goto init_failed; 3069 } 3070 } 3071 3072 r = amdgpu_seq64_init(adev); 3073 if (r) { 3074 dev_err(adev->dev, "allocate seq64 failed %d\n", 3075 r); 3076 goto init_failed; 3077 } 3078 } 3079 } 3080 3081 if (amdgpu_sriov_vf(adev)) 3082 
amdgpu_virt_init_data_exchange(adev); 3083 3084 r = amdgpu_ib_pool_init(adev); 3085 if (r) { 3086 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3087 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3088 goto init_failed; 3089 } 3090 3091 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete */ 3092 if (r) 3093 goto init_failed; 3094 3095 r = amdgpu_device_ip_hw_init_phase1(adev); 3096 if (r) 3097 goto init_failed; 3098 3099 r = amdgpu_device_fw_loading(adev); 3100 if (r) 3101 goto init_failed; 3102 3103 r = amdgpu_device_ip_hw_init_phase2(adev); 3104 if (r) 3105 goto init_failed; 3106 3107 /* 3108 * retired pages will be loaded from eeprom and reserved here; 3109 * this should be called after amdgpu_device_ip_hw_init_phase2, since 3110 * for some ASICs the RAS EEPROM code relies on the SMU being fully 3111 * functional for I2C communication, which is only true at this point. 3112 * 3113 * amdgpu_ras_recovery_init may fail, but the caller only cares about 3114 * failures caused by a bad gpu state and stops the amdgpu init process 3115 * accordingly. For other failure cases, it still releases all 3116 * the resources and prints an error message, rather than returning a 3117 * negative value to the upper level. 3118 * 3119 * Note: theoretically, this should be called before all vram allocations 3120 * to protect retired pages from being abused. 3121 */ 3122 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3123 r = amdgpu_ras_recovery_init(adev, init_badpage); 3124 if (r) 3125 goto init_failed; 3126 3127 /* 3128 * In case of XGMI, grab an extra reference on the reset domain for this device 3129 */ 3130 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3131 if (amdgpu_xgmi_add_device(adev) == 0) { 3132 if (!amdgpu_sriov_vf(adev)) { 3133 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3134 3135 if (WARN_ON(!hive)) { 3136 r = -ENOENT; 3137 goto init_failed; 3138 } 3139 3140 if (!hive->reset_domain || 3141 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3142 r = -ENOENT; 3143 amdgpu_put_xgmi_hive(hive); 3144 goto init_failed; 3145 } 3146 3147 /* Drop the early temporary reset domain we created for device */ 3148 amdgpu_reset_put_reset_domain(adev->reset_domain); 3149 adev->reset_domain = hive->reset_domain; 3150 amdgpu_put_xgmi_hive(hive); 3151 } 3152 } 3153 } 3154 3155 r = amdgpu_device_init_schedulers(adev); 3156 if (r) 3157 goto init_failed; 3158 3159 if (adev->mman.buffer_funcs_ring && 3160 adev->mman.buffer_funcs_ring->sched.ready) 3161 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3162 3163 /* Don't init kfd if the whole hive needs to be reset during init */ 3164 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3165 amdgpu_amdkfd_device_init(adev); 3166 } 3167 3168 amdgpu_fru_get_product_info(adev); 3169 3170 r = amdgpu_cper_init(adev); 3171 3172 init_failed: 3173 3174 return r; 3175 } 3176 3177 /** 3178 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3179 * 3180 * @adev: amdgpu_device pointer 3181 * 3182 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3183 * this function before a GPU reset. If the value is retained after a 3184 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
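 *
 * Conceptual pairing (illustrative sketch, not a real call site; the
 * reset step is hypothetical):
 *
 *   amdgpu_device_fill_reset_magic(adev);             // before the reset
 *   ...                                               // perform the reset
 *   vram_lost = amdgpu_device_check_vram_lost(adev);  // after the reset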
3185 */ 3186 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3187 { 3188 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3189 } 3190 3191 /** 3192 * amdgpu_device_check_vram_lost - check if vram is valid 3193 * 3194 * @adev: amdgpu_device pointer 3195 * 3196 * Checks the reset magic value written to the gart pointer in VRAM. 3197 * The driver calls this after a GPU reset to see if the contents of 3198 * VRAM are lost or not. 3199 * Returns true if vram is lost, false if not. 3200 */ 3201 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3202 { 3203 if (memcmp(adev->gart.ptr, adev->reset_magic, 3204 AMDGPU_RESET_MAGIC_NUM)) 3205 return true; 3206 3207 if (!amdgpu_in_reset(adev)) 3208 return false; 3209 3210 /* 3211 * For all ASICs with baco/mode1 reset, the VRAM is 3212 * always assumed to be lost. 3213 */ 3214 switch (amdgpu_asic_reset_method(adev)) { 3215 case AMD_RESET_METHOD_LEGACY: 3216 case AMD_RESET_METHOD_LINK: 3217 case AMD_RESET_METHOD_BACO: 3218 case AMD_RESET_METHOD_MODE1: 3219 return true; 3220 default: 3221 return false; 3222 } 3223 } 3224 3225 /** 3226 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3227 * 3228 * @adev: amdgpu_device pointer 3229 * @state: clockgating state (gate or ungate) 3230 * 3231 * The list of all the hardware IPs that make up the asic is walked and the 3232 * set_clockgating_state callbacks are run. 3233 * On late init this pass enables clockgating for the hardware IPs. 3234 * On fini or suspend it disables clockgating for the hardware IPs. 3235 * Returns 0 on success, negative error code on failure. 3236 */ 3237 3238 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3239 enum amd_clockgating_state state) 3240 { 3241 int i, j, r; 3242 3243 if (amdgpu_emu_mode == 1) 3244 return 0; 3245 3246 for (j = 0; j < adev->num_ip_blocks; j++) { 3247 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3248 if (!adev->ip_blocks[i].status.late_initialized) 3249 continue; 3250 /* skip CG for GFX, SDMA on S0ix */ 3251 if (adev->in_s0ix && 3252 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3253 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3254 continue; 3255 /* skip CG for VCE/UVD, it's handled specially */ 3256 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3257 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3258 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3259 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3260 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3261 /* enable clockgating to save power */ 3262 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3263 state); 3264 if (r) { 3265 dev_err(adev->dev, 3266 "set_clockgating_state(gate) of IP block <%s> failed %d\n", 3267 adev->ip_blocks[i].version->funcs->name, 3268 r); 3269 return r; 3270 } 3271 } 3272 } 3273 3274 return 0; 3275 } 3276 3277 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3278 enum amd_powergating_state state) 3279 { 3280 int i, j, r; 3281 3282 if (amdgpu_emu_mode == 1) 3283 return 0; 3284 3285 for (j = 0; j < adev->num_ip_blocks; j++) { 3286 i = state == AMD_PG_STATE_GATE ?
j : adev->num_ip_blocks - j - 1; 3287 if (!adev->ip_blocks[i].status.late_initialized) 3288 continue; 3289 /* skip PG for GFX, SDMA on S0ix */ 3290 if (adev->in_s0ix && 3291 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3292 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3293 continue; 3294 /* skip PG for VCE/UVD, it's handled specially */ 3295 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3296 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3297 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3298 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3299 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3300 /* enable powergating to save power */ 3301 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], 3302 state); 3303 if (r) { 3304 dev_err(adev->dev, 3305 "set_powergating_state(gate) of IP block <%s> failed %d\n", 3306 adev->ip_blocks[i].version->funcs->name, 3307 r); 3308 return r; 3309 } 3310 } 3311 } 3312 return 0; 3313 } 3314 3315 static int amdgpu_device_enable_mgpu_fan_boost(void) 3316 { 3317 struct amdgpu_gpu_instance *gpu_ins; 3318 struct amdgpu_device *adev; 3319 int i, ret = 0; 3320 3321 mutex_lock(&mgpu_info.mutex); 3322 3323 /* 3324 * MGPU fan boost feature should be enabled 3325 * only when there are two or more dGPUs in 3326 * the system 3327 */ 3328 if (mgpu_info.num_dgpu < 2) 3329 goto out; 3330 3331 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3332 gpu_ins = &(mgpu_info.gpu_ins[i]); 3333 adev = gpu_ins->adev; 3334 if (!(adev->flags & AMD_IS_APU || amdgpu_sriov_multi_vf_mode(adev)) && 3335 !gpu_ins->mgpu_fan_enabled) { 3336 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3337 if (ret) 3338 break; 3339 3340 gpu_ins->mgpu_fan_enabled = 1; 3341 } 3342 } 3343 3344 out: 3345 mutex_unlock(&mgpu_info.mutex); 3346 3347 return ret; 3348 } 3349 3350 /** 3351 * amdgpu_device_ip_late_init - run late init for hardware IPs 3352 * 3353 * @adev: amdgpu_device pointer 3354 * 3355 * Late initialization pass for hardware IPs. The list of all the hardware 3356 * IPs that make up the asic is walked and the late_init callbacks are run. 3357 * late_init covers any special initialization that an IP requires 3358 * after all of them have been initialized, or anything that needs to happen 3359 * late in the init process. 3360 * Returns 0 on success, negative error code on failure.
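 *
 * Besides the per-IP late_init callbacks, this pass also enables
 * clockgating and powergating, records the reset magic, and performs
 * one-off setup such as the mGPU fan boost and the initial XGMI p-state.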
3361 */ 3362 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3363 { 3364 struct amdgpu_gpu_instance *gpu_instance; 3365 int i = 0, r; 3366 3367 for (i = 0; i < adev->num_ip_blocks; i++) { 3368 if (!adev->ip_blocks[i].status.hw) 3369 continue; 3370 if (adev->ip_blocks[i].version->funcs->late_init) { 3371 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3372 if (r) { 3373 dev_err(adev->dev, 3374 "late_init of IP block <%s> failed %d\n", 3375 adev->ip_blocks[i].version->funcs->name, 3376 r); 3377 return r; 3378 } 3379 } 3380 adev->ip_blocks[i].status.late_initialized = true; 3381 } 3382 3383 r = amdgpu_ras_late_init(adev); 3384 if (r) { 3385 dev_err(adev->dev, "amdgpu_ras_late_init failed %d", r); 3386 return r; 3387 } 3388 3389 if (!amdgpu_reset_in_recovery(adev)) 3390 amdgpu_ras_set_error_query_ready(adev, true); 3391 3392 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3393 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3394 3395 amdgpu_device_fill_reset_magic(adev); 3396 3397 r = amdgpu_device_enable_mgpu_fan_boost(); 3398 if (r) 3399 dev_err(adev->dev, "enable mgpu fan boost failed (%d).\n", r); 3400 3401 /* For passthrough configurations on arcturus and aldebaran, enable special SBR handling */ 3402 if (amdgpu_passthrough(adev) && 3403 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3404 adev->asic_type == CHIP_ALDEBARAN)) 3405 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3406 3407 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3408 mutex_lock(&mgpu_info.mutex); 3409 3410 /* 3411 * Reset the device p-state to low, as it was booted with high. 3412 * 3413 * This should be performed only after all devices from the same 3414 * hive have been initialized. 3415 * 3416 * However, the number of devices in a hive is not known in advance; 3417 * it is counted one by one as the devices are initialized. 3418 * 3419 * So we wait until all XGMI interlinked devices are initialized. 3420 * This may introduce some delay, as those devices may come from 3421 * different hives. But that should be OK.
3422 */ 3423 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3424 for (i = 0; i < mgpu_info.num_gpu; i++) { 3425 gpu_instance = &(mgpu_info.gpu_ins[i]); 3426 if (gpu_instance->adev->flags & AMD_IS_APU) 3427 continue; 3428 3429 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3430 AMDGPU_XGMI_PSTATE_MIN); 3431 if (r) { 3432 dev_err(adev->dev, 3433 "pstate setting failed (%d).\n", 3434 r); 3435 break; 3436 } 3437 } 3438 } 3439 3440 mutex_unlock(&mgpu_info.mutex); 3441 } 3442 3443 return 0; 3444 } 3445 3446 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3447 { 3448 struct amdgpu_device *adev = ip_block->adev; 3449 int r; 3450 3451 if (!ip_block->version->funcs->hw_fini) { 3452 dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n", 3453 ip_block->version->funcs->name); 3454 } else { 3455 r = ip_block->version->funcs->hw_fini(ip_block); 3456 /* XXX handle errors */ 3457 if (r) { 3458 dev_dbg(adev->dev, 3459 "hw_fini of IP block <%s> failed %d\n", 3460 ip_block->version->funcs->name, r); 3461 } 3462 } 3463 3464 ip_block->status.hw = false; 3465 } 3466 3467 /** 3468 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3469 * 3470 * @adev: amdgpu_device pointer 3471 * 3472 * For ASICs that need to disable the SMC first 3473 */ 3474 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3475 { 3476 int i; 3477 3478 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3479 return; 3480 3481 for (i = 0; i < adev->num_ip_blocks; i++) { 3482 if (!adev->ip_blocks[i].status.hw) 3483 continue; 3484 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3485 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3486 break; 3487 } 3488 } 3489 } 3490 3491 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3492 { 3493 int i, r; 3494 3495 for (i = 0; i < adev->num_ip_blocks; i++) { 3496 if (!adev->ip_blocks[i].version->funcs->early_fini) 3497 continue; 3498 3499 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3500 if (r) { 3501 dev_dbg(adev->dev, 3502 "early_fini of IP block <%s> failed %d\n", 3503 adev->ip_blocks[i].version->funcs->name, r); 3504 } 3505 } 3506 3507 amdgpu_amdkfd_suspend(adev, true); 3508 amdgpu_amdkfd_teardown_processes(adev); 3509 amdgpu_userq_suspend(adev); 3510 3511 /* Workaround for ASICs that need to disable the SMC first */ 3512 amdgpu_device_smu_fini_early(adev); 3513 3514 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3515 if (!adev->ip_blocks[i].status.hw) 3516 continue; 3517 3518 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3519 } 3520 3521 if (amdgpu_sriov_vf(adev)) { 3522 if (amdgpu_virt_release_full_gpu(adev, false)) 3523 dev_err(adev->dev, 3524 "failed to release exclusive mode on fini\n"); 3525 } 3526 3527 /* 3528 * Driver reload on the APU can fail due to firmware validation because 3529 * the PSP is always running, as it is shared across the whole SoC. 3530 * This same issue does not occur on dGPU because it has a mechanism 3531 * that checks whether the PSP is running. A solution for those issues 3532 * in the APU is to trigger a GPU reset, but this should be done during 3533 * the unload phase to avoid adding boot latency and screen flicker.
3534 */ 3535 if ((adev->flags & AMD_IS_APU) && !adev->gmc.is_app_apu) { 3536 r = amdgpu_asic_reset(adev); 3537 if (r) 3538 dev_err(adev->dev, "asic reset on %s failed\n", __func__); 3539 } 3540 3541 return 0; 3542 } 3543 3544 /** 3545 * amdgpu_device_ip_fini - run fini for hardware IPs 3546 * 3547 * @adev: amdgpu_device pointer 3548 * 3549 * Main teardown pass for hardware IPs. The list of all the hardware 3550 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3551 * are run. hw_fini tears down the hardware associated with each IP 3552 * and sw_fini tears down any software state associated with each IP. 3553 * Returns 0 on success, negative error code on failure. 3554 */ 3555 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3556 { 3557 int i, r; 3558 3559 amdgpu_cper_fini(adev); 3560 3561 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3562 amdgpu_virt_release_ras_err_handler_data(adev); 3563 3564 if (adev->gmc.xgmi.num_physical_nodes > 1) 3565 amdgpu_xgmi_remove_device(adev); 3566 3567 amdgpu_amdkfd_device_fini_sw(adev); 3568 3569 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3570 if (!adev->ip_blocks[i].status.sw) 3571 continue; 3572 3573 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3574 amdgpu_ucode_free_bo(adev); 3575 amdgpu_free_static_csa(&adev->virt.csa_obj); 3576 amdgpu_device_wb_fini(adev); 3577 amdgpu_device_mem_scratch_fini(adev); 3578 amdgpu_ib_pool_fini(adev); 3579 amdgpu_seq64_fini(adev); 3580 amdgpu_doorbell_fini(adev); 3581 } 3582 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3583 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3584 /* XXX handle errors */ 3585 if (r) { 3586 dev_dbg(adev->dev, 3587 "sw_fini of IP block <%s> failed %d\n", 3588 adev->ip_blocks[i].version->funcs->name, 3589 r); 3590 } 3591 } 3592 adev->ip_blocks[i].status.sw = false; 3593 adev->ip_blocks[i].status.valid = false; 3594 } 3595 3596 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3597 if (!adev->ip_blocks[i].status.late_initialized) 3598 continue; 3599 if (adev->ip_blocks[i].version->funcs->late_fini) 3600 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3601 adev->ip_blocks[i].status.late_initialized = false; 3602 } 3603 3604 amdgpu_ras_fini(adev); 3605 amdgpu_uid_fini(adev); 3606 3607 return 0; 3608 } 3609 3610 /** 3611 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3612 * 3613 * @work: work_struct. 3614 */ 3615 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3616 { 3617 struct amdgpu_device *adev = 3618 container_of(work, struct amdgpu_device, delayed_init_work.work); 3619 int r; 3620 3621 r = amdgpu_ib_ring_tests(adev); 3622 if (r) 3623 dev_err(adev->dev, "ib ring test failed (%d).\n", r); 3624 } 3625 3626 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3627 { 3628 struct amdgpu_device *adev = 3629 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3630 3631 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3632 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3633 3634 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3635 adev->gfx.gfx_off_state = true; 3636 } 3637 3638 /** 3639 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3640 * 3641 * @adev: amdgpu_device pointer 3642 * 3643 * Main suspend function for hardware IPs. 
The list of all the hardware 3644 * IPs that make up the asic is walked, clockgating is disabled and the 3645 * suspend callbacks are run. suspend puts the hardware and software state 3646 * in each IP into a state suitable for suspend. 3647 * Returns 0 on success, negative error code on failure. 3648 */ 3649 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3650 { 3651 int i, r, rec; 3652 3653 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3654 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3655 3656 /* 3657 * Per the PMFW team's suggestion, the driver needs to handle disabling 3658 * the gfxoff and df cstate features for gpu reset (e.g. Mode1Reset) 3659 * scenarios. Add the missing df cstate disablement here. 3660 */ 3661 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3662 dev_warn(adev->dev, "Failed to disallow df cstate"); 3663 3664 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3665 if (!adev->ip_blocks[i].status.valid) 3666 continue; 3667 3668 /* displays are handled separately */ 3669 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3670 continue; 3671 3672 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3673 if (r) 3674 goto unwind; 3675 } 3676 3677 return 0; 3678 unwind: 3679 rec = amdgpu_device_ip_resume_phase3(adev); 3680 if (rec) 3681 dev_err(adev->dev, 3682 "amdgpu_device_ip_resume_phase3 failed during unwind: %d\n", 3683 rec); 3684 3685 amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW); 3686 3687 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3688 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3689 3690 return r; 3691 } 3692 3693 /** 3694 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3695 * 3696 * @adev: amdgpu_device pointer 3697 * 3698 * Main suspend function for hardware IPs. The list of all the hardware 3699 * IPs that make up the asic is walked, clockgating is disabled and the 3700 * suspend callbacks are run. suspend puts the hardware and software state 3701 * in each IP into a state suitable for suspend. 3702 * Returns 0 on success, negative error code on failure. 3703 */ 3704 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3705 { 3706 int i, r, rec; 3707 3708 if (adev->in_s0ix) 3709 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3710 3711 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3712 if (!adev->ip_blocks[i].status.valid) 3713 continue; 3714 /* displays are handled in phase1 */ 3715 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3716 continue; 3717 /* PSP lost connection when err_event_athub occurs */ 3718 if (amdgpu_ras_intr_triggered() && 3719 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3720 adev->ip_blocks[i].status.hw = false; 3721 continue; 3722 } 3723 3724 /* skip unnecessary suspend if we have not initialized them yet */ 3725 if (!amdgpu_ip_member_of_hwini( 3726 adev, adev->ip_blocks[i].version->type)) 3727 continue; 3728 3729 /* Since we skip suspend for S0i3, we need to cancel the delayed 3730 * idle work here as the suspend callback never gets called. 3731 */ 3732 if (adev->in_s0ix && 3733 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3734 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3735 cancel_delayed_work_sync(&adev->gfx.idle_work); 3736 /* skip suspend of gfx/mes and psp for S0ix 3737 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3738 * like at runtime. PSP is also part of the always on hardware 3739 * so no need to suspend it.
3740 */ 3741 if (adev->in_s0ix && 3742 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3743 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3744 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3745 continue; 3746 3747 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3748 if (adev->in_s0ix && 3749 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3750 IP_VERSION(5, 0, 0)) && 3751 (adev->ip_blocks[i].version->type == 3752 AMD_IP_BLOCK_TYPE_SDMA)) 3753 continue; 3754 3755 /* During cold boot, swPSP provides the IMU and RLC FW binaries to TOS. 3756 * These live in the TMR and are expected to be reused by PSP-TOS to reload 3757 * from that location; RLC Autoload also gets loaded automatically 3758 * from there based on the PMFW -> PSP message during the re-init sequence. 3759 * Therefore, psp suspend & resume should be skipped to avoid destroying 3760 * the TMR and reloading the FWs again on IMU-enabled APU ASICs. 3761 */ 3762 if (amdgpu_in_reset(adev) && 3763 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3764 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3765 continue; 3766 3767 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3768 if (r) 3769 goto unwind; 3770 3771 /* handle putting the SMC in the appropriate state */ 3772 if (!amdgpu_sriov_vf(adev)) { 3773 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3774 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3775 if (r) { 3776 dev_err(adev->dev, 3777 "SMC failed to set mp1 state %d, %d\n", 3778 adev->mp1_state, r); 3779 goto unwind; 3780 } 3781 } 3782 } 3783 } 3784 3785 return 0; 3786 unwind: 3787 /* unwinding suspend phase 2 = running resume phase 1 + resume phase 2 */ 3788 rec = amdgpu_device_ip_resume_phase1(adev); 3789 if (rec) { 3790 dev_err(adev->dev, 3791 "amdgpu_device_ip_resume_phase1 failed during unwind: %d\n", 3792 rec); 3793 return r; 3794 } 3795 3796 rec = amdgpu_device_fw_loading(adev); 3797 if (rec) { 3798 dev_err(adev->dev, 3799 "amdgpu_device_fw_loading failed during unwind: %d\n", 3800 rec); 3801 return r; 3802 } 3803 3804 rec = amdgpu_device_ip_resume_phase2(adev); 3805 if (rec) { 3806 dev_err(adev->dev, 3807 "amdgpu_device_ip_resume_phase2 failed during unwind: %d\n", 3808 rec); 3809 return r; 3810 } 3811 3812 return r; 3813 } 3814 3815 /** 3816 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3817 * 3818 * @adev: amdgpu_device pointer 3819 * 3820 * Main suspend function for hardware IPs. The list of all the hardware 3821 * IPs that make up the asic is walked, clockgating is disabled and the 3822 * suspend callbacks are run. suspend puts the hardware and software state 3823 * in each IP into a state suitable for suspend. 3824 * Returns 0 on success, negative error code on failure.
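 *
 * Suspend is split in two: amdgpu_device_ip_suspend_phase1() handles
 * only the display (DCE) blocks, while phase 2 covers everything else,
 * with a number of S0ix- and reset-specific exceptions applied there.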
3825 */ 3826 static int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3827 { 3828 int r; 3829 3830 if (amdgpu_sriov_vf(adev)) { 3831 amdgpu_virt_fini_data_exchange(adev); 3832 amdgpu_virt_request_full_gpu(adev, false); 3833 } 3834 3835 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3836 3837 r = amdgpu_device_ip_suspend_phase1(adev); 3838 if (r) 3839 return r; 3840 r = amdgpu_device_ip_suspend_phase2(adev); 3841 3842 if (amdgpu_sriov_vf(adev)) 3843 amdgpu_virt_release_full_gpu(adev, false); 3844 3845 return r; 3846 } 3847 3848 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3849 { 3850 int i, r; 3851 3852 static enum amd_ip_block_type ip_order[] = { 3853 AMD_IP_BLOCK_TYPE_COMMON, 3854 AMD_IP_BLOCK_TYPE_GMC, 3855 AMD_IP_BLOCK_TYPE_PSP, 3856 AMD_IP_BLOCK_TYPE_IH, 3857 }; 3858 3859 for (i = 0; i < adev->num_ip_blocks; i++) { 3860 int j; 3861 struct amdgpu_ip_block *block; 3862 3863 block = &adev->ip_blocks[i]; 3864 block->status.hw = false; 3865 3866 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3867 3868 if (block->version->type != ip_order[j] || 3869 !block->status.valid) 3870 continue; 3871 3872 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3873 if (r) { 3874 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3875 block->version->funcs->name); 3876 return r; 3877 } 3878 block->status.hw = true; 3879 } 3880 } 3881 3882 return 0; 3883 } 3884 3885 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3886 { 3887 struct amdgpu_ip_block *block; 3888 int i, r = 0; 3889 3890 static enum amd_ip_block_type ip_order[] = { 3891 AMD_IP_BLOCK_TYPE_SMC, 3892 AMD_IP_BLOCK_TYPE_DCE, 3893 AMD_IP_BLOCK_TYPE_GFX, 3894 AMD_IP_BLOCK_TYPE_SDMA, 3895 AMD_IP_BLOCK_TYPE_MES, 3896 AMD_IP_BLOCK_TYPE_UVD, 3897 AMD_IP_BLOCK_TYPE_VCE, 3898 AMD_IP_BLOCK_TYPE_VCN, 3899 AMD_IP_BLOCK_TYPE_JPEG 3900 }; 3901 3902 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3903 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3904 3905 if (!block) 3906 continue; 3907 3908 if (block->status.valid && !block->status.hw) { 3909 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3910 r = amdgpu_ip_block_resume(block); 3911 } else { 3912 r = block->version->funcs->hw_init(block); 3913 } 3914 3915 if (r) { 3916 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3917 block->version->funcs->name); 3918 break; 3919 } 3920 block->status.hw = true; 3921 } 3922 } 3923 3924 return r; 3925 } 3926 3927 /** 3928 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3929 * 3930 * @adev: amdgpu_device pointer 3931 * 3932 * First resume function for hardware IPs. The list of all the hardware 3933 * IPs that make up the asic is walked and the resume callbacks are run for 3934 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3935 * after a suspend and updates the software state as necessary. This 3936 * function is also used for restoring the GPU after a GPU reset. 3937 * Returns 0 on success, negative error code on failure. 
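 *
 * When running under SR-IOV, the PSP block is resumed in this phase as well.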
3938 */
3939 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3940 {
3941 	int i, r;
3942 
3943 	for (i = 0; i < adev->num_ip_blocks; i++) {
3944 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3945 			continue;
3946 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3947 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3948 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3949 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3950 
3951 			r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3952 			if (r)
3953 				return r;
3954 		}
3955 	}
3956 
3957 	return 0;
3958 }
3959 
3960 /**
3961  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3962  *
3963  * @adev: amdgpu_device pointer
3964  *
3965  * Second resume function for hardware IPs. The list of all the hardware
3966  * IPs that make up the asic is walked and the resume callbacks are run for
3967  * all blocks except COMMON, GMC, IH, DCE and PSP. resume puts the hardware
3968  * into a functional state after a suspend and updates the software state as
3969  * necessary. This function is also used for restoring the GPU after a GPU
3970  * reset.
3971  * Returns 0 on success, negative error code on failure.
3972  */
3973 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3974 {
3975 	int i, r;
3976 
3977 	for (i = 0; i < adev->num_ip_blocks; i++) {
3978 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3979 			continue;
3980 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3981 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3982 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3983 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE ||
3984 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3985 			continue;
3986 		r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3987 		if (r)
3988 			return r;
3989 	}
3990 
3991 	return 0;
3992 }
3993 
3994 /**
3995  * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs
3996  *
3997  * @adev: amdgpu_device pointer
3998  *
3999  * Third resume function for hardware IPs. The list of all the hardware
4000  * IPs that make up the asic is walked and the resume callbacks are run for
4001  * all DCE blocks. resume puts the hardware into a functional state after a
4002  * suspend and updates the software state as necessary. This function is
4003  * also used for restoring the GPU after a GPU reset.
4004  *
4005  * Returns 0 on success, negative error code on failure.
4006  */
4007 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
4008 {
4009 	int i, r;
4010 
4011 	for (i = 0; i < adev->num_ip_blocks; i++) {
4012 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
4013 			continue;
4014 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
4015 			r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
4016 			if (r)
4017 				return r;
4018 		}
4019 	}
4020 
4021 	return 0;
4022 }
4023 
4024 /**
4025  * amdgpu_device_ip_resume - run resume for hardware IPs
4026  *
4027  * @adev: amdgpu_device pointer
4028  *
4029  * Main resume function for hardware IPs. The hardware IPs
4030  * are split into multiple resume functions because they are
4031  * also used in recovering from a GPU reset and some additional
4032  * steps need to be taken between them. In this case (S3/S4) they are
4033  * run sequentially.
4034  * Returns 0 on success, negative error code on failure.
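 *
 * The overall sequence is: resume phase 1 (COMMON/GMC/IH, plus PSP for
 * SR-IOV), firmware loading, resume phase 2, fence driver HW init, then
 * resume phase 3 (DCE).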
4035 */
4036 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
4037 {
4038 	int r;
4039 
4040 	r = amdgpu_device_ip_resume_phase1(adev);
4041 	if (r)
4042 		return r;
4043 
4044 	r = amdgpu_device_fw_loading(adev);
4045 	if (r)
4046 		return r;
4047 
4048 	r = amdgpu_device_ip_resume_phase2(adev);
4049 
4050 	if (adev->mman.buffer_funcs_ring->sched.ready)
4051 		amdgpu_ttm_set_buffer_funcs_status(adev, true);
4052 
4053 	if (r)
4054 		return r;
4055 
4056 	amdgpu_fence_driver_hw_init(adev);
4057 
4058 	r = amdgpu_device_ip_resume_phase3(adev);
4059 
4060 	return r;
4061 }
4062 
4063 /**
4064  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
4065  *
4066  * @adev: amdgpu_device pointer
4067  *
4068  * Query the VBIOS data tables to determine if the board supports SR-IOV.
4069  */
4070 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
4071 {
4072 	if (amdgpu_sriov_vf(adev)) {
4073 		if (adev->is_atom_fw) {
4074 			if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
4075 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
4076 		} else {
4077 			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
4078 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
4079 		}
4080 
4081 		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
4082 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
4083 	}
4084 }
4085 
4086 /**
4087  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
4088  *
4089  * @pdev: pci device context
4090  * @asic_type: AMD asic type
4091  *
4092  * Check if there is DC (new modesetting infrastructure) support for an asic.
4093  * Returns true if DC has support, false if not.
4094  */
4095 bool amdgpu_device_asic_has_dc_support(struct pci_dev *pdev,
4096 				       enum amd_asic_type asic_type)
4097 {
4098 	switch (asic_type) {
4099 #ifdef CONFIG_DRM_AMDGPU_SI
4100 	case CHIP_HAINAN:
4101 #endif
4102 	case CHIP_TOPAZ:
4103 		/* chips with no display hardware */
4104 		return false;
4105 #if defined(CONFIG_DRM_AMD_DC)
4106 	case CHIP_TAHITI:
4107 	case CHIP_PITCAIRN:
4108 	case CHIP_VERDE:
4109 	case CHIP_OLAND:
4110 		return amdgpu_dc != 0 && IS_ENABLED(CONFIG_DRM_AMD_DC_SI);
4111 	case CHIP_KAVERI:
4112 	case CHIP_KABINI:
4113 	case CHIP_MULLINS:
4114 		/*
4115 		 * We have systems in the wild with these ASICs that require
4116 		 * TRAVIS and NUTMEG support which is not supported with DC.
4117 		 *
4118 		 * Fall back to the non-DC driver here by default so as not to
4119 		 * cause regressions.
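		 *
		 * With the default amdgpu_dc value of -1 (auto) this returns
		 * false here; an explicit amdgpu_dc=1 still opts in to DC.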
4120 		 */
4121 		return amdgpu_dc > 0;
4122 	default:
4123 		return amdgpu_dc != 0;
4124 #else
4125 	default:
4126 		if (amdgpu_dc > 0)
4127 			dev_info_once(
4128 				&pdev->dev,
4129 				"Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
4130 		return false;
4131 #endif
4132 	}
4133 }
4134 
4135 /**
4136  * amdgpu_device_has_dc_support - check if dc is supported
4137  *
4138  * @adev: amdgpu_device pointer
4139  *
4140  * Returns true for supported, false for not supported
4141  */
4142 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
4143 {
4144 	if (adev->enable_virtual_display ||
4145 	    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
4146 		return false;
4147 
4148 	return amdgpu_device_asic_has_dc_support(adev->pdev, adev->asic_type);
4149 }
4150 
4151 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
4152 {
4153 	struct amdgpu_device *adev =
4154 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
4155 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
4156 
4157 	/* It's a bug to not have a hive within this function */
4158 	if (WARN_ON(!hive))
4159 		return;
4160 
4161 	/*
4162 	 * Use task barrier to synchronize all xgmi reset works across the
4163 	 * hive. task_barrier_enter and task_barrier_exit will block
4164 	 * until all the threads running the xgmi reset works reach
4165 	 * those points. task_barrier_full will do both blocks.
4166 	 */
4167 	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
4168 
4169 		task_barrier_enter(&hive->tb);
4170 		adev->asic_reset_res = amdgpu_device_baco_enter(adev);
4171 
4172 		if (adev->asic_reset_res)
4173 			goto fail;
4174 
4175 		task_barrier_exit(&hive->tb);
4176 		adev->asic_reset_res = amdgpu_device_baco_exit(adev);
4177 
4178 		if (adev->asic_reset_res)
4179 			goto fail;
4180 
4181 		amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
4182 	} else {
4183 
4184 		task_barrier_full(&hive->tb);
4185 		adev->asic_reset_res = amdgpu_asic_reset(adev);
4186 	}
4187 
4188 fail:
4189 	if (adev->asic_reset_res)
4190 		dev_warn(adev->dev,
4191 			 "ASIC reset failed with error %d for drm dev %s",
4192 			 adev->asic_reset_res, adev_to_drm(adev)->unique);
4193 	amdgpu_put_xgmi_hive(hive);
4194 }
4195 
4196 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
4197 {
4198 	char *input = amdgpu_lockup_timeout;
4199 	char *timeout_setting = NULL;
4200 	int index = 0;
4201 	long timeout;
4202 	int ret = 0;
4203 
4204 	/* By default the timeout for all queues is 2 seconds */
4205 	adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
4206 		adev->video_timeout = msecs_to_jiffies(2000);
4207 
4208 	if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
4209 		return 0;
4210 
4211 	while ((timeout_setting = strsep(&input, ",")) &&
4212 	       strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
4213 		ret = kstrtol(timeout_setting, 0, &timeout);
4214 		if (ret)
4215 			return ret;
4216 
4217 		if (timeout == 0) {
4218 			index++;
4219 			continue;
4220 		} else if (timeout < 0) {
4221 			timeout = MAX_SCHEDULE_TIMEOUT;
4222 			dev_warn(adev->dev, "lockup timeout disabled");
4223 			add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
4224 		} else {
4225 			timeout = msecs_to_jiffies(timeout);
4226 		}
4227 
4228 		switch (index++) {
4229 		case 0:
4230 			adev->gfx_timeout = timeout;
4231 			break;
4232 		case 1:
4233 			adev->compute_timeout = timeout;
4234 			break;
4235 		case 2:
4236 			adev->sdma_timeout = timeout;
4237 			break;
4238 		case 3:
4239 			adev->video_timeout = timeout;
4240 			break;
4241 		default:
4242 			break;
4243 		}
4244 	}
4245 
4246 	/* When only one value is specified, apply it
	   to all queues. */
4247 	if (index == 1)
4248 		adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
4249 			adev->video_timeout = timeout;
4250 
4251 	return ret;
4252 }
4253 
4254 /**
4255  * amdgpu_device_check_iommu_direct_map - check if RAM is direct mapped to the GPU
4256  *
4257  * @adev: amdgpu_device pointer
4258  *
4259  * RAM is direct mapped to the GPU if the IOMMU is disabled or in passthrough mode
4260  */
4261 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
4262 {
4263 	struct iommu_domain *domain;
4264 
4265 	domain = iommu_get_domain_for_dev(adev->dev);
4266 	if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
4267 		adev->ram_is_direct_mapped = true;
4268 }
4269 
4270 #if defined(CONFIG_HSA_AMD_P2P)
4271 /**
4272  * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
4273  *
4274  * @adev: amdgpu_device pointer
4275  *
4276  * Returns true if the IOMMU is remapping the BAR address, false otherwise
4277  */
4278 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
4279 {
4280 	struct iommu_domain *domain;
4281 
4282 	domain = iommu_get_domain_for_dev(adev->dev);
4283 	if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
4284 		       domain->type == IOMMU_DOMAIN_DMA_FQ))
4285 		return true;
4286 
4287 	return false;
4288 }
4289 #endif
4290 
4291 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
4292 {
4293 	if (amdgpu_mcbp == 1)
4294 		adev->gfx.mcbp = true;
4295 	else if (amdgpu_mcbp == 0)
4296 		adev->gfx.mcbp = false;
4297 
4298 	if (amdgpu_sriov_vf(adev))
4299 		adev->gfx.mcbp = true;
4300 
4301 	if (adev->gfx.mcbp)
4302 		dev_info(adev->dev, "MCBP is enabled\n");
4303 }
4304 
4305 static int amdgpu_device_sys_interface_init(struct amdgpu_device *adev)
4306 {
4307 	int r;
4308 
4309 	r = amdgpu_atombios_sysfs_init(adev);
4310 	if (r)
4311 		drm_err(&adev->ddev,
4312 			"registering atombios sysfs failed (%d).\n", r);
4313 
4314 	r = amdgpu_pm_sysfs_init(adev);
4315 	if (r)
4316 		dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r);
4317 
4318 	r = amdgpu_ucode_sysfs_init(adev);
4319 	if (r) {
4320 		adev->ucode_sysfs_en = false;
4321 		dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r);
4322 	} else {
4323 		adev->ucode_sysfs_en = true;
	}
4324 
4325 	r = amdgpu_device_attr_sysfs_init(adev);
4326 	if (r)
4327 		dev_err(adev->dev, "Could not create amdgpu device attr\n");
4328 
4329 	r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
4330 	if (r)
4331 		dev_err(adev->dev,
4332 			"Could not create amdgpu board attributes\n");
4333 
4334 	amdgpu_fru_sysfs_init(adev);
4335 	amdgpu_reg_state_sysfs_init(adev);
4336 	amdgpu_xcp_sysfs_init(adev);
4337 	amdgpu_uma_sysfs_init(adev);
4338 
4339 	return r;
4340 }
4341 
4342 static void amdgpu_device_sys_interface_fini(struct amdgpu_device *adev)
4343 {
4344 	if (adev->pm.sysfs_initialized)
4345 		amdgpu_pm_sysfs_fini(adev);
4346 	if (adev->ucode_sysfs_en)
4347 		amdgpu_ucode_sysfs_fini(adev);
4348 	amdgpu_device_attr_sysfs_fini(adev);
4349 	amdgpu_fru_sysfs_fini(adev);
4350 
4351 	amdgpu_reg_state_sysfs_fini(adev);
4352 	amdgpu_xcp_sysfs_fini(adev);
4353 	amdgpu_uma_sysfs_fini(adev);
4354 }
4355 
4356 /**
4357  * amdgpu_device_init - initialize the driver
4358  *
4359  * @adev: amdgpu_device pointer
4360  * @flags: driver flags
4361  *
4362  * Initializes the driver info and hw (all asics).
4363  * Returns 0 for success or an error on failure.
4364  * Called at driver startup.
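 *
 * Roughly: register accessors, locks and work items are set up first, then
 * early IP init runs, the ASIC is reset/posted if required, and finally the
 * full IP init, late init and sysfs registration complete the sequence.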
4365 */
4366 int amdgpu_device_init(struct amdgpu_device *adev,
4367 		       uint32_t flags)
4368 {
4369 	struct pci_dev *pdev = adev->pdev;
4370 	int r, i;
4371 	bool px = false;
4372 	u32 max_MBps;
4373 	int tmp;
4374 
4375 	adev->shutdown = false;
4376 	adev->flags = flags;
4377 
4378 	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
4379 		adev->asic_type = amdgpu_force_asic_type;
4380 	else
4381 		adev->asic_type = flags & AMD_ASIC_MASK;
4382 
4383 	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
4384 	if (amdgpu_emu_mode == 1)
4385 		adev->usec_timeout *= 10;
4386 	adev->gmc.gart_size = 512 * 1024 * 1024;
4387 	adev->accel_working = false;
4388 	adev->num_rings = 0;
4389 	RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
4390 	adev->mman.buffer_funcs = NULL;
4391 	adev->mman.buffer_funcs_ring = NULL;
4392 	adev->vm_manager.vm_pte_funcs = NULL;
4393 	adev->vm_manager.vm_pte_num_scheds = 0;
4394 	adev->gmc.gmc_funcs = NULL;
4395 	adev->harvest_ip_mask = 0x0;
4396 	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
4397 	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
4398 
4399 	adev->smc_rreg = &amdgpu_invalid_rreg;
4400 	adev->smc_wreg = &amdgpu_invalid_wreg;
4401 	adev->pcie_rreg = &amdgpu_invalid_rreg;
4402 	adev->pcie_wreg = &amdgpu_invalid_wreg;
4403 	adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
4404 	adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
4405 	adev->pciep_rreg = &amdgpu_invalid_rreg;
4406 	adev->pciep_wreg = &amdgpu_invalid_wreg;
4407 	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
4408 	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
4409 	adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext;
4410 	adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;
4411 	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
4412 	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
4413 	adev->didt_rreg = &amdgpu_invalid_rreg;
4414 	adev->didt_wreg = &amdgpu_invalid_wreg;
4415 	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
4416 	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
4417 	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
4418 	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
4419 
4420 	dev_info(
4421 		adev->dev,
4422 		"initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
4423 		amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
4424 		pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
4425 
4426 	/* All mutex initialization is done here so that these functions
4427 	 * can be called again later without locking issues.
4428 	 */
4429 	mutex_init(&adev->firmware.mutex);
4430 	mutex_init(&adev->pm.mutex);
4431 	mutex_init(&adev->gfx.gpu_clock_mutex);
4432 	mutex_init(&adev->srbm_mutex);
4433 	mutex_init(&adev->gfx.pipe_reserve_mutex);
4434 	mutex_init(&adev->gfx.gfx_off_mutex);
4435 	mutex_init(&adev->gfx.partition_mutex);
4436 	mutex_init(&adev->grbm_idx_mutex);
4437 	mutex_init(&adev->mn_lock);
4438 	mutex_init(&adev->virt.vf_errors.lock);
4439 	hash_init(adev->mn_hash);
4440 	mutex_init(&adev->psp.mutex);
4441 	mutex_init(&adev->notifier_lock);
4442 	mutex_init(&adev->pm.stable_pstate_ctx_lock);
4443 	mutex_init(&adev->benchmark_mutex);
4444 	mutex_init(&adev->gfx.reset_sem_mutex);
4445 	/* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
4446 	mutex_init(&adev->enforce_isolation_mutex);
4447 	for (i = 0; i < MAX_XCP; ++i) {
4448 		adev->isolation[i].spearhead = dma_fence_get_stub();
4449 		amdgpu_sync_create(&adev->isolation[i].active);
4450 		amdgpu_sync_create(&adev->isolation[i].prev);
4451 	}
4452 	mutex_init(&adev->gfx.userq_sch_mutex);
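	/* Serialize workload-profile switches for the GFX and VCN blocks */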
4453 	mutex_init(&adev->gfx.workload_profile_mutex);
4454 	mutex_init(&adev->vcn.workload_profile_mutex);
4455 
4456 	amdgpu_device_init_apu_flags(adev);
4457 
4458 	r = amdgpu_device_check_arguments(adev);
4459 	if (r)
4460 		return r;
4461 
4462 	spin_lock_init(&adev->mmio_idx_lock);
4463 	spin_lock_init(&adev->smc_idx_lock);
4464 	spin_lock_init(&adev->pcie_idx_lock);
4465 	spin_lock_init(&adev->uvd_ctx_idx_lock);
4466 	spin_lock_init(&adev->didt_idx_lock);
4467 	spin_lock_init(&adev->gc_cac_idx_lock);
4468 	spin_lock_init(&adev->se_cac_idx_lock);
4469 	spin_lock_init(&adev->audio_endpt_idx_lock);
4470 	spin_lock_init(&adev->mm_stats.lock);
4471 	spin_lock_init(&adev->virt.rlcg_reg_lock);
4472 	spin_lock_init(&adev->wb.lock);
4473 
4474 	xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ);
4475 
4476 	INIT_LIST_HEAD(&adev->reset_list);
4477 
4478 	INIT_LIST_HEAD(&adev->ras_list);
4479 
4480 	INIT_LIST_HEAD(&adev->pm.od_kobj_list);
4481 
4482 	xa_init(&adev->userq_doorbell_xa);
4483 
4484 	INIT_DELAYED_WORK(&adev->delayed_init_work,
4485 			  amdgpu_device_delayed_init_work_handler);
4486 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
4487 			  amdgpu_device_delay_enable_gfx_off);
4488 	/*
4489 	 * Initialize the enforce_isolation work structures for each XCP
4490 	 * partition. This work handler is responsible for enforcing shader
4491 	 * isolation on AMD GPUs. It counts the number of emitted fences for
4492 	 * each GFX and compute ring. If there are any fences, it schedules
4493 	 * the `enforce_isolation_work` to be run after a delay. If there are
4494 	 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
4495 	 * runqueue.
4496 	 */
4497 	for (i = 0; i < MAX_XCP; i++) {
4498 		INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
4499 				  amdgpu_gfx_enforce_isolation_handler);
4500 		adev->gfx.enforce_isolation[i].adev = adev;
4501 		adev->gfx.enforce_isolation[i].xcp_id = i;
4502 	}
4503 
4504 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
4505 	INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
4506 
4507 	adev->gfx.gfx_off_req_count = 1;
4508 	adev->gfx.gfx_off_residency = 0;
4509 	adev->gfx.gfx_off_entrycount = 0;
4510 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
4511 
4512 	atomic_set(&adev->throttling_logging_enabled, 1);
4513 	/*
4514 	 * If throttling continues, logging will be performed every minute
4515 	 * to avoid log flooding. "-1" is subtracted since the thermal
4516 	 * throttling interrupt comes every second. Thus, the total logging
4517 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
4518 	 * for the throttling interrupt) = 60 seconds.
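	 * (ratelimit_state_init() takes its interval in jiffies, hence the
	 * (60 - 1) * HZ below.)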
4519 	 */
4520 	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
4521 
4522 	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
4523 
4524 	/* Registers mapping */
4525 	/* TODO: block userspace mapping of io register */
4526 	if (adev->asic_type >= CHIP_BONAIRE) {
4527 		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
4528 		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
4529 	} else {
4530 		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
4531 		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
4532 	}
4533 
4534 	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
4535 		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
4536 
4537 	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
4538 	if (!adev->rmmio)
4539 		return -ENOMEM;
4540 
4541 	dev_info(adev->dev, "register mmio base: 0x%08X\n",
4542 		 (uint32_t)adev->rmmio_base);
4543 	dev_info(adev->dev, "register mmio size: %u\n",
4544 		 (unsigned int)adev->rmmio_size);
4545 
4546 	/*
4547 	 * The reset domain needs to be present early, before the XGMI hive is
4548 	 * discovered (if any) and initialized, so that the reset sem and
4549 	 * in_gpu_reset flag can be used early on during init and before calling RREG32.
4550 	 */
4551 	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
4552 	if (!adev->reset_domain)
4553 		return -ENOMEM;
4554 
4555 	/* detect hw virtualization here */
4556 	amdgpu_virt_init(adev);
4557 
4558 	amdgpu_device_get_pcie_info(adev);
4559 
4560 	r = amdgpu_device_get_job_timeout_settings(adev);
4561 	if (r) {
4562 		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4563 		return r;
4564 	}
4565 
4566 	amdgpu_device_set_mcbp(adev);
4567 
4568 	/*
4569 	 * By default, use the default init level where all blocks are expected
4570 	 * to be initialized. At present, a 'swinit' of the blocks is required to
4571 	 * be completed before the need for a different level is detected.
4572 	 */
4573 	amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
4574 	/* early init functions */
4575 	r = amdgpu_device_ip_early_init(adev);
4576 	if (r)
4577 		return r;
4578 
4579 	/*
4580 	 * No need to remove conflicting FBs for non-display class devices.
4581 	 * This prevents the sysfb from being freed accidentally.
4582 	 */
4583 	if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA ||
4584 	    (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) {
4585 		/* Get rid of things like offb */
4586 		r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
4587 		if (r)
4588 			return r;
4589 	}
4590 
4591 	/* Enable TMZ based on IP_VERSION */
4592 	amdgpu_gmc_tmz_set(adev);
4593 
4594 	if (amdgpu_sriov_vf(adev) &&
4595 	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
4596 		/* VF MMIO access (except the mailbox range) from the CPU
4597 		 * will be blocked during SR-IOV runtime
4598 		 */
4599 		adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
4600 
4601 	amdgpu_gmc_noretry_set(adev);
4602 	/* Need to get xgmi info early to decide the reset behavior */
4603 	if (adev->gmc.xgmi.supported) {
4604 		r = adev->gfxhub.funcs->get_xgmi_info(adev);
4605 		if (r)
4606 			return r;
4607 	}
4608 
4609 	/* enable PCIe atomic ops */
4610 	if (amdgpu_sriov_vf(adev)) {
4611 		if (adev->virt.fw_reserve.p_pf2vf)
4612 			adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
4613 				adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
4614 				(PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4615 	/* APUs with gfx9 onwards don't rely on PCIe atomics; rather, the
4616 	 * internal path natively supports atomics, so set have_atomics_support to true.
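	 * For VFs, the supported-atomics flag instead comes from the pf2vf
	 * exchange area above and requires both 32- and 64-bit atomic
	 * completer support.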
4617 	 */
4618 	} else if ((adev->flags & AMD_IS_APU) &&
4619 		   (amdgpu_ip_version(adev, GC_HWIP, 0) >
4620 		    IP_VERSION(9, 0, 0))) {
4621 		adev->have_atomics_support = true;
4622 	} else {
4623 		adev->have_atomics_support =
4624 			!pci_enable_atomic_ops_to_root(adev->pdev,
4625 					PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
4626 					PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4627 	}
4628 
4629 	if (!adev->have_atomics_support)
4630 		dev_info(adev->dev, "PCIe atomic ops are not supported\n");
4631 
4632 	/* doorbell bar mapping and doorbell index init */
4633 	amdgpu_doorbell_init(adev);
4634 
4635 	if (amdgpu_emu_mode == 1) {
4636 		/* post the asic in emulation mode */
4637 		emu_soc_asic_init(adev);
4638 		goto fence_driver_init;
4639 	}
4640 
4641 	amdgpu_reset_init(adev);
4642 
4643 	/* detect if we have an SR-IOV vBIOS */
4644 	if (adev->bios)
4645 		amdgpu_device_detect_sriov_bios(adev);
4646 
4647 	/* check if we need to reset the asic,
4648 	 * e.g. the driver was not cleanly unloaded previously, etc.
4649 	 */
4650 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
4651 		if (adev->gmc.xgmi.num_physical_nodes) {
4652 			dev_info(adev->dev, "Pending hive reset.\n");
4653 			amdgpu_set_init_level(adev,
4654 					      AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
4655 		} else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
4656 			   !amdgpu_device_has_display_hardware(adev)) {
4657 			r = psp_gpu_reset(adev);
4658 		} else {
4659 			tmp = amdgpu_reset_method;
4660 			/* The driver should do a default reset when loading or reloading,
4661 			 * regardless of the reset_method module parameter.
4662 			 */
4663 			amdgpu_reset_method = AMD_RESET_METHOD_NONE;
4664 			r = amdgpu_asic_reset(adev);
4665 			amdgpu_reset_method = tmp;
4666 		}
4667 
4668 		if (r) {
4669 			dev_err(adev->dev, "asic reset on init failed\n");
4670 			goto failed;
4671 		}
4672 	}
4673 
4674 	/* Post the card if necessary */
4675 	if (amdgpu_device_need_post(adev)) {
4676 		if (!adev->bios) {
4677 			dev_err(adev->dev, "no vBIOS found\n");
4678 			r = -EINVAL;
4679 			goto failed;
4680 		}
4681 		dev_info(adev->dev, "GPU posting now...\n");
4682 		r = amdgpu_device_asic_init(adev);
4683 		if (r) {
4684 			dev_err(adev->dev, "gpu post error!\n");
4685 			goto failed;
4686 		}
4687 	}
4688 
4689 	if (adev->bios) {
4690 		if (adev->is_atom_fw) {
4691 			/* Initialize clocks */
4692 			r = amdgpu_atomfirmware_get_clock_info(adev);
4693 			if (r) {
4694 				dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
4695 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4696 				goto failed;
4697 			}
4698 		} else {
4699 			/* Initialize clocks */
4700 			r = amdgpu_atombios_get_clock_info(adev);
4701 			if (r) {
4702 				dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
4703 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4704 				goto failed;
4705 			}
4706 			/* init i2c buses */
4707 			amdgpu_i2c_init(adev);
4708 		}
4709 	}
4710 
4711 fence_driver_init:
4712 	/* Fence driver */
4713 	r = amdgpu_fence_driver_sw_init(adev);
4714 	if (r) {
4715 		dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
4716 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
4717 		goto failed;
4718 	}
4719 
4720 	/* init the mode config */
4721 	drm_mode_config_init(adev_to_drm(adev));
4722 
4723 	r = amdgpu_device_ip_init(adev);
4724 	if (r) {
4725 		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
4726 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
4727 		goto release_ras_con;
4728 	}
4729 
4730 	amdgpu_fence_driver_hw_init(adev);
4731 
4732 	dev_info(adev->dev,
4733 		 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
4734 		 adev->gfx.config.max_shader_engines,
4735 		 adev->gfx.config.max_sh_per_se,
4736 		 adev->gfx.config.max_cu_per_sh,
4737 		 adev->gfx.cu_info.number);
4738 
4739 	adev->accel_working = true;
4740 
4741 	amdgpu_vm_check_compute_bug(adev);
4742 
4743 	/* Initialize the buffer migration limit. */
4744 	if (amdgpu_moverate >= 0)
4745 		max_MBps = amdgpu_moverate;
4746 	else
4747 		max_MBps = 8; /* Allow 8 MB/s. */
4748 	/* Get a log2 for easy divisions. */
4749 	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
4750 
4751 	/*
4752 	 * Register the gpu instance before amdgpu_device_enable_mgpu_fan_boost.
4753 	 * Otherwise the mgpu fan boost feature will be skipped because the
4754 	 * gpu instance count would be too low.
4755 	 */
4756 	amdgpu_register_gpu_instance(adev);
4757 
4758 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
4759 	 * explicit gating rather than handling it automatically.
4760 	 */
4761 	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
4762 		r = amdgpu_device_ip_late_init(adev);
4763 		if (r) {
4764 			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
4765 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
4766 			goto release_ras_con;
4767 		}
4768 		/* must succeed. */
4769 		amdgpu_ras_resume(adev);
4770 		queue_delayed_work(system_wq, &adev->delayed_init_work,
4771 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
4772 	}
4773 
4774 	if (amdgpu_sriov_vf(adev)) {
4775 		amdgpu_virt_release_full_gpu(adev, true);
4776 		flush_delayed_work(&adev->delayed_init_work);
4777 	}
4778 
4779 	/* Don't init kfd if the whole hive needs to be reset during init */
4780 	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
4781 		kgd2kfd_init_zone_device(adev);
4782 		kfd_update_svm_support_properties(adev);
4783 	}
4784 
4785 	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
4786 		amdgpu_xgmi_reset_on_init(adev);
4787 
4788 	/*
4789 	 * Register these sysfs interfaces after `late_init`, as some of the
4790 	 * operations performed in `late_init` might affect how the sysfs
4791 	 * interfaces are created.
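	 * (e.g. attributes whose visibility depends on state that is only
	 * settled during late_init)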
4792 	 */
4793 	r = amdgpu_device_sys_interface_init(adev);
4794 
4795 	if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
4796 		r = amdgpu_pmu_init(adev);
4797 		if (r)
4798 			dev_err(adev->dev, "amdgpu_pmu_init failed\n");
	}
4799 
4800 	/* Keep the stored PCI config space at hand for restore after a sudden PCI error */
4801 	if (amdgpu_device_cache_pci_state(adev->pdev))
4802 		pci_restore_state(pdev);
4803 
4804 	/* if we have more than one VGA card, then disable the amdgpu VGA resources */
4805 	/* this will fail for cards that aren't VGA class devices, just
4806 	 * ignore it
4807 	 */
4808 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4809 		vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
4810 
4811 	px = amdgpu_device_supports_px(adev);
4812 
4813 	if (px || (!dev_is_removable(&adev->pdev->dev) &&
4814 		   apple_gmux_detect(NULL, NULL)))
4815 		vga_switcheroo_register_client(adev->pdev,
4816 					       &amdgpu_switcheroo_ops, px);
4817 
4818 	if (px)
4819 		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
4820 
4821 	amdgpu_device_check_iommu_direct_map(adev);
4822 
4823 	adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
4824 	r = register_pm_notifier(&adev->pm_nb);
4825 	if (r)
4826 		goto failed;
4827 
4828 	return 0;
4829 
4830 release_ras_con:
4831 	if (amdgpu_sriov_vf(adev))
4832 		amdgpu_virt_release_full_gpu(adev, true);
4833 
4834 	/* failed in exclusive mode due to timeout */
4835 	if (amdgpu_sriov_vf(adev) &&
4836 	    !amdgpu_sriov_runtime(adev) &&
4837 	    amdgpu_virt_mmio_blocked(adev) &&
4838 	    !amdgpu_virt_wait_reset(adev)) {
4839 		dev_err(adev->dev, "VF exclusive mode timeout\n");
4840 		/* Don't send a request since the VF is inactive. */
4841 		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
4842 		adev->virt.ops = NULL;
4843 		r = -EAGAIN;
4844 	}
4845 	amdgpu_release_ras_context(adev);
4846 
4847 failed:
4848 	amdgpu_vf_error_trans_all(adev);
4849 
4850 	return r;
4851 }
4852 
4853 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4854 {
4855 
4856 	/* Clear all CPU mappings pointing to this device */
4857 	unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4858 
4859 	/* Unmap all mapped bars - Doorbell, registers and VRAM */
4860 	amdgpu_doorbell_fini(adev);
4861 
4862 	iounmap(adev->rmmio);
4863 	adev->rmmio = NULL;
4864 	if (adev->mman.aper_base_kaddr)
4865 		iounmap(adev->mman.aper_base_kaddr);
4866 	adev->mman.aper_base_kaddr = NULL;
4867 
4868 	/* Memory manager related */
4869 	if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4870 		arch_phys_wc_del(adev->gmc.vram_mtrr);
4871 		arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4872 	}
4873 }
4874 
4875 /**
4876  * amdgpu_device_fini_hw - tear down the driver
4877  *
4878  * @adev: amdgpu_device pointer
4879  *
4880  * Tear down the driver info (all asics).
4881  * Called at driver shutdown.
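 *
 * This is the hardware half of teardown; amdgpu_device_fini_sw() releases
 * the remaining software state afterwards.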
4882 */
4883 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4884 {
4885 	dev_info(adev->dev, "finishing device.\n");
4886 	flush_delayed_work(&adev->delayed_init_work);
4887 
4888 	if (adev->mman.initialized)
4889 		drain_workqueue(adev->mman.bdev.wq);
4890 	adev->shutdown = true;
4891 
4892 	unregister_pm_notifier(&adev->pm_nb);
4893 
4894 	/* make sure the IB tests are finished before entering exclusive mode
4895 	 * to avoid preemption of the IB tests
4896 	 */
4897 	if (amdgpu_sriov_vf(adev)) {
4898 		amdgpu_virt_request_full_gpu(adev, false);
4899 		amdgpu_virt_fini_data_exchange(adev);
4900 	}
4901 
4902 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
4903 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
4904 
4905 	/* disable all interrupts */
4906 	amdgpu_irq_disable_all(adev);
4907 	if (adev->mode_info.mode_config_initialized) {
4908 		if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4909 			drm_helper_force_disable_all(adev_to_drm(adev));
4910 		else
4911 			drm_atomic_helper_shutdown(adev_to_drm(adev));
4912 	}
4913 	amdgpu_fence_driver_hw_fini(adev);
4914 
4915 	amdgpu_device_sys_interface_fini(adev);
4916 
4917 	/* RAS features must be disabled before hw fini */
4918 	amdgpu_ras_pre_fini(adev);
4919 
4920 	amdgpu_ttm_set_buffer_funcs_status(adev, false);
4921 
4922 	/*
4923 	 * The device went through surprise hotplug; we need to destroy the topology
4924 	 * before ip_fini_early to prevent kfd locking refcount issues by calling
4925 	 * amdgpu_amdkfd_suspend()
4926 	 */
4927 	if (pci_dev_is_disconnected(adev->pdev))
4928 		amdgpu_amdkfd_device_fini_sw(adev);
4929 
4930 	amdgpu_device_ip_fini_early(adev);
4931 
4932 	amdgpu_irq_fini_hw(adev);
4933 
4934 	if (adev->mman.initialized)
4935 		ttm_device_clear_dma_mappings(&adev->mman.bdev);
4936 
4937 	amdgpu_gart_dummy_page_fini(adev);
4938 
4939 	if (pci_dev_is_disconnected(adev->pdev))
4940 		amdgpu_device_unmap_mmio(adev);
4941 
4942 }
4943 
4944 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4945 {
4946 	int i, idx;
4947 	bool px;
4948 
4949 	amdgpu_device_ip_fini(adev);
4950 	amdgpu_fence_driver_sw_fini(adev);
4951 	amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4952 	adev->accel_working = false;
4953 	dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4954 	for (i = 0; i < MAX_XCP; ++i) {
4955 		dma_fence_put(adev->isolation[i].spearhead);
4956 		amdgpu_sync_free(&adev->isolation[i].active);
4957 		amdgpu_sync_free(&adev->isolation[i].prev);
4958 	}
4959 
4960 	amdgpu_reset_fini(adev);
4961 
4962 	/* free i2c buses */
4963 	amdgpu_i2c_fini(adev);
4964 
4965 	if (adev->bios) {
4966 		if (amdgpu_emu_mode != 1)
4967 			amdgpu_atombios_fini(adev);
4968 		amdgpu_bios_release(adev);
4969 	}
4970 
4971 	kfree(adev->fru_info);
4972 	adev->fru_info = NULL;
4973 
4974 	kfree(adev->xcp_mgr);
4975 	adev->xcp_mgr = NULL;
4976 
4977 	px = amdgpu_device_supports_px(adev);
4978 
4979 	if (px || (!dev_is_removable(&adev->pdev->dev) &&
4980 		   apple_gmux_detect(NULL, NULL)))
4981 		vga_switcheroo_unregister_client(adev->pdev);
4982 
4983 	if (px)
4984 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
4985 
4986 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4987 		vga_client_unregister(adev->pdev);
4988 
4989 	if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4990 
4991 		iounmap(adev->rmmio);
4992 		adev->rmmio = NULL;
4993 		drm_dev_exit(idx);
4994 	}
4995 
4996 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
4997 		amdgpu_pmu_fini(adev);
4998 	if (adev->discovery.bin)
4999 		amdgpu_discovery_fini(adev);
5000 
5001 	amdgpu_reset_put_reset_domain(adev->reset_domain);
5002 	adev->reset_domain = NULL;
5003 
5004 	kfree(adev->pci_state);
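	/* Also free the cached PCIe switch (upstream/downstream port) config
	 * state used for PCIe error recovery.
	 */
5005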
kfree(adev->pcie_reset_ctx.swds_pcistate); 5006 kfree(adev->pcie_reset_ctx.swus_pcistate); 5007 } 5008 5009 /** 5010 * amdgpu_device_evict_resources - evict device resources 5011 * @adev: amdgpu device object 5012 * 5013 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 5014 * of the vram memory type. Mainly used for evicting device resources 5015 * at suspend time. 5016 * 5017 */ 5018 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 5019 { 5020 int ret; 5021 5022 /* No need to evict vram on APUs unless going to S4 */ 5023 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 5024 return 0; 5025 5026 /* No need to evict when going to S5 through S4 callbacks */ 5027 if (system_state == SYSTEM_POWER_OFF) 5028 return 0; 5029 5030 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 5031 if (ret) { 5032 dev_warn(adev->dev, "evicting device resources failed\n"); 5033 return ret; 5034 } 5035 5036 if (adev->in_s4) { 5037 ret = ttm_device_prepare_hibernation(&adev->mman.bdev); 5038 if (ret) 5039 dev_err(adev->dev, "prepare hibernation failed, %d\n", ret); 5040 } 5041 return ret; 5042 } 5043 5044 /* 5045 * Suspend & resume. 5046 */ 5047 /** 5048 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 5049 * @nb: notifier block 5050 * @mode: suspend mode 5051 * @data: data 5052 * 5053 * This function is called when the system is about to suspend or hibernate. 5054 * It is used to set the appropriate flags so that eviction can be optimized 5055 * in the pm prepare callback. 5056 */ 5057 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 5058 void *data) 5059 { 5060 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 5061 5062 switch (mode) { 5063 case PM_HIBERNATION_PREPARE: 5064 adev->in_s4 = true; 5065 break; 5066 case PM_POST_HIBERNATION: 5067 adev->in_s4 = false; 5068 break; 5069 } 5070 5071 return NOTIFY_DONE; 5072 } 5073 5074 /** 5075 * amdgpu_device_prepare - prepare for device suspend 5076 * 5077 * @dev: drm dev pointer 5078 * 5079 * Prepare to put the hw in the suspend state (all asics). 5080 * Returns 0 for success or an error on failure. 5081 * Called at driver suspend. 5082 */ 5083 int amdgpu_device_prepare(struct drm_device *dev) 5084 { 5085 struct amdgpu_device *adev = drm_to_adev(dev); 5086 int i, r; 5087 5088 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5089 return 0; 5090 5091 /* Evict the majority of BOs before starting suspend sequence */ 5092 r = amdgpu_device_evict_resources(adev); 5093 if (r) 5094 return r; 5095 5096 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 5097 5098 for (i = 0; i < adev->num_ip_blocks; i++) { 5099 if (!adev->ip_blocks[i].status.valid) 5100 continue; 5101 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 5102 continue; 5103 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 5104 if (r) 5105 return r; 5106 } 5107 5108 return 0; 5109 } 5110 5111 /** 5112 * amdgpu_device_complete - complete power state transition 5113 * 5114 * @dev: drm dev pointer 5115 * 5116 * Undo the changes from amdgpu_device_prepare. This will be 5117 * called on all resume transitions, including those that failed. 
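 *
 * Calls the .complete callback of every valid IP block that implements it.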
5118 */ 5119 void amdgpu_device_complete(struct drm_device *dev) 5120 { 5121 struct amdgpu_device *adev = drm_to_adev(dev); 5122 int i; 5123 5124 for (i = 0; i < adev->num_ip_blocks; i++) { 5125 if (!adev->ip_blocks[i].status.valid) 5126 continue; 5127 if (!adev->ip_blocks[i].version->funcs->complete) 5128 continue; 5129 adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]); 5130 } 5131 } 5132 5133 /** 5134 * amdgpu_device_suspend - initiate device suspend 5135 * 5136 * @dev: drm dev pointer 5137 * @notify_clients: notify in-kernel DRM clients 5138 * 5139 * Puts the hw in the suspend state (all asics). 5140 * Returns 0 for success or an error on failure. 5141 * Called at driver suspend. 5142 */ 5143 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 5144 { 5145 struct amdgpu_device *adev = drm_to_adev(dev); 5146 int r, rec; 5147 5148 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5149 return 0; 5150 5151 adev->in_suspend = true; 5152 5153 if (amdgpu_sriov_vf(adev)) { 5154 if (!adev->in_runpm) 5155 amdgpu_amdkfd_suspend_process(adev); 5156 amdgpu_virt_fini_data_exchange(adev); 5157 r = amdgpu_virt_request_full_gpu(adev, false); 5158 if (r) 5159 return r; 5160 } 5161 5162 r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3); 5163 if (r) 5164 goto unwind_sriov; 5165 5166 if (notify_clients) 5167 drm_client_dev_suspend(adev_to_drm(adev)); 5168 5169 cancel_delayed_work_sync(&adev->delayed_init_work); 5170 5171 amdgpu_ras_suspend(adev); 5172 5173 r = amdgpu_device_ip_suspend_phase1(adev); 5174 if (r) 5175 goto unwind_smartshift; 5176 5177 amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5178 r = amdgpu_userq_suspend(adev); 5179 if (r) 5180 goto unwind_ip_phase1; 5181 5182 r = amdgpu_device_evict_resources(adev); 5183 if (r) 5184 goto unwind_userq; 5185 5186 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5187 5188 amdgpu_fence_driver_hw_fini(adev); 5189 5190 r = amdgpu_device_ip_suspend_phase2(adev); 5191 if (r) 5192 goto unwind_evict; 5193 5194 if (amdgpu_sriov_vf(adev)) 5195 amdgpu_virt_release_full_gpu(adev, false); 5196 5197 return 0; 5198 5199 unwind_evict: 5200 if (adev->mman.buffer_funcs_ring->sched.ready) 5201 amdgpu_ttm_set_buffer_funcs_status(adev, true); 5202 amdgpu_fence_driver_hw_init(adev); 5203 5204 unwind_userq: 5205 rec = amdgpu_userq_resume(adev); 5206 if (rec) { 5207 dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec); 5208 return r; 5209 } 5210 rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5211 if (rec) { 5212 dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec); 5213 return r; 5214 } 5215 5216 unwind_ip_phase1: 5217 /* suspend phase 1 = resume phase 3 */ 5218 rec = amdgpu_device_ip_resume_phase3(adev); 5219 if (rec) { 5220 dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec); 5221 return r; 5222 } 5223 5224 unwind_smartshift: 5225 rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0); 5226 if (rec) { 5227 dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec); 5228 return r; 5229 } 5230 5231 if (notify_clients) 5232 drm_client_dev_resume(adev_to_drm(adev)); 5233 5234 amdgpu_ras_resume(adev); 5235 5236 unwind_sriov: 5237 if (amdgpu_sriov_vf(adev)) { 5238 rec = amdgpu_virt_request_full_gpu(adev, true); 5239 if (rec) { 5240 dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec); 5241 return r; 5242 } 5243 } 5244 5245 adev->in_suspend = adev->in_s0ix = adev->in_s3 = false; 5246 5247 return r; 5248 } 5249 5250 
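/*
 * amdgpu_virt_resume - restore VF state that may have changed while the VM
 * was suspended or migrated: re-enable MSIX, re-read the XGMI node id and
 * recompute the VRAM base offset for the new node.
 */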
static inline int amdgpu_virt_resume(struct amdgpu_device *adev) 5251 { 5252 int r; 5253 unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id; 5254 5255 /* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO) 5256 * may not work. The access could be blocked by nBIF protection as VF isn't in 5257 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX 5258 * so that QEMU reprograms MSIX table. 5259 */ 5260 amdgpu_restore_msix(adev); 5261 5262 r = adev->gfxhub.funcs->get_xgmi_info(adev); 5263 if (r) 5264 return r; 5265 5266 dev_info(adev->dev, "xgmi node, old id %d, new id %d\n", 5267 prev_physical_node_id, adev->gmc.xgmi.physical_node_id); 5268 5269 adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev); 5270 adev->vm_manager.vram_base_offset += 5271 adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size; 5272 5273 return 0; 5274 } 5275 5276 /** 5277 * amdgpu_device_resume - initiate device resume 5278 * 5279 * @dev: drm dev pointer 5280 * @notify_clients: notify in-kernel DRM clients 5281 * 5282 * Bring the hw back to operating state (all asics). 5283 * Returns 0 for success or an error on failure. 5284 * Called at driver resume. 5285 */ 5286 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5287 { 5288 struct amdgpu_device *adev = drm_to_adev(dev); 5289 int r = 0; 5290 5291 if (amdgpu_sriov_vf(adev)) { 5292 r = amdgpu_virt_request_full_gpu(adev, true); 5293 if (r) 5294 return r; 5295 } 5296 5297 if (amdgpu_virt_xgmi_migrate_enabled(adev)) { 5298 r = amdgpu_virt_resume(adev); 5299 if (r) 5300 goto exit; 5301 } 5302 5303 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5304 return 0; 5305 5306 if (adev->in_s0ix) 5307 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5308 5309 /* post card */ 5310 if (amdgpu_device_need_post(adev)) { 5311 r = amdgpu_device_asic_init(adev); 5312 if (r) 5313 dev_err(adev->dev, "amdgpu asic init failed\n"); 5314 } 5315 5316 r = amdgpu_device_ip_resume(adev); 5317 5318 if (r) { 5319 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5320 goto exit; 5321 } 5322 5323 r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5324 if (r) 5325 goto exit; 5326 5327 r = amdgpu_userq_resume(adev); 5328 if (r) 5329 goto exit; 5330 5331 r = amdgpu_device_ip_late_init(adev); 5332 if (r) 5333 goto exit; 5334 5335 queue_delayed_work(system_wq, &adev->delayed_init_work, 5336 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5337 exit: 5338 if (amdgpu_sriov_vf(adev)) { 5339 amdgpu_virt_init_data_exchange(adev); 5340 amdgpu_virt_release_full_gpu(adev, true); 5341 5342 if (!r && !adev->in_runpm) 5343 r = amdgpu_amdkfd_resume_process(adev); 5344 } 5345 5346 if (r) 5347 return r; 5348 5349 /* Make sure IB tests flushed */ 5350 flush_delayed_work(&adev->delayed_init_work); 5351 5352 if (notify_clients) 5353 drm_client_dev_resume(adev_to_drm(adev)); 5354 5355 amdgpu_ras_resume(adev); 5356 5357 if (adev->mode_info.num_crtc) { 5358 /* 5359 * Most of the connector probing functions try to acquire runtime pm 5360 * refs to ensure that the GPU is powered on when connector polling is 5361 * performed. Since we're calling this from a runtime PM callback, 5362 * trying to acquire rpm refs will cause us to deadlock. 5363 * 5364 * Since we're guaranteed to be holding the rpm lock, it's safe to 5365 * temporarily disable the rpm helpers so this doesn't deadlock us. 
5366 */ 5367 #ifdef CONFIG_PM 5368 dev->dev->power.disable_depth++; 5369 #endif 5370 if (!adev->dc_enabled) 5371 drm_helper_hpd_irq_event(dev); 5372 else 5373 drm_kms_helper_hotplug_event(dev); 5374 #ifdef CONFIG_PM 5375 dev->dev->power.disable_depth--; 5376 #endif 5377 } 5378 5379 amdgpu_vram_mgr_clear_reset_blocks(adev); 5380 adev->in_suspend = false; 5381 5382 if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0)) 5383 dev_warn(adev->dev, "smart shift update failed\n"); 5384 5385 return 0; 5386 } 5387 5388 /** 5389 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5390 * 5391 * @adev: amdgpu_device pointer 5392 * 5393 * The list of all the hardware IPs that make up the asic is walked and 5394 * the check_soft_reset callbacks are run. check_soft_reset determines 5395 * if the asic is still hung or not. 5396 * Returns true if any of the IPs are still in a hung state, false if not. 5397 */ 5398 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5399 { 5400 int i; 5401 bool asic_hang = false; 5402 5403 if (amdgpu_sriov_vf(adev)) 5404 return true; 5405 5406 if (amdgpu_asic_need_full_reset(adev)) 5407 return true; 5408 5409 for (i = 0; i < adev->num_ip_blocks; i++) { 5410 if (!adev->ip_blocks[i].status.valid) 5411 continue; 5412 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5413 adev->ip_blocks[i].status.hang = 5414 adev->ip_blocks[i].version->funcs->check_soft_reset( 5415 &adev->ip_blocks[i]); 5416 if (adev->ip_blocks[i].status.hang) { 5417 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5418 asic_hang = true; 5419 } 5420 } 5421 return asic_hang; 5422 } 5423 5424 /** 5425 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5426 * 5427 * @adev: amdgpu_device pointer 5428 * 5429 * The list of all the hardware IPs that make up the asic is walked and the 5430 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5431 * handles any IP specific hardware or software state changes that are 5432 * necessary for a soft reset to succeed. 5433 * Returns 0 on success, negative error code on failure. 5434 */ 5435 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5436 { 5437 int i, r = 0; 5438 5439 for (i = 0; i < adev->num_ip_blocks; i++) { 5440 if (!adev->ip_blocks[i].status.valid) 5441 continue; 5442 if (adev->ip_blocks[i].status.hang && 5443 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5444 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5445 if (r) 5446 return r; 5447 } 5448 } 5449 5450 return 0; 5451 } 5452 5453 /** 5454 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5455 * 5456 * @adev: amdgpu_device pointer 5457 * 5458 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5459 * reset is necessary to recover. 5460 * Returns true if a full asic reset is required, false if not. 
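 *
 * In practice, a hang in the GMC, SMC, ACP, DCE or PSP block forces a full
 * reset.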
5461 */ 5462 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5463 { 5464 int i; 5465 5466 if (amdgpu_asic_need_full_reset(adev)) 5467 return true; 5468 5469 for (i = 0; i < adev->num_ip_blocks; i++) { 5470 if (!adev->ip_blocks[i].status.valid) 5471 continue; 5472 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5473 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5474 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5475 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5476 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5477 if (adev->ip_blocks[i].status.hang) { 5478 dev_info(adev->dev, "Some block need full reset!\n"); 5479 return true; 5480 } 5481 } 5482 } 5483 return false; 5484 } 5485 5486 /** 5487 * amdgpu_device_ip_soft_reset - do a soft reset 5488 * 5489 * @adev: amdgpu_device pointer 5490 * 5491 * The list of all the hardware IPs that make up the asic is walked and the 5492 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5493 * IP specific hardware or software state changes that are necessary to soft 5494 * reset the IP. 5495 * Returns 0 on success, negative error code on failure. 5496 */ 5497 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5498 { 5499 int i, r = 0; 5500 5501 for (i = 0; i < adev->num_ip_blocks; i++) { 5502 if (!adev->ip_blocks[i].status.valid) 5503 continue; 5504 if (adev->ip_blocks[i].status.hang && 5505 adev->ip_blocks[i].version->funcs->soft_reset) { 5506 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5507 if (r) 5508 return r; 5509 } 5510 } 5511 5512 return 0; 5513 } 5514 5515 /** 5516 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5517 * 5518 * @adev: amdgpu_device pointer 5519 * 5520 * The list of all the hardware IPs that make up the asic is walked and the 5521 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5522 * handles any IP specific hardware or software state changes that are 5523 * necessary after the IP has been soft reset. 5524 * Returns 0 on success, negative error code on failure. 
5525 */
5526 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
5527 {
5528 	int i, r = 0;
5529 
5530 	for (i = 0; i < adev->num_ip_blocks; i++) {
5531 		if (!adev->ip_blocks[i].status.valid)
5532 			continue;
5533 		if (adev->ip_blocks[i].status.hang &&
5534 		    adev->ip_blocks[i].version->funcs->post_soft_reset)
5535 			r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]);
5536 		if (r)
5537 			return r;
5538 	}
5539 
5540 	return 0;
5541 }
5542 
5543 /**
5544  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5545  *
5546  * @adev: amdgpu_device pointer
5547  * @reset_context: amdgpu reset context pointer
5548  *
5549  * Do a VF FLR and reinitialize the ASIC.
5550  * Returns 0 on success, negative error code on failure.
5551  */
5552 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
5553 				     struct amdgpu_reset_context *reset_context)
5554 {
5555 	int r;
5556 	struct amdgpu_hive_info *hive = NULL;
5557 
5558 	if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
5559 		if (!amdgpu_ras_get_fed_status(adev))
5560 			amdgpu_virt_ready_to_reset(adev);
5561 		amdgpu_virt_wait_reset(adev);
5562 		clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5563 		r = amdgpu_virt_request_full_gpu(adev, true);
5564 	} else {
5565 		r = amdgpu_virt_reset_gpu(adev);
5566 	}
5567 	if (r)
5568 		return r;
5569 
5570 	amdgpu_ras_clear_err_state(adev);
5571 	amdgpu_irq_gpu_reset_resume_helper(adev);
5572 
5573 	/* some SW cleanup the VF needs to do before recovery */
5574 	amdgpu_virt_post_reset(adev);
5575 
5576 	/* Resume IPs prior to SMC */
5577 	r = amdgpu_device_ip_reinit_early_sriov(adev);
5578 	if (r)
5579 		return r;
5580 
5581 	amdgpu_virt_init_data_exchange(adev);
5582 
5583 	r = amdgpu_device_fw_loading(adev);
5584 	if (r)
5585 		return r;
5586 
5587 	/* now we are okay to resume SMC/CP/SDMA */
5588 	r = amdgpu_device_ip_reinit_late_sriov(adev);
5589 	if (r)
5590 		return r;
5591 
5592 	hive = amdgpu_get_xgmi_hive(adev);
5593 	/* Update the PSP FW topology after reset */
5594 	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
5595 		r = amdgpu_xgmi_update_topology(hive, adev);
5596 	if (hive)
5597 		amdgpu_put_xgmi_hive(hive);
5598 	if (r)
5599 		return r;
5600 
5601 	r = amdgpu_ib_ring_tests(adev);
5602 	if (r)
5603 		return r;
5604 
5605 	if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
5606 		amdgpu_inc_vram_lost(adev);
5607 
5608 	/* needs to be called during full access, so we can't do it later like
5609 	 * bare-metal does.
5610 	 */
5611 	amdgpu_amdkfd_post_reset(adev);
5612 	amdgpu_virt_release_full_gpu(adev, true);
5613 
5614 	/* Aldebaran and gfx_11_0_3 support RAS in SR-IOV, so RAS needs to be resumed during reset */
5615 	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
5616 	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
5617 	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
5618 	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
5619 	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
5620 		amdgpu_ras_resume(adev);
5621 
5622 	amdgpu_virt_ras_telemetry_post_reset(adev);
5623 
5624 	return 0;
5625 }
5626 
5627 /**
5628  * amdgpu_device_has_job_running - check if there is any unfinished job
5629  *
5630  * @adev: amdgpu_device pointer
5631  *
5632  * Check if there is any job running on the device when the guest driver
5633  * receives an FLR notification from the host driver. If there are still jobs
5634  * running, then the guest driver will not respond to the FLR reset. Instead,
5635  * let the jobs hit the timeout and the guest driver will then issue the reset request.
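 *
 * Implemented by checking every ring for fences that have been emitted but
 * not yet signalled.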
5636 */ 5637 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5638 { 5639 int i; 5640 5641 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5642 struct amdgpu_ring *ring = adev->rings[i]; 5643 5644 if (!amdgpu_ring_sched_ready(ring)) 5645 continue; 5646 5647 if (amdgpu_fence_count_emitted(ring)) 5648 return true; 5649 } 5650 return false; 5651 } 5652 5653 /** 5654 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5655 * 5656 * @adev: amdgpu_device pointer 5657 * 5658 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5659 * a hung GPU. 5660 */ 5661 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5662 { 5663 5664 if (amdgpu_gpu_recovery == 0) 5665 goto disabled; 5666 5667 /* Skip soft reset check in fatal error mode */ 5668 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5669 return true; 5670 5671 if (amdgpu_sriov_vf(adev)) 5672 return true; 5673 5674 if (amdgpu_gpu_recovery == -1) { 5675 switch (adev->asic_type) { 5676 #ifdef CONFIG_DRM_AMDGPU_SI 5677 case CHIP_VERDE: 5678 case CHIP_TAHITI: 5679 case CHIP_PITCAIRN: 5680 case CHIP_OLAND: 5681 case CHIP_HAINAN: 5682 #endif 5683 #ifdef CONFIG_DRM_AMDGPU_CIK 5684 case CHIP_KAVERI: 5685 case CHIP_KABINI: 5686 case CHIP_MULLINS: 5687 #endif 5688 case CHIP_CARRIZO: 5689 case CHIP_STONEY: 5690 case CHIP_CYAN_SKILLFISH: 5691 goto disabled; 5692 default: 5693 break; 5694 } 5695 } 5696 5697 return true; 5698 5699 disabled: 5700 dev_info(adev->dev, "GPU recovery disabled.\n"); 5701 return false; 5702 } 5703 5704 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5705 { 5706 u32 i; 5707 int ret = 0; 5708 5709 if (adev->bios) 5710 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5711 5712 dev_info(adev->dev, "GPU mode1 reset\n"); 5713 5714 /* Cache the state before bus master disable. The saved config space 5715 * values are used in other cases like restore after mode-2 reset. 
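	 * amdgpu_device_load_pci_state() below restores it once the reset
	 * completes.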
5716 */ 5717 amdgpu_device_cache_pci_state(adev->pdev); 5718 5719 /* disable BM */ 5720 pci_clear_master(adev->pdev); 5721 5722 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5723 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5724 ret = amdgpu_dpm_mode1_reset(adev); 5725 } else { 5726 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5727 ret = psp_gpu_reset(adev); 5728 } 5729 5730 if (ret) 5731 goto mode1_reset_failed; 5732 5733 /* enable mmio access after mode 1 reset completed */ 5734 adev->no_hw_access = false; 5735 5736 /* ensure no_hw_access is updated before we access hw */ 5737 smp_mb(); 5738 5739 amdgpu_device_load_pci_state(adev->pdev); 5740 ret = amdgpu_psp_wait_for_bootloader(adev); 5741 if (ret) 5742 goto mode1_reset_failed; 5743 5744 /* wait for asic to come out of reset */ 5745 for (i = 0; i < adev->usec_timeout; i++) { 5746 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5747 5748 if (memsize != 0xffffffff) 5749 break; 5750 udelay(1); 5751 } 5752 5753 if (i >= adev->usec_timeout) { 5754 ret = -ETIMEDOUT; 5755 goto mode1_reset_failed; 5756 } 5757 5758 if (adev->bios) 5759 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5760 5761 return 0; 5762 5763 mode1_reset_failed: 5764 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5765 return ret; 5766 } 5767 5768 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5769 { 5770 int ret = 0; 5771 5772 dev_info(adev->dev, "GPU link reset\n"); 5773 5774 if (!amdgpu_reset_in_dpc(adev)) 5775 ret = amdgpu_dpm_link_reset(adev); 5776 5777 if (ret) 5778 goto link_reset_failed; 5779 5780 ret = amdgpu_psp_wait_for_bootloader(adev); 5781 if (ret) 5782 goto link_reset_failed; 5783 5784 return 0; 5785 5786 link_reset_failed: 5787 dev_err(adev->dev, "GPU link reset failed\n"); 5788 return ret; 5789 } 5790 5791 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5792 struct amdgpu_reset_context *reset_context) 5793 { 5794 int i, r = 0; 5795 struct amdgpu_job *job = NULL; 5796 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5797 bool need_full_reset = 5798 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5799 5800 if (reset_context->reset_req_dev == adev) 5801 job = reset_context->job; 5802 5803 if (amdgpu_sriov_vf(adev)) 5804 amdgpu_virt_pre_reset(adev); 5805 5806 amdgpu_fence_driver_isr_toggle(adev, true); 5807 5808 /* block all schedulers and reset given job's ring */ 5809 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5810 struct amdgpu_ring *ring = adev->rings[i]; 5811 5812 if (!amdgpu_ring_sched_ready(ring)) 5813 continue; 5814 5815 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5816 amdgpu_fence_driver_force_completion(ring); 5817 } 5818 5819 amdgpu_fence_driver_isr_toggle(adev, false); 5820 5821 if (job && job->vm) 5822 drm_sched_increase_karma(&job->base); 5823 5824 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5825 /* If reset handler not implemented, continue; otherwise return */ 5826 if (r == -EOPNOTSUPP) 5827 r = 0; 5828 else 5829 return r; 5830 5831 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5832 if (!amdgpu_sriov_vf(adev)) { 5833 5834 if (!need_full_reset) 5835 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5836 5837 if (!need_full_reset && amdgpu_gpu_recovery && 5838 amdgpu_device_ip_check_soft_reset(adev)) { 5839 amdgpu_device_ip_pre_soft_reset(adev); 5840 r = amdgpu_device_ip_soft_reset(adev); 5841 amdgpu_device_ip_post_soft_reset(adev); 5842 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5843 
dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5844 need_full_reset = true; 5845 } 5846 } 5847 5848 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5849 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5850 /* Trigger ip dump before we reset the asic */ 5851 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5852 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5853 tmp_adev->ip_blocks[i].version->funcs 5854 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5855 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5856 } 5857 5858 if (need_full_reset) 5859 r = amdgpu_device_ip_suspend(adev); 5860 if (need_full_reset) 5861 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5862 else 5863 clear_bit(AMDGPU_NEED_FULL_RESET, 5864 &reset_context->flags); 5865 } 5866 5867 return r; 5868 } 5869 5870 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5871 { 5872 struct list_head *device_list_handle; 5873 bool full_reset, vram_lost = false; 5874 struct amdgpu_device *tmp_adev; 5875 int r, init_level; 5876 5877 device_list_handle = reset_context->reset_device_list; 5878 5879 if (!device_list_handle) 5880 return -EINVAL; 5881 5882 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5883 5884 /** 5885 * If it's reset on init, it's default init level, otherwise keep level 5886 * as recovery level. 5887 */ 5888 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5889 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5890 else 5891 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5892 5893 r = 0; 5894 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5895 amdgpu_set_init_level(tmp_adev, init_level); 5896 if (full_reset) { 5897 /* post card */ 5898 amdgpu_reset_set_dpc_status(tmp_adev, false); 5899 amdgpu_ras_clear_err_state(tmp_adev); 5900 r = amdgpu_device_asic_init(tmp_adev); 5901 if (r) { 5902 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5903 } else { 5904 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5905 5906 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5907 if (r) 5908 goto out; 5909 5910 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5911 5912 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5913 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5914 5915 if (vram_lost) { 5916 dev_info( 5917 tmp_adev->dev, 5918 "VRAM is lost due to GPU reset!\n"); 5919 amdgpu_inc_vram_lost(tmp_adev); 5920 } 5921 5922 r = amdgpu_device_fw_loading(tmp_adev); 5923 if (r) 5924 return r; 5925 5926 r = amdgpu_xcp_restore_partition_mode( 5927 tmp_adev->xcp_mgr); 5928 if (r) 5929 goto out; 5930 5931 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5932 if (r) 5933 goto out; 5934 5935 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5936 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5937 5938 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5939 if (r) 5940 goto out; 5941 5942 if (vram_lost) 5943 amdgpu_device_fill_reset_magic(tmp_adev); 5944 5945 /* 5946 * Add this ASIC as tracked as reset was already 5947 * complete successfully. 
5948 */ 5949 amdgpu_register_gpu_instance(tmp_adev); 5950 5951 if (!reset_context->hive && 5952 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5953 amdgpu_xgmi_add_device(tmp_adev); 5954 5955 r = amdgpu_device_ip_late_init(tmp_adev); 5956 if (r) 5957 goto out; 5958 5959 r = amdgpu_userq_post_reset(tmp_adev, vram_lost); 5960 if (r) 5961 goto out; 5962 5963 drm_client_dev_resume(adev_to_drm(tmp_adev)); 5964 5965 /* 5966 * The GPU enters bad state once faulty pages 5967 * by ECC has reached the threshold, and ras 5968 * recovery is scheduled next. So add one check 5969 * here to break recovery if it indeed exceeds 5970 * bad page threshold, and remind user to 5971 * retire this GPU or setting one bigger 5972 * bad_page_threshold value to fix this once 5973 * probing driver again. 5974 */ 5975 if (!amdgpu_ras_is_rma(tmp_adev)) { 5976 /* must succeed. */ 5977 amdgpu_ras_resume(tmp_adev); 5978 } else { 5979 r = -EINVAL; 5980 goto out; 5981 } 5982 5983 /* Update PSP FW topology after reset */ 5984 if (reset_context->hive && 5985 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5986 r = amdgpu_xgmi_update_topology( 5987 reset_context->hive, tmp_adev); 5988 } 5989 } 5990 5991 out: 5992 if (!r) { 5993 /* IP init is complete now, set level as default */ 5994 amdgpu_set_init_level(tmp_adev, 5995 AMDGPU_INIT_LEVEL_DEFAULT); 5996 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5997 r = amdgpu_ib_ring_tests(tmp_adev); 5998 if (r) { 5999 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 6000 r = -EAGAIN; 6001 goto end; 6002 } 6003 } 6004 6005 if (r) 6006 tmp_adev->asic_reset_res = r; 6007 } 6008 6009 end: 6010 return r; 6011 } 6012 6013 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 6014 struct amdgpu_reset_context *reset_context) 6015 { 6016 struct amdgpu_device *tmp_adev = NULL; 6017 bool need_full_reset, skip_hw_reset; 6018 int r = 0; 6019 6020 /* Try reset handler method first */ 6021 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6022 reset_list); 6023 6024 reset_context->reset_device_list = device_list_handle; 6025 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 6026 /* If reset handler not implemented, continue; otherwise return */ 6027 if (r == -EOPNOTSUPP) 6028 r = 0; 6029 else 6030 return r; 6031 6032 /* Reset handler not implemented, use the default method */ 6033 need_full_reset = 6034 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6035 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 6036 6037 /* 6038 * ASIC reset has to be done on all XGMI hive nodes ASAP 6039 * to allow proper links negotiation in FW (within 1 sec) 6040 */ 6041 if (!skip_hw_reset && need_full_reset) { 6042 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6043 /* For XGMI run all resets in parallel to speed up the process */ 6044 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 6045 if (!queue_work(system_unbound_wq, 6046 &tmp_adev->xgmi_reset_work)) 6047 r = -EALREADY; 6048 } else 6049 r = amdgpu_asic_reset(tmp_adev); 6050 6051 if (r) { 6052 dev_err(tmp_adev->dev, 6053 "ASIC reset failed with error, %d for drm dev, %s", 6054 r, adev_to_drm(tmp_adev)->unique); 6055 goto out; 6056 } 6057 } 6058 6059 /* For XGMI wait for all resets to complete before proceed */ 6060 if (!r) { 6061 list_for_each_entry(tmp_adev, device_list_handle, 6062 reset_list) { 6063 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 6064 flush_work(&tmp_adev->xgmi_reset_work); 6065 r = tmp_adev->asic_reset_res; 6066 if (r) 6067 break; 6068 } 6069 } 6070 } 6071 } 6072 
6073 if (!r && amdgpu_ras_intr_triggered()) { 6074 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6075 amdgpu_ras_reset_error_count(tmp_adev, 6076 AMDGPU_RAS_BLOCK__MMHUB); 6077 } 6078 6079 amdgpu_ras_intr_cleared(); 6080 } 6081 6082 r = amdgpu_device_reinit_after_reset(reset_context); 6083 if (r == -EAGAIN) 6084 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6085 else 6086 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6087 6088 out: 6089 return r; 6090 } 6091 6092 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 6093 { 6094 6095 switch (amdgpu_asic_reset_method(adev)) { 6096 case AMD_RESET_METHOD_MODE1: 6097 case AMD_RESET_METHOD_LINK: 6098 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 6099 break; 6100 case AMD_RESET_METHOD_MODE2: 6101 adev->mp1_state = PP_MP1_STATE_RESET; 6102 break; 6103 default: 6104 adev->mp1_state = PP_MP1_STATE_NONE; 6105 break; 6106 } 6107 } 6108 6109 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 6110 { 6111 amdgpu_vf_error_trans_all(adev); 6112 adev->mp1_state = PP_MP1_STATE_NONE; 6113 } 6114 6115 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 6116 { 6117 struct pci_dev *p = NULL; 6118 6119 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6120 adev->pdev->bus->number, 1); 6121 if (p) { 6122 pm_runtime_enable(&(p->dev)); 6123 pm_runtime_resume(&(p->dev)); 6124 } 6125 6126 pci_dev_put(p); 6127 } 6128 6129 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 6130 { 6131 enum amd_reset_method reset_method; 6132 struct pci_dev *p = NULL; 6133 u64 expires; 6134 6135 /* 6136 * For now, only BACO and mode1 reset are confirmed 6137 * to suffer the audio issue without proper suspended. 6138 */ 6139 reset_method = amdgpu_asic_reset_method(adev); 6140 if ((reset_method != AMD_RESET_METHOD_BACO) && 6141 (reset_method != AMD_RESET_METHOD_MODE1)) 6142 return -EINVAL; 6143 6144 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6145 adev->pdev->bus->number, 1); 6146 if (!p) 6147 return -ENODEV; 6148 6149 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 6150 if (!expires) 6151 /* 6152 * If we cannot get the audio device autosuspend delay, 6153 * a fixed 4S interval will be used. Considering 3S is 6154 * the audio controller default autosuspend delay setting. 6155 * 4S used here is guaranteed to cover that. 6156 */ 6157 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 6158 6159 while (!pm_runtime_status_suspended(&(p->dev))) { 6160 if (!pm_runtime_suspend(&(p->dev))) 6161 break; 6162 6163 if (expires < ktime_get_mono_fast_ns()) { 6164 dev_warn(adev->dev, "failed to suspend display audio\n"); 6165 pci_dev_put(p); 6166 /* TODO: abort the succeeding gpu reset? 
*/ 6167 return -ETIMEDOUT; 6168 } 6169 } 6170 6171 pm_runtime_disable(&(p->dev)); 6172 6173 pci_dev_put(p); 6174 return 0; 6175 } 6176 6177 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 6178 { 6179 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 6180 6181 #if defined(CONFIG_DEBUG_FS) 6182 if (!amdgpu_sriov_vf(adev)) 6183 cancel_work(&adev->reset_work); 6184 #endif 6185 cancel_work(&adev->userq_reset_work); 6186 6187 if (adev->kfd.dev) 6188 cancel_work(&adev->kfd.reset_work); 6189 6190 if (amdgpu_sriov_vf(adev)) 6191 cancel_work(&adev->virt.flr_work); 6192 6193 if (con && adev->ras_enabled) 6194 cancel_work(&con->recovery_work); 6195 6196 } 6197 6198 static int amdgpu_device_health_check(struct list_head *device_list_handle) 6199 { 6200 struct amdgpu_device *tmp_adev; 6201 int ret = 0; 6202 6203 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6204 ret |= amdgpu_device_bus_status_check(tmp_adev); 6205 } 6206 6207 return ret; 6208 } 6209 6210 static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev, 6211 struct list_head *device_list, 6212 struct amdgpu_hive_info *hive) 6213 { 6214 struct amdgpu_device *tmp_adev = NULL; 6215 6216 /* 6217 * Build list of devices to reset. 6218 * In case we are in XGMI hive mode, resort the device list 6219 * to put adev in the 1st position. 6220 */ 6221 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 6222 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6223 list_add_tail(&tmp_adev->reset_list, device_list); 6224 if (adev->shutdown) 6225 tmp_adev->shutdown = true; 6226 if (amdgpu_reset_in_dpc(adev)) 6227 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6228 } 6229 if (!list_is_first(&adev->reset_list, device_list)) 6230 list_rotate_to_front(&adev->reset_list, device_list); 6231 } else { 6232 list_add_tail(&adev->reset_list, device_list); 6233 } 6234 } 6235 6236 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev, 6237 struct list_head *device_list) 6238 { 6239 struct amdgpu_device *tmp_adev = NULL; 6240 6241 if (list_empty(device_list)) 6242 return; 6243 tmp_adev = 6244 list_first_entry(device_list, struct amdgpu_device, reset_list); 6245 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6246 } 6247 6248 static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev, 6249 struct list_head *device_list) 6250 { 6251 struct amdgpu_device *tmp_adev = NULL; 6252 6253 if (list_empty(device_list)) 6254 return; 6255 tmp_adev = 6256 list_first_entry(device_list, struct amdgpu_device, reset_list); 6257 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6258 } 6259 6260 static void amdgpu_device_halt_activities(struct amdgpu_device *adev, 6261 struct amdgpu_job *job, 6262 struct amdgpu_reset_context *reset_context, 6263 struct list_head *device_list, 6264 struct amdgpu_hive_info *hive, 6265 bool need_emergency_restart) 6266 { 6267 struct amdgpu_device *tmp_adev = NULL; 6268 int i; 6269 6270 /* block all schedulers and reset given job's ring */ 6271 list_for_each_entry(tmp_adev, device_list, reset_list) { 6272 amdgpu_device_set_mp1_state(tmp_adev); 6273 6274 /* 6275 * Try to put the audio codec into suspend state 6276 * before gpu reset started. 6277 * 6278 * Due to the power domain of the graphics device 6279 * is shared with AZ power domain. Without this, 6280 * we may change the audio hardware from behind 6281 * the audio driver's back. That will trigger 6282 * some audio codec errors. 
6283 */ 6284 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6285 tmp_adev->pcie_reset_ctx.audio_suspended = true; 6286 6287 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6288 6289 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6290 6291 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6292 6293 /* 6294 * Mark these ASICs to be reset as untracked first 6295 * And add them back after reset completed 6296 */ 6297 amdgpu_unregister_gpu_instance(tmp_adev); 6298 6299 drm_client_dev_suspend(adev_to_drm(tmp_adev)); 6300 6301 /* disable ras on ALL IPs */ 6302 if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) && 6303 amdgpu_device_ip_need_full_reset(tmp_adev)) 6304 amdgpu_ras_suspend(tmp_adev); 6305 6306 amdgpu_userq_pre_reset(tmp_adev); 6307 6308 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6309 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6310 6311 if (!amdgpu_ring_sched_ready(ring)) 6312 continue; 6313 6314 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 6315 6316 if (need_emergency_restart) 6317 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6318 } 6319 atomic_inc(&tmp_adev->gpu_reset_counter); 6320 } 6321 } 6322 6323 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6324 struct list_head *device_list, 6325 struct amdgpu_reset_context *reset_context) 6326 { 6327 struct amdgpu_device *tmp_adev = NULL; 6328 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6329 int r = 0; 6330 6331 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6332 list_for_each_entry(tmp_adev, device_list, reset_list) { 6333 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6334 /*TODO Should we stop ?*/ 6335 if (r) { 6336 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6337 r, adev_to_drm(tmp_adev)->unique); 6338 tmp_adev->asic_reset_res = r; 6339 } 6340 } 6341 6342 /* Actual ASIC resets if needed.*/ 6343 /* Host driver will handle XGMI hive reset for SRIOV */ 6344 if (amdgpu_sriov_vf(adev)) { 6345 6346 /* Bail out of reset early */ 6347 if (amdgpu_ras_is_rma(adev)) 6348 return -ENODEV; 6349 6350 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6351 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6352 amdgpu_ras_set_fed(adev, true); 6353 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6354 } 6355 6356 r = amdgpu_device_reset_sriov(adev, reset_context); 6357 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6358 amdgpu_virt_release_full_gpu(adev, true); 6359 goto retry; 6360 } 6361 if (r) 6362 adev->asic_reset_res = r; 6363 } else { 6364 r = amdgpu_do_asic_reset(device_list, reset_context); 6365 if (r && r == -EAGAIN) 6366 goto retry; 6367 } 6368 6369 list_for_each_entry(tmp_adev, device_list, reset_list) { 6370 /* 6371 * Drop any pending non scheduler resets queued before reset is done. 6372 * Any reset scheduled after this point would be valid. Scheduler resets 6373 * were already dropped during drm_sched_stop and no new ones can come 6374 * in before drm_sched_start. 
6375 */ 6376 amdgpu_device_stop_pending_resets(tmp_adev); 6377 } 6378 6379 return r; 6380 } 6381 6382 static int amdgpu_device_sched_resume(struct list_head *device_list, 6383 struct amdgpu_reset_context *reset_context, 6384 bool job_signaled) 6385 { 6386 struct amdgpu_device *tmp_adev = NULL; 6387 int i, r = 0; 6388 6389 /* Post ASIC reset for all devs .*/ 6390 list_for_each_entry(tmp_adev, device_list, reset_list) { 6391 6392 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6393 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6394 6395 if (!amdgpu_ring_sched_ready(ring)) 6396 continue; 6397 6398 drm_sched_start(&ring->sched, 0); 6399 } 6400 6401 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6402 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6403 6404 if (tmp_adev->asic_reset_res) { 6405 /* bad news, how to tell it to userspace ? 6406 * for ras error, we should report GPU bad status instead of 6407 * reset failure 6408 */ 6409 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6410 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6411 dev_info( 6412 tmp_adev->dev, 6413 "GPU reset(%d) failed with error %d\n", 6414 atomic_read( 6415 &tmp_adev->gpu_reset_counter), 6416 tmp_adev->asic_reset_res); 6417 amdgpu_vf_error_put(tmp_adev, 6418 AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, 6419 tmp_adev->asic_reset_res); 6420 if (!r) 6421 r = tmp_adev->asic_reset_res; 6422 tmp_adev->asic_reset_res = 0; 6423 } else { 6424 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", 6425 atomic_read(&tmp_adev->gpu_reset_counter)); 6426 if (amdgpu_acpi_smart_shift_update(tmp_adev, 6427 AMDGPU_SS_DEV_D0)) 6428 dev_warn(tmp_adev->dev, 6429 "smart shift update failed\n"); 6430 } 6431 } 6432 6433 return r; 6434 } 6435 6436 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, 6437 struct list_head *device_list, 6438 bool need_emergency_restart) 6439 { 6440 struct amdgpu_device *tmp_adev = NULL; 6441 6442 list_for_each_entry(tmp_adev, device_list, reset_list) { 6443 /* unlock kfd: SRIOV would do it separately */ 6444 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6445 amdgpu_amdkfd_post_reset(tmp_adev); 6446 6447 /* kfd_post_reset will do nothing if kfd device is not initialized, 6448 * need to bring up kfd here if it's not be initialized before 6449 */ 6450 if (!adev->kfd.init_complete) 6451 amdgpu_amdkfd_device_init(adev); 6452 6453 if (tmp_adev->pcie_reset_ctx.audio_suspended) 6454 amdgpu_device_resume_display_audio(tmp_adev); 6455 6456 amdgpu_device_unset_mp1_state(tmp_adev); 6457 6458 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6459 6460 } 6461 } 6462 6463 6464 /** 6465 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 6466 * 6467 * @adev: amdgpu_device pointer 6468 * @job: which job trigger hang 6469 * @reset_context: amdgpu reset context pointer 6470 * 6471 * Attempt to reset the GPU if it has hung (all asics). 6472 * Attempt to do soft-reset or full-reset and reinitialize Asic 6473 * Returns 0 for success or an error on failure. 6474 */ 6475 6476 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 6477 struct amdgpu_job *job, 6478 struct amdgpu_reset_context *reset_context) 6479 { 6480 struct list_head device_list; 6481 bool job_signaled = false; 6482 struct amdgpu_hive_info *hive = NULL; 6483 int r = 0; 6484 bool need_emergency_restart = false; 6485 /* save the pasid here as the job may be freed before the end of the reset */ 6486 int pasid = job ? 
job->pasid : -EINVAL; 6487 6488 /* 6489 * If it reaches here because of hang/timeout and a RAS error is 6490 * detected at the same time, let RAS recovery take care of it. 6491 */ 6492 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 6493 !amdgpu_sriov_vf(adev) && 6494 reset_context->src != AMDGPU_RESET_SRC_RAS) { 6495 dev_dbg(adev->dev, 6496 "Gpu recovery from source: %d yielding to RAS error recovery handling", 6497 reset_context->src); 6498 return 0; 6499 } 6500 6501 /* 6502 * Special case: RAS triggered and full reset isn't supported 6503 */ 6504 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 6505 6506 /* 6507 * Flush RAM to disk so that after reboot 6508 * the user can read log and see why the system rebooted. 6509 */ 6510 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 6511 amdgpu_ras_get_context(adev)->reboot) { 6512 dev_warn(adev->dev, "Emergency reboot."); 6513 6514 ksys_sync_helper(); 6515 emergency_restart(); 6516 } 6517 6518 dev_info(adev->dev, "GPU %s begin!. Source: %d\n", 6519 need_emergency_restart ? "jobs stop" : "reset", 6520 reset_context->src); 6521 6522 if (!amdgpu_sriov_vf(adev)) 6523 hive = amdgpu_get_xgmi_hive(adev); 6524 if (hive) 6525 mutex_lock(&hive->hive_lock); 6526 6527 reset_context->job = job; 6528 reset_context->hive = hive; 6529 INIT_LIST_HEAD(&device_list); 6530 6531 amdgpu_device_recovery_prepare(adev, &device_list, hive); 6532 6533 if (!amdgpu_sriov_vf(adev)) { 6534 r = amdgpu_device_health_check(&device_list); 6535 if (r) 6536 goto end_reset; 6537 } 6538 6539 /* Cannot be called after locking reset domain */ 6540 amdgpu_ras_pre_reset(adev, &device_list); 6541 6542 /* We need to lock reset domain only once both for XGMI and single device */ 6543 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6544 6545 amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 6546 hive, need_emergency_restart); 6547 if (need_emergency_restart) 6548 goto skip_sched_resume; 6549 /* 6550 * Must check guilty signal here since after this point all old 6551 * HW fences are force signaled. 6552 * 6553 * job->base holds a reference to parent fence 6554 */ 6555 if (job && (dma_fence_get_status(&job->hw_fence->base) > 0)) { 6556 job_signaled = true; 6557 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6558 goto skip_hw_reset; 6559 } 6560 6561 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 6562 if (r) 6563 goto reset_unlock; 6564 skip_hw_reset: 6565 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 6566 if (r) 6567 goto reset_unlock; 6568 skip_sched_resume: 6569 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 6570 reset_unlock: 6571 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6572 amdgpu_ras_post_reset(adev, &device_list); 6573 end_reset: 6574 if (hive) { 6575 mutex_unlock(&hive->hive_lock); 6576 amdgpu_put_xgmi_hive(hive); 6577 } 6578 6579 if (r) 6580 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6581 6582 atomic_set(&adev->reset_domain->reset_res, r); 6583 6584 if (!r) { 6585 struct amdgpu_task_info *ti = NULL; 6586 6587 /* 6588 * The job may already be freed at this point via the sched tdr workqueue so 6589 * use the cached pasid. 6590 */ 6591 if (pasid >= 0) 6592 ti = amdgpu_vm_get_task_info_pasid(adev, pasid); 6593 6594 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, 6595 ti ? 
&ti->task : NULL); 6596 6597 amdgpu_vm_put_task_info(ti); 6598 } 6599 6600 return r; 6601 } 6602 6603 /** 6604 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6605 * 6606 * @adev: amdgpu_device pointer 6607 * @speed: pointer to the speed of the link 6608 * @width: pointer to the width of the link 6609 * 6610 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6611 * first physical partner to an AMD dGPU. 6612 * This will exclude any virtual switches and links. 6613 */ 6614 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 6615 enum pci_bus_speed *speed, 6616 enum pcie_link_width *width) 6617 { 6618 struct pci_dev *parent = adev->pdev; 6619 6620 if (!speed || !width) 6621 return; 6622 6623 *speed = PCI_SPEED_UNKNOWN; 6624 *width = PCIE_LNK_WIDTH_UNKNOWN; 6625 6626 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 6627 while ((parent = pci_upstream_bridge(parent))) { 6628 /* skip upstream/downstream switches internal to dGPU*/ 6629 if (parent->vendor == PCI_VENDOR_ID_ATI) 6630 continue; 6631 *speed = pcie_get_speed_cap(parent); 6632 *width = pcie_get_width_cap(parent); 6633 break; 6634 } 6635 } else { 6636 /* use the current speeds rather than max if switching is not supported */ 6637 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 6638 } 6639 } 6640 6641 /** 6642 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU 6643 * 6644 * @adev: amdgpu_device pointer 6645 * @speed: pointer to the speed of the link 6646 * @width: pointer to the width of the link 6647 * 6648 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6649 * AMD dGPU which may be a virtual upstream bridge. 6650 */ 6651 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev, 6652 enum pci_bus_speed *speed, 6653 enum pcie_link_width *width) 6654 { 6655 struct pci_dev *parent = adev->pdev; 6656 6657 if (!speed || !width) 6658 return; 6659 6660 parent = pci_upstream_bridge(parent); 6661 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) { 6662 /* use the upstream/downstream switches internal to dGPU */ 6663 *speed = pcie_get_speed_cap(parent); 6664 *width = pcie_get_width_cap(parent); 6665 while ((parent = pci_upstream_bridge(parent))) { 6666 if (parent->vendor == PCI_VENDOR_ID_ATI) { 6667 /* use the upstream/downstream switches internal to dGPU */ 6668 *speed = pcie_get_speed_cap(parent); 6669 *width = pcie_get_width_cap(parent); 6670 } 6671 } 6672 } else { 6673 /* use the device itself */ 6674 *speed = pcie_get_speed_cap(adev->pdev); 6675 *width = pcie_get_width_cap(adev->pdev); 6676 } 6677 } 6678 6679 /** 6680 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 6681 * 6682 * @adev: amdgpu_device pointer 6683 * 6684 * Fetches and stores in the driver the PCIE capabilities (gen speed 6685 * and lanes) of the slot the device is in. Handles APUs and 6686 * virtualized environments where PCIE config space may not be available. 
6687 */ 6688 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6689 { 6690 enum pci_bus_speed speed_cap, platform_speed_cap; 6691 enum pcie_link_width platform_link_width, link_width; 6692 6693 if (amdgpu_pcie_gen_cap) 6694 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6695 6696 if (amdgpu_pcie_lane_cap) 6697 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6698 6699 /* covers APUs as well */ 6700 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6701 if (adev->pm.pcie_gen_mask == 0) 6702 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6703 if (adev->pm.pcie_mlw_mask == 0) 6704 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6705 return; 6706 } 6707 6708 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6709 return; 6710 6711 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6712 &platform_link_width); 6713 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6714 6715 if (adev->pm.pcie_gen_mask == 0) { 6716 /* asic caps */ 6717 if (speed_cap == PCI_SPEED_UNKNOWN) { 6718 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6719 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6720 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6721 } else { 6722 if (speed_cap == PCIE_SPEED_32_0GT) 6723 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6724 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6725 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6726 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6727 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6728 else if (speed_cap == PCIE_SPEED_16_0GT) 6729 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6730 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6731 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6732 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6733 else if (speed_cap == PCIE_SPEED_8_0GT) 6734 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6735 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6736 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6737 else if (speed_cap == PCIE_SPEED_5_0GT) 6738 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6739 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6740 else 6741 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6742 } 6743 /* platform caps */ 6744 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6745 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6746 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6747 } else { 6748 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6749 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6750 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6751 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6752 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6753 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6754 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6755 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6756 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6757 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6758 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6759 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6760 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6761 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6762 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6763 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6764 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6765 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6766 else 6767 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6768 6769 } 6770 } 6771 if (adev->pm.pcie_mlw_mask == 0) { 6772 /* asic caps */ 6773 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6774 
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6775 } else { 6776 switch (link_width) { 6777 case PCIE_LNK_X32: 6778 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6779 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6780 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6781 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6782 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6783 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6784 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6785 break; 6786 case PCIE_LNK_X16: 6787 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6788 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6789 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6790 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6791 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6792 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6793 break; 6794 case PCIE_LNK_X12: 6795 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6796 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6797 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6798 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6799 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6800 break; 6801 case PCIE_LNK_X8: 6802 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6803 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6804 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6805 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6806 break; 6807 case PCIE_LNK_X4: 6808 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6809 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6810 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6811 break; 6812 case PCIE_LNK_X2: 6813 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6814 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6815 break; 6816 case PCIE_LNK_X1: 6817 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6818 break; 6819 default: 6820 break; 6821 } 6822 } 6823 /* platform caps */ 6824 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6825 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6826 } else { 6827 switch (platform_link_width) { 6828 case PCIE_LNK_X32: 6829 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6830 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6831 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6832 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6833 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6834 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6835 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6836 break; 6837 case PCIE_LNK_X16: 6838 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6839 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6840 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6841 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6842 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6843 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6844 break; 6845 case PCIE_LNK_X12: 6846 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6847 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6848 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6849 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6850 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6851 break; 6852 case PCIE_LNK_X8: 6853 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6854 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6855 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6856 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6857 break; 6858 case PCIE_LNK_X4: 6859 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6860 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6861 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6862 break; 6863 case PCIE_LNK_X2: 6864 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6865 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6866 break; 6867 case PCIE_LNK_X1: 6868 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6869 break; 6870 
default: 6871 break; 6872 } 6873 } 6874 } 6875 } 6876 6877 /** 6878 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6879 * 6880 * @adev: amdgpu_device pointer 6881 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6882 * 6883 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6884 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6885 * @peer_adev. 6886 */ 6887 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6888 struct amdgpu_device *peer_adev) 6889 { 6890 #ifdef CONFIG_HSA_AMD_P2P 6891 bool p2p_access = 6892 !adev->gmc.xgmi.connected_to_cpu && 6893 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6894 if (!p2p_access) 6895 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6896 pci_name(peer_adev->pdev)); 6897 6898 bool is_large_bar = adev->gmc.visible_vram_size && 6899 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6900 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6901 6902 if (!p2p_addressable) { 6903 uint64_t address_mask = peer_adev->dev->dma_mask ? 6904 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6905 resource_size_t aper_limit = 6906 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6907 6908 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6909 aper_limit & address_mask); 6910 } 6911 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6912 #else 6913 return false; 6914 #endif 6915 } 6916 6917 int amdgpu_device_baco_enter(struct amdgpu_device *adev) 6918 { 6919 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6920 6921 if (!amdgpu_device_supports_baco(adev)) 6922 return -ENOTSUPP; 6923 6924 if (ras && adev->ras_enabled && 6925 adev->nbio.funcs->enable_doorbell_interrupt) 6926 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6927 6928 return amdgpu_dpm_baco_enter(adev); 6929 } 6930 6931 int amdgpu_device_baco_exit(struct amdgpu_device *adev) 6932 { 6933 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6934 int ret = 0; 6935 6936 if (!amdgpu_device_supports_baco(adev)) 6937 return -ENOTSUPP; 6938 6939 ret = amdgpu_dpm_baco_exit(adev); 6940 if (ret) 6941 return ret; 6942 6943 if (ras && adev->ras_enabled && 6944 adev->nbio.funcs->enable_doorbell_interrupt) 6945 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6946 6947 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6948 adev->nbio.funcs->clear_doorbell_interrupt) 6949 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6950 6951 return 0; 6952 } 6953 6954 /** 6955 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6956 * @pdev: PCI device struct 6957 * @state: PCI channel state 6958 * 6959 * Description: Called when a PCI error is detected. 6960 * 6961 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
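 *
 * Together with amdgpu_pci_mmio_enabled(), amdgpu_pci_slot_reset() and
 * amdgpu_pci_resume() below, this implements the PCI AER recovery sequence
 * (error_detected -> mmio_enabled/slot_reset -> resume). A sketch of the
 * usual wiring into the driver's pci_driver (struct name illustrative):
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};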
6962 */ 6963 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6964 { 6965 struct drm_device *dev = pci_get_drvdata(pdev); 6966 struct amdgpu_device *adev = drm_to_adev(dev); 6967 struct amdgpu_hive_info *hive __free(xgmi_put_hive) = 6968 amdgpu_get_xgmi_hive(adev); 6969 struct amdgpu_reset_context reset_context; 6970 struct list_head device_list; 6971 6972 dev_info(adev->dev, "PCI error: detected callback!!\n"); 6973 6974 adev->pci_channel_state = state; 6975 6976 switch (state) { 6977 case pci_channel_io_normal: 6978 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); 6979 return PCI_ERS_RESULT_CAN_RECOVER; 6980 case pci_channel_io_frozen: 6981 /* Fatal error, prepare for slot reset */ 6982 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); 6983 if (hive) { 6984 /* Hive devices should be able to support FW based 6985 * link reset on other devices, if not return. 6986 */ 6987 if (!amdgpu_dpm_is_link_reset_supported(adev)) { 6988 dev_warn(adev->dev, 6989 "No support for XGMI hive yet...\n"); 6990 return PCI_ERS_RESULT_DISCONNECT; 6991 } 6992 /* Set dpc status only if device is part of hive 6993 * Non-hive devices should be able to recover after 6994 * link reset. 6995 */ 6996 amdgpu_reset_set_dpc_status(adev, true); 6997 6998 mutex_lock(&hive->hive_lock); 6999 } 7000 memset(&reset_context, 0, sizeof(reset_context)); 7001 INIT_LIST_HEAD(&device_list); 7002 7003 amdgpu_device_recovery_prepare(adev, &device_list, hive); 7004 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 7005 amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, 7006 hive, false); 7007 if (hive) 7008 mutex_unlock(&hive->hive_lock); 7009 return PCI_ERS_RESULT_NEED_RESET; 7010 case pci_channel_io_perm_failure: 7011 /* Permanent error, prepare for device removal */ 7012 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); 7013 return PCI_ERS_RESULT_DISCONNECT; 7014 } 7015 7016 return PCI_ERS_RESULT_NEED_RESET; 7017 } 7018 7019 /** 7020 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 7021 * @pdev: pointer to PCI device 7022 */ 7023 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 7024 { 7025 struct drm_device *dev = pci_get_drvdata(pdev); 7026 struct amdgpu_device *adev = drm_to_adev(dev); 7027 7028 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); 7029 7030 /* TODO - dump whatever for debugging purposes */ 7031 7032 /* This called only if amdgpu_pci_error_detected returns 7033 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 7034 * works, no need to reset slot. 7035 */ 7036 7037 return PCI_ERS_RESULT_RECOVERED; 7038 } 7039 7040 /** 7041 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 7042 * @pdev: PCI device struct 7043 * 7044 * Description: This routine is called by the pci error recovery 7045 * code after the PCI slot has been reset, just before we 7046 * should resume normal operations. 
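 *
 * A rough sketch of the recovery order implemented below:
 *
 *	poll PCI_VENDOR_ID until the device responds again
 *	amdgpu_device_load_switch_state(adev);	(restore switch confspace)
 *	amdgpu_device_load_pci_state(pdev);	(restore GPU confspace)
 *	amdgpu_device_asic_reset(adev, ...);	(full ASIC reinit)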
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_reset_context reset_context;
	struct amdgpu_device *tmp_adev;
	struct amdgpu_hive_info *hive = NULL;
	struct list_head device_list;
	struct pci_dev *link_dev;
	int r = 0, i, timeout;
	u32 memsize;
	u16 status;

	dev_info(adev->dev, "PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	if (adev->pcie_reset_ctx.swus)
		link_dev = adev->pcie_reset_ctx.swus;
	else
		link_dev = adev->pdev;
	/* wait for asic to come out of reset, timeout = 10s */
	timeout = 10000;
	do {
		usleep_range(10000, 10500);
		r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status);
		timeout -= 10;
	} while (timeout > 0 && (status != PCI_VENDOR_ID_ATI) &&
		 (status != PCI_VENDOR_ID_AMD));

	if ((status != PCI_VENDOR_ID_ATI) && (status != PCI_VENDOR_ID_AMD)) {
		r = -ETIME;
		goto out;
	}

	amdgpu_device_load_switch_state(adev);
	/* Restore PCI confspace */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
	INIT_LIST_HEAD(&device_list);

	hive = amdgpu_get_xgmi_hive(adev);
	if (hive) {
		mutex_lock(&hive->hive_lock);
		reset_context.hive = hive;
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			tmp_adev->pcie_reset_ctx.in_link_reset = true;
			list_add_tail(&tmp_adev->reset_list, &device_list);
		}
	} else {
		set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
		list_add_tail(&adev->reset_list, &device_list);
	}

	r = amdgpu_device_asic_reset(adev, &device_list, &reset_context);
out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);
		dev_info(adev->dev, "PCIe error recovery succeeded\n");
	} else {
		dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r);
		if (hive) {
			list_for_each_entry(tmp_adev, &device_list, reset_list)
				amdgpu_device_unset_mp1_state(tmp_adev);
		}
		amdgpu_device_recovery_put_reset_lock(adev, &device_list);
	}

	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct list_head device_list;
	struct amdgpu_hive_info *hive = NULL;
	struct amdgpu_device *tmp_adev = NULL;

	dev_info(adev->dev, "PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	INIT_LIST_HEAD(&device_list);

	hive = amdgpu_get_xgmi_hive(adev);
	if (hive) {
		mutex_lock(&hive->hive_lock);
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			tmp_adev->pcie_reset_ctx.in_link_reset = false;
			list_add_tail(&tmp_adev->reset_list, &device_list);
		}
	} else
		list_add_tail(&adev->reset_list, &device_list);

	amdgpu_device_sched_resume(&device_list, NULL, false);
	amdgpu_device_gpu_resume(adev, &device_list, false);
	amdgpu_device_recovery_put_reset_lock(adev, &device_list);

	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}
}

static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev)
{
	struct pci_dev *swus, *swds;
	int r;

	swds = pci_upstream_bridge(adev->pdev);
	if (!swds || swds->vendor != PCI_VENDOR_ID_ATI ||
	    pci_pcie_type(swds) != PCI_EXP_TYPE_DOWNSTREAM)
		return;
	swus = pci_upstream_bridge(swds);
	if (!swus ||
	    (swus->vendor != PCI_VENDOR_ID_ATI &&
	     swus->vendor != PCI_VENDOR_ID_AMD) ||
	    pci_pcie_type(swus) != PCI_EXP_TYPE_UPSTREAM)
		return;

	/* If already saved, return */
	if (adev->pcie_reset_ctx.swus)
		return;
	/* Upstream bridge is ATI, assume it's a SWUS/DS architecture */
	r = pci_save_state(swds);
	if (r)
		return;
	adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(swds);

	r = pci_save_state(swus);
	if (r)
		return;
	adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(swus);

	adev->pcie_reset_ctx.swus = swus;
}

static void amdgpu_device_load_switch_state(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	int r;

	if (!adev->pcie_reset_ctx.swds_pcistate ||
	    !adev->pcie_reset_ctx.swus_pcistate)
		return;

	pdev = adev->pcie_reset_ctx.swus;
	r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate);
	if (!r) {
		pci_restore_state(pdev);
	} else {
		dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r);
		return;
	}

	pdev = pci_upstream_bridge(adev->pdev);
	r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate);
	if (!r)
		pci_restore_state(pdev);
	else
		dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r);
}

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (amdgpu_sriov_vf(adev))
		return false;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			dev_err(adev->dev, "Failed to store PCI saved state");
			return false;
		}
	} else {
		dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r);
		return false;
	}

	amdgpu_device_cache_switch_state(adev);

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush) {
		amdgpu_ring_emit_hdp_flush(ring);
		return;
	}

	if (!ring && amdgpu_sriov_runtime(adev)) {
		if (!amdgpu_kiq_hdp_flush(adev))
			return;
	}

	amdgpu_hdp_flush(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_hdp_invalidate(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to maintain the error context when an error occurs.
 * Compared to a simple hang, the system will stay stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space
 *    (IOCTLs, etc.), clears all CPU mappings to the device, and disallows
 *    remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6.
pci_disable_device() and pci_wait_for_pending_transaction() 7353 * flush any in flight DMA operations 7354 */ 7355 void amdgpu_device_halt(struct amdgpu_device *adev) 7356 { 7357 struct pci_dev *pdev = adev->pdev; 7358 struct drm_device *ddev = adev_to_drm(adev); 7359 7360 amdgpu_xcp_dev_unplug(adev); 7361 drm_dev_unplug(ddev); 7362 7363 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 7364 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 7365 7366 amdgpu_irq_disable_all(adev); 7367 7368 amdgpu_fence_driver_hw_fini(adev); 7369 7370 adev->no_hw_access = true; 7371 7372 amdgpu_device_unmap_mmio(adev); 7373 7374 pci_disable_device(pdev); 7375 pci_wait_for_pending_transaction(pdev); 7376 } 7377 7378 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 7379 u32 reg) 7380 { 7381 unsigned long flags, address, data; 7382 u32 r; 7383 7384 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 7385 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 7386 7387 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 7388 WREG32(address, reg * 4); 7389 (void)RREG32(address); 7390 r = RREG32(data); 7391 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 7392 return r; 7393 } 7394 7395 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 7396 u32 reg, u32 v) 7397 { 7398 unsigned long flags, address, data; 7399 7400 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 7401 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 7402 7403 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 7404 WREG32(address, reg * 4); 7405 (void)RREG32(address); 7406 WREG32(data, v); 7407 (void)RREG32(data); 7408 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 7409 } 7410 7411 /** 7412 * amdgpu_device_get_gang - return a reference to the current gang 7413 * @adev: amdgpu_device pointer 7414 * 7415 * Returns: A new reference to the current gang leader. 7416 */ 7417 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 7418 { 7419 struct dma_fence *fence; 7420 7421 rcu_read_lock(); 7422 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 7423 rcu_read_unlock(); 7424 return fence; 7425 } 7426 7427 /** 7428 * amdgpu_device_switch_gang - switch to a new gang 7429 * @adev: amdgpu_device pointer 7430 * @gang: the gang to switch to 7431 * 7432 * Try to switch to a new gang. 7433 * Returns: NULL if we switched to the new gang or a reference to the current 7434 * gang leader. 7435 */ 7436 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 7437 struct dma_fence *gang) 7438 { 7439 struct dma_fence *old = NULL; 7440 7441 dma_fence_get(gang); 7442 do { 7443 dma_fence_put(old); 7444 old = amdgpu_device_get_gang(adev); 7445 if (old == gang) 7446 break; 7447 7448 if (!dma_fence_is_signaled(old)) { 7449 dma_fence_put(gang); 7450 return old; 7451 } 7452 7453 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 7454 old, gang) != old); 7455 7456 /* 7457 * Drop it once for the exchanged reference in adev and once for the 7458 * thread local reference acquired in amdgpu_device_get_gang(). 7459 */ 7460 dma_fence_put(old); 7461 dma_fence_put(old); 7462 return NULL; 7463 } 7464 7465 /** 7466 * amdgpu_device_enforce_isolation - enforce HW isolation 7467 * @adev: the amdgpu device pointer 7468 * @ring: the HW ring the job is supposed to run on 7469 * @job: the job which is about to be pushed to the HW ring 7470 * 7471 * Makes sure that only one client at a time can use the GFX block. 
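 *
 * A minimal caller-side sketch (illustrative) of the contract described
 * in "Returns" below:
 *
 *	struct dma_fence *dep;
 *
 *	while ((dep = amdgpu_device_enforce_isolation(adev, ring, job))) {
 *		(wait on or schedule against dep)
 *		dma_fence_put(dep);
 *	}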
7472 * Returns: The dependency to wait on before the job can be pushed to the HW. 7473 * The function is called multiple times until NULL is returned. 7474 */ 7475 struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev, 7476 struct amdgpu_ring *ring, 7477 struct amdgpu_job *job) 7478 { 7479 struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id]; 7480 struct drm_sched_fence *f = job->base.s_fence; 7481 struct dma_fence *dep; 7482 void *owner; 7483 int r; 7484 7485 /* 7486 * For now enforce isolation only for the GFX block since we only need 7487 * the cleaner shader on those rings. 7488 */ 7489 if (ring->funcs->type != AMDGPU_RING_TYPE_GFX && 7490 ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE) 7491 return NULL; 7492 7493 /* 7494 * All submissions where enforce isolation is false are handled as if 7495 * they come from a single client. Use ~0l as the owner to distinct it 7496 * from kernel submissions where the owner is NULL. 7497 */ 7498 owner = job->enforce_isolation ? f->owner : (void *)~0l; 7499 7500 mutex_lock(&adev->enforce_isolation_mutex); 7501 7502 /* 7503 * The "spearhead" submission is the first one which changes the 7504 * ownership to its client. We always need to wait for it to be 7505 * pushed to the HW before proceeding with anything. 7506 */ 7507 if (&f->scheduled != isolation->spearhead && 7508 !dma_fence_is_signaled(isolation->spearhead)) { 7509 dep = isolation->spearhead; 7510 goto out_grab_ref; 7511 } 7512 7513 if (isolation->owner != owner) { 7514 7515 /* 7516 * Wait for any gang to be assembled before switching to a 7517 * different owner or otherwise we could deadlock the 7518 * submissions. 7519 */ 7520 if (!job->gang_submit) { 7521 dep = amdgpu_device_get_gang(adev); 7522 if (!dma_fence_is_signaled(dep)) 7523 goto out_return_dep; 7524 dma_fence_put(dep); 7525 } 7526 7527 dma_fence_put(isolation->spearhead); 7528 isolation->spearhead = dma_fence_get(&f->scheduled); 7529 amdgpu_sync_move(&isolation->active, &isolation->prev); 7530 trace_amdgpu_isolation(isolation->owner, owner); 7531 isolation->owner = owner; 7532 } 7533 7534 /* 7535 * Specifying the ring here helps to pipeline submissions even when 7536 * isolation is enabled. If that is not desired for testing NULL can be 7537 * used instead of the ring to enforce a CPU round trip while switching 7538 * between clients. 
bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr,
				    char reg_name[], uint32_t expected_value,
				    uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			dev_warn(
				adev->dev,
				"Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				inst, reg_name, (uint32_t)expected_value,
				(uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}

ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
{
	ssize_t size = 0;

	if (!ring || !ring->adev)
		return size;

	if (amdgpu_device_should_recover_gpu(ring->adev))
		size |= AMDGPU_RESET_TYPE_FULL;

	if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
	    !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
		size |= AMDGPU_RESET_TYPE_SOFT_RESET;

	return size;
}

ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
{
	ssize_t size = 0;

	if (supported_reset == 0) {
		size += sysfs_emit_at(buf, size, "unsupported");
		size += sysfs_emit_at(buf, size, "\n");
		return size;
	}

	if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
		size += sysfs_emit_at(buf, size, "soft ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
		size += sysfs_emit_at(buf, size, "queue ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
		size += sysfs_emit_at(buf, size, "pipe ");

	if (supported_reset & AMDGPU_RESET_TYPE_FULL)
		size += sysfs_emit_at(buf, size, "full ");

	size += sysfs_emit_at(buf, size, "\n");
	return size;
}
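/*
 * Example (editor's illustrative sketch, not part of the driver): a sysfs
 * show() callback built on amdgpu_show_reset_mask(), similar in shape to the
 * driver's existing *_reset_mask attributes. The attribute name and the use
 * of adev->gfx.gfx_supported_reset are assumptions made for the example.
 */
static ssize_t __maybe_unused
amdgpu_example_gfx_reset_mask_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	/* Emits e.g. "soft queue full\n" depending on the supported modes. */
	return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset);
}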
void amdgpu_device_set_uid(struct amdgpu_uid *uid_info,
			   enum amdgpu_uid_type type, uint8_t inst,
			   uint64_t uid)
{
	if (!uid_info)
		return;

	if (type >= AMDGPU_UID_TYPE_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
			     type);
		return;
	}

	if (inst >= AMDGPU_UID_INST_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
			     inst);
		return;
	}

	if (uid_info->uid[type][inst] != 0) {
		dev_warn_once(
			uid_info->adev->dev,
			"Overwriting existing UID %llu for type %d instance %d\n",
			uid_info->uid[type][inst], type, inst);
	}

	uid_info->uid[type][inst] = uid;
}

u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info,
			  enum amdgpu_uid_type type, uint8_t inst)
{
	if (!uid_info)
		return 0;

	if (type >= AMDGPU_UID_TYPE_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
			     type);
		return 0;
	}

	if (inst >= AMDGPU_UID_INST_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
			     inst);
		return 0;
	}

	return uid_info->uid[type][inst];
}
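/*
 * Example (editor's illustrative sketch, not part of the driver): recording
 * a UID once and reading it back. AMDGPU_UID_TYPE_XCD and the value below
 * are assumptions made for the example; any valid type/instance pair works
 * the same way.
 */
static void __maybe_unused
amdgpu_example_uid_round_trip(struct amdgpu_uid *uid_info)
{
	const u64 example_uid = 0x123456789abcdef0ULL;	/* made-up value */
	u64 uid;

	if (!uid_info)
		return;

	/* First write succeeds; a second write would warn once and overwrite. */
	amdgpu_device_set_uid(uid_info, AMDGPU_UID_TYPE_XCD, 0, example_uid);

	/* Out-of-range types/instances return 0 instead of a stored UID. */
	uid = amdgpu_device_get_uid(uid_info, AMDGPU_UID_TYPE_XCD, 0);
	dev_dbg(uid_info->adev->dev, "example UID: 0x%llx\n", uid);
}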