1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 
23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 29 #include <linux/aperture.h> 30 #include <linux/power_supply.h> 31 #include <linux/kthread.h> 32 #include <linux/module.h> 33 #include <linux/console.h> 34 #include <linux/slab.h> 35 #include <linux/iommu.h> 36 #include <linux/pci.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 #include <linux/nospec.h> 40 41 #include <drm/drm_atomic_helper.h> 42 #include <drm/drm_client_event.h> 43 #include <drm/drm_crtc_helper.h> 44 #include <drm/drm_probe_helper.h> 45 #include <drm/amdgpu_drm.h> 46 #include <linux/device.h> 47 #include <linux/vgaarb.h> 48 #include <linux/vga_switcheroo.h> 49 #include <linux/efi.h> 50 #include "amdgpu.h" 51 #include "amdgpu_trace.h" 52 #include "amdgpu_i2c.h" 53 #include "atom.h" 54 #include "amdgpu_atombios.h" 55 #include "amdgpu_atomfirmware.h" 56 #include "amd_pcie.h" 57 #ifdef CONFIG_DRM_AMDGPU_SI 58 #include "si.h" 59 #endif 60 #ifdef CONFIG_DRM_AMDGPU_CIK 61 #include "cik.h" 62 #endif 63 #include "vi.h" 64 #include "soc15.h" 65 #include "nv.h" 66 #include "bif/bif_4_1_d.h" 67 #include <linux/firmware.h> 68 #include "amdgpu_vf_error.h" 69 70 #include "amdgpu_amdkfd.h" 71 #include "amdgpu_pm.h" 72 73 #include "amdgpu_xgmi.h" 74 #include "amdgpu_ras.h" 75 #include "amdgpu_ras_mgr.h" 76 #include "amdgpu_pmu.h" 77 #include "amdgpu_fru_eeprom.h" 78 #include "amdgpu_reset.h" 79 #include "amdgpu_virt.h" 80 #include "amdgpu_dev_coredump.h" 81 82 #include <linux/suspend.h> 83 #include <drm/task_barrier.h> 84 #include <linux/pm_runtime.h> 85 86 #include <drm/drm_drv.h> 87 88 #if IS_ENABLED(CONFIG_X86) 89 #include <asm/intel-family.h> 90 #include <asm/cpu_device_id.h> 91 #endif 92 93 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 96 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 97 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 98 
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/cyan_skillfish_gpu_info.bin");

/* Time budget (in ms) allowed for the device to resume */
#define AMDGPU_RESUME_MS		2000
/* Maximum number of times an SRIOV reset is retried */
#define AMDGPU_MAX_RETRY_LIMIT		2
/* Error codes for which an SRIOV reset is worth retrying */
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
/*
 * Fallback PCIE index/data register pairs (dword offsets; byte offsets
 * 0x38/0x44/0x3C) used when the nbio callbacks are not available yet.
 */
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

/* VBIOS handling flags: skip loading entirely, or treat it as optional */
#define AMDGPU_VBIOS_SKIP (1U << 0)
#define AMDGPU_VBIOS_OPTIONAL (1U << 1)

static const struct drm_driver amdgpu_kms_driver;

/* Human-readable ASIC names, indexed by asic_type */
const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

/* Mask with one bit set for every IP block type */
#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
/*
 * Default init level where all blocks are expected to be initialized. This is
 * the level of initialization expected by default and also after a full reset
 * of the device.
 */
struct amdgpu_init_level amdgpu_init_default = {
	.level = AMDGPU_INIT_LEVEL_DEFAULT,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/* Init level used while recovering from a reset; all blocks are initialized */
struct amdgpu_init_level amdgpu_init_recovery = {
	.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This
 * is used for cases like reset on initialization where the entire hive needs to
 * be reset before first use.
 */
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
	.hwini_ip_block_mask =
		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};

static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev);
static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev);
static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev);

static void amdgpu_device_load_switch_state(struct amdgpu_device *adev);

/*
 * amdgpu_ip_member_of_hwini - check whether @block is part of the HW init
 * mask of the currently selected init level.
 */
static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
					     enum amd_ip_block_type block)
{
	return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
}

/*
 * amdgpu_set_init_level - select the init level used for subsequent HW init.
 * Unknown levels fall back to the default (all blocks initialized).
 */
void amdgpu_set_init_level(struct amdgpu_device *adev,
			   enum amdgpu_init_lvl_id lvl)
{
	switch (lvl) {
	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
		adev->init_lvl = &amdgpu_init_minimal_xgmi;
		break;
	case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
		adev->init_lvl = &amdgpu_init_recovery;
		break;
	case AMDGPU_INIT_LEVEL_DEFAULT:
		fallthrough;
	default:
		adev->init_lvl = &amdgpu_init_default;
		break;
	}
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
				     void *data);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		amdgpu_device_get_pcie_replay_count, NULL);

/* Create the pcie_replay_count sysfs file when the NBIO reports support */
static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
{
	int ret = 0;

	if (amdgpu_nbio_is_replay_cnt_supported(adev))
		ret = sysfs_create_file(&adev->dev->kobj,
					&dev_attr_pcie_replay_count.attr);

	return ret;
}

static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
{
	if (amdgpu_nbio_is_replay_cnt_supported(adev))
		sysfs_remove_file(&adev->dev->kobj,
				  &dev_attr_pcie_replay_count.attr);
}

/*
 * amdgpu_sysfs_reg_state_get - read callback for the reg_state bin attribute.
 * @ppos selects which register-state bank (XGMI/WAFL/PCIE/USR/USR_1) is
 * dumped into @buf; returns the number of bytes read, or -EINVAL for an
 * unknown bank.
 */
static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  const struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
		      AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for giving board related information.
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	/* default to CEM when the SMUIO block cannot tell us */
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

/* board_info describes the dGPU form factor; hide the attribute on APUs */
static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

/**
 * DOC: uma/carveout_options
 *
 * This is a read-only file that lists all available UMA allocation
 * options and their corresponding indices. Example output::
 *
 *   $ cat uma/carveout_options
 *   0: Minimum (512 MB)
 *   1: (1 GB)
 *   2: (2 GB)
 *   3: (4 GB)
 *   4: (6 GB)
 *   5: (8 GB)
 *   6: (12 GB)
 *   7: Medium (16 GB)
 *   8: (24 GB)
 *   9: High (32 GB)
 */
static ssize_t carveout_options_show(struct device *dev,
				     struct device_attribute *attr,
				     char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	struct amdgpu_uma_carveout_info *uma_info = &adev->uma_info;
	uint32_t memory_carved;
	ssize_t size = 0;

	if (!uma_info || !uma_info->num_entries)
		return -ENODEV;

	for (int i = 0; i < uma_info->num_entries; i++) {
		memory_carved = uma_info->entries[i].memory_carved_mb;
		if (memory_carved >= SZ_1G/SZ_1M) {
			/* >= 1024 MB: print in whole GB */
			size += sysfs_emit_at(buf, size, "%d: %s (%u GB)\n",
					      i,
					      uma_info->entries[i].name,
					      memory_carved >> 10);
		} else {
			size += sysfs_emit_at(buf, size, "%d: %s (%u MB)\n",
					      i,
					      uma_info->entries[i].name,
					      memory_carved);
		}
	}

	return size;
}
static DEVICE_ATTR_RO(carveout_options);

/**
 * DOC: uma/carveout
 *
 * This file is both readable and writable. When read, it shows the
 * index of the current setting.
* Writing a valid index to this file
 * allows users to change the UMA carveout size to the selected option
 * on the next boot.
 *
 * The available options and their corresponding indices can be read
 * from the uma/carveout_options file.
 */
static ssize_t carveout_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	/* Report the index of the currently selected carveout option */
	return sysfs_emit(buf, "%u\n", adev->uma_info.uma_option_index);
}

static ssize_t carveout_store(struct device *dev,
			      struct device_attribute *attr,
			      const char *buf, size_t count)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	struct amdgpu_uma_carveout_info *uma_info = &adev->uma_info;
	struct amdgpu_uma_carveout_option *opt;
	unsigned long val;
	uint8_t flags;
	int r;

	r = kstrtoul(buf, 10, &val);
	if (r)
		return r;

	if (val >= uma_info->num_entries)
		return -EINVAL;

	/* val is user controlled; clamp against speculative OOB access */
	val = array_index_nospec(val, uma_info->num_entries);
	opt = &uma_info->entries[val];

	if (!(opt->flags & AMDGPU_UMA_FLAG_AUTO) &&
	    !(opt->flags & AMDGPU_UMA_FLAG_CUSTOM)) {
		drm_err_once(ddev, "Option %lu not supported due to lack of Custom/Auto flag", val);
		return -EINVAL;
	}

	flags = opt->flags;
	/* NOTE(review): clears the bit one below AUTO when AUTO is set —
	 * appears to rely on CUSTOM being AUTO >> 1; confirm against the
	 * AMDGPU_UMA_FLAG_* definitions.
	 */
	flags &= ~((flags & AMDGPU_UMA_FLAG_AUTO) >> 1);

	/* serialize updates against other writers until return */
	guard(mutex)(&uma_info->update_lock);

	r = amdgpu_acpi_set_uma_allocation_size(adev, val, flags);
	if (r)
		return r;

	uma_info->uma_option_index = val;

	return count;
}
static DEVICE_ATTR_RW(carveout);

static struct attribute *amdgpu_uma_attrs[] = {
	&dev_attr_carveout.attr,
	&dev_attr_carveout_options.attr,
	NULL
};

const struct attribute_group amdgpu_uma_attr_group = {
	.name = "uma",
	.attrs = amdgpu_uma_attrs
};

/*
 * amdgpu_uma_sysfs_init - register the uma/ sysfs group on APUs whose
 * platform supports changing the UMA carveout size via ACPI. Failures are
 * logged at debug level and otherwise ignored (the group is optional).
 */
static void amdgpu_uma_sysfs_init(struct amdgpu_device *adev)
{
	int rc;

	if (!(adev->flags & AMD_IS_APU))
		return;

	if (!amdgpu_acpi_is_set_uma_allocation_size_supported())
		return;

	rc = amdgpu_atomfirmware_get_uma_carveout_info(adev, &adev->uma_info);
	if (rc) {
		drm_dbg(adev_to_drm(adev),
			"Failed to parse UMA carveout info from VBIOS: %d\n", rc);
		goto out_info;
	}

	mutex_init(&adev->uma_info.update_lock);

	rc = devm_device_add_group(adev->dev, &amdgpu_uma_attr_group);
	if (rc) {
		drm_dbg(adev_to_drm(adev), "Failed to add UMA carveout sysfs interfaces %d\n", rc);
		goto out_attr;
	}

	return;

out_attr:
	mutex_destroy(&adev->uma_info.update_lock);
out_info:
	return;
}

static void amdgpu_uma_sysfs_fini(struct amdgpu_device *adev)
{
	struct amdgpu_uma_carveout_info *uma_info = &adev->uma_info;

	if (!amdgpu_acpi_is_set_uma_allocation_size_supported())
		return;

	mutex_destroy(&uma_info->update_lock);
	uma_info->num_entries = 0;
}

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct amdgpu_device *adev)
{
	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct amdgpu_device *adev)
{
	if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
		return false;

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @adev: amdgpu device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported)
 * otherwise return 0.
 */
int amdgpu_device_supports_baco(struct amdgpu_device *adev)
{
	return amdgpu_asic_supports_baco(adev);
}

/*
 * amdgpu_device_detect_runtime_pm_mode - select the runtime PM mode
 * (PX/BOCO/BACO/BAMACO/none) from the amdgpu_runtime_pm module parameter
 * (2 = force BAMACO, 1 = force BACO, 0 = disabled, negative = auto) and
 * what the platform/ASIC supports.
 */
void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	int bamaco_support;

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(adev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available,fallback to use BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(adev)) {
			/* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(adev)) {
			/* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry && !amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				if (!amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			/* upgrade to BAMACO when the ASIC also supports MACO */
			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct amdgpu_device *adev)
{
	return (amdgpu_device_supports_boco(adev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must > @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	/* bail out if the device has been unplugged */
	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	/* MM_INDEX/MM_DATA only support dword-aligned accesses */
	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		/* address bits above 31 go through MM_INDEX_HI; only
		 * rewrite it when they change
		 */
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must > @size
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	/* clamp the access to the CPU-visible portion of VRAM */
	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	/* the aperture is not CPU-mappable on 32-bit kernels */
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must > @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		/* Under SRIOV at runtime, route through KIQ unless NO_KIQ was
		 * requested or a reset is in progress (trylock fails).
		 */
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		/* registers beyond the MMIO BAR use the PCIE indirect path */
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset:bytes offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	/* out-of-range byte reads have no indirect fallback; that's a bug */
	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		/* VFs without runtime access use the RLCG path when the
		 * register falls in an RLCG-accessible range.
		 */
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset:bytes offset from MMIO start
 * @value: the value want to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	/* out-of-range byte writes have no indirect fallback; that's a bug */
	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		/* Under SRIOV at runtime, route through KIQ unless NO_KIQ was
		 * requested or a reset is in progress (trylock fails).
		 */
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * this function is invoked only for the debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified
to the offset specified. 1048 */ 1049 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 1050 uint32_t reg, uint32_t v, 1051 uint32_t acc_flags, uint32_t xcc_id) 1052 { 1053 uint32_t rlcg_flag; 1054 1055 if (amdgpu_device_skip_hw_access(adev)) 1056 return; 1057 1058 if ((reg * 4) < adev->rmmio_size) { 1059 if (amdgpu_sriov_vf(adev) && 1060 !amdgpu_sriov_runtime(adev) && 1061 adev->gfx.rlc.rlcg_reg_access_supported && 1062 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 1063 GC_HWIP, true, 1064 &rlcg_flag)) { 1065 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 1066 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 1067 amdgpu_sriov_runtime(adev) && 1068 down_read_trylock(&adev->reset_domain->sem)) { 1069 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 1070 up_read(&adev->reset_domain->sem); 1071 } else { 1072 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 1073 } 1074 } else { 1075 adev->pcie_wreg(adev, reg * 4, v); 1076 } 1077 } 1078 1079 /** 1080 * amdgpu_device_indirect_rreg - read an indirect register 1081 * 1082 * @adev: amdgpu_device pointer 1083 * @reg_addr: indirect register address to read from 1084 * 1085 * Returns the value of indirect register @reg_addr 1086 */ 1087 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 1088 u32 reg_addr) 1089 { 1090 unsigned long flags, pcie_index, pcie_data; 1091 void __iomem *pcie_index_offset; 1092 void __iomem *pcie_data_offset; 1093 u32 r; 1094 1095 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1096 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1097 1098 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1099 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1100 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1101 1102 writel(reg_addr, pcie_index_offset); 1103 readl(pcie_index_offset); 1104 r = readl(pcie_data_offset); 1105 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1106 1107 return r; 1108 } 1109 1110 u32 
amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	/* nbio callbacks may not be installed yet this early in init;
	 * fall back to the hard-coded index/data register pair.
	 */
	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	/* a HI index register is only needed for addresses above 4GB */
	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* readbacks flush the posted index writes before touching data */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/* 64-bit variant of amdgpu_device_indirect_rreg_ext(): reads a 64-bit
 * register through the index/data pair, using the HI index register
 * when @reg_addr has bits above 32.
 */
u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				  u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits
 */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/* Extended-address variant of amdgpu_device_indirect_wreg(): uses the
 * HI index register when @reg_addr has bits above 32.
 */
void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) &&
	    (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/* 64-bit, extended-address variant of amdgpu_device_indirect_wreg():
 * writes both 32-bit halves of @reg_data, using the HI index register
 * when @reg_addr has bits above 32.
 */
void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/* 64-bit-address variant of amdgpu_invalid_rreg() */
static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%04X with 0x%08X\n", reg,
		v);
	BUG();
}

/* 64-bit-address variant of amdgpu_invalid_wreg() */
static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%llX with 0x%08X\n", reg,
		v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n",
		reg);
	BUG();
	return 0;
}

/* 64-bit-address variant of amdgpu_invalid_rreg64() */
static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		reg, v);
	BUG();
}

/* 64-bit-address variant of amdgpu_invalid_wreg64() */
static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	dev_err(adev->dev,
		"Invalid callback to read register 0x%04X in block 0x%04X\n",
		reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		reg, block, v);
	BUG();
}

/* Returns AMDGPU_VBIOS_* flags describing whether a VBIOS image is
 * required for this configuration (multi-AID APUs can skip it; under
 * passthrough it is optional).
 */
static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return AMDGPU_VBIOS_SKIP;

	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
		return AMDGPU_VBIOS_OPTIONAL;

	return 0;
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
1573 */ 1574 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1575 { 1576 uint32_t flags; 1577 bool optional; 1578 int ret; 1579 1580 amdgpu_asic_pre_asic_init(adev); 1581 flags = amdgpu_device_get_vbios_flags(adev); 1582 optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP)); 1583 1584 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1585 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 1586 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 1587 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1588 amdgpu_psp_wait_for_bootloader(adev); 1589 if (optional && !adev->bios) 1590 return 0; 1591 1592 ret = amdgpu_atomfirmware_asic_init(adev, true); 1593 return ret; 1594 } else { 1595 if (optional && !adev->bios) 1596 return 0; 1597 1598 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1599 } 1600 1601 return 0; 1602 } 1603 1604 /** 1605 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1606 * 1607 * @adev: amdgpu_device pointer 1608 * 1609 * Allocates a scratch page of VRAM for use by various things in the 1610 * driver. 1611 */ 1612 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 1613 { 1614 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 1615 AMDGPU_GEM_DOMAIN_VRAM | 1616 AMDGPU_GEM_DOMAIN_GTT, 1617 &adev->mem_scratch.robj, 1618 &adev->mem_scratch.gpu_addr, 1619 (void **)&adev->mem_scratch.ptr); 1620 } 1621 1622 /** 1623 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 1624 * 1625 * @adev: amdgpu_device pointer 1626 * 1627 * Frees the VRAM scratch page. 1628 */ 1629 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 1630 { 1631 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 1632 } 1633 1634 /** 1635 * amdgpu_device_program_register_sequence - program an array of registers. 
1636 * 1637 * @adev: amdgpu_device pointer 1638 * @registers: pointer to the register array 1639 * @array_size: size of the register array 1640 * 1641 * Programs an array or registers with and or masks. 1642 * This is a helper for setting golden registers. 1643 */ 1644 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 1645 const u32 *registers, 1646 const u32 array_size) 1647 { 1648 u32 tmp, reg, and_mask, or_mask; 1649 int i; 1650 1651 if (array_size % 3) 1652 return; 1653 1654 for (i = 0; i < array_size; i += 3) { 1655 reg = registers[i + 0]; 1656 and_mask = registers[i + 1]; 1657 or_mask = registers[i + 2]; 1658 1659 if (and_mask == 0xffffffff) { 1660 tmp = or_mask; 1661 } else { 1662 tmp = RREG32(reg); 1663 tmp &= ~and_mask; 1664 if (adev->family >= AMDGPU_FAMILY_AI) 1665 tmp |= (or_mask & and_mask); 1666 else 1667 tmp |= or_mask; 1668 } 1669 WREG32(reg, tmp); 1670 } 1671 } 1672 1673 /** 1674 * amdgpu_device_pci_config_reset - reset the GPU 1675 * 1676 * @adev: amdgpu_device pointer 1677 * 1678 * Resets the GPU using the pci config reset sequence. 1679 * Only applicable to asics prior to vega10. 1680 */ 1681 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1682 { 1683 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1684 } 1685 1686 /** 1687 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1688 * 1689 * @adev: amdgpu_device pointer 1690 * 1691 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1692 */ 1693 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1694 { 1695 return pci_reset_function(adev->pdev); 1696 } 1697 1698 /* 1699 * amdgpu_device_wb_*() 1700 * Writeback is the method by which the GPU updates special pages in memory 1701 * with the status of certain GPU events (fences, ring pointers,etc.). 
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	/* idempotent: only allocate on the first call */
	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* resizing disabled via the amdgpu.rebar module parameter */
	if (!amdgpu_rebar)
		return 0;

	/* resizing on Dell G5 SE platforms causes problems with runtime pm */
	if ((amdgpu_runtime_pm != 0) &&
	    adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
	    adev->pdev->device == 0x731f &&
	    adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		dev_warn(
			adev->dev,
			"System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell
BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		dev_info(adev->dev,
			 "Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		dev_err(adev->dev, "Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	/* re-enable memory decoding with the original command bits */
	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if need or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg, flags;

	if (amdgpu_sriov_vf(adev))
		return false;

	flags = amdgpu_device_get_vbios_flags(adev);
	if (flags & AMDGPU_VBIOS_SKIP)
		return false;
	if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still need driver do vPost otherwise gpu hang, while
		 * those smc fw version above 22.15 doesn't have this flaw, so we force
		 * vpost executed for smc version below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			/* fw version is at dword 69 of the SMC image */
			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	/* one-shot flag set elsewhere when a hw reset occurred */
	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	/* module parameter overrides the heuristic; -1 means auto */
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n",
			amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPU change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/* Returns true when ASPM must stay disabled on this platform/GPU combo. */
static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
{
	/* Enabling ASPM causes randoms hangs on Tahiti and Oland on Zen4.
	 * It's unclear if this is a platform-specific or GPU-specific issue.
	 * Disable ASPM on SI for the time being.
	 */
	if (adev->family == AMDGPU_FAMILY_SI)
		return true;

#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* the quirk below only concerns GC 12.0.x parts */
	if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) ||
	      amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1)))
		return false;

	if (c->x86 == 6 &&
	    adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) {
		switch (c->x86_model) {
		case VFM_MODEL(INTEL_ALDERLAKE):
		case VFM_MODEL(INTEL_ALDERLAKE_L):
		case VFM_MODEL(INTEL_RAPTORLAKE):
		case VFM_MODEL(INTEL_RAPTORLAKE_P):
		case VFM_MODEL(INTEL_RAPTORLAKE_S):
			return true;
		default:
			return false;
		}
	} else {
		return false;
	}
#else
	return false;
#endif
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	/* amdgpu.aspm: -1 = auto, 0 = off, 1 = on */
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (amdgpu_device_aspm_support_quirk(adev))
		return false;
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
2096 */ 2097 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 2098 bool state) 2099 { 2100 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 2101 2102 amdgpu_asic_set_vga_state(adev, state); 2103 if (state) 2104 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 2105 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 2106 else 2107 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 2108 } 2109 2110 /** 2111 * amdgpu_device_check_block_size - validate the vm block size 2112 * 2113 * @adev: amdgpu_device pointer 2114 * 2115 * Validates the vm block size specified via module parameter. 2116 * The vm block size defines number of bits in page table versus page directory, 2117 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 2118 * page table and the remaining bits are in the page directory. 2119 */ 2120 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 2121 { 2122 /* defines number of bits in page table versus page directory, 2123 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 2124 * page table and the remaining bits are in the page directory 2125 */ 2126 if (amdgpu_vm_block_size == -1) 2127 return; 2128 2129 if (amdgpu_vm_block_size < 9) { 2130 dev_warn(adev->dev, "VM page table size (%d) too small\n", 2131 amdgpu_vm_block_size); 2132 amdgpu_vm_block_size = -1; 2133 } 2134 } 2135 2136 /** 2137 * amdgpu_device_check_vm_size - validate the vm size 2138 * 2139 * @adev: amdgpu_device pointer 2140 * 2141 * Validates the vm size in GB specified via module parameter. 2142 * The VM size is the size of the GPU virtual memory space in GB. 
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		/* revert to the driver-selected default */
		amdgpu_vm_size = -1;
	}
}

/*
 * amdgpu_device_check_smu_prv_buffer_size - validate the SMU memory pool size
 *
 * Validates the amdgpu_smu_memory_pool_size module parameter (units of
 * 256MB: allowed values 1/2/4/8) against the installed system RAM and
 * stores the resulting byte size in adev->pm.smu_prv_buffer_size
 * (0 = feature disabled).
 */
static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	/* pool sizes of 1 or 2 (x256MB) need >= 3GB RAM, 4 or 8 need >= 7GB */
	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		dev_warn(adev->dev, "Smu memory pool size not supported\n");
		goto def_value;
	}
	/* << 28 converts the parameter from 256MB units to bytes */
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	dev_warn(adev->dev, "No enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

/*
 * amdgpu_device_init_apu_flags - distinguish APU variants sharing an asic_type
 *
 * Sets AMD_APU_IS_* bits in adev->apu_flags based on the PCI device ID so
 * later code can tell apart e.g. Raven vs Picasso.  Always returns 0.
 */
static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	int i;

	/* scheduler job count must be a power of two >= 4 */
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	/* hw submission count must be a power of two >= 2 */
	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	/* map the enforce_isolation param onto every partition (XCP) */
	for (i = 0; i < MAX_XCP; i++) {
		switch (amdgpu_enforce_isolation) {
		case -1:
		case 0:
		default:
			/* disable */
			adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE;
			break;
		case 1:
			/* enable */
			adev->enforce_isolation[i] =
				AMDGPU_ENFORCE_ISOLATION_ENABLE;
			break;
		case 2:
			/* enable legacy mode */
			adev->enforce_isolation[i] =
				AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY;
			break;
		case 3:
			/* enable only process isolation without submitting cleaner shader */
			adev->enforce_isolation[i] =
				AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER;
			break;
		}
	}

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
2337 */ 2338 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2339 enum vga_switcheroo_state state) 2340 { 2341 struct drm_device *dev = pci_get_drvdata(pdev); 2342 int r; 2343 2344 if (amdgpu_device_supports_px(drm_to_adev(dev)) && 2345 state == VGA_SWITCHEROO_OFF) 2346 return; 2347 2348 if (state == VGA_SWITCHEROO_ON) { 2349 pr_info("switched on\n"); 2350 /* don't suspend or resume card normally */ 2351 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2352 2353 pci_set_power_state(pdev, PCI_D0); 2354 amdgpu_device_load_pci_state(pdev); 2355 r = pci_enable_device(pdev); 2356 if (r) 2357 dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n", 2358 r); 2359 amdgpu_device_resume(dev, true); 2360 2361 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2362 } else { 2363 dev_info(&pdev->dev, "switched off\n"); 2364 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2365 amdgpu_device_prepare(dev); 2366 amdgpu_device_suspend(dev, true); 2367 amdgpu_device_cache_pci_state(pdev); 2368 /* Shut down the device */ 2369 pci_disable_device(pdev); 2370 pci_set_power_state(pdev, PCI_D3cold); 2371 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2372 } 2373 } 2374 2375 /** 2376 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2377 * 2378 * @pdev: pci dev pointer 2379 * 2380 * Callback for the switcheroo driver. Check of the switcheroo 2381 * state can be changed. 2382 * Returns true if the state can be changed, false if not. 2383 */ 2384 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2385 { 2386 struct drm_device *dev = pci_get_drvdata(pdev); 2387 2388 /* 2389 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2390 * locking inversion with the driver load path. And the access here is 2391 * completely racy anyway. So don't bother with locking for now. 
 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enabled the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		/* param format: "<pci addr>[,<num_crtc>];..." or "all[,<num_crtc>]" */
		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		/* strsep() tolerates a NULL string, so a failed kstrdup()
		 * simply skips the loop and leaves the feature disabled
		 */
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				/* clamp the crtc count to [1, 6]; default 1 */
				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		dev_info(
			adev->dev,
			"virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			amdgpu_virtual_display, pci_address_name,
			adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

/*
 * amdgpu_device_set_sriov_virtual_display - force virtual display for SR-IOV VFs
 *
 * SR-IOV virtual functions have no real display hardware; enable a single
 * virtual crtc unless the user already configured virtual display.
 */
void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
		adev->mode_info.num_crtc = 1;
		adev->enable_virtual_display = true;
		dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n",
			 adev->enable_virtual_display,
			 adev->mode_info.num_crtc);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	/* only a handful of pre-discovery asics carry a gpu_info blob */
	switch (adev->asic_type) {
	default:
		return 0;
	case CHIP_VEGA10:
		chip_name = "vega10";
		break;
	case CHIP_VEGA12:
		chip_name = "vega12";
		break;
	case CHIP_RAVEN:
		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
			chip_name = "raven2";
		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
			chip_name = "picasso";
		else
			chip_name = "raven";
		break;
	case CHIP_ARCTURUS:
		chip_name = "arcturus";
		break;
	case CHIP_NAVI12:
		/* the IP discovery table supersedes the gpu_info firmware */
		if (adev->discovery.bin)
			return 0;
		chip_name = "navi12";
		break;
	case CHIP_CYAN_SKILLFISH:
		if (adev->discovery.bin)
			return 0;
		chip_name = "cyan_skillfish";
		break;
	}

	err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
				   AMDGPU_UCODE_OPTIONAL,
				   "amdgpu/%s_gpu_info.bin", chip_name);
	if (err) {
		dev_err(adev->dev,
			"Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n",
			chip_name);
		goto out;
	}

	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);

	switch (hdr->version_major) {
	case 1:
	{
		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
								le32_to_cpu(hdr->header.ucode_array_offset_bytes));

		/*
		 * Should be dropped when DAL no longer needs it.
		 */
		if (adev->asic_type == CHIP_NAVI12)
			goto parse_soc_bounding_box;

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		/* v1.1 adds rasterizer configuration fields */
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * soc bounding box info is not integrated in discovery table,
		 * we always need to parse it from gpu info firmware if needed.
		 */
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}

static void amdgpu_uid_init(struct amdgpu_device *adev)
{
	/* Initialize the UID for the device; allocation failure is
	 * non-fatal, uid_info simply stays NULL
	 */
	adev->uid_info = kzalloc(sizeof(struct amdgpu_uid), GFP_KERNEL);
	if (!adev->uid_info) {
		dev_warn(adev->dev, "Failed to allocate memory for UID\n");
		return;
	}
	adev->uid_info->adev = adev;
}

static void amdgpu_uid_fini(struct amdgpu_device *adev)
{
	/* Free the UID memory */
	kfree(adev->uid_info);
	adev->uid_info = NULL;
}

/**
 * amdgpu_device_ip_early_init - run early init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered each IP's early_init callback is run. This
 * is the first stage in initializing the asic.
 * Returns 0 on success, negative error code on failure.
2625 */ 2626 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2627 { 2628 struct amdgpu_ip_block *ip_block; 2629 struct pci_dev *parent; 2630 bool total, skip_bios; 2631 uint32_t bios_flags; 2632 int i, r; 2633 2634 amdgpu_device_enable_virtual_display(adev); 2635 2636 if (amdgpu_sriov_vf(adev)) { 2637 r = amdgpu_virt_request_full_gpu(adev, true); 2638 if (r) 2639 return r; 2640 2641 r = amdgpu_virt_init_critical_region(adev); 2642 if (r) 2643 return r; 2644 } 2645 2646 switch (adev->asic_type) { 2647 #ifdef CONFIG_DRM_AMDGPU_SI 2648 case CHIP_VERDE: 2649 case CHIP_TAHITI: 2650 case CHIP_PITCAIRN: 2651 case CHIP_OLAND: 2652 case CHIP_HAINAN: 2653 adev->family = AMDGPU_FAMILY_SI; 2654 r = si_set_ip_blocks(adev); 2655 if (r) 2656 return r; 2657 break; 2658 #endif 2659 #ifdef CONFIG_DRM_AMDGPU_CIK 2660 case CHIP_BONAIRE: 2661 case CHIP_HAWAII: 2662 case CHIP_KAVERI: 2663 case CHIP_KABINI: 2664 case CHIP_MULLINS: 2665 if (adev->flags & AMD_IS_APU) 2666 adev->family = AMDGPU_FAMILY_KV; 2667 else 2668 adev->family = AMDGPU_FAMILY_CI; 2669 2670 r = cik_set_ip_blocks(adev); 2671 if (r) 2672 return r; 2673 break; 2674 #endif 2675 case CHIP_TOPAZ: 2676 case CHIP_TONGA: 2677 case CHIP_FIJI: 2678 case CHIP_POLARIS10: 2679 case CHIP_POLARIS11: 2680 case CHIP_POLARIS12: 2681 case CHIP_VEGAM: 2682 case CHIP_CARRIZO: 2683 case CHIP_STONEY: 2684 if (adev->flags & AMD_IS_APU) 2685 adev->family = AMDGPU_FAMILY_CZ; 2686 else 2687 adev->family = AMDGPU_FAMILY_VI; 2688 2689 r = vi_set_ip_blocks(adev); 2690 if (r) 2691 return r; 2692 break; 2693 default: 2694 r = amdgpu_discovery_set_ip_blocks(adev); 2695 if (r) 2696 return r; 2697 break; 2698 } 2699 2700 /* Check for IP version 9.4.3 with A0 hardware */ 2701 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2702 !amdgpu_device_get_rev_id(adev)) { 2703 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2704 return -ENODEV; /* device unsupported - no device error */ 2705 } 2706 2707 if (amdgpu_has_atpx() && 
2708 (amdgpu_is_atpx_hybrid() || 2709 amdgpu_has_atpx_dgpu_power_cntl()) && 2710 ((adev->flags & AMD_IS_APU) == 0) && 2711 !dev_is_removable(&adev->pdev->dev)) 2712 adev->flags |= AMD_IS_PX; 2713 2714 if (!(adev->flags & AMD_IS_APU)) { 2715 parent = pcie_find_root_port(adev->pdev); 2716 adev->has_pr3 = parent ? pci_pr3_present(parent) : false; 2717 } 2718 2719 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2720 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2721 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2722 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2723 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2724 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2725 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2726 2727 adev->virt.is_xgmi_node_migrate_enabled = false; 2728 if (amdgpu_sriov_vf(adev)) { 2729 adev->virt.is_xgmi_node_migrate_enabled = 2730 amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4); 2731 } 2732 2733 total = true; 2734 for (i = 0; i < adev->num_ip_blocks; i++) { 2735 ip_block = &adev->ip_blocks[i]; 2736 2737 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2738 dev_warn(adev->dev, "disabled ip block: %d <%s>\n", i, 2739 adev->ip_blocks[i].version->funcs->name); 2740 adev->ip_blocks[i].status.valid = false; 2741 } else if (ip_block->version->funcs->early_init) { 2742 r = ip_block->version->funcs->early_init(ip_block); 2743 if (r == -ENOENT) { 2744 adev->ip_blocks[i].status.valid = false; 2745 } else if (r) { 2746 dev_err(adev->dev, 2747 "early_init of IP block <%s> failed %d\n", 2748 adev->ip_blocks[i].version->funcs->name, 2749 r); 2750 total = false; 2751 } else { 2752 adev->ip_blocks[i].status.valid = true; 2753 } 2754 } else { 2755 adev->ip_blocks[i].status.valid = true; 2756 } 2757 /* get the vbios after the asic_funcs are set up */ 2758 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2759 r = amdgpu_device_parse_gpu_info_fw(adev); 2760 if (r) 2761 return r; 2762 2763 
bios_flags = amdgpu_device_get_vbios_flags(adev); 2764 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2765 /* Read BIOS */ 2766 if (!skip_bios) { 2767 bool optional = 2768 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2769 if (!amdgpu_get_bios(adev) && !optional) 2770 return -EINVAL; 2771 2772 if (optional && !adev->bios) 2773 dev_info( 2774 adev->dev, 2775 "VBIOS image optional, proceeding without VBIOS image"); 2776 2777 if (adev->bios) { 2778 r = amdgpu_atombios_init(adev); 2779 if (r) { 2780 dev_err(adev->dev, 2781 "amdgpu_atombios_init failed\n"); 2782 amdgpu_vf_error_put( 2783 adev, 2784 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2785 0, 0); 2786 return r; 2787 } 2788 } 2789 } 2790 2791 /*get pf2vf msg info at it's earliest time*/ 2792 if (amdgpu_sriov_vf(adev)) 2793 amdgpu_virt_init_data_exchange(adev); 2794 2795 } 2796 } 2797 if (!total) 2798 return -ENODEV; 2799 2800 if (adev->gmc.xgmi.supported) 2801 amdgpu_xgmi_early_init(adev); 2802 2803 if (amdgpu_is_multi_aid(adev)) 2804 amdgpu_uid_init(adev); 2805 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2806 if (ip_block->status.valid != false) 2807 amdgpu_amdkfd_device_probe(adev); 2808 2809 adev->cg_flags &= amdgpu_cg_mask; 2810 adev->pg_flags &= amdgpu_pg_mask; 2811 2812 return 0; 2813 } 2814 2815 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2816 { 2817 int i, r; 2818 2819 for (i = 0; i < adev->num_ip_blocks; i++) { 2820 if (!adev->ip_blocks[i].status.sw) 2821 continue; 2822 if (adev->ip_blocks[i].status.hw) 2823 continue; 2824 if (!amdgpu_ip_member_of_hwini( 2825 adev, adev->ip_blocks[i].version->type)) 2826 continue; 2827 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2828 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2829 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2830 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2831 if (r) { 2832 dev_err(adev->dev, 2833 
"hw_init of IP block <%s> failed %d\n", 2834 adev->ip_blocks[i].version->funcs->name, 2835 r); 2836 return r; 2837 } 2838 adev->ip_blocks[i].status.hw = true; 2839 } 2840 } 2841 2842 return 0; 2843 } 2844 2845 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2846 { 2847 int i, r; 2848 2849 for (i = 0; i < adev->num_ip_blocks; i++) { 2850 if (!adev->ip_blocks[i].status.sw) 2851 continue; 2852 if (adev->ip_blocks[i].status.hw) 2853 continue; 2854 if (!amdgpu_ip_member_of_hwini( 2855 adev, adev->ip_blocks[i].version->type)) 2856 continue; 2857 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2858 if (r) { 2859 dev_err(adev->dev, 2860 "hw_init of IP block <%s> failed %d\n", 2861 adev->ip_blocks[i].version->funcs->name, r); 2862 return r; 2863 } 2864 adev->ip_blocks[i].status.hw = true; 2865 } 2866 2867 return 0; 2868 } 2869 2870 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2871 { 2872 int r = 0; 2873 int i; 2874 uint32_t smu_version; 2875 2876 if (adev->asic_type >= CHIP_VEGA10) { 2877 for (i = 0; i < adev->num_ip_blocks; i++) { 2878 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2879 continue; 2880 2881 if (!amdgpu_ip_member_of_hwini(adev, 2882 AMD_IP_BLOCK_TYPE_PSP)) 2883 break; 2884 2885 if (!adev->ip_blocks[i].status.sw) 2886 continue; 2887 2888 /* no need to do the fw loading again if already done*/ 2889 if (adev->ip_blocks[i].status.hw == true) 2890 break; 2891 2892 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2893 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2894 if (r) 2895 return r; 2896 } else { 2897 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2898 if (r) { 2899 dev_err(adev->dev, 2900 "hw_init of IP block <%s> failed %d\n", 2901 adev->ip_blocks[i] 2902 .version->funcs->name, 2903 r); 2904 return r; 2905 } 2906 adev->ip_blocks[i].status.hw = true; 2907 } 2908 break; 2909 } 2910 } 2911 2912 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 
2913 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2914 2915 return r; 2916 } 2917 2918 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2919 { 2920 struct drm_sched_init_args args = { 2921 .ops = &amdgpu_sched_ops, 2922 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 2923 .timeout_wq = adev->reset_domain->wq, 2924 .dev = adev->dev, 2925 }; 2926 long timeout; 2927 int r, i; 2928 2929 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2930 struct amdgpu_ring *ring = adev->rings[i]; 2931 2932 /* No need to setup the GPU scheduler for rings that don't need it */ 2933 if (!ring || ring->no_scheduler) 2934 continue; 2935 2936 switch (ring->funcs->type) { 2937 case AMDGPU_RING_TYPE_GFX: 2938 timeout = adev->gfx_timeout; 2939 break; 2940 case AMDGPU_RING_TYPE_COMPUTE: 2941 timeout = adev->compute_timeout; 2942 break; 2943 case AMDGPU_RING_TYPE_SDMA: 2944 timeout = adev->sdma_timeout; 2945 break; 2946 default: 2947 timeout = adev->video_timeout; 2948 break; 2949 } 2950 2951 args.timeout = timeout; 2952 args.credit_limit = ring->num_hw_submission; 2953 args.score = ring->sched_score; 2954 args.name = ring->name; 2955 2956 r = drm_sched_init(&ring->sched, &args); 2957 if (r) { 2958 dev_err(adev->dev, 2959 "Failed to create scheduler on ring %s.\n", 2960 ring->name); 2961 return r; 2962 } 2963 r = amdgpu_uvd_entity_init(adev, ring); 2964 if (r) { 2965 dev_err(adev->dev, 2966 "Failed to create UVD scheduling entity on ring %s.\n", 2967 ring->name); 2968 return r; 2969 } 2970 r = amdgpu_vce_entity_init(adev, ring); 2971 if (r) { 2972 dev_err(adev->dev, 2973 "Failed to create VCE scheduling entity on ring %s.\n", 2974 ring->name); 2975 return r; 2976 } 2977 } 2978 2979 if (adev->xcp_mgr) 2980 amdgpu_xcp_update_partition_sched_list(adev); 2981 2982 return 0; 2983 } 2984 2985 2986 /** 2987 * amdgpu_device_ip_init - run init for hardware IPs 2988 * 2989 * @adev: amdgpu_device pointer 2990 * 2991 * Main initialization pass for hardware IPs. 
The list of all the hardware
 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
 * are run.  sw_init initializes the software state associated with each IP
 * and hw_init initializes the hardware associated with each IP.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_init(struct amdgpu_device *adev)
{
	bool init_badpage;
	int i, r;

	r = amdgpu_ras_init(adev);
	if (r)
		return r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->sw_init) {
			r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]);
			if (r) {
				dev_err(adev->dev,
					"sw_init of IP block <%s> failed %d\n",
					adev->ip_blocks[i].version->funcs->name,
					r);
				goto init_failed;
			}
		}
		adev->ip_blocks[i].status.sw = true;

		if (!amdgpu_ip_member_of_hwini(
			    adev, adev->ip_blocks[i].version->type))
			continue;

		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
			/* need to do common hw init early so everything is set up for gmc */
			r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
			if (r) {
				dev_err(adev->dev, "hw_init %d failed %d\n", i,
					r);
				goto init_failed;
			}
			adev->ip_blocks[i].status.hw = true;
		} else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
			/* need to do gmc hw init early so we can allocate gpu mem */
			/* Try to reserve bad pages early */
			if (amdgpu_sriov_vf(adev))
				amdgpu_virt_exchange_data(adev);

			r = amdgpu_device_mem_scratch_init(adev);
			if (r) {
				dev_err(adev->dev,
					"amdgpu_mem_scratch_init failed %d\n",
					r);
				goto init_failed;
			}
			r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
			if (r) {
				dev_err(adev->dev, "hw_init %d failed %d\n", i,
					r);
				goto init_failed;
			}
			/* writeback slots need GPU memory, hence after GMC */
			r = amdgpu_device_wb_init(adev);
			if (r) {
				dev_err(adev->dev,
					"amdgpu_device_wb_init failed %d\n", r);
				goto init_failed;
			}
			adev->ip_blocks[i].status.hw = true;

			/* right after GMC hw init, we create CSA */
			if (adev->gfx.mcbp) {
				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
							       AMDGPU_GEM_DOMAIN_VRAM |
							       AMDGPU_GEM_DOMAIN_GTT,
							       AMDGPU_CSA_SIZE);
				if (r) {
					dev_err(adev->dev,
						"allocate CSA failed %d\n", r);
					goto init_failed;
				}
			}

			r = amdgpu_seq64_init(adev);
			if (r) {
				dev_err(adev->dev, "allocate seq64 failed %d\n",
					r);
				goto init_failed;
			}
		}
	}

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_init_data_exchange(adev);

	r = amdgpu_ib_pool_init(adev);
	if (r) {
		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
		goto init_failed;
	}

	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
	if (r)
		goto init_failed;

	r = amdgpu_device_ip_hw_init_phase1(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_ip_hw_init_phase2(adev);
	if (r)
		goto init_failed;

	/*
	 * retired pages will be loaded from eeprom and reserved here,
	 * it should be called after amdgpu_device_ip_hw_init_phase2 since
	 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
	 * for I2C communication which only true at this point.
	 *
	 * amdgpu_ras_recovery_init may fail, but the upper only cares the
	 * failure from bad gpu situation and stop amdgpu init process
	 * accordingly. For other failed cases, it will still release all
	 * the resource and print error message, rather than returning one
	 * negative value to upper level.
	 *
	 * Note: theoretically, this should be called before all vram allocations
	 * to protect retired page from abusing
	 */
	init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
	r = amdgpu_ras_recovery_init(adev, init_badpage);
	if (r)
		goto init_failed;

	/**
	 * In case of XGMI grab extra reference for reset domain for this device
	 */
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (amdgpu_xgmi_add_device(adev) == 0) {
			if (!amdgpu_sriov_vf(adev)) {
				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

				if (WARN_ON(!hive)) {
					r = -ENOENT;
					goto init_failed;
				}

				if (!hive->reset_domain ||
				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
					r = -ENOENT;
					amdgpu_put_xgmi_hive(hive);
					goto init_failed;
				}

				/* Drop the early temporary reset domain we created for device */
				amdgpu_reset_put_reset_domain(adev->reset_domain);
				adev->reset_domain = hive->reset_domain;
				amdgpu_put_xgmi_hive(hive);
			}
		}
	}

	r = amdgpu_device_init_schedulers(adev);
	if (r)
		goto init_failed;

	/* buffer funcs need a ready SDMA scheduler before use */
	if (adev->mman.buffer_funcs_ring &&
	    adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);

	/* Don't init kfd if whole hive need to be reset during init */
	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
		amdgpu_amdkfd_device_init(adev);
	}

	amdgpu_fru_get_product_info(adev);

	r = amdgpu_cper_init(adev);

init_failed:

	return r;
}

/**
 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
 *
 * @adev: amdgpu_device pointer
 *
 * Writes a reset magic value to the gart pointer in
VRAM. The driver calls
 * this function before a GPU reset. If the value is retained after a
 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
 */
static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
{
	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
}

/**
 * amdgpu_device_check_vram_lost - check if vram is valid
 *
 * @adev: amdgpu_device pointer
 *
 * Checks the reset magic value written to the gart pointer in VRAM.
 * The driver calls this after a GPU reset to see if the contents of
 * VRAM is lost or now.
 * returns true if vram is lost, false if not.
 */
static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
{
	/* magic mismatch means the VRAM-backed gart page was wiped */
	if (memcmp(adev->gart.ptr, adev->reset_magic,
		   AMDGPU_RESET_MAGIC_NUM))
		return true;

	if (!amdgpu_in_reset(adev))
		return false;

	/*
	 * For all ASICs with baco/mode1 reset, the VRAM is
	 * always assumed to be lost.
	 */
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_LEGACY:
	case AMD_RESET_METHOD_LINK:
	case AMD_RESET_METHOD_BACO:
	case AMD_RESET_METHOD_MODE1:
		return true;
	default:
		return false;
	}
}

/**
 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 * @state: clockgating state (gate or ungate)
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * set_clockgating_state callbacks are run.
 * Late initialization pass enabling clockgating for hardware IPs.
 * Fini or suspend, pass disabling clockgating for hardware IPs.
 * Returns 0 on success, negative error code on failure.
3238 */ 3239 3240 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3241 enum amd_clockgating_state state) 3242 { 3243 int i, j, r; 3244 3245 if (amdgpu_emu_mode == 1) 3246 return 0; 3247 3248 for (j = 0; j < adev->num_ip_blocks; j++) { 3249 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3250 if (!adev->ip_blocks[i].status.late_initialized) 3251 continue; 3252 /* skip CG for GFX, SDMA on S0ix */ 3253 if (adev->in_s0ix && 3254 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3255 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3256 continue; 3257 /* skip CG for VCE/UVD, it's handled specially */ 3258 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3259 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3260 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3261 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3262 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3263 /* enable clockgating to save power */ 3264 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3265 state); 3266 if (r) { 3267 dev_err(adev->dev, 3268 "set_clockgating_state(gate) of IP block <%s> failed %d\n", 3269 adev->ip_blocks[i].version->funcs->name, 3270 r); 3271 return r; 3272 } 3273 } 3274 } 3275 3276 return 0; 3277 } 3278 3279 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3280 enum amd_powergating_state state) 3281 { 3282 int i, j, r; 3283 3284 if (amdgpu_emu_mode == 1) 3285 return 0; 3286 3287 for (j = 0; j < adev->num_ip_blocks; j++) { 3288 i = state == AMD_PG_STATE_GATE ? 
			j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip PG for GFX, SDMA on S0ix */
		if (adev->in_s0ix &&
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
			continue;
		/* skip PG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
			/* enable powergating to save power */
			r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i],
										     state);
			if (r) {
				dev_err(adev->dev,
					"set_powergating_state(gate) of IP block <%s> failed %d\n",
					adev->ip_blocks[i].version->funcs->name,
					r);
				return r;
			}
		}
	}
	return 0;
}

static int amdgpu_device_enable_mgpu_fan_boost(void)
{
	struct amdgpu_gpu_instance *gpu_ins;
	struct amdgpu_device *adev;
	int i, ret = 0;

	mutex_lock(&mgpu_info.mutex);

	/*
	 * MGPU fan boost feature should be enabled
	 * only when there are two or more dGPUs in
	 * the system
	 */
	if (mgpu_info.num_dgpu < 2)
		goto out;

	for (i = 0; i < mgpu_info.num_dgpu; i++) {
		gpu_ins = &(mgpu_info.gpu_ins[i]);
		adev = gpu_ins->adev;
		/* APUs and multi-VF SR-IOV instances are excluded */
		if (!(adev->flags & AMD_IS_APU || amdgpu_sriov_multi_vf_mode(adev)) &&
		    !gpu_ins->mgpu_fan_enabled) {
			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
			if (ret)
				break;

			gpu_ins->mgpu_fan_enabled = 1;
		}
	}

out:
	mutex_unlock(&mgpu_info.mutex);

	return ret;
}

/**
 * amdgpu_device_ip_late_init - run late init for hardware IPs
 *
 *
@adev: amdgpu_device pointer 3356 * 3357 * Late initialization pass for hardware IPs. The list of all the hardware 3358 * IPs that make up the asic is walked and the late_init callbacks are run. 3359 * late_init covers any special initialization that an IP requires 3360 * after all of the have been initialized or something that needs to happen 3361 * late in the init process. 3362 * Returns 0 on success, negative error code on failure. 3363 */ 3364 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3365 { 3366 struct amdgpu_gpu_instance *gpu_instance; 3367 int i = 0, r; 3368 3369 for (i = 0; i < adev->num_ip_blocks; i++) { 3370 if (!adev->ip_blocks[i].status.hw) 3371 continue; 3372 if (adev->ip_blocks[i].version->funcs->late_init) { 3373 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3374 if (r) { 3375 dev_err(adev->dev, 3376 "late_init of IP block <%s> failed %d\n", 3377 adev->ip_blocks[i].version->funcs->name, 3378 r); 3379 return r; 3380 } 3381 } 3382 adev->ip_blocks[i].status.late_initialized = true; 3383 } 3384 3385 r = amdgpu_ras_late_init(adev); 3386 if (r) { 3387 dev_err(adev->dev, "amdgpu_ras_late_init failed %d", r); 3388 return r; 3389 } 3390 3391 if (!amdgpu_reset_in_recovery(adev)) 3392 amdgpu_ras_set_error_query_ready(adev, true); 3393 3394 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3395 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3396 3397 amdgpu_device_fill_reset_magic(adev); 3398 3399 r = amdgpu_device_enable_mgpu_fan_boost(); 3400 if (r) 3401 dev_err(adev->dev, "enable mgpu fan boost failed (%d).\n", r); 3402 3403 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ 3404 if (amdgpu_passthrough(adev) && 3405 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3406 adev->asic_type == CHIP_ALDEBARAN)) 3407 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3408 3409 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3410 
mutex_lock(&mgpu_info.mutex); 3411 3412 /* 3413 * Reset device p-state to low as this was booted with high. 3414 * 3415 * This should be performed only after all devices from the same 3416 * hive get initialized. 3417 * 3418 * However, it's unknown how many device in the hive in advance. 3419 * As this is counted one by one during devices initializations. 3420 * 3421 * So, we wait for all XGMI interlinked devices initialized. 3422 * This may bring some delays as those devices may come from 3423 * different hives. But that should be OK. 3424 */ 3425 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3426 for (i = 0; i < mgpu_info.num_gpu; i++) { 3427 gpu_instance = &(mgpu_info.gpu_ins[i]); 3428 if (gpu_instance->adev->flags & AMD_IS_APU) 3429 continue; 3430 3431 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3432 AMDGPU_XGMI_PSTATE_MIN); 3433 if (r) { 3434 dev_err(adev->dev, 3435 "pstate setting failed (%d).\n", 3436 r); 3437 break; 3438 } 3439 } 3440 } 3441 3442 mutex_unlock(&mgpu_info.mutex); 3443 } 3444 3445 return 0; 3446 } 3447 3448 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3449 { 3450 struct amdgpu_device *adev = ip_block->adev; 3451 int r; 3452 3453 if (!ip_block->version->funcs->hw_fini) { 3454 dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n", 3455 ip_block->version->funcs->name); 3456 } else { 3457 r = ip_block->version->funcs->hw_fini(ip_block); 3458 /* XXX handle errors */ 3459 if (r) { 3460 dev_dbg(adev->dev, 3461 "hw_fini of IP block <%s> failed %d\n", 3462 ip_block->version->funcs->name, r); 3463 } 3464 } 3465 3466 ip_block->status.hw = false; 3467 } 3468 3469 /** 3470 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3471 * 3472 * @adev: amdgpu_device pointer 3473 * 3474 * For ASICs need to disable SMC first 3475 */ 3476 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3477 { 3478 int i; 3479 3480 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3481 return; 3482 
3483 for (i = 0; i < adev->num_ip_blocks; i++) { 3484 if (!adev->ip_blocks[i].status.hw) 3485 continue; 3486 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3487 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3488 break; 3489 } 3490 } 3491 } 3492 3493 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3494 { 3495 int i, r; 3496 3497 for (i = 0; i < adev->num_ip_blocks; i++) { 3498 if (!adev->ip_blocks[i].version->funcs->early_fini) 3499 continue; 3500 3501 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3502 if (r) { 3503 dev_dbg(adev->dev, 3504 "early_fini of IP block <%s> failed %d\n", 3505 adev->ip_blocks[i].version->funcs->name, r); 3506 } 3507 } 3508 3509 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3510 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3511 3512 amdgpu_amdkfd_suspend(adev, true); 3513 amdgpu_userq_suspend(adev); 3514 3515 /* Workaround for ASICs need to disable SMC first */ 3516 amdgpu_device_smu_fini_early(adev); 3517 3518 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3519 if (!adev->ip_blocks[i].status.hw) 3520 continue; 3521 3522 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3523 } 3524 3525 if (amdgpu_sriov_vf(adev)) { 3526 if (amdgpu_virt_release_full_gpu(adev, false)) 3527 dev_err(adev->dev, 3528 "failed to release exclusive mode on fini\n"); 3529 } 3530 3531 /* 3532 * Driver reload on the APU can fail due to firmware validation because 3533 * the PSP is always running, as it is shared across the whole SoC. 3534 * This same issue does not occur on dGPU because it has a mechanism 3535 * that checks whether the PSP is running. A solution for those issues 3536 * in the APU is to trigger a GPU reset, but this should be done during 3537 * the unload phase to avoid adding boot latency and screen flicker. 
3538 */ 3539 if ((adev->flags & AMD_IS_APU) && !adev->gmc.is_app_apu) { 3540 r = amdgpu_asic_reset(adev); 3541 if (r) 3542 dev_err(adev->dev, "asic reset on %s failed\n", __func__); 3543 } 3544 3545 return 0; 3546 } 3547 3548 /** 3549 * amdgpu_device_ip_fini - run fini for hardware IPs 3550 * 3551 * @adev: amdgpu_device pointer 3552 * 3553 * Main teardown pass for hardware IPs. The list of all the hardware 3554 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3555 * are run. hw_fini tears down the hardware associated with each IP 3556 * and sw_fini tears down any software state associated with each IP. 3557 * Returns 0 on success, negative error code on failure. 3558 */ 3559 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3560 { 3561 int i, r; 3562 3563 amdgpu_cper_fini(adev); 3564 3565 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3566 amdgpu_virt_release_ras_err_handler_data(adev); 3567 3568 if (adev->gmc.xgmi.num_physical_nodes > 1) 3569 amdgpu_xgmi_remove_device(adev); 3570 3571 amdgpu_amdkfd_device_fini_sw(adev); 3572 3573 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3574 if (!adev->ip_blocks[i].status.sw) 3575 continue; 3576 3577 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3578 amdgpu_ucode_free_bo(adev); 3579 amdgpu_free_static_csa(&adev->virt.csa_obj); 3580 amdgpu_device_wb_fini(adev); 3581 amdgpu_device_mem_scratch_fini(adev); 3582 amdgpu_ib_pool_fini(adev); 3583 amdgpu_seq64_fini(adev); 3584 amdgpu_doorbell_fini(adev); 3585 } 3586 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3587 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3588 /* XXX handle errors */ 3589 if (r) { 3590 dev_dbg(adev->dev, 3591 "sw_fini of IP block <%s> failed %d\n", 3592 adev->ip_blocks[i].version->funcs->name, 3593 r); 3594 } 3595 } 3596 adev->ip_blocks[i].status.sw = false; 3597 adev->ip_blocks[i].status.valid = false; 3598 } 3599 3600 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 
3601 if (!adev->ip_blocks[i].status.late_initialized) 3602 continue; 3603 if (adev->ip_blocks[i].version->funcs->late_fini) 3604 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3605 adev->ip_blocks[i].status.late_initialized = false; 3606 } 3607 3608 amdgpu_ras_fini(adev); 3609 amdgpu_uid_fini(adev); 3610 3611 return 0; 3612 } 3613 3614 /** 3615 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3616 * 3617 * @work: work_struct. 3618 */ 3619 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3620 { 3621 struct amdgpu_device *adev = 3622 container_of(work, struct amdgpu_device, delayed_init_work.work); 3623 int r; 3624 3625 r = amdgpu_ib_ring_tests(adev); 3626 if (r) 3627 dev_err(adev->dev, "ib ring test failed (%d).\n", r); 3628 } 3629 3630 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3631 { 3632 struct amdgpu_device *adev = 3633 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3634 3635 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3636 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3637 3638 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3639 adev->gfx.gfx_off_state = true; 3640 } 3641 3642 /** 3643 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3644 * 3645 * @adev: amdgpu_device pointer 3646 * 3647 * Main suspend function for hardware IPs. The list of all the hardware 3648 * IPs that make up the asic is walked, clockgating is disabled and the 3649 * suspend callbacks are run. suspend puts the hardware and software state 3650 * in each IP into a state suitable for suspend. 3651 * Returns 0 on success, negative error code on failure. 
3652 */ 3653 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3654 { 3655 int i, r, rec; 3656 3657 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3658 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3659 3660 /* 3661 * Per PMFW team's suggestion, driver needs to handle gfxoff 3662 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3663 * scenario. Add the missing df cstate disablement here. 3664 */ 3665 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3666 dev_warn(adev->dev, "Failed to disallow df cstate"); 3667 3668 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3669 if (!adev->ip_blocks[i].status.valid) 3670 continue; 3671 3672 /* displays are handled separately */ 3673 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3674 continue; 3675 3676 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3677 if (r) 3678 goto unwind; 3679 } 3680 3681 return 0; 3682 unwind: 3683 rec = amdgpu_device_ip_resume_phase3(adev); 3684 if (rec) 3685 dev_err(adev->dev, 3686 "amdgpu_device_ip_resume_phase3 failed during unwind: %d\n", 3687 rec); 3688 3689 amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW); 3690 3691 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3692 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3693 3694 return r; 3695 } 3696 3697 /** 3698 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3699 * 3700 * @adev: amdgpu_device pointer 3701 * 3702 * Main suspend function for hardware IPs. The list of all the hardware 3703 * IPs that make up the asic is walked, clockgating is disabled and the 3704 * suspend callbacks are run. suspend puts the hardware and software state 3705 * in each IP into a state suitable for suspend. 3706 * Returns 0 on success, negative error code on failure. 
3707 */ 3708 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3709 { 3710 int i, r, rec; 3711 3712 if (adev->in_s0ix) 3713 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3714 3715 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3716 if (!adev->ip_blocks[i].status.valid) 3717 continue; 3718 /* displays are handled in phase1 */ 3719 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3720 continue; 3721 /* PSP lost connection when err_event_athub occurs */ 3722 if (amdgpu_ras_intr_triggered() && 3723 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3724 adev->ip_blocks[i].status.hw = false; 3725 continue; 3726 } 3727 3728 /* skip unnecessary suspend if we do not initialize them yet */ 3729 if (!amdgpu_ip_member_of_hwini( 3730 adev, adev->ip_blocks[i].version->type)) 3731 continue; 3732 3733 /* Since we skip suspend for S0i3, we need to cancel the delayed 3734 * idle work here as the suspend callback never gets called. 3735 */ 3736 if (adev->in_s0ix && 3737 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3738 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3739 cancel_delayed_work_sync(&adev->gfx.idle_work); 3740 /* skip suspend of gfx/mes and psp for S0ix 3741 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3742 * like at runtime. PSP is also part of the always on hardware 3743 * so no need to suspend it. 
3744 */ 3745 if (adev->in_s0ix && 3746 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3747 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3748 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3749 continue; 3750 3751 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3752 if (adev->in_s0ix && 3753 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3754 IP_VERSION(5, 0, 0)) && 3755 (adev->ip_blocks[i].version->type == 3756 AMD_IP_BLOCK_TYPE_SDMA)) 3757 continue; 3758 3759 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3760 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3761 * from this location and RLC Autoload automatically also gets loaded 3762 * from here based on PMFW -> PSP message during re-init sequence. 3763 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3764 * the TMR and reload FWs again for IMU enabled APU ASICs. 3765 */ 3766 if (amdgpu_in_reset(adev) && 3767 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3768 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3769 continue; 3770 3771 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3772 if (r) 3773 goto unwind; 3774 3775 /* handle putting the SMC in the appropriate state */ 3776 if (!amdgpu_sriov_vf(adev)) { 3777 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3778 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3779 if (r) { 3780 dev_err(adev->dev, 3781 "SMC failed to set mp1 state %d, %d\n", 3782 adev->mp1_state, r); 3783 goto unwind; 3784 } 3785 } 3786 } 3787 } 3788 3789 return 0; 3790 unwind: 3791 /* suspend phase 2 = resume phase 1 + resume phase 2 */ 3792 rec = amdgpu_device_ip_resume_phase1(adev); 3793 if (rec) { 3794 dev_err(adev->dev, 3795 "amdgpu_device_ip_resume_phase1 failed during unwind: %d\n", 3796 rec); 3797 return r; 3798 } 3799 3800 rec = amdgpu_device_fw_loading(adev); 3801 if (rec) { 3802 dev_err(adev->dev, 3803 
"amdgpu_device_fw_loading failed during unwind: %d\n", 3804 rec); 3805 return r; 3806 } 3807 3808 rec = amdgpu_device_ip_resume_phase2(adev); 3809 if (rec) { 3810 dev_err(adev->dev, 3811 "amdgpu_device_ip_resume_phase2 failed during unwind: %d\n", 3812 rec); 3813 return r; 3814 } 3815 3816 return r; 3817 } 3818 3819 /** 3820 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3821 * 3822 * @adev: amdgpu_device pointer 3823 * 3824 * Main suspend function for hardware IPs. The list of all the hardware 3825 * IPs that make up the asic is walked, clockgating is disabled and the 3826 * suspend callbacks are run. suspend puts the hardware and software state 3827 * in each IP into a state suitable for suspend. 3828 * Returns 0 on success, negative error code on failure. 3829 */ 3830 static int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3831 { 3832 int r; 3833 3834 if (amdgpu_sriov_vf(adev)) { 3835 amdgpu_virt_fini_data_exchange(adev); 3836 amdgpu_virt_request_full_gpu(adev, false); 3837 } 3838 3839 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3840 3841 r = amdgpu_device_ip_suspend_phase1(adev); 3842 if (r) 3843 return r; 3844 r = amdgpu_device_ip_suspend_phase2(adev); 3845 3846 if (amdgpu_sriov_vf(adev)) 3847 amdgpu_virt_release_full_gpu(adev, false); 3848 3849 return r; 3850 } 3851 3852 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3853 { 3854 int i, r; 3855 3856 static enum amd_ip_block_type ip_order[] = { 3857 AMD_IP_BLOCK_TYPE_COMMON, 3858 AMD_IP_BLOCK_TYPE_GMC, 3859 AMD_IP_BLOCK_TYPE_PSP, 3860 AMD_IP_BLOCK_TYPE_IH, 3861 }; 3862 3863 for (i = 0; i < adev->num_ip_blocks; i++) { 3864 int j; 3865 struct amdgpu_ip_block *block; 3866 3867 block = &adev->ip_blocks[i]; 3868 block->status.hw = false; 3869 3870 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3871 3872 if (block->version->type != ip_order[j] || 3873 !block->status.valid) 3874 continue; 3875 3876 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3877 if (r) { 
3878 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3879 block->version->funcs->name); 3880 return r; 3881 } 3882 block->status.hw = true; 3883 } 3884 } 3885 3886 return 0; 3887 } 3888 3889 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3890 { 3891 struct amdgpu_ip_block *block; 3892 int i, r = 0; 3893 3894 static enum amd_ip_block_type ip_order[] = { 3895 AMD_IP_BLOCK_TYPE_SMC, 3896 AMD_IP_BLOCK_TYPE_DCE, 3897 AMD_IP_BLOCK_TYPE_GFX, 3898 AMD_IP_BLOCK_TYPE_SDMA, 3899 AMD_IP_BLOCK_TYPE_MES, 3900 AMD_IP_BLOCK_TYPE_UVD, 3901 AMD_IP_BLOCK_TYPE_VCE, 3902 AMD_IP_BLOCK_TYPE_VCN, 3903 AMD_IP_BLOCK_TYPE_JPEG 3904 }; 3905 3906 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3907 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3908 3909 if (!block) 3910 continue; 3911 3912 if (block->status.valid && !block->status.hw) { 3913 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3914 r = amdgpu_ip_block_resume(block); 3915 } else { 3916 r = block->version->funcs->hw_init(block); 3917 } 3918 3919 if (r) { 3920 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3921 block->version->funcs->name); 3922 break; 3923 } 3924 block->status.hw = true; 3925 } 3926 } 3927 3928 return r; 3929 } 3930 3931 /** 3932 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3933 * 3934 * @adev: amdgpu_device pointer 3935 * 3936 * First resume function for hardware IPs. The list of all the hardware 3937 * IPs that make up the asic is walked and the resume callbacks are run for 3938 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3939 * after a suspend and updates the software state as necessary. This 3940 * function is also used for restoring the GPU after a GPU reset. 3941 * Returns 0 on success, negative error code on failure. 
3942 */ 3943 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3944 { 3945 int i, r; 3946 3947 for (i = 0; i < adev->num_ip_blocks; i++) { 3948 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3949 continue; 3950 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3951 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3952 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3953 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3954 3955 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3956 if (r) 3957 return r; 3958 } 3959 } 3960 3961 return 0; 3962 } 3963 3964 /** 3965 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3966 * 3967 * @adev: amdgpu_device pointer 3968 * 3969 * Second resume function for hardware IPs. The list of all the hardware 3970 * IPs that make up the asic is walked and the resume callbacks are run for 3971 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3972 * functional state after a suspend and updates the software state as 3973 * necessary. This function is also used for restoring the GPU after a GPU 3974 * reset. 3975 * Returns 0 on success, negative error code on failure. 
 */
static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		/* COMMON/GMC/IH are resumed in phase1 and DCE in phase3;
		 * PSP is skipped here too (resumed elsewhere — see phase1
		 * for the SR-IOV case).
		 */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
			continue;
		r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
		if (r)
			return r;
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Third resume function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the resume callbacks are run for
 * all DCE. resume puts the hardware into a functional state after a suspend
 * and updates the software state as necessary. This function is also used
 * for restoring the GPU after a GPU reset.
 *
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		/* displays only in this phase */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
			r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs.
The hardware IPs 4034 * are split into two resume functions because they are 4035 * also used in recovering from a GPU reset and some additional 4036 * steps need to be take between them. In this case (S3/S4) they are 4037 * run sequentially. 4038 * Returns 0 on success, negative error code on failure. 4039 */ 4040 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 4041 { 4042 int r; 4043 4044 r = amdgpu_device_ip_resume_phase1(adev); 4045 if (r) 4046 return r; 4047 4048 r = amdgpu_device_fw_loading(adev); 4049 if (r) 4050 return r; 4051 4052 r = amdgpu_device_ip_resume_phase2(adev); 4053 4054 if (adev->mman.buffer_funcs_ring->sched.ready) 4055 amdgpu_ttm_set_buffer_funcs_status(adev, true); 4056 4057 if (r) 4058 return r; 4059 4060 amdgpu_fence_driver_hw_init(adev); 4061 4062 r = amdgpu_device_ip_resume_phase3(adev); 4063 4064 return r; 4065 } 4066 4067 /** 4068 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 4069 * 4070 * @adev: amdgpu_device pointer 4071 * 4072 * Query the VBIOS data tables to determine if the board supports SR-IOV. 4073 */ 4074 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 4075 { 4076 if (amdgpu_sriov_vf(adev)) { 4077 if (adev->is_atom_fw) { 4078 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 4079 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4080 } else { 4081 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 4082 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4083 } 4084 4085 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 4086 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 4087 } 4088 } 4089 4090 /** 4091 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 4092 * 4093 * @pdev : pci device context 4094 * @asic_type: AMD asic type 4095 * 4096 * Check if there is DC (new modesetting infrastructre) support for an asic. 4097 * returns true if DC has support, false if not. 
4098 */ 4099 bool amdgpu_device_asic_has_dc_support(struct pci_dev *pdev, 4100 enum amd_asic_type asic_type) 4101 { 4102 switch (asic_type) { 4103 #ifdef CONFIG_DRM_AMDGPU_SI 4104 case CHIP_HAINAN: 4105 #endif 4106 case CHIP_TOPAZ: 4107 /* chips with no display hardware */ 4108 return false; 4109 #if defined(CONFIG_DRM_AMD_DC) 4110 case CHIP_TAHITI: 4111 case CHIP_PITCAIRN: 4112 case CHIP_VERDE: 4113 case CHIP_OLAND: 4114 return amdgpu_dc != 0 && IS_ENABLED(CONFIG_DRM_AMD_DC_SI); 4115 case CHIP_KAVERI: 4116 case CHIP_KABINI: 4117 case CHIP_MULLINS: 4118 /* 4119 * We have systems in the wild with these ASICs that require 4120 * TRAVIS and NUTMEG support which is not supported with DC. 4121 * 4122 * Fallback to the non-DC driver here by default so as not to 4123 * cause regressions. 4124 */ 4125 return amdgpu_dc > 0; 4126 default: 4127 return amdgpu_dc != 0; 4128 #else 4129 default: 4130 if (amdgpu_dc > 0) 4131 dev_info_once( 4132 &pdev->dev, 4133 "Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4134 return false; 4135 #endif 4136 } 4137 } 4138 4139 /** 4140 * amdgpu_device_has_dc_support - check if dc is supported 4141 * 4142 * @adev: amdgpu_device pointer 4143 * 4144 * Returns true for supported, false for not supported 4145 */ 4146 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4147 { 4148 if (adev->enable_virtual_display || 4149 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4150 return false; 4151 4152 return amdgpu_device_asic_has_dc_support(adev->pdev, adev->asic_type); 4153 } 4154 4155 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4156 { 4157 struct amdgpu_device *adev = 4158 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4159 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4160 4161 /* It's a bug to not have a hive within this function */ 4162 if (WARN_ON(!hive)) 4163 return; 4164 4165 /* 4166 * Use task barrier to synchronize all xgmi reset 
works across the 4167 * hive. task_barrier_enter and task_barrier_exit will block 4168 * until all the threads running the xgmi reset works reach 4169 * those points. task_barrier_full will do both blocks. 4170 */ 4171 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4172 4173 task_barrier_enter(&hive->tb); 4174 adev->asic_reset_res = amdgpu_device_baco_enter(adev); 4175 4176 if (adev->asic_reset_res) 4177 goto fail; 4178 4179 task_barrier_exit(&hive->tb); 4180 adev->asic_reset_res = amdgpu_device_baco_exit(adev); 4181 4182 if (adev->asic_reset_res) 4183 goto fail; 4184 4185 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4186 } else { 4187 4188 task_barrier_full(&hive->tb); 4189 adev->asic_reset_res = amdgpu_asic_reset(adev); 4190 } 4191 4192 fail: 4193 if (adev->asic_reset_res) 4194 dev_warn(adev->dev, 4195 "ASIC reset failed with error, %d for drm dev, %s", 4196 adev->asic_reset_res, adev_to_drm(adev)->unique); 4197 amdgpu_put_xgmi_hive(hive); 4198 } 4199 4200 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4201 { 4202 char *input = amdgpu_lockup_timeout; 4203 char *timeout_setting = NULL; 4204 int index = 0; 4205 long timeout; 4206 int ret = 0; 4207 4208 /* By default timeout for all queues is 2 sec */ 4209 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = 4210 adev->video_timeout = msecs_to_jiffies(2000); 4211 4212 if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) 4213 return 0; 4214 4215 while ((timeout_setting = strsep(&input, ",")) && 4216 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4217 ret = kstrtol(timeout_setting, 0, &timeout); 4218 if (ret) 4219 return ret; 4220 4221 if (timeout == 0) { 4222 index++; 4223 continue; 4224 } else if (timeout < 0) { 4225 timeout = MAX_SCHEDULE_TIMEOUT; 4226 dev_warn(adev->dev, "lockup timeout disabled"); 4227 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4228 } else { 4229 timeout = msecs_to_jiffies(timeout); 4230 } 4231 4232 
		switch (index++) {
		case 0:
			adev->gfx_timeout = timeout;
			break;
		case 1:
			adev->compute_timeout = timeout;
			break;
		case 2:
			adev->sdma_timeout = timeout;
			break;
		case 3:
			adev->video_timeout = timeout;
			break;
		default:
			/* extra values beyond the four queues are ignored */
			break;
		}
	}

	/* When only one value specified apply it to all queues. */
	if (index == 1)
		adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
			adev->video_timeout = timeout;

	return ret;
}

/**
 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
 *
 * @adev: amdgpu_device pointer
 *
 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode
 */
static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
{
	struct iommu_domain *domain;

	domain = iommu_get_domain_for_dev(adev->dev);
	/* no IOMMU, or an identity (pass-through) domain: DMA addresses
	 * equal physical addresses
	 */
	if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
		adev->ram_is_direct_mapped = true;
}

#if defined(CONFIG_HSA_AMD_P2P)
/**
 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
 *
 * @adev: amdgpu_device pointer
 *
 * return if IOMMU remapping bar address
 */
static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
{
	struct iommu_domain *domain;

	domain = iommu_get_domain_for_dev(adev->dev);
	if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
		domain->type == IOMMU_DOMAIN_DMA_FQ))
		return true;

	return false;
}
#endif

/* Resolve the effective mid-command-buffer preemption (MCBP) setting:
 * an explicit module parameter (0/1) wins, and SRIOV VFs always force
 * MCBP on regardless of the parameter.
 */
static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
{
	if (amdgpu_mcbp == 1)
		adev->gfx.mcbp = true;
	else if (amdgpu_mcbp == 0)
		adev->gfx.mcbp = false;

	if (amdgpu_sriov_vf(adev))
		adev->gfx.mcbp = true;

	if (adev->gfx.mcbp)
		dev_info(adev->dev, "MCBP is enabled\n");
}

/* Register the device's sysfs interfaces.  Individual registration
 * failures are only logged (best effort); the value returned is that of
 * the last fallible registration.
 */
static int amdgpu_device_sys_interface_init(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_atombios_sysfs_init(adev);
	if (r)
		drm_err(&adev->ddev,
			"registering atombios sysfs failed (%d).\n", r);

	r = amdgpu_pm_sysfs_init(adev);
	if (r)
		dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r);

	r = amdgpu_ucode_sysfs_init(adev);
	if (r) {
		adev->ucode_sysfs_en = false;
		dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r);
	} else
		adev->ucode_sysfs_en = true;

	r = amdgpu_device_attr_sysfs_init(adev);
	if (r)
		dev_err(adev->dev, "Could not create amdgpu device attr\n");

	r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
	if (r)
		dev_err(adev->dev,
			"Could not create amdgpu board attributes\n");

	amdgpu_fru_sysfs_init(adev);
	amdgpu_reg_state_sysfs_init(adev);
	amdgpu_xcp_sysfs_init(adev);
	amdgpu_uma_sysfs_init(adev);

	return r;
}

/* Tear down the sysfs interfaces registered by
 * amdgpu_device_sys_interface_init(), honouring the per-interface
 * "was it actually created" flags.
 */
static void amdgpu_device_sys_interface_fini(struct amdgpu_device *adev)
{
	if (adev->pm.sysfs_initialized)
		amdgpu_pm_sysfs_fini(adev);
	if (adev->ucode_sysfs_en)
		amdgpu_ucode_sysfs_fini(adev);
	amdgpu_device_attr_sysfs_fini(adev);
	amdgpu_fru_sysfs_fini(adev);

	amdgpu_reg_state_sysfs_fini(adev);
	amdgpu_xcp_sysfs_fini(adev);
	amdgpu_uma_sysfs_fini(adev);
}

/**
 * amdgpu_device_init - initialize the driver
 *
 * @adev: amdgpu_device pointer
 * @flags: driver flags
 *
 * Initializes the driver info and hw (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver startup.
 */
int amdgpu_device_init(struct amdgpu_device *adev,
		       uint32_t flags)
{
	struct pci_dev *pdev = adev->pdev;
	int r, i;
	bool px = false;
	u32 max_MBps;
	int tmp;

	adev->shutdown = false;
	adev->flags = flags;

	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
		adev->asic_type = amdgpu_force_asic_type;
	else
		adev->asic_type = flags & AMD_ASIC_MASK;

	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
	if (amdgpu_emu_mode == 1)
		adev->usec_timeout *= 10;
	adev->gmc.gart_size = 512 * 1024 * 1024;
	adev->accel_working = false;
	adev->num_rings = 0;
	RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
	adev->mman.buffer_funcs = NULL;
	adev->mman.buffer_funcs_ring = NULL;
	adev->vm_manager.vm_pte_funcs = NULL;
	adev->vm_manager.vm_pte_num_scheds = 0;
	adev->gmc.gmc_funcs = NULL;
	adev->harvest_ip_mask = 0x0;
	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);

	/* Register accessors start out pointing at the "invalid" stubs;
	 * the real implementations are installed during IP/ASIC init.
	 */
	adev->smc_rreg = &amdgpu_invalid_rreg;
	adev->smc_wreg = &amdgpu_invalid_wreg;
	adev->pcie_rreg = &amdgpu_invalid_rreg;
	adev->pcie_wreg = &amdgpu_invalid_wreg;
	adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
	adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
	adev->pciep_rreg = &amdgpu_invalid_rreg;
	adev->pciep_wreg = &amdgpu_invalid_wreg;
	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
	adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext;
	adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;
	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
	adev->didt_rreg = &amdgpu_invalid_rreg;
	adev->didt_wreg = &amdgpu_invalid_wreg;
	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;

	dev_info(
		adev->dev,
		"initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
		amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
		pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);

	/* mutex initialization are all done here so we
	 * can recall function without having locking issues
	 */
	mutex_init(&adev->firmware.mutex);
	mutex_init(&adev->pm.mutex);
	mutex_init(&adev->gfx.gpu_clock_mutex);
	mutex_init(&adev->srbm_mutex);
	mutex_init(&adev->gfx.pipe_reserve_mutex);
	mutex_init(&adev->gfx.gfx_off_mutex);
	mutex_init(&adev->gfx.partition_mutex);
	mutex_init(&adev->grbm_idx_mutex);
	mutex_init(&adev->mn_lock);
	mutex_init(&adev->virt.vf_errors.lock);
	hash_init(adev->mn_hash);
	mutex_init(&adev->psp.mutex);
	mutex_init(&adev->notifier_lock);
	mutex_init(&adev->pm.stable_pstate_ctx_lock);
	mutex_init(&adev->benchmark_mutex);
	mutex_init(&adev->gfx.reset_sem_mutex);
	/* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
	mutex_init(&adev->enforce_isolation_mutex);
	for (i = 0; i < MAX_XCP; ++i) {
		adev->isolation[i].spearhead = dma_fence_get_stub();
		amdgpu_sync_create(&adev->isolation[i].active);
		amdgpu_sync_create(&adev->isolation[i].prev);
	}
	mutex_init(&adev->gfx.userq_sch_mutex);
	mutex_init(&adev->gfx.workload_profile_mutex);
	mutex_init(&adev->vcn.workload_profile_mutex);

	amdgpu_device_init_apu_flags(adev);

	r = amdgpu_device_check_arguments(adev);
	if (r)
		return r;

	spin_lock_init(&adev->mmio_idx_lock);
	spin_lock_init(&adev->smc_idx_lock);
	spin_lock_init(&adev->pcie_idx_lock);
	spin_lock_init(&adev->uvd_ctx_idx_lock);
	spin_lock_init(&adev->didt_idx_lock);
	spin_lock_init(&adev->gc_cac_idx_lock);
	spin_lock_init(&adev->se_cac_idx_lock);
	spin_lock_init(&adev->audio_endpt_idx_lock);
	spin_lock_init(&adev->mm_stats.lock);
	spin_lock_init(&adev->virt.rlcg_reg_lock);
	spin_lock_init(&adev->wb.lock);

	xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ);

	INIT_LIST_HEAD(&adev->reset_list);

	INIT_LIST_HEAD(&adev->ras_list);

	INIT_LIST_HEAD(&adev->pm.od_kobj_list);

	xa_init(&adev->userq_doorbell_xa);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);
	/*
	 * Initialize the enforce_isolation work structures for each XCP
	 * partition. This work handler is responsible for enforcing shader
	 * isolation on AMD GPUs. It counts the number of emitted fences for
	 * each GFX and compute ring. If there are any fences, it schedules
	 * the `enforce_isolation_work` to be run after a delay. If there are
	 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
	 * runqueue.
	 */
	for (i = 0; i < MAX_XCP; i++) {
		INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
				  amdgpu_gfx_enforce_isolation_handler);
		adev->gfx.enforce_isolation[i].adev = adev;
		adev->gfx.enforce_isolation[i].xcp_id = i;
	}

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
	INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);

	adev->gfx.gfx_off_req_count = 1;
	adev->gfx.gfx_off_residency = 0;
	adev->gfx.gfx_off_entrycount = 0;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds(retelimited printk interval) + 1(waiting
	 * for throttling interrupt) = 60 seconds.
	 */
	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);

	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);

	/* Registers mapping */
	/* TODO: block userspace mapping of io register */
	/* BAR 5 holds the MMIO registers on BONAIRE and newer, BAR 2 on
	 * older ASICs.
	 */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (!adev->rmmio)
		return -ENOMEM;

	dev_info(adev->dev, "register mmio base: 0x%08X\n",
		 (uint32_t)adev->rmmio_base);
	dev_info(adev->dev, "register mmio size: %u\n",
		 (unsigned int)adev->rmmio_size);

	/*
	 * Reset domain needs to be
present early, before XGMI hive discovered
	 * (if any) and initialized to use reset sem and in_gpu reset flag
	 * early on during init and before calling to RREG32.
	 */
	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
	if (!adev->reset_domain)
		return -ENOMEM;

	/* detect hw virtualization here */
	amdgpu_virt_init(adev);

	amdgpu_device_get_pcie_info(adev);

	r = amdgpu_device_get_job_timeout_settings(adev);
	if (r) {
		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
		return r;
	}

	amdgpu_device_set_mcbp(adev);

	/*
	 * By default, use default mode where all blocks are expected to be
	 * initialized. At present a 'swinit' of blocks is required to be
	 * completed before the need for a different level is detected.
	 */
	amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
	/* early init functions */
	r = amdgpu_device_ip_early_init(adev);
	if (r)
		return r;

	/*
	 * No need to remove conflicting FBs for non-display class devices.
	 * This prevents the sysfb from being freed accidently.
	 */
	if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA ||
	    (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) {
		/* Get rid of things like offb */
		r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
		if (r)
			return r;
	}

	/* Enable TMZ based on IP_VERSION */
	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_sriov_vf(adev) &&
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
		/* VF MMIO access (except mailbox range) from CPU
		 * will be blocked during sriov runtime
		 */
		adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;

	amdgpu_gmc_noretry_set(adev);
	/* Need to get xgmi info early to decide the reset behavior*/
	if (adev->gmc.xgmi.supported) {
		r = adev->gfxhub.funcs->get_xgmi_info(adev);
		if (r)
			return r;
	}

	/* enable PCIE atomic ops */
	if (amdgpu_sriov_vf(adev)) {
		/* For VFs the host reports atomic support through the
		 * pf2vf exchange structure instead of the PCIe caps.
		 */
		if (adev->virt.fw_reserve.p_pf2vf)
			adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
				adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
				(PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	/* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a
	 * internal path natively support atomics, set have_atomics_support to true.
	 */
	} else if ((adev->flags & AMD_IS_APU) &&
		   (amdgpu_ip_version(adev, GC_HWIP, 0) >
		    IP_VERSION(9, 0, 0))) {
		adev->have_atomics_support = true;
	} else {
		adev->have_atomics_support =
			!pci_enable_atomic_ops_to_root(adev->pdev,
					PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
					PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	}

	if (!adev->have_atomics_support)
		dev_info(adev->dev, "PCIE atomic ops is not supported\n");

	/* doorbell bar mapping and doorbell index init*/
	amdgpu_doorbell_init(adev);

	if (amdgpu_emu_mode == 1) {
		/* post the asic on emulation mode */
		emu_soc_asic_init(adev);
		goto fence_driver_init;
	}

	amdgpu_reset_init(adev);

	/* detect if we are with an SRIOV vbios */
	if (adev->bios)
		amdgpu_device_detect_sriov_bios(adev);

	/* check if we need to reset the asic
	 * E.g., driver was not cleanly unloaded previously, etc.
	 */
	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
		if (adev->gmc.xgmi.num_physical_nodes) {
			dev_info(adev->dev, "Pending hive reset.\n");
			amdgpu_set_init_level(adev,
					      AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
		} else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
			   !amdgpu_device_has_display_hardware(adev)) {
			r = psp_gpu_reset(adev);
		} else {
			tmp = amdgpu_reset_method;
			/* It should do a default reset when loading or reloading the driver,
			 * regardless of the module parameter reset_method.
			 */
			amdgpu_reset_method = AMD_RESET_METHOD_NONE;
			r = amdgpu_asic_reset(adev);
			amdgpu_reset_method = tmp;
		}

		if (r) {
			dev_err(adev->dev, "asic reset on init failed\n");
			goto failed;
		}
	}

	/* Post card if necessary */
	if (amdgpu_device_need_post(adev)) {
		if (!adev->bios) {
			dev_err(adev->dev, "no vBIOS found\n");
			r = -EINVAL;
			goto failed;
		}
		dev_info(adev->dev, "GPU posting now...\n");
		r = amdgpu_device_asic_init(adev);
		if (r) {
			dev_err(adev->dev, "gpu post error!\n");
			goto failed;
		}
	}

	if (adev->bios) {
		if (adev->is_atom_fw) {
			/* Initialize clocks */
			r = amdgpu_atomfirmware_get_clock_info(adev);
			if (r) {
				dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
				goto failed;
			}
		} else {
			/* Initialize clocks */
			r = amdgpu_atombios_get_clock_info(adev);
			if (r) {
				dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
				goto failed;
			}
			/* init i2c buses */
			amdgpu_i2c_init(adev);
		}
	}

fence_driver_init:
	/* Fence driver */
	r = amdgpu_fence_driver_sw_init(adev);
	if (r) {
		dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
		goto failed;
	}

	/* init the mode config */
	drm_mode_config_init(adev_to_drm(adev));

	r = amdgpu_device_ip_init(adev);
	if (r) {
		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
		goto release_ras_con;
	}

	amdgpu_fence_driver_hw_init(adev);

	dev_info(adev->dev,
		 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
		 adev->gfx.config.max_shader_engines,
		 adev->gfx.config.max_sh_per_se,
		 adev->gfx.config.max_cu_per_sh,
		 adev->gfx.cu_info.number);

	adev->accel_working = true;

	amdgpu_vm_check_compute_bug(adev);

	/* Initialize the buffer migration limit. */
	if (amdgpu_moverate >= 0)
		max_MBps = amdgpu_moverate;
	else
		max_MBps = 8; /* Allow 8 MB/s. */
	/* Get a log2 for easy divisions. */
	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));

	/*
	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
	 * Otherwise the mgpu fan boost feature will be skipped due to the
	 * gpu instance is counted less.
	 */
	amdgpu_register_gpu_instance(adev);

	/* enable clockgating, etc. after ib tests, etc. since some blocks require
	 * explicit gating rather than handling it automatically.
	 */
	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
		r = amdgpu_device_ip_late_init(adev);
		if (r) {
			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
			goto release_ras_con;
		}
		/* must succeed. */
		amdgpu_ras_resume(adev);
		queue_delayed_work(system_wq, &adev->delayed_init_work,
				   msecs_to_jiffies(AMDGPU_RESUME_MS));
	}

	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_release_full_gpu(adev, true);
		flush_delayed_work(&adev->delayed_init_work);
	}

	/* Don't init kfd if whole hive need to be reset during init */
	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
		kgd2kfd_init_zone_device(adev);
		kfd_update_svm_support_properties(adev);
	}

	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		amdgpu_xgmi_reset_on_init(adev);

	/*
	 * Place those sysfs registering after `late_init`. As some of those
	 * operations performed in `late_init` might affect the sysfs
	 * interfaces creating.
	 */
	r = amdgpu_device_sys_interface_init(adev);

	/* NOTE(review): when CONFIG_PERF_EVENTS is disabled, a non-zero r
	 * left over from amdgpu_device_sys_interface_init() above is
	 * reported below as a pmu init failure - confirm this is intended.
	 */
	if (IS_ENABLED(CONFIG_PERF_EVENTS))
		r = amdgpu_pmu_init(adev);
	if (r)
		dev_err(adev->dev, "amdgpu_pmu_init failed\n");

	/* Have stored pci confspace at hand for restore in sudden PCI error */
	if (amdgpu_device_cache_pci_state(adev->pdev))
		pci_restore_state(pdev);

	/* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
	/* this will fail for cards that aren't VGA class devices, just
	 * ignore it
	 */
	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
		vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);

	px = amdgpu_device_supports_px(adev);

	if (px || (!dev_is_removable(&adev->pdev->dev) &&
		   apple_gmux_detect(NULL, NULL)))
		vga_switcheroo_register_client(adev->pdev,
					       &amdgpu_switcheroo_ops, px);

	if (px)
		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);

	amdgpu_device_check_iommu_direct_map(adev);

	adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
	r = register_pm_notifier(&adev->pm_nb);
	if (r)
		goto failed;

	return 0;

release_ras_con:
	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, true);

	/* failed in exclusive mode due to timeout */
	if (amdgpu_sriov_vf(adev) &&
	    !amdgpu_sriov_runtime(adev) &&
	    amdgpu_virt_mmio_blocked(adev) &&
	    !amdgpu_virt_wait_reset(adev)) {
		dev_err(adev->dev, "VF exclusive mode timeout\n");
		/* Don't send request since VF is inactive.
		 */
		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
		adev->virt.ops = NULL;
		r = -EAGAIN;
	}
	amdgpu_release_ras_context(adev);

failed:
	amdgpu_vf_error_trans_all(adev);

	return r;
}

/* Remove every CPU-visible mapping of the device: userspace mmaps,
 * doorbells, MMIO registers and the VRAM aperture.  Called when the drm
 * device has been unplugged.
 */
static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
{

	/* Clear all CPU mappings pointing to this device */
	unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);

	/* Unmap all mapped bars - Doorbell, registers and VRAM */
	amdgpu_doorbell_fini(adev);

	iounmap(adev->rmmio);
	adev->rmmio = NULL;
	if (adev->mman.aper_base_kaddr)
		iounmap(adev->mman.aper_base_kaddr);
	adev->mman.aper_base_kaddr = NULL;

	/* Memory manager related */
	if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
		arch_phys_wc_del(adev->gmc.vram_mtrr);
		arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
	}
}

/**
 * amdgpu_device_fini_hw - tear down the driver
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down the driver info (all asics).
 * Called at driver shutdown.
 */
void amdgpu_device_fini_hw(struct amdgpu_device *adev)
{
	dev_info(adev->dev, "finishing device.\n");
	flush_delayed_work(&adev->delayed_init_work);

	if (adev->mman.initialized)
		drain_workqueue(adev->mman.bdev.wq);
	adev->shutdown = true;

	unregister_pm_notifier(&adev->pm_nb);

	/* make sure IB test finished before entering exclusive mode
	 * to avoid preemption on IB test
	 */
	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_request_full_gpu(adev, false);
		amdgpu_virt_fini_data_exchange(adev);
	}

	/* disable all interrupts */
	amdgpu_irq_disable_all(adev);
	if (adev->mode_info.mode_config_initialized) {
		if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
			drm_helper_force_disable_all(adev_to_drm(adev));
		else
			drm_atomic_helper_shutdown(adev_to_drm(adev));
	}
	amdgpu_fence_driver_hw_fini(adev);

	amdgpu_device_sys_interface_fini(adev);

	/* disable ras feature must before hw fini */
	amdgpu_ras_pre_fini(adev);

	amdgpu_ttm_set_buffer_funcs_status(adev, false);

	amdgpu_device_ip_fini_early(adev);

	amdgpu_irq_fini_hw(adev);

	if (adev->mman.initialized)
		ttm_device_clear_dma_mappings(&adev->mman.bdev);

	amdgpu_gart_dummy_page_fini(adev);

	if (drm_dev_is_unplugged(adev_to_drm(adev)))
		amdgpu_device_unmap_mmio(adev);

}

/* Software-side teardown: releases the objects allocated by
 * amdgpu_device_init() (fences, sync objects, i2c buses, vbios data,
 * reset domain, cached PCI state, ...).
 */
void amdgpu_device_fini_sw(struct amdgpu_device *adev)
{
	int i, idx;
	bool px;

	amdgpu_device_ip_fini(adev);
	amdgpu_fence_driver_sw_fini(adev);
	amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
	adev->accel_working = false;
	dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
	for (i = 0; i < MAX_XCP; ++i) {
		dma_fence_put(adev->isolation[i].spearhead);
		amdgpu_sync_free(&adev->isolation[i].active);
		amdgpu_sync_free(&adev->isolation[i].prev);
	}

	amdgpu_reset_fini(adev);

	/* free i2c buses */
	amdgpu_i2c_fini(adev);

	if (adev->bios) {
		if (amdgpu_emu_mode != 1)
			amdgpu_atombios_fini(adev);
		amdgpu_bios_release(adev);
	}

	kfree(adev->fru_info);
	adev->fru_info = NULL;

	kfree(adev->xcp_mgr);
	adev->xcp_mgr = NULL;

	px = amdgpu_device_supports_px(adev);

	if (px || (!dev_is_removable(&adev->pdev->dev) &&
		   apple_gmux_detect(NULL, NULL)))
		vga_switcheroo_unregister_client(adev->pdev);

	if (px)
		vga_switcheroo_fini_domain_pm_ops(adev->dev);

	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
		vga_client_unregister(adev->pdev);

	if (drm_dev_enter(adev_to_drm(adev), &idx)) {

		iounmap(adev->rmmio);
		adev->rmmio = NULL;
		drm_dev_exit(idx);
	}

	if (IS_ENABLED(CONFIG_PERF_EVENTS))
		amdgpu_pmu_fini(adev);
	if (adev->discovery.bin)
		amdgpu_discovery_fini(adev);

	amdgpu_reset_put_reset_domain(adev->reset_domain);
	adev->reset_domain = NULL;

	kfree(adev->pci_state);
	kfree(adev->pcie_reset_ctx.swds_pcistate);
	kfree(adev->pcie_reset_ctx.swus_pcistate);
}

/**
 * amdgpu_device_evict_resources - evict device resources
 * @adev: amdgpu device object
 *
 * Evicts all ttm device resources(vram BOs, gart table) from the lru list
 * of the vram memory type. Mainly used for evicting device resources
 * at suspend time.
 *
 */
static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
{
	int ret;

	/* No need to evict vram on APUs unless going to S4 */
	if (!adev->in_s4 && (adev->flags & AMD_IS_APU))
		return 0;

	/* No need to evict when going to S5 through S4 callbacks */
	if (system_state == SYSTEM_POWER_OFF)
		return 0;

	ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
	if (ret) {
		dev_warn(adev->dev, "evicting device resources failed\n");
		return ret;
	}

	if (adev->in_s4) {
		ret = ttm_device_prepare_hibernation(&adev->mman.bdev);
		if (ret)
			dev_err(adev->dev, "prepare hibernation failed, %d\n", ret);
	}
	return ret;
}

/*
 * Suspend & resume.
 */
/**
 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events
 * @nb: notifier block
 * @mode: suspend mode
 * @data: data
 *
 * This function is called when the system is about to suspend or hibernate.
 * It is used to set the appropriate flags so that eviction can be optimized
 * in the pm prepare callback.
 */
static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
				     void *data)
{
	struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb);

	/* Only the hibernate transitions are tracked; other PM events fall
	 * through untouched.
	 */
	switch (mode) {
	case PM_HIBERNATION_PREPARE:
		adev->in_s4 = true;
		break;
	case PM_POST_HIBERNATION:
		adev->in_s4 = false;
		break;
	}

	return NOTIFY_DONE;
}

/**
 * amdgpu_device_prepare - prepare for device suspend
 *
 * @dev: drm dev pointer
 *
 * Prepare to put the hw in the suspend state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver suspend.
 */
int amdgpu_device_prepare(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i, r;

	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
		return 0;

	/* Evict the majority of BOs before starting suspend sequence */
	r = amdgpu_device_evict_resources(adev);
	if (r)
		return r;

	flush_delayed_work(&adev->gfx.gfx_off_delay_work);

	/* Give every IP block with a prepare_suspend hook a chance to veto */
	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
			continue;
		r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]);
		if (r)
			return r;
	}

	return 0;
}

/**
 * amdgpu_device_complete - complete power state transition
 *
 * @dev: drm dev pointer
 *
 * Undo the changes from amdgpu_device_prepare. This will be
 * called on all resume transitions, including those that failed.
 */
void amdgpu_device_complete(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (!adev->ip_blocks[i].version->funcs->complete)
			continue;
		adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]);
	}
}

/**
 * amdgpu_device_suspend - initiate device suspend
 *
 * @dev: drm dev pointer
 * @notify_clients: notify in-kernel DRM clients
 *
 * Puts the hw in the suspend state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver suspend.
5135 */ 5136 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 5137 { 5138 struct amdgpu_device *adev = drm_to_adev(dev); 5139 int r, rec; 5140 5141 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5142 return 0; 5143 5144 adev->in_suspend = true; 5145 5146 if (amdgpu_sriov_vf(adev)) { 5147 if (!adev->in_runpm) 5148 amdgpu_amdkfd_suspend_process(adev); 5149 amdgpu_virt_fini_data_exchange(adev); 5150 r = amdgpu_virt_request_full_gpu(adev, false); 5151 if (r) 5152 return r; 5153 } 5154 5155 r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3); 5156 if (r) 5157 goto unwind_sriov; 5158 5159 if (notify_clients) 5160 drm_client_dev_suspend(adev_to_drm(adev)); 5161 5162 cancel_delayed_work_sync(&adev->delayed_init_work); 5163 5164 amdgpu_ras_suspend(adev); 5165 5166 r = amdgpu_device_ip_suspend_phase1(adev); 5167 if (r) 5168 goto unwind_smartshift; 5169 5170 amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5171 r = amdgpu_userq_suspend(adev); 5172 if (r) 5173 goto unwind_ip_phase1; 5174 5175 r = amdgpu_device_evict_resources(adev); 5176 if (r) 5177 goto unwind_userq; 5178 5179 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5180 5181 amdgpu_fence_driver_hw_fini(adev); 5182 5183 r = amdgpu_device_ip_suspend_phase2(adev); 5184 if (r) 5185 goto unwind_evict; 5186 5187 if (amdgpu_sriov_vf(adev)) 5188 amdgpu_virt_release_full_gpu(adev, false); 5189 5190 return 0; 5191 5192 unwind_evict: 5193 if (adev->mman.buffer_funcs_ring->sched.ready) 5194 amdgpu_ttm_set_buffer_funcs_status(adev, true); 5195 amdgpu_fence_driver_hw_init(adev); 5196 5197 unwind_userq: 5198 rec = amdgpu_userq_resume(adev); 5199 if (rec) { 5200 dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec); 5201 return r; 5202 } 5203 rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5204 if (rec) { 5205 dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec); 5206 return r; 5207 } 5208 5209 unwind_ip_phase1: 5210 /* 
suspend phase 1 = resume phase 3 */ 5211 rec = amdgpu_device_ip_resume_phase3(adev); 5212 if (rec) { 5213 dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec); 5214 return r; 5215 } 5216 5217 unwind_smartshift: 5218 rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0); 5219 if (rec) { 5220 dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec); 5221 return r; 5222 } 5223 5224 if (notify_clients) 5225 drm_client_dev_resume(adev_to_drm(adev)); 5226 5227 amdgpu_ras_resume(adev); 5228 5229 unwind_sriov: 5230 if (amdgpu_sriov_vf(adev)) { 5231 rec = amdgpu_virt_request_full_gpu(adev, true); 5232 if (rec) { 5233 dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec); 5234 return r; 5235 } 5236 } 5237 5238 adev->in_suspend = adev->in_s0ix = adev->in_s3 = false; 5239 5240 return r; 5241 } 5242 5243 static inline int amdgpu_virt_resume(struct amdgpu_device *adev) 5244 { 5245 int r; 5246 unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id; 5247 5248 /* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO) 5249 * may not work. The access could be blocked by nBIF protection as VF isn't in 5250 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX 5251 * so that QEMU reprograms MSIX table. 
	 */
	amdgpu_restore_msix(adev);

	/* Re-read the XGMI info: after VM migration this VF may sit on a
	 * different physical node of the hive.
	 */
	r = adev->gfxhub.funcs->get_xgmi_info(adev);
	if (r)
		return r;

	dev_info(adev->dev, "xgmi node, old id %d, new id %d\n",
		 prev_physical_node_id, adev->gmc.xgmi.physical_node_id);

	/* Rebase the VM manager's VRAM offset on the new node position */
	adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev);
	adev->vm_manager.vram_base_offset +=
		adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size;

	return 0;
}

/**
 * amdgpu_device_resume - initiate device resume
 *
 * @dev: drm dev pointer
 * @notify_clients: notify in-kernel DRM clients
 *
 * Bring the hw back to operating state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver resume.
 */
int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r = 0;

	/* SR-IOV: take exclusive ("full") GPU access for the whole resume
	 * sequence; it is released at the exit label below.
	 */
	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_virt_request_full_gpu(adev, true);
		if (r)
			return r;
	}

	if (amdgpu_virt_xgmi_migrate_enabled(adev)) {
		r = amdgpu_virt_resume(adev);
		if (r)
			goto exit;
	}

	/* NOTE(review): this early return happens after full-GPU access was
	 * requested above; presumably vga_switcheroo is never active together
	 * with SR-IOV (otherwise the request would never be released) — confirm.
	 */
	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
		return 0;

	if (adev->in_s0ix)
		amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);

	/* post card */
	if (amdgpu_device_need_post(adev)) {
		r = amdgpu_device_asic_init(adev);
		if (r)
			dev_err(adev->dev, "amdgpu asic init failed\n");
	}

	r = amdgpu_device_ip_resume(adev);

	if (r) {
		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
		goto exit;
	}

	r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
	if (r)
		goto exit;

	r = amdgpu_userq_resume(adev);
	if (r)
		goto exit;

	r = amdgpu_device_ip_late_init(adev);
	if (r)
		goto exit;

	/* delayed work runs the IB ring tests after resume settles */
	queue_delayed_work(system_wq, &adev->delayed_init_work,
			   msecs_to_jiffies(AMDGPU_RESUME_MS));
exit:
	if (amdgpu_sriov_vf(adev)) {
		/* release the full GPU access taken at function entry */
		amdgpu_virt_init_data_exchange(adev);
		amdgpu_virt_release_full_gpu(adev, true);

		if (!r && !adev->in_runpm)
			r = amdgpu_amdkfd_resume_process(adev);
	}

	if (r)
		return r;

	/* Make sure IB tests flushed */
	flush_delayed_work(&adev->delayed_init_work);

	if (notify_clients)
		drm_client_dev_resume(adev_to_drm(adev));

	amdgpu_ras_resume(adev);

	if (adev->mode_info.num_crtc) {
		/*
		 * Most of the connector probing functions try to acquire runtime pm
		 * refs to ensure that the GPU is powered on when connector polling is
		 * performed. Since we're calling this from a runtime PM callback,
		 * trying to acquire rpm refs will cause us to deadlock.
		 *
		 * Since we're guaranteed to be holding the rpm lock, it's safe to
		 * temporarily disable the rpm helpers so this doesn't deadlock us.
		 */
#ifdef CONFIG_PM
		dev->dev->power.disable_depth++;
#endif
		if (!adev->dc_enabled)
			drm_helper_hpd_irq_event(dev);
		else
			drm_kms_helper_hotplug_event(dev);
#ifdef CONFIG_PM
		dev->dev->power.disable_depth--;
#endif
	}

	amdgpu_vram_mgr_clear_reset_blocks(adev);
	adev->in_suspend = false;

	if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0))
		dev_warn(adev->dev, "smart shift update failed\n");

	return 0;
}

/**
 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and
 * the check_soft_reset callbacks are run. check_soft_reset determines
 * if the asic is still hung or not.
 * Returns true if any of the IPs are still in a hung state, false if not.
5390 */ 5391 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5392 { 5393 int i; 5394 bool asic_hang = false; 5395 5396 if (amdgpu_sriov_vf(adev)) 5397 return true; 5398 5399 if (amdgpu_asic_need_full_reset(adev)) 5400 return true; 5401 5402 for (i = 0; i < adev->num_ip_blocks; i++) { 5403 if (!adev->ip_blocks[i].status.valid) 5404 continue; 5405 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5406 adev->ip_blocks[i].status.hang = 5407 adev->ip_blocks[i].version->funcs->check_soft_reset( 5408 &adev->ip_blocks[i]); 5409 if (adev->ip_blocks[i].status.hang) { 5410 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5411 asic_hang = true; 5412 } 5413 } 5414 return asic_hang; 5415 } 5416 5417 /** 5418 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5419 * 5420 * @adev: amdgpu_device pointer 5421 * 5422 * The list of all the hardware IPs that make up the asic is walked and the 5423 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5424 * handles any IP specific hardware or software state changes that are 5425 * necessary for a soft reset to succeed. 5426 * Returns 0 on success, negative error code on failure. 5427 */ 5428 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5429 { 5430 int i, r = 0; 5431 5432 for (i = 0; i < adev->num_ip_blocks; i++) { 5433 if (!adev->ip_blocks[i].status.valid) 5434 continue; 5435 if (adev->ip_blocks[i].status.hang && 5436 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5437 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5438 if (r) 5439 return r; 5440 } 5441 } 5442 5443 return 0; 5444 } 5445 5446 /** 5447 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5448 * 5449 * @adev: amdgpu_device pointer 5450 * 5451 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5452 * reset is necessary to recover. 
5453 * Returns true if a full asic reset is required, false if not. 5454 */ 5455 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5456 { 5457 int i; 5458 5459 if (amdgpu_asic_need_full_reset(adev)) 5460 return true; 5461 5462 for (i = 0; i < adev->num_ip_blocks; i++) { 5463 if (!adev->ip_blocks[i].status.valid) 5464 continue; 5465 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5466 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5467 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5468 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5469 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5470 if (adev->ip_blocks[i].status.hang) { 5471 dev_info(adev->dev, "Some block need full reset!\n"); 5472 return true; 5473 } 5474 } 5475 } 5476 return false; 5477 } 5478 5479 /** 5480 * amdgpu_device_ip_soft_reset - do a soft reset 5481 * 5482 * @adev: amdgpu_device pointer 5483 * 5484 * The list of all the hardware IPs that make up the asic is walked and the 5485 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5486 * IP specific hardware or software state changes that are necessary to soft 5487 * reset the IP. 5488 * Returns 0 on success, negative error code on failure. 
 */
static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		/* only blocks flagged hung by check_soft_reset are reset */
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->soft_reset) {
			r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
 * handles any IP specific hardware or software state changes that are
 * necessary after the IP has been soft reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->post_soft_reset)
			r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]);
		/* r is re-tested on every iteration (unbraced if above);
		 * harmless, since any non-zero r returns immediately here.
		 */
		if (r)
			return r;
	}

	return 0;
}

/**
 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
 *
 * @adev: amdgpu_device pointer
 * @reset_context: amdgpu reset context pointer
 *
 * do VF FLR and reinitialize Asic
 * return 0 means succeeded otherwise failed
 */
static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
				     struct amdgpu_reset_context *reset_context)
{
	int r;
	struct amdgpu_hive_info *hive = NULL;

	if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
		/* host-initiated FLR: signal readiness (unless a fatal error
		 * was detected), wait for the host to perform the reset, then
		 * re-acquire full GPU access.
		 */
		if (!amdgpu_ras_get_fed_status(adev))
			amdgpu_virt_ready_to_reset(adev);
		amdgpu_virt_wait_reset(adev);
		clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
		r = amdgpu_virt_request_full_gpu(adev, true);
	} else {
		/* guest-initiated reset request to the host */
		r = amdgpu_virt_reset_gpu(adev);
	}
	if (r)
		return r;

	amdgpu_ras_clear_err_state(adev);
	amdgpu_irq_gpu_reset_resume_helper(adev);

	/* some sw clean up VF needs to do before recover */
	amdgpu_virt_post_reset(adev);

	/* Resume IP prior to SMC */
	r = amdgpu_device_ip_reinit_early_sriov(adev);
	if (r)
		return r;

	amdgpu_virt_init_data_exchange(adev);

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	/* now we are okay to resume SMC/CP/SDMA */
	r = amdgpu_device_ip_reinit_late_sriov(adev);
	if (r)
		return r;

	hive = amdgpu_get_xgmi_hive(adev);
	/* Update PSP FW topology after reset */
	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
		r = amdgpu_xgmi_update_topology(hive, adev);
	if (hive)
		amdgpu_put_xgmi_hive(hive);
	if (r)
		return r;

	r = amdgpu_ib_ring_tests(adev);
	if (r)
		return r;

	if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
		amdgpu_inc_vram_lost(adev);

	/* need to be called during full access so we can't do it later like
	 * bare-metal does.
	 */
	amdgpu_amdkfd_post_reset(adev);
	amdgpu_virt_release_full_gpu(adev, true);

	/* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
		amdgpu_ras_resume(adev);

	amdgpu_virt_ras_telemetry_post_reset(adev);

	return 0;
}

/**
 * amdgpu_device_has_job_running - check if there is any unfinished job
 *
 * @adev: amdgpu_device pointer
 *
 * check if there is any job running on the device when guest driver receives
 * FLR notification from host driver. If there are still jobs running, then
 * the guest driver will not respond the FLR reset. Instead, let the job hit
 * the timeout and guest driver then issue the reset request.
 */
bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
{
	int i;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!amdgpu_ring_sched_ready(ring))
			continue;

		/* a non-zero emitted fence count means work is in flight */
		if (amdgpu_fence_count_emitted(ring))
			return true;
	}
	return false;
}

/**
 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
 *
 * @adev: amdgpu_device pointer
 *
 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
 * a hung GPU.
 */
bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
{

	/* amdgpu_gpu_recovery: 0 = disabled, -1 = auto (per-asic), else enabled */
	if (amdgpu_gpu_recovery == 0)
		goto disabled;

	/* Skip soft reset check in fatal error mode */
	if (!amdgpu_ras_is_poison_mode_supported(adev))
		return true;

	if (amdgpu_sriov_vf(adev))
		return true;

	if (amdgpu_gpu_recovery == -1) {
		/* auto mode: recovery is disabled on these older asics */
		switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
		case CHIP_VERDE:
		case CHIP_TAHITI:
		case CHIP_PITCAIRN:
		case CHIP_OLAND:
		case CHIP_HAINAN:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
		case CHIP_KAVERI:
		case CHIP_KABINI:
		case CHIP_MULLINS:
#endif
		case CHIP_CARRIZO:
		case CHIP_STONEY:
		case CHIP_CYAN_SKILLFISH:
			goto disabled;
		default:
			break;
		}
	}

	return true;

disabled:
	dev_info(adev->dev, "GPU recovery disabled.\n");
	return false;
}

/**
 * amdgpu_device_mode1_reset - perform a full-asic (mode1) reset
 *
 * @adev: amdgpu_device pointer
 *
 * Caches PCI config space, disables bus mastering, triggers a mode1 reset
 * via SMU when supported (PSP otherwise), then restores config space and
 * polls memsize until the asic comes back.
 * Returns 0 on success, negative error code on failure.
 */
int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
{
	u32 i;
	int ret = 0;

	if (adev->bios)
		amdgpu_atombios_scratch_regs_engine_hung(adev, true);

	dev_info(adev->dev, "GPU mode1 reset\n");

	/* Cache the state before bus master disable. The saved config space
	 * values are used in other cases like restore after mode-2 reset.
	 */
	amdgpu_device_cache_pci_state(adev->pdev);

	/* disable BM */
	pci_clear_master(adev->pdev);

	if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
		dev_info(adev->dev, "GPU smu mode1 reset\n");
		ret = amdgpu_dpm_mode1_reset(adev);
	} else {
		dev_info(adev->dev, "GPU psp mode1 reset\n");
		ret = psp_gpu_reset(adev);
	}

	if (ret)
		goto mode1_reset_failed;

	amdgpu_device_load_pci_state(adev->pdev);
	ret = amdgpu_psp_wait_for_bootloader(adev);
	if (ret)
		goto mode1_reset_failed;

	/* wait for asic to come out of reset; all-ones memsize means the
	 * register bus is still not responding
	 */
	for (i = 0; i < adev->usec_timeout; i++) {
		u32 memsize = adev->nbio.funcs->get_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}

	if (i >= adev->usec_timeout) {
		ret = -ETIMEDOUT;
		goto mode1_reset_failed;
	}

	if (adev->bios)
		amdgpu_atombios_scratch_regs_engine_hung(adev, false);

	return 0;

mode1_reset_failed:
	dev_err(adev->dev, "GPU mode1 reset failed\n");
	return ret;
}

/**
 * amdgpu_device_link_reset - recover the asic after a PCIe link reset
 *
 * @adev: amdgpu_device pointer
 *
 * Triggers a SMU link reset (skipped while in DPC handling, where the PCI
 * core already reset the link) and waits for the PSP bootloader.
 * Returns 0 on success, negative error code on failure.
 */
int amdgpu_device_link_reset(struct amdgpu_device *adev)
{
	int ret = 0;

	dev_info(adev->dev, "GPU link reset\n");

	if (!amdgpu_reset_in_dpc(adev))
		ret = amdgpu_dpm_link_reset(adev);

	if (ret)
		goto link_reset_failed;

	ret = amdgpu_psp_wait_for_bootloader(adev);
	if (ret)
		goto link_reset_failed;

	return 0;

link_reset_failed:
	dev_err(adev->dev, "GPU link reset failed\n");
	return ret;
}

/**
 * amdgpu_device_pre_asic_reset - prepare one device for asic reset
 *
 * @adev: amdgpu_device pointer
 * @reset_context: amdgpu reset context pointer
 *
 * Force-completes fences, tries a soft reset where possible and suspends
 * the IP blocks when a full reset is required.
 * Returns 0 on success, negative error code on failure.
 */
int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
				 struct amdgpu_reset_context *reset_context)
{
	int i, r = 0;
	struct amdgpu_job *job = NULL;
	struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
	bool need_full_reset =
		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

	/* only the device that triggered the reset carries the guilty job */
	if (reset_context->reset_req_dev == adev)
		job = reset_context->job;

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_pre_reset(adev);

	amdgpu_fence_driver_isr_toggle(adev, true);

	/* block all schedulers and reset given job's ring */
	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!amdgpu_ring_sched_ready(ring))
			continue;

		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
		amdgpu_fence_driver_force_completion(ring);
	}

	amdgpu_fence_driver_isr_toggle(adev, false);

	/* penalize the guilty job's scheduler entity */
	if (job && job->vm)
		drm_sched_increase_karma(&job->base);

	r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
	/* If reset handler not implemented, continue; otherwise return */
	if (r == -EOPNOTSUPP)
		r = 0;
	else
		return r;

	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
	if (!amdgpu_sriov_vf(adev)) {

		if (!need_full_reset)
			need_full_reset = amdgpu_device_ip_need_full_reset(adev);

		/* try the cheaper soft reset first; fall back to a full
		 * reset if it fails or blocks stay hung afterwards
		 */
		if (!need_full_reset && amdgpu_gpu_recovery &&
		    amdgpu_device_ip_check_soft_reset(adev)) {
			amdgpu_device_ip_pre_soft_reset(adev);
			r = amdgpu_device_ip_soft_reset(adev);
			amdgpu_device_ip_post_soft_reset(adev);
			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
				need_full_reset = true;
			}
		}

		if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
			dev_info(tmp_adev->dev, "Dumping IP State\n");
			/* Trigger ip dump before we reset the asic */
			for (i = 0; i < tmp_adev->num_ip_blocks; i++)
				if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
					tmp_adev->ip_blocks[i].version->funcs
						->dump_ip_state((void *)&tmp_adev->ip_blocks[i]);
			dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
		}

		if (need_full_reset)
			r = amdgpu_device_ip_suspend(adev);
		if (need_full_reset)
			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
		else
			clear_bit(AMDGPU_NEED_FULL_RESET,
				  &reset_context->flags);
	}

	return r;
}

/**
 * amdgpu_device_reinit_after_reset - bring devices back up after an asic reset
 *
 * @reset_context: amdgpu reset context pointer (carries the device list)
 *
 * Re-posts each device, resumes the IP blocks in phases, restores XGMI
 * topology and re-runs IB ring tests.
 * Returns 0 on success, negative error code on failure.
 */
int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
{
	struct list_head *device_list_handle;
	bool full_reset, vram_lost = false;
	struct amdgpu_device *tmp_adev;
	int r, init_level;

	device_list_handle = reset_context->reset_device_list;

	if (!device_list_handle)
		return -EINVAL;

	full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

	/**
	 * If it's reset on init, it's default init level, otherwise keep level
	 * as recovery level.
	 */
	if (reset_context->method == AMD_RESET_METHOD_ON_INIT)
		init_level = AMDGPU_INIT_LEVEL_DEFAULT;
	else
		init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY;

	r = 0;
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		amdgpu_set_init_level(tmp_adev, init_level);
		if (full_reset) {
			/* post card */
			amdgpu_reset_set_dpc_status(tmp_adev, false);
			amdgpu_ras_clear_err_state(tmp_adev);
			r = amdgpu_device_asic_init(tmp_adev);
			if (r) {
				dev_warn(tmp_adev->dev, "asic atom init failed!");
			} else {
				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");

				r = amdgpu_device_ip_resume_phase1(tmp_adev);
				if (r)
					goto out;

				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

				if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
					amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);

				if (vram_lost) {
					dev_info(
						tmp_adev->dev,
						"VRAM is lost due to GPU reset!\n");
					amdgpu_inc_vram_lost(tmp_adev);
				}

				r = amdgpu_device_fw_loading(tmp_adev);
				/* NOTE(review): returns directly instead of
				 * 'goto out' like the surrounding steps, so
				 * the per-device asic_reset_res bookkeeping
				 * at 'out:' is skipped — confirm intentional.
				 */
				if (r)
					return r;

				r =
					amdgpu_xcp_restore_partition_mode(
						tmp_adev->xcp_mgr);
				if (r)
					goto out;

				r = amdgpu_device_ip_resume_phase2(tmp_adev);
				if (r)
					goto out;

				if (tmp_adev->mman.buffer_funcs_ring->sched.ready)
					amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);

				r = amdgpu_device_ip_resume_phase3(tmp_adev);
				if (r)
					goto out;

				if (vram_lost)
					amdgpu_device_fill_reset_magic(tmp_adev);

				/*
				 * Add this ASIC as tracked as reset was already
				 * complete successfully.
				 */
				amdgpu_register_gpu_instance(tmp_adev);

				if (!reset_context->hive &&
				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					amdgpu_xgmi_add_device(tmp_adev);

				r = amdgpu_device_ip_late_init(tmp_adev);
				if (r)
					goto out;

				r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
				if (r)
					goto out;

				drm_client_dev_resume(adev_to_drm(tmp_adev));

				/*
				 * The GPU enters bad state once faulty pages
				 * by ECC has reached the threshold, and ras
				 * recovery is scheduled next. So add one check
				 * here to break recovery if it indeed exceeds
				 * bad page threshold, and remind user to
				 * retire this GPU or setting one bigger
				 * bad_page_threshold value to fix this once
				 * probing driver again.
				 */
				if (!amdgpu_ras_is_rma(tmp_adev)) {
					/* must succeed.
					 */
					amdgpu_ras_resume(tmp_adev);
				} else {
					r = -EINVAL;
					goto out;
				}

				/* Update PSP FW topology after reset */
				if (reset_context->hive &&
				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					r = amdgpu_xgmi_update_topology(
						reset_context->hive, tmp_adev);
			}
		}

out:
		if (!r) {
			/* IP init is complete now, set level as default */
			amdgpu_set_init_level(tmp_adev,
					      AMDGPU_INIT_LEVEL_DEFAULT);
			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
			r = amdgpu_ib_ring_tests(tmp_adev);
			if (r) {
				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
				r = -EAGAIN;
				goto end;
			}
		}

		if (r)
			tmp_adev->asic_reset_res = r;
	}

end:
	return r;
}

/**
 * amdgpu_do_asic_reset - perform the asic reset on all listed devices
 *
 * @device_list_handle: list of devices to reset (whole hive for XGMI)
 * @reset_context: amdgpu reset context pointer
 *
 * Tries the asic-specific reset handler first; otherwise performs the
 * default full/soft reset flow, running XGMI hive resets in parallel,
 * then re-initializes the devices.
 * Returns 0 on success, negative error code on failure.
 */
int amdgpu_do_asic_reset(struct list_head *device_list_handle,
			 struct amdgpu_reset_context *reset_context)
{
	struct amdgpu_device *tmp_adev = NULL;
	bool need_full_reset, skip_hw_reset;
	int r = 0;

	/* Try reset handler method first */
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);

	reset_context->reset_device_list = device_list_handle;
	r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
	/* If reset handler not implemented, continue; otherwise return */
	if (r == -EOPNOTSUPP)
		r = 0;
	else
		return r;

	/* Reset handler not implemented, use the default method */
	need_full_reset =
		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);

	/*
	 * ASIC reset has to be done on all XGMI hive nodes ASAP
	 * to allow proper links negotiation in FW (within 1 sec)
	 */
	if (!skip_hw_reset && need_full_reset) {
		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
			/* For XGMI run all resets in parallel to speed up the process */
			if
			   (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
				/* -EALREADY: the reset work was already queued */
				if (!queue_work(system_unbound_wq,
						&tmp_adev->xgmi_reset_work))
					r = -EALREADY;
			} else
				r = amdgpu_asic_reset(tmp_adev);

			if (r) {
				dev_err(tmp_adev->dev,
					"ASIC reset failed with error, %d for drm dev, %s",
					r, adev_to_drm(tmp_adev)->unique);
				goto out;
			}
		}

		/* For XGMI wait for all resets to complete before proceed */
		if (!r) {
			list_for_each_entry(tmp_adev, device_list_handle,
					    reset_list) {
				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
					flush_work(&tmp_adev->xgmi_reset_work);
					r = tmp_adev->asic_reset_res;
					if (r)
						break;
				}
			}
		}
	}

	if (!r && amdgpu_ras_intr_triggered()) {
		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
			amdgpu_ras_reset_error_count(tmp_adev,
						     AMDGPU_RAS_BLOCK__MMHUB);
		}

		amdgpu_ras_intr_cleared();
	}

	r = amdgpu_device_reinit_after_reset(reset_context);
	if (r == -EAGAIN)
		set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	else
		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

out:
	return r;
}

/* Map the chosen reset method to the MP1 (SMU) state it requires. */
static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
{

	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_MODE1:
	case AMD_RESET_METHOD_LINK:
		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
		break;
	case AMD_RESET_METHOD_MODE2:
		adev->mp1_state = PP_MP1_STATE_RESET;
		break;
	default:
		adev->mp1_state = PP_MP1_STATE_NONE;
		break;
	}
}

/* Restore MP1 state and flush accumulated VF errors after recovery. */
static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
{
	amdgpu_vf_error_trans_all(adev);
	adev->mp1_state = PP_MP1_STATE_NONE;
}

/* Re-enable runtime PM on the HDMI/DP audio function (PCI function 1)
 * that was suspended before the reset.
 */
static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
{
	struct pci_dev *p = NULL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (p) {
		pm_runtime_enable(&(p->dev));
		pm_runtime_resume(&(p->dev));
	}

	pci_dev_put(p);
}

/* Suspend the HDMI/DP audio function before a BACO/mode1 reset so the
 * audio driver is not surprised by the power cycle.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
{
	enum amd_reset_method reset_method;
	struct pci_dev *p = NULL;
	u64 expires;

	/*
	 * For now, only BACO and mode1 reset are confirmed
	 * to suffer the audio issue without proper suspended.
	 */
	reset_method = amdgpu_asic_reset_method(adev);
	if ((reset_method != AMD_RESET_METHOD_BACO) &&
	    (reset_method != AMD_RESET_METHOD_MODE1))
		return -EINVAL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
					adev->pdev->bus->number, 1);
	if (!p)
		return -ENODEV;

	expires = pm_runtime_autosuspend_expiration(&(p->dev));
	if (!expires)
		/*
		 * If we cannot get the audio device autosuspend delay,
		 * a fixed 4S interval will be used. Considering 3S is
		 * the audio controller default autosuspend delay setting.
		 * 4S used here is guaranteed to cover that.
		 */
		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;

	while (!pm_runtime_status_suspended(&(p->dev))) {
		if (!pm_runtime_suspend(&(p->dev)))
			break;

		if (expires < ktime_get_mono_fast_ns()) {
			dev_warn(adev->dev, "failed to suspend display audio\n");
			pci_dev_put(p);
			/* TODO: abort the succeeding gpu reset?
			 */
			return -ETIMEDOUT;
		}
	}

	pm_runtime_disable(&(p->dev));

	pci_dev_put(p);
	return 0;
}

/* Cancel resets queued outside the scheduler so they don't race with the
 * recovery that is already in progress.
 */
static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

#if defined(CONFIG_DEBUG_FS)
	if (!amdgpu_sriov_vf(adev))
		cancel_work(&adev->reset_work);
#endif
	cancel_work(&adev->userq_reset_work);

	if (adev->kfd.dev)
		cancel_work(&adev->kfd.reset_work);

	if (amdgpu_sriov_vf(adev))
		cancel_work(&adev->virt.flr_work);

	if (con && adev->ras_enabled)
		cancel_work(&con->recovery_work);

}

/* Check the bus status of every device on the list.  Error codes are
 * OR-ed together, so a non-zero result means at least one device failed.
 */
static int amdgpu_device_health_check(struct list_head *device_list_handle)
{
	struct amdgpu_device *tmp_adev;
	int ret = 0;

	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		ret |= amdgpu_device_bus_status_check(tmp_adev);
	}

	return ret;
}

static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
					  struct list_head *device_list,
					  struct amdgpu_hive_info *hive)
{
	struct amdgpu_device *tmp_adev = NULL;

	/*
	 * Build list of devices to reset.
	 * In case we are in XGMI hive mode, resort the device list
	 * to put adev in the 1st position.
6264 * 6265 * Due to the power domain of the graphics device 6266 * is shared with AZ power domain. Without this, 6267 * we may change the audio hardware from behind 6268 * the audio driver's back. That will trigger 6269 * some audio codec errors. 6270 */ 6271 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6272 tmp_adev->pcie_reset_ctx.audio_suspended = true; 6273 6274 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6275 6276 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6277 6278 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6279 6280 /* 6281 * Mark these ASICs to be reset as untracked first 6282 * And add them back after reset completed 6283 */ 6284 amdgpu_unregister_gpu_instance(tmp_adev); 6285 6286 drm_client_dev_suspend(adev_to_drm(tmp_adev)); 6287 6288 /* disable ras on ALL IPs */ 6289 if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) && 6290 amdgpu_device_ip_need_full_reset(tmp_adev)) 6291 amdgpu_ras_suspend(tmp_adev); 6292 6293 amdgpu_userq_pre_reset(tmp_adev); 6294 6295 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6296 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6297 6298 if (!amdgpu_ring_sched_ready(ring)) 6299 continue; 6300 6301 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 6302 6303 if (need_emergency_restart) 6304 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6305 } 6306 atomic_inc(&tmp_adev->gpu_reset_counter); 6307 } 6308 } 6309 6310 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6311 struct list_head *device_list, 6312 struct amdgpu_reset_context *reset_context) 6313 { 6314 struct amdgpu_device *tmp_adev = NULL; 6315 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6316 int r = 0; 6317 6318 retry: /* Rest of adevs pre asic reset from XGMI hive. 
*/ 6319 list_for_each_entry(tmp_adev, device_list, reset_list) { 6320 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6321 /*TODO Should we stop ?*/ 6322 if (r) { 6323 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6324 r, adev_to_drm(tmp_adev)->unique); 6325 tmp_adev->asic_reset_res = r; 6326 } 6327 } 6328 6329 /* Actual ASIC resets if needed.*/ 6330 /* Host driver will handle XGMI hive reset for SRIOV */ 6331 if (amdgpu_sriov_vf(adev)) { 6332 6333 /* Bail out of reset early */ 6334 if (amdgpu_ras_is_rma(adev)) 6335 return -ENODEV; 6336 6337 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6338 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6339 amdgpu_ras_set_fed(adev, true); 6340 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6341 } 6342 6343 r = amdgpu_device_reset_sriov(adev, reset_context); 6344 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6345 amdgpu_virt_release_full_gpu(adev, true); 6346 goto retry; 6347 } 6348 if (r) 6349 adev->asic_reset_res = r; 6350 } else { 6351 r = amdgpu_do_asic_reset(device_list, reset_context); 6352 if (r && r == -EAGAIN) 6353 goto retry; 6354 } 6355 6356 list_for_each_entry(tmp_adev, device_list, reset_list) { 6357 /* 6358 * Drop any pending non scheduler resets queued before reset is done. 6359 * Any reset scheduled after this point would be valid. Scheduler resets 6360 * were already dropped during drm_sched_stop and no new ones can come 6361 * in before drm_sched_start. 
6362 */ 6363 amdgpu_device_stop_pending_resets(tmp_adev); 6364 } 6365 6366 return r; 6367 } 6368 6369 static int amdgpu_device_sched_resume(struct list_head *device_list, 6370 struct amdgpu_reset_context *reset_context, 6371 bool job_signaled) 6372 { 6373 struct amdgpu_device *tmp_adev = NULL; 6374 int i, r = 0; 6375 6376 /* Post ASIC reset for all devs .*/ 6377 list_for_each_entry(tmp_adev, device_list, reset_list) { 6378 6379 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6380 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6381 6382 if (!amdgpu_ring_sched_ready(ring)) 6383 continue; 6384 6385 drm_sched_start(&ring->sched, 0); 6386 } 6387 6388 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6389 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6390 6391 if (tmp_adev->asic_reset_res) { 6392 /* bad news, how to tell it to userspace ? 6393 * for ras error, we should report GPU bad status instead of 6394 * reset failure 6395 */ 6396 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6397 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6398 dev_info( 6399 tmp_adev->dev, 6400 "GPU reset(%d) failed with error %d\n", 6401 atomic_read( 6402 &tmp_adev->gpu_reset_counter), 6403 tmp_adev->asic_reset_res); 6404 amdgpu_vf_error_put(tmp_adev, 6405 AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, 6406 tmp_adev->asic_reset_res); 6407 if (!r) 6408 r = tmp_adev->asic_reset_res; 6409 tmp_adev->asic_reset_res = 0; 6410 } else { 6411 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", 6412 atomic_read(&tmp_adev->gpu_reset_counter)); 6413 if (amdgpu_acpi_smart_shift_update(tmp_adev, 6414 AMDGPU_SS_DEV_D0)) 6415 dev_warn(tmp_adev->dev, 6416 "smart shift update failed\n"); 6417 } 6418 } 6419 6420 return r; 6421 } 6422 6423 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, 6424 struct list_head *device_list, 6425 bool need_emergency_restart) 6426 { 6427 struct amdgpu_device *tmp_adev = NULL; 6428 6429 list_for_each_entry(tmp_adev, device_list, reset_list) { 6430 
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/* kfd_post_reset will do nothing if kfd device is not initialized,
		 * need to bring up kfd here if it's not be initialized before
		 *
		 * NOTE(review): this checks and initializes "adev" (the reset
		 * leader) on every iteration of the tmp_adev loop — looks like
		 * it was meant to operate on tmp_adev; confirm against the
		 * multi-device (XGMI hive) recovery path.
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (tmp_adev->pcie_reset_ctx.audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);

		amdgpu_device_unset_mp1_state(tmp_adev);

		amdgpu_ras_set_error_query_ready(tmp_adev, true);

	}
}


/**
 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
 * @adev: amdgpu_device pointer
 * @job: which job trigger hang
 * @reset_context: amdgpu reset context pointer
 *
 * Attempt to reset the GPU if it has hung (all asics).
 * Attempt to do soft-reset or full-reset and reinitialize Asic
 * Returns 0 for success or an error on failure.
 */

int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			      struct amdgpu_job *job,
			      struct amdgpu_reset_context *reset_context)
{
	struct list_head device_list;
	bool job_signaled = false;
	struct amdgpu_hive_info *hive = NULL;
	int r = 0;
	bool need_emergency_restart = false;
	/* save the pasid here as the job may be freed before the end of the reset */
	int pasid = job ? job->pasid : -EINVAL;

	/*
	 * If it reaches here because of hang/timeout and a RAS error is
	 * detected at the same time, let RAS recovery take care of it.
	 */
	if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) &&
	    !amdgpu_sriov_vf(adev) &&
	    reset_context->src != AMDGPU_RESET_SRC_RAS) {
		dev_dbg(adev->dev,
			"Gpu recovery from source: %d yielding to RAS error recovery handling",
			reset_context->src);
		return 0;
	}

	/*
	 * Special case: RAS triggered and full reset isn't supported
	 */
	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);

	/*
	 * Flush RAM to disk so that after reboot
	 * the user can read log and see why the system rebooted.
	 */
	if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
	    amdgpu_ras_get_context(adev)->reboot) {
		dev_warn(adev->dev, "Emergency reboot.");

		ksys_sync_helper();
		emergency_restart();
	}

	dev_info(adev->dev, "GPU %s begin!. Source: %d\n",
		 need_emergency_restart ? "jobs stop" : "reset",
		 reset_context->src);

	/* hive lock serializes recovery across all devices in an XGMI hive */
	if (!amdgpu_sriov_vf(adev))
		hive = amdgpu_get_xgmi_hive(adev);
	if (hive)
		mutex_lock(&hive->hive_lock);

	reset_context->job = job;
	reset_context->hive = hive;
	INIT_LIST_HEAD(&device_list);

	amdgpu_device_recovery_prepare(adev, &device_list, hive);

	if (!amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_health_check(&device_list);
		if (r)
			goto end_reset;
	}

	/* Cannot be called after locking reset domain */
	amdgpu_ras_pre_reset(adev, &device_list);

	/* We need to lock reset domain only once both for XGMI and single device */
	amdgpu_device_recovery_get_reset_lock(adev, &device_list);

	amdgpu_device_halt_activities(adev, job, reset_context, &device_list,
				      hive, need_emergency_restart);
	if (need_emergency_restart)
		goto skip_sched_resume;
	/*
	 * Must check guilty signal here since after this point all old
	 * HW fences are force signaled.
	 *
	 * job->base holds a reference to parent fence
	 */
	if (job && dma_fence_is_signaled(&job->hw_fence->base)) {
		job_signaled = true;
		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
		goto skip_hw_reset;
	}

	r = amdgpu_device_asic_reset(adev, &device_list, reset_context);
	if (r)
		goto reset_unlock;
skip_hw_reset:
	r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled);
	if (r)
		goto reset_unlock;
skip_sched_resume:
	amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart);
reset_unlock:
	amdgpu_device_recovery_put_reset_lock(adev, &device_list);
	amdgpu_ras_post_reset(adev, &device_list);
end_reset:
	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);

	atomic_set(&adev->reset_domain->reset_res, r);

	if (!r) {
		struct amdgpu_task_info *ti = NULL;

		/*
		 * The job may already be freed at this point via the sched tdr workqueue so
		 * use the cached pasid.
		 */
		if (pasid >= 0)
			ti = amdgpu_vm_get_task_info_pasid(adev, pasid);

		/* notify userspace that recovery succeeded with no action needed */
		drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE,
				     ti ? &ti->task : NULL);

		amdgpu_vm_put_task_info(ti);
	}

	return r;
}

/**
 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * first physical partner to an AMD dGPU.
 * This will exclude any virtual switches and links.
6600 */ 6601 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 6602 enum pci_bus_speed *speed, 6603 enum pcie_link_width *width) 6604 { 6605 struct pci_dev *parent = adev->pdev; 6606 6607 if (!speed || !width) 6608 return; 6609 6610 *speed = PCI_SPEED_UNKNOWN; 6611 *width = PCIE_LNK_WIDTH_UNKNOWN; 6612 6613 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 6614 while ((parent = pci_upstream_bridge(parent))) { 6615 /* skip upstream/downstream switches internal to dGPU*/ 6616 if (parent->vendor == PCI_VENDOR_ID_ATI) 6617 continue; 6618 *speed = pcie_get_speed_cap(parent); 6619 *width = pcie_get_width_cap(parent); 6620 break; 6621 } 6622 } else { 6623 /* use the current speeds rather than max if switching is not supported */ 6624 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 6625 } 6626 } 6627 6628 /** 6629 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU 6630 * 6631 * @adev: amdgpu_device pointer 6632 * @speed: pointer to the speed of the link 6633 * @width: pointer to the width of the link 6634 * 6635 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6636 * AMD dGPU which may be a virtual upstream bridge. 
6637 */ 6638 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev, 6639 enum pci_bus_speed *speed, 6640 enum pcie_link_width *width) 6641 { 6642 struct pci_dev *parent = adev->pdev; 6643 6644 if (!speed || !width) 6645 return; 6646 6647 parent = pci_upstream_bridge(parent); 6648 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) { 6649 /* use the upstream/downstream switches internal to dGPU */ 6650 *speed = pcie_get_speed_cap(parent); 6651 *width = pcie_get_width_cap(parent); 6652 while ((parent = pci_upstream_bridge(parent))) { 6653 if (parent->vendor == PCI_VENDOR_ID_ATI) { 6654 /* use the upstream/downstream switches internal to dGPU */ 6655 *speed = pcie_get_speed_cap(parent); 6656 *width = pcie_get_width_cap(parent); 6657 } 6658 } 6659 } else { 6660 /* use the device itself */ 6661 *speed = pcie_get_speed_cap(adev->pdev); 6662 *width = pcie_get_width_cap(adev->pdev); 6663 } 6664 } 6665 6666 /** 6667 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 6668 * 6669 * @adev: amdgpu_device pointer 6670 * 6671 * Fetches and stores in the driver the PCIE capabilities (gen speed 6672 * and lanes) of the slot the device is in. Handles APUs and 6673 * virtualized environments where PCIE config space may not be available. 
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width, link_width;

	/* module parameter overrides take precedence over probing */
	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	/* nothing left to probe if both masks are already set */
	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
					&platform_link_width);
	amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;

		}
	}
	if (adev->pm.pcie_mlw_mask == 0) {
		/* asic caps */
		if (link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK;
		} else {
			switch (link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
		/* platform caps */
		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
		} else {
			switch (platform_link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
	}
}

/**
 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
 *
 * @adev: amdgpu_device pointer
 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
 *
 * Return true if @peer_adev can access (DMA) @adev through the PCIe
 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
 * @peer_adev.
 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
				      struct amdgpu_device *peer_adev)
{
#ifdef CONFIG_HSA_AMD_P2P
	/* chipset-level P2P DMA must be possible and not routed via the CPU */
	bool p2p_access =
		!adev->gmc.xgmi.connected_to_cpu &&
		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
	if (!p2p_access)
		dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n",
			 pci_name(peer_adev->pdev));

	/* "large BAR": all of VRAM is CPU/peer visible */
	bool is_large_bar = adev->gmc.visible_vram_size &&
			    adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
	bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev);

	if (!p2p_addressable) {
		/* no IOMMU remapping: the whole aperture must fall inside the
		 * peer's DMA mask (default to a 32-bit mask if none is set) */
		uint64_t address_mask = peer_adev->dev->dma_mask ?
			~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
		resource_size_t aper_limit =
			adev->gmc.aper_base + adev->gmc.aper_size - 1;

		p2p_addressable = !(adev->gmc.aper_base & address_mask ||
				    aper_limit & address_mask);
	}
	return pcie_p2p && is_large_bar && p2p_access && p2p_addressable;
#else
	return false;
#endif
}

/* Enter BACO (Bus Active, Chip Off); doorbell interrupts are masked first
 * when RAS is enabled so no doorbell traffic arrives while the chip is off. */
int amdgpu_device_baco_enter(struct amdgpu_device *adev)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(adev))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

/* Leave BACO and restore doorbell interrupt delivery. */
int amdgpu_device_baco_exit(struct amdgpu_device *adev)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(adev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if
	    (amdgpu_passthrough(adev) && adev->nbio.funcs &&
	     adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	/* __free(xgmi_put_hive): reference dropped automatically on return */
	struct amdgpu_hive_info *hive __free(xgmi_put_hive) =
		amdgpu_get_xgmi_hive(adev);
	struct amdgpu_reset_context reset_context;
	struct list_head device_list;

	dev_info(adev->dev, "PCI error: detected callback!!\n");

	/* remembered so amdgpu_pci_resume() only acts on the frozen case */
	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state);
		return PCI_ERS_RESULT_CAN_RECOVER;
	case pci_channel_io_frozen:
		/* Fatal error, prepare for slot reset */
		dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state);
		if (hive) {
			/* Hive devices should be able to support FW based
			 * link reset on other devices, if not return.
			 */
			if (!amdgpu_dpm_is_link_reset_supported(adev)) {
				dev_warn(adev->dev,
					 "No support for XGMI hive yet...\n");
				return PCI_ERS_RESULT_DISCONNECT;
			}
			/* Set dpc status only if device is part of hive
			 * Non-hive devices should be able to recover after
			 * link reset.
			 */
			amdgpu_reset_set_dpc_status(adev, true);

			mutex_lock(&hive->hive_lock);
		}
		memset(&reset_context, 0, sizeof(reset_context));
		INIT_LIST_HEAD(&device_list);

		/* park schedulers and take the reset lock; released again in
		 * the slot-reset/resume callbacks */
		amdgpu_device_recovery_prepare(adev, &device_list, hive);
		amdgpu_device_recovery_get_reset_lock(adev, &device_list);
		amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list,
					      hive, false);
		if (hive)
			mutex_unlock(&hive->hive_lock);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state);
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);

	dev_info(adev->dev, "PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
7034 */ 7035 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 7036 { 7037 struct drm_device *dev = pci_get_drvdata(pdev); 7038 struct amdgpu_device *adev = drm_to_adev(dev); 7039 struct amdgpu_reset_context reset_context; 7040 struct amdgpu_device *tmp_adev; 7041 struct amdgpu_hive_info *hive; 7042 struct list_head device_list; 7043 struct pci_dev *link_dev; 7044 int r = 0, i, timeout; 7045 u32 memsize; 7046 u16 status; 7047 7048 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 7049 7050 memset(&reset_context, 0, sizeof(reset_context)); 7051 7052 if (adev->pcie_reset_ctx.swus) 7053 link_dev = adev->pcie_reset_ctx.swus; 7054 else 7055 link_dev = adev->pdev; 7056 /* wait for asic to come out of reset, timeout = 10s */ 7057 timeout = 10000; 7058 do { 7059 usleep_range(10000, 10500); 7060 r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status); 7061 timeout -= 10; 7062 } while (timeout > 0 && (status != PCI_VENDOR_ID_ATI) && 7063 (status != PCI_VENDOR_ID_AMD)); 7064 7065 if ((status != PCI_VENDOR_ID_ATI) && (status != PCI_VENDOR_ID_AMD)) { 7066 r = -ETIME; 7067 goto out; 7068 } 7069 7070 amdgpu_device_load_switch_state(adev); 7071 /* Restore PCI confspace */ 7072 amdgpu_device_load_pci_state(pdev); 7073 7074 /* confirm ASIC came out of reset */ 7075 for (i = 0; i < adev->usec_timeout; i++) { 7076 memsize = amdgpu_asic_get_config_memsize(adev); 7077 7078 if (memsize != 0xffffffff) 7079 break; 7080 udelay(1); 7081 } 7082 if (memsize == 0xffffffff) { 7083 r = -ETIME; 7084 goto out; 7085 } 7086 7087 reset_context.method = AMD_RESET_METHOD_NONE; 7088 reset_context.reset_req_dev = adev; 7089 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 7090 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 7091 INIT_LIST_HEAD(&device_list); 7092 7093 hive = amdgpu_get_xgmi_hive(adev); 7094 if (hive) { 7095 mutex_lock(&hive->hive_lock); 7096 reset_context.hive = hive; 7097 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7098 
tmp_adev->pcie_reset_ctx.in_link_reset = true; 7099 list_add_tail(&tmp_adev->reset_list, &device_list); 7100 } 7101 } else { 7102 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 7103 list_add_tail(&adev->reset_list, &device_list); 7104 } 7105 7106 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 7107 out: 7108 if (!r) { 7109 if (amdgpu_device_cache_pci_state(adev->pdev)) 7110 pci_restore_state(adev->pdev); 7111 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 7112 } else { 7113 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 7114 if (hive) { 7115 list_for_each_entry(tmp_adev, &device_list, reset_list) 7116 amdgpu_device_unset_mp1_state(tmp_adev); 7117 } 7118 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7119 } 7120 7121 if (hive) { 7122 mutex_unlock(&hive->hive_lock); 7123 amdgpu_put_xgmi_hive(hive); 7124 } 7125 7126 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 7127 } 7128 7129 /** 7130 * amdgpu_pci_resume() - resume normal ops after PCI reset 7131 * @pdev: pointer to PCI device 7132 * 7133 * Called when the error recovery driver tells us that its 7134 * OK to resume normal operation. 
7135 */ 7136 void amdgpu_pci_resume(struct pci_dev *pdev) 7137 { 7138 struct drm_device *dev = pci_get_drvdata(pdev); 7139 struct amdgpu_device *adev = drm_to_adev(dev); 7140 struct list_head device_list; 7141 struct amdgpu_hive_info *hive = NULL; 7142 struct amdgpu_device *tmp_adev = NULL; 7143 7144 dev_info(adev->dev, "PCI error: resume callback!!\n"); 7145 7146 /* Only continue execution for the case of pci_channel_io_frozen */ 7147 if (adev->pci_channel_state != pci_channel_io_frozen) 7148 return; 7149 7150 INIT_LIST_HEAD(&device_list); 7151 7152 hive = amdgpu_get_xgmi_hive(adev); 7153 if (hive) { 7154 mutex_lock(&hive->hive_lock); 7155 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7156 tmp_adev->pcie_reset_ctx.in_link_reset = false; 7157 list_add_tail(&tmp_adev->reset_list, &device_list); 7158 } 7159 } else 7160 list_add_tail(&adev->reset_list, &device_list); 7161 7162 amdgpu_device_sched_resume(&device_list, NULL, NULL); 7163 amdgpu_device_gpu_resume(adev, &device_list, false); 7164 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7165 7166 if (hive) { 7167 mutex_unlock(&hive->hive_lock); 7168 amdgpu_put_xgmi_hive(hive); 7169 } 7170 } 7171 7172 static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev) 7173 { 7174 struct pci_dev *swus, *swds; 7175 int r; 7176 7177 swds = pci_upstream_bridge(adev->pdev); 7178 if (!swds || swds->vendor != PCI_VENDOR_ID_ATI || 7179 pci_pcie_type(swds) != PCI_EXP_TYPE_DOWNSTREAM) 7180 return; 7181 swus = pci_upstream_bridge(swds); 7182 if (!swus || 7183 (swus->vendor != PCI_VENDOR_ID_ATI && 7184 swus->vendor != PCI_VENDOR_ID_AMD) || 7185 pci_pcie_type(swus) != PCI_EXP_TYPE_UPSTREAM) 7186 return; 7187 7188 /* If already saved, return */ 7189 if (adev->pcie_reset_ctx.swus) 7190 return; 7191 /* Upstream bridge is ATI, assume it's SWUS/DS architecture */ 7192 r = pci_save_state(swds); 7193 if (r) 7194 return; 7195 adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(swds); 7196 7197 
	r = pci_save_state(swus);
	if (r)
		return;
	adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(swus);

	/* non-NULL swus also serves as the "state cached" flag */
	adev->pcie_reset_ctx.swus = swus;
}

/* Restore the previously cached SWUS/SWDS bridge config space (see
 * amdgpu_device_cache_switch_state()); silently no-op if nothing cached. */
static void amdgpu_device_load_switch_state(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	int r;

	if (!adev->pcie_reset_ctx.swds_pcistate ||
	    !adev->pcie_reset_ctx.swus_pcistate)
		return;

	/* upstream port first, then the downstream port below it */
	pdev = adev->pcie_reset_ctx.swus;
	r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate);
	if (!r) {
		pci_restore_state(pdev);
	} else {
		dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r);
		return;
	}

	pdev = pci_upstream_bridge(adev->pdev);
	r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate);
	if (!r)
		pci_restore_state(pdev);
	else
		dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r);
}

/* Snapshot the GPU's PCI config space (and the internal switch, if any) so
 * it can be restored after a reset.  Returns true on success. */
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (amdgpu_sriov_vf(adev))
		return false;

	r = pci_save_state(pdev);
	if (!r) {
		/* free any previously cached snapshot before replacing it */
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			dev_err(adev->dev, "Failed to store PCI saved state");
			return false;
		}
	} else {
		dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r);
		return false;
	}

	amdgpu_device_cache_switch_state(adev);

	return true;
}

/* Restore the snapshot taken by amdgpu_device_cache_pci_state().
 * Returns true on success, false if nothing cached or the load failed. */
bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

/* Flush the HDP (Host Data Path) write cache so CPU writes become visible
 * to the GPU.  Prefers a ring-emitted flush, then a KIQ flush under SRIOV
 * runtime, and finally a direct MMIO flush. */
void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	/* APUs (not passthrough) share memory with the CPU; no HDP involved */
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush) {
		amdgpu_ring_emit_hdp_flush(ring);
		return;
	}

	if (!ring && amdgpu_sriov_runtime(adev)) {
		if (!amdgpu_kiq_hdp_flush(adev))
			return;
	}

	amdgpu_hdp_flush(adev, ring);
}

/* Invalidate the HDP read cache so the CPU sees fresh GPU writes; same
 * APU/XGMI short-circuits as amdgpu_device_flush_hdp(). */
void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_hdp_invalidate(adev, ring);
}

/* Non-zero while a GPU reset is in progress on this device's reset domain. */
int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It will help to maintain error context when error occurred.
 * Compare to a simple hang, the system will keep stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as following:
 *
 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc),
 * clears all CPU mappings to device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4.
 *    set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

/* Read a PCIe port register through the NBIO index/data pair; the dummy
 * RREG32 after writing the index posts the write before reading data. */
u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

/* Write a PCIe port register through the NBIO index/data pair; dummy reads
 * post each write before proceeding. */
void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_gang - return a reference to the current gang
 * @adev: amdgpu_device pointer
 *
 * Returns: A new reference to the current gang leader.
 */
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
{
	struct dma_fence *fence;

	rcu_read_lock();
	fence = dma_fence_get_rcu_safe(&adev->gang_submit);
	rcu_read_unlock();
	return fence;
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	/* reference handed over to adev->gang_submit on successful cmpxchg */
	dma_fence_get(gang);
	do {
		dma_fence_put(old);
		old = amdgpu_device_get_gang(adev);
		if (old == gang)
			break;

		/* current gang still running: caller must wait on it first */
		if (!dma_fence_is_signaled(old)) {
			dma_fence_put(gang);
			return old;
		}

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	/*
	 * Drop it once for the exchanged reference in adev and once for the
	 * thread local reference acquired in amdgpu_device_get_gang().
	 */
	dma_fence_put(old);
	dma_fence_put(old);
	return NULL;
}

/**
 * amdgpu_device_enforce_isolation - enforce HW isolation
 * @adev: the amdgpu device pointer
 * @ring: the HW ring the job is supposed to run on
 * @job: the job which is about to be pushed to the HW ring
 *
 * Makes sure that only one client at a time can use the GFX block.
 * Returns: The dependency to wait on before the job can be pushed to the HW.
 * The function is called multiple times until NULL is returned.
 */
struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
						  struct amdgpu_ring *ring,
						  struct amdgpu_job *job)
{
	/* Per-partition isolation state, selected by the ring's XCP id */
	struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id];
	struct drm_sched_fence *f = job->base.s_fence;
	struct dma_fence *dep;
	void *owner;
	int r;

	/*
	 * For now enforce isolation only for the GFX block since we only need
	 * the cleaner shader on those rings.
	 */
	if (ring->funcs->type != AMDGPU_RING_TYPE_GFX &&
	    ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
		return NULL;

	/*
	 * All submissions where enforce isolation is false are handled as if
	 * they come from a single client. Use ~0l as the owner to distinguish
	 * them from kernel submissions where the owner is NULL.
	 */
	owner = job->enforce_isolation ? f->owner : (void *)~0l;

	mutex_lock(&adev->enforce_isolation_mutex);

	/*
	 * The "spearhead" submission is the first one which changes the
	 * ownership to its client. We always need to wait for it to be
	 * pushed to the HW before proceeding with anything.
	 */
	if (&f->scheduled != isolation->spearhead &&
	    !dma_fence_is_signaled(isolation->spearhead)) {
		dep = isolation->spearhead;
		goto out_grab_ref;
	}

	if (isolation->owner != owner) {

		/*
		 * Wait for any gang to be assembled before switching to a
		 * different owner or otherwise we could deadlock the
		 * submissions.
		 */
		if (!job->gang_submit) {
			dep = amdgpu_device_get_gang(adev);
			if (!dma_fence_is_signaled(dep))
				goto out_return_dep;
			dma_fence_put(dep);
		}

		/* This job becomes the new spearhead: previous active work is
		 * moved to "prev" so the new owner can depend on it draining.
		 */
		dma_fence_put(isolation->spearhead);
		isolation->spearhead = dma_fence_get(&f->scheduled);
		amdgpu_sync_move(&isolation->active, &isolation->prev);
		trace_amdgpu_isolation(isolation->owner, owner);
		isolation->owner = owner;
	}

	/*
	 * Specifying the ring here helps to pipeline submissions even when
	 * isolation is enabled. If that is not desired for testing NULL can be
	 * used instead of the ring to enforce a CPU round trip while switching
	 * between clients.
	 */
	dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
	r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
	if (r)
		dev_warn(adev->dev, "OOM tracking isolation\n");

out_grab_ref:
	dma_fence_get(dep);
out_return_dep:
	mutex_unlock(&adev->enforce_isolation_mutex);
	return dep;
}

/*
 * amdgpu_device_has_display_hardware - check if the ASIC has display support
 *
 * Legacy ASICs are matched explicitly by chip id; anything newer falls
 * through to IP discovery, where a missing DCE IP block or a harvested DMU
 * means no display.
 */
bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask &
AMD_HARVEST_IP_DMU_MASK)) 7572 return false; 7573 return true; 7574 } 7575 } 7576 7577 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, 7578 uint32_t inst, uint32_t reg_addr, char reg_name[], 7579 uint32_t expected_value, uint32_t mask) 7580 { 7581 uint32_t ret = 0; 7582 uint32_t old_ = 0; 7583 uint32_t tmp_ = RREG32(reg_addr); 7584 uint32_t loop = adev->usec_timeout; 7585 7586 while ((tmp_ & (mask)) != (expected_value)) { 7587 if (old_ != tmp_) { 7588 loop = adev->usec_timeout; 7589 old_ = tmp_; 7590 } else 7591 udelay(1); 7592 tmp_ = RREG32(reg_addr); 7593 loop--; 7594 if (!loop) { 7595 dev_warn( 7596 adev->dev, 7597 "Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn", 7598 inst, reg_name, (uint32_t)expected_value, 7599 (uint32_t)(tmp_ & (mask))); 7600 ret = -ETIMEDOUT; 7601 break; 7602 } 7603 } 7604 return ret; 7605 } 7606 7607 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) 7608 { 7609 ssize_t size = 0; 7610 7611 if (!ring || !ring->adev) 7612 return size; 7613 7614 if (amdgpu_device_should_recover_gpu(ring->adev)) 7615 size |= AMDGPU_RESET_TYPE_FULL; 7616 7617 if (unlikely(!ring->adev->debug_disable_soft_recovery) && 7618 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery) 7619 size |= AMDGPU_RESET_TYPE_SOFT_RESET; 7620 7621 return size; 7622 } 7623 7624 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) 7625 { 7626 ssize_t size = 0; 7627 7628 if (supported_reset == 0) { 7629 size += sysfs_emit_at(buf, size, "unsupported"); 7630 size += sysfs_emit_at(buf, size, "\n"); 7631 return size; 7632 7633 } 7634 7635 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET) 7636 size += sysfs_emit_at(buf, size, "soft "); 7637 7638 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) 7639 size += sysfs_emit_at(buf, size, "queue "); 7640 7641 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE) 7642 size += sysfs_emit_at(buf, size, "pipe "); 7643 7644 if (supported_reset & AMDGPU_RESET_TYPE_FULL) 7645 size += 
sysfs_emit_at(buf, size, "full "); 7646 7647 size += sysfs_emit_at(buf, size, "\n"); 7648 return size; 7649 } 7650 7651 void amdgpu_device_set_uid(struct amdgpu_uid *uid_info, 7652 enum amdgpu_uid_type type, uint8_t inst, 7653 uint64_t uid) 7654 { 7655 if (!uid_info) 7656 return; 7657 7658 if (type >= AMDGPU_UID_TYPE_MAX) { 7659 dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n", 7660 type); 7661 return; 7662 } 7663 7664 if (inst >= AMDGPU_UID_INST_MAX) { 7665 dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n", 7666 inst); 7667 return; 7668 } 7669 7670 if (uid_info->uid[type][inst] != 0) { 7671 dev_warn_once( 7672 uid_info->adev->dev, 7673 "Overwriting existing UID %llu for type %d instance %d\n", 7674 uid_info->uid[type][inst], type, inst); 7675 } 7676 7677 uid_info->uid[type][inst] = uid; 7678 } 7679 7680 u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info, 7681 enum amdgpu_uid_type type, uint8_t inst) 7682 { 7683 if (!uid_info) 7684 return 0; 7685 7686 if (type >= AMDGPU_UID_TYPE_MAX) { 7687 dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n", 7688 type); 7689 return 0; 7690 } 7691 7692 if (inst >= AMDGPU_UID_INST_MAX) { 7693 dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n", 7694 inst); 7695 return 0; 7696 } 7697 7698 return uid_info->uid[type][inst]; 7699 } 7700