1 /*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
28
29 #include <linux/aperture.h>
30 #include <linux/power_supply.h>
31 #include <linux/kthread.h>
32 #include <linux/module.h>
33 #include <linux/console.h>
34 #include <linux/slab.h>
35 #include <linux/iommu.h>
36 #include <linux/pci.h>
37 #include <linux/pci-p2pdma.h>
38 #include <linux/apple-gmux.h>
39 #include <linux/nospec.h>
40
41 #include <drm/drm_atomic_helper.h>
42 #include <drm/drm_client_event.h>
43 #include <drm/drm_crtc_helper.h>
44 #include <drm/drm_probe_helper.h>
45 #include <drm/amdgpu_drm.h>
46 #include <linux/device.h>
47 #include <linux/vgaarb.h>
48 #include <linux/vga_switcheroo.h>
49 #include <linux/efi.h>
50 #include "amdgpu.h"
51 #include "amdgpu_trace.h"
52 #include "amdgpu_i2c.h"
53 #include "atom.h"
54 #include "amdgpu_atombios.h"
55 #include "amdgpu_atomfirmware.h"
56 #include "amd_pcie.h"
57 #ifdef CONFIG_DRM_AMDGPU_SI
58 #include "si.h"
59 #endif
60 #ifdef CONFIG_DRM_AMDGPU_CIK
61 #include "cik.h"
62 #endif
63 #include "vi.h"
64 #include "soc15.h"
65 #include "nv.h"
66 #include "bif/bif_4_1_d.h"
67 #include <linux/firmware.h>
68 #include "amdgpu_vf_error.h"
69
70 #include "amdgpu_amdkfd.h"
71 #include "amdgpu_pm.h"
72
73 #include "amdgpu_xgmi.h"
74 #include "amdgpu_ras.h"
75 #include "amdgpu_ras_mgr.h"
76 #include "amdgpu_pmu.h"
77 #include "amdgpu_fru_eeprom.h"
78 #include "amdgpu_reset.h"
79 #include "amdgpu_virt.h"
80 #include "amdgpu_dev_coredump.h"
81
82 #include <linux/suspend.h>
83 #include <drm/task_barrier.h>
84 #include <linux/pm_runtime.h>
85
86 #include <drm/drm_drv.h>
87
88 #if IS_ENABLED(CONFIG_X86)
89 #include <asm/intel-family.h>
90 #include <asm/cpu_device_id.h>
91 #endif
92
93 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
94 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
95 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
96 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
97 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
98 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
99 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
100 MODULE_FIRMWARE("amdgpu/cyan_skillfish_gpu_info.bin");
101
102 #define AMDGPU_RESUME_MS 2000
103 #define AMDGPU_MAX_RETRY_LIMIT 2
104 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
105 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
106 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
107 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)
108
109 #define AMDGPU_VBIOS_SKIP (1U << 0)
110 #define AMDGPU_VBIOS_OPTIONAL (1U << 1)
111
112 static const struct drm_driver amdgpu_kms_driver;
113
/* Human-readable ASIC names, indexed by adev->asic_type (presumably
 * enum amd_asic_type -- confirm against amd_shared.h). Keep in sync with
 * that enum; "LAST" is the terminator entry.
 */
const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};
154
155 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
/*
 * Default init level where all blocks are expected to be initialized. This is
 * the level of initialization expected by default and also after a full reset
 * of the device.
 */
struct amdgpu_init_level amdgpu_init_default = {
	.level = AMDGPU_INIT_LEVEL_DEFAULT,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Reset-recovery level: every IP block still takes part in hardware init,
 * only the level tag differs from the default.
 */
struct amdgpu_init_level amdgpu_init_recovery = {
	.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This
 * is used for cases like reset on initialization where the entire hive needs to
 * be reset before first use.
 */
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
	.hwini_ip_block_mask =
		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};
183
184 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev);
185 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev);
186 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev);
187
188 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev);
189
amdgpu_ip_member_of_hwini(struct amdgpu_device * adev,enum amd_ip_block_type block)190 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
191 enum amd_ip_block_type block)
192 {
193 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
194 }
195
amdgpu_set_init_level(struct amdgpu_device * adev,enum amdgpu_init_lvl_id lvl)196 void amdgpu_set_init_level(struct amdgpu_device *adev,
197 enum amdgpu_init_lvl_id lvl)
198 {
199 switch (lvl) {
200 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
201 adev->init_lvl = &amdgpu_init_minimal_xgmi;
202 break;
203 case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
204 adev->init_lvl = &amdgpu_init_recovery;
205 break;
206 case AMDGPU_INIT_LEVEL_DEFAULT:
207 fallthrough;
208 default:
209 adev->init_lvl = &amdgpu_init_default;
210 break;
211 }
212 }
213
214 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
215 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
216 void *data);
217
218 /**
219 * DOC: pcie_replay_count
220 *
221 * The amdgpu driver provides a sysfs API for reporting the total number
222 * of PCIe replays (NAKs).
223 * The file pcie_replay_count is used for this and returns the total
224 * number of replays as a sum of the NAKs generated and NAKs received.
225 */
226
amdgpu_device_get_pcie_replay_count(struct device * dev,struct device_attribute * attr,char * buf)227 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
228 struct device_attribute *attr, char *buf)
229 {
230 struct drm_device *ddev = dev_get_drvdata(dev);
231 struct amdgpu_device *adev = drm_to_adev(ddev);
232 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
233
234 return sysfs_emit(buf, "%llu\n", cnt);
235 }
236
237 static DEVICE_ATTR(pcie_replay_count, 0444,
238 amdgpu_device_get_pcie_replay_count, NULL);
239
amdgpu_device_attr_sysfs_init(struct amdgpu_device * adev)240 static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
241 {
242 int ret = 0;
243
244 if (amdgpu_nbio_is_replay_cnt_supported(adev))
245 ret = sysfs_create_file(&adev->dev->kobj,
246 &dev_attr_pcie_replay_count.attr);
247
248 return ret;
249 }
250
amdgpu_device_attr_sysfs_fini(struct amdgpu_device * adev)251 static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
252 {
253 if (amdgpu_nbio_is_replay_cnt_supported(adev))
254 sysfs_remove_file(&adev->dev->kobj,
255 &dev_attr_pcie_replay_count.attr);
256 }
257
amdgpu_sysfs_reg_state_get(struct file * f,struct kobject * kobj,const struct bin_attribute * attr,char * buf,loff_t ppos,size_t count)258 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
259 const struct bin_attribute *attr, char *buf,
260 loff_t ppos, size_t count)
261 {
262 struct device *dev = kobj_to_dev(kobj);
263 struct drm_device *ddev = dev_get_drvdata(dev);
264 struct amdgpu_device *adev = drm_to_adev(ddev);
265 ssize_t bytes_read;
266
267 switch (ppos) {
268 case AMDGPU_SYS_REG_STATE_XGMI:
269 bytes_read = amdgpu_asic_get_reg_state(
270 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
271 break;
272 case AMDGPU_SYS_REG_STATE_WAFL:
273 bytes_read = amdgpu_asic_get_reg_state(
274 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
275 break;
276 case AMDGPU_SYS_REG_STATE_PCIE:
277 bytes_read = amdgpu_asic_get_reg_state(
278 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
279 break;
280 case AMDGPU_SYS_REG_STATE_USR:
281 bytes_read = amdgpu_asic_get_reg_state(
282 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
283 break;
284 case AMDGPU_SYS_REG_STATE_USR_1:
285 bytes_read = amdgpu_asic_get_reg_state(
286 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
287 break;
288 default:
289 return -EINVAL;
290 }
291
292 return bytes_read;
293 }
294
295 static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
296 AMDGPU_SYS_REG_STATE_END);
297
amdgpu_reg_state_sysfs_init(struct amdgpu_device * adev)298 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
299 {
300 int ret;
301
302 if (!amdgpu_asic_get_reg_state_supported(adev))
303 return 0;
304
305 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
306
307 return ret;
308 }
309
amdgpu_reg_state_sysfs_fini(struct amdgpu_device * adev)310 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
311 {
312 if (!amdgpu_asic_get_reg_state_supported(adev))
313 return;
314 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
315 }
316
317 /**
318 * DOC: board_info
319 *
320 * The amdgpu driver provides a sysfs API for giving board related information.
321 * It provides the form factor information in the format
322 *
323 * type : form factor
324 *
325 * Possible form factor values
326 *
327 * - "cem" - PCIE CEM card
328 * - "oam" - Open Compute Accelerator Module
329 * - "unknown" - Not known
330 *
331 */
332
amdgpu_device_get_board_info(struct device * dev,struct device_attribute * attr,char * buf)333 static ssize_t amdgpu_device_get_board_info(struct device *dev,
334 struct device_attribute *attr,
335 char *buf)
336 {
337 struct drm_device *ddev = dev_get_drvdata(dev);
338 struct amdgpu_device *adev = drm_to_adev(ddev);
339 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
340 const char *pkg;
341
342 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
343 pkg_type = adev->smuio.funcs->get_pkg_type(adev);
344
345 switch (pkg_type) {
346 case AMDGPU_PKG_TYPE_CEM:
347 pkg = "cem";
348 break;
349 case AMDGPU_PKG_TYPE_OAM:
350 pkg = "oam";
351 break;
352 default:
353 pkg = "unknown";
354 break;
355 }
356
357 return sysfs_emit(buf, "%s : %s\n", "type", pkg);
358 }
359
360 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);
361
/* Board attributes; visibility is gated by amdgpu_board_attrs_is_visible(). */
static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};
366
amdgpu_board_attrs_is_visible(struct kobject * kobj,struct attribute * attr,int n)367 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
368 struct attribute *attr, int n)
369 {
370 struct device *dev = kobj_to_dev(kobj);
371 struct drm_device *ddev = dev_get_drvdata(dev);
372 struct amdgpu_device *adev = drm_to_adev(ddev);
373
374 if (adev->flags & AMD_IS_APU)
375 return 0;
376
377 return attr->mode;
378 }
379
/* sysfs group for board-level attributes (currently only board_info). */
static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};
384
385 /**
386 * DOC: uma/carveout_options
387 *
388 * This is a read-only file that lists all available UMA allocation
389 * options and their corresponding indices. Example output::
390 *
391 * $ cat uma/carveout_options
392 * 0: Minimum (512 MB)
393 * 1: (1 GB)
394 * 2: (2 GB)
395 * 3: (4 GB)
396 * 4: (6 GB)
397 * 5: (8 GB)
398 * 6: (12 GB)
399 * 7: Medium (16 GB)
400 * 8: (24 GB)
401 * 9: High (32 GB)
402 */
carveout_options_show(struct device * dev,struct device_attribute * attr,char * buf)403 static ssize_t carveout_options_show(struct device *dev,
404 struct device_attribute *attr,
405 char *buf)
406 {
407 struct drm_device *ddev = dev_get_drvdata(dev);
408 struct amdgpu_device *adev = drm_to_adev(ddev);
409 struct amdgpu_uma_carveout_info *uma_info = &adev->uma_info;
410 uint32_t memory_carved;
411 ssize_t size = 0;
412
413 if (!uma_info || !uma_info->num_entries)
414 return -ENODEV;
415
416 for (int i = 0; i < uma_info->num_entries; i++) {
417 memory_carved = uma_info->entries[i].memory_carved_mb;
418 if (memory_carved >= SZ_1G/SZ_1M) {
419 size += sysfs_emit_at(buf, size, "%d: %s (%u GB)\n",
420 i,
421 uma_info->entries[i].name,
422 memory_carved >> 10);
423 } else {
424 size += sysfs_emit_at(buf, size, "%d: %s (%u MB)\n",
425 i,
426 uma_info->entries[i].name,
427 memory_carved);
428 }
429 }
430
431 return size;
432 }
433 static DEVICE_ATTR_RO(carveout_options);
434
435 /**
436 * DOC: uma/carveout
437 *
438 * This file is both readable and writable. When read, it shows the
439 * index of the current setting. Writing a valid index to this file
440 * allows users to change the UMA carveout size to the selected option
441 * on the next boot.
442 *
443 * The available options and their corresponding indices can be read
444 * from the uma/carveout_options file.
445 */
carveout_show(struct device * dev,struct device_attribute * attr,char * buf)446 static ssize_t carveout_show(struct device *dev,
447 struct device_attribute *attr,
448 char *buf)
449 {
450 struct drm_device *ddev = dev_get_drvdata(dev);
451 struct amdgpu_device *adev = drm_to_adev(ddev);
452
453 return sysfs_emit(buf, "%u\n", adev->uma_info.uma_option_index);
454 }
455
/* sysfs store handler: selects a UMA carveout option by index.  The new size
 * is programmed via ACPI and takes effect on the next boot.
 */
static ssize_t carveout_store(struct device *dev,
			      struct device_attribute *attr,
			      const char *buf, size_t count)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	struct amdgpu_uma_carveout_info *uma_info = &adev->uma_info;
	struct amdgpu_uma_carveout_option *opt;
	unsigned long val;
	uint8_t flags;
	int r;

	/* Index is written as a decimal number. */
	r = kstrtoul(buf, 10, &val);
	if (r)
		return r;

	if (val >= uma_info->num_entries)
		return -EINVAL;

	/* Clamp under speculation: val is untrusted user input used as an
	 * array index.
	 */
	val = array_index_nospec(val, uma_info->num_entries);
	opt = &uma_info->entries[val];

	/* Only options the platform can actually apply are selectable. */
	if (!(opt->flags & AMDGPU_UMA_FLAG_AUTO) &&
	    !(opt->flags & AMDGPU_UMA_FLAG_CUSTOM)) {
		drm_err_once(ddev, "Option %lu not supported due to lack of Custom/Auto flag", val);
		return -EINVAL;
	}

	flags = opt->flags;
	/* Clears the flag bit one position below AUTO whenever AUTO is set --
	 * presumably so only one of AUTO/CUSTOM is passed to the platform.
	 * TODO(review): confirm the AMDGPU_UMA_FLAG_* bit layout.
	 */
	flags &= ~((flags & AMDGPU_UMA_FLAG_AUTO) >> 1);

	/* Scoped lock: released automatically on every return below. */
	guard(mutex)(&uma_info->update_lock);

	r = amdgpu_acpi_set_uma_allocation_size(adev, val, flags);
	if (r)
		return r;

	uma_info->uma_option_index = val;

	return count;
}
497 static DEVICE_ATTR_RW(carveout);
498
/* Files exposed under the "uma" sysfs directory. */
static struct attribute *amdgpu_uma_attrs[] = {
	&dev_attr_carveout.attr,
	&dev_attr_carveout_options.attr,
	NULL
};

const struct attribute_group amdgpu_uma_attr_group = {
	.name = "uma",
	.attrs = amdgpu_uma_attrs
};
509
amdgpu_uma_sysfs_init(struct amdgpu_device * adev)510 static void amdgpu_uma_sysfs_init(struct amdgpu_device *adev)
511 {
512 int rc;
513
514 if (!(adev->flags & AMD_IS_APU))
515 return;
516
517 if (!amdgpu_acpi_is_set_uma_allocation_size_supported())
518 return;
519
520 rc = amdgpu_atomfirmware_get_uma_carveout_info(adev, &adev->uma_info);
521 if (rc) {
522 drm_dbg(adev_to_drm(adev),
523 "Failed to parse UMA carveout info from VBIOS: %d\n", rc);
524 goto out_info;
525 }
526
527 mutex_init(&adev->uma_info.update_lock);
528
529 rc = devm_device_add_group(adev->dev, &amdgpu_uma_attr_group);
530 if (rc) {
531 drm_dbg(adev_to_drm(adev), "Failed to add UMA carveout sysfs interfaces %d\n", rc);
532 goto out_attr;
533 }
534
535 return;
536
537 out_attr:
538 mutex_destroy(&adev->uma_info.update_lock);
539 out_info:
540 return;
541 }
542
amdgpu_uma_sysfs_fini(struct amdgpu_device * adev)543 static void amdgpu_uma_sysfs_fini(struct amdgpu_device *adev)
544 {
545 struct amdgpu_uma_carveout_info *uma_info = &adev->uma_info;
546
547 if (!amdgpu_acpi_is_set_uma_allocation_size_supported())
548 return;
549
550 mutex_destroy(&uma_info->update_lock);
551 uma_info->num_entries = 0;
552 }
553
554 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
555
556 /**
557 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
558 *
559 * @adev: amdgpu device pointer
560 *
561 * Returns true if the device is a dGPU with ATPX power control,
562 * otherwise return false.
563 */
amdgpu_device_supports_px(struct amdgpu_device * adev)564 bool amdgpu_device_supports_px(struct amdgpu_device *adev)
565 {
566 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
567 return true;
568 return false;
569 }
570
571 /**
572 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
573 *
574 * @adev: amdgpu device pointer
575 *
576 * Returns true if the device is a dGPU with ACPI power control,
577 * otherwise return false.
578 */
amdgpu_device_supports_boco(struct amdgpu_device * adev)579 bool amdgpu_device_supports_boco(struct amdgpu_device *adev)
580 {
581 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
582 return false;
583
584 if (adev->has_pr3 ||
585 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
586 return true;
587 return false;
588 }
589
590 /**
591 * amdgpu_device_supports_baco - Does the device support BACO
592 *
593 * @adev: amdgpu device pointer
594 *
595 * Return:
596 * 1 if the device supports BACO;
597 * 3 if the device supports MACO (only works if BACO is supported)
598 * otherwise return 0.
599 */
int amdgpu_device_supports_baco(struct amdgpu_device *adev)
{
	/* Delegates to the ASIC callback; see the kernel-doc above for the
	 * meaning of the returned BACO/MACO support values.
	 */
	return amdgpu_asic_supports_baco(adev);
}
604
/*
 * amdgpu_device_detect_runtime_pm_mode - pick the runtime PM mechanism
 *
 * @adev: amdgpu device pointer
 *
 * Selects adev->pm.rpm_mode based on the amdgpu_runtime_pm module parameter
 * and hardware capabilities:
 *   2  -> force BAMACO (fall back to BACO if MACO is unavailable)
 *   1  -> force BACO
 *   0  -> runtime PM disabled
 *  -1/-2 (auto) -> prefer PX, then BOCO, then BACO/BAMACO per-ASIC policy
 */
void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	int bamaco_support;

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(adev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available,fallback to use BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(adev)) {
			/* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(adev)) {
			/* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry && !amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				if (!amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			/* Upgrade BACO to BAMACO where MACO is also available. */
			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
680 /**
681 * amdgpu_device_supports_smart_shift - Is the device dGPU with
682 * smart shift support
683 *
684 * @adev: amdgpu device pointer
685 *
686 * Returns true if the device is a dGPU with Smart Shift support,
687 * otherwise returns false.
688 */
amdgpu_device_supports_smart_shift(struct amdgpu_device * adev)689 bool amdgpu_device_supports_smart_shift(struct amdgpu_device *adev)
690 {
691 return (amdgpu_device_supports_boco(adev) &&
692 amdgpu_acpi_is_power_shift_control_supported());
693 }
694
695 /*
696 * VRAM access helper functions
697 */
698
699 /**
700 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
701 *
702 * @adev: amdgpu_device pointer
703 * @pos: offset of the buffer in vram
704 * @buf: virtual address of the buffer in system memory
705 * @size: read/write size, sizeof(@buf) must > @size
706 * @write: true - write to vram, otherwise - read from vram
707 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	/* hi caches the last value written to MM_INDEX_HI so it is only
	 * reprogrammed when the access crosses into a new high window.
	 */
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	/* Bail out if the device has been unplugged. */
	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	/* Only whole dwords can be moved through MM_INDEX/MM_DATA. */
	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	/* The index/data pair is a shared resource; serialize all users. */
	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		/* Bits above 31 of the VRAM offset go into MM_INDEX_HI;
		 * bit 31 of MM_INDEX selects the upper aperture.
		 */
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}
740
741 /**
742 * amdgpu_device_aper_access - access vram by vram aperture
743 *
744 * @adev: amdgpu_device pointer
745 * @pos: offset of the buffer in vram
746 * @buf: virtual address of the buffer in system memory
747 * @size: read/write size, sizeof(@buf) must > @size
748 * @write: true - write to vram, otherwise - read from vram
749 *
750 * The return value means how many bytes have been transferred.
751 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	/* Only 64-bit kernels map the full CPU-visible VRAM aperture. */
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	/* Clamp the transfer to the CPU-visible portion of VRAM; the caller
	 * handles whatever is left over via MM_INDEX/MM_DATA.
	 */
	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}
791
792 /**
793 * amdgpu_device_vram_access - read/write a buffer in vram
794 *
795 * @adev: amdgpu_device pointer
796 * @pos: offset of the buffer in vram
797 * @buf: virtual address of the buffer in system memory
798 * @size: read/write size, sizeof(@buf) must > @size
799 * @write: true - write to vram, otherwise - read from vram
800 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t bytes;

	/* Prefer the CPU-visible VRAM aperture first; it is the fast path. */
	bytes = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= bytes;
	if (size) {
		/* Fall back to MM_INDEX/MM_DATA for whatever the aperture
		 * could not reach.
		 */
		amdgpu_device_mm_access(adev, pos + bytes, buf + bytes,
					size, write);
	}
}
816
817 /*
818 * register access helper functions.
819 */
820
821 /* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	/* no_hw_access is presumably set on unplug or fatal device error --
	 * confirm against the setters elsewhere in the driver.
	 */
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}
848
849 /**
850 * amdgpu_device_rreg - read a memory mapped IO or indirect register
851 *
852 * @adev: amdgpu_device pointer
853 * @reg: dword aligned register offset
854 * @acc_flags: access flags which require special behavior
855 *
856 * Returns the 32 bit value from the offset specified.
857 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		/* Under SR-IOV runtime, route through the KIQ unless the
		 * caller opted out; the trylock skips KIQ while a GPU reset
		 * holds the write side of the reset semaphore.
		 */
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		/* Out of MMIO range: use the indirect PCIe access path. */
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}
883
884 /*
885 * MMIO register read with bytes helper functions
886 * @offset:bytes offset from MMIO start
887 */
888
889 /**
890 * amdgpu_mm_rreg8 - read a memory mapped IO register
891 *
892 * @adev: amdgpu_device pointer
893 * @offset: byte aligned register offset
894 *
895 * Returns the 8 bit value from the offset specified.
896 */
amdgpu_mm_rreg8(struct amdgpu_device * adev,uint32_t offset)897 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
898 {
899 if (amdgpu_device_skip_hw_access(adev))
900 return 0;
901
902 if (offset < adev->rmmio_size)
903 return (readb(adev->rmmio + offset));
904 BUG();
905 }
906
907
908 /**
909 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
910 *
911 * @adev: amdgpu_device pointer
912 * @reg: dword aligned register offset
913 * @acc_flags: access flags which require special behavior
914 * @xcc_id: xcc accelerated compute core id
915 *
916 * Returns the 32 bit value from the offset specified.
917 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		/* SR-IOV VF outside host runtime: use the RLCG interface when
		 * this register range supports it.
		 */
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		/* SR-IOV runtime: go through the KIQ unless the caller opted
		 * out or a GPU reset holds the reset semaphore.
		 */
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		/* Out of MMIO range: indirect PCIe access. */
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}
949
950 /*
951 * MMIO register write with bytes helper functions
952 * @offset:bytes offset from MMIO start
953 * @value: the value want to be written to the register
954 */
955
956 /**
957 * amdgpu_mm_wreg8 - read a memory mapped IO register
958 *
959 * @adev: amdgpu_device pointer
960 * @offset: byte aligned register offset
961 * @value: 8 bit value to write
962 *
963 * Writes the value specified to the offset specified.
964 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	/* Byte accesses have no indirect fallback; out of range is fatal. */
	if (offset >= adev->rmmio_size)
		BUG();

	writeb(value, adev->rmmio + offset);
}
975
976 /**
977 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
978 *
979 * @adev: amdgpu_device pointer
980 * @reg: dword aligned register offset
981 * @v: 32 bit value to write to the register
982 * @acc_flags: access flags which require special behavior
983 *
984 * Writes the value specified to the offset specified.
985 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		/* Under SR-IOV runtime, route through the KIQ unless the
		 * caller opted out; the trylock skips KIQ while a GPU reset
		 * holds the write side of the reset semaphore.
		 */
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		/* Out of MMIO range: indirect PCIe access. */
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}
1008
1009 /**
1010 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
1011 *
1012 * @adev: amdgpu_device pointer
1013 * @reg: mmio/rlc register
1014 * @v: value to write
1015 * @xcc_id: xcc accelerated compute core id
1016 *
1017 * this function is invoked only for the debugfs register access
1018 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
		/* NOTE(review): when full access is available but @reg is
		 * outside the RLCG range, the write is silently dropped --
		 * confirm this is intentional for the debugfs path.
		 */
	} else if ((reg * 4) >= adev->rmmio_size) {
		/* Out of MMIO range: indirect PCIe access. */
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}
1037
1038 /**
1039 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
1040 *
1041 * @adev: amdgpu_device pointer
1042 * @reg: dword aligned register offset
1043 * @v: 32 bit value to write to the register
1044 * @acc_flags: access flags which require special behavior
1045 * @xcc_id: xcc accelerated compute core id
1046 *
1047 * Writes the value specified to the offset specified.
1048 */
amdgpu_device_xcc_wreg(struct amdgpu_device * adev,uint32_t reg,uint32_t v,uint32_t acc_flags,uint32_t xcc_id)1049 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
1050 uint32_t reg, uint32_t v,
1051 uint32_t acc_flags, uint32_t xcc_id)
1052 {
1053 uint32_t rlcg_flag;
1054
1055 if (amdgpu_device_skip_hw_access(adev))
1056 return;
1057
1058 if ((reg * 4) < adev->rmmio_size) {
1059 if (amdgpu_sriov_vf(adev) &&
1060 !amdgpu_sriov_runtime(adev) &&
1061 adev->gfx.rlc.rlcg_reg_access_supported &&
1062 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
1063 GC_HWIP, true,
1064 &rlcg_flag)) {
1065 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
1066 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
1067 amdgpu_sriov_runtime(adev) &&
1068 down_read_trylock(&adev->reset_domain->sem)) {
1069 amdgpu_kiq_wreg(adev, reg, v, xcc_id);
1070 up_read(&adev->reset_domain->sem);
1071 } else {
1072 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
1073 }
1074 } else {
1075 adev->pcie_wreg(adev, reg * 4, v);
1076 }
1077 }
1078
1079 /**
1080 * amdgpu_device_indirect_rreg - read an indirect register
1081 *
1082 * @adev: amdgpu_device pointer
1083 * @reg_addr: indirect register address to read from
1084 *
1085 * Returns the value of indirect register @reg_addr
1086 */
amdgpu_device_indirect_rreg(struct amdgpu_device * adev,u32 reg_addr)1087 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
1088 u32 reg_addr)
1089 {
1090 unsigned long flags, pcie_index, pcie_data;
1091 void __iomem *pcie_index_offset;
1092 void __iomem *pcie_data_offset;
1093 u32 r;
1094
1095 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1096 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1097
1098 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1099 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1100 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1101
1102 writel(reg_addr, pcie_index_offset);
1103 readl(pcie_index_offset);
1104 r = readl(pcie_data_offset);
1105 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1106
1107 return r;
1108 }
1109
/**
 * amdgpu_device_indirect_rreg_ext - read an indirect register with 64-bit offset
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from (may exceed 32 bits)
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	/* nbio callbacks may not be installed yet this early in init;
	 * fall back to fixed index/data offsets in that case.
	 */
	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	/* a high index register is only needed when the address has bits
	 * above 32; pcie_index_hi == 0 means "not used" below.
	 */
	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* program low (and high) index; each readl flushes the posted write */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}
1161
1162 /**
1163 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
1164 *
1165 * @adev: amdgpu_device pointer
1166 * @reg_addr: indirect register address to read from
1167 *
1168 * Returns the value of indirect register @reg_addr
1169 */
amdgpu_device_indirect_rreg64(struct amdgpu_device * adev,u32 reg_addr)1170 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
1171 u32 reg_addr)
1172 {
1173 unsigned long flags, pcie_index, pcie_data;
1174 void __iomem *pcie_index_offset;
1175 void __iomem *pcie_data_offset;
1176 u64 r;
1177
1178 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1179 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1180
1181 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1182 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1183 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1184
1185 /* read low 32 bits */
1186 writel(reg_addr, pcie_index_offset);
1187 readl(pcie_index_offset);
1188 r = readl(pcie_data_offset);
1189 /* read high 32 bits */
1190 writel(reg_addr + 4, pcie_index_offset);
1191 readl(pcie_index_offset);
1192 r |= ((u64)readl(pcie_data_offset) << 32);
1193 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1194
1195 return r;
1196 }
1197
/**
 * amdgpu_device_indirect_rreg64_ext - read a 64bits indirect register with 64-bit offset
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from (may exceed 32 bits)
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	/* high index register only exists/needed when the address has
	 * bits above 32; pcie_index_hi == 0 means "not used" below
	 */
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
			pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}
1247
1248 /**
1249 * amdgpu_device_indirect_wreg - write an indirect register address
1250 *
1251 * @adev: amdgpu_device pointer
1252 * @reg_addr: indirect register offset
1253 * @reg_data: indirect register data
1254 *
1255 */
amdgpu_device_indirect_wreg(struct amdgpu_device * adev,u32 reg_addr,u32 reg_data)1256 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
1257 u32 reg_addr, u32 reg_data)
1258 {
1259 unsigned long flags, pcie_index, pcie_data;
1260 void __iomem *pcie_index_offset;
1261 void __iomem *pcie_data_offset;
1262
1263 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1264 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1265
1266 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1267 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1268 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1269
1270 writel(reg_addr, pcie_index_offset);
1271 readl(pcie_index_offset);
1272 writel(reg_data, pcie_data_offset);
1273 readl(pcie_data_offset);
1274 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1275 }
1276
/**
 * amdgpu_device_indirect_wreg_ext - write an indirect register with 64-bit offset
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset (may exceed 32 bits)
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	/* high index register only used when the address has bits above 32;
	 * pcie_index_hi == 0 means "not used" below
	 */
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* program low (and high) index, then write the data; each readl
	 * flushes the preceding posted write
	 */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
1316
1317 /**
1318 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
1319 *
1320 * @adev: amdgpu_device pointer
1321 * @reg_addr: indirect register offset
1322 * @reg_data: indirect register data
1323 *
1324 */
amdgpu_device_indirect_wreg64(struct amdgpu_device * adev,u32 reg_addr,u64 reg_data)1325 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
1326 u32 reg_addr, u64 reg_data)
1327 {
1328 unsigned long flags, pcie_index, pcie_data;
1329 void __iomem *pcie_index_offset;
1330 void __iomem *pcie_data_offset;
1331
1332 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1333 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1334
1335 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1336 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1337 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1338
1339 /* write low 32 bits */
1340 writel(reg_addr, pcie_index_offset);
1341 readl(pcie_index_offset);
1342 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1343 readl(pcie_data_offset);
1344 /* write high 32 bits */
1345 writel(reg_addr + 4, pcie_index_offset);
1346 readl(pcie_index_offset);
1347 writel((u32)(reg_data >> 32), pcie_data_offset);
1348 readl(pcie_data_offset);
1349 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1350 }
1351
/**
 * amdgpu_device_indirect_wreg64_ext - write a 64bits indirect register with 64-bit offset
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset (may exceed 32 bits)
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	/* high index register only used when the address has bits above 32;
	 * pcie_index_hi == 0 means "not used" below
	 */
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
1400
1401 /**
1402 * amdgpu_device_get_rev_id - query device rev_id
1403 *
1404 * @adev: amdgpu_device pointer
1405 *
1406 * Return device rev_id
1407 */
amdgpu_device_get_rev_id(struct amdgpu_device * adev)1408 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
1409 {
1410 return adev->nbio.funcs->get_rev_id(adev);
1411 }
1412
1413 /**
1414 * amdgpu_invalid_rreg - dummy reg read function
1415 *
1416 * @adev: amdgpu_device pointer
1417 * @reg: offset of register
1418 *
1419 * Dummy register read function. Used for register blocks
1420 * that certain asics don't have (all asics).
1421 * Returns the value in the register.
1422 */
amdgpu_invalid_rreg(struct amdgpu_device * adev,uint32_t reg)1423 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
1424 {
1425 dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg);
1426 BUG();
1427 return 0;
1428 }
1429
amdgpu_invalid_rreg_ext(struct amdgpu_device * adev,uint64_t reg)1430 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
1431 {
1432 dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
1433 BUG();
1434 return 0;
1435 }
1436
1437 /**
1438 * amdgpu_invalid_wreg - dummy reg write function
1439 *
1440 * @adev: amdgpu_device pointer
1441 * @reg: offset of register
1442 * @v: value to write to the register
1443 *
1444 * Dummy register read function. Used for register blocks
1445 * that certain asics don't have (all asics).
1446 */
amdgpu_invalid_wreg(struct amdgpu_device * adev,uint32_t reg,uint32_t v)1447 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
1448 {
1449 dev_err(adev->dev,
1450 "Invalid callback to write register 0x%04X with 0x%08X\n", reg,
1451 v);
1452 BUG();
1453 }
1454
/* Placeholder 64-bit-offset write callback: calling it is a driver bug. */
static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%llX with 0x%08X\n",
		reg, v);
	BUG();
}
1462
1463 /**
1464 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
1465 *
1466 * @adev: amdgpu_device pointer
1467 * @reg: offset of register
1468 *
1469 * Dummy register read function. Used for register blocks
1470 * that certain asics don't have (all asics).
1471 * Returns the value in the register.
1472 */
amdgpu_invalid_rreg64(struct amdgpu_device * adev,uint32_t reg)1473 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
1474 {
1475 dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n",
1476 reg);
1477 BUG();
1478 return 0;
1479 }
1480
/* Placeholder 64-bit read callback (64-bit offset): calling it is a driver
 * bug. Message now says "64 bit register" to match amdgpu_invalid_rreg64;
 * it previously read "register", which made the log ambiguous about which
 * dummy callback fired.
 */
static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev,
		"Invalid callback to read 64 bit register 0x%llX\n", reg);
	BUG();
	return 0;
}
1487
1488 /**
1489 * amdgpu_invalid_wreg64 - dummy reg write function
1490 *
1491 * @adev: amdgpu_device pointer
1492 * @reg: offset of register
1493 * @v: value to write to the register
1494 *
1495 * Dummy register read function. Used for register blocks
1496 * that certain asics don't have (all asics).
1497 */
amdgpu_invalid_wreg64(struct amdgpu_device * adev,uint32_t reg,uint64_t v)1498 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
1499 {
1500 dev_err(adev->dev,
1501 "Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
1502 reg, v);
1503 BUG();
1504 }
1505
/* Placeholder 64-bit write callback (64-bit offset): calling it is a driver bug. */
static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		reg, v);
	BUG();
}
1513
1514 /**
1515 * amdgpu_block_invalid_rreg - dummy reg read function
1516 *
1517 * @adev: amdgpu_device pointer
1518 * @block: offset of instance
1519 * @reg: offset of register
1520 *
1521 * Dummy register read function. Used for register blocks
1522 * that certain asics don't have (all asics).
1523 * Returns the value in the register.
1524 */
amdgpu_block_invalid_rreg(struct amdgpu_device * adev,uint32_t block,uint32_t reg)1525 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
1526 uint32_t block, uint32_t reg)
1527 {
1528 dev_err(adev->dev,
1529 "Invalid callback to read register 0x%04X in block 0x%04X\n",
1530 reg, block);
1531 BUG();
1532 return 0;
1533 }
1534
1535 /**
1536 * amdgpu_block_invalid_wreg - dummy reg write function
1537 *
1538 * @adev: amdgpu_device pointer
1539 * @block: offset of instance
1540 * @reg: offset of register
1541 * @v: value to write to the register
1542 *
1543 * Dummy register read function. Used for register blocks
1544 * that certain asics don't have (all asics).
1545 */
amdgpu_block_invalid_wreg(struct amdgpu_device * adev,uint32_t block,uint32_t reg,uint32_t v)1546 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
1547 uint32_t block,
1548 uint32_t reg, uint32_t v)
1549 {
1550 dev_err(adev->dev,
1551 "Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
1552 reg, block, v);
1553 BUG();
1554 }
1555
amdgpu_device_get_vbios_flags(struct amdgpu_device * adev)1556 static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
1557 {
1558 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1559 return AMDGPU_VBIOS_SKIP;
1560
1561 if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
1562 return AMDGPU_VBIOS_OPTIONAL;
1563
1564 return 0;
1565 }
1566
1567 /**
1568 * amdgpu_device_asic_init - Wrapper for atom asic_init
1569 *
1570 * @adev: amdgpu_device pointer
1571 *
1572 * Does any asic specific work and then calls atom asic init.
1573 */
amdgpu_device_asic_init(struct amdgpu_device * adev)1574 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
1575 {
1576 uint32_t flags;
1577 bool optional;
1578 int ret;
1579
1580 amdgpu_asic_pre_asic_init(adev);
1581 flags = amdgpu_device_get_vbios_flags(adev);
1582 optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));
1583
1584 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
1585 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
1586 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
1587 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
1588 amdgpu_psp_wait_for_bootloader(adev);
1589 if (optional && !adev->bios)
1590 return 0;
1591
1592 ret = amdgpu_atomfirmware_asic_init(adev, true);
1593 return ret;
1594 } else {
1595 if (optional && !adev->bios)
1596 return 0;
1597
1598 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
1599 }
1600
1601 return 0;
1602 }
1603
1604 /**
1605 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
1606 *
1607 * @adev: amdgpu_device pointer
1608 *
1609 * Allocates a scratch page of VRAM for use by various things in the
1610 * driver.
1611 */
amdgpu_device_mem_scratch_init(struct amdgpu_device * adev)1612 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
1613 {
1614 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
1615 AMDGPU_GEM_DOMAIN_VRAM |
1616 AMDGPU_GEM_DOMAIN_GTT,
1617 &adev->mem_scratch.robj,
1618 &adev->mem_scratch.gpu_addr,
1619 (void **)&adev->mem_scratch.ptr);
1620 }
1621
1622 /**
1623 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
1624 *
1625 * @adev: amdgpu_device pointer
1626 *
1627 * Frees the VRAM scratch page.
1628 */
amdgpu_device_mem_scratch_fini(struct amdgpu_device * adev)1629 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
1630 {
1631 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
1632 }
1633
1634 /**
1635 * amdgpu_device_program_register_sequence - program an array of registers.
1636 *
1637 * @adev: amdgpu_device pointer
1638 * @registers: pointer to the register array
1639 * @array_size: size of the register array
1640 *
1641 * Programs an array or registers with and or masks.
1642 * This is a helper for setting golden registers.
1643 */
amdgpu_device_program_register_sequence(struct amdgpu_device * adev,const u32 * registers,const u32 array_size)1644 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
1645 const u32 *registers,
1646 const u32 array_size)
1647 {
1648 u32 tmp, reg, and_mask, or_mask;
1649 int i;
1650
1651 if (array_size % 3)
1652 return;
1653
1654 for (i = 0; i < array_size; i += 3) {
1655 reg = registers[i + 0];
1656 and_mask = registers[i + 1];
1657 or_mask = registers[i + 2];
1658
1659 if (and_mask == 0xffffffff) {
1660 tmp = or_mask;
1661 } else {
1662 tmp = RREG32(reg);
1663 tmp &= ~and_mask;
1664 if (adev->family >= AMDGPU_FAMILY_AI)
1665 tmp |= (or_mask & and_mask);
1666 else
1667 tmp |= or_mask;
1668 }
1669 WREG32(reg, tmp);
1670 }
1671 }
1672
1673 /**
1674 * amdgpu_device_pci_config_reset - reset the GPU
1675 *
1676 * @adev: amdgpu_device pointer
1677 *
1678 * Resets the GPU using the pci config reset sequence.
1679 * Only applicable to asics prior to vega10.
1680 */
amdgpu_device_pci_config_reset(struct amdgpu_device * adev)1681 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
1682 {
1683 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1684 }
1685
1686 /**
1687 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1688 *
1689 * @adev: amdgpu_device pointer
1690 *
1691 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1692 */
amdgpu_device_pci_reset(struct amdgpu_device * adev)1693 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1694 {
1695 return pci_reset_function(adev->pdev);
1696 }
1697
1698 /*
1699 * amdgpu_device_wb_*()
1700 * Writeback is the method by which the GPU updates special pages in memory
1701 * with the status of certain GPU events (fences, ring pointers,etc.).
1702 */
1703
1704 /**
1705 * amdgpu_device_wb_fini - Disable Writeback and free memory
1706 *
1707 * @adev: amdgpu_device pointer
1708 *
1709 * Disables Writeback and frees the Writeback memory (all asics).
1710 * Used at driver shutdown.
1711 */
amdgpu_device_wb_fini(struct amdgpu_device * adev)1712 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1713 {
1714 if (adev->wb.wb_obj) {
1715 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1716 &adev->wb.gpu_addr,
1717 (void **)&adev->wb.wb);
1718 adev->wb.wb_obj = NULL;
1719 }
1720 }
1721
1722 /**
1723 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1724 *
1725 * @adev: amdgpu_device pointer
1726 *
1727 * Initializes writeback and allocates writeback memory (all asics).
1728 * Used at driver startup.
1729 * Returns 0 on success or an -error on failure.
1730 */
amdgpu_device_wb_init(struct amdgpu_device * adev)1731 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1732 {
1733 int r;
1734
1735 if (adev->wb.wb_obj == NULL) {
1736 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1737 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1738 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1739 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1740 (void **)&adev->wb.wb);
1741 if (r) {
1742 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1743 return r;
1744 }
1745
1746 adev->wb.num_wb = AMDGPU_MAX_WB;
1747 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1748
1749 /* clear wb memory */
1750 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1751 }
1752
1753 return 0;
1754 }
1755
1756 /**
1757 * amdgpu_device_wb_get - Allocate a wb entry
1758 *
1759 * @adev: amdgpu_device pointer
1760 * @wb: wb index
1761 *
1762 * Allocate a wb slot for use by the driver (all asics).
1763 * Returns 0 on success or -EINVAL on failure.
1764 */
amdgpu_device_wb_get(struct amdgpu_device * adev,u32 * wb)1765 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1766 {
1767 unsigned long flags, offset;
1768
1769 spin_lock_irqsave(&adev->wb.lock, flags);
1770 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1771 if (offset < adev->wb.num_wb) {
1772 __set_bit(offset, adev->wb.used);
1773 spin_unlock_irqrestore(&adev->wb.lock, flags);
1774 *wb = offset << 3; /* convert to dw offset */
1775 return 0;
1776 } else {
1777 spin_unlock_irqrestore(&adev->wb.lock, flags);
1778 return -EINVAL;
1779 }
1780 }
1781
1782 /**
1783 * amdgpu_device_wb_free - Free a wb entry
1784 *
1785 * @adev: amdgpu_device pointer
1786 * @wb: wb index
1787 *
1788 * Free a wb slot allocated for use by the driver (all asics)
1789 */
amdgpu_device_wb_free(struct amdgpu_device * adev,u32 wb)1790 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1791 {
1792 unsigned long flags;
1793
1794 wb >>= 3;
1795 spin_lock_irqsave(&adev->wb.lock, flags);
1796 if (wb < adev->wb.num_wb)
1797 __clear_bit(wb, adev->wb.used);
1798 spin_unlock_irqrestore(&adev->wb.lock, flags);
1799 }
1800
1801 /**
1802 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1803 *
1804 * @adev: amdgpu_device pointer
1805 *
1806 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1807 * to fail, but if any of the BARs is not accessible after the size we abort
1808 * driver loading by returning -ENODEV.
1809 */
amdgpu_device_resize_fb_bar(struct amdgpu_device * adev)1810 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1811 {
1812 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1813 struct pci_bus *root;
1814 struct resource *res;
1815 int max_size, r;
1816 unsigned int i;
1817 u16 cmd;
1818
1819 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1820 return 0;
1821
1822 /* Bypass for VF */
1823 if (amdgpu_sriov_vf(adev))
1824 return 0;
1825
1826 if (!amdgpu_rebar)
1827 return 0;
1828
1829 /* resizing on Dell G5 SE platforms causes problems with runtime pm */
1830 if ((amdgpu_runtime_pm != 0) &&
1831 adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
1832 adev->pdev->device == 0x731f &&
1833 adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
1834 return 0;
1835
1836 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
1837 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
1838 dev_warn(
1839 adev->dev,
1840 "System can't access extended configuration space, please check!!\n");
1841
1842 /* skip if the bios has already enabled large BAR */
1843 if (adev->gmc.real_vram_size &&
1844 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1845 return 0;
1846
1847 /* Check if the root BUS has 64bit memory resources */
1848 root = adev->pdev->bus;
1849 while (root->parent)
1850 root = root->parent;
1851
1852 pci_bus_for_each_resource(root, res, i) {
1853 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1854 res->start > 0x100000000ull)
1855 break;
1856 }
1857
1858 /* Trying to resize is pointless without a root hub window above 4GB */
1859 if (!res)
1860 return 0;
1861
1862 /* Limit the BAR size to what is available */
1863 max_size = pci_rebar_get_max_size(adev->pdev, 0);
1864 if (max_size < 0)
1865 return 0;
1866 rbar_size = min(max_size, rbar_size);
1867
1868 /* Disable memory decoding while we change the BAR addresses and size */
1869 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1870 pci_write_config_word(adev->pdev, PCI_COMMAND,
1871 cmd & ~PCI_COMMAND_MEMORY);
1872
1873 /* Tear down doorbell as resizing will release BARs */
1874 amdgpu_doorbell_fini(adev);
1875
1876 r = pci_resize_resource(adev->pdev, 0, rbar_size,
1877 (adev->asic_type >= CHIP_BONAIRE) ? 1 << 5
1878 : 1 << 2);
1879 if (r == -ENOSPC)
1880 dev_info(adev->dev,
1881 "Not enough PCI address space for a large BAR.");
1882 else if (r && r != -ENOTSUPP)
1883 dev_err(adev->dev, "Problem resizing BAR0 (%d).", r);
1884
1885 /* When the doorbell or fb BAR isn't available we have no chance of
1886 * using the device.
1887 */
1888 r = amdgpu_doorbell_init(adev);
1889 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1890 return -ENODEV;
1891
1892 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1893
1894 return 0;
1895 }
1896
1897 /*
1898 * GPU helpers function.
1899 */
1900 /**
1901 * amdgpu_device_need_post - check if the hw need post or not
1902 *
1903 * @adev: amdgpu_device pointer
1904 *
1905 * Check if the asic has been initialized (all asics) at driver startup
1906 * or post is needed if hw reset is performed.
1907 * Returns true if need or false if not.
1908 */
amdgpu_device_need_post(struct amdgpu_device * adev)1909 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1910 {
1911 uint32_t reg, flags;
1912
1913 if (amdgpu_sriov_vf(adev))
1914 return false;
1915
1916 flags = amdgpu_device_get_vbios_flags(adev);
1917 if (flags & AMDGPU_VBIOS_SKIP)
1918 return false;
1919 if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
1920 return false;
1921
1922 if (amdgpu_passthrough(adev)) {
1923 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
1924 * some old smc fw still need driver do vPost otherwise gpu hang, while
1925 * those smc fw version above 22.15 doesn't have this flaw, so we force
1926 * vpost executed for smc version below 22.15
1927 */
1928 if (adev->asic_type == CHIP_FIJI) {
1929 int err;
1930 uint32_t fw_ver;
1931
1932 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1933 /* force vPost if error occurred */
1934 if (err)
1935 return true;
1936
1937 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1938 release_firmware(adev->pm.fw);
1939 if (fw_ver < 0x00160e00)
1940 return true;
1941 }
1942 }
1943
1944 /* Don't post if we need to reset whole hive on init */
1945 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
1946 return false;
1947
1948 if (adev->has_hw_reset) {
1949 adev->has_hw_reset = false;
1950 return true;
1951 }
1952
1953 /* bios scratch used on CIK+ */
1954 if (adev->asic_type >= CHIP_BONAIRE)
1955 return amdgpu_atombios_scratch_need_asic_init(adev);
1956
1957 /* check MEM_SIZE for older asics */
1958 reg = amdgpu_asic_get_config_memsize(adev);
1959
1960 if ((reg != 0) && (reg != 0xffffffff))
1961 return false;
1962
1963 return true;
1964 }
1965
1966 /*
1967 * Check whether seamless boot is supported.
1968 *
1969 * So far we only support seamless boot on DCE 3.0 or later.
1970 * If users report that it works on older ASICS as well, we may
1971 * loosen this.
1972 */
amdgpu_device_seamless_boot_supported(struct amdgpu_device * adev)1973 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
1974 {
1975 switch (amdgpu_seamless) {
1976 case -1:
1977 break;
1978 case 1:
1979 return true;
1980 case 0:
1981 return false;
1982 default:
1983 dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n",
1984 amdgpu_seamless);
1985 return false;
1986 }
1987
1988 if (!(adev->flags & AMD_IS_APU))
1989 return false;
1990
1991 if (adev->mman.keep_stolen_vga_memory)
1992 return false;
1993
1994 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
1995 }
1996
1997 /*
1998 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
1999 * don't support dynamic speed switching. Until we have confirmation from Intel
2000 * that a specific host supports it, it's safer that we keep it disabled for all.
2001 *
2002 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
2003 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
2004 */
amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device * adev)2005 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
2006 {
2007 #if IS_ENABLED(CONFIG_X86)
2008 struct cpuinfo_x86 *c = &cpu_data(0);
2009
2010 /* eGPU change speeds based on USB4 fabric conditions */
2011 if (dev_is_removable(adev->dev))
2012 return true;
2013
2014 if (c->x86_vendor == X86_VENDOR_INTEL)
2015 return false;
2016 #endif
2017 return true;
2018 }
2019
amdgpu_device_aspm_support_quirk(struct amdgpu_device * adev)2020 static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
2021 {
2022 /* Enabling ASPM causes randoms hangs on Tahiti and Oland on Zen4.
2023 * It's unclear if this is a platform-specific or GPU-specific issue.
2024 * Disable ASPM on SI for the time being.
2025 */
2026 if (adev->family == AMDGPU_FAMILY_SI)
2027 return true;
2028
2029 #if IS_ENABLED(CONFIG_X86)
2030 struct cpuinfo_x86 *c = &cpu_data(0);
2031
2032 if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) ||
2033 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1)))
2034 return false;
2035
2036 if (c->x86 == 6 &&
2037 adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) {
2038 switch (c->x86_model) {
2039 case VFM_MODEL(INTEL_ALDERLAKE):
2040 case VFM_MODEL(INTEL_ALDERLAKE_L):
2041 case VFM_MODEL(INTEL_RAPTORLAKE):
2042 case VFM_MODEL(INTEL_RAPTORLAKE_P):
2043 case VFM_MODEL(INTEL_RAPTORLAKE_S):
2044 return true;
2045 default:
2046 return false;
2047 }
2048 } else {
2049 return false;
2050 }
2051 #else
2052 return false;
2053 #endif
2054 }
2055
2056 /**
2057 * amdgpu_device_should_use_aspm - check if the device should program ASPM
2058 *
2059 * @adev: amdgpu_device pointer
2060 *
2061 * Confirm whether the module parameter and pcie bridge agree that ASPM should
2062 * be set for this device.
2063 *
2064 * Returns true if it should be used or false if not.
2065 */
amdgpu_device_should_use_aspm(struct amdgpu_device * adev)2066 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
2067 {
2068 switch (amdgpu_aspm) {
2069 case -1:
2070 break;
2071 case 0:
2072 return false;
2073 case 1:
2074 return true;
2075 default:
2076 return false;
2077 }
2078 if (adev->flags & AMD_IS_APU)
2079 return false;
2080 if (amdgpu_device_aspm_support_quirk(adev))
2081 return false;
2082 return pcie_aspm_enabled(adev->pdev);
2083 }
2084
2085 /* if we get transitioned to only one device, take VGA back */
2086 /**
2087 * amdgpu_device_vga_set_decode - enable/disable vga decode
2088 *
2089 * @pdev: PCI device pointer
2090 * @state: enable/disable vga decode
2091 *
2092 * Enable/disable vga decode (all asics).
2093 * Returns VGA resource flags.
2094 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
						 bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
	unsigned int rsrc = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;

	amdgpu_asic_set_vga_state(adev, state);

	/* Legacy VGA resources are only claimed while decode is enabled. */
	if (state)
		rsrc |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;

	return rsrc;
}
2107
2108 /**
2109 * amdgpu_device_check_block_size - validate the vm block size
2110 *
2111 * @adev: amdgpu_device pointer
2112 *
2113 * Validates the vm block size specified via module parameter.
2114 * The vm block size defines number of bits in page table versus page directory,
2115 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
2116 * page table and the remaining bits are in the page directory.
2117 */
amdgpu_device_check_block_size(struct amdgpu_device * adev)2118 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
2119 {
2120 /* defines number of bits in page table versus page directory,
2121 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
2122 * page table and the remaining bits are in the page directory
2123 */
2124 if (amdgpu_vm_block_size == -1)
2125 return;
2126
2127 if (amdgpu_vm_block_size < 9) {
2128 dev_warn(adev->dev, "VM page table size (%d) too small\n",
2129 amdgpu_vm_block_size);
2130 amdgpu_vm_block_size = -1;
2131 }
2132 }
2133
2134 /**
2135 * amdgpu_device_check_vm_size - validate the vm size
2136 *
2137 * @adev: amdgpu_device pointer
2138 *
2139 * Validates the vm size in GB specified via module parameter.
2140 * The VM size is the size of the GPU virtual memory space in GB.
2141 */
amdgpu_device_check_vm_size(struct amdgpu_device * adev)2142 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
2143 {
2144 /* no need to check the default value */
2145 if (amdgpu_vm_size == -1)
2146 return;
2147
2148 if (amdgpu_vm_size < 1) {
2149 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
2150 amdgpu_vm_size);
2151 amdgpu_vm_size = -1;
2152 }
2153 }
2154
amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device * adev)2155 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
2156 {
2157 struct sysinfo si;
2158 bool is_os_64 = (sizeof(void *) == 8);
2159 uint64_t total_memory;
2160 uint64_t dram_size_seven_GB = 0x1B8000000;
2161 uint64_t dram_size_three_GB = 0xB8000000;
2162
2163 if (amdgpu_smu_memory_pool_size == 0)
2164 return;
2165
2166 if (!is_os_64) {
2167 dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n");
2168 goto def_value;
2169 }
2170 si_meminfo(&si);
2171 total_memory = (uint64_t)si.totalram * si.mem_unit;
2172
2173 if ((amdgpu_smu_memory_pool_size == 1) ||
2174 (amdgpu_smu_memory_pool_size == 2)) {
2175 if (total_memory < dram_size_three_GB)
2176 goto def_value1;
2177 } else if ((amdgpu_smu_memory_pool_size == 4) ||
2178 (amdgpu_smu_memory_pool_size == 8)) {
2179 if (total_memory < dram_size_seven_GB)
2180 goto def_value1;
2181 } else {
2182 dev_warn(adev->dev, "Smu memory pool size not supported\n");
2183 goto def_value;
2184 }
2185 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
2186
2187 return;
2188
2189 def_value1:
2190 dev_warn(adev->dev, "No enough system memory\n");
2191 def_value:
2192 adev->pm.smu_prv_buffer_size = 0;
2193 }
2194
amdgpu_device_init_apu_flags(struct amdgpu_device * adev)2195 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
2196 {
2197 if (!(adev->flags & AMD_IS_APU) ||
2198 adev->asic_type < CHIP_RAVEN)
2199 return 0;
2200
2201 switch (adev->asic_type) {
2202 case CHIP_RAVEN:
2203 if (adev->pdev->device == 0x15dd)
2204 adev->apu_flags |= AMD_APU_IS_RAVEN;
2205 if (adev->pdev->device == 0x15d8)
2206 adev->apu_flags |= AMD_APU_IS_PICASSO;
2207 break;
2208 case CHIP_RENOIR:
2209 if ((adev->pdev->device == 0x1636) ||
2210 (adev->pdev->device == 0x164c))
2211 adev->apu_flags |= AMD_APU_IS_RENOIR;
2212 else
2213 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
2214 break;
2215 case CHIP_VANGOGH:
2216 adev->apu_flags |= AMD_APU_IS_VANGOGH;
2217 break;
2218 case CHIP_YELLOW_CARP:
2219 break;
2220 case CHIP_CYAN_SKILLFISH:
2221 if ((adev->pdev->device == 0x13FE) ||
2222 (adev->pdev->device == 0x143F))
2223 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
2224 break;
2225 default:
2226 break;
2227 }
2228
2229 return 0;
2230 }
2231
2232 /**
2233 * amdgpu_device_check_arguments - validate module params
2234 *
2235 * @adev: amdgpu_device pointer
2236 *
2237 * Validates certain module parameters and updates
2238 * the associated values used by the driver (all asics).
2239 */
amdgpu_device_check_arguments(struct amdgpu_device * adev)2240 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
2241 {
2242 int i;
2243
2244 if (amdgpu_sched_jobs < 4) {
2245 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
2246 amdgpu_sched_jobs);
2247 amdgpu_sched_jobs = 4;
2248 } else if (!is_power_of_2(amdgpu_sched_jobs)) {
2249 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
2250 amdgpu_sched_jobs);
2251 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
2252 }
2253
2254 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
2255 /* gart size must be greater or equal to 32M */
2256 dev_warn(adev->dev, "gart size (%d) too small\n",
2257 amdgpu_gart_size);
2258 amdgpu_gart_size = -1;
2259 }
2260
2261 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
2262 /* gtt size must be greater or equal to 32M */
2263 dev_warn(adev->dev, "gtt size (%d) too small\n",
2264 amdgpu_gtt_size);
2265 amdgpu_gtt_size = -1;
2266 }
2267
2268 /* valid range is between 4 and 9 inclusive */
2269 if (amdgpu_vm_fragment_size != -1 &&
2270 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
2271 dev_warn(adev->dev, "valid range is between 4 and 9\n");
2272 amdgpu_vm_fragment_size = -1;
2273 }
2274
2275 if (amdgpu_sched_hw_submission < 2) {
2276 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
2277 amdgpu_sched_hw_submission);
2278 amdgpu_sched_hw_submission = 2;
2279 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
2280 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
2281 amdgpu_sched_hw_submission);
2282 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
2283 }
2284
2285 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
2286 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
2287 amdgpu_reset_method = -1;
2288 }
2289
2290 amdgpu_device_check_smu_prv_buffer_size(adev);
2291
2292 amdgpu_device_check_vm_size(adev);
2293
2294 amdgpu_device_check_block_size(adev);
2295
2296 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
2297
2298 for (i = 0; i < MAX_XCP; i++) {
2299 switch (amdgpu_enforce_isolation) {
2300 case -1:
2301 case 0:
2302 default:
2303 /* disable */
2304 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE;
2305 break;
2306 case 1:
2307 /* enable */
2308 adev->enforce_isolation[i] =
2309 AMDGPU_ENFORCE_ISOLATION_ENABLE;
2310 break;
2311 case 2:
2312 /* enable legacy mode */
2313 adev->enforce_isolation[i] =
2314 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY;
2315 break;
2316 case 3:
2317 /* enable only process isolation without submitting cleaner shader */
2318 adev->enforce_isolation[i] =
2319 AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER;
2320 break;
2321 }
2322 }
2323
2324 return 0;
2325 }
2326
2327 /**
2328 * amdgpu_switcheroo_set_state - set switcheroo state
2329 *
2330 * @pdev: pci dev pointer
2331 * @state: vga_switcheroo state
2332 *
2333 * Callback for the switcheroo driver. Suspends or resumes
2334 * the asics before or after it is powered up using ACPI methods.
2335 */
amdgpu_switcheroo_set_state(struct pci_dev * pdev,enum vga_switcheroo_state state)2336 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
2337 enum vga_switcheroo_state state)
2338 {
2339 struct drm_device *dev = pci_get_drvdata(pdev);
2340 int r;
2341
2342 if (amdgpu_device_supports_px(drm_to_adev(dev)) &&
2343 state == VGA_SWITCHEROO_OFF)
2344 return;
2345
2346 if (state == VGA_SWITCHEROO_ON) {
2347 pr_info("switched on\n");
2348 /* don't suspend or resume card normally */
2349 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2350
2351 pci_set_power_state(pdev, PCI_D0);
2352 amdgpu_device_load_pci_state(pdev);
2353 r = pci_enable_device(pdev);
2354 if (r)
2355 dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n",
2356 r);
2357 amdgpu_device_resume(dev, true);
2358
2359 dev->switch_power_state = DRM_SWITCH_POWER_ON;
2360 } else {
2361 dev_info(&pdev->dev, "switched off\n");
2362 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2363 amdgpu_device_prepare(dev);
2364 amdgpu_device_suspend(dev, true);
2365 amdgpu_device_cache_pci_state(pdev);
2366 /* Shut down the device */
2367 pci_disable_device(pdev);
2368 pci_set_power_state(pdev, PCI_D3cold);
2369 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
2370 }
2371 }
2372
2373 /**
2374 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
2375 *
2376 * @pdev: pci dev pointer
2377 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
2380 * Returns true if the state can be changed, false if not.
2381 */
amdgpu_switcheroo_can_switch(struct pci_dev * pdev)2382 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
2383 {
2384 struct drm_device *dev = pci_get_drvdata(pdev);
2385
2386 /*
2387 * FIXME: open_count is protected by drm_global_mutex but that would lead to
2388 * locking inversion with the driver load path. And the access here is
2389 * completely racy anyway. So don't bother with locking for now.
2390 */
2391 return atomic_read(&dev->open_count) == 0;
2392 }
2393
/* vga_switcheroo client callbacks; no reprobe hook is needed for amdgpu */
static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};
2399
2400 /**
2401 * amdgpu_device_enable_virtual_display - enable virtual display feature
2402 *
2403 * @adev: amdgpu_device pointer
2404 *
 * Enables the virtual display feature if the user has enabled it via
2406 * the module parameter virtual_display. This feature provides a virtual
2407 * display hardware on headless boards or in virtualized environments.
2408 * This function parses and validates the configuration string specified by
2409 * the user and configures the virtual display configuration (number of
2410 * virtual connectors, crtcs, etc.) specified.
2411 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		/* Work on a writable copy; strsep() modifies the string.
		 * NOTE(review): the kstrdup() result is not NULL-checked;
		 * on allocation failure strsep() on a NULL pointer yields
		 * NULL and kfree(NULL) is a no-op, so the feature is
		 * silently left disabled.
		 */
		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		/* Entries are ';'-separated; each entry is a PCI address
		 * (or "all"), optionally followed by ",<num_crtc>".
		 */
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				/* Optional crtc count after the comma. */
				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				/* Clamp to [1, 6]; fall back to 1 crtc when
				 * no (or an unparsable) count was given.
				 */
				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		dev_info(
			adev->dev,
			"virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			amdgpu_virtual_display, pci_address_name,
			adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}
2457
amdgpu_device_set_sriov_virtual_display(struct amdgpu_device * adev)2458 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
2459 {
2460 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
2461 adev->mode_info.num_crtc = 1;
2462 adev->enable_virtual_display = true;
2463 dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n",
2464 adev->enable_virtual_display,
2465 adev->mode_info.num_crtc);
2466 }
2467 }
2468
2469 /**
2470 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
2471 *
2472 * @adev: amdgpu_device pointer
2473 *
2474 * Parses the asic configuration parameters specified in the gpu info
2475 * firmware and makes them available to the driver for use in configuring
2476 * the asic.
2477 * Returns 0 on success, -EINVAL on failure.
2478 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	/* Only a fixed set of older asics ship a gpu_info firmware;
	 * everything else gets its configuration from IP discovery.
	 */
	switch (adev->asic_type) {
	default:
		return 0;
	case CHIP_VEGA10:
		chip_name = "vega10";
		break;
	case CHIP_VEGA12:
		chip_name = "vega12";
		break;
	case CHIP_RAVEN:
		/* Raven derivatives each have their own firmware image. */
		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
			chip_name = "raven2";
		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
			chip_name = "picasso";
		else
			chip_name = "raven";
		break;
	case CHIP_ARCTURUS:
		chip_name = "arcturus";
		break;
	case CHIP_NAVI12:
		/* A discovery binary supersedes the gpu_info firmware. */
		if (adev->discovery.bin)
			return 0;
		chip_name = "navi12";
		break;
	case CHIP_CYAN_SKILLFISH:
		if (adev->discovery.bin)
			return 0;
		chip_name = "cyan_skillfish";
		break;
	}

	err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
				   AMDGPU_UCODE_OPTIONAL,
				   "amdgpu/%s_gpu_info.bin", chip_name);
	if (err) {
		dev_err(adev->dev,
			"Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n",
			chip_name);
		goto out;
	}

	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);

	switch (hdr->version_major) {
	case 1:
	{
		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
								le32_to_cpu(hdr->header.ucode_array_offset_bytes));

		/*
		 * Should be dropped when DAL no longer needs it.
		 */
		if (adev->asic_type == CHIP_NAVI12)
			goto parse_soc_bounding_box;

		/* Copy the gfx configuration out of the firmware blob;
		 * all fields are little-endian on disk.
		 */
		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		/* v1.1 extends the table with scan-converter/packer counts. */
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * soc bounding box info is not integrated in discovery table,
		 * we always need to parse it from gpu info firmware if needed.
		 */
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}
2595
amdgpu_uid_init(struct amdgpu_device * adev)2596 static void amdgpu_uid_init(struct amdgpu_device *adev)
2597 {
2598 /* Initialize the UID for the device */
2599 adev->uid_info = kzalloc_obj(struct amdgpu_uid);
2600 if (!adev->uid_info) {
2601 dev_warn(adev->dev, "Failed to allocate memory for UID\n");
2602 return;
2603 }
2604 adev->uid_info->adev = adev;
2605 }
2606
static void amdgpu_uid_fini(struct amdgpu_device *adev)
{
	/* Free the UID memory; clearing the pointer makes a repeated
	 * fini (or a later NULL check) safe.
	 */
	kfree(adev->uid_info);
	adev->uid_info = NULL;
}
2613
2614 /**
2615 * amdgpu_device_ip_early_init - run early init for hardware IPs
2616 *
2617 * @adev: amdgpu_device pointer
2618 *
2619 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered and each IP's early_init callback is run. This
2621 * is the first stage in initializing the asic.
2622 * Returns 0 on success, negative error code on failure.
2623 */
amdgpu_device_ip_early_init(struct amdgpu_device * adev)2624 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2625 {
2626 struct amdgpu_ip_block *ip_block;
2627 struct pci_dev *parent;
2628 bool total, skip_bios;
2629 uint32_t bios_flags;
2630 int i, r;
2631
2632 amdgpu_device_enable_virtual_display(adev);
2633
2634 if (amdgpu_sriov_vf(adev)) {
2635 r = amdgpu_virt_request_full_gpu(adev, true);
2636 if (r)
2637 return r;
2638
2639 r = amdgpu_virt_init_critical_region(adev);
2640 if (r)
2641 return r;
2642 }
2643
2644 switch (adev->asic_type) {
2645 #ifdef CONFIG_DRM_AMDGPU_SI
2646 case CHIP_VERDE:
2647 case CHIP_TAHITI:
2648 case CHIP_PITCAIRN:
2649 case CHIP_OLAND:
2650 case CHIP_HAINAN:
2651 adev->family = AMDGPU_FAMILY_SI;
2652 r = si_set_ip_blocks(adev);
2653 if (r)
2654 return r;
2655 break;
2656 #endif
2657 #ifdef CONFIG_DRM_AMDGPU_CIK
2658 case CHIP_BONAIRE:
2659 case CHIP_HAWAII:
2660 case CHIP_KAVERI:
2661 case CHIP_KABINI:
2662 case CHIP_MULLINS:
2663 if (adev->flags & AMD_IS_APU)
2664 adev->family = AMDGPU_FAMILY_KV;
2665 else
2666 adev->family = AMDGPU_FAMILY_CI;
2667
2668 r = cik_set_ip_blocks(adev);
2669 if (r)
2670 return r;
2671 break;
2672 #endif
2673 case CHIP_TOPAZ:
2674 case CHIP_TONGA:
2675 case CHIP_FIJI:
2676 case CHIP_POLARIS10:
2677 case CHIP_POLARIS11:
2678 case CHIP_POLARIS12:
2679 case CHIP_VEGAM:
2680 case CHIP_CARRIZO:
2681 case CHIP_STONEY:
2682 if (adev->flags & AMD_IS_APU)
2683 adev->family = AMDGPU_FAMILY_CZ;
2684 else
2685 adev->family = AMDGPU_FAMILY_VI;
2686
2687 r = vi_set_ip_blocks(adev);
2688 if (r)
2689 return r;
2690 break;
2691 default:
2692 r = amdgpu_discovery_set_ip_blocks(adev);
2693 if (r)
2694 return r;
2695 break;
2696 }
2697
2698 /* Check for IP version 9.4.3 with A0 hardware */
2699 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) &&
2700 !amdgpu_device_get_rev_id(adev)) {
2701 dev_err(adev->dev, "Unsupported A0 hardware\n");
2702 return -ENODEV; /* device unsupported - no device error */
2703 }
2704
2705 if (amdgpu_has_atpx() &&
2706 (amdgpu_is_atpx_hybrid() ||
2707 amdgpu_has_atpx_dgpu_power_cntl()) &&
2708 ((adev->flags & AMD_IS_APU) == 0) &&
2709 !dev_is_removable(&adev->pdev->dev))
2710 adev->flags |= AMD_IS_PX;
2711
2712 if (!(adev->flags & AMD_IS_APU)) {
2713 parent = pcie_find_root_port(adev->pdev);
2714 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2715 }
2716
2717 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2718 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2719 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2720 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2721 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2722 if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
2723 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2724
2725 adev->virt.is_xgmi_node_migrate_enabled = false;
2726 if (amdgpu_sriov_vf(adev)) {
2727 adev->virt.is_xgmi_node_migrate_enabled =
2728 amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4);
2729 }
2730
2731 total = true;
2732 for (i = 0; i < adev->num_ip_blocks; i++) {
2733 ip_block = &adev->ip_blocks[i];
2734
2735 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2736 dev_warn(adev->dev, "disabled ip block: %d <%s>\n", i,
2737 adev->ip_blocks[i].version->funcs->name);
2738 adev->ip_blocks[i].status.valid = false;
2739 } else if (ip_block->version->funcs->early_init) {
2740 r = ip_block->version->funcs->early_init(ip_block);
2741 if (r == -ENOENT) {
2742 adev->ip_blocks[i].status.valid = false;
2743 } else if (r) {
2744 dev_err(adev->dev,
2745 "early_init of IP block <%s> failed %d\n",
2746 adev->ip_blocks[i].version->funcs->name,
2747 r);
2748 total = false;
2749 } else {
2750 adev->ip_blocks[i].status.valid = true;
2751 }
2752 } else {
2753 adev->ip_blocks[i].status.valid = true;
2754 }
2755 /* get the vbios after the asic_funcs are set up */
2756 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2757 r = amdgpu_device_parse_gpu_info_fw(adev);
2758 if (r)
2759 return r;
2760
2761 bios_flags = amdgpu_device_get_vbios_flags(adev);
2762 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP);
2763 /* Read BIOS */
2764 if (!skip_bios) {
2765 bool optional =
2766 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL);
2767 if (!amdgpu_get_bios(adev) && !optional)
2768 return -EINVAL;
2769
2770 if (optional && !adev->bios)
2771 dev_info(
2772 adev->dev,
2773 "VBIOS image optional, proceeding without VBIOS image");
2774
2775 if (adev->bios) {
2776 r = amdgpu_atombios_init(adev);
2777 if (r) {
2778 dev_err(adev->dev,
2779 "amdgpu_atombios_init failed\n");
2780 amdgpu_vf_error_put(
2781 adev,
2782 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL,
2783 0, 0);
2784 return r;
2785 }
2786 }
2787 }
2788
2789 /*get pf2vf msg info at it's earliest time*/
2790 if (amdgpu_sriov_vf(adev))
2791 amdgpu_virt_init_data_exchange(adev);
2792
2793 }
2794 }
2795 if (!total)
2796 return -ENODEV;
2797
2798 if (adev->gmc.xgmi.supported)
2799 amdgpu_xgmi_early_init(adev);
2800
2801 if (amdgpu_is_multi_aid(adev))
2802 amdgpu_uid_init(adev);
2803 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
2804 if (ip_block->status.valid != false)
2805 amdgpu_amdkfd_device_probe(adev);
2806
2807 adev->cg_flags &= amdgpu_cg_mask;
2808 adev->pg_flags &= amdgpu_pg_mask;
2809
2810 return 0;
2811 }
2812
amdgpu_device_ip_hw_init_phase1(struct amdgpu_device * adev)2813 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2814 {
2815 int i, r;
2816
2817 for (i = 0; i < adev->num_ip_blocks; i++) {
2818 if (!adev->ip_blocks[i].status.sw)
2819 continue;
2820 if (adev->ip_blocks[i].status.hw)
2821 continue;
2822 if (!amdgpu_ip_member_of_hwini(
2823 adev, adev->ip_blocks[i].version->type))
2824 continue;
2825 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2826 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2827 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2828 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2829 if (r) {
2830 dev_err(adev->dev,
2831 "hw_init of IP block <%s> failed %d\n",
2832 adev->ip_blocks[i].version->funcs->name,
2833 r);
2834 return r;
2835 }
2836 adev->ip_blocks[i].status.hw = true;
2837 }
2838 }
2839
2840 return 0;
2841 }
2842
amdgpu_device_ip_hw_init_phase2(struct amdgpu_device * adev)2843 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2844 {
2845 int i, r;
2846
2847 for (i = 0; i < adev->num_ip_blocks; i++) {
2848 if (!adev->ip_blocks[i].status.sw)
2849 continue;
2850 if (adev->ip_blocks[i].status.hw)
2851 continue;
2852 if (!amdgpu_ip_member_of_hwini(
2853 adev, adev->ip_blocks[i].version->type))
2854 continue;
2855 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2856 if (r) {
2857 dev_err(adev->dev,
2858 "hw_init of IP block <%s> failed %d\n",
2859 adev->ip_blocks[i].version->funcs->name, r);
2860 return r;
2861 }
2862 adev->ip_blocks[i].status.hw = true;
2863 }
2864
2865 return 0;
2866 }
2867
static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
{
	int r = 0;
	int i;
	uint32_t smu_version;

	/* On VEGA10 and newer, find the PSP IP block and bring it up (or
	 * resume it on reset/suspend paths) so firmware can be loaded.
	 */
	if (adev->asic_type >= CHIP_VEGA10) {
		for (i = 0; i < adev->num_ip_blocks; i++) {
			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
				continue;

			if (!amdgpu_ip_member_of_hwini(adev,
						       AMD_IP_BLOCK_TYPE_PSP))
				break;

			if (!adev->ip_blocks[i].status.sw)
				continue;

			/* no need to do the fw loading again if already done*/
			if (adev->ip_blocks[i].status.hw == true)
				break;

			if (amdgpu_in_reset(adev) || adev->in_suspend) {
				r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
				if (r)
					return r;
			} else {
				r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
				if (r) {
					dev_err(adev->dev,
						"hw_init of IP block <%s> failed %d\n",
						adev->ip_blocks[i]
							.version->funcs->name,
						r);
					return r;
				}
				adev->ip_blocks[i].status.hw = true;
			}
			/* stop scanning once the PSP block has been handled */
			break;
		}
	}

	/* Load SMU firmware on bare metal; TONGA also needs this as a VF. */
	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);

	return r;
}
2915
amdgpu_device_init_schedulers(struct amdgpu_device * adev)2916 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2917 {
2918 struct drm_sched_init_args args = {
2919 .ops = &amdgpu_sched_ops,
2920 .num_rqs = DRM_SCHED_PRIORITY_COUNT,
2921 .timeout_wq = adev->reset_domain->wq,
2922 .dev = adev->dev,
2923 };
2924 long timeout;
2925 int r, i;
2926
2927 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2928 struct amdgpu_ring *ring = adev->rings[i];
2929
2930 /* No need to setup the GPU scheduler for rings that don't need it */
2931 if (!ring || ring->no_scheduler)
2932 continue;
2933
2934 switch (ring->funcs->type) {
2935 case AMDGPU_RING_TYPE_GFX:
2936 timeout = adev->gfx_timeout;
2937 break;
2938 case AMDGPU_RING_TYPE_COMPUTE:
2939 timeout = adev->compute_timeout;
2940 break;
2941 case AMDGPU_RING_TYPE_SDMA:
2942 timeout = adev->sdma_timeout;
2943 break;
2944 default:
2945 timeout = adev->video_timeout;
2946 break;
2947 }
2948
2949 args.timeout = timeout;
2950 args.credit_limit = ring->num_hw_submission;
2951 args.score = ring->sched_score;
2952 args.name = ring->name;
2953
2954 r = drm_sched_init(&ring->sched, &args);
2955 if (r) {
2956 dev_err(adev->dev,
2957 "Failed to create scheduler on ring %s.\n",
2958 ring->name);
2959 return r;
2960 }
2961 r = amdgpu_uvd_entity_init(adev, ring);
2962 if (r) {
2963 dev_err(adev->dev,
2964 "Failed to create UVD scheduling entity on ring %s.\n",
2965 ring->name);
2966 return r;
2967 }
2968 r = amdgpu_vce_entity_init(adev, ring);
2969 if (r) {
2970 dev_err(adev->dev,
2971 "Failed to create VCE scheduling entity on ring %s.\n",
2972 ring->name);
2973 return r;
2974 }
2975 }
2976
2977 if (adev->xcp_mgr)
2978 amdgpu_xcp_update_partition_sched_list(adev);
2979
2980 return 0;
2981 }
2982
2983
2984 /**
2985 * amdgpu_device_ip_init - run init for hardware IPs
2986 *
2987 * @adev: amdgpu_device pointer
2988 *
2989 * Main initialization pass for hardware IPs. The list of all the hardware
2990 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2991 * are run. sw_init initializes the software state associated with each IP
2992 * and hw_init initializes the hardware associated with each IP.
2993 * Returns 0 on success, negative error code on failure.
2994 */
static int amdgpu_device_ip_init(struct amdgpu_device *adev)
{
	bool init_badpage;
	int i, r;

	r = amdgpu_ras_init(adev);
	if (r)
		return r;

	/* Pass 1: sw_init every valid IP block.  COMMON and GMC additionally
	 * get their hw_init here, because all later blocks depend on the
	 * common setup and on GPU memory being available.
	 */
	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->sw_init) {
			r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]);
			if (r) {
				dev_err(adev->dev,
					"sw_init of IP block <%s> failed %d\n",
					adev->ip_blocks[i].version->funcs->name,
					r);
				goto init_failed;
			}
		}
		adev->ip_blocks[i].status.sw = true;

		/* skip hw_init for blocks excluded from the current hw init level */
		if (!amdgpu_ip_member_of_hwini(
			    adev, adev->ip_blocks[i].version->type))
			continue;

		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
			/* need to do common hw init early so everything is set up for gmc */
			r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
			if (r) {
				dev_err(adev->dev, "hw_init %d failed %d\n", i,
					r);
				goto init_failed;
			}
			adev->ip_blocks[i].status.hw = true;
		} else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
			/* need to do gmc hw init early so we can allocate gpu mem */
			/* Try to reserve bad pages early */
			if (amdgpu_sriov_vf(adev))
				amdgpu_virt_exchange_data(adev);

			r = amdgpu_device_mem_scratch_init(adev);
			if (r) {
				dev_err(adev->dev,
					"amdgpu_mem_scratch_init failed %d\n",
					r);
				goto init_failed;
			}
			r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
			if (r) {
				dev_err(adev->dev, "hw_init %d failed %d\n", i,
					r);
				goto init_failed;
			}
			r = amdgpu_device_wb_init(adev);
			if (r) {
				dev_err(adev->dev,
					"amdgpu_device_wb_init failed %d\n", r);
				goto init_failed;
			}
			adev->ip_blocks[i].status.hw = true;

			/* right after GMC hw init, we create CSA */
			if (adev->gfx.mcbp) {
				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
							       AMDGPU_GEM_DOMAIN_VRAM |
							       AMDGPU_GEM_DOMAIN_GTT,
							       AMDGPU_CSA_SIZE);
				if (r) {
					dev_err(adev->dev,
						"allocate CSA failed %d\n", r);
					goto init_failed;
				}
			}

			r = amdgpu_seq64_init(adev);
			if (r) {
				dev_err(adev->dev, "allocate seq64 failed %d\n",
					r);
				goto init_failed;
			}
		}
	}

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_init_data_exchange(adev);

	r = amdgpu_ib_pool_init(adev);
	if (r) {
		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
		goto init_failed;
	}

	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
	if (r)
		goto init_failed;

	r = amdgpu_device_ip_hw_init_phase1(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_ip_hw_init_phase2(adev);
	if (r)
		goto init_failed;

	/*
	 * retired pages will be loaded from eeprom and reserved here,
	 * it should be called after amdgpu_device_ip_hw_init_phase2 since
	 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
	 * for I2C communication which only true at this point.
	 *
	 * amdgpu_ras_recovery_init may fail, but the upper only cares the
	 * failure from bad gpu situation and stop amdgpu init process
	 * accordingly. For other failed cases, it will still release all
	 * the resource and print error message, rather than returning one
	 * negative value to upper level.
	 *
	 * Note: theoretically, this should be called before all vram allocations
	 * to protect retired page from abusing
	 */
	init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
	r = amdgpu_ras_recovery_init(adev, init_badpage);
	if (r)
		goto init_failed;

	/**
	 * In case of XGMI grab extra reference for reset domain for this device
	 */
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (amdgpu_xgmi_add_device(adev) == 0) {
			if (!amdgpu_sriov_vf(adev)) {
				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

				if (WARN_ON(!hive)) {
					r = -ENOENT;
					goto init_failed;
				}

				if (!hive->reset_domain ||
				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
					r = -ENOENT;
					amdgpu_put_xgmi_hive(hive);
					goto init_failed;
				}

				/* Drop the early temporary reset domain we created for device */
				amdgpu_reset_put_reset_domain(adev->reset_domain);
				adev->reset_domain = hive->reset_domain;
				amdgpu_put_xgmi_hive(hive);
			}
		}
	}

	r = amdgpu_device_init_schedulers(adev);
	if (r)
		goto init_failed;

	if (adev->mman.buffer_funcs_ring &&
	    adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);

	/* Don't init kfd if whole hive need to be reset during init */
	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
		amdgpu_amdkfd_device_init(adev);
	}

	amdgpu_fru_get_product_info(adev);

	/* NOTE: intentional fall-through to the label below; on success r is
	 * whatever amdgpu_cper_init() returned (0 or an error code).
	 */
	r = amdgpu_cper_init(adev);

init_failed:

	return r;
}
3176
3177 /**
3178 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
3179 *
3180 * @adev: amdgpu_device pointer
3181 *
3182 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
3183 * this function before a GPU reset. If the value is retained after a
3184 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
3185 */
static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
{
	/* Snapshot the first AMDGPU_RESET_MAGIC_NUM bytes of the GART table;
	 * amdgpu_device_check_vram_lost() compares against this copy after a
	 * reset to decide whether VRAM contents survived.
	 */
	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
}
3190
3191 /**
3192 * amdgpu_device_check_vram_lost - check if vram is valid
3193 *
3194 * @adev: amdgpu_device pointer
3195 *
3196 * Checks the reset magic value written to the gart pointer in VRAM.
3197 * The driver calls this after a GPU reset to see if the contents of
3198 * VRAM is lost or now.
3199 * returns true if vram is lost, false if not.
3200 */
amdgpu_device_check_vram_lost(struct amdgpu_device * adev)3201 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
3202 {
3203 if (memcmp(adev->gart.ptr, adev->reset_magic,
3204 AMDGPU_RESET_MAGIC_NUM))
3205 return true;
3206
3207 if (!amdgpu_in_reset(adev))
3208 return false;
3209
3210 /*
3211 * For all ASICs with baco/mode1 reset, the VRAM is
3212 * always assumed to be lost.
3213 */
3214 switch (amdgpu_asic_reset_method(adev)) {
3215 case AMD_RESET_METHOD_LEGACY:
3216 case AMD_RESET_METHOD_LINK:
3217 case AMD_RESET_METHOD_BACO:
3218 case AMD_RESET_METHOD_MODE1:
3219 return true;
3220 default:
3221 return false;
3222 }
3223 }
3224
3225 /**
3226 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
3227 *
3228 * @adev: amdgpu_device pointer
3229 * @state: clockgating state (gate or ungate)
3230 *
3231 * The list of all the hardware IPs that make up the asic is walked and the
3232 * set_clockgating_state callbacks are run.
3233 * Late initialization pass enabling clockgating for hardware IPs.
3234 * Fini or suspend, pass disabling clockgating for hardware IPs.
3235 * Returns 0 on success, negative error code on failure.
3236 */
3237
int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
			       enum amd_clockgating_state state)
{
	int i, j, r;

	/* No clockgating on emulation platforms */
	if (amdgpu_emu_mode == 1)
		return 0;

	/* Gate in block order; ungate in reverse block order. */
	for (j = 0; j < adev->num_ip_blocks; j++) {
		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip CG for GFX, SDMA on S0ix */
		if (adev->in_s0ix &&
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
			continue;
		/* skip CG for VCE/UVD/VCN/JPEG, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
			/* apply the requested clockgating state (gate to save power) */
			r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i],
										     state);
			if (r) {
				dev_err(adev->dev,
					"set_clockgating_state(gate) of IP block <%s> failed %d\n",
					adev->ip_blocks[i].version->funcs->name,
					r);
				return r;
			}
		}
	}

	return 0;
}
3276
int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
			       enum amd_powergating_state state)
{
	int i, j, r;

	/* No powergating on emulation platforms */
	if (amdgpu_emu_mode == 1)
		return 0;

	/* Gate in block order; ungate in reverse block order. */
	for (j = 0; j < adev->num_ip_blocks; j++) {
		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip PG for GFX, SDMA on S0ix */
		if (adev->in_s0ix &&
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
			continue;
		/* skip PG for VCE/UVD/VCN/JPEG, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
			/* apply the requested powergating state (gate to save power) */
			r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i],
										     state);
			if (r) {
				dev_err(adev->dev,
					"set_powergating_state(gate) of IP block <%s> failed %d\n",
					adev->ip_blocks[i].version->funcs->name,
					r);
				return r;
			}
		}
	}
	return 0;
}
3314
amdgpu_device_enable_mgpu_fan_boost(void)3315 static int amdgpu_device_enable_mgpu_fan_boost(void)
3316 {
3317 struct amdgpu_gpu_instance *gpu_ins;
3318 struct amdgpu_device *adev;
3319 int i, ret = 0;
3320
3321 mutex_lock(&mgpu_info.mutex);
3322
3323 /*
3324 * MGPU fan boost feature should be enabled
3325 * only when there are two or more dGPUs in
3326 * the system
3327 */
3328 if (mgpu_info.num_dgpu < 2)
3329 goto out;
3330
3331 for (i = 0; i < mgpu_info.num_dgpu; i++) {
3332 gpu_ins = &(mgpu_info.gpu_ins[i]);
3333 adev = gpu_ins->adev;
3334 if (!(adev->flags & AMD_IS_APU || amdgpu_sriov_multi_vf_mode(adev)) &&
3335 !gpu_ins->mgpu_fan_enabled) {
3336 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
3337 if (ret)
3338 break;
3339
3340 gpu_ins->mgpu_fan_enabled = 1;
3341 }
3342 }
3343
3344 out:
3345 mutex_unlock(&mgpu_info.mutex);
3346
3347 return ret;
3348 }
3349
3350 /**
3351 * amdgpu_device_ip_late_init - run late init for hardware IPs
3352 *
3353 * @adev: amdgpu_device pointer
3354 *
3355 * Late initialization pass for hardware IPs. The list of all the hardware
3356 * IPs that make up the asic is walked and the late_init callbacks are run.
3357 * late_init covers any special initialization that an IP requires
3358 * after all of the have been initialized or something that needs to happen
3359 * late in the init process.
3360 * Returns 0 on success, negative error code on failure.
3361 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	/* Run late_init on every hw-initialized block, in block order. */
	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
			if (r) {
				dev_err(adev->dev,
					"late_init of IP block <%s> failed %d\n",
					adev->ip_blocks[i].version->funcs->name,
					r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	r = amdgpu_ras_late_init(adev);
	if (r) {
		dev_err(adev->dev, "amdgpu_ras_late_init failed %d", r);
		return r;
	}

	if (!amdgpu_reset_in_recovery(adev))
		amdgpu_ras_set_error_query_ready(adev, true);

	/* Clock/power gating must come after late_init has completed. */
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	/* Record the VRAM magic used to detect VRAM loss across resets. */
	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		dev_err(adev->dev, "enable mgpu fan boost failed (%d).\n", r);

	/* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
	if (amdgpu_passthrough(adev) &&
	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
	     adev->asic_type == CHIP_ALDEBARAN))
		amdgpu_dpm_handle_passthrough_sbr(adev, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset device p-state to low as this was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, it's unknown how many device in the hive in advance.
		 * As this is counted one by one during devices initializations.
		 *
		 * So, we wait for all XGMI interlinked devices initialized.
		 * This may bring some delays as those devices may come from
		 * different hives. But that should be OK.
		 */
		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
			for (i = 0; i < mgpu_info.num_gpu; i++) {
				gpu_instance = &(mgpu_info.gpu_ins[i]);
				if (gpu_instance->adev->flags & AMD_IS_APU)
					continue;

				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
							   AMDGPU_XGMI_PSTATE_MIN);
				if (r) {
					dev_err(adev->dev,
						"pstate setting failed (%d).\n",
						r);
					break;
				}
			}
		}

		mutex_unlock(&mgpu_info.mutex);
	}

	return 0;
}
3445
amdgpu_ip_block_hw_fini(struct amdgpu_ip_block * ip_block)3446 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block)
3447 {
3448 struct amdgpu_device *adev = ip_block->adev;
3449 int r;
3450
3451 if (!ip_block->version->funcs->hw_fini) {
3452 dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n",
3453 ip_block->version->funcs->name);
3454 } else {
3455 r = ip_block->version->funcs->hw_fini(ip_block);
3456 /* XXX handle errors */
3457 if (r) {
3458 dev_dbg(adev->dev,
3459 "hw_fini of IP block <%s> failed %d\n",
3460 ip_block->version->funcs->name, r);
3461 }
3462 }
3463
3464 ip_block->status.hw = false;
3465 }
3466
3467 /**
3468 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
3469 *
3470 * @adev: amdgpu_device pointer
3471 *
3472 * For ASICs need to disable SMC first
3473 */
amdgpu_device_smu_fini_early(struct amdgpu_device * adev)3474 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
3475 {
3476 int i;
3477
3478 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
3479 return;
3480
3481 for (i = 0; i < adev->num_ip_blocks; i++) {
3482 if (!adev->ip_blocks[i].status.hw)
3483 continue;
3484 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3485 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
3486 break;
3487 }
3488 }
3489 }
3490
static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
{
	int i, r;

	/* Give every block a chance to prepare for teardown first. */
	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].version->funcs->early_fini)
			continue;

		r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
		if (r) {
			dev_dbg(adev->dev,
				"early_fini of IP block <%s> failed %d\n",
				adev->ip_blocks[i].version->funcs->name, r);
		}
	}

	/* Quiesce compute/user queues before hardware goes away. */
	amdgpu_amdkfd_suspend(adev, true);
	amdgpu_amdkfd_teardown_processes(adev);
	amdgpu_userq_suspend(adev);

	/* Workaround for ASICs need to disable SMC first */
	amdgpu_device_smu_fini_early(adev);

	/* hw_fini in reverse block order */
	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.hw)
			continue;

		amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
	}

	if (amdgpu_sriov_vf(adev)) {
		if (amdgpu_virt_release_full_gpu(adev, false))
			dev_err(adev->dev,
				"failed to release exclusive mode on fini\n");
	}

	/*
	 * Driver reload on the APU can fail due to firmware validation because
	 * the PSP is always running, as it is shared across the whole SoC.
	 * This same issue does not occur on dGPU because it has a mechanism
	 * that checks whether the PSP is running. A solution for those issues
	 * in the APU is to trigger a GPU reset, but this should be done during
	 * the unload phase to avoid adding boot latency and screen flicker.
	 */
	if ((adev->flags & AMD_IS_APU) && !adev->gmc.is_app_apu) {
		r = amdgpu_asic_reset(adev);
		if (r)
			dev_err(adev->dev, "asic reset on %s failed\n", __func__);
	}

	return 0;
}
3543
3544 /**
3545 * amdgpu_device_ip_fini - run fini for hardware IPs
3546 *
3547 * @adev: amdgpu_device pointer
3548 *
3549 * Main teardown pass for hardware IPs. The list of all the hardware
3550 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
3551 * are run. hw_fini tears down the hardware associated with each IP
3552 * and sw_fini tears down any software state associated with each IP.
3553 * Returns 0 on success, negative error code on failure.
3554 */
static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
{
	int i, r;

	amdgpu_cper_fini(adev);

	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
		amdgpu_virt_release_ras_err_handler_data(adev);

	/* Leave the XGMI hive before per-device teardown. */
	if (adev->gmc.xgmi.num_physical_nodes > 1)
		amdgpu_xgmi_remove_device(adev);

	amdgpu_amdkfd_device_fini_sw(adev);

	/* sw_fini in reverse block order */
	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.sw)
			continue;

		/* GPU-memory backed resources must go before GMC sw_fini. */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
			amdgpu_ucode_free_bo(adev);
			amdgpu_free_static_csa(&adev->virt.csa_obj);
			amdgpu_device_wb_fini(adev);
			amdgpu_device_mem_scratch_fini(adev);
			amdgpu_ib_pool_fini(adev);
			amdgpu_seq64_fini(adev);
			amdgpu_doorbell_fini(adev);
		}
		if (adev->ip_blocks[i].version->funcs->sw_fini) {
			r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]);
			/* XXX handle errors */
			if (r) {
				dev_dbg(adev->dev,
					"sw_fini of IP block <%s> failed %d\n",
					adev->ip_blocks[i].version->funcs->name,
					r);
			}
		}
		adev->ip_blocks[i].status.sw = false;
		adev->ip_blocks[i].status.valid = false;
	}

	/* late_fini in reverse block order, after all sw_fini is done */
	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_fini)
			adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]);
		adev->ip_blocks[i].status.late_initialized = false;
	}

	amdgpu_ras_fini(adev);
	amdgpu_uid_fini(adev);

	return 0;
}
3609
3610 /**
3611 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
3612 *
3613 * @work: work_struct.
3614 */
amdgpu_device_delayed_init_work_handler(struct work_struct * work)3615 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
3616 {
3617 struct amdgpu_device *adev =
3618 container_of(work, struct amdgpu_device, delayed_init_work.work);
3619 int r;
3620
3621 r = amdgpu_ib_ring_tests(adev);
3622 if (r)
3623 dev_err(adev->dev, "ib ring test failed (%d).\n", r);
3624 }
3625
amdgpu_device_delay_enable_gfx_off(struct work_struct * work)3626 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
3627 {
3628 struct amdgpu_device *adev =
3629 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
3630
3631 WARN_ON_ONCE(adev->gfx.gfx_off_state);
3632 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
3633
3634 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0))
3635 adev->gfx.gfx_off_state = true;
3636 }
3637
3638 /**
3639 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
3640 *
3641 * @adev: amdgpu_device pointer
3642 *
3643 * Main suspend function for hardware IPs. The list of all the hardware
3644 * IPs that make up the asic is walked, clockgating is disabled and the
3645 * suspend callbacks are run. suspend puts the hardware and software state
3646 * in each IP into a state suitable for suspend.
3647 * Returns 0 on success, negative error code on failure.
3648 */
static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
{
	int i, r, rec;

	/* Ungate everything before suspending. */
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);

	/*
	 * Per PMFW team's suggestion, driver needs to handle gfxoff
	 * and df cstate features disablement for gpu reset(e.g. Mode1Reset)
	 * scenario. Add the missing df cstate disablement here.
	 */
	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
		dev_warn(adev->dev, "Failed to disallow df cstate");

	/* Phase 1 suspends only the display (DCE) blocks, in reverse order. */
	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.valid)
			continue;

		/* displays are handled separately */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
			continue;

		r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
		if (r)
			goto unwind;
	}

	return 0;
unwind:
	/* Resume any DCE blocks suspended above, then restore df cstate
	 * and gating so the device is left in its pre-call state.
	 */
	rec = amdgpu_device_ip_resume_phase3(adev);
	if (rec)
		dev_err(adev->dev,
			"amdgpu_device_ip_resume_phase3 failed during unwind: %d\n",
			rec);

	amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW);

	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);

	/* return the original suspend error, not the unwind result */
	return r;
}
3692
3693 /**
3694 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
3695 *
3696 * @adev: amdgpu_device pointer
3697 *
3698 * Main suspend function for hardware IPs. The list of all the hardware
3699 * IPs that make up the asic is walked, clockgating is disabled and the
3700 * suspend callbacks are run. suspend puts the hardware and software state
3701 * in each IP into a state suitable for suspend.
3702 * Returns 0 on success, negative error code on failure.
3703 */
static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
{
	int i, r, rec;

	if (adev->in_s0ix)
		amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);

	/* Suspend all non-display blocks in reverse order; several classes
	 * of blocks are skipped depending on S0ix / SR-IOV / reset state.
	 */
	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		/* displays are handled in phase1 */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
			continue;
		/* PSP lost connection when err_event_athub occurs */
		if (amdgpu_ras_intr_triggered() &&
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
			adev->ip_blocks[i].status.hw = false;
			continue;
		}

		/* skip unnecessary suspend if we do not initialize them yet */
		if (!amdgpu_ip_member_of_hwini(
			    adev, adev->ip_blocks[i].version->type))
			continue;

		/* Since we skip suspend for S0i3, we need to cancel the delayed
		 * idle work here as the suspend callback never gets called.
		 */
		if (adev->in_s0ix &&
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX &&
		    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0))
			cancel_delayed_work_sync(&adev->gfx.idle_work);
		/* skip suspend of gfx/mes and psp for S0ix
		 * gfx is in gfxoff state, so on resume it will exit gfxoff just
		 * like at runtime. PSP is also part of the always on hardware
		 * so no need to suspend it.
		 */
		if (adev->in_s0ix &&
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
			continue;

		/* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
		if (adev->in_s0ix &&
		    (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >=
		     IP_VERSION(5, 0, 0)) &&
		    (adev->ip_blocks[i].version->type ==
		     AMD_IP_BLOCK_TYPE_SDMA))
			continue;

		/* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot.
		 * These are in TMR, hence are expected to be reused by PSP-TOS to reload
		 * from this location and RLC Autoload automatically also gets loaded
		 * from here based on PMFW -> PSP message during re-init sequence.
		 * Therefore, the psp suspend & resume should be skipped to avoid destroy
		 * the TMR and reload FWs again for IMU enabled APU ASICs.
		 */
		if (amdgpu_in_reset(adev) &&
		    (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
			continue;

		r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
		if (r)
			goto unwind;

		/* handle putting the SMC in the appropriate state */
		if (!amdgpu_sriov_vf(adev)) {
			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
				if (r) {
					dev_err(adev->dev,
						"SMC failed to set mp1 state %d, %d\n",
						adev->mp1_state, r);
					goto unwind;
				}
			}
		}
	}

	return 0;
unwind:
	/* suspend phase 2 = resume phase 1 + resume phase 2 */
	rec = amdgpu_device_ip_resume_phase1(adev);
	if (rec) {
		dev_err(adev->dev,
			"amdgpu_device_ip_resume_phase1 failed during unwind: %d\n",
			rec);
		return r;
	}

	rec = amdgpu_device_fw_loading(adev);
	if (rec) {
		dev_err(adev->dev,
			"amdgpu_device_fw_loading failed during unwind: %d\n",
			rec);
		return r;
	}

	rec = amdgpu_device_ip_resume_phase2(adev);
	if (rec) {
		dev_err(adev->dev,
			"amdgpu_device_ip_resume_phase2 failed during unwind: %d\n",
			rec);
		return r;
	}

	/* always report the original suspend failure to the caller */
	return r;
}
3814
3815 /**
3816 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3817 *
3818 * @adev: amdgpu_device pointer
3819 *
3820 * Main suspend function for hardware IPs. The list of all the hardware
3821 * IPs that make up the asic is walked, clockgating is disabled and the
3822 * suspend callbacks are run. suspend puts the hardware and software state
3823 * in each IP into a state suitable for suspend.
3824 * Returns 0 on success, negative error code on failure.
3825 */
static int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
{
	int r;

	/* On SR-IOV, stop host/guest data exchange and take exclusive
	 * access before touching the hardware.
	 */
	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_fini_data_exchange(adev);
		amdgpu_virt_request_full_gpu(adev, false);
	}

	amdgpu_ttm_set_buffer_funcs_status(adev, false);

	/* Phase 1 (displays) must complete before phase 2 (everything else). */
	r = amdgpu_device_ip_suspend_phase1(adev);
	if (r)
		return r;
	r = amdgpu_device_ip_suspend_phase2(adev);

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, false);

	return r;
}
3847
amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device * adev)3848 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3849 {
3850 int i, r;
3851
3852 static enum amd_ip_block_type ip_order[] = {
3853 AMD_IP_BLOCK_TYPE_COMMON,
3854 AMD_IP_BLOCK_TYPE_GMC,
3855 AMD_IP_BLOCK_TYPE_PSP,
3856 AMD_IP_BLOCK_TYPE_IH,
3857 };
3858
3859 for (i = 0; i < adev->num_ip_blocks; i++) {
3860 int j;
3861 struct amdgpu_ip_block *block;
3862
3863 block = &adev->ip_blocks[i];
3864 block->status.hw = false;
3865
3866 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3867
3868 if (block->version->type != ip_order[j] ||
3869 !block->status.valid)
3870 continue;
3871
3872 r = block->version->funcs->hw_init(&adev->ip_blocks[i]);
3873 if (r) {
3874 dev_err(adev->dev, "RE-INIT-early: %s failed\n",
3875 block->version->funcs->name);
3876 return r;
3877 }
3878 block->status.hw = true;
3879 }
3880 }
3881
3882 return 0;
3883 }
3884
amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device * adev)3885 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3886 {
3887 struct amdgpu_ip_block *block;
3888 int i, r = 0;
3889
3890 static enum amd_ip_block_type ip_order[] = {
3891 AMD_IP_BLOCK_TYPE_SMC,
3892 AMD_IP_BLOCK_TYPE_DCE,
3893 AMD_IP_BLOCK_TYPE_GFX,
3894 AMD_IP_BLOCK_TYPE_SDMA,
3895 AMD_IP_BLOCK_TYPE_MES,
3896 AMD_IP_BLOCK_TYPE_UVD,
3897 AMD_IP_BLOCK_TYPE_VCE,
3898 AMD_IP_BLOCK_TYPE_VCN,
3899 AMD_IP_BLOCK_TYPE_JPEG
3900 };
3901
3902 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3903 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]);
3904
3905 if (!block)
3906 continue;
3907
3908 if (block->status.valid && !block->status.hw) {
3909 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) {
3910 r = amdgpu_ip_block_resume(block);
3911 } else {
3912 r = block->version->funcs->hw_init(block);
3913 }
3914
3915 if (r) {
3916 dev_err(adev->dev, "RE-INIT-late: %s failed\n",
3917 block->version->funcs->name);
3918 break;
3919 }
3920 block->status.hw = true;
3921 }
3922 }
3923
3924 return r;
3925 }
3926
3927 /**
3928 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3929 *
3930 * @adev: amdgpu_device pointer
3931 *
3932 * First resume function for hardware IPs. The list of all the hardware
3933 * IPs that make up the asic is walked and the resume callbacks are run for
3934 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3935 * after a suspend and updates the software state as necessary. This
3936 * function is also used for restoring the GPU after a GPU reset.
3937 * Returns 0 on success, negative error code on failure.
3938 */
amdgpu_device_ip_resume_phase1(struct amdgpu_device * adev)3939 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3940 {
3941 int i, r;
3942
3943 for (i = 0; i < adev->num_ip_blocks; i++) {
3944 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3945 continue;
3946 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3947 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3948 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3949 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3950
3951 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3952 if (r)
3953 return r;
3954 }
3955 }
3956
3957 return 0;
3958 }
3959
3960 /**
3961 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3962 *
3963 * @adev: amdgpu_device pointer
3964 *
3965 * Second resume function for hardware IPs. The list of all the hardware
3966 * IPs that make up the asic is walked and the resume callbacks are run for
3967 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3968 * functional state after a suspend and updates the software state as
3969 * necessary. This function is also used for restoring the GPU after a GPU
3970 * reset.
3971 * Returns 0 on success, negative error code on failure.
3972 */
amdgpu_device_ip_resume_phase2(struct amdgpu_device * adev)3973 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3974 {
3975 int i, r;
3976
3977 for (i = 0; i < adev->num_ip_blocks; i++) {
3978 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3979 continue;
3980 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3981 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3982 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3983 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE ||
3984 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3985 continue;
3986 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3987 if (r)
3988 return r;
3989 }
3990
3991 return 0;
3992 }
3993
3994 /**
3995 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs
3996 *
3997 * @adev: amdgpu_device pointer
3998 *
3999 * Third resume function for hardware IPs. The list of all the hardware
4000 * IPs that make up the asic is walked and the resume callbacks are run for
4001 * all DCE. resume puts the hardware into a functional state after a suspend
4002 * and updates the software state as necessary. This function is also used
4003 * for restoring the GPU after a GPU reset.
4004 *
4005 * Returns 0 on success, negative error code on failure.
4006 */
amdgpu_device_ip_resume_phase3(struct amdgpu_device * adev)4007 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
4008 {
4009 int i, r;
4010
4011 for (i = 0; i < adev->num_ip_blocks; i++) {
4012 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
4013 continue;
4014 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
4015 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
4016 if (r)
4017 return r;
4018 }
4019 }
4020
4021 return 0;
4022 }
4023
4024 /**
4025 * amdgpu_device_ip_resume - run resume for hardware IPs
4026 *
4027 * @adev: amdgpu_device pointer
4028 *
4029 * Main resume function for hardware IPs. The hardware IPs
4030 * are split into two resume functions because they are
4031 * also used in recovering from a GPU reset and some additional
4032 * steps need to be take between them. In this case (S3/S4) they are
4033 * run sequentially.
4034 * Returns 0 on success, negative error code on failure.
4035 */
amdgpu_device_ip_resume(struct amdgpu_device * adev)4036 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
4037 {
4038 int r;
4039
4040 r = amdgpu_device_ip_resume_phase1(adev);
4041 if (r)
4042 return r;
4043
4044 r = amdgpu_device_fw_loading(adev);
4045 if (r)
4046 return r;
4047
4048 r = amdgpu_device_ip_resume_phase2(adev);
4049
4050 if (adev->mman.buffer_funcs_ring->sched.ready)
4051 amdgpu_ttm_set_buffer_funcs_status(adev, true);
4052
4053 if (r)
4054 return r;
4055
4056 amdgpu_fence_driver_hw_init(adev);
4057
4058 r = amdgpu_device_ip_resume_phase3(adev);
4059
4060 return r;
4061 }
4062
4063 /**
4064 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
4065 *
4066 * @adev: amdgpu_device pointer
4067 *
4068 * Query the VBIOS data tables to determine if the board supports SR-IOV.
4069 */
amdgpu_device_detect_sriov_bios(struct amdgpu_device * adev)4070 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
4071 {
4072 if (amdgpu_sriov_vf(adev)) {
4073 if (adev->is_atom_fw) {
4074 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
4075 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
4076 } else {
4077 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
4078 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
4079 }
4080
4081 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
4082 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
4083 }
4084 }
4085
/**
 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
 *
 * @pdev : pci device context
 * @asic_type: AMD asic type
 *
 * Check if there is DC (new modesetting infrastructre) support for an asic.
 * The result also honors the amdgpu_dc module parameter (0 = off,
 * >0 = force on where possible, otherwise per-ASIC default).
 * returns true if DC has support, false if not.
 */
bool amdgpu_device_asic_has_dc_support(struct pci_dev *pdev,
				       enum amd_asic_type asic_type)
{
	switch (asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#if defined(CONFIG_DRM_AMD_DC)
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
		/* SI parts additionally need DC-SI support compiled in. */
		return amdgpu_dc != 0 && IS_ENABLED(CONFIG_DRM_AMD_DC_SI);
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		/*
		 * We have systems in the wild with these ASICs that require
		 * TRAVIS and NUTMEG support which is not supported with DC.
		 *
		 * Fallback to the non-DC driver here by default so as not to
		 * cause regressions.
		 */
		return amdgpu_dc > 0;
	default:
		/* All other ASICs: DC unless explicitly disabled. */
		return amdgpu_dc != 0;
#else
	default:
		/* DC not compiled in at all; warn once if it was requested. */
		if (amdgpu_dc > 0)
			dev_info_once(
				&pdev->dev,
				"Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
		return false;
#endif
	}
}
4134
4135 /**
4136 * amdgpu_device_has_dc_support - check if dc is supported
4137 *
4138 * @adev: amdgpu_device pointer
4139 *
4140 * Returns true for supported, false for not supported
4141 */
amdgpu_device_has_dc_support(struct amdgpu_device * adev)4142 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
4143 {
4144 if (adev->enable_virtual_display ||
4145 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
4146 return false;
4147
4148 return amdgpu_device_asic_has_dc_support(adev->pdev, adev->asic_type);
4149 }
4150
/*
 * amdgpu_device_xgmi_reset_func - per-device worker for an XGMI hive reset
 *
 * Runs as a work item (adev->xgmi_reset_work) on every device in the
 * hive; the workers rendezvous through the hive's task barrier so that
 * the reset steps happen in lockstep across all nodes.
 */
static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
{
	struct amdgpu_device *adev =
		container_of(__work, struct amdgpu_device, xgmi_reset_work);
	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

	/* It's a bug to not have a hive within this function */
	if (WARN_ON(!hive))
		return;

	/*
	 * Use task barrier to synchronize all xgmi reset works across the
	 * hive. task_barrier_enter and task_barrier_exit will block
	 * until all the threads running the xgmi reset works reach
	 * those points. task_barrier_full will do both blocks.
	 */
	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {

		/* All nodes enter BACO together, then exit together. */
		task_barrier_enter(&hive->tb);
		adev->asic_reset_res = amdgpu_device_baco_enter(adev);

		/*
		 * NOTE(review): on failure we jump past task_barrier_exit;
		 * peers may still be blocked in the barrier — confirm the
		 * barrier copes with a member bailing out.
		 */
		if (adev->asic_reset_res)
			goto fail;

		task_barrier_exit(&hive->tb);
		adev->asic_reset_res = amdgpu_device_baco_exit(adev);

		if (adev->asic_reset_res)
			goto fail;

		amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
	} else {

		/* Non-BACO method: full barrier, then a plain ASIC reset. */
		task_barrier_full(&hive->tb);
		adev->asic_reset_res = amdgpu_asic_reset(adev);
	}

fail:
	if (adev->asic_reset_res)
		dev_warn(adev->dev,
			 "ASIC reset failed with error, %d for drm dev, %s",
			 adev->asic_reset_res, adev_to_drm(adev)->unique);
	/* Drop the reference taken by amdgpu_get_xgmi_hive() above. */
	amdgpu_put_xgmi_hive(hive);
}
4195
amdgpu_device_get_job_timeout_settings(struct amdgpu_device * adev)4196 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
4197 {
4198 char *input = amdgpu_lockup_timeout;
4199 char *timeout_setting = NULL;
4200 int index = 0;
4201 long timeout;
4202 int ret = 0;
4203
4204 /* By default timeout for all queues is 2 sec */
4205 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
4206 adev->video_timeout = msecs_to_jiffies(2000);
4207
4208 if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
4209 return 0;
4210
4211 while ((timeout_setting = strsep(&input, ",")) &&
4212 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
4213 ret = kstrtol(timeout_setting, 0, &timeout);
4214 if (ret)
4215 return ret;
4216
4217 if (timeout == 0) {
4218 index++;
4219 continue;
4220 } else if (timeout < 0) {
4221 timeout = MAX_SCHEDULE_TIMEOUT;
4222 dev_warn(adev->dev, "lockup timeout disabled");
4223 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
4224 } else {
4225 timeout = msecs_to_jiffies(timeout);
4226 }
4227
4228 switch (index++) {
4229 case 0:
4230 adev->gfx_timeout = timeout;
4231 break;
4232 case 1:
4233 adev->compute_timeout = timeout;
4234 break;
4235 case 2:
4236 adev->sdma_timeout = timeout;
4237 break;
4238 case 3:
4239 adev->video_timeout = timeout;
4240 break;
4241 default:
4242 break;
4243 }
4244 }
4245
4246 /* When only one value specified apply it to all queues. */
4247 if (index == 1)
4248 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
4249 adev->video_timeout = timeout;
4250
4251 return ret;
4252 }
4253
4254 /**
4255 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
4256 *
4257 * @adev: amdgpu_device pointer
4258 *
4259 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode
4260 */
amdgpu_device_check_iommu_direct_map(struct amdgpu_device * adev)4261 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
4262 {
4263 struct iommu_domain *domain;
4264
4265 domain = iommu_get_domain_for_dev(adev->dev);
4266 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
4267 adev->ram_is_direct_mapped = true;
4268 }
4269
#if defined(CONFIG_HSA_AMD_P2P)
/**
 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
 *
 * @adev: amdgpu_device pointer
 *
 * Returns true when the device sits behind an IOMMU domain that
 * remaps DMA addresses (DMA or flush-queue DMA domain).
 */
static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
{
	struct iommu_domain *dom = iommu_get_domain_for_dev(adev->dev);

	if (!dom)
		return false;

	return dom->type == IOMMU_DOMAIN_DMA ||
	       dom->type == IOMMU_DOMAIN_DMA_FQ;
}
#endif
4290
amdgpu_device_set_mcbp(struct amdgpu_device * adev)4291 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
4292 {
4293 if (amdgpu_mcbp == 1)
4294 adev->gfx.mcbp = true;
4295 else if (amdgpu_mcbp == 0)
4296 adev->gfx.mcbp = false;
4297
4298 if (amdgpu_sriov_vf(adev))
4299 adev->gfx.mcbp = true;
4300
4301 if (adev->gfx.mcbp)
4302 dev_info(adev->dev, "MCBP is enabled\n");
4303 }
4304
amdgpu_device_sys_interface_init(struct amdgpu_device * adev)4305 static int amdgpu_device_sys_interface_init(struct amdgpu_device *adev)
4306 {
4307 int r;
4308
4309 r = amdgpu_atombios_sysfs_init(adev);
4310 if (r)
4311 drm_err(&adev->ddev,
4312 "registering atombios sysfs failed (%d).\n", r);
4313
4314 r = amdgpu_pm_sysfs_init(adev);
4315 if (r)
4316 dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r);
4317
4318 r = amdgpu_ucode_sysfs_init(adev);
4319 if (r) {
4320 adev->ucode_sysfs_en = false;
4321 dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r);
4322 } else
4323 adev->ucode_sysfs_en = true;
4324
4325 r = amdgpu_device_attr_sysfs_init(adev);
4326 if (r)
4327 dev_err(adev->dev, "Could not create amdgpu device attr\n");
4328
4329 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
4330 if (r)
4331 dev_err(adev->dev,
4332 "Could not create amdgpu board attributes\n");
4333
4334 amdgpu_fru_sysfs_init(adev);
4335 amdgpu_reg_state_sysfs_init(adev);
4336 amdgpu_xcp_sysfs_init(adev);
4337 amdgpu_uma_sysfs_init(adev);
4338
4339 return r;
4340 }
4341
amdgpu_device_sys_interface_fini(struct amdgpu_device * adev)4342 static void amdgpu_device_sys_interface_fini(struct amdgpu_device *adev)
4343 {
4344 if (adev->pm.sysfs_initialized)
4345 amdgpu_pm_sysfs_fini(adev);
4346 if (adev->ucode_sysfs_en)
4347 amdgpu_ucode_sysfs_fini(adev);
4348 amdgpu_device_attr_sysfs_fini(adev);
4349 amdgpu_fru_sysfs_fini(adev);
4350
4351 amdgpu_reg_state_sysfs_fini(adev);
4352 amdgpu_xcp_sysfs_fini(adev);
4353 amdgpu_uma_sysfs_fini(adev);
4354 }
4355
/**
 * amdgpu_device_init - initialize the driver
 *
 * @adev: amdgpu_device pointer
 * @flags: driver flags
 *
 * Initializes the driver info and hw (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver startup.
 */
int amdgpu_device_init(struct amdgpu_device *adev,
		       uint32_t flags)
{
	struct pci_dev *pdev = adev->pdev;
	int r, i;
	bool px = false;
	u32 max_MBps;
	int tmp;

	adev->shutdown = false;
	adev->flags = flags;

	/* The module parameter can force a specific ASIC type. */
	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
		adev->asic_type = amdgpu_force_asic_type;
	else
		adev->asic_type = flags & AMD_ASIC_MASK;

	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
	if (amdgpu_emu_mode == 1)
		adev->usec_timeout *= 10;
	adev->gmc.gart_size = 512 * 1024 * 1024;
	adev->accel_working = false;
	adev->num_rings = 0;
	RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
	adev->mman.buffer_funcs = NULL;
	adev->mman.buffer_funcs_ring = NULL;
	adev->vm_manager.vm_pte_funcs = NULL;
	adev->vm_manager.vm_pte_num_scheds = 0;
	adev->gmc.gmc_funcs = NULL;
	adev->harvest_ip_mask = 0x0;
	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);

	/*
	 * Point every register accessor at an "invalid" stub so stray
	 * accesses before the real callbacks are installed are caught.
	 */
	adev->smc_rreg = &amdgpu_invalid_rreg;
	adev->smc_wreg = &amdgpu_invalid_wreg;
	adev->pcie_rreg = &amdgpu_invalid_rreg;
	adev->pcie_wreg = &amdgpu_invalid_wreg;
	adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
	adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
	adev->pciep_rreg = &amdgpu_invalid_rreg;
	adev->pciep_wreg = &amdgpu_invalid_wreg;
	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
	adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext;
	adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;
	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
	adev->didt_rreg = &amdgpu_invalid_rreg;
	adev->didt_wreg = &amdgpu_invalid_wreg;
	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;

	dev_info(
		adev->dev,
		"initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
		amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
		pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);

	/* mutex initialization are all done here so we
	 * can recall function without having locking issues
	 */
	mutex_init(&adev->firmware.mutex);
	mutex_init(&adev->pm.mutex);
	mutex_init(&adev->gfx.gpu_clock_mutex);
	mutex_init(&adev->srbm_mutex);
	mutex_init(&adev->gfx.pipe_reserve_mutex);
	mutex_init(&adev->gfx.gfx_off_mutex);
	mutex_init(&adev->gfx.partition_mutex);
	mutex_init(&adev->grbm_idx_mutex);
	mutex_init(&adev->mn_lock);
	mutex_init(&adev->virt.vf_errors.lock);
	hash_init(adev->mn_hash);
	mutex_init(&adev->psp.mutex);
	mutex_init(&adev->notifier_lock);
	mutex_init(&adev->pm.stable_pstate_ctx_lock);
	mutex_init(&adev->benchmark_mutex);
	mutex_init(&adev->gfx.reset_sem_mutex);
	/* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
	mutex_init(&adev->enforce_isolation_mutex);
	for (i = 0; i < MAX_XCP; ++i) {
		adev->isolation[i].spearhead = dma_fence_get_stub();
		amdgpu_sync_create(&adev->isolation[i].active);
		amdgpu_sync_create(&adev->isolation[i].prev);
	}
	mutex_init(&adev->gfx.userq_sch_mutex);
	mutex_init(&adev->gfx.workload_profile_mutex);
	mutex_init(&adev->vcn.workload_profile_mutex);

	amdgpu_device_init_apu_flags(adev);

	/* Validate module parameters before they are used below. */
	r = amdgpu_device_check_arguments(adev);
	if (r)
		return r;

	spin_lock_init(&adev->mmio_idx_lock);
	spin_lock_init(&adev->smc_idx_lock);
	spin_lock_init(&adev->pcie_idx_lock);
	spin_lock_init(&adev->uvd_ctx_idx_lock);
	spin_lock_init(&adev->didt_idx_lock);
	spin_lock_init(&adev->gc_cac_idx_lock);
	spin_lock_init(&adev->se_cac_idx_lock);
	spin_lock_init(&adev->audio_endpt_idx_lock);
	spin_lock_init(&adev->mm_stats.lock);
	spin_lock_init(&adev->virt.rlcg_reg_lock);
	spin_lock_init(&adev->wb.lock);

	xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ);

	INIT_LIST_HEAD(&adev->reset_list);

	INIT_LIST_HEAD(&adev->ras_list);

	INIT_LIST_HEAD(&adev->pm.od_kobj_list);

	xa_init(&adev->userq_doorbell_xa);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);
	/*
	 * Initialize the enforce_isolation work structures for each XCP
	 * partition. This work handler is responsible for enforcing shader
	 * isolation on AMD GPUs. It counts the number of emitted fences for
	 * each GFX and compute ring. If there are any fences, it schedules
	 * the `enforce_isolation_work` to be run after a delay. If there are
	 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
	 * runqueue.
	 */
	for (i = 0; i < MAX_XCP; i++) {
		INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
				  amdgpu_gfx_enforce_isolation_handler);
		adev->gfx.enforce_isolation[i].adev = adev;
		adev->gfx.enforce_isolation[i].xcp_id = i;
	}

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
	INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);

	adev->gfx.gfx_off_req_count = 1;
	adev->gfx.gfx_off_residency = 0;
	adev->gfx.gfx_off_entrycount = 0;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds(retelimited printk interval) + 1(waiting
	 * for throttling interrupt) = 60 seconds.
	 */
	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);

	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);

	/* Registers mapping */
	/* TODO: block userspace mapping of io register */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (!adev->rmmio)
		return -ENOMEM;

	dev_info(adev->dev, "register mmio base: 0x%08X\n",
		 (uint32_t)adev->rmmio_base);
	dev_info(adev->dev, "register mmio size: %u\n",
		 (unsigned int)adev->rmmio_size);

	/*
	 * Reset domain needs to be present early, before XGMI hive discovered
	 * (if any) and initialized to use reset sem and in_gpu reset flag
	 * early on during init and before calling to RREG32.
	 */
	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
	if (!adev->reset_domain)
		return -ENOMEM;

	/* detect hw virtualization here */
	amdgpu_virt_init(adev);

	amdgpu_device_get_pcie_info(adev);

	r = amdgpu_device_get_job_timeout_settings(adev);
	if (r) {
		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
		return r;
	}

	amdgpu_device_set_mcbp(adev);

	/*
	 * By default, use default mode where all blocks are expected to be
	 * initialized. At present a 'swinit' of blocks is required to be
	 * completed before the need for a different level is detected.
	 */
	amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
	/* early init functions */
	r = amdgpu_device_ip_early_init(adev);
	if (r)
		return r;

	/*
	 * No need to remove conflicting FBs for non-display class devices.
	 * This prevents the sysfb from being freed accidently.
	 */
	if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA ||
	    (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) {
		/* Get rid of things like offb */
		r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
		if (r)
			return r;
	}

	/* Enable TMZ based on IP_VERSION */
	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_sriov_vf(adev) &&
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
		/* VF MMIO access (except mailbox range) from CPU
		 * will be blocked during sriov runtime
		 */
		adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;

	amdgpu_gmc_noretry_set(adev);
	/* Need to get xgmi info early to decide the reset behavior*/
	if (adev->gmc.xgmi.supported) {
		r = adev->gfxhub.funcs->get_xgmi_info(adev);
		if (r)
			return r;
	}

	/* enable PCIE atomic ops */
	if (amdgpu_sriov_vf(adev)) {
		/* On SR-IOV the PF reports atomic support via the pf2vf info. */
		if (adev->virt.fw_reserve.p_pf2vf)
			adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
				adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
				(PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	/* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a
	 * internal path natively support atomics, set have_atomics_support to true.
	 */
	} else if ((adev->flags & AMD_IS_APU &&
		    amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) ||
		   (adev->gmc.xgmi.connected_to_cpu &&
		    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 1, 0))) {
		adev->have_atomics_support = true;
	} else {
		adev->have_atomics_support =
			!pci_enable_atomic_ops_to_root(adev->pdev,
						       PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
						       PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	}

	if (!adev->have_atomics_support)
		dev_info(adev->dev, "PCIE atomic ops is not supported\n");

	/* doorbell bar mapping and doorbell index init*/
	amdgpu_doorbell_init(adev);

	if (amdgpu_emu_mode == 1) {
		/* post the asic on emulation mode */
		emu_soc_asic_init(adev);
		goto fence_driver_init;
	}

	amdgpu_reset_init(adev);

	/* detect if we are with an SRIOV vbios */
	if (adev->bios)
		amdgpu_device_detect_sriov_bios(adev);

	/* check if we need to reset the asic
	 *  E.g., driver was not cleanly unloaded previously, etc.
	 */
	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
		if (adev->gmc.xgmi.num_physical_nodes) {
			dev_info(adev->dev, "Pending hive reset.\n");
			amdgpu_set_init_level(adev,
					      AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
		} else {
			tmp = amdgpu_reset_method;
			/* It should do a default reset when loading or reloading the driver,
			 * regardless of the module parameter reset_method.
			 */
			amdgpu_reset_method = AMD_RESET_METHOD_NONE;
			r = amdgpu_asic_reset(adev);
			amdgpu_reset_method = tmp;
		}

		if (r) {
			dev_err(adev->dev, "asic reset on init failed\n");
			goto failed;
		}
	}

	/* Post card if necessary */
	if (amdgpu_device_need_post(adev)) {
		if (!adev->bios) {
			dev_err(adev->dev, "no vBIOS found\n");
			r = -EINVAL;
			goto failed;
		}
		dev_info(adev->dev, "GPU posting now...\n");
		r = amdgpu_device_asic_init(adev);
		if (r) {
			dev_err(adev->dev, "gpu post error!\n");
			goto failed;
		}
	}

	if (adev->bios) {
		if (adev->is_atom_fw) {
			/* Initialize clocks */
			r = amdgpu_atomfirmware_get_clock_info(adev);
			if (r) {
				dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
				goto failed;
			}
		} else {
			/* Initialize clocks */
			r = amdgpu_atombios_get_clock_info(adev);
			if (r) {
				dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
				goto failed;
			}
			/* init i2c buses */
			amdgpu_i2c_init(adev);
		}
	}

fence_driver_init:
	/* Fence driver */
	r = amdgpu_fence_driver_sw_init(adev);
	if (r) {
		dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
		goto failed;
	}

	/* init the mode config */
	drm_mode_config_init(adev_to_drm(adev));

	r = amdgpu_device_ip_init(adev);
	if (r) {
		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
		goto release_ras_con;
	}

	amdgpu_fence_driver_hw_init(adev);

	dev_info(adev->dev,
		 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
		 adev->gfx.config.max_shader_engines,
		 adev->gfx.config.max_sh_per_se,
		 adev->gfx.config.max_cu_per_sh,
		 adev->gfx.cu_info.number);

	adev->accel_working = true;

	amdgpu_vm_check_compute_bug(adev);

	/* Initialize the buffer migration limit. */
	if (amdgpu_moverate >= 0)
		max_MBps = amdgpu_moverate;
	else
		max_MBps = 8; /* Allow 8 MB/s. */
	/* Get a log2 for easy divisions. */
	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));

	/*
	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
	 * Otherwise the mgpu fan boost feature will be skipped due to the
	 * gpu instance is counted less.
	 */
	amdgpu_register_gpu_instance(adev);

	/* enable clockgating, etc. after ib tests, etc. since some blocks require
	 * explicit gating rather than handling it automatically.
	 */
	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
		r = amdgpu_device_ip_late_init(adev);
		if (r) {
			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
			goto release_ras_con;
		}
		/* must succeed. */
		amdgpu_ras_resume(adev);
		queue_delayed_work(system_wq, &adev->delayed_init_work,
				   msecs_to_jiffies(AMDGPU_RESUME_MS));
	}

	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_release_full_gpu(adev, true);
		flush_delayed_work(&adev->delayed_init_work);
	}

	/* Don't init kfd if whole hive need to be reset during init */
	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
		kgd2kfd_init_zone_device(adev);
		kfd_update_svm_support_properties(adev);
	}

	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		amdgpu_xgmi_reset_on_init(adev);

	/*
	 * Place those sysfs registering after `late_init`. As some of those
	 * operations performed in `late_init` might affect the sysfs
	 * interfaces creating.
	 */
	r = amdgpu_device_sys_interface_init(adev);

	if (IS_ENABLED(CONFIG_PERF_EVENTS))
		r = amdgpu_pmu_init(adev);
	if (r)
		dev_err(adev->dev, "amdgpu_pmu_init failed\n");

	/* Have stored pci confspace at hand for restore in sudden PCI error */
	if (amdgpu_device_cache_pci_state(adev->pdev))
		pci_restore_state(pdev);

	/* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
	/* this will fail for cards that aren't VGA class devices, just
	 * ignore it
	 */
	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
		vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);

	px = amdgpu_device_supports_px(adev);

	if (px || (!dev_is_removable(&adev->pdev->dev) &&
		   apple_gmux_detect(NULL, NULL)))
		vga_switcheroo_register_client(adev->pdev,
					       &amdgpu_switcheroo_ops, px);

	if (px)
		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);

	amdgpu_device_check_iommu_direct_map(adev);

	/* Get notified of system suspend/hibernate transitions. */
	adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
	r = register_pm_notifier(&adev->pm_nb);
	if (r)
		goto failed;

	return 0;

release_ras_con:
	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, true);

	/* failed in exclusive mode due to timeout */
	if (amdgpu_sriov_vf(adev) &&
	    !amdgpu_sriov_runtime(adev) &&
	    amdgpu_virt_mmio_blocked(adev) &&
	    !amdgpu_virt_wait_reset(adev)) {
		dev_err(adev->dev, "VF exclusive mode timeout\n");
		/* Don't send request since VF is inactive. */
		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
		adev->virt.ops = NULL;
		r = -EAGAIN;
	}
	amdgpu_release_ras_context(adev);

failed:
	amdgpu_vf_error_trans_all(adev);

	return r;
}
4850
amdgpu_device_unmap_mmio(struct amdgpu_device * adev)4851 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4852 {
4853
4854 /* Clear all CPU mappings pointing to this device */
4855 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4856
4857 /* Unmap all mapped bars - Doorbell, registers and VRAM */
4858 amdgpu_doorbell_fini(adev);
4859
4860 iounmap(adev->rmmio);
4861 adev->rmmio = NULL;
4862 if (adev->mman.aper_base_kaddr)
4863 iounmap(adev->mman.aper_base_kaddr);
4864 adev->mman.aper_base_kaddr = NULL;
4865
4866 /* Memory manager related */
4867 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4868 arch_phys_wc_del(adev->gmc.vram_mtrr);
4869 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4870 }
4871 }
4872
/**
 * amdgpu_device_fini_hw - tear down the driver
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down the driver info (all asics).
 * Called at driver shutdown.
 */
void amdgpu_device_fini_hw(struct amdgpu_device *adev)
{
	dev_info(adev->dev, "finishing device.\n");
	flush_delayed_work(&adev->delayed_init_work);

	if (adev->mman.initialized)
		drain_workqueue(adev->mman.bdev.wq);
	adev->shutdown = true;

	/* Undo the register_pm_notifier() from amdgpu_device_init(). */
	unregister_pm_notifier(&adev->pm_nb);

	/* make sure IB test finished before entering exclusive mode
	 * to avoid preemption on IB test
	 */
	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_request_full_gpu(adev, false);
		amdgpu_virt_fini_data_exchange(adev);
	}

	/* Ungate power and clocks before shutting the blocks down. */
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);

	/* disable all interrupts */
	amdgpu_irq_disable_all(adev);
	if (adev->mode_info.mode_config_initialized) {
		if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
			drm_helper_force_disable_all(adev_to_drm(adev));
		else
			drm_atomic_helper_shutdown(adev_to_drm(adev));
	}
	amdgpu_fence_driver_hw_fini(adev);

	amdgpu_device_sys_interface_fini(adev);

	/* disable ras feature must before hw fini */
	amdgpu_ras_pre_fini(adev);

	amdgpu_ttm_set_buffer_funcs_status(adev, false);

	/*
	 * device went through surprise hotplug; we need to destroy topology
	 * before ip_fini_early to prevent kfd locking refcount issues by calling
	 * amdgpu_amdkfd_suspend()
	 */
	if (pci_dev_is_disconnected(adev->pdev))
		amdgpu_amdkfd_device_fini_sw(adev);

	amdgpu_device_ip_fini_early(adev);

	amdgpu_irq_fini_hw(adev);

	if (adev->mman.initialized)
		ttm_device_clear_dma_mappings(&adev->mman.bdev);

	amdgpu_gart_dummy_page_fini(adev);

	/* After surprise removal the BARs are gone; drop our mappings. */
	if (pci_dev_is_disconnected(adev->pdev))
		amdgpu_device_unmap_mmio(adev);

}
4941
/* Tear down the software state of the device. Runs after
 * amdgpu_device_fini_hw(); frees allocations and unregisters services.
 */
void amdgpu_device_fini_sw(struct amdgpu_device *adev)
{
	int i, idx;
	bool px;

	amdgpu_device_ip_fini(adev);
	amdgpu_fence_driver_sw_fini(adev);
	amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
	adev->accel_working = false;
	/* No concurrent readers remain at this point, so plain protected deref */
	dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
	/* Drop per-partition isolation tracking state */
	for (i = 0; i < MAX_XCP; ++i) {
		dma_fence_put(adev->isolation[i].spearhead);
		amdgpu_sync_free(&adev->isolation[i].active);
		amdgpu_sync_free(&adev->isolation[i].prev);
	}

	amdgpu_reset_fini(adev);

	/* free i2c buses */
	amdgpu_i2c_fini(adev);

	if (adev->bios) {
		/* emulation mode 1 never initialized atombios */
		if (amdgpu_emu_mode != 1)
			amdgpu_atombios_fini(adev);
		amdgpu_bios_release(adev);
	}

	kfree(adev->fru_info);
	adev->fru_info = NULL;

	kfree(adev->xcp_mgr);
	adev->xcp_mgr = NULL;

	px = amdgpu_device_supports_px(adev);

	/* Mirror the registration conditions used at init time: PX systems,
	 * or non-removable devices behind an apple-gmux mux.
	 */
	if (px || (!dev_is_removable(&adev->pdev->dev) &&
		   apple_gmux_detect(NULL, NULL)))
		vga_switcheroo_unregister_client(adev->pdev);

	if (px)
		vga_switcheroo_fini_domain_pm_ops(adev->dev);

	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
		vga_client_unregister(adev->pdev);

	/* Only unmap registers if the device hasn't been unplugged already */
	if (drm_dev_enter(adev_to_drm(adev), &idx)) {

		iounmap(adev->rmmio);
		adev->rmmio = NULL;
		drm_dev_exit(idx);
	}

	if (IS_ENABLED(CONFIG_PERF_EVENTS))
		amdgpu_pmu_fini(adev);
	if (adev->discovery.bin)
		amdgpu_discovery_fini(adev);

	amdgpu_reset_put_reset_domain(adev->reset_domain);
	adev->reset_domain = NULL;

	/* Cached PCI config space copies */
	kfree(adev->pci_state);
	kfree(adev->pcie_reset_ctx.swds_pcistate);
	kfree(adev->pcie_reset_ctx.swus_pcistate);
}
5006
5007 /**
5008 * amdgpu_device_evict_resources - evict device resources
5009 * @adev: amdgpu device object
5010 *
5011 * Evicts all ttm device resources(vram BOs, gart table) from the lru list
5012 * of the vram memory type. Mainly used for evicting device resources
5013 * at suspend time.
5014 *
5015 */
amdgpu_device_evict_resources(struct amdgpu_device * adev)5016 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
5017 {
5018 int ret;
5019
5020 /* No need to evict vram on APUs unless going to S4 */
5021 if (!adev->in_s4 && (adev->flags & AMD_IS_APU))
5022 return 0;
5023
5024 /* No need to evict when going to S5 through S4 callbacks */
5025 if (system_state == SYSTEM_POWER_OFF)
5026 return 0;
5027
5028 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
5029 if (ret) {
5030 dev_warn(adev->dev, "evicting device resources failed\n");
5031 return ret;
5032 }
5033
5034 if (adev->in_s4) {
5035 ret = ttm_device_prepare_hibernation(&adev->mman.bdev);
5036 if (ret)
5037 dev_err(adev->dev, "prepare hibernation failed, %d\n", ret);
5038 }
5039 return ret;
5040 }
5041
5042 /*
5043 * Suspend & resume.
5044 */
5045 /**
5046 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events
5047 * @nb: notifier block
5048 * @mode: suspend mode
5049 * @data: data
5050 *
5051 * This function is called when the system is about to suspend or hibernate.
5052 * It is used to set the appropriate flags so that eviction can be optimized
5053 * in the pm prepare callback.
5054 */
amdgpu_device_pm_notifier(struct notifier_block * nb,unsigned long mode,void * data)5055 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
5056 void *data)
5057 {
5058 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb);
5059
5060 switch (mode) {
5061 case PM_HIBERNATION_PREPARE:
5062 adev->in_s4 = true;
5063 break;
5064 case PM_POST_HIBERNATION:
5065 adev->in_s4 = false;
5066 break;
5067 }
5068
5069 return NOTIFY_DONE;
5070 }
5071
5072 /**
5073 * amdgpu_device_prepare - prepare for device suspend
5074 *
5075 * @dev: drm dev pointer
5076 *
5077 * Prepare to put the hw in the suspend state (all asics).
5078 * Returns 0 for success or an error on failure.
5079 * Called at driver suspend.
5080 */
amdgpu_device_prepare(struct drm_device * dev)5081 int amdgpu_device_prepare(struct drm_device *dev)
5082 {
5083 struct amdgpu_device *adev = drm_to_adev(dev);
5084 int i, r;
5085
5086 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
5087 return 0;
5088
5089 /* Evict the majority of BOs before starting suspend sequence */
5090 r = amdgpu_device_evict_resources(adev);
5091 if (r)
5092 return r;
5093
5094 flush_delayed_work(&adev->gfx.gfx_off_delay_work);
5095
5096 for (i = 0; i < adev->num_ip_blocks; i++) {
5097 if (!adev->ip_blocks[i].status.valid)
5098 continue;
5099 if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
5100 continue;
5101 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]);
5102 if (r)
5103 return r;
5104 }
5105
5106 return 0;
5107 }
5108
5109 /**
5110 * amdgpu_device_complete - complete power state transition
5111 *
5112 * @dev: drm dev pointer
5113 *
5114 * Undo the changes from amdgpu_device_prepare. This will be
5115 * called on all resume transitions, including those that failed.
5116 */
amdgpu_device_complete(struct drm_device * dev)5117 void amdgpu_device_complete(struct drm_device *dev)
5118 {
5119 struct amdgpu_device *adev = drm_to_adev(dev);
5120 int i;
5121
5122 for (i = 0; i < adev->num_ip_blocks; i++) {
5123 if (!adev->ip_blocks[i].status.valid)
5124 continue;
5125 if (!adev->ip_blocks[i].version->funcs->complete)
5126 continue;
5127 adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]);
5128 }
5129 }
5130
5131 /**
5132 * amdgpu_device_suspend - initiate device suspend
5133 *
5134 * @dev: drm dev pointer
5135 * @notify_clients: notify in-kernel DRM clients
5136 *
5137 * Puts the hw in the suspend state (all asics).
5138 * Returns 0 for success or an error on failure.
5139 * Called at driver suspend.
5140 */
/* NOTE: the unwind_* labels below deliberately fall through into each
 * other: a failure at a later suspend stage must undo every earlier
 * stage, so control cascades from the failing label down to unwind_sriov.
 * On the unwind path the original error 'r' is always returned; 'rec'
 * only tracks (and warns about) recovery failures.
 */
int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, rec;

	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
		return 0;

	adev->in_suspend = true;

	/* SR-IOV: quiesce KFD and data exchange, then drop full GPU access */
	if (amdgpu_sriov_vf(adev)) {
		if (!adev->in_runpm)
			amdgpu_amdkfd_suspend_process(adev);
		amdgpu_virt_fini_data_exchange(adev);
		r = amdgpu_virt_request_full_gpu(adev, false);
		if (r)
			return r;
	}

	r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3);
	if (r)
		goto unwind_sriov;

	if (notify_clients)
		drm_client_dev_suspend(adev_to_drm(adev));

	cancel_delayed_work_sync(&adev->delayed_init_work);

	amdgpu_ras_suspend(adev);

	r = amdgpu_device_ip_suspend_phase1(adev);
	if (r)
		goto unwind_smartshift;

	amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
	r = amdgpu_userq_suspend(adev);
	if (r)
		goto unwind_ip_phase1;

	/* evict vram memory before phase2 shuts the copy engines down */
	r = amdgpu_device_evict_resources(adev);
	if (r)
		goto unwind_userq;

	amdgpu_ttm_set_buffer_funcs_status(adev, false);

	amdgpu_fence_driver_hw_fini(adev);

	r = amdgpu_device_ip_suspend_phase2(adev);
	if (r)
		goto unwind_evict;

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, false);

	return 0;

unwind_evict:
	/* Re-enable the buffer-funcs path and fence interrupts we tore down */
	if (adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);
	amdgpu_fence_driver_hw_init(adev);

unwind_userq:
	rec = amdgpu_userq_resume(adev);
	if (rec) {
		dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec);
		return r;
	}
	rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
	if (rec) {
		dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec);
		return r;
	}

unwind_ip_phase1:
	/* suspend phase 1 = resume phase 3 */
	rec = amdgpu_device_ip_resume_phase3(adev);
	if (rec) {
		dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec);
		return r;
	}

unwind_smartshift:
	rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0);
	if (rec) {
		dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec);
		return r;
	}

	if (notify_clients)
		drm_client_dev_resume(adev_to_drm(adev));

	amdgpu_ras_resume(adev);

unwind_sriov:
	if (amdgpu_sriov_vf(adev)) {
		rec = amdgpu_virt_request_full_gpu(adev, true);
		if (rec) {
			dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec);
			return r;
		}
	}

	/* Fully unwound: clear all suspend-state flags before returning */
	adev->in_suspend = adev->in_s0ix = adev->in_s3 = false;

	return r;
}
5247
amdgpu_virt_resume(struct amdgpu_device * adev)5248 static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
5249 {
5250 int r;
5251 unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id;
5252
5253 /* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO)
5254 * may not work. The access could be blocked by nBIF protection as VF isn't in
5255 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX
5256 * so that QEMU reprograms MSIX table.
5257 */
5258 amdgpu_restore_msix(adev);
5259
5260 r = adev->gfxhub.funcs->get_xgmi_info(adev);
5261 if (r)
5262 return r;
5263
5264 dev_info(adev->dev, "xgmi node, old id %d, new id %d\n",
5265 prev_physical_node_id, adev->gmc.xgmi.physical_node_id);
5266
5267 adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev);
5268 adev->vm_manager.vram_base_offset +=
5269 adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size;
5270
5271 return 0;
5272 }
5273
5274 /**
5275 * amdgpu_device_resume - initiate device resume
5276 *
5277 * @dev: drm dev pointer
5278 * @notify_clients: notify in-kernel DRM clients
5279 *
5280 * Bring the hw back to operating state (all asics).
5281 * Returns 0 for success or an error on failure.
5282 * Called at driver resume.
5283 */
int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r = 0;

	/* SR-IOV: regain exclusive GPU access before touching hardware */
	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_virt_request_full_gpu(adev, true);
		if (r)
			return r;
	}

	/* VF may have migrated to another xgmi node while suspended */
	if (amdgpu_virt_xgmi_migrate_enabled(adev)) {
		r = amdgpu_virt_resume(adev);
		if (r)
			goto exit;
	}

	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
		return 0;

	if (adev->in_s0ix)
		amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);

	/* post card */
	if (amdgpu_device_need_post(adev)) {
		r = amdgpu_device_asic_init(adev);
		if (r)
			dev_err(adev->dev, "amdgpu asic init failed\n");
	}

	r = amdgpu_device_ip_resume(adev);

	if (r) {
		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
		goto exit;
	}

	r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
	if (r)
		goto exit;

	r = amdgpu_userq_resume(adev);
	if (r)
		goto exit;

	r = amdgpu_device_ip_late_init(adev);
	if (r)
		goto exit;

	/* Schedule the deferred init work (IB tests) */
	queue_delayed_work(system_wq, &adev->delayed_init_work,
			   msecs_to_jiffies(AMDGPU_RESUME_MS));
exit:
	/* Always release exclusive access, even when resume failed, so the
	 * host is not blocked.
	 */
	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_init_data_exchange(adev);
		amdgpu_virt_release_full_gpu(adev, true);

		if (!r && !adev->in_runpm)
			r = amdgpu_amdkfd_resume_process(adev);
	}

	if (r)
		return r;

	/* Make sure IB tests flushed */
	flush_delayed_work(&adev->delayed_init_work);

	if (notify_clients)
		drm_client_dev_resume(adev_to_drm(adev));

	amdgpu_ras_resume(adev);

	if (adev->mode_info.num_crtc) {
		/*
		 * Most of the connector probing functions try to acquire runtime pm
		 * refs to ensure that the GPU is powered on when connector polling is
		 * performed. Since we're calling this from a runtime PM callback,
		 * trying to acquire rpm refs will cause us to deadlock.
		 *
		 * Since we're guaranteed to be holding the rpm lock, it's safe to
		 * temporarily disable the rpm helpers so this doesn't deadlock us.
		 */
#ifdef CONFIG_PM
		dev->dev->power.disable_depth++;
#endif
		/* Legacy hotplug path without DC, KMS helper path with DC */
		if (!adev->dc_enabled)
			drm_helper_hpd_irq_event(dev);
		else
			drm_kms_helper_hotplug_event(dev);
#ifdef CONFIG_PM
		dev->dev->power.disable_depth--;
#endif
	}

	amdgpu_vram_mgr_clear_reset_blocks(adev);
	adev->in_suspend = false;

	if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0))
		dev_warn(adev->dev, "smart shift update failed\n");

	return 0;
}
5385
5386 /**
5387 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
5388 *
5389 * @adev: amdgpu_device pointer
5390 *
5391 * The list of all the hardware IPs that make up the asic is walked and
5392 * the check_soft_reset callbacks are run. check_soft_reset determines
5393 * if the asic is still hung or not.
5394 * Returns true if any of the IPs are still in a hung state, false if not.
5395 */
amdgpu_device_ip_check_soft_reset(struct amdgpu_device * adev)5396 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
5397 {
5398 int i;
5399 bool asic_hang = false;
5400
5401 if (amdgpu_sriov_vf(adev))
5402 return true;
5403
5404 if (amdgpu_asic_need_full_reset(adev))
5405 return true;
5406
5407 for (i = 0; i < adev->num_ip_blocks; i++) {
5408 if (!adev->ip_blocks[i].status.valid)
5409 continue;
5410 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
5411 adev->ip_blocks[i].status.hang =
5412 adev->ip_blocks[i].version->funcs->check_soft_reset(
5413 &adev->ip_blocks[i]);
5414 if (adev->ip_blocks[i].status.hang) {
5415 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
5416 asic_hang = true;
5417 }
5418 }
5419 return asic_hang;
5420 }
5421
5422 /**
5423 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
5424 *
5425 * @adev: amdgpu_device pointer
5426 *
5427 * The list of all the hardware IPs that make up the asic is walked and the
5428 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
5429 * handles any IP specific hardware or software state changes that are
5430 * necessary for a soft reset to succeed.
5431 * Returns 0 on success, negative error code on failure.
5432 */
amdgpu_device_ip_pre_soft_reset(struct amdgpu_device * adev)5433 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
5434 {
5435 int i, r = 0;
5436
5437 for (i = 0; i < adev->num_ip_blocks; i++) {
5438 if (!adev->ip_blocks[i].status.valid)
5439 continue;
5440 if (adev->ip_blocks[i].status.hang &&
5441 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
5442 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]);
5443 if (r)
5444 return r;
5445 }
5446 }
5447
5448 return 0;
5449 }
5450
5451 /**
5452 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
5453 *
5454 * @adev: amdgpu_device pointer
5455 *
5456 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
5457 * reset is necessary to recover.
5458 * Returns true if a full asic reset is required, false if not.
5459 */
amdgpu_device_ip_need_full_reset(struct amdgpu_device * adev)5460 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
5461 {
5462 int i;
5463
5464 if (amdgpu_asic_need_full_reset(adev))
5465 return true;
5466
5467 for (i = 0; i < adev->num_ip_blocks; i++) {
5468 if (!adev->ip_blocks[i].status.valid)
5469 continue;
5470 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
5471 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
5472 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
5473 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
5474 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
5475 if (adev->ip_blocks[i].status.hang) {
5476 dev_info(adev->dev, "Some block need full reset!\n");
5477 return true;
5478 }
5479 }
5480 }
5481 return false;
5482 }
5483
5484 /**
5485 * amdgpu_device_ip_soft_reset - do a soft reset
5486 *
5487 * @adev: amdgpu_device pointer
5488 *
5489 * The list of all the hardware IPs that make up the asic is walked and the
5490 * soft_reset callbacks are run if the block is hung. soft_reset handles any
5491 * IP specific hardware or software state changes that are necessary to soft
5492 * reset the IP.
5493 * Returns 0 on success, negative error code on failure.
5494 */
amdgpu_device_ip_soft_reset(struct amdgpu_device * adev)5495 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
5496 {
5497 int i, r = 0;
5498
5499 for (i = 0; i < adev->num_ip_blocks; i++) {
5500 if (!adev->ip_blocks[i].status.valid)
5501 continue;
5502 if (adev->ip_blocks[i].status.hang &&
5503 adev->ip_blocks[i].version->funcs->soft_reset) {
5504 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]);
5505 if (r)
5506 return r;
5507 }
5508 }
5509
5510 return 0;
5511 }
5512
5513 /**
5514 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
5515 *
5516 * @adev: amdgpu_device pointer
5517 *
5518 * The list of all the hardware IPs that make up the asic is walked and the
5519 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
5520 * handles any IP specific hardware or software state changes that are
5521 * necessary after the IP has been soft reset.
5522 * Returns 0 on success, negative error code on failure.
5523 */
amdgpu_device_ip_post_soft_reset(struct amdgpu_device * adev)5524 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
5525 {
5526 int i, r = 0;
5527
5528 for (i = 0; i < adev->num_ip_blocks; i++) {
5529 if (!adev->ip_blocks[i].status.valid)
5530 continue;
5531 if (adev->ip_blocks[i].status.hang &&
5532 adev->ip_blocks[i].version->funcs->post_soft_reset)
5533 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]);
5534 if (r)
5535 return r;
5536 }
5537
5538 return 0;
5539 }
5540
5541 /**
5542 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5543 *
5544 * @adev: amdgpu_device pointer
5545 * @reset_context: amdgpu reset context pointer
5546 *
5547 * do VF FLR and reinitialize Asic
5548 * return 0 means succeeded otherwise failed
5549 */
static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
				     struct amdgpu_reset_context *reset_context)
{
	int r;
	struct amdgpu_hive_info *hive = NULL;

	if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
		/* Host-initiated FLR: signal readiness (unless a fatal error
		 * was already detected) and wait for the host to reset us,
		 * then re-acquire exclusive access.
		 */
		if (!amdgpu_ras_get_fed_status(adev))
			amdgpu_virt_ready_to_reset(adev);
		amdgpu_virt_wait_reset(adev);
		clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
		r = amdgpu_virt_request_full_gpu(adev, true);
	} else {
		/* Guest-initiated reset request */
		r = amdgpu_virt_reset_gpu(adev);
	}
	if (r)
		return r;

	amdgpu_ras_clear_err_state(adev);
	amdgpu_irq_gpu_reset_resume_helper(adev);

	/* some sw clean up VF needs to do before recover */
	amdgpu_virt_post_reset(adev);

	/* Resume IP prior to SMC */
	r = amdgpu_device_ip_reinit_early_sriov(adev);
	if (r)
		return r;

	amdgpu_virt_init_data_exchange(adev);

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	/* now we are okay to resume SMC/CP/SDMA */
	r = amdgpu_device_ip_reinit_late_sriov(adev);
	if (r)
		return r;

	hive = amdgpu_get_xgmi_hive(adev);
	/* Update PSP FW topology after reset */
	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
		r = amdgpu_xgmi_update_topology(hive, adev);
	if (hive)
		amdgpu_put_xgmi_hive(hive);
	if (r)
		return r;

	r = amdgpu_ib_ring_tests(adev);
	if (r)
		return r;

	/* Host tells us whether VRAM content survived the FLR */
	if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
		amdgpu_inc_vram_lost(adev);

	/* need to be called during full access so we can't do it later like
	 * bare-metal does.
	 */
	amdgpu_amdkfd_post_reset(adev);
	amdgpu_virt_release_full_gpu(adev, true);

	/* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
		amdgpu_ras_resume(adev);

	amdgpu_virt_ras_telemetry_post_reset(adev);

	return 0;
}
5624
5625 /**
5626 * amdgpu_device_has_job_running - check if there is any unfinished job
5627 *
5628 * @adev: amdgpu_device pointer
5629 *
5630 * check if there is any job running on the device when guest driver receives
5631 * FLR notification from host driver. If there are still jobs running, then
5632 * the guest driver will not respond the FLR reset. Instead, let the job hit
5633 * the timeout and guest driver then issue the reset request.
5634 */
amdgpu_device_has_job_running(struct amdgpu_device * adev)5635 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
5636 {
5637 int i;
5638
5639 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5640 struct amdgpu_ring *ring = adev->rings[i];
5641
5642 if (!amdgpu_ring_sched_ready(ring))
5643 continue;
5644
5645 if (amdgpu_fence_count_emitted(ring))
5646 return true;
5647 }
5648 return false;
5649 }
5650
5651 /**
5652 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
5653 *
5654 * @adev: amdgpu_device pointer
5655 *
5656 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
5657 * a hung GPU.
5658 */
bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
{

	/* Module parameter: 0 = recovery explicitly disabled */
	if (amdgpu_gpu_recovery == 0)
		goto disabled;

	/* Skip soft reset check in fatal error mode */
	if (!amdgpu_ras_is_poison_mode_supported(adev))
		return true;

	/* VFs always attempt recovery */
	if (amdgpu_sriov_vf(adev))
		return true;

	/* -1 means "auto": recovery is not attempted on these older asics */
	if (amdgpu_gpu_recovery == -1) {
		switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
		case CHIP_VERDE:
		case CHIP_TAHITI:
		case CHIP_PITCAIRN:
		case CHIP_OLAND:
		case CHIP_HAINAN:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
		case CHIP_KAVERI:
		case CHIP_KABINI:
		case CHIP_MULLINS:
#endif
		case CHIP_CARRIZO:
		case CHIP_STONEY:
		case CHIP_CYAN_SKILLFISH:
			goto disabled;
		default:
			break;
		}
	}

	return true;

disabled:
	dev_info(adev->dev, "GPU recovery disabled.\n");
	return false;
}
5701
/* Perform a full-chip ("mode 1") reset via SMU or PSP, then restore PCI
 * config space and wait for the asic to come back.
 * Returns 0 on success, negative error code on failure.
 */
int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
{
	u32 i;
	int ret = 0;

	/* Flag the engine as hung in the atombios scratch regs */
	if (adev->bios)
		amdgpu_atombios_scratch_regs_engine_hung(adev, true);

	dev_info(adev->dev, "GPU mode1 reset\n");

	/* Cache the state before bus master disable. The saved config space
	 * values are used in other cases like restore after mode-2 reset.
	 */
	amdgpu_device_cache_pci_state(adev->pdev);

	/* disable BM */
	pci_clear_master(adev->pdev);

	/* Prefer the SMU-driven reset when firmware supports it */
	if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
		dev_info(adev->dev, "GPU smu mode1 reset\n");
		ret = amdgpu_dpm_mode1_reset(adev);
	} else {
		dev_info(adev->dev, "GPU psp mode1 reset\n");
		ret = psp_gpu_reset(adev);
	}

	if (ret)
		goto mode1_reset_failed;

	/* enable mmio access after mode 1 reset completed */
	adev->no_hw_access = false;

	/* ensure no_hw_access is updated before we access hw */
	smp_mb();

	amdgpu_device_load_pci_state(adev->pdev);
	ret = amdgpu_psp_wait_for_bootloader(adev);
	if (ret)
		goto mode1_reset_failed;

	/* wait for asic to come out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		u32 memsize = adev->nbio.funcs->get_memsize(adev);

		/* all-ones reads mean the asic is still in reset */
		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}

	if (i >= adev->usec_timeout) {
		ret = -ETIMEDOUT;
		goto mode1_reset_failed;
	}

	/* Clear the engine-hung flag now that the asic is back */
	if (adev->bios)
		amdgpu_atombios_scratch_regs_engine_hung(adev, false);

	return 0;

mode1_reset_failed:
	dev_err(adev->dev, "GPU mode1 reset failed\n");
	return ret;
}
5765
amdgpu_device_link_reset(struct amdgpu_device * adev)5766 int amdgpu_device_link_reset(struct amdgpu_device *adev)
5767 {
5768 int ret = 0;
5769
5770 dev_info(adev->dev, "GPU link reset\n");
5771
5772 if (!amdgpu_reset_in_dpc(adev))
5773 ret = amdgpu_dpm_link_reset(adev);
5774
5775 if (ret)
5776 goto link_reset_failed;
5777
5778 ret = amdgpu_psp_wait_for_bootloader(adev);
5779 if (ret)
5780 goto link_reset_failed;
5781
5782 return 0;
5783
5784 link_reset_failed:
5785 dev_err(adev->dev, "GPU link reset failed\n");
5786 return ret;
5787 }
5788
/* Prepare one device for asic reset: quiesce rings, try soft reset when
 * possible, dump IP state, and suspend IPs when a full reset is coming.
 * Updates AMDGPU_NEED_FULL_RESET in reset_context->flags for the caller.
 */
int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
				 struct amdgpu_reset_context *reset_context)
{
	int i, r = 0;
	struct amdgpu_job *job = NULL;
	struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
	bool need_full_reset =
		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

	/* The guilty job is only meaningful on the device that requested reset */
	if (reset_context->reset_req_dev == adev)
		job = reset_context->job;

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_pre_reset(adev);

	/* Mask fence interrupts while we force-complete hw fences */
	amdgpu_fence_driver_isr_toggle(adev, true);

	/* block all schedulers and reset given job's ring */
	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!amdgpu_ring_sched_ready(ring))
			continue;

		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
		amdgpu_fence_driver_force_completion(ring);
	}

	amdgpu_fence_driver_isr_toggle(adev, false);

	if (job && job->vm)
		drm_sched_increase_karma(&job->base);

	r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
	/* If reset handler not implemented, continue; otherwise return */
	if (r == -EOPNOTSUPP)
		r = 0;
	else
		return r;

	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
	if (!amdgpu_sriov_vf(adev)) {

		if (!need_full_reset)
			need_full_reset = amdgpu_device_ip_need_full_reset(adev);

		/* Try the lighter soft reset first; escalate to full reset
		 * if it fails or the hang persists afterwards.
		 */
		if (!need_full_reset && amdgpu_gpu_recovery &&
		    amdgpu_device_ip_check_soft_reset(adev)) {
			amdgpu_device_ip_pre_soft_reset(adev);
			r = amdgpu_device_ip_soft_reset(adev);
			amdgpu_device_ip_post_soft_reset(adev);
			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
				need_full_reset = true;
			}
		}

		if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
			dev_info(tmp_adev->dev, "Dumping IP State\n");
			/* Trigger ip dump before we reset the asic */
			for (i = 0; i < tmp_adev->num_ip_blocks; i++)
				if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
					tmp_adev->ip_blocks[i].version->funcs
						->dump_ip_state((void *)&tmp_adev->ip_blocks[i]);
			dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
		}

		if (need_full_reset)
			r = amdgpu_device_ip_suspend(adev);
		/* Publish the (possibly escalated) decision to the caller */
		if (need_full_reset)
			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
		else
			clear_bit(AMDGPU_NEED_FULL_RESET,
				  &reset_context->flags);
	}

	return r;
}
5867
/* Re-initialize every device on the reset list after the asic reset has
 * been performed: re-post, resume IPs in phases, restore partition mode,
 * resume KFD/userq/RAS and run IB tests. Note the 'out:' label is inside
 * the list loop: a failure for one device records asic_reset_res and the
 * loop moves on to the next device (except ib-test failure, which aborts).
 */
int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
{
	struct list_head *device_list_handle;
	bool full_reset, vram_lost = false;
	struct amdgpu_device *tmp_adev;
	int r, init_level;

	device_list_handle = reset_context->reset_device_list;

	if (!device_list_handle)
		return -EINVAL;

	full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

	/**
	 * If it's reset on init, it's default init level, otherwise keep level
	 * as recovery level.
	 */
	if (reset_context->method == AMD_RESET_METHOD_ON_INIT)
		init_level = AMDGPU_INIT_LEVEL_DEFAULT;
	else
		init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY;

	r = 0;
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		amdgpu_set_init_level(tmp_adev, init_level);
		if (full_reset) {
			/* post card */
			amdgpu_reset_set_dpc_status(tmp_adev, false);
			amdgpu_ras_clear_err_state(tmp_adev);
			r = amdgpu_device_asic_init(tmp_adev);
			if (r) {
				dev_warn(tmp_adev->dev, "asic atom init failed!");
			} else {
				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");

				r = amdgpu_device_ip_resume_phase1(tmp_adev);
				if (r)
					goto out;

				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

				if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
					amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);

				if (vram_lost) {
					dev_info(
						tmp_adev->dev,
						"VRAM is lost due to GPU reset!\n");
					amdgpu_inc_vram_lost(tmp_adev);
				}

				/* NOTE(review): this returns directly instead of
				 * goto out, skipping the asic_reset_res bookkeeping
				 * done for the other failure paths — confirm intended.
				 */
				r = amdgpu_device_fw_loading(tmp_adev);
				if (r)
					return r;

				r = amdgpu_xcp_restore_partition_mode(
					tmp_adev->xcp_mgr);
				if (r)
					goto out;

				r = amdgpu_device_ip_resume_phase2(tmp_adev);
				if (r)
					goto out;

				/* Re-enable SDMA-backed buffer moves if its ring is up */
				if (tmp_adev->mman.buffer_funcs_ring->sched.ready)
					amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);

				r = amdgpu_device_ip_resume_phase3(tmp_adev);
				if (r)
					goto out;

				if (vram_lost)
					amdgpu_device_fill_reset_magic(tmp_adev);

				/*
				 * Add this ASIC as tracked as reset was already
				 * complete successfully.
				 */
				amdgpu_register_gpu_instance(tmp_adev);

				if (!reset_context->hive &&
				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					amdgpu_xgmi_add_device(tmp_adev);

				r = amdgpu_device_ip_late_init(tmp_adev);
				if (r)
					goto out;

				r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
				if (r)
					goto out;

				drm_client_dev_resume(adev_to_drm(tmp_adev));

				/*
				 * The GPU enters bad state once faulty pages
				 * by ECC has reached the threshold, and ras
				 * recovery is scheduled next. So add one check
				 * here to break recovery if it indeed exceeds
				 * bad page threshold, and remind user to
				 * retire this GPU or setting one bigger
				 * bad_page_threshold value to fix this once
				 * probing driver again.
				 */
				if (!amdgpu_ras_is_rma(tmp_adev)) {
					/* must succeed. */
					amdgpu_ras_resume(tmp_adev);
				} else {
					r = -EINVAL;
					goto out;
				}

				/* Update PSP FW topology after reset */
				if (reset_context->hive &&
				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					r = amdgpu_xgmi_update_topology(
						reset_context->hive, tmp_adev);
			}
		}

out:
		if (!r) {
			/* IP init is complete now, set level as default */
			amdgpu_set_init_level(tmp_adev,
					      AMDGPU_INIT_LEVEL_DEFAULT);
			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
			r = amdgpu_ib_ring_tests(tmp_adev);
			if (r) {
				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
				r = -EAGAIN;
				goto end;
			}
		}

		if (r)
			tmp_adev->asic_reset_res = r;
	}

end:
	return r;
}
6010
/**
 * amdgpu_do_asic_reset - reset the ASIC(s) and bring the IP blocks back up
 * @device_list_handle: list of devices to reset (a single device, or a
 *                      whole XGMI hive with the initiating device first)
 * @reset_context: reset flags and shared state for this recovery attempt
 *
 * Tries a dedicated reset handler first and falls back to the default
 * flow (per-device ASIC reset, then re-initialization) when no handler
 * is implemented.
 *
 * Returns 0 on success or a negative error code.
 */
int amdgpu_do_asic_reset(struct list_head *device_list_handle,
			 struct amdgpu_reset_context *reset_context)
{
	struct amdgpu_device *tmp_adev = NULL;
	bool need_full_reset, skip_hw_reset;
	int r = 0;

	/* Try reset handler method first */
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);

	reset_context->reset_device_list = device_list_handle;
	r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
	/* If reset handler not implemented, continue; otherwise return */
	if (r == -EOPNOTSUPP)
		r = 0;
	else
		return r;

	/* Reset handler not implemented, use the default method */
	need_full_reset =
		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);

	/*
	 * ASIC reset has to be done on all XGMI hive nodes ASAP
	 * to allow proper links negotiation in FW (within 1 sec)
	 */
	if (!skip_hw_reset && need_full_reset) {
		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
			/* For XGMI run all resets in parallel to speed up the process */
			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
				/* queue_work() returning false means the work was already queued */
				if (!queue_work(system_unbound_wq,
						&tmp_adev->xgmi_reset_work))
					r = -EALREADY;
			} else
				r = amdgpu_asic_reset(tmp_adev);

			if (r) {
				dev_err(tmp_adev->dev,
					"ASIC reset failed with error, %d for drm dev, %s",
					r, adev_to_drm(tmp_adev)->unique);
				goto out;
			}
		}

		/* For XGMI wait for all resets to complete before proceed */
		if (!r) {
			list_for_each_entry(tmp_adev, device_list_handle,
					    reset_list) {
				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
					flush_work(&tmp_adev->xgmi_reset_work);
					/* result of the parallel reset is stored per device */
					r = tmp_adev->asic_reset_res;
					if (r)
						break;
				}
			}
		}
	}

	if (!r && amdgpu_ras_intr_triggered()) {
		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
			amdgpu_ras_reset_error_count(tmp_adev,
						     AMDGPU_RAS_BLOCK__MMHUB);
		}

		amdgpu_ras_intr_cleared();
	}

	r = amdgpu_device_reinit_after_reset(reset_context);
	/* -EAGAIN from reinit asks the caller to retry with a full reset */
	if (r == -EAGAIN)
		set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	else
		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

out:
	return r;
}
6089
amdgpu_device_set_mp1_state(struct amdgpu_device * adev)6090 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
6091 {
6092
6093 switch (amdgpu_asic_reset_method(adev)) {
6094 case AMD_RESET_METHOD_MODE1:
6095 case AMD_RESET_METHOD_LINK:
6096 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
6097 break;
6098 case AMD_RESET_METHOD_MODE2:
6099 adev->mp1_state = PP_MP1_STATE_RESET;
6100 break;
6101 default:
6102 adev->mp1_state = PP_MP1_STATE_NONE;
6103 break;
6104 }
6105 }
6106
amdgpu_device_unset_mp1_state(struct amdgpu_device * adev)6107 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
6108 {
6109 amdgpu_vf_error_trans_all(adev);
6110 adev->mp1_state = PP_MP1_STATE_NONE;
6111 }
6112
amdgpu_device_resume_display_audio(struct amdgpu_device * adev)6113 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
6114 {
6115 struct pci_dev *p = NULL;
6116
6117 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
6118 adev->pdev->bus->number, 1);
6119 if (p) {
6120 pm_runtime_enable(&(p->dev));
6121 pm_runtime_resume(&(p->dev));
6122 }
6123
6124 pci_dev_put(p);
6125 }
6126
/**
 * amdgpu_device_suspend_display_audio - runtime-suspend the audio function
 * @adev: amdgpu_device pointer
 *
 * Puts the device at devfn 1 on the GPU's bus/slot (the display audio
 * function) into runtime suspend before a GPU reset, and disables its
 * runtime PM so it stays suspended across the reset. Only done for the
 * BACO and mode1 reset methods.
 *
 * Returns 0 on success, -EINVAL for other reset methods, -ENODEV if the
 * audio device cannot be found, or -ETIMEDOUT if suspend did not
 * complete in time.
 */
static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
{
	enum amd_reset_method reset_method;
	struct pci_dev *p = NULL;
	u64 expires;

	/*
	 * For now, only BACO and mode1 reset are confirmed
	 * to suffer the audio issue without proper suspended.
	 */
	reset_method = amdgpu_asic_reset_method(adev);
	if ((reset_method != AMD_RESET_METHOD_BACO) &&
	    (reset_method != AMD_RESET_METHOD_MODE1))
		return -EINVAL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
					adev->pdev->bus->number, 1);
	if (!p)
		return -ENODEV;

	expires = pm_runtime_autosuspend_expiration(&(p->dev));
	if (!expires)
		/*
		 * If we cannot get the audio device autosuspend delay,
		 * a fixed 4S interval will be used. Considering 3S is
		 * the audio controller default autosuspend delay setting.
		 * 4S used here is guaranteed to cover that.
		 */
		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;

	/* Poll until the device reports runtime-suspended or the deadline hits. */
	while (!pm_runtime_status_suspended(&(p->dev))) {
		if (!pm_runtime_suspend(&(p->dev)))
			break;

		if (expires < ktime_get_mono_fast_ns()) {
			dev_warn(adev->dev, "failed to suspend display audio\n");
			pci_dev_put(p);
			/* TODO: abort the succeeding gpu reset? */
			return -ETIMEDOUT;
		}
	}

	/* Keep the audio device suspended for the duration of the reset. */
	pm_runtime_disable(&(p->dev));

	pci_dev_put(p);
	return 0;
}
6174
amdgpu_device_stop_pending_resets(struct amdgpu_device * adev)6175 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
6176 {
6177 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
6178
6179 #if defined(CONFIG_DEBUG_FS)
6180 if (!amdgpu_sriov_vf(adev))
6181 cancel_work(&adev->reset_work);
6182 #endif
6183 cancel_work(&adev->userq_reset_work);
6184
6185 if (adev->kfd.dev)
6186 cancel_work(&adev->kfd.reset_work);
6187
6188 if (amdgpu_sriov_vf(adev))
6189 cancel_work(&adev->virt.flr_work);
6190
6191 if (con && adev->ras_enabled)
6192 cancel_work(&con->recovery_work);
6193
6194 }
6195
amdgpu_device_health_check(struct list_head * device_list_handle)6196 static int amdgpu_device_health_check(struct list_head *device_list_handle)
6197 {
6198 struct amdgpu_device *tmp_adev;
6199 int ret = 0;
6200
6201 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6202 ret |= amdgpu_device_bus_status_check(tmp_adev);
6203 }
6204
6205 return ret;
6206 }
6207
/**
 * amdgpu_device_recovery_prepare - build the list of devices to reset
 * @adev: device that triggered the recovery
 * @device_list: output list head, filled with the devices to reset
 * @hive: XGMI hive @adev belongs to, or NULL
 *
 * On bare metal with an XGMI hive, every hive member is queued and the
 * list is rotated so @adev is processed first; otherwise only @adev is
 * added.
 */
static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
					   struct list_head *device_list,
					   struct amdgpu_hive_info *hive)
{
	struct amdgpu_device *tmp_adev = NULL;

	/*
	 * Build list of devices to reset.
	 * In case we are in XGMI hive mode, resort the device list
	 * to put adev in the 1st position.
	 */
	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			list_add_tail(&tmp_adev->reset_list, device_list);
			/* propagate shutdown / DPC link-reset state to every hive member */
			if (adev->shutdown)
				tmp_adev->shutdown = true;
			if (amdgpu_reset_in_dpc(adev))
				tmp_adev->pcie_reset_ctx.in_link_reset = true;
		}
		if (!list_is_first(&adev->reset_list, device_list))
			list_rotate_to_front(&adev->reset_list, device_list);
	} else {
		list_add_tail(&adev->reset_list, device_list);
	}
}
6233
amdgpu_device_recovery_get_reset_lock(struct amdgpu_device * adev,struct list_head * device_list)6234 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev,
6235 struct list_head *device_list)
6236 {
6237 struct amdgpu_device *tmp_adev = NULL;
6238
6239 if (list_empty(device_list))
6240 return;
6241 tmp_adev =
6242 list_first_entry(device_list, struct amdgpu_device, reset_list);
6243 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
6244 }
6245
amdgpu_device_recovery_put_reset_lock(struct amdgpu_device * adev,struct list_head * device_list)6246 static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev,
6247 struct list_head *device_list)
6248 {
6249 struct amdgpu_device *tmp_adev = NULL;
6250
6251 if (list_empty(device_list))
6252 return;
6253 tmp_adev =
6254 list_first_entry(device_list, struct amdgpu_device, reset_list);
6255 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
6256 }
6257
/**
 * amdgpu_device_halt_activities - quiesce all devices before a reset
 * @adev: device that triggered the recovery
 * @job: the hung job that triggered the reset, or NULL
 * @reset_context: reset flags and shared state
 * @device_list: devices to quiesce (hive members or just @adev)
 * @hive: XGMI hive, or NULL
 * @need_emergency_restart: true when jobs are being stopped for an
 *                          emergency restart rather than a normal reset
 *
 * Suspends audio, cancels delayed work, detaches KFD, suspends the DRM
 * client and RAS, and stops every ring scheduler. The ordering below is
 * deliberate — e.g. audio must be suspended before the reset proper
 * starts (see comment in the loop).
 */
static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
					  struct amdgpu_job *job,
					  struct amdgpu_reset_context *reset_context,
					  struct list_head *device_list,
					  struct amdgpu_hive_info *hive,
					  bool need_emergency_restart)
{
	struct amdgpu_device *tmp_adev = NULL;
	int i;

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list, reset_list) {
		amdgpu_device_set_mp1_state(tmp_adev);

		/*
		 * Try to put the audio codec into suspend state
		 * before gpu reset started.
		 *
		 * Due to the power domain of the graphics device
		 * is shared with AZ power domain. Without this,
		 * we may change the audio hardware from behind
		 * the audio driver's back. That will trigger
		 * some audio codec errors.
		 */
		if (!amdgpu_device_suspend_display_audio(tmp_adev))
			tmp_adev->pcie_reset_ctx.audio_suspended = true;

		amdgpu_ras_set_error_query_ready(tmp_adev, false);

		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

		amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);

		/*
		 * Mark these ASICs to be reset as untracked first
		 * And add them back after reset completed
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		drm_client_dev_suspend(adev_to_drm(tmp_adev));

		/* disable ras on ALL IPs */
		if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) &&
		    amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		amdgpu_userq_pre_reset(tmp_adev);

		/* Stop every ready ring scheduler; hand the guilty job to drm_sched. */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!amdgpu_ring_sched_ready(ring))
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			/* For an emergency restart no jobs will be resumed later. */
			if (need_emergency_restart)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
		atomic_inc(&tmp_adev->gpu_reset_counter);
	}
}
6320
amdgpu_device_asic_reset(struct amdgpu_device * adev,struct list_head * device_list,struct amdgpu_reset_context * reset_context)6321 static int amdgpu_device_asic_reset(struct amdgpu_device *adev,
6322 struct list_head *device_list,
6323 struct amdgpu_reset_context *reset_context)
6324 {
6325 struct amdgpu_device *tmp_adev = NULL;
6326 int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
6327 int r = 0;
6328
6329 retry: /* Rest of adevs pre asic reset from XGMI hive. */
6330 list_for_each_entry(tmp_adev, device_list, reset_list) {
6331 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
6332 /*TODO Should we stop ?*/
6333 if (r) {
6334 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
6335 r, adev_to_drm(tmp_adev)->unique);
6336 tmp_adev->asic_reset_res = r;
6337 }
6338 }
6339
6340 /* Actual ASIC resets if needed.*/
6341 /* Host driver will handle XGMI hive reset for SRIOV */
6342 if (amdgpu_sriov_vf(adev)) {
6343
6344 /* Bail out of reset early */
6345 if (amdgpu_ras_is_rma(adev))
6346 return -ENODEV;
6347
6348 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
6349 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
6350 amdgpu_ras_set_fed(adev, true);
6351 set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
6352 }
6353
6354 r = amdgpu_device_reset_sriov(adev, reset_context);
6355 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
6356 amdgpu_virt_release_full_gpu(adev, true);
6357 goto retry;
6358 }
6359 if (r)
6360 adev->asic_reset_res = r;
6361 } else {
6362 r = amdgpu_do_asic_reset(device_list, reset_context);
6363 if (r && r == -EAGAIN)
6364 goto retry;
6365 }
6366
6367 list_for_each_entry(tmp_adev, device_list, reset_list) {
6368 /*
6369 * Drop any pending non scheduler resets queued before reset is done.
6370 * Any reset scheduled after this point would be valid. Scheduler resets
6371 * were already dropped during drm_sched_stop and no new ones can come
6372 * in before drm_sched_start.
6373 */
6374 amdgpu_device_stop_pending_resets(tmp_adev);
6375 }
6376
6377 return r;
6378 }
6379
/**
 * amdgpu_device_sched_resume - restart the schedulers after a reset
 * @device_list: devices whose rings should be restarted
 * @reset_context: reset flags and shared state
 * @job_signaled: true when the guilty job signaled and HW reset was skipped
 *
 * Restarts every ready ring scheduler, reports per-device reset results,
 * and propagates the first nonzero asic_reset_res as the return value.
 *
 * Returns 0 when every device recovered, otherwise the first failure.
 */
static int amdgpu_device_sched_resume(struct list_head *device_list,
				      struct amdgpu_reset_context *reset_context,
				      bool job_signaled)
{
	struct amdgpu_device *tmp_adev = NULL;
	int i, r = 0;

	/* Post ASIC reset for all devs .*/
	list_for_each_entry(tmp_adev, device_list, reset_list) {

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!amdgpu_ring_sched_ready(ring))
				continue;

			drm_sched_start(&ring->sched, 0);
		}

		/* legacy (non-atomic) display paths need an explicit mode restore */
		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));

		if (tmp_adev->asic_reset_res) {
			/* bad news, how to tell it to userspace ?
			 * for ras error, we should report GPU bad status instead of
			 * reset failure
			 */
			if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
			    !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
				dev_info(
					tmp_adev->dev,
					"GPU reset(%d) failed with error %d\n",
					atomic_read(
						&tmp_adev->gpu_reset_counter),
					tmp_adev->asic_reset_res);
			amdgpu_vf_error_put(tmp_adev,
					    AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0,
					    tmp_adev->asic_reset_res);
			/* keep only the first failure; clear the per-device result */
			if (!r)
				r = tmp_adev->asic_reset_res;
			tmp_adev->asic_reset_res = 0;
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n",
				 atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(tmp_adev,
							   AMDGPU_SS_DEV_D0))
				dev_warn(tmp_adev->dev,
					 "smart shift update failed\n");
		}
	}

	return r;
}
6433
/**
 * amdgpu_device_gpu_resume - final per-device cleanup after recovery
 * @adev: device that triggered the recovery
 * @device_list: devices being recovered
 * @need_emergency_restart: true when this was an emergency jobs-stop
 *
 * Reattaches KFD, resumes display audio, clears the MP1 state, and
 * re-enables RAS error queries for every device in the list.
 */
static void amdgpu_device_gpu_resume(struct amdgpu_device *adev,
				     struct list_head *device_list,
				     bool need_emergency_restart)
{
	struct amdgpu_device *tmp_adev = NULL;

	list_for_each_entry(tmp_adev, device_list, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/* kfd_post_reset will do nothing if kfd device is not initialized,
		 * need to bring up kfd here if it's not be initialized before
		 *
		 * NOTE(review): this checks and initializes @adev, not
		 * @tmp_adev, from inside the per-device loop — confirm
		 * whether other hive members need the same handling.
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		/* undo the suspend done in amdgpu_device_halt_activities() */
		if (tmp_adev->pcie_reset_ctx.audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);

		amdgpu_device_unset_mp1_state(tmp_adev);

		amdgpu_ras_set_error_query_ready(tmp_adev, true);

	}
}
6460
6461
6462 /**
6463 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
6464 *
6465 * @adev: amdgpu_device pointer
6466 * @job: which job trigger hang
6467 * @reset_context: amdgpu reset context pointer
6468 *
6469 * Attempt to reset the GPU if it has hung (all asics).
6470 * Attempt to do soft-reset or full-reset and reinitialize Asic
6471 * Returns 0 for success or an error on failure.
6472 */
6473
amdgpu_device_gpu_recover(struct amdgpu_device * adev,struct amdgpu_job * job,struct amdgpu_reset_context * reset_context)6474 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
6475 struct amdgpu_job *job,
6476 struct amdgpu_reset_context *reset_context)
6477 {
6478 struct list_head device_list;
6479 bool job_signaled = false;
6480 struct amdgpu_hive_info *hive = NULL;
6481 int r = 0;
6482 bool need_emergency_restart = false;
6483 /* save the pasid here as the job may be freed before the end of the reset */
6484 int pasid = job ? job->pasid : -EINVAL;
6485
6486 /*
6487 * If it reaches here because of hang/timeout and a RAS error is
6488 * detected at the same time, let RAS recovery take care of it.
6489 */
6490 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) &&
6491 !amdgpu_sriov_vf(adev) &&
6492 reset_context->src != AMDGPU_RESET_SRC_RAS) {
6493 dev_dbg(adev->dev,
6494 "Gpu recovery from source: %d yielding to RAS error recovery handling",
6495 reset_context->src);
6496 return 0;
6497 }
6498
6499 /*
6500 * Special case: RAS triggered and full reset isn't supported
6501 */
6502 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
6503
6504 /*
6505 * Flush RAM to disk so that after reboot
6506 * the user can read log and see why the system rebooted.
6507 */
6508 if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
6509 amdgpu_ras_get_context(adev)->reboot) {
6510 dev_warn(adev->dev, "Emergency reboot.");
6511
6512 ksys_sync_helper();
6513 emergency_restart();
6514 }
6515
6516 dev_info(adev->dev, "GPU %s begin!. Source: %d\n",
6517 need_emergency_restart ? "jobs stop" : "reset",
6518 reset_context->src);
6519
6520 if (!amdgpu_sriov_vf(adev))
6521 hive = amdgpu_get_xgmi_hive(adev);
6522 if (hive)
6523 mutex_lock(&hive->hive_lock);
6524
6525 reset_context->job = job;
6526 reset_context->hive = hive;
6527 INIT_LIST_HEAD(&device_list);
6528
6529 amdgpu_device_recovery_prepare(adev, &device_list, hive);
6530
6531 if (!amdgpu_sriov_vf(adev)) {
6532 r = amdgpu_device_health_check(&device_list);
6533 if (r)
6534 goto end_reset;
6535 }
6536
6537 /* Cannot be called after locking reset domain */
6538 amdgpu_ras_pre_reset(adev, &device_list);
6539
6540 /* We need to lock reset domain only once both for XGMI and single device */
6541 amdgpu_device_recovery_get_reset_lock(adev, &device_list);
6542
6543 amdgpu_device_halt_activities(adev, job, reset_context, &device_list,
6544 hive, need_emergency_restart);
6545 if (need_emergency_restart)
6546 goto skip_sched_resume;
6547 /*
6548 * Must check guilty signal here since after this point all old
6549 * HW fences are force signaled.
6550 *
6551 * job->base holds a reference to parent fence
6552 */
6553 if (job && (dma_fence_get_status(&job->hw_fence->base) > 0)) {
6554 job_signaled = true;
6555 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
6556 goto skip_hw_reset;
6557 }
6558
6559 r = amdgpu_device_asic_reset(adev, &device_list, reset_context);
6560 if (r)
6561 goto reset_unlock;
6562 skip_hw_reset:
6563 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled);
6564 if (r)
6565 goto reset_unlock;
6566 skip_sched_resume:
6567 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart);
6568 reset_unlock:
6569 amdgpu_device_recovery_put_reset_lock(adev, &device_list);
6570 amdgpu_ras_post_reset(adev, &device_list);
6571 end_reset:
6572 if (hive) {
6573 mutex_unlock(&hive->hive_lock);
6574 amdgpu_put_xgmi_hive(hive);
6575 }
6576
6577 if (r)
6578 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
6579
6580 atomic_set(&adev->reset_domain->reset_res, r);
6581
6582 if (!r) {
6583 struct amdgpu_task_info *ti = NULL;
6584
6585 /*
6586 * The job may already be freed at this point via the sched tdr workqueue so
6587 * use the cached pasid.
6588 */
6589 if (pasid >= 0)
6590 ti = amdgpu_vm_get_task_info_pasid(adev, pasid);
6591
6592 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE,
6593 ti ? &ti->task : NULL);
6594
6595 amdgpu_vm_put_task_info(ti);
6596 }
6597
6598 return r;
6599 }
6600
6601 /**
6602 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner
6603 *
6604 * @adev: amdgpu_device pointer
6605 * @speed: pointer to the speed of the link
6606 * @width: pointer to the width of the link
6607 *
6608 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
6609 * first physical partner to an AMD dGPU.
6610 * This will exclude any virtual switches and links.
6611 */
amdgpu_device_partner_bandwidth(struct amdgpu_device * adev,enum pci_bus_speed * speed,enum pcie_link_width * width)6612 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
6613 enum pci_bus_speed *speed,
6614 enum pcie_link_width *width)
6615 {
6616 struct pci_dev *parent = adev->pdev;
6617
6618 if (!speed || !width)
6619 return;
6620
6621 *speed = PCI_SPEED_UNKNOWN;
6622 *width = PCIE_LNK_WIDTH_UNKNOWN;
6623
6624 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
6625 while ((parent = pci_upstream_bridge(parent))) {
6626 /* skip upstream/downstream switches internal to dGPU*/
6627 if (parent->vendor == PCI_VENDOR_ID_ATI)
6628 continue;
6629 *speed = pcie_get_speed_cap(parent);
6630 *width = pcie_get_width_cap(parent);
6631 break;
6632 }
6633 } else {
6634 /* use the current speeds rather than max if switching is not supported */
6635 pcie_bandwidth_available(adev->pdev, NULL, speed, width);
6636 }
6637 }
6638
6639 /**
6640 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
6641 *
6642 * @adev: amdgpu_device pointer
6643 * @speed: pointer to the speed of the link
6644 * @width: pointer to the width of the link
6645 *
6646 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
6647 * AMD dGPU which may be a virtual upstream bridge.
6648 */
amdgpu_device_gpu_bandwidth(struct amdgpu_device * adev,enum pci_bus_speed * speed,enum pcie_link_width * width)6649 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
6650 enum pci_bus_speed *speed,
6651 enum pcie_link_width *width)
6652 {
6653 struct pci_dev *parent = adev->pdev;
6654
6655 if (!speed || !width)
6656 return;
6657
6658 parent = pci_upstream_bridge(parent);
6659 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
6660 /* use the upstream/downstream switches internal to dGPU */
6661 *speed = pcie_get_speed_cap(parent);
6662 *width = pcie_get_width_cap(parent);
6663 while ((parent = pci_upstream_bridge(parent))) {
6664 if (parent->vendor == PCI_VENDOR_ID_ATI) {
6665 /* use the upstream/downstream switches internal to dGPU */
6666 *speed = pcie_get_speed_cap(parent);
6667 *width = pcie_get_width_cap(parent);
6668 }
6669 }
6670 } else {
6671 /* use the device itself */
6672 *speed = pcie_get_speed_cap(adev->pdev);
6673 *width = pcie_get_width_cap(adev->pdev);
6674 }
6675 }
6676
6677 /**
6678 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
6679 *
6680 * @adev: amdgpu_device pointer
6681 *
6682 * Fetches and stores in the driver the PCIE capabilities (gen speed
6683 * and lanes) of the slot the device is in. Handles APUs and
6684 * virtualized environments where PCIE config space may not be available.
6685 */
amdgpu_device_get_pcie_info(struct amdgpu_device * adev)6686 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
6687 {
6688 enum pci_bus_speed speed_cap, platform_speed_cap;
6689 enum pcie_link_width platform_link_width, link_width;
6690
6691 if (amdgpu_pcie_gen_cap)
6692 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
6693
6694 if (amdgpu_pcie_lane_cap)
6695 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
6696
6697 /* covers APUs as well */
6698 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
6699 if (adev->pm.pcie_gen_mask == 0)
6700 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
6701 if (adev->pm.pcie_mlw_mask == 0)
6702 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
6703 return;
6704 }
6705
6706 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
6707 return;
6708
6709 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
6710 &platform_link_width);
6711 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width);
6712
6713 if (adev->pm.pcie_gen_mask == 0) {
6714 /* asic caps */
6715 if (speed_cap == PCI_SPEED_UNKNOWN) {
6716 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6717 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6718 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6719 } else {
6720 if (speed_cap == PCIE_SPEED_32_0GT)
6721 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6722 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6723 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6724 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6725 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
6726 else if (speed_cap == PCIE_SPEED_16_0GT)
6727 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6728 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6729 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6730 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
6731 else if (speed_cap == PCIE_SPEED_8_0GT)
6732 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6733 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6734 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6735 else if (speed_cap == PCIE_SPEED_5_0GT)
6736 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6737 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
6738 else
6739 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
6740 }
6741 /* platform caps */
6742 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
6743 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6744 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6745 } else {
6746 if (platform_speed_cap == PCIE_SPEED_32_0GT)
6747 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6748 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6749 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6750 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6751 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
6752 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
6753 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6754 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6755 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6756 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
6757 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
6758 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6759 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6760 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
6761 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
6762 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6763 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6764 else
6765 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
6766
6767 }
6768 }
6769 if (adev->pm.pcie_mlw_mask == 0) {
6770 /* asic caps */
6771 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6772 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK;
6773 } else {
6774 switch (link_width) {
6775 case PCIE_LNK_X32:
6776 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 |
6777 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 |
6778 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
6779 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6780 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6781 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6782 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6783 break;
6784 case PCIE_LNK_X16:
6785 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 |
6786 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
6787 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6788 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6789 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6790 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6791 break;
6792 case PCIE_LNK_X12:
6793 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
6794 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6795 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6796 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6797 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6798 break;
6799 case PCIE_LNK_X8:
6800 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6801 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6802 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6803 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6804 break;
6805 case PCIE_LNK_X4:
6806 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6807 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6808 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6809 break;
6810 case PCIE_LNK_X2:
6811 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6812 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6813 break;
6814 case PCIE_LNK_X1:
6815 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1;
6816 break;
6817 default:
6818 break;
6819 }
6820 }
6821 /* platform caps */
6822 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6823 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
6824 } else {
6825 switch (platform_link_width) {
6826 case PCIE_LNK_X32:
6827 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
6828 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6829 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6830 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6831 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6832 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6833 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6834 break;
6835 case PCIE_LNK_X16:
6836 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6837 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6838 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6839 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6840 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6841 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6842 break;
6843 case PCIE_LNK_X12:
6844 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6845 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6846 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6847 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6848 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6849 break;
6850 case PCIE_LNK_X8:
6851 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6852 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6853 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6854 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6855 break;
6856 case PCIE_LNK_X4:
6857 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6858 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6859 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6860 break;
6861 case PCIE_LNK_X2:
6862 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6863 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6864 break;
6865 case PCIE_LNK_X1:
6866 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
6867 break;
6868 default:
6869 break;
6870 }
6871 }
6872 }
6873 }
6874
6875 /**
6876 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
6877 *
6878 * @adev: amdgpu_device pointer
6879 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
6880 *
6881 * Return true if @peer_adev can access (DMA) @adev through the PCIe
6882 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
6883 * @peer_adev.
6884 */
amdgpu_device_is_peer_accessible(struct amdgpu_device * adev,struct amdgpu_device * peer_adev)6885 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
6886 struct amdgpu_device *peer_adev)
6887 {
6888 #ifdef CONFIG_HSA_AMD_P2P
6889 bool p2p_access =
6890 !adev->gmc.xgmi.connected_to_cpu &&
6891 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
6892 if (!p2p_access)
6893 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n",
6894 pci_name(peer_adev->pdev));
6895
6896 bool is_large_bar = adev->gmc.visible_vram_size &&
6897 adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
6898 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev);
6899
6900 if (!p2p_addressable) {
6901 uint64_t address_mask = peer_adev->dev->dma_mask ?
6902 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
6903 resource_size_t aper_limit =
6904 adev->gmc.aper_base + adev->gmc.aper_size - 1;
6905
6906 p2p_addressable = !(adev->gmc.aper_base & address_mask ||
6907 aper_limit & address_mask);
6908 }
6909 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable;
6910 #else
6911 return false;
6912 #endif
6913 }
6914
amdgpu_device_baco_enter(struct amdgpu_device * adev)6915 int amdgpu_device_baco_enter(struct amdgpu_device *adev)
6916 {
6917 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6918
6919 if (!amdgpu_device_supports_baco(adev))
6920 return -ENOTSUPP;
6921
6922 if (ras && adev->ras_enabled &&
6923 adev->nbio.funcs->enable_doorbell_interrupt)
6924 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
6925
6926 return amdgpu_dpm_baco_enter(adev);
6927 }
6928
amdgpu_device_baco_exit(struct amdgpu_device * adev)6929 int amdgpu_device_baco_exit(struct amdgpu_device *adev)
6930 {
6931 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6932 int ret = 0;
6933
6934 if (!amdgpu_device_supports_baco(adev))
6935 return -ENOTSUPP;
6936
6937 ret = amdgpu_dpm_baco_exit(adev);
6938 if (ret)
6939 return ret;
6940
6941 if (ras && adev->ras_enabled &&
6942 adev->nbio.funcs->enable_doorbell_interrupt)
6943 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
6944
6945 if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
6946 adev->nbio.funcs->clear_doorbell_interrupt)
6947 adev->nbio.funcs->clear_doorbell_interrupt(adev);
6948
6949 return 0;
6950 }
6951
6952 /**
6953 * amdgpu_pci_error_detected - Called when a PCI error is detected.
6954 * @pdev: PCI device struct
6955 * @state: PCI channel state
6956 *
6957 * Description: Called when a PCI error is detected.
6958 *
6959 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
6960 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	/* __free(xgmi_put_hive): the hive reference is dropped automatically
	 * on every return path.
	 */
	struct amdgpu_hive_info *hive __free(xgmi_put_hive) =
		amdgpu_get_xgmi_hive(adev);
	struct amdgpu_reset_context reset_context;
	struct list_head device_list;

	dev_info(adev->dev, "PCI error: detected callback!!\n");

	/* Remembered so amdgpu_pci_resume() only acts on frozen-channel errors */
	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state);
		return PCI_ERS_RESULT_CAN_RECOVER;
	case pci_channel_io_frozen:
		/* Fatal error, prepare for slot reset */
		dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state);
		if (hive) {
			/* Hive devices should be able to support FW based
			 * link reset on other devices, if not return.
			 */
			if (!amdgpu_dpm_is_link_reset_supported(adev)) {
				dev_warn(adev->dev,
					 "No support for XGMI hive yet...\n");
				return PCI_ERS_RESULT_DISCONNECT;
			}
			/* Set dpc status only if device is part of hive
			 * Non-hive devices should be able to recover after
			 * link reset.
			 */
			amdgpu_reset_set_dpc_status(adev, true);

			mutex_lock(&hive->hive_lock);
		}
		memset(&reset_context, 0, sizeof(reset_context));
		INIT_LIST_HEAD(&device_list);

		/* Stop activity on all affected devices; the reset locks taken
		 * here are released later from the slot-reset/resume callbacks.
		 */
		amdgpu_device_recovery_prepare(adev, &device_list, hive);
		amdgpu_device_recovery_get_reset_lock(adev, &device_list);
		amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list,
					      hive, false);
		if (hive)
			mutex_unlock(&hive->hive_lock);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state);
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}
7016
7017 /**
7018 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
7019 * @pdev: pointer to PCI device
7020 */
amdgpu_pci_mmio_enabled(struct pci_dev * pdev)7021 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
7022 {
7023 struct drm_device *dev = pci_get_drvdata(pdev);
7024 struct amdgpu_device *adev = drm_to_adev(dev);
7025
7026 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n");
7027
7028 /* TODO - dump whatever for debugging purposes */
7029
7030 /* This called only if amdgpu_pci_error_detected returns
7031 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
7032 * works, no need to reset slot.
7033 */
7034
7035 return PCI_ERS_RESULT_RECOVERED;
7036 }
7037
7038 /**
7039 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
7040 * @pdev: PCI device struct
7041 *
7042 * Description: This routine is called by the pci error recovery
7043 * code after the PCI slot has been reset, just before we
7044 * should resume normal operations.
7045 */
amdgpu_pci_slot_reset(struct pci_dev * pdev)7046 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
7047 {
7048 struct drm_device *dev = pci_get_drvdata(pdev);
7049 struct amdgpu_device *adev = drm_to_adev(dev);
7050 struct amdgpu_reset_context reset_context;
7051 struct amdgpu_device *tmp_adev;
7052 struct amdgpu_hive_info *hive;
7053 struct list_head device_list;
7054 struct pci_dev *link_dev;
7055 int r = 0, i, timeout;
7056 u32 memsize;
7057 u16 status;
7058
7059 dev_info(adev->dev, "PCI error: slot reset callback!!\n");
7060
7061 memset(&reset_context, 0, sizeof(reset_context));
7062
7063 if (adev->pcie_reset_ctx.swus)
7064 link_dev = adev->pcie_reset_ctx.swus;
7065 else
7066 link_dev = adev->pdev;
7067 /* wait for asic to come out of reset, timeout = 10s */
7068 timeout = 10000;
7069 do {
7070 usleep_range(10000, 10500);
7071 r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status);
7072 timeout -= 10;
7073 } while (timeout > 0 && (status != PCI_VENDOR_ID_ATI) &&
7074 (status != PCI_VENDOR_ID_AMD));
7075
7076 if ((status != PCI_VENDOR_ID_ATI) && (status != PCI_VENDOR_ID_AMD)) {
7077 r = -ETIME;
7078 goto out;
7079 }
7080
7081 amdgpu_device_load_switch_state(adev);
7082 /* Restore PCI confspace */
7083 amdgpu_device_load_pci_state(pdev);
7084
7085 /* confirm ASIC came out of reset */
7086 for (i = 0; i < adev->usec_timeout; i++) {
7087 memsize = amdgpu_asic_get_config_memsize(adev);
7088
7089 if (memsize != 0xffffffff)
7090 break;
7091 udelay(1);
7092 }
7093 if (memsize == 0xffffffff) {
7094 r = -ETIME;
7095 goto out;
7096 }
7097
7098 reset_context.method = AMD_RESET_METHOD_NONE;
7099 reset_context.reset_req_dev = adev;
7100 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
7101 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
7102 INIT_LIST_HEAD(&device_list);
7103
7104 hive = amdgpu_get_xgmi_hive(adev);
7105 if (hive) {
7106 mutex_lock(&hive->hive_lock);
7107 reset_context.hive = hive;
7108 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
7109 tmp_adev->pcie_reset_ctx.in_link_reset = true;
7110 list_add_tail(&tmp_adev->reset_list, &device_list);
7111 }
7112 } else {
7113 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
7114 list_add_tail(&adev->reset_list, &device_list);
7115 }
7116
7117 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context);
7118 out:
7119 if (!r) {
7120 if (amdgpu_device_cache_pci_state(adev->pdev))
7121 pci_restore_state(adev->pdev);
7122 dev_info(adev->dev, "PCIe error recovery succeeded\n");
7123 } else {
7124 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r);
7125 if (hive) {
7126 list_for_each_entry(tmp_adev, &device_list, reset_list)
7127 amdgpu_device_unset_mp1_state(tmp_adev);
7128 }
7129 amdgpu_device_recovery_put_reset_lock(adev, &device_list);
7130 }
7131
7132 if (hive) {
7133 mutex_unlock(&hive->hive_lock);
7134 amdgpu_put_xgmi_hive(hive);
7135 }
7136
7137 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
7138 }
7139
7140 /**
7141 * amdgpu_pci_resume() - resume normal ops after PCI reset
7142 * @pdev: pointer to PCI device
7143 *
7144 * Called when the error recovery driver tells us that its
7145 * OK to resume normal operation.
7146 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct list_head device_list;
	struct amdgpu_hive_info *hive = NULL;
	struct amdgpu_device *tmp_adev = NULL;

	dev_info(adev->dev, "PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	INIT_LIST_HEAD(&device_list);

	/* Rebuild the list of devices that took part in the link reset:
	 * every hive member, or just this device when not in an XGMI hive.
	 */
	hive = amdgpu_get_xgmi_hive(adev);
	if (hive) {
		mutex_lock(&hive->hive_lock);
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			/* Link reset is done for this device */
			tmp_adev->pcie_reset_ctx.in_link_reset = false;
			list_add_tail(&tmp_adev->reset_list, &device_list);
		}
	} else
		list_add_tail(&adev->reset_list, &device_list);

	/* Restart schedulers, resume the GPUs and release the reset locks
	 * taken in the error-detected callback.
	 */
	amdgpu_device_sched_resume(&device_list, NULL, NULL);
	amdgpu_device_gpu_resume(adev, &device_list, false);
	amdgpu_device_recovery_put_reset_lock(adev, &device_list);

	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}
}
7182
/* Cache the PCI config state of the ATI/AMD SWUS/SWDS bridge pair above the
 * GPU, if present, so it can be restored after a link reset by
 * amdgpu_device_load_switch_state().
 */
static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev)
{
	struct pci_dev *swus, *swds;
	int r;

	/* Parent must be an ATI downstream PCIe switch port */
	swds = pci_upstream_bridge(adev->pdev);
	if (!swds || swds->vendor != PCI_VENDOR_ID_ATI ||
	    pci_pcie_type(swds) != PCI_EXP_TYPE_DOWNSTREAM)
		return;
	/* ... whose own parent is an ATI/AMD upstream switch port */
	swus = pci_upstream_bridge(swds);
	if (!swus ||
	    (swus->vendor != PCI_VENDOR_ID_ATI &&
	     swus->vendor != PCI_VENDOR_ID_AMD) ||
	    pci_pcie_type(swus) != PCI_EXP_TYPE_UPSTREAM)
		return;

	/* If already saved, return */
	if (adev->pcie_reset_ctx.swus)
		return;
	/* Upstream bridge is ATI, assume it's SWUS/DS architecture */
	r = pci_save_state(swds);
	if (r)
		return;
	adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(swds);

	r = pci_save_state(swus);
	if (r)
		return;
	adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(swus);

	/* Non-NULL swus doubles as the "state already cached" marker above */
	adev->pcie_reset_ctx.swus = swus;
}
7215
/* Restore the SWUS/SWDS bridge config space cached by
 * amdgpu_device_cache_switch_state(). Upstream port first, then the
 * downstream port, so the path to the GPU is re-programmed top-down.
 */
static void amdgpu_device_load_switch_state(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	int r;

	/* Nothing cached -> nothing to restore */
	if (!adev->pcie_reset_ctx.swds_pcistate ||
	    !adev->pcie_reset_ctx.swus_pcistate)
		return;

	pdev = adev->pcie_reset_ctx.swus;
	r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate);
	if (!r) {
		pci_restore_state(pdev);
	} else {
		dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r);
		return;
	}

	pdev = pci_upstream_bridge(adev->pdev);
	r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate);
	if (!r)
		pci_restore_state(pdev);
	else
		dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r);
}
7241
amdgpu_device_cache_pci_state(struct pci_dev * pdev)7242 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
7243 {
7244 struct drm_device *dev = pci_get_drvdata(pdev);
7245 struct amdgpu_device *adev = drm_to_adev(dev);
7246 int r;
7247
7248 if (amdgpu_sriov_vf(adev))
7249 return false;
7250
7251 r = pci_save_state(pdev);
7252 if (!r) {
7253 kfree(adev->pci_state);
7254
7255 adev->pci_state = pci_store_saved_state(pdev);
7256
7257 if (!adev->pci_state) {
7258 dev_err(adev->dev, "Failed to store PCI saved state");
7259 return false;
7260 }
7261 } else {
7262 dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r);
7263 return false;
7264 }
7265
7266 amdgpu_device_cache_switch_state(adev);
7267
7268 return true;
7269 }
7270
amdgpu_device_load_pci_state(struct pci_dev * pdev)7271 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
7272 {
7273 struct drm_device *dev = pci_get_drvdata(pdev);
7274 struct amdgpu_device *adev = drm_to_adev(dev);
7275 int r;
7276
7277 if (!adev->pci_state)
7278 return false;
7279
7280 r = pci_load_saved_state(pdev, adev->pci_state);
7281
7282 if (!r) {
7283 pci_restore_state(pdev);
7284 } else {
7285 dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r);
7286 return false;
7287 }
7288
7289 return true;
7290 }
7291
/* Flush the HDP (Host Data Path) cache, preferring the cheapest mechanism
 * available: skipped entirely on bare-metal APUs (x86-64) and on
 * CPU-connected XGMI parts, emitted on @ring when the ring supports it,
 * via KIQ under SR-IOV runtime, otherwise by direct register write.
 */
void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	/* Skip on APUs unless running as a passthrough guest */
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	/* Skip when VRAM is directly connected to the CPU over XGMI */
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	/* Let the ring emit the flush into its command stream if it can */
	if (ring && ring->funcs->emit_hdp_flush) {
		amdgpu_ring_emit_hdp_flush(ring);
		return;
	}

	/* Under SR-IOV runtime try a KIQ-based flush; 0 means it succeeded */
	if (!ring && amdgpu_sriov_runtime(adev)) {
		if (!amdgpu_kiq_hdp_flush(adev))
			return;
	}

	/* Fallback: flush through direct register access */
	amdgpu_hdp_flush(adev, ring);
}
7314
/* Invalidate the HDP (Host Data Path) cache. Skipped on bare-metal APUs
 * (x86-64) and on CPU-connected XGMI parts, same as the flush path.
 */
void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	/* Skip on APUs unless running as a passthrough guest */
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	/* Skip when VRAM is directly connected to the CPU over XGMI */
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_hdp_invalidate(adev, ring);
}
7327
/* Returns non-zero while a GPU reset is in progress in this reset domain */
int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}
7332
7333 /**
7334 * amdgpu_device_halt() - bring hardware to some kind of halt state
7335 *
7336 * @adev: amdgpu_device pointer
7337 *
7338 * Bring hardware to some kind of halt state so that no one can touch it
7339 * any more. It will help to maintain error context when error occurred.
7340 * Compare to a simple hang, the system will keep stable at least for SSH
7341 * access. Then it should be trivial to inspect the hardware state and
7342 * see what's going on. Implemented as following:
7343 *
7344 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc),
7345 * clears all CPU mappings to device, disallows remappings through page faults
7346 * 2. amdgpu_irq_disable_all() disables all interrupts
7347 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
7348 * 4. set adev->no_hw_access to avoid potential crashes after setp 5
7349 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
7350 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
7351 * flush any in flight DMA operations
7352 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	/* Step 1 (see kernel-doc above): cut off user space first */
	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	/* Ungate everything so the hardware state stays inspectable */
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);

	/* Step 2: no more interrupts */
	amdgpu_irq_disable_all(adev);

	/* Step 3: release anything waiting on a HW fence */
	amdgpu_fence_driver_hw_fini(adev);

	/* Step 4: must be set before unmapping MMIO to avoid stray accesses */
	adev->no_hw_access = true;

	/* Step 5 */
	amdgpu_device_unmap_mmio(adev);

	/* Step 6: flush any in-flight DMA */
	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}
7375
/* Read a PCIe port register through the NBIO index/data register pair.
 * @reg is a DWORD index (multiplied by 4 before being written to the
 * index register). Serialized by pcie_idx_lock.
 */
u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	/* Read back the index register so the write lands before the data read */
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}
7392
/* Write a PCIe port register through the NBIO index/data register pair.
 * @reg is a DWORD index, @v the value to write. Serialized by pcie_idx_lock.
 */
void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	/* Read back the index register so the write lands before the data write */
	(void)RREG32(address);
	WREG32(data, v);
	/* Read back the data register to make sure the write has posted */
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
7408
7409 /**
7410 * amdgpu_device_get_gang - return a reference to the current gang
7411 * @adev: amdgpu_device pointer
7412 *
7413 * Returns: A new reference to the current gang leader.
7414 */
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
{
	struct dma_fence *fence;

	/* RCU guards against gang_submit being swapped and freed under us;
	 * dma_fence_get_rcu_safe() retries until it holds a valid reference.
	 */
	rcu_read_lock();
	fence = dma_fence_get_rcu_safe(&adev->gang_submit);
	rcu_read_unlock();
	return fence;
}
7424
7425 /**
7426 * amdgpu_device_switch_gang - switch to a new gang
7427 * @adev: amdgpu_device pointer
7428 * @gang: the gang to switch to
7429 *
7430 * Try to switch to a new gang.
7431 * Returns: NULL if we switched to the new gang or a reference to the current
7432 * gang leader.
7433 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	/* Reference for adev->gang_submit in case the exchange succeeds */
	dma_fence_get(gang);
	do {
		dma_fence_put(old);
		old = amdgpu_device_get_gang(adev);
		/* Already the current gang: nothing to exchange */
		if (old == gang)
			break;

		/* Current gang still running: caller must wait on it first;
		 * drop the reference taken for the (not happening) exchange.
		 */
		if (!dma_fence_is_signaled(old)) {
			dma_fence_put(gang);
			return old;
		}

		/* Retry if someone else swapped gang_submit since we read it */
	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	/*
	 * Drop it once for the exchanged reference in adev and once for the
	 * thread local reference acquired in amdgpu_device_get_gang().
	 */
	dma_fence_put(old);
	dma_fence_put(old);
	return NULL;
}
7462
7463 /**
7464 * amdgpu_device_enforce_isolation - enforce HW isolation
7465 * @adev: the amdgpu device pointer
7466 * @ring: the HW ring the job is supposed to run on
7467 * @job: the job which is about to be pushed to the HW ring
7468 *
7469 * Makes sure that only one client at a time can use the GFX block.
7470 * Returns: The dependency to wait on before the job can be pushed to the HW.
7471 * The function is called multiple times until NULL is returned.
7472 */
struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
						  struct amdgpu_ring *ring,
						  struct amdgpu_job *job)
{
	/* Isolation state is tracked per XCP partition */
	struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id];
	struct drm_sched_fence *f = job->base.s_fence;
	struct dma_fence *dep;
	void *owner;
	int r;

	/*
	 * For now enforce isolation only for the GFX block since we only need
	 * the cleaner shader on those rings.
	 */
	if (ring->funcs->type != AMDGPU_RING_TYPE_GFX &&
	    ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
		return NULL;

	/*
	 * All submissions where enforce isolation is false are handled as if
	 * they come from a single client. Use ~0l as the owner to distinct it
	 * from kernel submissions where the owner is NULL.
	 */
	owner = job->enforce_isolation ? f->owner : (void *)~0l;

	mutex_lock(&adev->enforce_isolation_mutex);

	/*
	 * The "spearhead" submission is the first one which changes the
	 * ownership to its client. We always need to wait for it to be
	 * pushed to the HW before proceeding with anything.
	 */
	if (&f->scheduled != isolation->spearhead &&
	    !dma_fence_is_signaled(isolation->spearhead)) {
		dep = isolation->spearhead;
		goto out_grab_ref;
	}

	if (isolation->owner != owner) {

		/*
		 * Wait for any gang to be assembled before switching to a
		 * different owner or otherwise we could deadlock the
		 * submissions.
		 */
		if (!job->gang_submit) {
			dep = amdgpu_device_get_gang(adev);
			if (!dma_fence_is_signaled(dep))
				goto out_return_dep;
			dma_fence_put(dep);
		}

		/* This job becomes the new spearhead for the new owner */
		dma_fence_put(isolation->spearhead);
		isolation->spearhead = dma_fence_get(&f->scheduled);
		amdgpu_sync_move(&isolation->active, &isolation->prev);
		trace_amdgpu_isolation(isolation->owner, owner);
		isolation->owner = owner;
	}

	/*
	 * Specifying the ring here helps to pipeline submissions even when
	 * isolation is enabled. If that is not desired for testing NULL can be
	 * used instead of the ring to enforce a CPU round trip while switching
	 * between clients.
	 */
	dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
	/* GFP_NOWAIT: we hold the isolation mutex, don't sleep for memory */
	r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
	if (r)
		dev_warn(adev->dev, "OOM tracking isolation\n");

out_grab_ref:
	/* Callers get their own reference on the returned dependency */
	dma_fence_get(dep);
out_return_dep:
	mutex_unlock(&adev->enforce_isolation_mutex);
	return dep;
}
7549
/* Returns true when the ASIC has a display block (DCE/DCN) that is present
 * and not harvested. Pre-discovery ASICs are listed explicitly; everything
 * else is decided from the IP discovery table.
 */
bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery: no DCE IP version or a harvested display
		 * management unit means no usable display hardware.
		 */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}
7590
/* Poll register @reg_addr until (value & @mask) == @expected_value.
 * The timeout counter restarts whenever the register value changes, so only
 * a register that has gone completely quiet can time out.
 * Returns 0 on success or -ETIMEDOUT (as uint32_t, matching callers).
 */
uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			/* Value still moving: restart the timeout window */
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			/* Fixed: format string previously ended in "xn" —
			 * the '\n' escape was missing its backslash.
			 */
			dev_warn(
				adev->dev,
				"Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				inst, reg_name, (uint32_t)expected_value,
				(uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
7620
amdgpu_get_soft_full_reset_mask(struct amdgpu_ring * ring)7621 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
7622 {
7623 ssize_t size = 0;
7624
7625 if (!ring || !ring->adev)
7626 return size;
7627
7628 if (amdgpu_device_should_recover_gpu(ring->adev))
7629 size |= AMDGPU_RESET_TYPE_FULL;
7630
7631 if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
7632 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
7633 size |= AMDGPU_RESET_TYPE_SOFT_RESET;
7634
7635 return size;
7636 }
7637
amdgpu_show_reset_mask(char * buf,uint32_t supported_reset)7638 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
7639 {
7640 ssize_t size = 0;
7641
7642 if (supported_reset == 0) {
7643 size += sysfs_emit_at(buf, size, "unsupported");
7644 size += sysfs_emit_at(buf, size, "\n");
7645 return size;
7646
7647 }
7648
7649 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
7650 size += sysfs_emit_at(buf, size, "soft ");
7651
7652 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
7653 size += sysfs_emit_at(buf, size, "queue ");
7654
7655 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
7656 size += sysfs_emit_at(buf, size, "pipe ");
7657
7658 if (supported_reset & AMDGPU_RESET_TYPE_FULL)
7659 size += sysfs_emit_at(buf, size, "full ");
7660
7661 size += sysfs_emit_at(buf, size, "\n");
7662 return size;
7663 }
7664
/* Store a unique ID for (type, instance). Validates the indices and warns
 * once when an existing non-zero UID is overwritten. No-op on NULL info.
 */
void amdgpu_device_set_uid(struct amdgpu_uid *uid_info,
			   enum amdgpu_uid_type type, uint8_t inst,
			   uint64_t uid)
{
	uint64_t *slot;

	if (!uid_info)
		return;

	if (type >= AMDGPU_UID_TYPE_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
			     type);
		return;
	}

	if (inst >= AMDGPU_UID_INST_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
			     inst);
		return;
	}

	slot = &uid_info->uid[type][inst];
	if (*slot != 0)
		dev_warn_once(
			uid_info->adev->dev,
			"Overwriting existing UID %llu for type %d instance %d\n",
			*slot, type, inst);

	*slot = uid;
}
7693
amdgpu_device_get_uid(struct amdgpu_uid * uid_info,enum amdgpu_uid_type type,uint8_t inst)7694 u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info,
7695 enum amdgpu_uid_type type, uint8_t inst)
7696 {
7697 if (!uid_info)
7698 return 0;
7699
7700 if (type >= AMDGPU_UID_TYPE_MAX) {
7701 dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
7702 type);
7703 return 0;
7704 }
7705
7706 if (inst >= AMDGPU_UID_INST_MAX) {
7707 dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
7708 inst);
7709 return 0;
7710 }
7711
7712 return uid_info->uid[type][inst];
7713 }
7714