xref: /linux/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c (revision 39d3389331abd712461f50249722f7ed9d815068)
1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 
29 #include <linux/aperture.h>
30 #include <linux/power_supply.h>
31 #include <linux/kthread.h>
32 #include <linux/module.h>
33 #include <linux/console.h>
34 #include <linux/slab.h>
35 #include <linux/iommu.h>
36 #include <linux/pci.h>
37 #include <linux/pci-p2pdma.h>
38 #include <linux/apple-gmux.h>
39 
40 #include <drm/drm_atomic_helper.h>
41 #include <drm/drm_client_event.h>
42 #include <drm/drm_crtc_helper.h>
43 #include <drm/drm_probe_helper.h>
44 #include <drm/amdgpu_drm.h>
45 #include <linux/device.h>
46 #include <linux/vgaarb.h>
47 #include <linux/vga_switcheroo.h>
48 #include <linux/efi.h>
49 #include "amdgpu.h"
50 #include "amdgpu_trace.h"
51 #include "amdgpu_i2c.h"
52 #include "atom.h"
53 #include "amdgpu_atombios.h"
54 #include "amdgpu_atomfirmware.h"
55 #include "amd_pcie.h"
56 #ifdef CONFIG_DRM_AMDGPU_SI
57 #include "si.h"
58 #endif
59 #ifdef CONFIG_DRM_AMDGPU_CIK
60 #include "cik.h"
61 #endif
62 #include "vi.h"
63 #include "soc15.h"
64 #include "nv.h"
65 #include "bif/bif_4_1_d.h"
66 #include <linux/firmware.h>
67 #include "amdgpu_vf_error.h"
68 
69 #include "amdgpu_amdkfd.h"
70 #include "amdgpu_pm.h"
71 
72 #include "amdgpu_xgmi.h"
73 #include "amdgpu_ras.h"
74 #include "amdgpu_ras_mgr.h"
75 #include "amdgpu_pmu.h"
76 #include "amdgpu_fru_eeprom.h"
77 #include "amdgpu_reset.h"
78 #include "amdgpu_virt.h"
79 #include "amdgpu_dev_coredump.h"
80 
81 #include <linux/suspend.h>
82 #include <drm/task_barrier.h>
83 #include <linux/pm_runtime.h>
84 
85 #include <drm/drm_drv.h>
86 
87 #if IS_ENABLED(CONFIG_X86)
88 #include <asm/intel-family.h>
89 #include <asm/cpu_device_id.h>
90 #endif
91 
92 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
93 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
94 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
95 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
96 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
97 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
98 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
99 MODULE_FIRMWARE("amdgpu/cyan_skillfish_gpu_info.bin");
100 
101 #define AMDGPU_RESUME_MS		2000
102 #define AMDGPU_MAX_RETRY_LIMIT		2
103 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
104 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
105 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
106 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)
107 
108 #define AMDGPU_VBIOS_SKIP (1U << 0)
109 #define AMDGPU_VBIOS_OPTIONAL (1U << 1)
110 
111 static const struct drm_driver amdgpu_kms_driver;
112 
113 const char *amdgpu_asic_name[] = {
114 	"TAHITI",
115 	"PITCAIRN",
116 	"VERDE",
117 	"OLAND",
118 	"HAINAN",
119 	"BONAIRE",
120 	"KAVERI",
121 	"KABINI",
122 	"HAWAII",
123 	"MULLINS",
124 	"TOPAZ",
125 	"TONGA",
126 	"FIJI",
127 	"CARRIZO",
128 	"STONEY",
129 	"POLARIS10",
130 	"POLARIS11",
131 	"POLARIS12",
132 	"VEGAM",
133 	"VEGA10",
134 	"VEGA12",
135 	"VEGA20",
136 	"RAVEN",
137 	"ARCTURUS",
138 	"RENOIR",
139 	"ALDEBARAN",
140 	"NAVI10",
141 	"CYAN_SKILLFISH",
142 	"NAVI14",
143 	"NAVI12",
144 	"SIENNA_CICHLID",
145 	"NAVY_FLOUNDER",
146 	"VANGOGH",
147 	"DIMGREY_CAVEFISH",
148 	"BEIGE_GOBY",
149 	"YELLOW_CARP",
150 	"IP DISCOVERY",
151 	"LAST",
152 };
153 
154 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM  - 1, 0)
155 /*
156  * Default init level where all blocks are expected to be initialized. This is
157  * the level of initialization expected by default and also after a full reset
158  * of the device.
159  */
160 struct amdgpu_init_level amdgpu_init_default = {
161 	.level = AMDGPU_INIT_LEVEL_DEFAULT,
162 	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
163 };
164 
165 struct amdgpu_init_level amdgpu_init_recovery = {
166 	.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
167 	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
168 };
169 
170 /*
171  * Minimal blocks needed to be initialized before a XGMI hive can be reset. This
172  * is used for cases like reset on initialization where the entire hive needs to
173  * be reset before first use.
174  */
175 struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
176 	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
177 	.hwini_ip_block_mask =
178 		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
179 		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
180 		BIT(AMD_IP_BLOCK_TYPE_PSP)
181 };
182 
183 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev);
184 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev);
185 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev);
186 
187 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev);
188 
amdgpu_ip_member_of_hwini(struct amdgpu_device * adev,enum amd_ip_block_type block)189 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
190 					     enum amd_ip_block_type block)
191 {
192 	return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
193 }
194 
amdgpu_set_init_level(struct amdgpu_device * adev,enum amdgpu_init_lvl_id lvl)195 void amdgpu_set_init_level(struct amdgpu_device *adev,
196 			   enum amdgpu_init_lvl_id lvl)
197 {
198 	switch (lvl) {
199 	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
200 		adev->init_lvl = &amdgpu_init_minimal_xgmi;
201 		break;
202 	case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
203 		adev->init_lvl = &amdgpu_init_recovery;
204 		break;
205 	case AMDGPU_INIT_LEVEL_DEFAULT:
206 		fallthrough;
207 	default:
208 		adev->init_lvl = &amdgpu_init_default;
209 		break;
210 	}
211 }
212 
213 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
214 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
215 				     void *data);
216 
217 /**
218  * DOC: pcie_replay_count
219  *
220  * The amdgpu driver provides a sysfs API for reporting the total number
221  * of PCIe replays (NAKs).
222  * The file pcie_replay_count is used for this and returns the total
223  * number of replays as a sum of the NAKs generated and NAKs received.
224  */
225 
amdgpu_device_get_pcie_replay_count(struct device * dev,struct device_attribute * attr,char * buf)226 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
227 		struct device_attribute *attr, char *buf)
228 {
229 	struct drm_device *ddev = dev_get_drvdata(dev);
230 	struct amdgpu_device *adev = drm_to_adev(ddev);
231 	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
232 
233 	return sysfs_emit(buf, "%llu\n", cnt);
234 }
235 
236 static DEVICE_ATTR(pcie_replay_count, 0444,
237 		amdgpu_device_get_pcie_replay_count, NULL);
238 
amdgpu_device_attr_sysfs_init(struct amdgpu_device * adev)239 static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
240 {
241 	int ret = 0;
242 
243 	if (amdgpu_nbio_is_replay_cnt_supported(adev))
244 		ret = sysfs_create_file(&adev->dev->kobj,
245 					&dev_attr_pcie_replay_count.attr);
246 
247 	return ret;
248 }
249 
amdgpu_device_attr_sysfs_fini(struct amdgpu_device * adev)250 static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
251 {
252 	if (amdgpu_nbio_is_replay_cnt_supported(adev))
253 		sysfs_remove_file(&adev->dev->kobj,
254 				  &dev_attr_pcie_replay_count.attr);
255 }
256 
amdgpu_sysfs_reg_state_get(struct file * f,struct kobject * kobj,const struct bin_attribute * attr,char * buf,loff_t ppos,size_t count)257 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
258 					  const struct bin_attribute *attr, char *buf,
259 					  loff_t ppos, size_t count)
260 {
261 	struct device *dev = kobj_to_dev(kobj);
262 	struct drm_device *ddev = dev_get_drvdata(dev);
263 	struct amdgpu_device *adev = drm_to_adev(ddev);
264 	ssize_t bytes_read;
265 
266 	switch (ppos) {
267 	case AMDGPU_SYS_REG_STATE_XGMI:
268 		bytes_read = amdgpu_asic_get_reg_state(
269 			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
270 		break;
271 	case AMDGPU_SYS_REG_STATE_WAFL:
272 		bytes_read = amdgpu_asic_get_reg_state(
273 			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
274 		break;
275 	case AMDGPU_SYS_REG_STATE_PCIE:
276 		bytes_read = amdgpu_asic_get_reg_state(
277 			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
278 		break;
279 	case AMDGPU_SYS_REG_STATE_USR:
280 		bytes_read = amdgpu_asic_get_reg_state(
281 			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
282 		break;
283 	case AMDGPU_SYS_REG_STATE_USR_1:
284 		bytes_read = amdgpu_asic_get_reg_state(
285 			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
286 		break;
287 	default:
288 		return -EINVAL;
289 	}
290 
291 	return bytes_read;
292 }
293 
294 static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
295 		      AMDGPU_SYS_REG_STATE_END);
296 
amdgpu_reg_state_sysfs_init(struct amdgpu_device * adev)297 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
298 {
299 	int ret;
300 
301 	if (!amdgpu_asic_get_reg_state_supported(adev))
302 		return 0;
303 
304 	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
305 
306 	return ret;
307 }
308 
amdgpu_reg_state_sysfs_fini(struct amdgpu_device * adev)309 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
310 {
311 	if (!amdgpu_asic_get_reg_state_supported(adev))
312 		return;
313 	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
314 }
315 
amdgpu_ip_block_suspend(struct amdgpu_ip_block * ip_block)316 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
317 {
318 	int r;
319 
320 	if (ip_block->version->funcs->suspend) {
321 		r = ip_block->version->funcs->suspend(ip_block);
322 		if (r) {
323 			dev_err(ip_block->adev->dev,
324 				"suspend of IP block <%s> failed %d\n",
325 				ip_block->version->funcs->name, r);
326 			return r;
327 		}
328 	}
329 
330 	ip_block->status.hw = false;
331 	return 0;
332 }
333 
amdgpu_ip_block_resume(struct amdgpu_ip_block * ip_block)334 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
335 {
336 	int r;
337 
338 	if (ip_block->version->funcs->resume) {
339 		r = ip_block->version->funcs->resume(ip_block);
340 		if (r) {
341 			dev_err(ip_block->adev->dev,
342 				"resume of IP block <%s> failed %d\n",
343 				ip_block->version->funcs->name, r);
344 			return r;
345 		}
346 	}
347 
348 	ip_block->status.hw = true;
349 	return 0;
350 }
351 
352 /**
353  * DOC: board_info
354  *
355  * The amdgpu driver provides a sysfs API for giving board related information.
356  * It provides the form factor information in the format
357  *
358  *   type : form factor
359  *
360  * Possible form factor values
361  *
362  * - "cem"		- PCIE CEM card
363  * - "oam"		- Open Compute Accelerator Module
364  * - "unknown"	- Not known
365  *
366  */
367 
amdgpu_device_get_board_info(struct device * dev,struct device_attribute * attr,char * buf)368 static ssize_t amdgpu_device_get_board_info(struct device *dev,
369 					    struct device_attribute *attr,
370 					    char *buf)
371 {
372 	struct drm_device *ddev = dev_get_drvdata(dev);
373 	struct amdgpu_device *adev = drm_to_adev(ddev);
374 	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
375 	const char *pkg;
376 
377 	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
378 		pkg_type = adev->smuio.funcs->get_pkg_type(adev);
379 
380 	switch (pkg_type) {
381 	case AMDGPU_PKG_TYPE_CEM:
382 		pkg = "cem";
383 		break;
384 	case AMDGPU_PKG_TYPE_OAM:
385 		pkg = "oam";
386 		break;
387 	default:
388 		pkg = "unknown";
389 		break;
390 	}
391 
392 	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
393 }
394 
395 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);
396 
397 static struct attribute *amdgpu_board_attrs[] = {
398 	&dev_attr_board_info.attr,
399 	NULL,
400 };
401 
amdgpu_board_attrs_is_visible(struct kobject * kobj,struct attribute * attr,int n)402 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
403 					     struct attribute *attr, int n)
404 {
405 	struct device *dev = kobj_to_dev(kobj);
406 	struct drm_device *ddev = dev_get_drvdata(dev);
407 	struct amdgpu_device *adev = drm_to_adev(ddev);
408 
409 	if (adev->flags & AMD_IS_APU)
410 		return 0;
411 
412 	return attr->mode;
413 }
414 
415 static const struct attribute_group amdgpu_board_attrs_group = {
416 	.attrs = amdgpu_board_attrs,
417 	.is_visible = amdgpu_board_attrs_is_visible
418 };
419 
420 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
421 
422 /**
423  * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
424  *
425  * @adev: amdgpu device pointer
426  *
427  * Returns true if the device is a dGPU with ATPX power control,
428  * otherwise return false.
429  */
amdgpu_device_supports_px(struct amdgpu_device * adev)430 bool amdgpu_device_supports_px(struct amdgpu_device *adev)
431 {
432 	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
433 		return true;
434 	return false;
435 }
436 
437 /**
438  * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
439  *
440  * @adev: amdgpu device pointer
441  *
442  * Returns true if the device is a dGPU with ACPI power control,
443  * otherwise return false.
444  */
amdgpu_device_supports_boco(struct amdgpu_device * adev)445 bool amdgpu_device_supports_boco(struct amdgpu_device *adev)
446 {
447 	if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
448 		return false;
449 
450 	if (adev->has_pr3 ||
451 	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
452 		return true;
453 	return false;
454 }
455 
456 /**
457  * amdgpu_device_supports_baco - Does the device support BACO
458  *
459  * @adev: amdgpu device pointer
460  *
461  * Return:
462  * 1 if the device supports BACO;
463  * 3 if the device supports MACO (only works if BACO is supported)
464  * otherwise return 0.
465  */
amdgpu_device_supports_baco(struct amdgpu_device * adev)466 int amdgpu_device_supports_baco(struct amdgpu_device *adev)
467 {
468 	return amdgpu_asic_supports_baco(adev);
469 }
470 
amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device * adev)471 void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
472 {
473 	int bamaco_support;
474 
475 	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
476 	bamaco_support = amdgpu_device_supports_baco(adev);
477 
478 	switch (amdgpu_runtime_pm) {
479 	case 2:
480 		if (bamaco_support & MACO_SUPPORT) {
481 			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
482 			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
483 		} else if (bamaco_support == BACO_SUPPORT) {
484 			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
485 			dev_info(adev->dev, "Requested mode BAMACO not available,fallback to use BACO\n");
486 		}
487 		break;
488 	case 1:
489 		if (bamaco_support & BACO_SUPPORT) {
490 			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
491 			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
492 		}
493 		break;
494 	case -1:
495 	case -2:
496 		if (amdgpu_device_supports_px(adev)) {
497 			/* enable PX as runtime mode */
498 			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
499 			dev_info(adev->dev, "Using ATPX for runtime pm\n");
500 		} else if (amdgpu_device_supports_boco(adev)) {
501 			/* enable boco as runtime mode */
502 			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
503 			dev_info(adev->dev, "Using BOCO for runtime pm\n");
504 		} else {
505 			if (!bamaco_support)
506 				goto no_runtime_pm;
507 
508 			switch (adev->asic_type) {
509 			case CHIP_VEGA20:
510 			case CHIP_ARCTURUS:
511 				/* BACO are not supported on vega20 and arctrus */
512 				break;
513 			case CHIP_VEGA10:
514 				/* enable BACO as runpm mode if noretry=0 */
515 				if (!adev->gmc.noretry && !amdgpu_passthrough(adev))
516 					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
517 				break;
518 			default:
519 				/* enable BACO as runpm mode on CI+ */
520 				if (!amdgpu_passthrough(adev))
521 					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
522 				break;
523 			}
524 
525 			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
526 				if (bamaco_support & MACO_SUPPORT) {
527 					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
528 					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
529 				} else {
530 					dev_info(adev->dev, "Using BACO for runtime pm\n");
531 				}
532 			}
533 		}
534 		break;
535 	case 0:
536 		dev_info(adev->dev, "runtime pm is manually disabled\n");
537 		break;
538 	default:
539 		break;
540 	}
541 
542 no_runtime_pm:
543 	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
544 		dev_info(adev->dev, "Runtime PM not available\n");
545 }
546 /**
547  * amdgpu_device_supports_smart_shift - Is the device dGPU with
548  * smart shift support
549  *
550  * @adev: amdgpu device pointer
551  *
552  * Returns true if the device is a dGPU with Smart Shift support,
553  * otherwise returns false.
554  */
amdgpu_device_supports_smart_shift(struct amdgpu_device * adev)555 bool amdgpu_device_supports_smart_shift(struct amdgpu_device *adev)
556 {
557 	return (amdgpu_device_supports_boco(adev) &&
558 		amdgpu_acpi_is_power_shift_control_supported());
559 }
560 
561 /*
562  * VRAM access helper functions
563  */
564 
565 /**
566  * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
567  *
568  * @adev: amdgpu_device pointer
569  * @pos: offset of the buffer in vram
570  * @buf: virtual address of the buffer in system memory
571  * @size: read/write size, sizeof(@buf) must > @size
572  * @write: true - write to vram, otherwise - read from vram
573  */
amdgpu_device_mm_access(struct amdgpu_device * adev,loff_t pos,void * buf,size_t size,bool write)574 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
575 			     void *buf, size_t size, bool write)
576 {
577 	unsigned long flags;
578 	uint32_t hi = ~0, tmp = 0;
579 	uint32_t *data = buf;
580 	uint64_t last;
581 	int idx;
582 
583 	if (!drm_dev_enter(adev_to_drm(adev), &idx))
584 		return;
585 
586 	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
587 
588 	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
589 	for (last = pos + size; pos < last; pos += 4) {
590 		tmp = pos >> 31;
591 
592 		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
593 		if (tmp != hi) {
594 			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
595 			hi = tmp;
596 		}
597 		if (write)
598 			WREG32_NO_KIQ(mmMM_DATA, *data++);
599 		else
600 			*data++ = RREG32_NO_KIQ(mmMM_DATA);
601 	}
602 
603 	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
604 	drm_dev_exit(idx);
605 }
606 
607 /**
608  * amdgpu_device_aper_access - access vram by vram aperture
609  *
610  * @adev: amdgpu_device pointer
611  * @pos: offset of the buffer in vram
612  * @buf: virtual address of the buffer in system memory
613  * @size: read/write size, sizeof(@buf) must > @size
614  * @write: true - write to vram, otherwise - read from vram
615  *
616  * The return value means how many bytes have been transferred.
617  */
amdgpu_device_aper_access(struct amdgpu_device * adev,loff_t pos,void * buf,size_t size,bool write)618 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
619 				 void *buf, size_t size, bool write)
620 {
621 #ifdef CONFIG_64BIT
622 	void __iomem *addr;
623 	size_t count = 0;
624 	uint64_t last;
625 
626 	if (!adev->mman.aper_base_kaddr)
627 		return 0;
628 
629 	last = min(pos + size, adev->gmc.visible_vram_size);
630 	if (last > pos) {
631 		addr = adev->mman.aper_base_kaddr + pos;
632 		count = last - pos;
633 
634 		if (write) {
635 			memcpy_toio(addr, buf, count);
636 			/* Make sure HDP write cache flush happens without any reordering
637 			 * after the system memory contents are sent over PCIe device
638 			 */
639 			mb();
640 			amdgpu_device_flush_hdp(adev, NULL);
641 		} else {
642 			amdgpu_device_invalidate_hdp(adev, NULL);
643 			/* Make sure HDP read cache is invalidated before issuing a read
644 			 * to the PCIe device
645 			 */
646 			mb();
647 			memcpy_fromio(buf, addr, count);
648 		}
649 
650 	}
651 
652 	return count;
653 #else
654 	return 0;
655 #endif
656 }
657 
658 /**
659  * amdgpu_device_vram_access - read/write a buffer in vram
660  *
661  * @adev: amdgpu_device pointer
662  * @pos: offset of the buffer in vram
663  * @buf: virtual address of the buffer in system memory
664  * @size: read/write size, sizeof(@buf) must > @size
665  * @write: true - write to vram, otherwise - read from vram
666  */
amdgpu_device_vram_access(struct amdgpu_device * adev,loff_t pos,void * buf,size_t size,bool write)667 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
668 			       void *buf, size_t size, bool write)
669 {
670 	size_t count;
671 
672 	/* try to using vram apreature to access vram first */
673 	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
674 	size -= count;
675 	if (size) {
676 		/* using MM to access rest vram */
677 		pos += count;
678 		buf += count;
679 		amdgpu_device_mm_access(adev, pos, buf, size, write);
680 	}
681 }
682 
683 /*
684  * register access helper functions.
685  */
686 
687 /* Check if hw access should be skipped because of hotplug or device error */
amdgpu_device_skip_hw_access(struct amdgpu_device * adev)688 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
689 {
690 	if (adev->no_hw_access)
691 		return true;
692 
693 #ifdef CONFIG_LOCKDEP
694 	/*
695 	 * This is a bit complicated to understand, so worth a comment. What we assert
696 	 * here is that the GPU reset is not running on another thread in parallel.
697 	 *
698 	 * For this we trylock the read side of the reset semaphore, if that succeeds
699 	 * we know that the reset is not running in parallel.
700 	 *
701 	 * If the trylock fails we assert that we are either already holding the read
702 	 * side of the lock or are the reset thread itself and hold the write side of
703 	 * the lock.
704 	 */
705 	if (in_task()) {
706 		if (down_read_trylock(&adev->reset_domain->sem))
707 			up_read(&adev->reset_domain->sem);
708 		else
709 			lockdep_assert_held(&adev->reset_domain->sem);
710 	}
711 #endif
712 	return false;
713 }
714 
715 /**
716  * amdgpu_device_rreg - read a memory mapped IO or indirect register
717  *
718  * @adev: amdgpu_device pointer
719  * @reg: dword aligned register offset
720  * @acc_flags: access flags which require special behavior
721  *
722  * Returns the 32 bit value from the offset specified.
723  */
amdgpu_device_rreg(struct amdgpu_device * adev,uint32_t reg,uint32_t acc_flags)724 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
725 			    uint32_t reg, uint32_t acc_flags)
726 {
727 	uint32_t ret;
728 
729 	if (amdgpu_device_skip_hw_access(adev))
730 		return 0;
731 
732 	if ((reg * 4) < adev->rmmio_size) {
733 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
734 		    amdgpu_sriov_runtime(adev) &&
735 		    down_read_trylock(&adev->reset_domain->sem)) {
736 			ret = amdgpu_kiq_rreg(adev, reg, 0);
737 			up_read(&adev->reset_domain->sem);
738 		} else {
739 			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
740 		}
741 	} else {
742 		ret = adev->pcie_rreg(adev, reg * 4);
743 	}
744 
745 	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
746 
747 	return ret;
748 }
749 
750 /*
751  * MMIO register read with bytes helper functions
752  * @offset:bytes offset from MMIO start
753  */
754 
755 /**
756  * amdgpu_mm_rreg8 - read a memory mapped IO register
757  *
758  * @adev: amdgpu_device pointer
759  * @offset: byte aligned register offset
760  *
761  * Returns the 8 bit value from the offset specified.
762  */
amdgpu_mm_rreg8(struct amdgpu_device * adev,uint32_t offset)763 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
764 {
765 	if (amdgpu_device_skip_hw_access(adev))
766 		return 0;
767 
768 	if (offset < adev->rmmio_size)
769 		return (readb(adev->rmmio + offset));
770 	BUG();
771 }
772 
773 
774 /**
775  * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
776  *
777  * @adev: amdgpu_device pointer
778  * @reg: dword aligned register offset
779  * @acc_flags: access flags which require special behavior
780  * @xcc_id: xcc accelerated compute core id
781  *
782  * Returns the 32 bit value from the offset specified.
783  */
amdgpu_device_xcc_rreg(struct amdgpu_device * adev,uint32_t reg,uint32_t acc_flags,uint32_t xcc_id)784 uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
785 				uint32_t reg, uint32_t acc_flags,
786 				uint32_t xcc_id)
787 {
788 	uint32_t ret, rlcg_flag;
789 
790 	if (amdgpu_device_skip_hw_access(adev))
791 		return 0;
792 
793 	if ((reg * 4) < adev->rmmio_size) {
794 		if (amdgpu_sriov_vf(adev) &&
795 		    !amdgpu_sriov_runtime(adev) &&
796 		    adev->gfx.rlc.rlcg_reg_access_supported &&
797 		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
798 							 GC_HWIP, false,
799 							 &rlcg_flag)) {
800 			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
801 		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
802 		    amdgpu_sriov_runtime(adev) &&
803 		    down_read_trylock(&adev->reset_domain->sem)) {
804 			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
805 			up_read(&adev->reset_domain->sem);
806 		} else {
807 			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
808 		}
809 	} else {
810 		ret = adev->pcie_rreg(adev, reg * 4);
811 	}
812 
813 	return ret;
814 }
815 
816 /*
817  * MMIO register write with bytes helper functions
818  * @offset:bytes offset from MMIO start
819  * @value: the value want to be written to the register
820  */
821 
822 /**
823  * amdgpu_mm_wreg8 - read a memory mapped IO register
824  *
825  * @adev: amdgpu_device pointer
826  * @offset: byte aligned register offset
827  * @value: 8 bit value to write
828  *
829  * Writes the value specified to the offset specified.
830  */
amdgpu_mm_wreg8(struct amdgpu_device * adev,uint32_t offset,uint8_t value)831 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
832 {
833 	if (amdgpu_device_skip_hw_access(adev))
834 		return;
835 
836 	if (offset < adev->rmmio_size)
837 		writeb(value, adev->rmmio + offset);
838 	else
839 		BUG();
840 }
841 
842 /**
843  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
844  *
845  * @adev: amdgpu_device pointer
846  * @reg: dword aligned register offset
847  * @v: 32 bit value to write to the register
848  * @acc_flags: access flags which require special behavior
849  *
850  * Writes the value specified to the offset specified.
851  */
amdgpu_device_wreg(struct amdgpu_device * adev,uint32_t reg,uint32_t v,uint32_t acc_flags)852 void amdgpu_device_wreg(struct amdgpu_device *adev,
853 			uint32_t reg, uint32_t v,
854 			uint32_t acc_flags)
855 {
856 	if (amdgpu_device_skip_hw_access(adev))
857 		return;
858 
859 	if ((reg * 4) < adev->rmmio_size) {
860 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
861 		    amdgpu_sriov_runtime(adev) &&
862 		    down_read_trylock(&adev->reset_domain->sem)) {
863 			amdgpu_kiq_wreg(adev, reg, v, 0);
864 			up_read(&adev->reset_domain->sem);
865 		} else {
866 			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
867 		}
868 	} else {
869 		adev->pcie_wreg(adev, reg * 4, v);
870 	}
871 
872 	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
873 }
874 
875 /**
876  * amdgpu_mm_wreg_mmio_rlc -  write register either with direct/indirect mmio or with RLC path if in range
877  *
878  * @adev: amdgpu_device pointer
879  * @reg: mmio/rlc register
880  * @v: value to write
881  * @xcc_id: xcc accelerated compute core id
882  *
883  * this function is invoked only for the debugfs register access
884  */
amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device * adev,uint32_t reg,uint32_t v,uint32_t xcc_id)885 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
886 			     uint32_t reg, uint32_t v,
887 			     uint32_t xcc_id)
888 {
889 	if (amdgpu_device_skip_hw_access(adev))
890 		return;
891 
892 	if (amdgpu_sriov_fullaccess(adev) &&
893 	    adev->gfx.rlc.funcs &&
894 	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
895 		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
896 			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
897 	} else if ((reg * 4) >= adev->rmmio_size) {
898 		adev->pcie_wreg(adev, reg * 4, v);
899 	} else {
900 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
901 	}
902 }
903 
904 /**
905  * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
906  *
907  * @adev: amdgpu_device pointer
908  * @reg: dword aligned register offset
909  * @v: 32 bit value to write to the register
910  * @acc_flags: access flags which require special behavior
911  * @xcc_id: xcc accelerated compute core id
912  *
913  * Writes the value specified to the offset specified.
914  */
amdgpu_device_xcc_wreg(struct amdgpu_device * adev,uint32_t reg,uint32_t v,uint32_t acc_flags,uint32_t xcc_id)915 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
916 			uint32_t reg, uint32_t v,
917 			uint32_t acc_flags, uint32_t xcc_id)
918 {
919 	uint32_t rlcg_flag;
920 
921 	if (amdgpu_device_skip_hw_access(adev))
922 		return;
923 
924 	if ((reg * 4) < adev->rmmio_size) {
925 		if (amdgpu_sriov_vf(adev) &&
926 		    !amdgpu_sriov_runtime(adev) &&
927 		    adev->gfx.rlc.rlcg_reg_access_supported &&
928 		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
929 							 GC_HWIP, true,
930 							 &rlcg_flag)) {
931 			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
932 		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
933 		    amdgpu_sriov_runtime(adev) &&
934 		    down_read_trylock(&adev->reset_domain->sem)) {
935 			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
936 			up_read(&adev->reset_domain->sem);
937 		} else {
938 			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
939 		}
940 	} else {
941 		adev->pcie_wreg(adev, reg * 4, v);
942 	}
943 }
944 
945 /**
946  * amdgpu_device_indirect_rreg - read an indirect register
947  *
948  * @adev: amdgpu_device pointer
949  * @reg_addr: indirect register address to read from
950  *
951  * Returns the value of indirect register @reg_addr
952  */
amdgpu_device_indirect_rreg(struct amdgpu_device * adev,u32 reg_addr)953 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
954 				u32 reg_addr)
955 {
956 	unsigned long flags, pcie_index, pcie_data;
957 	void __iomem *pcie_index_offset;
958 	void __iomem *pcie_data_offset;
959 	u32 r;
960 
961 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
962 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
963 
964 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
965 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
966 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
967 
968 	writel(reg_addr, pcie_index_offset);
969 	readl(pcie_index_offset);
970 	r = readl(pcie_data_offset);
971 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
972 
973 	return r;
974 }
975 
amdgpu_device_indirect_rreg_ext(struct amdgpu_device * adev,u64 reg_addr)976 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
977 				    u64 reg_addr)
978 {
979 	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
980 	u32 r;
981 	void __iomem *pcie_index_offset;
982 	void __iomem *pcie_index_hi_offset;
983 	void __iomem *pcie_data_offset;
984 
985 	if (unlikely(!adev->nbio.funcs)) {
986 		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
987 		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
988 	} else {
989 		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
990 		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
991 	}
992 
993 	if (reg_addr >> 32) {
994 		if (unlikely(!adev->nbio.funcs))
995 			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
996 		else
997 			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
998 	} else {
999 		pcie_index_hi = 0;
1000 	}
1001 
1002 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1003 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1004 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1005 	if (pcie_index_hi != 0)
1006 		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1007 				pcie_index_hi * 4;
1008 
1009 	writel(reg_addr, pcie_index_offset);
1010 	readl(pcie_index_offset);
1011 	if (pcie_index_hi != 0) {
1012 		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1013 		readl(pcie_index_hi_offset);
1014 	}
1015 	r = readl(pcie_data_offset);
1016 
1017 	/* clear the high bits */
1018 	if (pcie_index_hi != 0) {
1019 		writel(0, pcie_index_hi_offset);
1020 		readl(pcie_index_hi_offset);
1021 	}
1022 
1023 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1024 
1025 	return r;
1026 }
1027 
1028 /**
1029  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
1030  *
1031  * @adev: amdgpu_device pointer
1032  * @reg_addr: indirect register address to read from
1033  *
1034  * Returns the value of indirect register @reg_addr
1035  */
amdgpu_device_indirect_rreg64(struct amdgpu_device * adev,u32 reg_addr)1036 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
1037 				  u32 reg_addr)
1038 {
1039 	unsigned long flags, pcie_index, pcie_data;
1040 	void __iomem *pcie_index_offset;
1041 	void __iomem *pcie_data_offset;
1042 	u64 r;
1043 
1044 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1045 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1046 
1047 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1048 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1049 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1050 
1051 	/* read low 32 bits */
1052 	writel(reg_addr, pcie_index_offset);
1053 	readl(pcie_index_offset);
1054 	r = readl(pcie_data_offset);
1055 	/* read high 32 bits */
1056 	writel(reg_addr + 4, pcie_index_offset);
1057 	readl(pcie_index_offset);
1058 	r |= ((u64)readl(pcie_data_offset) << 32);
1059 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1060 
1061 	return r;
1062 }
1063 
amdgpu_device_indirect_rreg64_ext(struct amdgpu_device * adev,u64 reg_addr)1064 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
1065 				  u64 reg_addr)
1066 {
1067 	unsigned long flags, pcie_index, pcie_data;
1068 	unsigned long pcie_index_hi = 0;
1069 	void __iomem *pcie_index_offset;
1070 	void __iomem *pcie_index_hi_offset;
1071 	void __iomem *pcie_data_offset;
1072 	u64 r;
1073 
1074 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1075 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1076 	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1077 		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1078 
1079 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1080 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1081 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1082 	if (pcie_index_hi != 0)
1083 		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1084 			pcie_index_hi * 4;
1085 
1086 	/* read low 32 bits */
1087 	writel(reg_addr, pcie_index_offset);
1088 	readl(pcie_index_offset);
1089 	if (pcie_index_hi != 0) {
1090 		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1091 		readl(pcie_index_hi_offset);
1092 	}
1093 	r = readl(pcie_data_offset);
1094 	/* read high 32 bits */
1095 	writel(reg_addr + 4, pcie_index_offset);
1096 	readl(pcie_index_offset);
1097 	if (pcie_index_hi != 0) {
1098 		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1099 		readl(pcie_index_hi_offset);
1100 	}
1101 	r |= ((u64)readl(pcie_data_offset) << 32);
1102 
1103 	/* clear the high bits */
1104 	if (pcie_index_hi != 0) {
1105 		writel(0, pcie_index_hi_offset);
1106 		readl(pcie_index_hi_offset);
1107 	}
1108 
1109 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1110 
1111 	return r;
1112 }
1113 
1114 /**
1115  * amdgpu_device_indirect_wreg - write an indirect register address
1116  *
1117  * @adev: amdgpu_device pointer
1118  * @reg_addr: indirect register offset
1119  * @reg_data: indirect register data
1120  *
1121  */
amdgpu_device_indirect_wreg(struct amdgpu_device * adev,u32 reg_addr,u32 reg_data)1122 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
1123 				 u32 reg_addr, u32 reg_data)
1124 {
1125 	unsigned long flags, pcie_index, pcie_data;
1126 	void __iomem *pcie_index_offset;
1127 	void __iomem *pcie_data_offset;
1128 
1129 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1130 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1131 
1132 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1133 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1134 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1135 
1136 	writel(reg_addr, pcie_index_offset);
1137 	readl(pcie_index_offset);
1138 	writel(reg_data, pcie_data_offset);
1139 	readl(pcie_data_offset);
1140 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1141 }
1142 
amdgpu_device_indirect_wreg_ext(struct amdgpu_device * adev,u64 reg_addr,u32 reg_data)1143 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
1144 				     u64 reg_addr, u32 reg_data)
1145 {
1146 	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
1147 	void __iomem *pcie_index_offset;
1148 	void __iomem *pcie_index_hi_offset;
1149 	void __iomem *pcie_data_offset;
1150 
1151 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1152 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1153 	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1154 		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1155 	else
1156 		pcie_index_hi = 0;
1157 
1158 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1159 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1160 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1161 	if (pcie_index_hi != 0)
1162 		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1163 				pcie_index_hi * 4;
1164 
1165 	writel(reg_addr, pcie_index_offset);
1166 	readl(pcie_index_offset);
1167 	if (pcie_index_hi != 0) {
1168 		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1169 		readl(pcie_index_hi_offset);
1170 	}
1171 	writel(reg_data, pcie_data_offset);
1172 	readl(pcie_data_offset);
1173 
1174 	/* clear the high bits */
1175 	if (pcie_index_hi != 0) {
1176 		writel(0, pcie_index_hi_offset);
1177 		readl(pcie_index_hi_offset);
1178 	}
1179 
1180 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1181 }
1182 
1183 /**
1184  * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
1185  *
1186  * @adev: amdgpu_device pointer
1187  * @reg_addr: indirect register offset
1188  * @reg_data: indirect register data
1189  *
1190  */
amdgpu_device_indirect_wreg64(struct amdgpu_device * adev,u32 reg_addr,u64 reg_data)1191 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
1192 				   u32 reg_addr, u64 reg_data)
1193 {
1194 	unsigned long flags, pcie_index, pcie_data;
1195 	void __iomem *pcie_index_offset;
1196 	void __iomem *pcie_data_offset;
1197 
1198 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1199 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1200 
1201 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1202 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1203 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1204 
1205 	/* write low 32 bits */
1206 	writel(reg_addr, pcie_index_offset);
1207 	readl(pcie_index_offset);
1208 	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1209 	readl(pcie_data_offset);
1210 	/* write high 32 bits */
1211 	writel(reg_addr + 4, pcie_index_offset);
1212 	readl(pcie_index_offset);
1213 	writel((u32)(reg_data >> 32), pcie_data_offset);
1214 	readl(pcie_data_offset);
1215 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1216 }
1217 
amdgpu_device_indirect_wreg64_ext(struct amdgpu_device * adev,u64 reg_addr,u64 reg_data)1218 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
1219 				   u64 reg_addr, u64 reg_data)
1220 {
1221 	unsigned long flags, pcie_index, pcie_data;
1222 	unsigned long pcie_index_hi = 0;
1223 	void __iomem *pcie_index_offset;
1224 	void __iomem *pcie_index_hi_offset;
1225 	void __iomem *pcie_data_offset;
1226 
1227 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1228 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1229 	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1230 		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1231 
1232 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1233 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1234 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1235 	if (pcie_index_hi != 0)
1236 		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1237 				pcie_index_hi * 4;
1238 
1239 	/* write low 32 bits */
1240 	writel(reg_addr, pcie_index_offset);
1241 	readl(pcie_index_offset);
1242 	if (pcie_index_hi != 0) {
1243 		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1244 		readl(pcie_index_hi_offset);
1245 	}
1246 	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1247 	readl(pcie_data_offset);
1248 	/* write high 32 bits */
1249 	writel(reg_addr + 4, pcie_index_offset);
1250 	readl(pcie_index_offset);
1251 	if (pcie_index_hi != 0) {
1252 		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1253 		readl(pcie_index_hi_offset);
1254 	}
1255 	writel((u32)(reg_data >> 32), pcie_data_offset);
1256 	readl(pcie_data_offset);
1257 
1258 	/* clear the high bits */
1259 	if (pcie_index_hi != 0) {
1260 		writel(0, pcie_index_hi_offset);
1261 		readl(pcie_index_hi_offset);
1262 	}
1263 
1264 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1265 }
1266 
1267 /**
1268  * amdgpu_device_get_rev_id - query device rev_id
1269  *
1270  * @adev: amdgpu_device pointer
1271  *
1272  * Return device rev_id
1273  */
amdgpu_device_get_rev_id(struct amdgpu_device * adev)1274 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
1275 {
1276 	return adev->nbio.funcs->get_rev_id(adev);
1277 }
1278 
1279 /**
1280  * amdgpu_invalid_rreg - dummy reg read function
1281  *
1282  * @adev: amdgpu_device pointer
1283  * @reg: offset of register
1284  *
1285  * Dummy register read function.  Used for register blocks
1286  * that certain asics don't have (all asics).
1287  * Returns the value in the register.
1288  */
amdgpu_invalid_rreg(struct amdgpu_device * adev,uint32_t reg)1289 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
1290 {
1291 	dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg);
1292 	BUG();
1293 	return 0;
1294 }
1295 
amdgpu_invalid_rreg_ext(struct amdgpu_device * adev,uint64_t reg)1296 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
1297 {
1298 	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
1299 	BUG();
1300 	return 0;
1301 }
1302 
1303 /**
1304  * amdgpu_invalid_wreg - dummy reg write function
1305  *
1306  * @adev: amdgpu_device pointer
1307  * @reg: offset of register
1308  * @v: value to write to the register
1309  *
1310  * Dummy register read function.  Used for register blocks
1311  * that certain asics don't have (all asics).
1312  */
amdgpu_invalid_wreg(struct amdgpu_device * adev,uint32_t reg,uint32_t v)1313 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
1314 {
1315 	dev_err(adev->dev,
1316 		"Invalid callback to write register 0x%04X with 0x%08X\n", reg,
1317 		v);
1318 	BUG();
1319 }
1320 
amdgpu_invalid_wreg_ext(struct amdgpu_device * adev,uint64_t reg,uint32_t v)1321 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
1322 {
1323 	dev_err(adev->dev,
1324 		"Invalid callback to write register 0x%llX with 0x%08X\n", reg,
1325 		v);
1326 	BUG();
1327 }
1328 
1329 /**
1330  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
1331  *
1332  * @adev: amdgpu_device pointer
1333  * @reg: offset of register
1334  *
1335  * Dummy register read function.  Used for register blocks
1336  * that certain asics don't have (all asics).
1337  * Returns the value in the register.
1338  */
amdgpu_invalid_rreg64(struct amdgpu_device * adev,uint32_t reg)1339 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
1340 {
1341 	dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n",
1342 		reg);
1343 	BUG();
1344 	return 0;
1345 }
1346 
amdgpu_invalid_rreg64_ext(struct amdgpu_device * adev,uint64_t reg)1347 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
1348 {
1349 	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
1350 	BUG();
1351 	return 0;
1352 }
1353 
1354 /**
1355  * amdgpu_invalid_wreg64 - dummy reg write function
1356  *
1357  * @adev: amdgpu_device pointer
1358  * @reg: offset of register
1359  * @v: value to write to the register
1360  *
1361  * Dummy register read function.  Used for register blocks
1362  * that certain asics don't have (all asics).
1363  */
amdgpu_invalid_wreg64(struct amdgpu_device * adev,uint32_t reg,uint64_t v)1364 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
1365 {
1366 	dev_err(adev->dev,
1367 		"Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
1368 		reg, v);
1369 	BUG();
1370 }
1371 
amdgpu_invalid_wreg64_ext(struct amdgpu_device * adev,uint64_t reg,uint64_t v)1372 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
1373 {
1374 	dev_err(adev->dev,
1375 		"Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
1376 		reg, v);
1377 	BUG();
1378 }
1379 
1380 /**
1381  * amdgpu_block_invalid_rreg - dummy reg read function
1382  *
1383  * @adev: amdgpu_device pointer
1384  * @block: offset of instance
1385  * @reg: offset of register
1386  *
1387  * Dummy register read function.  Used for register blocks
1388  * that certain asics don't have (all asics).
1389  * Returns the value in the register.
1390  */
amdgpu_block_invalid_rreg(struct amdgpu_device * adev,uint32_t block,uint32_t reg)1391 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
1392 					  uint32_t block, uint32_t reg)
1393 {
1394 	dev_err(adev->dev,
1395 		"Invalid callback to read register 0x%04X in block 0x%04X\n",
1396 		reg, block);
1397 	BUG();
1398 	return 0;
1399 }
1400 
1401 /**
1402  * amdgpu_block_invalid_wreg - dummy reg write function
1403  *
1404  * @adev: amdgpu_device pointer
1405  * @block: offset of instance
1406  * @reg: offset of register
1407  * @v: value to write to the register
1408  *
1409  * Dummy register read function.  Used for register blocks
1410  * that certain asics don't have (all asics).
1411  */
amdgpu_block_invalid_wreg(struct amdgpu_device * adev,uint32_t block,uint32_t reg,uint32_t v)1412 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
1413 				      uint32_t block,
1414 				      uint32_t reg, uint32_t v)
1415 {
1416 	dev_err(adev->dev,
1417 		"Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
1418 		reg, block, v);
1419 	BUG();
1420 }
1421 
amdgpu_device_get_vbios_flags(struct amdgpu_device * adev)1422 static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
1423 {
1424 	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1425 		return AMDGPU_VBIOS_SKIP;
1426 
1427 	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
1428 		return AMDGPU_VBIOS_OPTIONAL;
1429 
1430 	return 0;
1431 }
1432 
1433 /**
1434  * amdgpu_device_asic_init - Wrapper for atom asic_init
1435  *
1436  * @adev: amdgpu_device pointer
1437  *
1438  * Does any asic specific work and then calls atom asic init.
1439  */
amdgpu_device_asic_init(struct amdgpu_device * adev)1440 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
1441 {
1442 	uint32_t flags;
1443 	bool optional;
1444 	int ret;
1445 
1446 	amdgpu_asic_pre_asic_init(adev);
1447 	flags = amdgpu_device_get_vbios_flags(adev);
1448 	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));
1449 
1450 	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
1451 	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
1452 	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
1453 	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
1454 		amdgpu_psp_wait_for_bootloader(adev);
1455 		if (optional && !adev->bios)
1456 			return 0;
1457 
1458 		ret = amdgpu_atomfirmware_asic_init(adev, true);
1459 		return ret;
1460 	} else {
1461 		if (optional && !adev->bios)
1462 			return 0;
1463 
1464 		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
1465 	}
1466 
1467 	return 0;
1468 }
1469 
1470 /**
1471  * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
1472  *
1473  * @adev: amdgpu_device pointer
1474  *
1475  * Allocates a scratch page of VRAM for use by various things in the
1476  * driver.
1477  */
amdgpu_device_mem_scratch_init(struct amdgpu_device * adev)1478 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
1479 {
1480 	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
1481 				       AMDGPU_GEM_DOMAIN_VRAM |
1482 				       AMDGPU_GEM_DOMAIN_GTT,
1483 				       &adev->mem_scratch.robj,
1484 				       &adev->mem_scratch.gpu_addr,
1485 				       (void **)&adev->mem_scratch.ptr);
1486 }
1487 
1488 /**
1489  * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
1490  *
1491  * @adev: amdgpu_device pointer
1492  *
1493  * Frees the VRAM scratch page.
1494  */
amdgpu_device_mem_scratch_fini(struct amdgpu_device * adev)1495 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
1496 {
1497 	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
1498 }
1499 
1500 /**
1501  * amdgpu_device_program_register_sequence - program an array of registers.
1502  *
1503  * @adev: amdgpu_device pointer
1504  * @registers: pointer to the register array
1505  * @array_size: size of the register array
1506  *
1507  * Programs an array or registers with and or masks.
1508  * This is a helper for setting golden registers.
1509  */
amdgpu_device_program_register_sequence(struct amdgpu_device * adev,const u32 * registers,const u32 array_size)1510 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
1511 					     const u32 *registers,
1512 					     const u32 array_size)
1513 {
1514 	u32 tmp, reg, and_mask, or_mask;
1515 	int i;
1516 
1517 	if (array_size % 3)
1518 		return;
1519 
1520 	for (i = 0; i < array_size; i += 3) {
1521 		reg = registers[i + 0];
1522 		and_mask = registers[i + 1];
1523 		or_mask = registers[i + 2];
1524 
1525 		if (and_mask == 0xffffffff) {
1526 			tmp = or_mask;
1527 		} else {
1528 			tmp = RREG32(reg);
1529 			tmp &= ~and_mask;
1530 			if (adev->family >= AMDGPU_FAMILY_AI)
1531 				tmp |= (or_mask & and_mask);
1532 			else
1533 				tmp |= or_mask;
1534 		}
1535 		WREG32(reg, tmp);
1536 	}
1537 }
1538 
1539 /**
1540  * amdgpu_device_pci_config_reset - reset the GPU
1541  *
1542  * @adev: amdgpu_device pointer
1543  *
1544  * Resets the GPU using the pci config reset sequence.
1545  * Only applicable to asics prior to vega10.
1546  */
amdgpu_device_pci_config_reset(struct amdgpu_device * adev)1547 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
1548 {
1549 	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1550 }
1551 
1552 /**
1553  * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1554  *
1555  * @adev: amdgpu_device pointer
1556  *
1557  * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1558  */
amdgpu_device_pci_reset(struct amdgpu_device * adev)1559 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1560 {
1561 	return pci_reset_function(adev->pdev);
1562 }
1563 
1564 /*
1565  * amdgpu_device_wb_*()
1566  * Writeback is the method by which the GPU updates special pages in memory
1567  * with the status of certain GPU events (fences, ring pointers,etc.).
1568  */
1569 
1570 /**
1571  * amdgpu_device_wb_fini - Disable Writeback and free memory
1572  *
1573  * @adev: amdgpu_device pointer
1574  *
1575  * Disables Writeback and frees the Writeback memory (all asics).
1576  * Used at driver shutdown.
1577  */
amdgpu_device_wb_fini(struct amdgpu_device * adev)1578 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1579 {
1580 	if (adev->wb.wb_obj) {
1581 		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1582 				      &adev->wb.gpu_addr,
1583 				      (void **)&adev->wb.wb);
1584 		adev->wb.wb_obj = NULL;
1585 	}
1586 }
1587 
1588 /**
1589  * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1590  *
1591  * @adev: amdgpu_device pointer
1592  *
1593  * Initializes writeback and allocates writeback memory (all asics).
1594  * Used at driver startup.
1595  * Returns 0 on success or an -error on failure.
1596  */
amdgpu_device_wb_init(struct amdgpu_device * adev)1597 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1598 {
1599 	int r;
1600 
1601 	if (adev->wb.wb_obj == NULL) {
1602 		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1603 		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1604 					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1605 					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
1606 					    (void **)&adev->wb.wb);
1607 		if (r) {
1608 			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1609 			return r;
1610 		}
1611 
1612 		adev->wb.num_wb = AMDGPU_MAX_WB;
1613 		memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1614 
1615 		/* clear wb memory */
1616 		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1617 	}
1618 
1619 	return 0;
1620 }
1621 
1622 /**
1623  * amdgpu_device_wb_get - Allocate a wb entry
1624  *
1625  * @adev: amdgpu_device pointer
1626  * @wb: wb index
1627  *
1628  * Allocate a wb slot for use by the driver (all asics).
1629  * Returns 0 on success or -EINVAL on failure.
1630  */
amdgpu_device_wb_get(struct amdgpu_device * adev,u32 * wb)1631 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1632 {
1633 	unsigned long flags, offset;
1634 
1635 	spin_lock_irqsave(&adev->wb.lock, flags);
1636 	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1637 	if (offset < adev->wb.num_wb) {
1638 		__set_bit(offset, adev->wb.used);
1639 		spin_unlock_irqrestore(&adev->wb.lock, flags);
1640 		*wb = offset << 3; /* convert to dw offset */
1641 		return 0;
1642 	} else {
1643 		spin_unlock_irqrestore(&adev->wb.lock, flags);
1644 		return -EINVAL;
1645 	}
1646 }
1647 
1648 /**
1649  * amdgpu_device_wb_free - Free a wb entry
1650  *
1651  * @adev: amdgpu_device pointer
1652  * @wb: wb index
1653  *
1654  * Free a wb slot allocated for use by the driver (all asics)
1655  */
amdgpu_device_wb_free(struct amdgpu_device * adev,u32 wb)1656 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1657 {
1658 	unsigned long flags;
1659 
1660 	wb >>= 3;
1661 	spin_lock_irqsave(&adev->wb.lock, flags);
1662 	if (wb < adev->wb.num_wb)
1663 		__clear_bit(wb, adev->wb.used);
1664 	spin_unlock_irqrestore(&adev->wb.lock, flags);
1665 }
1666 
1667 /**
1668  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1669  *
1670  * @adev: amdgpu_device pointer
1671  *
1672  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1673  * to fail, but if any of the BARs is not accessible after the size we abort
1674  * driver loading by returning -ENODEV.
1675  */
amdgpu_device_resize_fb_bar(struct amdgpu_device * adev)1676 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1677 {
1678 	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1679 	struct pci_bus *root;
1680 	struct resource *res;
1681 	int max_size, r;
1682 	unsigned int i;
1683 	u16 cmd;
1684 
1685 	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1686 		return 0;
1687 
1688 	/* Bypass for VF */
1689 	if (amdgpu_sriov_vf(adev))
1690 		return 0;
1691 
1692 	if (!amdgpu_rebar)
1693 		return 0;
1694 
1695 	/* resizing on Dell G5 SE platforms causes problems with runtime pm */
1696 	if ((amdgpu_runtime_pm != 0) &&
1697 	    adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
1698 	    adev->pdev->device == 0x731f &&
1699 	    adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
1700 		return 0;
1701 
1702 	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
1703 	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
1704 		dev_warn(
1705 			adev->dev,
1706 			"System can't access extended configuration space, please check!!\n");
1707 
1708 	/* skip if the bios has already enabled large BAR */
1709 	if (adev->gmc.real_vram_size &&
1710 	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1711 		return 0;
1712 
1713 	/* Check if the root BUS has 64bit memory resources */
1714 	root = adev->pdev->bus;
1715 	while (root->parent)
1716 		root = root->parent;
1717 
1718 	pci_bus_for_each_resource(root, res, i) {
1719 		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1720 		    res->start > 0x100000000ull)
1721 			break;
1722 	}
1723 
1724 	/* Trying to resize is pointless without a root hub window above 4GB */
1725 	if (!res)
1726 		return 0;
1727 
1728 	/* Limit the BAR size to what is available */
1729 	max_size = pci_rebar_get_max_size(adev->pdev, 0);
1730 	if (max_size < 0)
1731 		return 0;
1732 	rbar_size = min(max_size, rbar_size);
1733 
1734 	/* Disable memory decoding while we change the BAR addresses and size */
1735 	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1736 	pci_write_config_word(adev->pdev, PCI_COMMAND,
1737 			      cmd & ~PCI_COMMAND_MEMORY);
1738 
1739 	/* Tear down doorbell as resizing will release BARs */
1740 	amdgpu_doorbell_fini(adev);
1741 
1742 	r = pci_resize_resource(adev->pdev, 0, rbar_size,
1743 				(adev->asic_type >= CHIP_BONAIRE) ? 1 << 5
1744 								  : 1 << 2);
1745 	if (r == -ENOSPC)
1746 		dev_info(adev->dev,
1747 			 "Not enough PCI address space for a large BAR.");
1748 	else if (r && r != -ENOTSUPP)
1749 		dev_err(adev->dev, "Problem resizing BAR0 (%d).", r);
1750 
1751 	/* When the doorbell or fb BAR isn't available we have no chance of
1752 	 * using the device.
1753 	 */
1754 	r = amdgpu_doorbell_init(adev);
1755 	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1756 		return -ENODEV;
1757 
1758 	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1759 
1760 	return 0;
1761 }
1762 
1763 /*
1764  * GPU helpers function.
1765  */
1766 /**
1767  * amdgpu_device_need_post - check if the hw need post or not
1768  *
1769  * @adev: amdgpu_device pointer
1770  *
1771  * Check if the asic has been initialized (all asics) at driver startup
1772  * or post is needed if  hw reset is performed.
1773  * Returns true if need or false if not.
1774  */
amdgpu_device_need_post(struct amdgpu_device * adev)1775 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1776 {
1777 	uint32_t reg, flags;
1778 
1779 	if (amdgpu_sriov_vf(adev))
1780 		return false;
1781 
1782 	flags = amdgpu_device_get_vbios_flags(adev);
1783 	if (flags & AMDGPU_VBIOS_SKIP)
1784 		return false;
1785 	if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
1786 		return false;
1787 
1788 	if (amdgpu_passthrough(adev)) {
1789 		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
1790 		 * some old smc fw still need driver do vPost otherwise gpu hang, while
1791 		 * those smc fw version above 22.15 doesn't have this flaw, so we force
1792 		 * vpost executed for smc version below 22.15
1793 		 */
1794 		if (adev->asic_type == CHIP_FIJI) {
1795 			int err;
1796 			uint32_t fw_ver;
1797 
1798 			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1799 			/* force vPost if error occurred */
1800 			if (err)
1801 				return true;
1802 
1803 			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1804 			release_firmware(adev->pm.fw);
1805 			if (fw_ver < 0x00160e00)
1806 				return true;
1807 		}
1808 	}
1809 
1810 	/* Don't post if we need to reset whole hive on init */
1811 	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
1812 		return false;
1813 
1814 	if (adev->has_hw_reset) {
1815 		adev->has_hw_reset = false;
1816 		return true;
1817 	}
1818 
1819 	/* bios scratch used on CIK+ */
1820 	if (adev->asic_type >= CHIP_BONAIRE)
1821 		return amdgpu_atombios_scratch_need_asic_init(adev);
1822 
1823 	/* check MEM_SIZE for older asics */
1824 	reg = amdgpu_asic_get_config_memsize(adev);
1825 
1826 	if ((reg != 0) && (reg != 0xffffffff))
1827 		return false;
1828 
1829 	return true;
1830 }
1831 
1832 /*
1833  * Check whether seamless boot is supported.
1834  *
1835  * So far we only support seamless boot on DCE 3.0 or later.
1836  * If users report that it works on older ASICS as well, we may
1837  * loosen this.
1838  */
amdgpu_device_seamless_boot_supported(struct amdgpu_device * adev)1839 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
1840 {
1841 	switch (amdgpu_seamless) {
1842 	case -1:
1843 		break;
1844 	case 1:
1845 		return true;
1846 	case 0:
1847 		return false;
1848 	default:
1849 		dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n",
1850 			amdgpu_seamless);
1851 		return false;
1852 	}
1853 
1854 	if (!(adev->flags & AMD_IS_APU))
1855 		return false;
1856 
1857 	if (adev->mman.keep_stolen_vga_memory)
1858 		return false;
1859 
1860 	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
1861 }
1862 
1863 /*
1864  * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
1865  * don't support dynamic speed switching. Until we have confirmation from Intel
1866  * that a specific host supports it, it's safer that we keep it disabled for all.
1867  *
1868  * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1869  * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1870  */
amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device * adev)1871 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
1872 {
1873 #if IS_ENABLED(CONFIG_X86)
1874 	struct cpuinfo_x86 *c = &cpu_data(0);
1875 
1876 	/* eGPU change speeds based on USB4 fabric conditions */
1877 	if (dev_is_removable(adev->dev))
1878 		return true;
1879 
1880 	if (c->x86_vendor == X86_VENDOR_INTEL)
1881 		return false;
1882 #endif
1883 	return true;
1884 }
1885 
amdgpu_device_aspm_support_quirk(struct amdgpu_device * adev)1886 static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
1887 {
1888 	/* Enabling ASPM causes randoms hangs on Tahiti and Oland on Zen4.
1889 	 * It's unclear if this is a platform-specific or GPU-specific issue.
1890 	 * Disable ASPM on SI for the time being.
1891 	 */
1892 	if (adev->family == AMDGPU_FAMILY_SI)
1893 		return true;
1894 
1895 #if IS_ENABLED(CONFIG_X86)
1896 	struct cpuinfo_x86 *c = &cpu_data(0);
1897 
1898 	if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) ||
1899 		  amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1)))
1900 		return false;
1901 
1902 	if (c->x86 == 6 &&
1903 		adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) {
1904 		switch (c->x86_model) {
1905 		case VFM_MODEL(INTEL_ALDERLAKE):
1906 		case VFM_MODEL(INTEL_ALDERLAKE_L):
1907 		case VFM_MODEL(INTEL_RAPTORLAKE):
1908 		case VFM_MODEL(INTEL_RAPTORLAKE_P):
1909 		case VFM_MODEL(INTEL_RAPTORLAKE_S):
1910 			return true;
1911 		default:
1912 			return false;
1913 		}
1914 	} else {
1915 		return false;
1916 	}
1917 #else
1918 	return false;
1919 #endif
1920 }
1921 
1922 /**
1923  * amdgpu_device_should_use_aspm - check if the device should program ASPM
1924  *
1925  * @adev: amdgpu_device pointer
1926  *
1927  * Confirm whether the module parameter and pcie bridge agree that ASPM should
1928  * be set for this device.
1929  *
1930  * Returns true if it should be used or false if not.
1931  */
amdgpu_device_should_use_aspm(struct amdgpu_device * adev)1932 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1933 {
1934 	switch (amdgpu_aspm) {
1935 	case -1:
1936 		break;
1937 	case 0:
1938 		return false;
1939 	case 1:
1940 		return true;
1941 	default:
1942 		return false;
1943 	}
1944 	if (adev->flags & AMD_IS_APU)
1945 		return false;
1946 	if (amdgpu_device_aspm_support_quirk(adev))
1947 		return false;
1948 	return pcie_aspm_enabled(adev->pdev);
1949 }
1950 
1951 /* if we get transitioned to only one device, take VGA back */
1952 /**
1953  * amdgpu_device_vga_set_decode - enable/disable vga decode
1954  *
1955  * @pdev: PCI device pointer
1956  * @state: enable/disable vga decode
1957  *
1958  * Enable/disable vga decode (all asics).
1959  * Returns VGA resource flags.
1960  */
amdgpu_device_vga_set_decode(struct pci_dev * pdev,bool state)1961 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1962 		bool state)
1963 {
1964 	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1965 
1966 	amdgpu_asic_set_vga_state(adev, state);
1967 	if (state)
1968 		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1969 		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1970 	else
1971 		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1972 }
1973 
1974 /**
1975  * amdgpu_device_check_block_size - validate the vm block size
1976  *
1977  * @adev: amdgpu_device pointer
1978  *
1979  * Validates the vm block size specified via module parameter.
1980  * The vm block size defines number of bits in page table versus page directory,
1981  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1982  * page table and the remaining bits are in the page directory.
1983  */
amdgpu_device_check_block_size(struct amdgpu_device * adev)1984 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1985 {
1986 	/* defines number of bits in page table versus page directory,
1987 	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1988 	 * page table and the remaining bits are in the page directory
1989 	 */
1990 	if (amdgpu_vm_block_size == -1)
1991 		return;
1992 
1993 	if (amdgpu_vm_block_size < 9) {
1994 		dev_warn(adev->dev, "VM page table size (%d) too small\n",
1995 			 amdgpu_vm_block_size);
1996 		amdgpu_vm_block_size = -1;
1997 	}
1998 }
1999 
2000 /**
2001  * amdgpu_device_check_vm_size - validate the vm size
2002  *
2003  * @adev: amdgpu_device pointer
2004  *
2005  * Validates the vm size in GB specified via module parameter.
2006  * The VM size is the size of the GPU virtual memory space in GB.
2007  */
amdgpu_device_check_vm_size(struct amdgpu_device * adev)2008 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
2009 {
2010 	/* no need to check the default value */
2011 	if (amdgpu_vm_size == -1)
2012 		return;
2013 
2014 	if (amdgpu_vm_size < 1) {
2015 		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
2016 			 amdgpu_vm_size);
2017 		amdgpu_vm_size = -1;
2018 	}
2019 }
2020 
amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device * adev)2021 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
2022 {
2023 	struct sysinfo si;
2024 	bool is_os_64 = (sizeof(void *) == 8);
2025 	uint64_t total_memory;
2026 	uint64_t dram_size_seven_GB = 0x1B8000000;
2027 	uint64_t dram_size_three_GB = 0xB8000000;
2028 
2029 	if (amdgpu_smu_memory_pool_size == 0)
2030 		return;
2031 
2032 	if (!is_os_64) {
2033 		dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n");
2034 		goto def_value;
2035 	}
2036 	si_meminfo(&si);
2037 	total_memory = (uint64_t)si.totalram * si.mem_unit;
2038 
2039 	if ((amdgpu_smu_memory_pool_size == 1) ||
2040 		(amdgpu_smu_memory_pool_size == 2)) {
2041 		if (total_memory < dram_size_three_GB)
2042 			goto def_value1;
2043 	} else if ((amdgpu_smu_memory_pool_size == 4) ||
2044 		(amdgpu_smu_memory_pool_size == 8)) {
2045 		if (total_memory < dram_size_seven_GB)
2046 			goto def_value1;
2047 	} else {
2048 		dev_warn(adev->dev, "Smu memory pool size not supported\n");
2049 		goto def_value;
2050 	}
2051 	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
2052 
2053 	return;
2054 
2055 def_value1:
2056 	dev_warn(adev->dev, "No enough system memory\n");
2057 def_value:
2058 	adev->pm.smu_prv_buffer_size = 0;
2059 }
2060 
amdgpu_device_init_apu_flags(struct amdgpu_device * adev)2061 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
2062 {
2063 	if (!(adev->flags & AMD_IS_APU) ||
2064 	    adev->asic_type < CHIP_RAVEN)
2065 		return 0;
2066 
2067 	switch (adev->asic_type) {
2068 	case CHIP_RAVEN:
2069 		if (adev->pdev->device == 0x15dd)
2070 			adev->apu_flags |= AMD_APU_IS_RAVEN;
2071 		if (adev->pdev->device == 0x15d8)
2072 			adev->apu_flags |= AMD_APU_IS_PICASSO;
2073 		break;
2074 	case CHIP_RENOIR:
2075 		if ((adev->pdev->device == 0x1636) ||
2076 		    (adev->pdev->device == 0x164c))
2077 			adev->apu_flags |= AMD_APU_IS_RENOIR;
2078 		else
2079 			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
2080 		break;
2081 	case CHIP_VANGOGH:
2082 		adev->apu_flags |= AMD_APU_IS_VANGOGH;
2083 		break;
2084 	case CHIP_YELLOW_CARP:
2085 		break;
2086 	case CHIP_CYAN_SKILLFISH:
2087 		if ((adev->pdev->device == 0x13FE) ||
2088 		    (adev->pdev->device == 0x143F))
2089 			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
2090 		break;
2091 	default:
2092 		break;
2093 	}
2094 
2095 	return 0;
2096 }
2097 
2098 /**
2099  * amdgpu_device_check_arguments - validate module params
2100  *
2101  * @adev: amdgpu_device pointer
2102  *
2103  * Validates certain module parameters and updates
2104  * the associated values used by the driver (all asics).
2105  */
amdgpu_device_check_arguments(struct amdgpu_device * adev)2106 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
2107 {
2108 	int i;
2109 
2110 	if (amdgpu_sched_jobs < 4) {
2111 		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
2112 			 amdgpu_sched_jobs);
2113 		amdgpu_sched_jobs = 4;
2114 	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
2115 		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
2116 			 amdgpu_sched_jobs);
2117 		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
2118 	}
2119 
2120 	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
2121 		/* gart size must be greater or equal to 32M */
2122 		dev_warn(adev->dev, "gart size (%d) too small\n",
2123 			 amdgpu_gart_size);
2124 		amdgpu_gart_size = -1;
2125 	}
2126 
2127 	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
2128 		/* gtt size must be greater or equal to 32M */
2129 		dev_warn(adev->dev, "gtt size (%d) too small\n",
2130 				 amdgpu_gtt_size);
2131 		amdgpu_gtt_size = -1;
2132 	}
2133 
2134 	/* valid range is between 4 and 9 inclusive */
2135 	if (amdgpu_vm_fragment_size != -1 &&
2136 	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
2137 		dev_warn(adev->dev, "valid range is between 4 and 9\n");
2138 		amdgpu_vm_fragment_size = -1;
2139 	}
2140 
2141 	if (amdgpu_sched_hw_submission < 2) {
2142 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
2143 			 amdgpu_sched_hw_submission);
2144 		amdgpu_sched_hw_submission = 2;
2145 	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
2146 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
2147 			 amdgpu_sched_hw_submission);
2148 		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
2149 	}
2150 
2151 	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
2152 		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
2153 		amdgpu_reset_method = -1;
2154 	}
2155 
2156 	amdgpu_device_check_smu_prv_buffer_size(adev);
2157 
2158 	amdgpu_device_check_vm_size(adev);
2159 
2160 	amdgpu_device_check_block_size(adev);
2161 
2162 	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
2163 
2164 	for (i = 0; i < MAX_XCP; i++) {
2165 		switch (amdgpu_enforce_isolation) {
2166 		case -1:
2167 		case 0:
2168 		default:
2169 			/* disable */
2170 			adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE;
2171 			break;
2172 		case 1:
2173 			/* enable */
2174 			adev->enforce_isolation[i] =
2175 				AMDGPU_ENFORCE_ISOLATION_ENABLE;
2176 			break;
2177 		case 2:
2178 			/* enable legacy mode */
2179 			adev->enforce_isolation[i] =
2180 				AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY;
2181 			break;
2182 		case 3:
2183 			/* enable only process isolation without submitting cleaner shader */
2184 			adev->enforce_isolation[i] =
2185 				AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER;
2186 			break;
2187 		}
2188 	}
2189 
2190 	return 0;
2191 }
2192 
2193 /**
2194  * amdgpu_switcheroo_set_state - set switcheroo state
2195  *
2196  * @pdev: pci dev pointer
2197  * @state: vga_switcheroo state
2198  *
2199  * Callback for the switcheroo driver.  Suspends or resumes
2200  * the asics before or after it is powered up using ACPI methods.
2201  */
amdgpu_switcheroo_set_state(struct pci_dev * pdev,enum vga_switcheroo_state state)2202 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
2203 					enum vga_switcheroo_state state)
2204 {
2205 	struct drm_device *dev = pci_get_drvdata(pdev);
2206 	int r;
2207 
2208 	if (amdgpu_device_supports_px(drm_to_adev(dev)) &&
2209 	    state == VGA_SWITCHEROO_OFF)
2210 		return;
2211 
2212 	if (state == VGA_SWITCHEROO_ON) {
2213 		pr_info("switched on\n");
2214 		/* don't suspend or resume card normally */
2215 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2216 
2217 		pci_set_power_state(pdev, PCI_D0);
2218 		amdgpu_device_load_pci_state(pdev);
2219 		r = pci_enable_device(pdev);
2220 		if (r)
2221 			dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n",
2222 				 r);
2223 		amdgpu_device_resume(dev, true);
2224 
2225 		dev->switch_power_state = DRM_SWITCH_POWER_ON;
2226 	} else {
2227 		dev_info(&pdev->dev, "switched off\n");
2228 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2229 		amdgpu_device_prepare(dev);
2230 		amdgpu_device_suspend(dev, true);
2231 		amdgpu_device_cache_pci_state(pdev);
2232 		/* Shut down the device */
2233 		pci_disable_device(pdev);
2234 		pci_set_power_state(pdev, PCI_D3cold);
2235 		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
2236 	}
2237 }
2238 
2239 /**
2240  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
2241  *
2242  * @pdev: pci dev pointer
2243  *
2244  * Callback for the switcheroo driver.  Check of the switcheroo
2245  * state can be changed.
2246  * Returns true if the state can be changed, false if not.
2247  */
amdgpu_switcheroo_can_switch(struct pci_dev * pdev)2248 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
2249 {
2250 	struct drm_device *dev = pci_get_drvdata(pdev);
2251 
2252        /*
2253 	* FIXME: open_count is protected by drm_global_mutex but that would lead to
2254 	* locking inversion with the driver load path. And the access here is
2255 	* completely racy anyway. So don't bother with locking for now.
2256 	*/
2257 	return atomic_read(&dev->open_count) == 0;
2258 }
2259 
2260 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
2261 	.set_gpu_state = amdgpu_switcheroo_set_state,
2262 	.reprobe = NULL,
2263 	.can_switch = amdgpu_switcheroo_can_switch,
2264 };
2265 
2266 /**
2267  * amdgpu_device_ip_set_clockgating_state - set the CG state
2268  *
2269  * @dev: amdgpu_device pointer
2270  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2271  * @state: clockgating state (gate or ungate)
2272  *
2273  * Sets the requested clockgating state for all instances of
2274  * the hardware IP specified.
2275  * Returns the error code from the last instance.
2276  */
amdgpu_device_ip_set_clockgating_state(void * dev,enum amd_ip_block_type block_type,enum amd_clockgating_state state)2277 int amdgpu_device_ip_set_clockgating_state(void *dev,
2278 					   enum amd_ip_block_type block_type,
2279 					   enum amd_clockgating_state state)
2280 {
2281 	struct amdgpu_device *adev = dev;
2282 	int i, r = 0;
2283 
2284 	for (i = 0; i < adev->num_ip_blocks; i++) {
2285 		if (!adev->ip_blocks[i].status.valid)
2286 			continue;
2287 		if (adev->ip_blocks[i].version->type != block_type)
2288 			continue;
2289 		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
2290 			continue;
2291 		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
2292 			&adev->ip_blocks[i], state);
2293 		if (r)
2294 			dev_err(adev->dev,
2295 				"set_clockgating_state of IP block <%s> failed %d\n",
2296 				adev->ip_blocks[i].version->funcs->name, r);
2297 	}
2298 	return r;
2299 }
2300 
2301 /**
2302  * amdgpu_device_ip_set_powergating_state - set the PG state
2303  *
2304  * @dev: amdgpu_device pointer
2305  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2306  * @state: powergating state (gate or ungate)
2307  *
2308  * Sets the requested powergating state for all instances of
2309  * the hardware IP specified.
2310  * Returns the error code from the last instance.
2311  */
amdgpu_device_ip_set_powergating_state(void * dev,enum amd_ip_block_type block_type,enum amd_powergating_state state)2312 int amdgpu_device_ip_set_powergating_state(void *dev,
2313 					   enum amd_ip_block_type block_type,
2314 					   enum amd_powergating_state state)
2315 {
2316 	struct amdgpu_device *adev = dev;
2317 	int i, r = 0;
2318 
2319 	for (i = 0; i < adev->num_ip_blocks; i++) {
2320 		if (!adev->ip_blocks[i].status.valid)
2321 			continue;
2322 		if (adev->ip_blocks[i].version->type != block_type)
2323 			continue;
2324 		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
2325 			continue;
2326 		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
2327 			&adev->ip_blocks[i], state);
2328 		if (r)
2329 			dev_err(adev->dev,
2330 				"set_powergating_state of IP block <%s> failed %d\n",
2331 				adev->ip_blocks[i].version->funcs->name, r);
2332 	}
2333 	return r;
2334 }
2335 
2336 /**
2337  * amdgpu_device_ip_get_clockgating_state - get the CG state
2338  *
2339  * @adev: amdgpu_device pointer
2340  * @flags: clockgating feature flags
2341  *
2342  * Walks the list of IPs on the device and updates the clockgating
2343  * flags for each IP.
2344  * Updates @flags with the feature flags for each hardware IP where
2345  * clockgating is enabled.
2346  */
amdgpu_device_ip_get_clockgating_state(struct amdgpu_device * adev,u64 * flags)2347 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
2348 					    u64 *flags)
2349 {
2350 	int i;
2351 
2352 	for (i = 0; i < adev->num_ip_blocks; i++) {
2353 		if (!adev->ip_blocks[i].status.valid)
2354 			continue;
2355 		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
2356 			adev->ip_blocks[i].version->funcs->get_clockgating_state(
2357 				&adev->ip_blocks[i], flags);
2358 	}
2359 }
2360 
2361 /**
2362  * amdgpu_device_ip_wait_for_idle - wait for idle
2363  *
2364  * @adev: amdgpu_device pointer
2365  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2366  *
2367  * Waits for the request hardware IP to be idle.
2368  * Returns 0 for success or a negative error code on failure.
2369  */
amdgpu_device_ip_wait_for_idle(struct amdgpu_device * adev,enum amd_ip_block_type block_type)2370 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
2371 				   enum amd_ip_block_type block_type)
2372 {
2373 	int i, r;
2374 
2375 	for (i = 0; i < adev->num_ip_blocks; i++) {
2376 		if (!adev->ip_blocks[i].status.valid)
2377 			continue;
2378 		if (adev->ip_blocks[i].version->type == block_type) {
2379 			if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
2380 				r = adev->ip_blocks[i].version->funcs->wait_for_idle(
2381 								&adev->ip_blocks[i]);
2382 				if (r)
2383 					return r;
2384 			}
2385 			break;
2386 		}
2387 	}
2388 	return 0;
2389 
2390 }
2391 
2392 /**
2393  * amdgpu_device_ip_is_hw - is the hardware IP enabled
2394  *
2395  * @adev: amdgpu_device pointer
2396  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2397  *
2398  * Check if the hardware IP is enable or not.
2399  * Returns true if it the IP is enable, false if not.
2400  */
amdgpu_device_ip_is_hw(struct amdgpu_device * adev,enum amd_ip_block_type block_type)2401 bool amdgpu_device_ip_is_hw(struct amdgpu_device *adev,
2402 			    enum amd_ip_block_type block_type)
2403 {
2404 	int i;
2405 
2406 	for (i = 0; i < adev->num_ip_blocks; i++) {
2407 		if (adev->ip_blocks[i].version->type == block_type)
2408 			return adev->ip_blocks[i].status.hw;
2409 	}
2410 	return false;
2411 }
2412 
2413 /**
2414  * amdgpu_device_ip_is_valid - is the hardware IP valid
2415  *
2416  * @adev: amdgpu_device pointer
2417  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2418  *
2419  * Check if the hardware IP is valid or not.
2420  * Returns true if it the IP is valid, false if not.
2421  */
amdgpu_device_ip_is_valid(struct amdgpu_device * adev,enum amd_ip_block_type block_type)2422 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
2423 			       enum amd_ip_block_type block_type)
2424 {
2425 	int i;
2426 
2427 	for (i = 0; i < adev->num_ip_blocks; i++) {
2428 		if (adev->ip_blocks[i].version->type == block_type)
2429 			return adev->ip_blocks[i].status.valid;
2430 	}
2431 	return false;
2432 
2433 }
2434 
2435 /**
2436  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
2437  *
2438  * @adev: amdgpu_device pointer
2439  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
2440  *
2441  * Returns a pointer to the hardware IP block structure
2442  * if it exists for the asic, otherwise NULL.
2443  */
2444 struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device * adev,enum amd_ip_block_type type)2445 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
2446 			      enum amd_ip_block_type type)
2447 {
2448 	int i;
2449 
2450 	for (i = 0; i < adev->num_ip_blocks; i++)
2451 		if (adev->ip_blocks[i].version->type == type)
2452 			return &adev->ip_blocks[i];
2453 
2454 	return NULL;
2455 }
2456 
2457 /**
2458  * amdgpu_device_ip_block_version_cmp
2459  *
2460  * @adev: amdgpu_device pointer
2461  * @type: enum amd_ip_block_type
2462  * @major: major version
2463  * @minor: minor version
2464  *
2465  * return 0 if equal or greater
2466  * return 1 if smaller or the ip_block doesn't exist
2467  */
amdgpu_device_ip_block_version_cmp(struct amdgpu_device * adev,enum amd_ip_block_type type,u32 major,u32 minor)2468 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
2469 				       enum amd_ip_block_type type,
2470 				       u32 major, u32 minor)
2471 {
2472 	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
2473 
2474 	if (ip_block && ((ip_block->version->major > major) ||
2475 			((ip_block->version->major == major) &&
2476 			(ip_block->version->minor >= minor))))
2477 		return 0;
2478 
2479 	return 1;
2480 }
2481 
2482 static const char *ip_block_names[] = {
2483 	[AMD_IP_BLOCK_TYPE_COMMON] = "common",
2484 	[AMD_IP_BLOCK_TYPE_GMC] = "gmc",
2485 	[AMD_IP_BLOCK_TYPE_IH] = "ih",
2486 	[AMD_IP_BLOCK_TYPE_SMC] = "smu",
2487 	[AMD_IP_BLOCK_TYPE_PSP] = "psp",
2488 	[AMD_IP_BLOCK_TYPE_DCE] = "dce",
2489 	[AMD_IP_BLOCK_TYPE_GFX] = "gfx",
2490 	[AMD_IP_BLOCK_TYPE_SDMA] = "sdma",
2491 	[AMD_IP_BLOCK_TYPE_UVD] = "uvd",
2492 	[AMD_IP_BLOCK_TYPE_VCE] = "vce",
2493 	[AMD_IP_BLOCK_TYPE_ACP] = "acp",
2494 	[AMD_IP_BLOCK_TYPE_VCN] = "vcn",
2495 	[AMD_IP_BLOCK_TYPE_MES] = "mes",
2496 	[AMD_IP_BLOCK_TYPE_JPEG] = "jpeg",
2497 	[AMD_IP_BLOCK_TYPE_VPE] = "vpe",
2498 	[AMD_IP_BLOCK_TYPE_UMSCH_MM] = "umsch_mm",
2499 	[AMD_IP_BLOCK_TYPE_ISP] = "isp",
2500 	[AMD_IP_BLOCK_TYPE_RAS] = "ras",
2501 };
2502 
ip_block_name(struct amdgpu_device * adev,enum amd_ip_block_type type)2503 static const char *ip_block_name(struct amdgpu_device *adev, enum amd_ip_block_type type)
2504 {
2505 	int idx = (int)type;
2506 
2507 	return idx < ARRAY_SIZE(ip_block_names) ? ip_block_names[idx] : "unknown";
2508 }
2509 
2510 /**
2511  * amdgpu_device_ip_block_add
2512  *
2513  * @adev: amdgpu_device pointer
2514  * @ip_block_version: pointer to the IP to add
2515  *
2516  * Adds the IP block driver information to the collection of IPs
2517  * on the asic.
2518  */
amdgpu_device_ip_block_add(struct amdgpu_device * adev,const struct amdgpu_ip_block_version * ip_block_version)2519 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
2520 			       const struct amdgpu_ip_block_version *ip_block_version)
2521 {
2522 	if (!ip_block_version)
2523 		return -EINVAL;
2524 
2525 	switch (ip_block_version->type) {
2526 	case AMD_IP_BLOCK_TYPE_VCN:
2527 		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
2528 			return 0;
2529 		break;
2530 	case AMD_IP_BLOCK_TYPE_JPEG:
2531 		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
2532 			return 0;
2533 		break;
2534 	default:
2535 		break;
2536 	}
2537 
2538 	dev_info(adev->dev, "detected ip block number %d <%s_v%d_%d_%d> (%s)\n",
2539 		 adev->num_ip_blocks,
2540 		 ip_block_name(adev, ip_block_version->type),
2541 		 ip_block_version->major,
2542 		 ip_block_version->minor,
2543 		 ip_block_version->rev,
2544 		 ip_block_version->funcs->name);
2545 
2546 	adev->ip_blocks[adev->num_ip_blocks].adev = adev;
2547 
2548 	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
2549 
2550 	return 0;
2551 }
2552 
2553 /**
2554  * amdgpu_device_enable_virtual_display - enable virtual display feature
2555  *
2556  * @adev: amdgpu_device pointer
2557  *
2558  * Enabled the virtual display feature if the user has enabled it via
2559  * the module parameter virtual_display.  This feature provides a virtual
2560  * display hardware on headless boards or in virtualized environments.
2561  * This function parses and validates the configuration string specified by
2562  * the user and configures the virtual display configuration (number of
2563  * virtual connectors, crtcs, etc.) specified.
2564  */
amdgpu_device_enable_virtual_display(struct amdgpu_device * adev)2565 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
2566 {
2567 	adev->enable_virtual_display = false;
2568 
2569 	if (amdgpu_virtual_display) {
2570 		const char *pci_address_name = pci_name(adev->pdev);
2571 		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
2572 
2573 		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
2574 		pciaddstr_tmp = pciaddstr;
2575 		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
2576 			pciaddname = strsep(&pciaddname_tmp, ",");
2577 			if (!strcmp("all", pciaddname)
2578 			    || !strcmp(pci_address_name, pciaddname)) {
2579 				long num_crtc;
2580 				int res = -1;
2581 
2582 				adev->enable_virtual_display = true;
2583 
2584 				if (pciaddname_tmp)
2585 					res = kstrtol(pciaddname_tmp, 10,
2586 						      &num_crtc);
2587 
2588 				if (!res) {
2589 					if (num_crtc < 1)
2590 						num_crtc = 1;
2591 					if (num_crtc > 6)
2592 						num_crtc = 6;
2593 					adev->mode_info.num_crtc = num_crtc;
2594 				} else {
2595 					adev->mode_info.num_crtc = 1;
2596 				}
2597 				break;
2598 			}
2599 		}
2600 
2601 		dev_info(
2602 			adev->dev,
2603 			"virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
2604 			amdgpu_virtual_display, pci_address_name,
2605 			adev->enable_virtual_display, adev->mode_info.num_crtc);
2606 
2607 		kfree(pciaddstr);
2608 	}
2609 }
2610 
amdgpu_device_set_sriov_virtual_display(struct amdgpu_device * adev)2611 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
2612 {
2613 	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
2614 		adev->mode_info.num_crtc = 1;
2615 		adev->enable_virtual_display = true;
2616 		dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n",
2617 			 adev->enable_virtual_display,
2618 			 adev->mode_info.num_crtc);
2619 	}
2620 }
2621 
2622 /**
2623  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
2624  *
2625  * @adev: amdgpu_device pointer
2626  *
2627  * Parses the asic configuration parameters specified in the gpu info
2628  * firmware and makes them available to the driver for use in configuring
2629  * the asic.
2630  * Returns 0 on success, -EINVAL on failure.
2631  */
amdgpu_device_parse_gpu_info_fw(struct amdgpu_device * adev)2632 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
2633 {
2634 	const char *chip_name;
2635 	int err;
2636 	const struct gpu_info_firmware_header_v1_0 *hdr;
2637 
2638 	adev->firmware.gpu_info_fw = NULL;
2639 
2640 	switch (adev->asic_type) {
2641 	default:
2642 		return 0;
2643 	case CHIP_VEGA10:
2644 		chip_name = "vega10";
2645 		break;
2646 	case CHIP_VEGA12:
2647 		chip_name = "vega12";
2648 		break;
2649 	case CHIP_RAVEN:
2650 		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
2651 			chip_name = "raven2";
2652 		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
2653 			chip_name = "picasso";
2654 		else
2655 			chip_name = "raven";
2656 		break;
2657 	case CHIP_ARCTURUS:
2658 		chip_name = "arcturus";
2659 		break;
2660 	case CHIP_NAVI12:
2661 		if (adev->discovery.bin)
2662 			return 0;
2663 		chip_name = "navi12";
2664 		break;
2665 	case CHIP_CYAN_SKILLFISH:
2666 		if (adev->discovery.bin)
2667 			return 0;
2668 		chip_name = "cyan_skillfish";
2669 		break;
2670 	}
2671 
2672 	err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
2673 				   AMDGPU_UCODE_OPTIONAL,
2674 				   "amdgpu/%s_gpu_info.bin", chip_name);
2675 	if (err) {
2676 		dev_err(adev->dev,
2677 			"Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n",
2678 			chip_name);
2679 		goto out;
2680 	}
2681 
2682 	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
2683 	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2684 
2685 	switch (hdr->version_major) {
2686 	case 1:
2687 	{
2688 		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2689 			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2690 								le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2691 
2692 		/*
2693 		 * Should be dropped when DAL no longer needs it.
2694 		 */
2695 		if (adev->asic_type == CHIP_NAVI12)
2696 			goto parse_soc_bounding_box;
2697 
2698 		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2699 		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2700 		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2701 		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2702 		adev->gfx.config.max_texture_channel_caches =
2703 			le32_to_cpu(gpu_info_fw->gc_num_tccs);
2704 		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2705 		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2706 		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2707 		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2708 		adev->gfx.config.double_offchip_lds_buf =
2709 			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2710 		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2711 		adev->gfx.cu_info.max_waves_per_simd =
2712 			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2713 		adev->gfx.cu_info.max_scratch_slots_per_cu =
2714 			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2715 		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2716 		if (hdr->version_minor >= 1) {
2717 			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2718 				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2719 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2720 			adev->gfx.config.num_sc_per_sh =
2721 				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2722 			adev->gfx.config.num_packer_per_sc =
2723 				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2724 		}
2725 
2726 parse_soc_bounding_box:
2727 		/*
2728 		 * soc bounding box info is not integrated in disocovery table,
2729 		 * we always need to parse it from gpu info firmware if needed.
2730 		 */
2731 		if (hdr->version_minor == 2) {
2732 			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2733 				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2734 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2735 			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2736 		}
2737 		break;
2738 	}
2739 	default:
2740 		dev_err(adev->dev,
2741 			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2742 		err = -EINVAL;
2743 		goto out;
2744 	}
2745 out:
2746 	return err;
2747 }
2748 
amdgpu_uid_init(struct amdgpu_device * adev)2749 static void amdgpu_uid_init(struct amdgpu_device *adev)
2750 {
2751 	/* Initialize the UID for the device */
2752 	adev->uid_info = kzalloc(sizeof(struct amdgpu_uid), GFP_KERNEL);
2753 	if (!adev->uid_info) {
2754 		dev_warn(adev->dev, "Failed to allocate memory for UID\n");
2755 		return;
2756 	}
2757 	adev->uid_info->adev = adev;
2758 }
2759 
amdgpu_uid_fini(struct amdgpu_device * adev)2760 static void amdgpu_uid_fini(struct amdgpu_device *adev)
2761 {
2762 	/* Free the UID memory */
2763 	kfree(adev->uid_info);
2764 	adev->uid_info = NULL;
2765 }
2766 
2767 /**
2768  * amdgpu_device_ip_early_init - run early init for hardware IPs
2769  *
2770  * @adev: amdgpu_device pointer
2771  *
2772  * Early initialization pass for hardware IPs.  The hardware IPs that make
2773  * up each asic are discovered each IP's early_init callback is run.  This
2774  * is the first stage in initializing the asic.
2775  * Returns 0 on success, negative error code on failure.
2776  */
amdgpu_device_ip_early_init(struct amdgpu_device * adev)2777 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2778 {
2779 	struct amdgpu_ip_block *ip_block;
2780 	struct pci_dev *parent;
2781 	bool total, skip_bios;
2782 	uint32_t bios_flags;
2783 	int i, r;
2784 
2785 	amdgpu_device_enable_virtual_display(adev);
2786 
2787 	if (amdgpu_sriov_vf(adev)) {
2788 		r = amdgpu_virt_request_full_gpu(adev, true);
2789 		if (r)
2790 			return r;
2791 
2792 		r = amdgpu_virt_init_critical_region(adev);
2793 		if (r)
2794 			return r;
2795 	}
2796 
2797 	switch (adev->asic_type) {
2798 #ifdef CONFIG_DRM_AMDGPU_SI
2799 	case CHIP_VERDE:
2800 	case CHIP_TAHITI:
2801 	case CHIP_PITCAIRN:
2802 	case CHIP_OLAND:
2803 	case CHIP_HAINAN:
2804 		adev->family = AMDGPU_FAMILY_SI;
2805 		r = si_set_ip_blocks(adev);
2806 		if (r)
2807 			return r;
2808 		break;
2809 #endif
2810 #ifdef CONFIG_DRM_AMDGPU_CIK
2811 	case CHIP_BONAIRE:
2812 	case CHIP_HAWAII:
2813 	case CHIP_KAVERI:
2814 	case CHIP_KABINI:
2815 	case CHIP_MULLINS:
2816 		if (adev->flags & AMD_IS_APU)
2817 			adev->family = AMDGPU_FAMILY_KV;
2818 		else
2819 			adev->family = AMDGPU_FAMILY_CI;
2820 
2821 		r = cik_set_ip_blocks(adev);
2822 		if (r)
2823 			return r;
2824 		break;
2825 #endif
2826 	case CHIP_TOPAZ:
2827 	case CHIP_TONGA:
2828 	case CHIP_FIJI:
2829 	case CHIP_POLARIS10:
2830 	case CHIP_POLARIS11:
2831 	case CHIP_POLARIS12:
2832 	case CHIP_VEGAM:
2833 	case CHIP_CARRIZO:
2834 	case CHIP_STONEY:
2835 		if (adev->flags & AMD_IS_APU)
2836 			adev->family = AMDGPU_FAMILY_CZ;
2837 		else
2838 			adev->family = AMDGPU_FAMILY_VI;
2839 
2840 		r = vi_set_ip_blocks(adev);
2841 		if (r)
2842 			return r;
2843 		break;
2844 	default:
2845 		r = amdgpu_discovery_set_ip_blocks(adev);
2846 		if (r)
2847 			return r;
2848 		break;
2849 	}
2850 
2851 	/* Check for IP version 9.4.3 with A0 hardware */
2852 	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) &&
2853 	    !amdgpu_device_get_rev_id(adev)) {
2854 		dev_err(adev->dev, "Unsupported A0 hardware\n");
2855 		return -ENODEV;	/* device unsupported - no device error */
2856 	}
2857 
2858 	if (amdgpu_has_atpx() &&
2859 	    (amdgpu_is_atpx_hybrid() ||
2860 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
2861 	    ((adev->flags & AMD_IS_APU) == 0) &&
2862 	    !dev_is_removable(&adev->pdev->dev))
2863 		adev->flags |= AMD_IS_PX;
2864 
2865 	if (!(adev->flags & AMD_IS_APU)) {
2866 		parent = pcie_find_root_port(adev->pdev);
2867 		adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2868 	}
2869 
2870 	adev->pm.pp_feature = amdgpu_pp_feature_mask;
2871 	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2872 		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2873 	if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2874 		adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2875 	if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
2876 		adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2877 
2878 	adev->virt.is_xgmi_node_migrate_enabled = false;
2879 	if (amdgpu_sriov_vf(adev)) {
2880 		adev->virt.is_xgmi_node_migrate_enabled =
2881 			amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4);
2882 	}
2883 
2884 	total = true;
2885 	for (i = 0; i < adev->num_ip_blocks; i++) {
2886 		ip_block = &adev->ip_blocks[i];
2887 
2888 		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2889 			dev_warn(adev->dev, "disabled ip block: %d <%s>\n", i,
2890 				 adev->ip_blocks[i].version->funcs->name);
2891 			adev->ip_blocks[i].status.valid = false;
2892 		} else if (ip_block->version->funcs->early_init) {
2893 			r = ip_block->version->funcs->early_init(ip_block);
2894 			if (r == -ENOENT) {
2895 				adev->ip_blocks[i].status.valid = false;
2896 			} else if (r) {
2897 				dev_err(adev->dev,
2898 					"early_init of IP block <%s> failed %d\n",
2899 					adev->ip_blocks[i].version->funcs->name,
2900 					r);
2901 				total = false;
2902 			} else {
2903 				adev->ip_blocks[i].status.valid = true;
2904 			}
2905 		} else {
2906 			adev->ip_blocks[i].status.valid = true;
2907 		}
2908 		/* get the vbios after the asic_funcs are set up */
2909 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2910 			r = amdgpu_device_parse_gpu_info_fw(adev);
2911 			if (r)
2912 				return r;
2913 
2914 			bios_flags = amdgpu_device_get_vbios_flags(adev);
2915 			skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP);
2916 			/* Read BIOS */
2917 			if (!skip_bios) {
2918 				bool optional =
2919 					!!(bios_flags & AMDGPU_VBIOS_OPTIONAL);
2920 				if (!amdgpu_get_bios(adev) && !optional)
2921 					return -EINVAL;
2922 
2923 				if (optional && !adev->bios)
2924 					dev_info(
2925 						adev->dev,
2926 						"VBIOS image optional, proceeding without VBIOS image");
2927 
2928 				if (adev->bios) {
2929 					r = amdgpu_atombios_init(adev);
2930 					if (r) {
2931 						dev_err(adev->dev,
2932 							"amdgpu_atombios_init failed\n");
2933 						amdgpu_vf_error_put(
2934 							adev,
2935 							AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL,
2936 							0, 0);
2937 						return r;
2938 					}
2939 				}
2940 			}
2941 
2942 			/*get pf2vf msg info at it's earliest time*/
2943 			if (amdgpu_sriov_vf(adev))
2944 				amdgpu_virt_init_data_exchange(adev);
2945 
2946 		}
2947 	}
2948 	if (!total)
2949 		return -ENODEV;
2950 
2951 	if (adev->gmc.xgmi.supported)
2952 		amdgpu_xgmi_early_init(adev);
2953 
2954 	if (amdgpu_is_multi_aid(adev))
2955 		amdgpu_uid_init(adev);
2956 	ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
2957 	if (ip_block->status.valid != false)
2958 		amdgpu_amdkfd_device_probe(adev);
2959 
2960 	adev->cg_flags &= amdgpu_cg_mask;
2961 	adev->pg_flags &= amdgpu_pg_mask;
2962 
2963 	return 0;
2964 }
2965 
amdgpu_device_ip_hw_init_phase1(struct amdgpu_device * adev)2966 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2967 {
2968 	int i, r;
2969 
2970 	for (i = 0; i < adev->num_ip_blocks; i++) {
2971 		if (!adev->ip_blocks[i].status.sw)
2972 			continue;
2973 		if (adev->ip_blocks[i].status.hw)
2974 			continue;
2975 		if (!amdgpu_ip_member_of_hwini(
2976 			    adev, adev->ip_blocks[i].version->type))
2977 			continue;
2978 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2979 		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2980 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2981 			r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2982 			if (r) {
2983 				dev_err(adev->dev,
2984 					"hw_init of IP block <%s> failed %d\n",
2985 					adev->ip_blocks[i].version->funcs->name,
2986 					r);
2987 				return r;
2988 			}
2989 			adev->ip_blocks[i].status.hw = true;
2990 		}
2991 	}
2992 
2993 	return 0;
2994 }
2995 
amdgpu_device_ip_hw_init_phase2(struct amdgpu_device * adev)2996 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2997 {
2998 	int i, r;
2999 
3000 	for (i = 0; i < adev->num_ip_blocks; i++) {
3001 		if (!adev->ip_blocks[i].status.sw)
3002 			continue;
3003 		if (adev->ip_blocks[i].status.hw)
3004 			continue;
3005 		if (!amdgpu_ip_member_of_hwini(
3006 			    adev, adev->ip_blocks[i].version->type))
3007 			continue;
3008 		r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
3009 		if (r) {
3010 			dev_err(adev->dev,
3011 				"hw_init of IP block <%s> failed %d\n",
3012 				adev->ip_blocks[i].version->funcs->name, r);
3013 			return r;
3014 		}
3015 		adev->ip_blocks[i].status.hw = true;
3016 	}
3017 
3018 	return 0;
3019 }
3020 
amdgpu_device_fw_loading(struct amdgpu_device * adev)3021 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
3022 {
3023 	int r = 0;
3024 	int i;
3025 	uint32_t smu_version;
3026 
3027 	if (adev->asic_type >= CHIP_VEGA10) {
3028 		for (i = 0; i < adev->num_ip_blocks; i++) {
3029 			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
3030 				continue;
3031 
3032 			if (!amdgpu_ip_member_of_hwini(adev,
3033 						       AMD_IP_BLOCK_TYPE_PSP))
3034 				break;
3035 
3036 			if (!adev->ip_blocks[i].status.sw)
3037 				continue;
3038 
3039 			/* no need to do the fw loading again if already done*/
3040 			if (adev->ip_blocks[i].status.hw == true)
3041 				break;
3042 
3043 			if (amdgpu_in_reset(adev) || adev->in_suspend) {
3044 				r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3045 				if (r)
3046 					return r;
3047 			} else {
3048 				r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
3049 				if (r) {
3050 					dev_err(adev->dev,
3051 						"hw_init of IP block <%s> failed %d\n",
3052 						adev->ip_blocks[i]
3053 							.version->funcs->name,
3054 						r);
3055 					return r;
3056 				}
3057 				adev->ip_blocks[i].status.hw = true;
3058 			}
3059 			break;
3060 		}
3061 	}
3062 
3063 	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
3064 		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
3065 
3066 	return r;
3067 }
3068 
amdgpu_device_init_schedulers(struct amdgpu_device * adev)3069 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
3070 {
3071 	struct drm_sched_init_args args = {
3072 		.ops = &amdgpu_sched_ops,
3073 		.num_rqs = DRM_SCHED_PRIORITY_COUNT,
3074 		.timeout_wq = adev->reset_domain->wq,
3075 		.dev = adev->dev,
3076 	};
3077 	long timeout;
3078 	int r, i;
3079 
3080 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
3081 		struct amdgpu_ring *ring = adev->rings[i];
3082 
3083 		/* No need to setup the GPU scheduler for rings that don't need it */
3084 		if (!ring || ring->no_scheduler)
3085 			continue;
3086 
3087 		switch (ring->funcs->type) {
3088 		case AMDGPU_RING_TYPE_GFX:
3089 			timeout = adev->gfx_timeout;
3090 			break;
3091 		case AMDGPU_RING_TYPE_COMPUTE:
3092 			timeout = adev->compute_timeout;
3093 			break;
3094 		case AMDGPU_RING_TYPE_SDMA:
3095 			timeout = adev->sdma_timeout;
3096 			break;
3097 		default:
3098 			timeout = adev->video_timeout;
3099 			break;
3100 		}
3101 
3102 		args.timeout = timeout;
3103 		args.credit_limit = ring->num_hw_submission;
3104 		args.score = ring->sched_score;
3105 		args.name = ring->name;
3106 
3107 		r = drm_sched_init(&ring->sched, &args);
3108 		if (r) {
3109 			dev_err(adev->dev,
3110 				"Failed to create scheduler on ring %s.\n",
3111 				ring->name);
3112 			return r;
3113 		}
3114 		r = amdgpu_uvd_entity_init(adev, ring);
3115 		if (r) {
3116 			dev_err(adev->dev,
3117 				"Failed to create UVD scheduling entity on ring %s.\n",
3118 				ring->name);
3119 			return r;
3120 		}
3121 		r = amdgpu_vce_entity_init(adev, ring);
3122 		if (r) {
3123 			dev_err(adev->dev,
3124 				"Failed to create VCE scheduling entity on ring %s.\n",
3125 				ring->name);
3126 			return r;
3127 		}
3128 	}
3129 
3130 	if (adev->xcp_mgr)
3131 		amdgpu_xcp_update_partition_sched_list(adev);
3132 
3133 	return 0;
3134 }
3135 
3136 
3137 /**
3138  * amdgpu_device_ip_init - run init for hardware IPs
3139  *
3140  * @adev: amdgpu_device pointer
3141  *
3142  * Main initialization pass for hardware IPs.  The list of all the hardware
3143  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
3144  * are run.  sw_init initializes the software state associated with each IP
3145  * and hw_init initializes the hardware associated with each IP.
3146  * Returns 0 on success, negative error code on failure.
3147  */
amdgpu_device_ip_init(struct amdgpu_device * adev)3148 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
3149 {
3150 	bool init_badpage;
3151 	int i, r;
3152 
3153 	r = amdgpu_ras_init(adev);
3154 	if (r)
3155 		return r;
3156 
3157 	for (i = 0; i < adev->num_ip_blocks; i++) {
3158 		if (!adev->ip_blocks[i].status.valid)
3159 			continue;
3160 		if (adev->ip_blocks[i].version->funcs->sw_init) {
3161 			r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]);
3162 			if (r) {
3163 				dev_err(adev->dev,
3164 					"sw_init of IP block <%s> failed %d\n",
3165 					adev->ip_blocks[i].version->funcs->name,
3166 					r);
3167 				goto init_failed;
3168 			}
3169 		}
3170 		adev->ip_blocks[i].status.sw = true;
3171 
3172 		if (!amdgpu_ip_member_of_hwini(
3173 			    adev, adev->ip_blocks[i].version->type))
3174 			continue;
3175 
3176 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
3177 			/* need to do common hw init early so everything is set up for gmc */
3178 			r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
3179 			if (r) {
3180 				dev_err(adev->dev, "hw_init %d failed %d\n", i,
3181 					r);
3182 				goto init_failed;
3183 			}
3184 			adev->ip_blocks[i].status.hw = true;
3185 		} else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
3186 			/* need to do gmc hw init early so we can allocate gpu mem */
3187 			/* Try to reserve bad pages early */
3188 			if (amdgpu_sriov_vf(adev))
3189 				amdgpu_virt_exchange_data(adev);
3190 
3191 			r = amdgpu_device_mem_scratch_init(adev);
3192 			if (r) {
3193 				dev_err(adev->dev,
3194 					"amdgpu_mem_scratch_init failed %d\n",
3195 					r);
3196 				goto init_failed;
3197 			}
3198 			r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
3199 			if (r) {
3200 				dev_err(adev->dev, "hw_init %d failed %d\n", i,
3201 					r);
3202 				goto init_failed;
3203 			}
3204 			r = amdgpu_device_wb_init(adev);
3205 			if (r) {
3206 				dev_err(adev->dev,
3207 					"amdgpu_device_wb_init failed %d\n", r);
3208 				goto init_failed;
3209 			}
3210 			adev->ip_blocks[i].status.hw = true;
3211 
3212 			/* right after GMC hw init, we create CSA */
3213 			if (adev->gfx.mcbp) {
3214 				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
3215 							       AMDGPU_GEM_DOMAIN_VRAM |
3216 							       AMDGPU_GEM_DOMAIN_GTT,
3217 							       AMDGPU_CSA_SIZE);
3218 				if (r) {
3219 					dev_err(adev->dev,
3220 						"allocate CSA failed %d\n", r);
3221 					goto init_failed;
3222 				}
3223 			}
3224 
3225 			r = amdgpu_seq64_init(adev);
3226 			if (r) {
3227 				dev_err(adev->dev, "allocate seq64 failed %d\n",
3228 					r);
3229 				goto init_failed;
3230 			}
3231 		}
3232 	}
3233 
3234 	if (amdgpu_sriov_vf(adev))
3235 		amdgpu_virt_init_data_exchange(adev);
3236 
3237 	r = amdgpu_ib_pool_init(adev);
3238 	if (r) {
3239 		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
3240 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
3241 		goto init_failed;
3242 	}
3243 
3244 	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
3245 	if (r)
3246 		goto init_failed;
3247 
3248 	r = amdgpu_device_ip_hw_init_phase1(adev);
3249 	if (r)
3250 		goto init_failed;
3251 
3252 	r = amdgpu_device_fw_loading(adev);
3253 	if (r)
3254 		goto init_failed;
3255 
3256 	r = amdgpu_device_ip_hw_init_phase2(adev);
3257 	if (r)
3258 		goto init_failed;
3259 
3260 	/*
3261 	 * retired pages will be loaded from eeprom and reserved here,
3262 	 * it should be called after amdgpu_device_ip_hw_init_phase2  since
3263 	 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
3264 	 * for I2C communication which only true at this point.
3265 	 *
3266 	 * amdgpu_ras_recovery_init may fail, but the upper only cares the
3267 	 * failure from bad gpu situation and stop amdgpu init process
3268 	 * accordingly. For other failed cases, it will still release all
3269 	 * the resource and print error message, rather than returning one
3270 	 * negative value to upper level.
3271 	 *
3272 	 * Note: theoretically, this should be called before all vram allocations
3273 	 * to protect retired page from abusing
3274 	 */
3275 	init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
3276 	r = amdgpu_ras_recovery_init(adev, init_badpage);
3277 	if (r)
3278 		goto init_failed;
3279 
3280 	/**
3281 	 * In case of XGMI grab extra reference for reset domain for this device
3282 	 */
3283 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
3284 		if (amdgpu_xgmi_add_device(adev) == 0) {
3285 			if (!amdgpu_sriov_vf(adev)) {
3286 				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3287 
3288 				if (WARN_ON(!hive)) {
3289 					r = -ENOENT;
3290 					goto init_failed;
3291 				}
3292 
3293 				if (!hive->reset_domain ||
3294 				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
3295 					r = -ENOENT;
3296 					amdgpu_put_xgmi_hive(hive);
3297 					goto init_failed;
3298 				}
3299 
3300 				/* Drop the early temporary reset domain we created for device */
3301 				amdgpu_reset_put_reset_domain(adev->reset_domain);
3302 				adev->reset_domain = hive->reset_domain;
3303 				amdgpu_put_xgmi_hive(hive);
3304 			}
3305 		}
3306 	}
3307 
3308 	r = amdgpu_device_init_schedulers(adev);
3309 	if (r)
3310 		goto init_failed;
3311 
3312 	if (adev->mman.buffer_funcs_ring->sched.ready)
3313 		amdgpu_ttm_set_buffer_funcs_status(adev, true);
3314 
3315 	/* Don't init kfd if whole hive need to be reset during init */
3316 	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
3317 		kgd2kfd_init_zone_device(adev);
3318 		amdgpu_amdkfd_device_init(adev);
3319 	}
3320 
3321 	amdgpu_fru_get_product_info(adev);
3322 
3323 	if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev))
3324 		r = amdgpu_cper_init(adev);
3325 
3326 init_failed:
3327 
3328 	return r;
3329 }
3330 
3331 /**
3332  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
3333  *
3334  * @adev: amdgpu_device pointer
3335  *
3336  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
3337  * this function before a GPU reset.  If the value is retained after a
3338  * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
3339  */
amdgpu_device_fill_reset_magic(struct amdgpu_device * adev)3340 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
3341 {
3342 	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
3343 }
3344 
3345 /**
3346  * amdgpu_device_check_vram_lost - check if vram is valid
3347  *
3348  * @adev: amdgpu_device pointer
3349  *
3350  * Checks the reset magic value written to the gart pointer in VRAM.
3351  * The driver calls this after a GPU reset to see if the contents of
3352  * VRAM is lost or now.
3353  * returns true if vram is lost, false if not.
3354  */
amdgpu_device_check_vram_lost(struct amdgpu_device * adev)3355 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
3356 {
3357 	if (memcmp(adev->gart.ptr, adev->reset_magic,
3358 			AMDGPU_RESET_MAGIC_NUM))
3359 		return true;
3360 
3361 	if (!amdgpu_in_reset(adev))
3362 		return false;
3363 
3364 	/*
3365 	 * For all ASICs with baco/mode1 reset, the VRAM is
3366 	 * always assumed to be lost.
3367 	 */
3368 	switch (amdgpu_asic_reset_method(adev)) {
3369 	case AMD_RESET_METHOD_LEGACY:
3370 	case AMD_RESET_METHOD_LINK:
3371 	case AMD_RESET_METHOD_BACO:
3372 	case AMD_RESET_METHOD_MODE1:
3373 		return true;
3374 	default:
3375 		return false;
3376 	}
3377 }
3378 
3379 /**
3380  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
3381  *
3382  * @adev: amdgpu_device pointer
3383  * @state: clockgating state (gate or ungate)
3384  *
3385  * The list of all the hardware IPs that make up the asic is walked and the
3386  * set_clockgating_state callbacks are run.
3387  * Late initialization pass enabling clockgating for hardware IPs.
3388  * Fini or suspend, pass disabling clockgating for hardware IPs.
3389  * Returns 0 on success, negative error code on failure.
3390  */
3391 
amdgpu_device_set_cg_state(struct amdgpu_device * adev,enum amd_clockgating_state state)3392 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
3393 			       enum amd_clockgating_state state)
3394 {
3395 	int i, j, r;
3396 
3397 	if (amdgpu_emu_mode == 1)
3398 		return 0;
3399 
3400 	for (j = 0; j < adev->num_ip_blocks; j++) {
3401 		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3402 		if (!adev->ip_blocks[i].status.late_initialized)
3403 			continue;
3404 		/* skip CG for GFX, SDMA on S0ix */
3405 		if (adev->in_s0ix &&
3406 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3407 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3408 			continue;
3409 		/* skip CG for VCE/UVD, it's handled specially */
3410 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3411 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3412 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3413 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3414 		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
3415 			/* enable clockgating to save power */
3416 			r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i],
3417 										     state);
3418 			if (r) {
3419 				dev_err(adev->dev,
3420 					"set_clockgating_state(gate) of IP block <%s> failed %d\n",
3421 					adev->ip_blocks[i].version->funcs->name,
3422 					r);
3423 				return r;
3424 			}
3425 		}
3426 	}
3427 
3428 	return 0;
3429 }
3430 
amdgpu_device_set_pg_state(struct amdgpu_device * adev,enum amd_powergating_state state)3431 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
3432 			       enum amd_powergating_state state)
3433 {
3434 	int i, j, r;
3435 
3436 	if (amdgpu_emu_mode == 1)
3437 		return 0;
3438 
3439 	for (j = 0; j < adev->num_ip_blocks; j++) {
3440 		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3441 		if (!adev->ip_blocks[i].status.late_initialized)
3442 			continue;
3443 		/* skip PG for GFX, SDMA on S0ix */
3444 		if (adev->in_s0ix &&
3445 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3446 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3447 			continue;
3448 		/* skip CG for VCE/UVD, it's handled specially */
3449 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3450 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3451 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3452 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3453 		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
3454 			/* enable powergating to save power */
3455 			r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i],
3456 											state);
3457 			if (r) {
3458 				dev_err(adev->dev,
3459 					"set_powergating_state(gate) of IP block <%s> failed %d\n",
3460 					adev->ip_blocks[i].version->funcs->name,
3461 					r);
3462 				return r;
3463 			}
3464 		}
3465 	}
3466 	return 0;
3467 }
3468 
amdgpu_device_enable_mgpu_fan_boost(void)3469 static int amdgpu_device_enable_mgpu_fan_boost(void)
3470 {
3471 	struct amdgpu_gpu_instance *gpu_ins;
3472 	struct amdgpu_device *adev;
3473 	int i, ret = 0;
3474 
3475 	mutex_lock(&mgpu_info.mutex);
3476 
3477 	/*
3478 	 * MGPU fan boost feature should be enabled
3479 	 * only when there are two or more dGPUs in
3480 	 * the system
3481 	 */
3482 	if (mgpu_info.num_dgpu < 2)
3483 		goto out;
3484 
3485 	for (i = 0; i < mgpu_info.num_dgpu; i++) {
3486 		gpu_ins = &(mgpu_info.gpu_ins[i]);
3487 		adev = gpu_ins->adev;
3488 		if (!(adev->flags & AMD_IS_APU || amdgpu_sriov_multi_vf_mode(adev)) &&
3489 		    !gpu_ins->mgpu_fan_enabled) {
3490 			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
3491 			if (ret)
3492 				break;
3493 
3494 			gpu_ins->mgpu_fan_enabled = 1;
3495 		}
3496 	}
3497 
3498 out:
3499 	mutex_unlock(&mgpu_info.mutex);
3500 
3501 	return ret;
3502 }
3503 
3504 /**
3505  * amdgpu_device_ip_late_init - run late init for hardware IPs
3506  *
3507  * @adev: amdgpu_device pointer
3508  *
3509  * Late initialization pass for hardware IPs.  The list of all the hardware
3510  * IPs that make up the asic is walked and the late_init callbacks are run.
3511  * late_init covers any special initialization that an IP requires
3512  * after all of the have been initialized or something that needs to happen
3513  * late in the init process.
3514  * Returns 0 on success, negative error code on failure.
3515  */
amdgpu_device_ip_late_init(struct amdgpu_device * adev)3516 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
3517 {
3518 	struct amdgpu_gpu_instance *gpu_instance;
3519 	int i = 0, r;
3520 
3521 	for (i = 0; i < adev->num_ip_blocks; i++) {
3522 		if (!adev->ip_blocks[i].status.hw)
3523 			continue;
3524 		if (adev->ip_blocks[i].version->funcs->late_init) {
3525 			r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
3526 			if (r) {
3527 				dev_err(adev->dev,
3528 					"late_init of IP block <%s> failed %d\n",
3529 					adev->ip_blocks[i].version->funcs->name,
3530 					r);
3531 				return r;
3532 			}
3533 		}
3534 		adev->ip_blocks[i].status.late_initialized = true;
3535 	}
3536 
3537 	r = amdgpu_ras_late_init(adev);
3538 	if (r) {
3539 		dev_err(adev->dev, "amdgpu_ras_late_init failed %d", r);
3540 		return r;
3541 	}
3542 
3543 	if (!amdgpu_reset_in_recovery(adev))
3544 		amdgpu_ras_set_error_query_ready(adev, true);
3545 
3546 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
3547 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
3548 
3549 	amdgpu_device_fill_reset_magic(adev);
3550 
3551 	r = amdgpu_device_enable_mgpu_fan_boost();
3552 	if (r)
3553 		dev_err(adev->dev, "enable mgpu fan boost failed (%d).\n", r);
3554 
3555 	/* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
3556 	if (amdgpu_passthrough(adev) &&
3557 	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
3558 	     adev->asic_type == CHIP_ALDEBARAN))
3559 		amdgpu_dpm_handle_passthrough_sbr(adev, true);
3560 
3561 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
3562 		mutex_lock(&mgpu_info.mutex);
3563 
3564 		/*
3565 		 * Reset device p-state to low as this was booted with high.
3566 		 *
3567 		 * This should be performed only after all devices from the same
3568 		 * hive get initialized.
3569 		 *
3570 		 * However, it's unknown how many device in the hive in advance.
3571 		 * As this is counted one by one during devices initializations.
3572 		 *
3573 		 * So, we wait for all XGMI interlinked devices initialized.
3574 		 * This may bring some delays as those devices may come from
3575 		 * different hives. But that should be OK.
3576 		 */
3577 		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
3578 			for (i = 0; i < mgpu_info.num_gpu; i++) {
3579 				gpu_instance = &(mgpu_info.gpu_ins[i]);
3580 				if (gpu_instance->adev->flags & AMD_IS_APU)
3581 					continue;
3582 
3583 				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
3584 						AMDGPU_XGMI_PSTATE_MIN);
3585 				if (r) {
3586 					dev_err(adev->dev,
3587 						"pstate setting failed (%d).\n",
3588 						r);
3589 					break;
3590 				}
3591 			}
3592 		}
3593 
3594 		mutex_unlock(&mgpu_info.mutex);
3595 	}
3596 
3597 	return 0;
3598 }
3599 
amdgpu_ip_block_hw_fini(struct amdgpu_ip_block * ip_block)3600 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block)
3601 {
3602 	struct amdgpu_device *adev = ip_block->adev;
3603 	int r;
3604 
3605 	if (!ip_block->version->funcs->hw_fini) {
3606 		dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n",
3607 			ip_block->version->funcs->name);
3608 	} else {
3609 		r = ip_block->version->funcs->hw_fini(ip_block);
3610 		/* XXX handle errors */
3611 		if (r) {
3612 			dev_dbg(adev->dev,
3613 				"hw_fini of IP block <%s> failed %d\n",
3614 				ip_block->version->funcs->name, r);
3615 		}
3616 	}
3617 
3618 	ip_block->status.hw = false;
3619 }
3620 
3621 /**
3622  * amdgpu_device_smu_fini_early - smu hw_fini wrapper
3623  *
3624  * @adev: amdgpu_device pointer
3625  *
3626  * For ASICs need to disable SMC first
3627  */
amdgpu_device_smu_fini_early(struct amdgpu_device * adev)3628 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
3629 {
3630 	int i;
3631 
3632 	if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
3633 		return;
3634 
3635 	for (i = 0; i < adev->num_ip_blocks; i++) {
3636 		if (!adev->ip_blocks[i].status.hw)
3637 			continue;
3638 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3639 			amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
3640 			break;
3641 		}
3642 	}
3643 }
3644 
amdgpu_device_ip_fini_early(struct amdgpu_device * adev)3645 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
3646 {
3647 	int i, r;
3648 
3649 	for (i = 0; i < adev->num_ip_blocks; i++) {
3650 		if (!adev->ip_blocks[i].version->funcs->early_fini)
3651 			continue;
3652 
3653 		r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
3654 		if (r) {
3655 			dev_dbg(adev->dev,
3656 				"early_fini of IP block <%s> failed %d\n",
3657 				adev->ip_blocks[i].version->funcs->name, r);
3658 		}
3659 	}
3660 
3661 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3662 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3663 
3664 	amdgpu_amdkfd_suspend(adev, true);
3665 	amdgpu_userq_suspend(adev);
3666 
3667 	/* Workaround for ASICs need to disable SMC first */
3668 	amdgpu_device_smu_fini_early(adev);
3669 
3670 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3671 		if (!adev->ip_blocks[i].status.hw)
3672 			continue;
3673 
3674 		amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
3675 	}
3676 
3677 	if (amdgpu_sriov_vf(adev)) {
3678 		if (amdgpu_virt_release_full_gpu(adev, false))
3679 			dev_err(adev->dev,
3680 				"failed to release exclusive mode on fini\n");
3681 	}
3682 
3683 	/*
3684 	 * Driver reload on the APU can fail due to firmware validation because
3685 	 * the PSP is always running, as it is shared across the whole SoC.
3686 	 * This same issue does not occur on dGPU because it has a mechanism
3687 	 * that checks whether the PSP is running. A solution for those issues
3688 	 * in the APU is to trigger a GPU reset, but this should be done during
3689 	 * the unload phase to avoid adding boot latency and screen flicker.
3690 	 */
3691 	if ((adev->flags & AMD_IS_APU) && !adev->gmc.is_app_apu) {
3692 		r = amdgpu_asic_reset(adev);
3693 		if (r)
3694 			dev_err(adev->dev, "asic reset on %s failed\n", __func__);
3695 	}
3696 
3697 	return 0;
3698 }
3699 
3700 /**
3701  * amdgpu_device_ip_fini - run fini for hardware IPs
3702  *
3703  * @adev: amdgpu_device pointer
3704  *
3705  * Main teardown pass for hardware IPs.  The list of all the hardware
3706  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
3707  * are run.  hw_fini tears down the hardware associated with each IP
3708  * and sw_fini tears down any software state associated with each IP.
3709  * Returns 0 on success, negative error code on failure.
3710  */
amdgpu_device_ip_fini(struct amdgpu_device * adev)3711 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
3712 {
3713 	int i, r;
3714 
3715 	amdgpu_cper_fini(adev);
3716 
3717 	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
3718 		amdgpu_virt_release_ras_err_handler_data(adev);
3719 
3720 	if (adev->gmc.xgmi.num_physical_nodes > 1)
3721 		amdgpu_xgmi_remove_device(adev);
3722 
3723 	amdgpu_amdkfd_device_fini_sw(adev);
3724 
3725 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3726 		if (!adev->ip_blocks[i].status.sw)
3727 			continue;
3728 
3729 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
3730 			amdgpu_ucode_free_bo(adev);
3731 			amdgpu_free_static_csa(&adev->virt.csa_obj);
3732 			amdgpu_device_wb_fini(adev);
3733 			amdgpu_device_mem_scratch_fini(adev);
3734 			amdgpu_ib_pool_fini(adev);
3735 			amdgpu_seq64_fini(adev);
3736 			amdgpu_doorbell_fini(adev);
3737 		}
3738 		if (adev->ip_blocks[i].version->funcs->sw_fini) {
3739 			r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]);
3740 			/* XXX handle errors */
3741 			if (r) {
3742 				dev_dbg(adev->dev,
3743 					"sw_fini of IP block <%s> failed %d\n",
3744 					adev->ip_blocks[i].version->funcs->name,
3745 					r);
3746 			}
3747 		}
3748 		adev->ip_blocks[i].status.sw = false;
3749 		adev->ip_blocks[i].status.valid = false;
3750 	}
3751 
3752 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3753 		if (!adev->ip_blocks[i].status.late_initialized)
3754 			continue;
3755 		if (adev->ip_blocks[i].version->funcs->late_fini)
3756 			adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]);
3757 		adev->ip_blocks[i].status.late_initialized = false;
3758 	}
3759 
3760 	amdgpu_ras_fini(adev);
3761 	amdgpu_uid_fini(adev);
3762 
3763 	return 0;
3764 }
3765 
3766 /**
3767  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
3768  *
3769  * @work: work_struct.
3770  */
amdgpu_device_delayed_init_work_handler(struct work_struct * work)3771 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
3772 {
3773 	struct amdgpu_device *adev =
3774 		container_of(work, struct amdgpu_device, delayed_init_work.work);
3775 	int r;
3776 
3777 	r = amdgpu_ib_ring_tests(adev);
3778 	if (r)
3779 		dev_err(adev->dev, "ib ring test failed (%d).\n", r);
3780 }
3781 
amdgpu_device_delay_enable_gfx_off(struct work_struct * work)3782 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
3783 {
3784 	struct amdgpu_device *adev =
3785 		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
3786 
3787 	WARN_ON_ONCE(adev->gfx.gfx_off_state);
3788 	WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
3789 
3790 	if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0))
3791 		adev->gfx.gfx_off_state = true;
3792 }
3793 
3794 /**
3795  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
3796  *
3797  * @adev: amdgpu_device pointer
3798  *
3799  * Main suspend function for hardware IPs.  The list of all the hardware
3800  * IPs that make up the asic is walked, clockgating is disabled and the
3801  * suspend callbacks are run.  suspend puts the hardware and software state
3802  * in each IP into a state suitable for suspend.
3803  * Returns 0 on success, negative error code on failure.
3804  */
amdgpu_device_ip_suspend_phase1(struct amdgpu_device * adev)3805 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
3806 {
3807 	int i, r, rec;
3808 
3809 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3810 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3811 
3812 	/*
3813 	 * Per PMFW team's suggestion, driver needs to handle gfxoff
3814 	 * and df cstate features disablement for gpu reset(e.g. Mode1Reset)
3815 	 * scenario. Add the missing df cstate disablement here.
3816 	 */
3817 	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
3818 		dev_warn(adev->dev, "Failed to disallow df cstate");
3819 
3820 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3821 		if (!adev->ip_blocks[i].status.valid)
3822 			continue;
3823 
3824 		/* displays are handled separately */
3825 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
3826 			continue;
3827 
3828 		r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
3829 		if (r)
3830 			goto unwind;
3831 	}
3832 
3833 	return 0;
3834 unwind:
3835 	rec = amdgpu_device_ip_resume_phase3(adev);
3836 	if (rec)
3837 		dev_err(adev->dev,
3838 			"amdgpu_device_ip_resume_phase3 failed during unwind: %d\n",
3839 			rec);
3840 
3841 	amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW);
3842 
3843 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
3844 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
3845 
3846 	return r;
3847 }
3848 
3849 /**
3850  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
3851  *
3852  * @adev: amdgpu_device pointer
3853  *
3854  * Main suspend function for hardware IPs.  The list of all the hardware
3855  * IPs that make up the asic is walked, clockgating is disabled and the
3856  * suspend callbacks are run.  suspend puts the hardware and software state
3857  * in each IP into a state suitable for suspend.
3858  * Returns 0 on success, negative error code on failure.
3859  */
amdgpu_device_ip_suspend_phase2(struct amdgpu_device * adev)3860 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
3861 {
3862 	int i, r, rec;
3863 
3864 	if (adev->in_s0ix)
3865 		amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
3866 
3867 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3868 		if (!adev->ip_blocks[i].status.valid)
3869 			continue;
3870 		/* displays are handled in phase1 */
3871 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
3872 			continue;
3873 		/* PSP lost connection when err_event_athub occurs */
3874 		if (amdgpu_ras_intr_triggered() &&
3875 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3876 			adev->ip_blocks[i].status.hw = false;
3877 			continue;
3878 		}
3879 
3880 		/* skip unnecessary suspend if we do not initialize them yet */
3881 		if (!amdgpu_ip_member_of_hwini(
3882 			    adev, adev->ip_blocks[i].version->type))
3883 			continue;
3884 
3885 		/* Since we skip suspend for S0i3, we need to cancel the delayed
3886 		 * idle work here as the suspend callback never gets called.
3887 		 */
3888 		if (adev->in_s0ix &&
3889 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX &&
3890 		    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0))
3891 			cancel_delayed_work_sync(&adev->gfx.idle_work);
3892 		/* skip suspend of gfx/mes and psp for S0ix
3893 		 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3894 		 * like at runtime. PSP is also part of the always on hardware
3895 		 * so no need to suspend it.
3896 		 */
3897 		if (adev->in_s0ix &&
3898 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3899 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3900 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3901 			continue;
3902 
3903 		/* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3904 		if (adev->in_s0ix &&
3905 		    (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >=
3906 		     IP_VERSION(5, 0, 0)) &&
3907 		    (adev->ip_blocks[i].version->type ==
3908 		     AMD_IP_BLOCK_TYPE_SDMA))
3909 			continue;
3910 
3911 		/* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot.
3912 		 * These are in TMR, hence are expected to be reused by PSP-TOS to reload
3913 		 * from this location and RLC Autoload automatically also gets loaded
3914 		 * from here based on PMFW -> PSP message during re-init sequence.
3915 		 * Therefore, the psp suspend & resume should be skipped to avoid destroy
3916 		 * the TMR and reload FWs again for IMU enabled APU ASICs.
3917 		 */
3918 		if (amdgpu_in_reset(adev) &&
3919 		    (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3920 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3921 			continue;
3922 
3923 		r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
3924 		if (r)
3925 			goto unwind;
3926 
3927 		/* handle putting the SMC in the appropriate state */
3928 		if (!amdgpu_sriov_vf(adev)) {
3929 			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3930 				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3931 				if (r) {
3932 					dev_err(adev->dev,
3933 						"SMC failed to set mp1 state %d, %d\n",
3934 						adev->mp1_state, r);
3935 					goto unwind;
3936 				}
3937 			}
3938 		}
3939 	}
3940 
3941 	return 0;
3942 unwind:
3943 	/* suspend phase 2 = resume phase 1 + resume phase 2 */
3944 	rec = amdgpu_device_ip_resume_phase1(adev);
3945 	if (rec) {
3946 		dev_err(adev->dev,
3947 			"amdgpu_device_ip_resume_phase1 failed during unwind: %d\n",
3948 			rec);
3949 		return r;
3950 	}
3951 
3952 	rec = amdgpu_device_fw_loading(adev);
3953 	if (rec) {
3954 		dev_err(adev->dev,
3955 			"amdgpu_device_fw_loading failed during unwind: %d\n",
3956 			rec);
3957 		return r;
3958 	}
3959 
3960 	rec = amdgpu_device_ip_resume_phase2(adev);
3961 	if (rec) {
3962 		dev_err(adev->dev,
3963 			"amdgpu_device_ip_resume_phase2 failed during unwind: %d\n",
3964 			rec);
3965 		return r;
3966 	}
3967 
3968 	return r;
3969 }
3970 
3971 /**
3972  * amdgpu_device_ip_suspend - run suspend for hardware IPs
3973  *
3974  * @adev: amdgpu_device pointer
3975  *
3976  * Main suspend function for hardware IPs.  The list of all the hardware
3977  * IPs that make up the asic is walked, clockgating is disabled and the
3978  * suspend callbacks are run.  suspend puts the hardware and software state
3979  * in each IP into a state suitable for suspend.
3980  * Returns 0 on success, negative error code on failure.
3981  */
amdgpu_device_ip_suspend(struct amdgpu_device * adev)3982 static int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3983 {
3984 	int r;
3985 
3986 	if (amdgpu_sriov_vf(adev)) {
3987 		amdgpu_virt_fini_data_exchange(adev);
3988 		amdgpu_virt_request_full_gpu(adev, false);
3989 	}
3990 
3991 	amdgpu_ttm_set_buffer_funcs_status(adev, false);
3992 
3993 	r = amdgpu_device_ip_suspend_phase1(adev);
3994 	if (r)
3995 		return r;
3996 	r = amdgpu_device_ip_suspend_phase2(adev);
3997 
3998 	if (amdgpu_sriov_vf(adev))
3999 		amdgpu_virt_release_full_gpu(adev, false);
4000 
4001 	return r;
4002 }
4003 
amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device * adev)4004 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
4005 {
4006 	int i, r;
4007 
4008 	static enum amd_ip_block_type ip_order[] = {
4009 		AMD_IP_BLOCK_TYPE_COMMON,
4010 		AMD_IP_BLOCK_TYPE_GMC,
4011 		AMD_IP_BLOCK_TYPE_PSP,
4012 		AMD_IP_BLOCK_TYPE_IH,
4013 	};
4014 
4015 	for (i = 0; i < adev->num_ip_blocks; i++) {
4016 		int j;
4017 		struct amdgpu_ip_block *block;
4018 
4019 		block = &adev->ip_blocks[i];
4020 		block->status.hw = false;
4021 
4022 		for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
4023 
4024 			if (block->version->type != ip_order[j] ||
4025 				!block->status.valid)
4026 				continue;
4027 
4028 			r = block->version->funcs->hw_init(&adev->ip_blocks[i]);
4029 			if (r) {
4030 				dev_err(adev->dev, "RE-INIT-early: %s failed\n",
4031 					 block->version->funcs->name);
4032 				return r;
4033 			}
4034 			block->status.hw = true;
4035 		}
4036 	}
4037 
4038 	return 0;
4039 }
4040 
amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device * adev)4041 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
4042 {
4043 	struct amdgpu_ip_block *block;
4044 	int i, r = 0;
4045 
4046 	static enum amd_ip_block_type ip_order[] = {
4047 		AMD_IP_BLOCK_TYPE_SMC,
4048 		AMD_IP_BLOCK_TYPE_DCE,
4049 		AMD_IP_BLOCK_TYPE_GFX,
4050 		AMD_IP_BLOCK_TYPE_SDMA,
4051 		AMD_IP_BLOCK_TYPE_MES,
4052 		AMD_IP_BLOCK_TYPE_UVD,
4053 		AMD_IP_BLOCK_TYPE_VCE,
4054 		AMD_IP_BLOCK_TYPE_VCN,
4055 		AMD_IP_BLOCK_TYPE_JPEG
4056 	};
4057 
4058 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
4059 		block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]);
4060 
4061 		if (!block)
4062 			continue;
4063 
4064 		if (block->status.valid && !block->status.hw) {
4065 			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) {
4066 				r = amdgpu_ip_block_resume(block);
4067 			} else {
4068 				r = block->version->funcs->hw_init(block);
4069 			}
4070 
4071 			if (r) {
4072 				dev_err(adev->dev, "RE-INIT-late: %s failed\n",
4073 					 block->version->funcs->name);
4074 				break;
4075 			}
4076 			block->status.hw = true;
4077 		}
4078 	}
4079 
4080 	return r;
4081 }
4082 
4083 /**
4084  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
4085  *
4086  * @adev: amdgpu_device pointer
4087  *
4088  * First resume function for hardware IPs.  The list of all the hardware
4089  * IPs that make up the asic is walked and the resume callbacks are run for
4090  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
4091  * after a suspend and updates the software state as necessary.  This
4092  * function is also used for restoring the GPU after a GPU reset.
4093  * Returns 0 on success, negative error code on failure.
4094  */
amdgpu_device_ip_resume_phase1(struct amdgpu_device * adev)4095 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
4096 {
4097 	int i, r;
4098 
4099 	for (i = 0; i < adev->num_ip_blocks; i++) {
4100 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
4101 			continue;
4102 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
4103 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
4104 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
4105 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
4106 
4107 			r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
4108 			if (r)
4109 				return r;
4110 		}
4111 	}
4112 
4113 	return 0;
4114 }
4115 
4116 /**
4117  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
4118  *
4119  * @adev: amdgpu_device pointer
4120  *
4121  * Second resume function for hardware IPs.  The list of all the hardware
4122  * IPs that make up the asic is walked and the resume callbacks are run for
4123  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
4124  * functional state after a suspend and updates the software state as
4125  * necessary.  This function is also used for restoring the GPU after a GPU
4126  * reset.
4127  * Returns 0 on success, negative error code on failure.
4128  */
amdgpu_device_ip_resume_phase2(struct amdgpu_device * adev)4129 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
4130 {
4131 	int i, r;
4132 
4133 	for (i = 0; i < adev->num_ip_blocks; i++) {
4134 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
4135 			continue;
4136 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
4137 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
4138 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
4139 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE ||
4140 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
4141 			continue;
4142 		r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
4143 		if (r)
4144 			return r;
4145 	}
4146 
4147 	return 0;
4148 }
4149 
4150 /**
4151  * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs
4152  *
4153  * @adev: amdgpu_device pointer
4154  *
4155  * Third resume function for hardware IPs.  The list of all the hardware
4156  * IPs that make up the asic is walked and the resume callbacks are run for
4157  * all DCE.  resume puts the hardware into a functional state after a suspend
4158  * and updates the software state as necessary.  This function is also used
4159  * for restoring the GPU after a GPU reset.
4160  *
4161  * Returns 0 on success, negative error code on failure.
4162  */
amdgpu_device_ip_resume_phase3(struct amdgpu_device * adev)4163 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
4164 {
4165 	int i, r;
4166 
4167 	for (i = 0; i < adev->num_ip_blocks; i++) {
4168 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
4169 			continue;
4170 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
4171 			r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
4172 			if (r)
4173 				return r;
4174 		}
4175 	}
4176 
4177 	return 0;
4178 }
4179 
4180 /**
4181  * amdgpu_device_ip_resume - run resume for hardware IPs
4182  *
4183  * @adev: amdgpu_device pointer
4184  *
4185  * Main resume function for hardware IPs.  The hardware IPs
4186  * are split into two resume functions because they are
4187  * also used in recovering from a GPU reset and some additional
4188  * steps need to be take between them.  In this case (S3/S4) they are
4189  * run sequentially.
4190  * Returns 0 on success, negative error code on failure.
4191  */
amdgpu_device_ip_resume(struct amdgpu_device * adev)4192 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
4193 {
4194 	int r;
4195 
4196 	r = amdgpu_device_ip_resume_phase1(adev);
4197 	if (r)
4198 		return r;
4199 
4200 	r = amdgpu_device_fw_loading(adev);
4201 	if (r)
4202 		return r;
4203 
4204 	r = amdgpu_device_ip_resume_phase2(adev);
4205 
4206 	if (adev->mman.buffer_funcs_ring->sched.ready)
4207 		amdgpu_ttm_set_buffer_funcs_status(adev, true);
4208 
4209 	if (r)
4210 		return r;
4211 
4212 	amdgpu_fence_driver_hw_init(adev);
4213 
4214 	r = amdgpu_device_ip_resume_phase3(adev);
4215 
4216 	return r;
4217 }
4218 
4219 /**
4220  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
4221  *
4222  * @adev: amdgpu_device pointer
4223  *
4224  * Query the VBIOS data tables to determine if the board supports SR-IOV.
4225  */
amdgpu_device_detect_sriov_bios(struct amdgpu_device * adev)4226 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
4227 {
4228 	if (amdgpu_sriov_vf(adev)) {
4229 		if (adev->is_atom_fw) {
4230 			if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
4231 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
4232 		} else {
4233 			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
4234 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
4235 		}
4236 
4237 		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
4238 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
4239 	}
4240 }
4241 
4242 /**
4243  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
4244  *
4245  * @pdev : pci device context
4246  * @asic_type: AMD asic type
4247  *
4248  * Check if there is DC (new modesetting infrastructre) support for an asic.
4249  * returns true if DC has support, false if not.
4250  */
amdgpu_device_asic_has_dc_support(struct pci_dev * pdev,enum amd_asic_type asic_type)4251 bool amdgpu_device_asic_has_dc_support(struct pci_dev *pdev,
4252 				       enum amd_asic_type asic_type)
4253 {
4254 	switch (asic_type) {
4255 #ifdef CONFIG_DRM_AMDGPU_SI
4256 	case CHIP_HAINAN:
4257 #endif
4258 	case CHIP_TOPAZ:
4259 		/* chips with no display hardware */
4260 		return false;
4261 #if defined(CONFIG_DRM_AMD_DC)
4262 	case CHIP_TAHITI:
4263 	case CHIP_PITCAIRN:
4264 	case CHIP_VERDE:
4265 	case CHIP_OLAND:
4266 		return amdgpu_dc != 0 && IS_ENABLED(CONFIG_DRM_AMD_DC_SI);
4267 	case CHIP_KAVERI:
4268 	case CHIP_KABINI:
4269 	case CHIP_MULLINS:
4270 		/*
4271 		 * We have systems in the wild with these ASICs that require
4272 		 * TRAVIS and NUTMEG support which is not supported with DC.
4273 		 *
4274 		 * Fallback to the non-DC driver here by default so as not to
4275 		 * cause regressions.
4276 		 */
4277 		return amdgpu_dc > 0;
4278 	default:
4279 		return amdgpu_dc != 0;
4280 #else
4281 	default:
4282 		if (amdgpu_dc > 0)
4283 			dev_info_once(
4284 				&pdev->dev,
4285 				"Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
4286 		return false;
4287 #endif
4288 	}
4289 }
4290 
4291 /**
4292  * amdgpu_device_has_dc_support - check if dc is supported
4293  *
4294  * @adev: amdgpu_device pointer
4295  *
4296  * Returns true for supported, false for not supported
4297  */
amdgpu_device_has_dc_support(struct amdgpu_device * adev)4298 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
4299 {
4300 	if (adev->enable_virtual_display ||
4301 	    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
4302 		return false;
4303 
4304 	return amdgpu_device_asic_has_dc_support(adev->pdev, adev->asic_type);
4305 }
4306 
amdgpu_device_xgmi_reset_func(struct work_struct * __work)4307 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
4308 {
4309 	struct amdgpu_device *adev =
4310 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
4311 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
4312 
4313 	/* It's a bug to not have a hive within this function */
4314 	if (WARN_ON(!hive))
4315 		return;
4316 
4317 	/*
4318 	 * Use task barrier to synchronize all xgmi reset works across the
4319 	 * hive. task_barrier_enter and task_barrier_exit will block
4320 	 * until all the threads running the xgmi reset works reach
4321 	 * those points. task_barrier_full will do both blocks.
4322 	 */
4323 	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
4324 
4325 		task_barrier_enter(&hive->tb);
4326 		adev->asic_reset_res = amdgpu_device_baco_enter(adev);
4327 
4328 		if (adev->asic_reset_res)
4329 			goto fail;
4330 
4331 		task_barrier_exit(&hive->tb);
4332 		adev->asic_reset_res = amdgpu_device_baco_exit(adev);
4333 
4334 		if (adev->asic_reset_res)
4335 			goto fail;
4336 
4337 		amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
4338 	} else {
4339 
4340 		task_barrier_full(&hive->tb);
4341 		adev->asic_reset_res =  amdgpu_asic_reset(adev);
4342 	}
4343 
4344 fail:
4345 	if (adev->asic_reset_res)
4346 		dev_warn(adev->dev,
4347 			 "ASIC reset failed with error, %d for drm dev, %s",
4348 			 adev->asic_reset_res, adev_to_drm(adev)->unique);
4349 	amdgpu_put_xgmi_hive(hive);
4350 }
4351 
amdgpu_device_get_job_timeout_settings(struct amdgpu_device * adev)4352 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
4353 {
4354 	char *input = amdgpu_lockup_timeout;
4355 	char *timeout_setting = NULL;
4356 	int index = 0;
4357 	long timeout;
4358 	int ret = 0;
4359 
4360 	/* By default timeout for all queues is 2 sec */
4361 	adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
4362 		adev->video_timeout = msecs_to_jiffies(2000);
4363 
4364 	if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
4365 		return 0;
4366 
4367 	while ((timeout_setting = strsep(&input, ",")) &&
4368 	       strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
4369 		ret = kstrtol(timeout_setting, 0, &timeout);
4370 		if (ret)
4371 			return ret;
4372 
4373 		if (timeout == 0) {
4374 			index++;
4375 			continue;
4376 		} else if (timeout < 0) {
4377 			timeout = MAX_SCHEDULE_TIMEOUT;
4378 			dev_warn(adev->dev, "lockup timeout disabled");
4379 			add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
4380 		} else {
4381 			timeout = msecs_to_jiffies(timeout);
4382 		}
4383 
4384 		switch (index++) {
4385 		case 0:
4386 			adev->gfx_timeout = timeout;
4387 			break;
4388 		case 1:
4389 			adev->compute_timeout = timeout;
4390 			break;
4391 		case 2:
4392 			adev->sdma_timeout = timeout;
4393 			break;
4394 		case 3:
4395 			adev->video_timeout = timeout;
4396 			break;
4397 		default:
4398 			break;
4399 		}
4400 	}
4401 
4402 	/* When only one value specified apply it to all queues. */
4403 	if (index == 1)
4404 		adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
4405 			adev->video_timeout = timeout;
4406 
4407 	return ret;
4408 }
4409 
4410 /**
4411  * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
4412  *
4413  * @adev: amdgpu_device pointer
4414  *
4415  * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode
4416  */
amdgpu_device_check_iommu_direct_map(struct amdgpu_device * adev)4417 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
4418 {
4419 	struct iommu_domain *domain;
4420 
4421 	domain = iommu_get_domain_for_dev(adev->dev);
4422 	if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
4423 		adev->ram_is_direct_mapped = true;
4424 }
4425 
4426 #if defined(CONFIG_HSA_AMD_P2P)
4427 /**
4428  * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
4429  *
4430  * @adev: amdgpu_device pointer
4431  *
4432  * return if IOMMU remapping bar address
4433  */
amdgpu_device_check_iommu_remap(struct amdgpu_device * adev)4434 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
4435 {
4436 	struct iommu_domain *domain;
4437 
4438 	domain = iommu_get_domain_for_dev(adev->dev);
4439 	if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
4440 		domain->type ==	IOMMU_DOMAIN_DMA_FQ))
4441 		return true;
4442 
4443 	return false;
4444 }
4445 #endif
4446 
amdgpu_device_set_mcbp(struct amdgpu_device * adev)4447 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
4448 {
4449 	if (amdgpu_mcbp == 1)
4450 		adev->gfx.mcbp = true;
4451 	else if (amdgpu_mcbp == 0)
4452 		adev->gfx.mcbp = false;
4453 
4454 	if (amdgpu_sriov_vf(adev))
4455 		adev->gfx.mcbp = true;
4456 
4457 	if (adev->gfx.mcbp)
4458 		dev_info(adev->dev, "MCBP is enabled\n");
4459 }
4460 
amdgpu_device_sys_interface_init(struct amdgpu_device * adev)4461 static int amdgpu_device_sys_interface_init(struct amdgpu_device *adev)
4462 {
4463 	int r;
4464 
4465 	r = amdgpu_atombios_sysfs_init(adev);
4466 	if (r)
4467 		drm_err(&adev->ddev,
4468 			"registering atombios sysfs failed (%d).\n", r);
4469 
4470 	r = amdgpu_pm_sysfs_init(adev);
4471 	if (r)
4472 		dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r);
4473 
4474 	r = amdgpu_ucode_sysfs_init(adev);
4475 	if (r) {
4476 		adev->ucode_sysfs_en = false;
4477 		dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r);
4478 	} else
4479 		adev->ucode_sysfs_en = true;
4480 
4481 	r = amdgpu_device_attr_sysfs_init(adev);
4482 	if (r)
4483 		dev_err(adev->dev, "Could not create amdgpu device attr\n");
4484 
4485 	r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
4486 	if (r)
4487 		dev_err(adev->dev,
4488 			"Could not create amdgpu board attributes\n");
4489 
4490 	amdgpu_fru_sysfs_init(adev);
4491 	amdgpu_reg_state_sysfs_init(adev);
4492 	amdgpu_xcp_sysfs_init(adev);
4493 
4494 	return r;
4495 }
4496 
amdgpu_device_sys_interface_fini(struct amdgpu_device * adev)4497 static void amdgpu_device_sys_interface_fini(struct amdgpu_device *adev)
4498 {
4499 	if (adev->pm.sysfs_initialized)
4500 		amdgpu_pm_sysfs_fini(adev);
4501 	if (adev->ucode_sysfs_en)
4502 		amdgpu_ucode_sysfs_fini(adev);
4503 	amdgpu_device_attr_sysfs_fini(adev);
4504 	amdgpu_fru_sysfs_fini(adev);
4505 
4506 	amdgpu_reg_state_sysfs_fini(adev);
4507 	amdgpu_xcp_sysfs_fini(adev);
4508 }
4509 
4510 /**
4511  * amdgpu_device_init - initialize the driver
4512  *
4513  * @adev: amdgpu_device pointer
4514  * @flags: driver flags
4515  *
4516  * Initializes the driver info and hw (all asics).
4517  * Returns 0 for success or an error on failure.
4518  * Called at driver startup.
4519  */
amdgpu_device_init(struct amdgpu_device * adev,uint32_t flags)4520 int amdgpu_device_init(struct amdgpu_device *adev,
4521 		       uint32_t flags)
4522 {
4523 	struct pci_dev *pdev = adev->pdev;
4524 	int r, i;
4525 	bool px = false;
4526 	u32 max_MBps;
4527 	int tmp;
4528 
4529 	adev->shutdown = false;
4530 	adev->flags = flags;
4531 
4532 	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
4533 		adev->asic_type = amdgpu_force_asic_type;
4534 	else
4535 		adev->asic_type = flags & AMD_ASIC_MASK;
4536 
4537 	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
4538 	if (amdgpu_emu_mode == 1)
4539 		adev->usec_timeout *= 10;
4540 	adev->gmc.gart_size = 512 * 1024 * 1024;
4541 	adev->accel_working = false;
4542 	adev->num_rings = 0;
4543 	RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
4544 	adev->mman.buffer_funcs = NULL;
4545 	adev->mman.buffer_funcs_ring = NULL;
4546 	adev->vm_manager.vm_pte_funcs = NULL;
4547 	adev->vm_manager.vm_pte_num_scheds = 0;
4548 	adev->gmc.gmc_funcs = NULL;
4549 	adev->harvest_ip_mask = 0x0;
4550 	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
4551 	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
4552 
4553 	adev->smc_rreg = &amdgpu_invalid_rreg;
4554 	adev->smc_wreg = &amdgpu_invalid_wreg;
4555 	adev->pcie_rreg = &amdgpu_invalid_rreg;
4556 	adev->pcie_wreg = &amdgpu_invalid_wreg;
4557 	adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
4558 	adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
4559 	adev->pciep_rreg = &amdgpu_invalid_rreg;
4560 	adev->pciep_wreg = &amdgpu_invalid_wreg;
4561 	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
4562 	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
4563 	adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext;
4564 	adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;
4565 	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
4566 	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
4567 	adev->didt_rreg = &amdgpu_invalid_rreg;
4568 	adev->didt_wreg = &amdgpu_invalid_wreg;
4569 	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
4570 	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
4571 	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
4572 	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
4573 
4574 	dev_info(
4575 		adev->dev,
4576 		"initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
4577 		amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
4578 		pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
4579 
4580 	/* mutex initialization are all done here so we
4581 	 * can recall function without having locking issues
4582 	 */
4583 	mutex_init(&adev->firmware.mutex);
4584 	mutex_init(&adev->pm.mutex);
4585 	mutex_init(&adev->gfx.gpu_clock_mutex);
4586 	mutex_init(&adev->srbm_mutex);
4587 	mutex_init(&adev->gfx.pipe_reserve_mutex);
4588 	mutex_init(&adev->gfx.gfx_off_mutex);
4589 	mutex_init(&adev->gfx.partition_mutex);
4590 	mutex_init(&adev->grbm_idx_mutex);
4591 	mutex_init(&adev->mn_lock);
4592 	mutex_init(&adev->virt.vf_errors.lock);
4593 	hash_init(adev->mn_hash);
4594 	mutex_init(&adev->psp.mutex);
4595 	mutex_init(&adev->notifier_lock);
4596 	mutex_init(&adev->pm.stable_pstate_ctx_lock);
4597 	mutex_init(&adev->benchmark_mutex);
4598 	mutex_init(&adev->gfx.reset_sem_mutex);
4599 	/* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
4600 	mutex_init(&adev->enforce_isolation_mutex);
4601 	for (i = 0; i < MAX_XCP; ++i) {
4602 		adev->isolation[i].spearhead = dma_fence_get_stub();
4603 		amdgpu_sync_create(&adev->isolation[i].active);
4604 		amdgpu_sync_create(&adev->isolation[i].prev);
4605 	}
4606 	mutex_init(&adev->gfx.userq_sch_mutex);
4607 	mutex_init(&adev->gfx.workload_profile_mutex);
4608 	mutex_init(&adev->vcn.workload_profile_mutex);
4609 
4610 	amdgpu_device_init_apu_flags(adev);
4611 
4612 	r = amdgpu_device_check_arguments(adev);
4613 	if (r)
4614 		return r;
4615 
4616 	spin_lock_init(&adev->mmio_idx_lock);
4617 	spin_lock_init(&adev->smc_idx_lock);
4618 	spin_lock_init(&adev->pcie_idx_lock);
4619 	spin_lock_init(&adev->uvd_ctx_idx_lock);
4620 	spin_lock_init(&adev->didt_idx_lock);
4621 	spin_lock_init(&adev->gc_cac_idx_lock);
4622 	spin_lock_init(&adev->se_cac_idx_lock);
4623 	spin_lock_init(&adev->audio_endpt_idx_lock);
4624 	spin_lock_init(&adev->mm_stats.lock);
4625 	spin_lock_init(&adev->virt.rlcg_reg_lock);
4626 	spin_lock_init(&adev->wb.lock);
4627 
4628 	xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ);
4629 
4630 	INIT_LIST_HEAD(&adev->reset_list);
4631 
4632 	INIT_LIST_HEAD(&adev->ras_list);
4633 
4634 	INIT_LIST_HEAD(&adev->pm.od_kobj_list);
4635 
4636 	xa_init(&adev->userq_doorbell_xa);
4637 
4638 	INIT_DELAYED_WORK(&adev->delayed_init_work,
4639 			  amdgpu_device_delayed_init_work_handler);
4640 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
4641 			  amdgpu_device_delay_enable_gfx_off);
4642 	/*
4643 	 * Initialize the enforce_isolation work structures for each XCP
4644 	 * partition.  This work handler is responsible for enforcing shader
4645 	 * isolation on AMD GPUs.  It counts the number of emitted fences for
4646 	 * each GFX and compute ring.  If there are any fences, it schedules
4647 	 * the `enforce_isolation_work` to be run after a delay.  If there are
4648 	 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
4649 	 * runqueue.
4650 	 */
4651 	for (i = 0; i < MAX_XCP; i++) {
4652 		INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
4653 				  amdgpu_gfx_enforce_isolation_handler);
4654 		adev->gfx.enforce_isolation[i].adev = adev;
4655 		adev->gfx.enforce_isolation[i].xcp_id = i;
4656 	}
4657 
4658 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
4659 	INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
4660 
4661 	adev->gfx.gfx_off_req_count = 1;
4662 	adev->gfx.gfx_off_residency = 0;
4663 	adev->gfx.gfx_off_entrycount = 0;
4664 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
4665 
4666 	atomic_set(&adev->throttling_logging_enabled, 1);
4667 	/*
4668 	 * If throttling continues, logging will be performed every minute
4669 	 * to avoid log flooding. "-1" is subtracted since the thermal
4670 	 * throttling interrupt comes every second. Thus, the total logging
4671 	 * interval is 59 seconds(retelimited printk interval) + 1(waiting
4672 	 * for throttling interrupt) = 60 seconds.
4673 	 */
4674 	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
4675 
4676 	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
4677 
4678 	/* Registers mapping */
4679 	/* TODO: block userspace mapping of io register */
4680 	if (adev->asic_type >= CHIP_BONAIRE) {
4681 		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
4682 		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
4683 	} else {
4684 		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
4685 		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
4686 	}
4687 
4688 	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
4689 		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
4690 
4691 	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
4692 	if (!adev->rmmio)
4693 		return -ENOMEM;
4694 
4695 	dev_info(adev->dev, "register mmio base: 0x%08X\n",
4696 		 (uint32_t)adev->rmmio_base);
4697 	dev_info(adev->dev, "register mmio size: %u\n",
4698 		 (unsigned int)adev->rmmio_size);
4699 
4700 	/*
4701 	 * Reset domain needs to be present early, before XGMI hive discovered
4702 	 * (if any) and initialized to use reset sem and in_gpu reset flag
4703 	 * early on during init and before calling to RREG32.
4704 	 */
4705 	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
4706 	if (!adev->reset_domain)
4707 		return -ENOMEM;
4708 
4709 	/* detect hw virtualization here */
4710 	amdgpu_virt_init(adev);
4711 
4712 	amdgpu_device_get_pcie_info(adev);
4713 
4714 	r = amdgpu_device_get_job_timeout_settings(adev);
4715 	if (r) {
4716 		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4717 		return r;
4718 	}
4719 
4720 	amdgpu_device_set_mcbp(adev);
4721 
4722 	/*
4723 	 * By default, use default mode where all blocks are expected to be
4724 	 * initialized. At present a 'swinit' of blocks is required to be
4725 	 * completed before the need for a different level is detected.
4726 	 */
4727 	amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
4728 	/* early init functions */
4729 	r = amdgpu_device_ip_early_init(adev);
4730 	if (r)
4731 		return r;
4732 
4733 	/*
4734 	 * No need to remove conflicting FBs for non-display class devices.
4735 	 * This prevents the sysfb from being freed accidently.
4736 	 */
4737 	if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA ||
4738 	    (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) {
4739 		/* Get rid of things like offb */
4740 		r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
4741 		if (r)
4742 			return r;
4743 	}
4744 
4745 	/* Enable TMZ based on IP_VERSION */
4746 	amdgpu_gmc_tmz_set(adev);
4747 
4748 	if (amdgpu_sriov_vf(adev) &&
4749 	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
4750 		/* VF MMIO access (except mailbox range) from CPU
4751 		 * will be blocked during sriov runtime
4752 		 */
4753 		adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
4754 
4755 	amdgpu_gmc_noretry_set(adev);
4756 	/* Need to get xgmi info early to decide the reset behavior*/
4757 	if (adev->gmc.xgmi.supported) {
4758 		r = adev->gfxhub.funcs->get_xgmi_info(adev);
4759 		if (r)
4760 			return r;
4761 	}
4762 
4763 	/* enable PCIE atomic ops */
4764 	if (amdgpu_sriov_vf(adev)) {
4765 		if (adev->virt.fw_reserve.p_pf2vf)
4766 			adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
4767 						      adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
4768 				(PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4769 	/* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a
4770 	 * internal path natively support atomics, set have_atomics_support to true.
4771 	 */
4772 	} else if ((adev->flags & AMD_IS_APU) &&
4773 		   (amdgpu_ip_version(adev, GC_HWIP, 0) >
4774 		    IP_VERSION(9, 0, 0))) {
4775 		adev->have_atomics_support = true;
4776 	} else {
4777 		adev->have_atomics_support =
4778 			!pci_enable_atomic_ops_to_root(adev->pdev,
4779 					  PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
4780 					  PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4781 	}
4782 
4783 	if (!adev->have_atomics_support)
4784 		dev_info(adev->dev, "PCIE atomic ops is not supported\n");
4785 
4786 	/* doorbell bar mapping and doorbell index init*/
4787 	amdgpu_doorbell_init(adev);
4788 
4789 	if (amdgpu_emu_mode == 1) {
4790 		/* post the asic on emulation mode */
4791 		emu_soc_asic_init(adev);
4792 		goto fence_driver_init;
4793 	}
4794 
4795 	amdgpu_reset_init(adev);
4796 
4797 	/* detect if we are with an SRIOV vbios */
4798 	if (adev->bios)
4799 		amdgpu_device_detect_sriov_bios(adev);
4800 
4801 	/* check if we need to reset the asic
4802 	 *  E.g., driver was not cleanly unloaded previously, etc.
4803 	 */
4804 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
4805 		if (adev->gmc.xgmi.num_physical_nodes) {
4806 			dev_info(adev->dev, "Pending hive reset.\n");
4807 			amdgpu_set_init_level(adev,
4808 					      AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
4809 		} else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
4810 				   !amdgpu_device_has_display_hardware(adev)) {
4811 					r = psp_gpu_reset(adev);
4812 		} else {
4813 				tmp = amdgpu_reset_method;
4814 				/* It should do a default reset when loading or reloading the driver,
4815 				 * regardless of the module parameter reset_method.
4816 				 */
4817 				amdgpu_reset_method = AMD_RESET_METHOD_NONE;
4818 				r = amdgpu_asic_reset(adev);
4819 				amdgpu_reset_method = tmp;
4820 		}
4821 
4822 		if (r) {
4823 		  dev_err(adev->dev, "asic reset on init failed\n");
4824 		  goto failed;
4825 		}
4826 	}
4827 
4828 	/* Post card if necessary */
4829 	if (amdgpu_device_need_post(adev)) {
4830 		if (!adev->bios) {
4831 			dev_err(adev->dev, "no vBIOS found\n");
4832 			r = -EINVAL;
4833 			goto failed;
4834 		}
4835 		dev_info(adev->dev, "GPU posting now...\n");
4836 		r = amdgpu_device_asic_init(adev);
4837 		if (r) {
4838 			dev_err(adev->dev, "gpu post error!\n");
4839 			goto failed;
4840 		}
4841 	}
4842 
4843 	if (adev->bios) {
4844 		if (adev->is_atom_fw) {
4845 			/* Initialize clocks */
4846 			r = amdgpu_atomfirmware_get_clock_info(adev);
4847 			if (r) {
4848 				dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
4849 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4850 				goto failed;
4851 			}
4852 		} else {
4853 			/* Initialize clocks */
4854 			r = amdgpu_atombios_get_clock_info(adev);
4855 			if (r) {
4856 				dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
4857 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4858 				goto failed;
4859 			}
4860 			/* init i2c buses */
4861 			amdgpu_i2c_init(adev);
4862 		}
4863 	}
4864 
4865 fence_driver_init:
4866 	/* Fence driver */
4867 	r = amdgpu_fence_driver_sw_init(adev);
4868 	if (r) {
4869 		dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
4870 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
4871 		goto failed;
4872 	}
4873 
4874 	/* init the mode config */
4875 	drm_mode_config_init(adev_to_drm(adev));
4876 
4877 	r = amdgpu_device_ip_init(adev);
4878 	if (r) {
4879 		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
4880 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
4881 		goto release_ras_con;
4882 	}
4883 
4884 	amdgpu_fence_driver_hw_init(adev);
4885 
4886 	dev_info(adev->dev,
4887 		"SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
4888 			adev->gfx.config.max_shader_engines,
4889 			adev->gfx.config.max_sh_per_se,
4890 			adev->gfx.config.max_cu_per_sh,
4891 			adev->gfx.cu_info.number);
4892 
4893 	adev->accel_working = true;
4894 
4895 	amdgpu_vm_check_compute_bug(adev);
4896 
4897 	/* Initialize the buffer migration limit. */
4898 	if (amdgpu_moverate >= 0)
4899 		max_MBps = amdgpu_moverate;
4900 	else
4901 		max_MBps = 8; /* Allow 8 MB/s. */
4902 	/* Get a log2 for easy divisions. */
4903 	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
4904 
4905 	/*
4906 	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
4907 	 * Otherwise the mgpu fan boost feature will be skipped due to the
4908 	 * gpu instance is counted less.
4909 	 */
4910 	amdgpu_register_gpu_instance(adev);
4911 
4912 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
4913 	 * explicit gating rather than handling it automatically.
4914 	 */
4915 	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
4916 		r = amdgpu_device_ip_late_init(adev);
4917 		if (r) {
4918 			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
4919 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
4920 			goto release_ras_con;
4921 		}
4922 		/* must succeed. */
4923 		amdgpu_ras_resume(adev);
4924 		queue_delayed_work(system_wq, &adev->delayed_init_work,
4925 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
4926 	}
4927 
4928 	if (amdgpu_sriov_vf(adev)) {
4929 		amdgpu_virt_release_full_gpu(adev, true);
4930 		flush_delayed_work(&adev->delayed_init_work);
4931 	}
4932 
4933 	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
4934 		amdgpu_xgmi_reset_on_init(adev);
4935 	/*
4936 	 * Place those sysfs registering after `late_init`. As some of those
4937 	 * operations performed in `late_init` might affect the sysfs
4938 	 * interfaces creating.
4939 	 */
4940 	r = amdgpu_device_sys_interface_init(adev);
4941 
4942 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
4943 		r = amdgpu_pmu_init(adev);
4944 	if (r)
4945 		dev_err(adev->dev, "amdgpu_pmu_init failed\n");
4946 
4947 	/* Have stored pci confspace at hand for restore in sudden PCI error */
4948 	if (amdgpu_device_cache_pci_state(adev->pdev))
4949 		pci_restore_state(pdev);
4950 
4951 	/* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
4952 	/* this will fail for cards that aren't VGA class devices, just
4953 	 * ignore it
4954 	 */
4955 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4956 		vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
4957 
4958 	px = amdgpu_device_supports_px(adev);
4959 
4960 	if (px || (!dev_is_removable(&adev->pdev->dev) &&
4961 				apple_gmux_detect(NULL, NULL)))
4962 		vga_switcheroo_register_client(adev->pdev,
4963 					       &amdgpu_switcheroo_ops, px);
4964 
4965 	if (px)
4966 		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
4967 
4968 	amdgpu_device_check_iommu_direct_map(adev);
4969 
4970 	adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
4971 	r = register_pm_notifier(&adev->pm_nb);
4972 	if (r)
4973 		goto failed;
4974 
4975 	return 0;
4976 
4977 release_ras_con:
4978 	if (amdgpu_sriov_vf(adev))
4979 		amdgpu_virt_release_full_gpu(adev, true);
4980 
4981 	/* failed in exclusive mode due to timeout */
4982 	if (amdgpu_sriov_vf(adev) &&
4983 		!amdgpu_sriov_runtime(adev) &&
4984 		amdgpu_virt_mmio_blocked(adev) &&
4985 		!amdgpu_virt_wait_reset(adev)) {
4986 		dev_err(adev->dev, "VF exclusive mode timeout\n");
4987 		/* Don't send request since VF is inactive. */
4988 		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
4989 		adev->virt.ops = NULL;
4990 		r = -EAGAIN;
4991 	}
4992 	amdgpu_release_ras_context(adev);
4993 
4994 failed:
4995 	amdgpu_vf_error_trans_all(adev);
4996 
4997 	return r;
4998 }
4999 
amdgpu_device_unmap_mmio(struct amdgpu_device * adev)5000 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
5001 {
5002 
5003 	/* Clear all CPU mappings pointing to this device */
5004 	unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
5005 
5006 	/* Unmap all mapped bars - Doorbell, registers and VRAM */
5007 	amdgpu_doorbell_fini(adev);
5008 
5009 	iounmap(adev->rmmio);
5010 	adev->rmmio = NULL;
5011 	if (adev->mman.aper_base_kaddr)
5012 		iounmap(adev->mman.aper_base_kaddr);
5013 	adev->mman.aper_base_kaddr = NULL;
5014 
5015 	/* Memory manager related */
5016 	if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
5017 		arch_phys_wc_del(adev->gmc.vram_mtrr);
5018 		arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
5019 	}
5020 }
5021 
5022 /**
5023  * amdgpu_device_fini_hw - tear down the driver
5024  *
5025  * @adev: amdgpu_device pointer
5026  *
5027  * Tear down the driver info (all asics).
5028  * Called at driver shutdown.
5029  */
amdgpu_device_fini_hw(struct amdgpu_device * adev)5030 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
5031 {
5032 	dev_info(adev->dev, "amdgpu: finishing device.\n");
5033 	flush_delayed_work(&adev->delayed_init_work);
5034 
5035 	if (adev->mman.initialized)
5036 		drain_workqueue(adev->mman.bdev.wq);
5037 	adev->shutdown = true;
5038 
5039 	unregister_pm_notifier(&adev->pm_nb);
5040 
5041 	/* make sure IB test finished before entering exclusive mode
5042 	 * to avoid preemption on IB test
5043 	 */
5044 	if (amdgpu_sriov_vf(adev)) {
5045 		amdgpu_virt_request_full_gpu(adev, false);
5046 		amdgpu_virt_fini_data_exchange(adev);
5047 	}
5048 
5049 	/* disable all interrupts */
5050 	amdgpu_irq_disable_all(adev);
5051 	if (adev->mode_info.mode_config_initialized) {
5052 		if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
5053 			drm_helper_force_disable_all(adev_to_drm(adev));
5054 		else
5055 			drm_atomic_helper_shutdown(adev_to_drm(adev));
5056 	}
5057 	amdgpu_fence_driver_hw_fini(adev);
5058 
5059 	amdgpu_device_sys_interface_fini(adev);
5060 
5061 	/* disable ras feature must before hw fini */
5062 	amdgpu_ras_pre_fini(adev);
5063 
5064 	amdgpu_ttm_set_buffer_funcs_status(adev, false);
5065 
5066 	/*
5067 	 * device went through surprise hotplug; we need to destroy topology
5068 	 * before ip_fini_early to prevent kfd locking refcount issues by calling
5069 	 * amdgpu_amdkfd_suspend()
5070 	 */
5071 	if (drm_dev_is_unplugged(adev_to_drm(adev)))
5072 		amdgpu_amdkfd_device_fini_sw(adev);
5073 
5074 	amdgpu_device_ip_fini_early(adev);
5075 
5076 	amdgpu_irq_fini_hw(adev);
5077 
5078 	if (adev->mman.initialized)
5079 		ttm_device_clear_dma_mappings(&adev->mman.bdev);
5080 
5081 	amdgpu_gart_dummy_page_fini(adev);
5082 
5083 	if (drm_dev_is_unplugged(adev_to_drm(adev)))
5084 		amdgpu_device_unmap_mmio(adev);
5085 
5086 }
5087 
amdgpu_device_fini_sw(struct amdgpu_device * adev)5088 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
5089 {
5090 	int i, idx;
5091 	bool px;
5092 
5093 	amdgpu_device_ip_fini(adev);
5094 	amdgpu_fence_driver_sw_fini(adev);
5095 	amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
5096 	adev->accel_working = false;
5097 	dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
5098 	for (i = 0; i < MAX_XCP; ++i) {
5099 		dma_fence_put(adev->isolation[i].spearhead);
5100 		amdgpu_sync_free(&adev->isolation[i].active);
5101 		amdgpu_sync_free(&adev->isolation[i].prev);
5102 	}
5103 
5104 	amdgpu_reset_fini(adev);
5105 
5106 	/* free i2c buses */
5107 	amdgpu_i2c_fini(adev);
5108 
5109 	if (adev->bios) {
5110 		if (amdgpu_emu_mode != 1)
5111 			amdgpu_atombios_fini(adev);
5112 		amdgpu_bios_release(adev);
5113 	}
5114 
5115 	kfree(adev->fru_info);
5116 	adev->fru_info = NULL;
5117 
5118 	kfree(adev->xcp_mgr);
5119 	adev->xcp_mgr = NULL;
5120 
5121 	px = amdgpu_device_supports_px(adev);
5122 
5123 	if (px || (!dev_is_removable(&adev->pdev->dev) &&
5124 				apple_gmux_detect(NULL, NULL)))
5125 		vga_switcheroo_unregister_client(adev->pdev);
5126 
5127 	if (px)
5128 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
5129 
5130 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
5131 		vga_client_unregister(adev->pdev);
5132 
5133 	if (drm_dev_enter(adev_to_drm(adev), &idx)) {
5134 
5135 		iounmap(adev->rmmio);
5136 		adev->rmmio = NULL;
5137 		drm_dev_exit(idx);
5138 	}
5139 
5140 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
5141 		amdgpu_pmu_fini(adev);
5142 	if (adev->discovery.bin)
5143 		amdgpu_discovery_fini(adev);
5144 
5145 	amdgpu_reset_put_reset_domain(adev->reset_domain);
5146 	adev->reset_domain = NULL;
5147 
5148 	kfree(adev->pci_state);
5149 	kfree(adev->pcie_reset_ctx.swds_pcistate);
5150 	kfree(adev->pcie_reset_ctx.swus_pcistate);
5151 }
5152 
5153 /**
5154  * amdgpu_device_evict_resources - evict device resources
5155  * @adev: amdgpu device object
5156  *
5157  * Evicts all ttm device resources(vram BOs, gart table) from the lru list
5158  * of the vram memory type. Mainly used for evicting device resources
5159  * at suspend time.
5160  *
5161  */
amdgpu_device_evict_resources(struct amdgpu_device * adev)5162 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
5163 {
5164 	int ret;
5165 
5166 	/* No need to evict vram on APUs unless going to S4 */
5167 	if (!adev->in_s4 && (adev->flags & AMD_IS_APU))
5168 		return 0;
5169 
5170 	/* No need to evict when going to S5 through S4 callbacks */
5171 	if (system_state == SYSTEM_POWER_OFF)
5172 		return 0;
5173 
5174 	ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
5175 	if (ret) {
5176 		dev_warn(adev->dev, "evicting device resources failed\n");
5177 		return ret;
5178 	}
5179 
5180 	if (adev->in_s4) {
5181 		ret = ttm_device_prepare_hibernation(&adev->mman.bdev);
5182 		if (ret)
5183 			dev_err(adev->dev, "prepare hibernation failed, %d\n", ret);
5184 	}
5185 	return ret;
5186 }
5187 
5188 /*
5189  * Suspend & resume.
5190  */
5191 /**
5192  * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events
5193  * @nb: notifier block
5194  * @mode: suspend mode
5195  * @data: data
5196  *
5197  * This function is called when the system is about to suspend or hibernate.
5198  * It is used to set the appropriate flags so that eviction can be optimized
5199  * in the pm prepare callback.
5200  */
amdgpu_device_pm_notifier(struct notifier_block * nb,unsigned long mode,void * data)5201 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
5202 				     void *data)
5203 {
5204 	struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb);
5205 
5206 	switch (mode) {
5207 	case PM_HIBERNATION_PREPARE:
5208 		adev->in_s4 = true;
5209 		break;
5210 	case PM_POST_HIBERNATION:
5211 		adev->in_s4 = false;
5212 		break;
5213 	}
5214 
5215 	return NOTIFY_DONE;
5216 }
5217 
5218 /**
5219  * amdgpu_device_prepare - prepare for device suspend
5220  *
5221  * @dev: drm dev pointer
5222  *
5223  * Prepare to put the hw in the suspend state (all asics).
5224  * Returns 0 for success or an error on failure.
5225  * Called at driver suspend.
5226  */
amdgpu_device_prepare(struct drm_device * dev)5227 int amdgpu_device_prepare(struct drm_device *dev)
5228 {
5229 	struct amdgpu_device *adev = drm_to_adev(dev);
5230 	int i, r;
5231 
5232 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
5233 		return 0;
5234 
5235 	/* Evict the majority of BOs before starting suspend sequence */
5236 	r = amdgpu_device_evict_resources(adev);
5237 	if (r)
5238 		return r;
5239 
5240 	flush_delayed_work(&adev->gfx.gfx_off_delay_work);
5241 
5242 	for (i = 0; i < adev->num_ip_blocks; i++) {
5243 		if (!adev->ip_blocks[i].status.valid)
5244 			continue;
5245 		if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
5246 			continue;
5247 		r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]);
5248 		if (r)
5249 			return r;
5250 	}
5251 
5252 	return 0;
5253 }
5254 
5255 /**
5256  * amdgpu_device_complete - complete power state transition
5257  *
5258  * @dev: drm dev pointer
5259  *
5260  * Undo the changes from amdgpu_device_prepare. This will be
5261  * called on all resume transitions, including those that failed.
5262  */
amdgpu_device_complete(struct drm_device * dev)5263 void amdgpu_device_complete(struct drm_device *dev)
5264 {
5265 	struct amdgpu_device *adev = drm_to_adev(dev);
5266 	int i;
5267 
5268 	for (i = 0; i < adev->num_ip_blocks; i++) {
5269 		if (!adev->ip_blocks[i].status.valid)
5270 			continue;
5271 		if (!adev->ip_blocks[i].version->funcs->complete)
5272 			continue;
5273 		adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]);
5274 	}
5275 }
5276 
5277 /**
5278  * amdgpu_device_suspend - initiate device suspend
5279  *
5280  * @dev: drm dev pointer
5281  * @notify_clients: notify in-kernel DRM clients
5282  *
5283  * Puts the hw in the suspend state (all asics).
5284  * Returns 0 for success or an error on failure.
5285  * Called at driver suspend.
5286  */
amdgpu_device_suspend(struct drm_device * dev,bool notify_clients)5287 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
5288 {
5289 	struct amdgpu_device *adev = drm_to_adev(dev);
5290 	int r, rec;
5291 
5292 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
5293 		return 0;
5294 
5295 	adev->in_suspend = true;
5296 
5297 	if (amdgpu_sriov_vf(adev)) {
5298 		if (!adev->in_runpm)
5299 			amdgpu_amdkfd_suspend_process(adev);
5300 		amdgpu_virt_fini_data_exchange(adev);
5301 		r = amdgpu_virt_request_full_gpu(adev, false);
5302 		if (r)
5303 			return r;
5304 	}
5305 
5306 	r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3);
5307 	if (r)
5308 		goto unwind_sriov;
5309 
5310 	if (notify_clients)
5311 		drm_client_dev_suspend(adev_to_drm(adev));
5312 
5313 	cancel_delayed_work_sync(&adev->delayed_init_work);
5314 
5315 	amdgpu_ras_suspend(adev);
5316 
5317 	r = amdgpu_device_ip_suspend_phase1(adev);
5318 	if (r)
5319 		goto unwind_smartshift;
5320 
5321 	amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
5322 	r = amdgpu_userq_suspend(adev);
5323 	if (r)
5324 		goto unwind_ip_phase1;
5325 
5326 	r = amdgpu_device_evict_resources(adev);
5327 	if (r)
5328 		goto unwind_userq;
5329 
5330 	amdgpu_ttm_set_buffer_funcs_status(adev, false);
5331 
5332 	amdgpu_fence_driver_hw_fini(adev);
5333 
5334 	r = amdgpu_device_ip_suspend_phase2(adev);
5335 	if (r)
5336 		goto unwind_evict;
5337 
5338 	if (amdgpu_sriov_vf(adev))
5339 		amdgpu_virt_release_full_gpu(adev, false);
5340 
5341 	return 0;
5342 
5343 unwind_evict:
5344 	if (adev->mman.buffer_funcs_ring->sched.ready)
5345 		amdgpu_ttm_set_buffer_funcs_status(adev, true);
5346 	amdgpu_fence_driver_hw_init(adev);
5347 
5348 unwind_userq:
5349 	rec = amdgpu_userq_resume(adev);
5350 	if (rec) {
5351 		dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec);
5352 		return r;
5353 	}
5354 	rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
5355 	if (rec) {
5356 		dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec);
5357 		return r;
5358 	}
5359 
5360 unwind_ip_phase1:
5361 	/* suspend phase 1 = resume phase 3 */
5362 	rec = amdgpu_device_ip_resume_phase3(adev);
5363 	if (rec) {
5364 		dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec);
5365 		return r;
5366 	}
5367 
5368 unwind_smartshift:
5369 	rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0);
5370 	if (rec) {
5371 		dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec);
5372 		return r;
5373 	}
5374 
5375 	if (notify_clients)
5376 		drm_client_dev_resume(adev_to_drm(adev));
5377 
5378 	amdgpu_ras_resume(adev);
5379 
5380 unwind_sriov:
5381 	if (amdgpu_sriov_vf(adev)) {
5382 		rec = amdgpu_virt_request_full_gpu(adev, true);
5383 		if (rec) {
5384 			dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec);
5385 			return r;
5386 		}
5387 	}
5388 
5389 	adev->in_suspend = adev->in_s0ix = adev->in_s3 = false;
5390 
5391 	return r;
5392 }
5393 
amdgpu_virt_resume(struct amdgpu_device * adev)5394 static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
5395 {
5396 	int r;
5397 	unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id;
5398 
5399 	/* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO)
5400 	 * may not work. The access could be blocked by nBIF protection as VF isn't in
5401 	 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX
5402 	 * so that QEMU reprograms MSIX table.
5403 	 */
5404 	amdgpu_restore_msix(adev);
5405 
5406 	r = adev->gfxhub.funcs->get_xgmi_info(adev);
5407 	if (r)
5408 		return r;
5409 
5410 	dev_info(adev->dev, "xgmi node, old id %d, new id %d\n",
5411 		prev_physical_node_id, adev->gmc.xgmi.physical_node_id);
5412 
5413 	adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev);
5414 	adev->vm_manager.vram_base_offset +=
5415 		adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size;
5416 
5417 	return 0;
5418 }
5419 
5420 /**
5421  * amdgpu_device_resume - initiate device resume
5422  *
5423  * @dev: drm dev pointer
5424  * @notify_clients: notify in-kernel DRM clients
5425  *
5426  * Bring the hw back to operating state (all asics).
5427  * Returns 0 for success or an error on failure.
5428  * Called at driver resume.
5429  */
amdgpu_device_resume(struct drm_device * dev,bool notify_clients)5430 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
5431 {
5432 	struct amdgpu_device *adev = drm_to_adev(dev);
5433 	int r = 0;
5434 
5435 	if (amdgpu_sriov_vf(adev)) {
5436 		r = amdgpu_virt_request_full_gpu(adev, true);
5437 		if (r)
5438 			return r;
5439 	}
5440 
5441 	if (amdgpu_virt_xgmi_migrate_enabled(adev)) {
5442 		r = amdgpu_virt_resume(adev);
5443 		if (r)
5444 			goto exit;
5445 	}
5446 
5447 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
5448 		return 0;
5449 
5450 	if (adev->in_s0ix)
5451 		amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
5452 
5453 	/* post card */
5454 	if (amdgpu_device_need_post(adev)) {
5455 		r = amdgpu_device_asic_init(adev);
5456 		if (r)
5457 			dev_err(adev->dev, "amdgpu asic init failed\n");
5458 	}
5459 
5460 	r = amdgpu_device_ip_resume(adev);
5461 
5462 	if (r) {
5463 		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
5464 		goto exit;
5465 	}
5466 
5467 	r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
5468 	if (r)
5469 		goto exit;
5470 
5471 	r = amdgpu_userq_resume(adev);
5472 	if (r)
5473 		goto exit;
5474 
5475 	r = amdgpu_device_ip_late_init(adev);
5476 	if (r)
5477 		goto exit;
5478 
5479 	queue_delayed_work(system_wq, &adev->delayed_init_work,
5480 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
5481 exit:
5482 	if (amdgpu_sriov_vf(adev)) {
5483 		amdgpu_virt_init_data_exchange(adev);
5484 		amdgpu_virt_release_full_gpu(adev, true);
5485 
5486 		if (!r && !adev->in_runpm)
5487 			r = amdgpu_amdkfd_resume_process(adev);
5488 	}
5489 
5490 	if (r)
5491 		return r;
5492 
5493 	/* Make sure IB tests flushed */
5494 	flush_delayed_work(&adev->delayed_init_work);
5495 
5496 	if (notify_clients)
5497 		drm_client_dev_resume(adev_to_drm(adev));
5498 
5499 	amdgpu_ras_resume(adev);
5500 
5501 	if (adev->mode_info.num_crtc) {
5502 		/*
5503 		 * Most of the connector probing functions try to acquire runtime pm
5504 		 * refs to ensure that the GPU is powered on when connector polling is
5505 		 * performed. Since we're calling this from a runtime PM callback,
5506 		 * trying to acquire rpm refs will cause us to deadlock.
5507 		 *
5508 		 * Since we're guaranteed to be holding the rpm lock, it's safe to
5509 		 * temporarily disable the rpm helpers so this doesn't deadlock us.
5510 		 */
5511 #ifdef CONFIG_PM
5512 		dev->dev->power.disable_depth++;
5513 #endif
5514 		if (!adev->dc_enabled)
5515 			drm_helper_hpd_irq_event(dev);
5516 		else
5517 			drm_kms_helper_hotplug_event(dev);
5518 #ifdef CONFIG_PM
5519 		dev->dev->power.disable_depth--;
5520 #endif
5521 	}
5522 
5523 	amdgpu_vram_mgr_clear_reset_blocks(adev);
5524 	adev->in_suspend = false;
5525 
5526 	if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0))
5527 		dev_warn(adev->dev, "smart shift update failed\n");
5528 
5529 	return 0;
5530 }
5531 
5532 /**
5533  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
5534  *
5535  * @adev: amdgpu_device pointer
5536  *
5537  * The list of all the hardware IPs that make up the asic is walked and
5538  * the check_soft_reset callbacks are run.  check_soft_reset determines
5539  * if the asic is still hung or not.
5540  * Returns true if any of the IPs are still in a hung state, false if not.
5541  */
amdgpu_device_ip_check_soft_reset(struct amdgpu_device * adev)5542 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
5543 {
5544 	int i;
5545 	bool asic_hang = false;
5546 
5547 	if (amdgpu_sriov_vf(adev))
5548 		return true;
5549 
5550 	if (amdgpu_asic_need_full_reset(adev))
5551 		return true;
5552 
5553 	for (i = 0; i < adev->num_ip_blocks; i++) {
5554 		if (!adev->ip_blocks[i].status.valid)
5555 			continue;
5556 		if (adev->ip_blocks[i].version->funcs->check_soft_reset)
5557 			adev->ip_blocks[i].status.hang =
5558 				adev->ip_blocks[i].version->funcs->check_soft_reset(
5559 					&adev->ip_blocks[i]);
5560 		if (adev->ip_blocks[i].status.hang) {
5561 			dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
5562 			asic_hang = true;
5563 		}
5564 	}
5565 	return asic_hang;
5566 }
5567 
5568 /**
5569  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
5570  *
5571  * @adev: amdgpu_device pointer
5572  *
5573  * The list of all the hardware IPs that make up the asic is walked and the
5574  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
5575  * handles any IP specific hardware or software state changes that are
5576  * necessary for a soft reset to succeed.
5577  * Returns 0 on success, negative error code on failure.
5578  */
amdgpu_device_ip_pre_soft_reset(struct amdgpu_device * adev)5579 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
5580 {
5581 	int i, r = 0;
5582 
5583 	for (i = 0; i < adev->num_ip_blocks; i++) {
5584 		if (!adev->ip_blocks[i].status.valid)
5585 			continue;
5586 		if (adev->ip_blocks[i].status.hang &&
5587 		    adev->ip_blocks[i].version->funcs->pre_soft_reset) {
5588 			r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]);
5589 			if (r)
5590 				return r;
5591 		}
5592 	}
5593 
5594 	return 0;
5595 }
5596 
5597 /**
5598  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
5599  *
5600  * @adev: amdgpu_device pointer
5601  *
5602  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
5603  * reset is necessary to recover.
5604  * Returns true if a full asic reset is required, false if not.
5605  */
amdgpu_device_ip_need_full_reset(struct amdgpu_device * adev)5606 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
5607 {
5608 	int i;
5609 
5610 	if (amdgpu_asic_need_full_reset(adev))
5611 		return true;
5612 
5613 	for (i = 0; i < adev->num_ip_blocks; i++) {
5614 		if (!adev->ip_blocks[i].status.valid)
5615 			continue;
5616 		if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
5617 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
5618 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
5619 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
5620 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
5621 			if (adev->ip_blocks[i].status.hang) {
5622 				dev_info(adev->dev, "Some block need full reset!\n");
5623 				return true;
5624 			}
5625 		}
5626 	}
5627 	return false;
5628 }
5629 
5630 /**
5631  * amdgpu_device_ip_soft_reset - do a soft reset
5632  *
5633  * @adev: amdgpu_device pointer
5634  *
5635  * The list of all the hardware IPs that make up the asic is walked and the
5636  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
5637  * IP specific hardware or software state changes that are necessary to soft
5638  * reset the IP.
5639  * Returns 0 on success, negative error code on failure.
5640  */
amdgpu_device_ip_soft_reset(struct amdgpu_device * adev)5641 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
5642 {
5643 	int i, r = 0;
5644 
5645 	for (i = 0; i < adev->num_ip_blocks; i++) {
5646 		if (!adev->ip_blocks[i].status.valid)
5647 			continue;
5648 		if (adev->ip_blocks[i].status.hang &&
5649 		    adev->ip_blocks[i].version->funcs->soft_reset) {
5650 			r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]);
5651 			if (r)
5652 				return r;
5653 		}
5654 	}
5655 
5656 	return 0;
5657 }
5658 
5659 /**
5660  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
5661  *
5662  * @adev: amdgpu_device pointer
5663  *
5664  * The list of all the hardware IPs that make up the asic is walked and the
5665  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
5666  * handles any IP specific hardware or software state changes that are
5667  * necessary after the IP has been soft reset.
5668  * Returns 0 on success, negative error code on failure.
5669  */
amdgpu_device_ip_post_soft_reset(struct amdgpu_device * adev)5670 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
5671 {
5672 	int i, r = 0;
5673 
5674 	for (i = 0; i < adev->num_ip_blocks; i++) {
5675 		if (!adev->ip_blocks[i].status.valid)
5676 			continue;
5677 		if (adev->ip_blocks[i].status.hang &&
5678 		    adev->ip_blocks[i].version->funcs->post_soft_reset)
5679 			r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]);
5680 		if (r)
5681 			return r;
5682 	}
5683 
5684 	return 0;
5685 }
5686 
5687 /**
5688  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5689  *
5690  * @adev: amdgpu_device pointer
5691  * @reset_context: amdgpu reset context pointer
5692  *
5693  * do VF FLR and reinitialize Asic
5694  * return 0 means succeeded otherwise failed
5695  */
amdgpu_device_reset_sriov(struct amdgpu_device * adev,struct amdgpu_reset_context * reset_context)5696 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
5697 				     struct amdgpu_reset_context *reset_context)
5698 {
5699 	int r;
5700 	struct amdgpu_hive_info *hive = NULL;
5701 
5702 	if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
5703 		if (!amdgpu_ras_get_fed_status(adev))
5704 			amdgpu_virt_ready_to_reset(adev);
5705 		amdgpu_virt_wait_reset(adev);
5706 		clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5707 		r = amdgpu_virt_request_full_gpu(adev, true);
5708 	} else {
5709 		r = amdgpu_virt_reset_gpu(adev);
5710 	}
5711 	if (r)
5712 		return r;
5713 
5714 	amdgpu_ras_clear_err_state(adev);
5715 	amdgpu_irq_gpu_reset_resume_helper(adev);
5716 
5717 	/* some sw clean up VF needs to do before recover */
5718 	amdgpu_virt_post_reset(adev);
5719 
5720 	/* Resume IP prior to SMC */
5721 	r = amdgpu_device_ip_reinit_early_sriov(adev);
5722 	if (r)
5723 		return r;
5724 
5725 	amdgpu_virt_init_data_exchange(adev);
5726 
5727 	r = amdgpu_device_fw_loading(adev);
5728 	if (r)
5729 		return r;
5730 
5731 	/* now we are okay to resume SMC/CP/SDMA */
5732 	r = amdgpu_device_ip_reinit_late_sriov(adev);
5733 	if (r)
5734 		return r;
5735 
5736 	hive = amdgpu_get_xgmi_hive(adev);
5737 	/* Update PSP FW topology after reset */
5738 	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
5739 		r = amdgpu_xgmi_update_topology(hive, adev);
5740 	if (hive)
5741 		amdgpu_put_xgmi_hive(hive);
5742 	if (r)
5743 		return r;
5744 
5745 	r = amdgpu_ib_ring_tests(adev);
5746 	if (r)
5747 		return r;
5748 
5749 	if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
5750 		amdgpu_inc_vram_lost(adev);
5751 
5752 	/* need to be called during full access so we can't do it later like
5753 	 * bare-metal does.
5754 	 */
5755 	amdgpu_amdkfd_post_reset(adev);
5756 	amdgpu_virt_release_full_gpu(adev, true);
5757 
5758 	/* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
5759 	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
5760 	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
5761 	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
5762 	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
5763 	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
5764 		amdgpu_ras_resume(adev);
5765 
5766 	amdgpu_virt_ras_telemetry_post_reset(adev);
5767 
5768 	return 0;
5769 }
5770 
5771 /**
5772  * amdgpu_device_has_job_running - check if there is any unfinished job
5773  *
5774  * @adev: amdgpu_device pointer
5775  *
5776  * check if there is any job running on the device when guest driver receives
5777  * FLR notification from host driver. If there are still jobs running, then
5778  * the guest driver will not respond the FLR reset. Instead, let the job hit
5779  * the timeout and guest driver then issue the reset request.
5780  */
amdgpu_device_has_job_running(struct amdgpu_device * adev)5781 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
5782 {
5783 	int i;
5784 
5785 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5786 		struct amdgpu_ring *ring = adev->rings[i];
5787 
5788 		if (!amdgpu_ring_sched_ready(ring))
5789 			continue;
5790 
5791 		if (amdgpu_fence_count_emitted(ring))
5792 			return true;
5793 	}
5794 	return false;
5795 }
5796 
5797 /**
5798  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
5799  *
5800  * @adev: amdgpu_device pointer
5801  *
5802  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
5803  * a hung GPU.
5804  */
amdgpu_device_should_recover_gpu(struct amdgpu_device * adev)5805 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
5806 {
5807 
5808 	if (amdgpu_gpu_recovery == 0)
5809 		goto disabled;
5810 
5811 	/* Skip soft reset check in fatal error mode */
5812 	if (!amdgpu_ras_is_poison_mode_supported(adev))
5813 		return true;
5814 
5815 	if (amdgpu_sriov_vf(adev))
5816 		return true;
5817 
5818 	if (amdgpu_gpu_recovery == -1) {
5819 		switch (adev->asic_type) {
5820 #ifdef CONFIG_DRM_AMDGPU_SI
5821 		case CHIP_VERDE:
5822 		case CHIP_TAHITI:
5823 		case CHIP_PITCAIRN:
5824 		case CHIP_OLAND:
5825 		case CHIP_HAINAN:
5826 #endif
5827 #ifdef CONFIG_DRM_AMDGPU_CIK
5828 		case CHIP_KAVERI:
5829 		case CHIP_KABINI:
5830 		case CHIP_MULLINS:
5831 #endif
5832 		case CHIP_CARRIZO:
5833 		case CHIP_STONEY:
5834 		case CHIP_CYAN_SKILLFISH:
5835 			goto disabled;
5836 		default:
5837 			break;
5838 		}
5839 	}
5840 
5841 	return true;
5842 
5843 disabled:
5844 		dev_info(adev->dev, "GPU recovery disabled.\n");
5845 		return false;
5846 }
5847 
amdgpu_device_mode1_reset(struct amdgpu_device * adev)5848 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
5849 {
5850 	u32 i;
5851 	int ret = 0;
5852 
5853 	if (adev->bios)
5854 		amdgpu_atombios_scratch_regs_engine_hung(adev, true);
5855 
5856 	dev_info(adev->dev, "GPU mode1 reset\n");
5857 
5858 	/* Cache the state before bus master disable. The saved config space
5859 	 * values are used in other cases like restore after mode-2 reset.
5860 	 */
5861 	amdgpu_device_cache_pci_state(adev->pdev);
5862 
5863 	/* disable BM */
5864 	pci_clear_master(adev->pdev);
5865 
5866 	if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
5867 		dev_info(adev->dev, "GPU smu mode1 reset\n");
5868 		ret = amdgpu_dpm_mode1_reset(adev);
5869 	} else {
5870 		dev_info(adev->dev, "GPU psp mode1 reset\n");
5871 		ret = psp_gpu_reset(adev);
5872 	}
5873 
5874 	if (ret)
5875 		goto mode1_reset_failed;
5876 
5877 	/* enable mmio access after mode 1 reset completed */
5878 	adev->no_hw_access = false;
5879 
5880 	amdgpu_device_load_pci_state(adev->pdev);
5881 	ret = amdgpu_psp_wait_for_bootloader(adev);
5882 	if (ret)
5883 		goto mode1_reset_failed;
5884 
5885 	/* wait for asic to come out of reset */
5886 	for (i = 0; i < adev->usec_timeout; i++) {
5887 		u32 memsize = adev->nbio.funcs->get_memsize(adev);
5888 
5889 		if (memsize != 0xffffffff)
5890 			break;
5891 		udelay(1);
5892 	}
5893 
5894 	if (i >= adev->usec_timeout) {
5895 		ret = -ETIMEDOUT;
5896 		goto mode1_reset_failed;
5897 	}
5898 
5899 	if (adev->bios)
5900 		amdgpu_atombios_scratch_regs_engine_hung(adev, false);
5901 
5902 	return 0;
5903 
5904 mode1_reset_failed:
5905 	dev_err(adev->dev, "GPU mode1 reset failed\n");
5906 	return ret;
5907 }
5908 
amdgpu_device_link_reset(struct amdgpu_device * adev)5909 int amdgpu_device_link_reset(struct amdgpu_device *adev)
5910 {
5911 	int ret = 0;
5912 
5913 	dev_info(adev->dev, "GPU link reset\n");
5914 
5915 	if (!amdgpu_reset_in_dpc(adev))
5916 		ret = amdgpu_dpm_link_reset(adev);
5917 
5918 	if (ret)
5919 		goto link_reset_failed;
5920 
5921 	ret = amdgpu_psp_wait_for_bootloader(adev);
5922 	if (ret)
5923 		goto link_reset_failed;
5924 
5925 	return 0;
5926 
5927 link_reset_failed:
5928 	dev_err(adev->dev, "GPU link reset failed\n");
5929 	return ret;
5930 }
5931 
amdgpu_device_pre_asic_reset(struct amdgpu_device * adev,struct amdgpu_reset_context * reset_context)5932 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
5933 				 struct amdgpu_reset_context *reset_context)
5934 {
5935 	int i, r = 0;
5936 	struct amdgpu_job *job = NULL;
5937 	struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
5938 	bool need_full_reset =
5939 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5940 
5941 	if (reset_context->reset_req_dev == adev)
5942 		job = reset_context->job;
5943 
5944 	if (amdgpu_sriov_vf(adev))
5945 		amdgpu_virt_pre_reset(adev);
5946 
5947 	amdgpu_fence_driver_isr_toggle(adev, true);
5948 
5949 	/* block all schedulers and reset given job's ring */
5950 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5951 		struct amdgpu_ring *ring = adev->rings[i];
5952 
5953 		if (!amdgpu_ring_sched_ready(ring))
5954 			continue;
5955 
5956 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
5957 		amdgpu_fence_driver_force_completion(ring);
5958 	}
5959 
5960 	amdgpu_fence_driver_isr_toggle(adev, false);
5961 
5962 	if (job && job->vm)
5963 		drm_sched_increase_karma(&job->base);
5964 
5965 	r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
5966 	/* If reset handler not implemented, continue; otherwise return */
5967 	if (r == -EOPNOTSUPP)
5968 		r = 0;
5969 	else
5970 		return r;
5971 
5972 	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
5973 	if (!amdgpu_sriov_vf(adev)) {
5974 
5975 		if (!need_full_reset)
5976 			need_full_reset = amdgpu_device_ip_need_full_reset(adev);
5977 
5978 		if (!need_full_reset && amdgpu_gpu_recovery &&
5979 		    amdgpu_device_ip_check_soft_reset(adev)) {
5980 			amdgpu_device_ip_pre_soft_reset(adev);
5981 			r = amdgpu_device_ip_soft_reset(adev);
5982 			amdgpu_device_ip_post_soft_reset(adev);
5983 			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
5984 				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
5985 				need_full_reset = true;
5986 			}
5987 		}
5988 
5989 		if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
5990 			dev_info(tmp_adev->dev, "Dumping IP State\n");
5991 			/* Trigger ip dump before we reset the asic */
5992 			for (i = 0; i < tmp_adev->num_ip_blocks; i++)
5993 				if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
5994 					tmp_adev->ip_blocks[i].version->funcs
5995 						->dump_ip_state((void *)&tmp_adev->ip_blocks[i]);
5996 			dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
5997 		}
5998 
5999 		if (need_full_reset)
6000 			r = amdgpu_device_ip_suspend(adev);
6001 		if (need_full_reset)
6002 			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
6003 		else
6004 			clear_bit(AMDGPU_NEED_FULL_RESET,
6005 				  &reset_context->flags);
6006 	}
6007 
6008 	return r;
6009 }
6010 
amdgpu_device_reinit_after_reset(struct amdgpu_reset_context * reset_context)6011 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
6012 {
6013 	struct list_head *device_list_handle;
6014 	bool full_reset, vram_lost = false;
6015 	struct amdgpu_device *tmp_adev;
6016 	int r, init_level;
6017 
6018 	device_list_handle = reset_context->reset_device_list;
6019 
6020 	if (!device_list_handle)
6021 		return -EINVAL;
6022 
6023 	full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
6024 
6025 	/**
6026 	 * If it's reset on init, it's default init level, otherwise keep level
6027 	 * as recovery level.
6028 	 */
6029 	if (reset_context->method == AMD_RESET_METHOD_ON_INIT)
6030 			init_level = AMDGPU_INIT_LEVEL_DEFAULT;
6031 	else
6032 			init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY;
6033 
6034 	r = 0;
6035 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6036 		amdgpu_set_init_level(tmp_adev, init_level);
6037 		if (full_reset) {
6038 			/* post card */
6039 			amdgpu_reset_set_dpc_status(tmp_adev, false);
6040 			amdgpu_ras_clear_err_state(tmp_adev);
6041 			r = amdgpu_device_asic_init(tmp_adev);
6042 			if (r) {
6043 				dev_warn(tmp_adev->dev, "asic atom init failed!");
6044 			} else {
6045 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
6046 
6047 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
6048 				if (r)
6049 					goto out;
6050 
6051 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
6052 
6053 				if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
6054 					amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);
6055 
6056 				if (vram_lost) {
6057 					dev_info(
6058 						tmp_adev->dev,
6059 						"VRAM is lost due to GPU reset!\n");
6060 					amdgpu_inc_vram_lost(tmp_adev);
6061 				}
6062 
6063 				r = amdgpu_device_fw_loading(tmp_adev);
6064 				if (r)
6065 					return r;
6066 
6067 				r = amdgpu_xcp_restore_partition_mode(
6068 					tmp_adev->xcp_mgr);
6069 				if (r)
6070 					goto out;
6071 
6072 				r = amdgpu_device_ip_resume_phase2(tmp_adev);
6073 				if (r)
6074 					goto out;
6075 
6076 				if (tmp_adev->mman.buffer_funcs_ring->sched.ready)
6077 					amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);
6078 
6079 				r = amdgpu_device_ip_resume_phase3(tmp_adev);
6080 				if (r)
6081 					goto out;
6082 
6083 				if (vram_lost)
6084 					amdgpu_device_fill_reset_magic(tmp_adev);
6085 
6086 				/*
6087 				 * Add this ASIC as tracked as reset was already
6088 				 * complete successfully.
6089 				 */
6090 				amdgpu_register_gpu_instance(tmp_adev);
6091 
6092 				if (!reset_context->hive &&
6093 				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
6094 					amdgpu_xgmi_add_device(tmp_adev);
6095 
6096 				r = amdgpu_device_ip_late_init(tmp_adev);
6097 				if (r)
6098 					goto out;
6099 
6100 				r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
6101 				if (r)
6102 					goto out;
6103 
6104 				drm_client_dev_resume(adev_to_drm(tmp_adev));
6105 
6106 				/*
6107 				 * The GPU enters bad state once faulty pages
6108 				 * by ECC has reached the threshold, and ras
6109 				 * recovery is scheduled next. So add one check
6110 				 * here to break recovery if it indeed exceeds
6111 				 * bad page threshold, and remind user to
6112 				 * retire this GPU or setting one bigger
6113 				 * bad_page_threshold value to fix this once
6114 				 * probing driver again.
6115 				 */
6116 				if (!amdgpu_ras_is_rma(tmp_adev)) {
6117 					/* must succeed. */
6118 					amdgpu_ras_resume(tmp_adev);
6119 				} else {
6120 					r = -EINVAL;
6121 					goto out;
6122 				}
6123 
6124 				/* Update PSP FW topology after reset */
6125 				if (reset_context->hive &&
6126 				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
6127 					r = amdgpu_xgmi_update_topology(
6128 						reset_context->hive, tmp_adev);
6129 			}
6130 		}
6131 
6132 out:
6133 		if (!r) {
6134 			/* IP init is complete now, set level as default */
6135 			amdgpu_set_init_level(tmp_adev,
6136 					      AMDGPU_INIT_LEVEL_DEFAULT);
6137 			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
6138 			r = amdgpu_ib_ring_tests(tmp_adev);
6139 			if (r) {
6140 				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
6141 				r = -EAGAIN;
6142 				goto end;
6143 			}
6144 		}
6145 
6146 		if (r)
6147 			tmp_adev->asic_reset_res = r;
6148 	}
6149 
6150 end:
6151 	return r;
6152 }
6153 
amdgpu_do_asic_reset(struct list_head * device_list_handle,struct amdgpu_reset_context * reset_context)6154 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
6155 			 struct amdgpu_reset_context *reset_context)
6156 {
6157 	struct amdgpu_device *tmp_adev = NULL;
6158 	bool need_full_reset, skip_hw_reset;
6159 	int r = 0;
6160 
6161 	/* Try reset handler method first */
6162 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
6163 				    reset_list);
6164 
6165 	reset_context->reset_device_list = device_list_handle;
6166 	r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
6167 	/* If reset handler not implemented, continue; otherwise return */
6168 	if (r == -EOPNOTSUPP)
6169 		r = 0;
6170 	else
6171 		return r;
6172 
6173 	/* Reset handler not implemented, use the default method */
6174 	need_full_reset =
6175 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
6176 	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
6177 
6178 	/*
6179 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
6180 	 * to allow proper links negotiation in FW (within 1 sec)
6181 	 */
6182 	if (!skip_hw_reset && need_full_reset) {
6183 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6184 			/* For XGMI run all resets in parallel to speed up the process */
6185 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
6186 				if (!queue_work(system_unbound_wq,
6187 						&tmp_adev->xgmi_reset_work))
6188 					r = -EALREADY;
6189 			} else
6190 				r = amdgpu_asic_reset(tmp_adev);
6191 
6192 			if (r) {
6193 				dev_err(tmp_adev->dev,
6194 					"ASIC reset failed with error, %d for drm dev, %s",
6195 					r, adev_to_drm(tmp_adev)->unique);
6196 				goto out;
6197 			}
6198 		}
6199 
6200 		/* For XGMI wait for all resets to complete before proceed */
6201 		if (!r) {
6202 			list_for_each_entry(tmp_adev, device_list_handle,
6203 					    reset_list) {
6204 				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
6205 					flush_work(&tmp_adev->xgmi_reset_work);
6206 					r = tmp_adev->asic_reset_res;
6207 					if (r)
6208 						break;
6209 				}
6210 			}
6211 		}
6212 	}
6213 
6214 	if (!r && amdgpu_ras_intr_triggered()) {
6215 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6216 			amdgpu_ras_reset_error_count(tmp_adev,
6217 						     AMDGPU_RAS_BLOCK__MMHUB);
6218 		}
6219 
6220 		amdgpu_ras_intr_cleared();
6221 	}
6222 
6223 	r = amdgpu_device_reinit_after_reset(reset_context);
6224 	if (r == -EAGAIN)
6225 		set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
6226 	else
6227 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
6228 
6229 out:
6230 	return r;
6231 }
6232 
amdgpu_device_set_mp1_state(struct amdgpu_device * adev)6233 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
6234 {
6235 
6236 	switch (amdgpu_asic_reset_method(adev)) {
6237 	case AMD_RESET_METHOD_MODE1:
6238 	case AMD_RESET_METHOD_LINK:
6239 		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
6240 		break;
6241 	case AMD_RESET_METHOD_MODE2:
6242 		adev->mp1_state = PP_MP1_STATE_RESET;
6243 		break;
6244 	default:
6245 		adev->mp1_state = PP_MP1_STATE_NONE;
6246 		break;
6247 	}
6248 }
6249 
amdgpu_device_unset_mp1_state(struct amdgpu_device * adev)6250 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
6251 {
6252 	amdgpu_vf_error_trans_all(adev);
6253 	adev->mp1_state = PP_MP1_STATE_NONE;
6254 }
6255 
amdgpu_device_resume_display_audio(struct amdgpu_device * adev)6256 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
6257 {
6258 	struct pci_dev *p = NULL;
6259 
6260 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
6261 			adev->pdev->bus->number, 1);
6262 	if (p) {
6263 		pm_runtime_enable(&(p->dev));
6264 		pm_runtime_resume(&(p->dev));
6265 	}
6266 
6267 	pci_dev_put(p);
6268 }
6269 
amdgpu_device_suspend_display_audio(struct amdgpu_device * adev)6270 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
6271 {
6272 	enum amd_reset_method reset_method;
6273 	struct pci_dev *p = NULL;
6274 	u64 expires;
6275 
6276 	/*
6277 	 * For now, only BACO and mode1 reset are confirmed
6278 	 * to suffer the audio issue without proper suspended.
6279 	 */
6280 	reset_method = amdgpu_asic_reset_method(adev);
6281 	if ((reset_method != AMD_RESET_METHOD_BACO) &&
6282 	     (reset_method != AMD_RESET_METHOD_MODE1))
6283 		return -EINVAL;
6284 
6285 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
6286 			adev->pdev->bus->number, 1);
6287 	if (!p)
6288 		return -ENODEV;
6289 
6290 	expires = pm_runtime_autosuspend_expiration(&(p->dev));
6291 	if (!expires)
6292 		/*
6293 		 * If we cannot get the audio device autosuspend delay,
6294 		 * a fixed 4S interval will be used. Considering 3S is
6295 		 * the audio controller default autosuspend delay setting.
6296 		 * 4S used here is guaranteed to cover that.
6297 		 */
6298 		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
6299 
6300 	while (!pm_runtime_status_suspended(&(p->dev))) {
6301 		if (!pm_runtime_suspend(&(p->dev)))
6302 			break;
6303 
6304 		if (expires < ktime_get_mono_fast_ns()) {
6305 			dev_warn(adev->dev, "failed to suspend display audio\n");
6306 			pci_dev_put(p);
6307 			/* TODO: abort the succeeding gpu reset? */
6308 			return -ETIMEDOUT;
6309 		}
6310 	}
6311 
6312 	pm_runtime_disable(&(p->dev));
6313 
6314 	pci_dev_put(p);
6315 	return 0;
6316 }
6317 
amdgpu_device_stop_pending_resets(struct amdgpu_device * adev)6318 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
6319 {
6320 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
6321 
6322 #if defined(CONFIG_DEBUG_FS)
6323 	if (!amdgpu_sriov_vf(adev))
6324 		cancel_work(&adev->reset_work);
6325 #endif
6326 	cancel_work(&adev->userq_reset_work);
6327 
6328 	if (adev->kfd.dev)
6329 		cancel_work(&adev->kfd.reset_work);
6330 
6331 	if (amdgpu_sriov_vf(adev))
6332 		cancel_work(&adev->virt.flr_work);
6333 
6334 	if (con && adev->ras_enabled)
6335 		cancel_work(&con->recovery_work);
6336 
6337 }
6338 
amdgpu_device_health_check(struct list_head * device_list_handle)6339 static int amdgpu_device_health_check(struct list_head *device_list_handle)
6340 {
6341 	struct amdgpu_device *tmp_adev;
6342 	int ret = 0;
6343 
6344 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6345 		ret |= amdgpu_device_bus_status_check(tmp_adev);
6346 	}
6347 
6348 	return ret;
6349 }
6350 
amdgpu_device_recovery_prepare(struct amdgpu_device * adev,struct list_head * device_list,struct amdgpu_hive_info * hive)6351 static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
6352 					  struct list_head *device_list,
6353 					  struct amdgpu_hive_info *hive)
6354 {
6355 	struct amdgpu_device *tmp_adev = NULL;
6356 
6357 	/*
6358 	 * Build list of devices to reset.
6359 	 * In case we are in XGMI hive mode, resort the device list
6360 	 * to put adev in the 1st position.
6361 	 */
6362 	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
6363 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
6364 			list_add_tail(&tmp_adev->reset_list, device_list);
6365 			if (adev->shutdown)
6366 				tmp_adev->shutdown = true;
6367 			if (amdgpu_reset_in_dpc(adev))
6368 				tmp_adev->pcie_reset_ctx.in_link_reset = true;
6369 		}
6370 		if (!list_is_first(&adev->reset_list, device_list))
6371 			list_rotate_to_front(&adev->reset_list, device_list);
6372 	} else {
6373 		list_add_tail(&adev->reset_list, device_list);
6374 	}
6375 }
6376 
amdgpu_device_recovery_get_reset_lock(struct amdgpu_device * adev,struct list_head * device_list)6377 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev,
6378 						  struct list_head *device_list)
6379 {
6380 	struct amdgpu_device *tmp_adev = NULL;
6381 
6382 	if (list_empty(device_list))
6383 		return;
6384 	tmp_adev =
6385 		list_first_entry(device_list, struct amdgpu_device, reset_list);
6386 	amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
6387 }
6388 
amdgpu_device_recovery_put_reset_lock(struct amdgpu_device * adev,struct list_head * device_list)6389 static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev,
6390 						  struct list_head *device_list)
6391 {
6392 	struct amdgpu_device *tmp_adev = NULL;
6393 
6394 	if (list_empty(device_list))
6395 		return;
6396 	tmp_adev =
6397 		list_first_entry(device_list, struct amdgpu_device, reset_list);
6398 	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
6399 }
6400 
amdgpu_device_halt_activities(struct amdgpu_device * adev,struct amdgpu_job * job,struct amdgpu_reset_context * reset_context,struct list_head * device_list,struct amdgpu_hive_info * hive,bool need_emergency_restart)6401 static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
6402 					  struct amdgpu_job *job,
6403 					  struct amdgpu_reset_context *reset_context,
6404 					  struct list_head *device_list,
6405 					  struct amdgpu_hive_info *hive,
6406 					  bool need_emergency_restart)
6407 {
6408 	struct amdgpu_device *tmp_adev = NULL;
6409 	int i;
6410 
6411 	/* block all schedulers and reset given job's ring */
6412 	list_for_each_entry(tmp_adev, device_list, reset_list) {
6413 		amdgpu_device_set_mp1_state(tmp_adev);
6414 
6415 		/*
6416 		 * Try to put the audio codec into suspend state
6417 		 * before gpu reset started.
6418 		 *
6419 		 * Due to the power domain of the graphics device
6420 		 * is shared with AZ power domain. Without this,
6421 		 * we may change the audio hardware from behind
6422 		 * the audio driver's back. That will trigger
6423 		 * some audio codec errors.
6424 		 */
6425 		if (!amdgpu_device_suspend_display_audio(tmp_adev))
6426 			tmp_adev->pcie_reset_ctx.audio_suspended = true;
6427 
6428 		amdgpu_ras_set_error_query_ready(tmp_adev, false);
6429 
6430 		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
6431 
6432 		amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
6433 
6434 		/*
6435 		 * Mark these ASICs to be reset as untracked first
6436 		 * And add them back after reset completed
6437 		 */
6438 		amdgpu_unregister_gpu_instance(tmp_adev);
6439 
6440 		drm_client_dev_suspend(adev_to_drm(tmp_adev));
6441 
6442 		/* disable ras on ALL IPs */
6443 		if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) &&
6444 		    amdgpu_device_ip_need_full_reset(tmp_adev))
6445 			amdgpu_ras_suspend(tmp_adev);
6446 
6447 		amdgpu_userq_pre_reset(tmp_adev);
6448 
6449 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6450 			struct amdgpu_ring *ring = tmp_adev->rings[i];
6451 
6452 			if (!amdgpu_ring_sched_ready(ring))
6453 				continue;
6454 
6455 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
6456 
6457 			if (need_emergency_restart)
6458 				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
6459 		}
6460 		atomic_inc(&tmp_adev->gpu_reset_counter);
6461 	}
6462 }
6463 
amdgpu_device_asic_reset(struct amdgpu_device * adev,struct list_head * device_list,struct amdgpu_reset_context * reset_context)6464 static int amdgpu_device_asic_reset(struct amdgpu_device *adev,
6465 			      struct list_head *device_list,
6466 			      struct amdgpu_reset_context *reset_context)
6467 {
6468 	struct amdgpu_device *tmp_adev = NULL;
6469 	int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
6470 	int r = 0;
6471 
6472 retry:	/* Rest of adevs pre asic reset from XGMI hive. */
6473 	list_for_each_entry(tmp_adev, device_list, reset_list) {
6474 		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
6475 		/*TODO Should we stop ?*/
6476 		if (r) {
6477 			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
6478 				  r, adev_to_drm(tmp_adev)->unique);
6479 			tmp_adev->asic_reset_res = r;
6480 		}
6481 	}
6482 
6483 	/* Actual ASIC resets if needed.*/
6484 	/* Host driver will handle XGMI hive reset for SRIOV */
6485 	if (amdgpu_sriov_vf(adev)) {
6486 
6487 		/* Bail out of reset early */
6488 		if (amdgpu_ras_is_rma(adev))
6489 			return -ENODEV;
6490 
6491 		if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
6492 			dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
6493 			amdgpu_ras_set_fed(adev, true);
6494 			set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
6495 		}
6496 
6497 		r = amdgpu_device_reset_sriov(adev, reset_context);
6498 		if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
6499 			amdgpu_virt_release_full_gpu(adev, true);
6500 			goto retry;
6501 		}
6502 		if (r)
6503 			adev->asic_reset_res = r;
6504 	} else {
6505 		r = amdgpu_do_asic_reset(device_list, reset_context);
6506 		if (r && r == -EAGAIN)
6507 			goto retry;
6508 	}
6509 
6510 	list_for_each_entry(tmp_adev, device_list, reset_list) {
6511 		/*
6512 		 * Drop any pending non scheduler resets queued before reset is done.
6513 		 * Any reset scheduled after this point would be valid. Scheduler resets
6514 		 * were already dropped during drm_sched_stop and no new ones can come
6515 		 * in before drm_sched_start.
6516 		 */
6517 		amdgpu_device_stop_pending_resets(tmp_adev);
6518 	}
6519 
6520 	return r;
6521 }
6522 
amdgpu_device_sched_resume(struct list_head * device_list,struct amdgpu_reset_context * reset_context,bool job_signaled)6523 static int amdgpu_device_sched_resume(struct list_head *device_list,
6524 			      struct amdgpu_reset_context *reset_context,
6525 			      bool   job_signaled)
6526 {
6527 	struct amdgpu_device *tmp_adev = NULL;
6528 	int i, r = 0;
6529 
6530 	/* Post ASIC reset for all devs .*/
6531 	list_for_each_entry(tmp_adev, device_list, reset_list) {
6532 
6533 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6534 			struct amdgpu_ring *ring = tmp_adev->rings[i];
6535 
6536 			if (!amdgpu_ring_sched_ready(ring))
6537 				continue;
6538 
6539 			drm_sched_start(&ring->sched, 0);
6540 		}
6541 
6542 		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
6543 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
6544 
6545 		if (tmp_adev->asic_reset_res) {
6546 			/* bad news, how to tell it to userspace ?
6547 			 * for ras error, we should report GPU bad status instead of
6548 			 * reset failure
6549 			 */
6550 			if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
6551 			    !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
6552 				dev_info(
6553 					tmp_adev->dev,
6554 					"GPU reset(%d) failed with error %d \n",
6555 					atomic_read(
6556 						&tmp_adev->gpu_reset_counter),
6557 					tmp_adev->asic_reset_res);
6558 			amdgpu_vf_error_put(tmp_adev,
6559 					    AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0,
6560 					    tmp_adev->asic_reset_res);
6561 			if (!r)
6562 				r = tmp_adev->asic_reset_res;
6563 			tmp_adev->asic_reset_res = 0;
6564 		} else {
6565 			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n",
6566 				 atomic_read(&tmp_adev->gpu_reset_counter));
6567 			if (amdgpu_acpi_smart_shift_update(tmp_adev,
6568 							   AMDGPU_SS_DEV_D0))
6569 				dev_warn(tmp_adev->dev,
6570 					 "smart shift update failed\n");
6571 		}
6572 	}
6573 
6574 	return r;
6575 }
6576 
amdgpu_device_gpu_resume(struct amdgpu_device * adev,struct list_head * device_list,bool need_emergency_restart)6577 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev,
6578 			      struct list_head *device_list,
6579 			      bool   need_emergency_restart)
6580 {
6581 	struct amdgpu_device *tmp_adev = NULL;
6582 
6583 	list_for_each_entry(tmp_adev, device_list, reset_list) {
6584 		/* unlock kfd: SRIOV would do it separately */
6585 		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
6586 			amdgpu_amdkfd_post_reset(tmp_adev);
6587 
6588 		/* kfd_post_reset will do nothing if kfd device is not initialized,
6589 		 * need to bring up kfd here if it's not be initialized before
6590 		 */
6591 		if (!adev->kfd.init_complete)
6592 			amdgpu_amdkfd_device_init(adev);
6593 
6594 		if (tmp_adev->pcie_reset_ctx.audio_suspended)
6595 			amdgpu_device_resume_display_audio(tmp_adev);
6596 
6597 		amdgpu_device_unset_mp1_state(tmp_adev);
6598 
6599 		amdgpu_ras_set_error_query_ready(tmp_adev, true);
6600 
6601 	}
6602 }
6603 
6604 
6605 /**
6606  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
6607  *
6608  * @adev: amdgpu_device pointer
6609  * @job: which job trigger hang
6610  * @reset_context: amdgpu reset context pointer
6611  *
6612  * Attempt to reset the GPU if it has hung (all asics).
6613  * Attempt to do soft-reset or full-reset and reinitialize Asic
6614  * Returns 0 for success or an error on failure.
6615  */
6616 
amdgpu_device_gpu_recover(struct amdgpu_device * adev,struct amdgpu_job * job,struct amdgpu_reset_context * reset_context)6617 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
6618 			      struct amdgpu_job *job,
6619 			      struct amdgpu_reset_context *reset_context)
6620 {
6621 	struct list_head device_list;
6622 	bool job_signaled = false;
6623 	struct amdgpu_hive_info *hive = NULL;
6624 	int r = 0;
6625 	bool need_emergency_restart = false;
6626 	/* save the pasid here as the job may be freed before the end of the reset */
6627 	int pasid = job ? job->pasid : -EINVAL;
6628 
6629 	/*
6630 	 * If it reaches here because of hang/timeout and a RAS error is
6631 	 * detected at the same time, let RAS recovery take care of it.
6632 	 */
6633 	if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) &&
6634 	    !amdgpu_sriov_vf(adev) &&
6635 	    reset_context->src != AMDGPU_RESET_SRC_RAS) {
6636 		dev_dbg(adev->dev,
6637 			"Gpu recovery from source: %d yielding to RAS error recovery handling",
6638 			reset_context->src);
6639 		return 0;
6640 	}
6641 
6642 	/*
6643 	 * Special case: RAS triggered and full reset isn't supported
6644 	 */
6645 	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
6646 
6647 	/*
6648 	 * Flush RAM to disk so that after reboot
6649 	 * the user can read log and see why the system rebooted.
6650 	 */
6651 	if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
6652 		amdgpu_ras_get_context(adev)->reboot) {
6653 		dev_warn(adev->dev, "Emergency reboot.");
6654 
6655 		ksys_sync_helper();
6656 		emergency_restart();
6657 	}
6658 
6659 	dev_info(adev->dev, "GPU %s begin!. Source:  %d\n",
6660 		 need_emergency_restart ? "jobs stop" : "reset",
6661 		 reset_context->src);
6662 
6663 	if (!amdgpu_sriov_vf(adev))
6664 		hive = amdgpu_get_xgmi_hive(adev);
6665 	if (hive)
6666 		mutex_lock(&hive->hive_lock);
6667 
6668 	reset_context->job = job;
6669 	reset_context->hive = hive;
6670 	INIT_LIST_HEAD(&device_list);
6671 
6672 	amdgpu_device_recovery_prepare(adev, &device_list, hive);
6673 
6674 	if (!amdgpu_sriov_vf(adev)) {
6675 		r = amdgpu_device_health_check(&device_list);
6676 		if (r)
6677 			goto end_reset;
6678 	}
6679 
6680 	/* Cannot be called after locking reset domain */
6681 	amdgpu_ras_pre_reset(adev, &device_list);
6682 
6683 	/* We need to lock reset domain only once both for XGMI and single device */
6684 	amdgpu_device_recovery_get_reset_lock(adev, &device_list);
6685 
6686 	amdgpu_device_halt_activities(adev, job, reset_context, &device_list,
6687 				      hive, need_emergency_restart);
6688 	if (need_emergency_restart)
6689 		goto skip_sched_resume;
6690 	/*
6691 	 * Must check guilty signal here since after this point all old
6692 	 * HW fences are force signaled.
6693 	 *
6694 	 * job->base holds a reference to parent fence
6695 	 */
6696 	if (job && dma_fence_is_signaled(&job->hw_fence->base)) {
6697 		job_signaled = true;
6698 		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
6699 		goto skip_hw_reset;
6700 	}
6701 
6702 	r = amdgpu_device_asic_reset(adev, &device_list, reset_context);
6703 	if (r)
6704 		goto reset_unlock;
6705 skip_hw_reset:
6706 	r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled);
6707 	if (r)
6708 		goto reset_unlock;
6709 skip_sched_resume:
6710 	amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart);
6711 reset_unlock:
6712 	amdgpu_device_recovery_put_reset_lock(adev, &device_list);
6713 	amdgpu_ras_post_reset(adev, &device_list);
6714 end_reset:
6715 	if (hive) {
6716 		mutex_unlock(&hive->hive_lock);
6717 		amdgpu_put_xgmi_hive(hive);
6718 	}
6719 
6720 	if (r)
6721 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
6722 
6723 	atomic_set(&adev->reset_domain->reset_res, r);
6724 
6725 	if (!r) {
6726 		struct amdgpu_task_info *ti = NULL;
6727 
6728 		/*
6729 		 * The job may already be freed at this point via the sched tdr workqueue so
6730 		 * use the cached pasid.
6731 		 */
6732 		if (pasid >= 0)
6733 			ti = amdgpu_vm_get_task_info_pasid(adev, pasid);
6734 
6735 		drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE,
6736 				     ti ? &ti->task : NULL);
6737 
6738 		amdgpu_vm_put_task_info(ti);
6739 	}
6740 
6741 	return r;
6742 }
6743 
6744 /**
6745  * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner
6746  *
6747  * @adev: amdgpu_device pointer
6748  * @speed: pointer to the speed of the link
6749  * @width: pointer to the width of the link
6750  *
6751  * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
6752  * first physical partner to an AMD dGPU.
6753  * This will exclude any virtual switches and links.
6754  */
amdgpu_device_partner_bandwidth(struct amdgpu_device * adev,enum pci_bus_speed * speed,enum pcie_link_width * width)6755 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
6756 					    enum pci_bus_speed *speed,
6757 					    enum pcie_link_width *width)
6758 {
6759 	struct pci_dev *parent = adev->pdev;
6760 
6761 	if (!speed || !width)
6762 		return;
6763 
6764 	*speed = PCI_SPEED_UNKNOWN;
6765 	*width = PCIE_LNK_WIDTH_UNKNOWN;
6766 
6767 	if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
6768 		while ((parent = pci_upstream_bridge(parent))) {
6769 			/* skip upstream/downstream switches internal to dGPU*/
6770 			if (parent->vendor == PCI_VENDOR_ID_ATI)
6771 				continue;
6772 			*speed = pcie_get_speed_cap(parent);
6773 			*width = pcie_get_width_cap(parent);
6774 			break;
6775 		}
6776 	} else {
6777 		/* use the current speeds rather than max if switching is not supported */
6778 		pcie_bandwidth_available(adev->pdev, NULL, speed, width);
6779 	}
6780 }
6781 
6782 /**
6783  * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
6784  *
6785  * @adev: amdgpu_device pointer
6786  * @speed: pointer to the speed of the link
6787  * @width: pointer to the width of the link
6788  *
6789  * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
6790  * AMD dGPU which may be a virtual upstream bridge.
6791  */
amdgpu_device_gpu_bandwidth(struct amdgpu_device * adev,enum pci_bus_speed * speed,enum pcie_link_width * width)6792 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
6793 					enum pci_bus_speed *speed,
6794 					enum pcie_link_width *width)
6795 {
6796 	struct pci_dev *parent = adev->pdev;
6797 
6798 	if (!speed || !width)
6799 		return;
6800 
6801 	parent = pci_upstream_bridge(parent);
6802 	if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
6803 		/* use the upstream/downstream switches internal to dGPU */
6804 		*speed = pcie_get_speed_cap(parent);
6805 		*width = pcie_get_width_cap(parent);
6806 		while ((parent = pci_upstream_bridge(parent))) {
6807 			if (parent->vendor == PCI_VENDOR_ID_ATI) {
6808 				/* use the upstream/downstream switches internal to dGPU */
6809 				*speed = pcie_get_speed_cap(parent);
6810 				*width = pcie_get_width_cap(parent);
6811 			}
6812 		}
6813 	} else {
6814 		/* use the device itself */
6815 		*speed = pcie_get_speed_cap(adev->pdev);
6816 		*width = pcie_get_width_cap(adev->pdev);
6817 	}
6818 }
6819 
6820 /**
6821  * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
6822  *
6823  * @adev: amdgpu_device pointer
6824  *
6825  * Fetches and stores in the driver the PCIE capabilities (gen speed
6826  * and lanes) of the slot the device is in. Handles APUs and
6827  * virtualized environments where PCIE config space may not be available.
6828  */
amdgpu_device_get_pcie_info(struct amdgpu_device * adev)6829 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
6830 {
6831 	enum pci_bus_speed speed_cap, platform_speed_cap;
6832 	enum pcie_link_width platform_link_width, link_width;
6833 
6834 	if (amdgpu_pcie_gen_cap)
6835 		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
6836 
6837 	if (amdgpu_pcie_lane_cap)
6838 		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
6839 
6840 	/* covers APUs as well */
6841 	if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
6842 		if (adev->pm.pcie_gen_mask == 0)
6843 			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
6844 		if (adev->pm.pcie_mlw_mask == 0)
6845 			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
6846 		return;
6847 	}
6848 
6849 	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
6850 		return;
6851 
6852 	amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
6853 					&platform_link_width);
6854 	amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width);
6855 
6856 	if (adev->pm.pcie_gen_mask == 0) {
6857 		/* asic caps */
6858 		if (speed_cap == PCI_SPEED_UNKNOWN) {
6859 			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6860 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6861 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6862 		} else {
6863 			if (speed_cap == PCIE_SPEED_32_0GT)
6864 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6865 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6866 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6867 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6868 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
6869 			else if (speed_cap == PCIE_SPEED_16_0GT)
6870 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6871 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6872 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6873 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
6874 			else if (speed_cap == PCIE_SPEED_8_0GT)
6875 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6876 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6877 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6878 			else if (speed_cap == PCIE_SPEED_5_0GT)
6879 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6880 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
6881 			else
6882 				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
6883 		}
6884 		/* platform caps */
6885 		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
6886 			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6887 						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6888 		} else {
6889 			if (platform_speed_cap == PCIE_SPEED_32_0GT)
6890 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6891 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6892 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6893 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6894 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
6895 			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
6896 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6897 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6898 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6899 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
6900 			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
6901 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6902 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6903 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
6904 			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
6905 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6906 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6907 			else
6908 				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
6909 
6910 		}
6911 	}
6912 	if (adev->pm.pcie_mlw_mask == 0) {
6913 		/* asic caps */
6914 		if (link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6915 			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK;
6916 		} else {
6917 			switch (link_width) {
6918 			case PCIE_LNK_X32:
6919 				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 |
6920 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 |
6921 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
6922 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6923 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6924 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6925 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6926 				break;
6927 			case PCIE_LNK_X16:
6928 				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 |
6929 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
6930 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6931 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6932 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6933 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6934 				break;
6935 			case PCIE_LNK_X12:
6936 				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
6937 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6938 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6939 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6940 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6941 				break;
6942 			case PCIE_LNK_X8:
6943 				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6944 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6945 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6946 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6947 				break;
6948 			case PCIE_LNK_X4:
6949 				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6950 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6951 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6952 				break;
6953 			case PCIE_LNK_X2:
6954 				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6955 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6956 				break;
6957 			case PCIE_LNK_X1:
6958 				adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1;
6959 				break;
6960 			default:
6961 				break;
6962 			}
6963 		}
6964 		/* platform caps */
6965 		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6966 			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
6967 		} else {
6968 			switch (platform_link_width) {
6969 			case PCIE_LNK_X32:
6970 				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
6971 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6972 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6973 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6974 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6975 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6976 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6977 				break;
6978 			case PCIE_LNK_X16:
6979 				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6980 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6981 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6982 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6983 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6984 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6985 				break;
6986 			case PCIE_LNK_X12:
6987 				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6988 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6989 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6990 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6991 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6992 				break;
6993 			case PCIE_LNK_X8:
6994 				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6995 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6996 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6997 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6998 				break;
6999 			case PCIE_LNK_X4:
7000 				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
7001 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
7002 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
7003 				break;
7004 			case PCIE_LNK_X2:
7005 				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
7006 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
7007 				break;
7008 			case PCIE_LNK_X1:
7009 				adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
7010 				break;
7011 			default:
7012 				break;
7013 			}
7014 		}
7015 	}
7016 }
7017 
7018 /**
7019  * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
7020  *
7021  * @adev: amdgpu_device pointer
7022  * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
7023  *
7024  * Return true if @peer_adev can access (DMA) @adev through the PCIe
7025  * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
7026  * @peer_adev.
7027  */
amdgpu_device_is_peer_accessible(struct amdgpu_device * adev,struct amdgpu_device * peer_adev)7028 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
7029 				      struct amdgpu_device *peer_adev)
7030 {
7031 #ifdef CONFIG_HSA_AMD_P2P
7032 	bool p2p_access =
7033 		!adev->gmc.xgmi.connected_to_cpu &&
7034 		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
7035 	if (!p2p_access)
7036 		dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n",
7037 			pci_name(peer_adev->pdev));
7038 
7039 	bool is_large_bar = adev->gmc.visible_vram_size &&
7040 		adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
7041 	bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev);
7042 
7043 	if (!p2p_addressable) {
7044 		uint64_t address_mask = peer_adev->dev->dma_mask ?
7045 			~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
7046 		resource_size_t aper_limit =
7047 			adev->gmc.aper_base + adev->gmc.aper_size - 1;
7048 
7049 		p2p_addressable = !(adev->gmc.aper_base & address_mask ||
7050 				     aper_limit & address_mask);
7051 	}
7052 	return pcie_p2p && is_large_bar && p2p_access && p2p_addressable;
7053 #else
7054 	return false;
7055 #endif
7056 }
7057 
amdgpu_device_baco_enter(struct amdgpu_device * adev)7058 int amdgpu_device_baco_enter(struct amdgpu_device *adev)
7059 {
7060 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
7061 
7062 	if (!amdgpu_device_supports_baco(adev))
7063 		return -ENOTSUPP;
7064 
7065 	if (ras && adev->ras_enabled &&
7066 	    adev->nbio.funcs->enable_doorbell_interrupt)
7067 		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
7068 
7069 	return amdgpu_dpm_baco_enter(adev);
7070 }
7071 
amdgpu_device_baco_exit(struct amdgpu_device * adev)7072 int amdgpu_device_baco_exit(struct amdgpu_device *adev)
7073 {
7074 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
7075 	int ret = 0;
7076 
7077 	if (!amdgpu_device_supports_baco(adev))
7078 		return -ENOTSUPP;
7079 
7080 	ret = amdgpu_dpm_baco_exit(adev);
7081 	if (ret)
7082 		return ret;
7083 
7084 	if (ras && adev->ras_enabled &&
7085 	    adev->nbio.funcs->enable_doorbell_interrupt)
7086 		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
7087 
7088 	if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
7089 	    adev->nbio.funcs->clear_doorbell_interrupt)
7090 		adev->nbio.funcs->clear_doorbell_interrupt(adev);
7091 
7092 	return 0;
7093 }
7094 
7095 /**
7096  * amdgpu_pci_error_detected - Called when a PCI error is detected.
7097  * @pdev: PCI device struct
7098  * @state: PCI channel state
7099  *
7100  * Description: Called when a PCI error is detected.
7101  *
7102  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
7103  */
amdgpu_pci_error_detected(struct pci_dev * pdev,pci_channel_state_t state)7104 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
7105 {
7106 	struct drm_device *dev = pci_get_drvdata(pdev);
7107 	struct amdgpu_device *adev = drm_to_adev(dev);
7108 	struct amdgpu_hive_info *hive __free(xgmi_put_hive) =
7109 		amdgpu_get_xgmi_hive(adev);
7110 	struct amdgpu_reset_context reset_context;
7111 	struct list_head device_list;
7112 
7113 	dev_info(adev->dev, "PCI error: detected callback!!\n");
7114 
7115 	adev->pci_channel_state = state;
7116 
7117 	switch (state) {
7118 	case pci_channel_io_normal:
7119 		dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state);
7120 		return PCI_ERS_RESULT_CAN_RECOVER;
7121 	case pci_channel_io_frozen:
7122 		/* Fatal error, prepare for slot reset */
7123 		dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state);
7124 		if (hive) {
7125 			/* Hive devices should be able to support FW based
7126 			 * link reset on other devices, if not return.
7127 			 */
7128 			if (!amdgpu_dpm_is_link_reset_supported(adev)) {
7129 				dev_warn(adev->dev,
7130 					 "No support for XGMI hive yet...\n");
7131 				return PCI_ERS_RESULT_DISCONNECT;
7132 			}
7133 			/* Set dpc status only if device is part of hive
7134 			 * Non-hive devices should be able to recover after
7135 			 * link reset.
7136 			 */
7137 			amdgpu_reset_set_dpc_status(adev, true);
7138 
7139 			mutex_lock(&hive->hive_lock);
7140 		}
7141 		memset(&reset_context, 0, sizeof(reset_context));
7142 		INIT_LIST_HEAD(&device_list);
7143 
7144 		amdgpu_device_recovery_prepare(adev, &device_list, hive);
7145 		amdgpu_device_recovery_get_reset_lock(adev, &device_list);
7146 		amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list,
7147 					      hive, false);
7148 		if (hive)
7149 			mutex_unlock(&hive->hive_lock);
7150 		return PCI_ERS_RESULT_NEED_RESET;
7151 	case pci_channel_io_perm_failure:
7152 		/* Permanent error, prepare for device removal */
7153 		dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state);
7154 		return PCI_ERS_RESULT_DISCONNECT;
7155 	}
7156 
7157 	return PCI_ERS_RESULT_NEED_RESET;
7158 }
7159 
7160 /**
7161  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
7162  * @pdev: pointer to PCI device
7163  */
amdgpu_pci_mmio_enabled(struct pci_dev * pdev)7164 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
7165 {
7166 	struct drm_device *dev = pci_get_drvdata(pdev);
7167 	struct amdgpu_device *adev = drm_to_adev(dev);
7168 
7169 	dev_info(adev->dev, "PCI error: mmio enabled callback!!\n");
7170 
7171 	/* TODO - dump whatever for debugging purposes */
7172 
7173 	/* This called only if amdgpu_pci_error_detected returns
7174 	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
7175 	 * works, no need to reset slot.
7176 	 */
7177 
7178 	return PCI_ERS_RESULT_RECOVERED;
7179 }
7180 
7181 /**
7182  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
7183  * @pdev: PCI device struct
7184  *
7185  * Description: This routine is called by the pci error recovery
7186  * code after the PCI slot has been reset, just before we
7187  * should resume normal operations.
7188  */
amdgpu_pci_slot_reset(struct pci_dev * pdev)7189 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
7190 {
7191 	struct drm_device *dev = pci_get_drvdata(pdev);
7192 	struct amdgpu_device *adev = drm_to_adev(dev);
7193 	struct amdgpu_reset_context reset_context;
7194 	struct amdgpu_device *tmp_adev;
7195 	struct amdgpu_hive_info *hive;
7196 	struct list_head device_list;
7197 	struct pci_dev *link_dev;
7198 	int r = 0, i, timeout;
7199 	u32 memsize;
7200 	u16 status;
7201 
7202 	dev_info(adev->dev, "PCI error: slot reset callback!!\n");
7203 
7204 	memset(&reset_context, 0, sizeof(reset_context));
7205 
7206 	if (adev->pcie_reset_ctx.swus)
7207 		link_dev = adev->pcie_reset_ctx.swus;
7208 	else
7209 		link_dev = adev->pdev;
7210 	/* wait for asic to come out of reset, timeout = 10s */
7211 	timeout = 10000;
7212 	do {
7213 		usleep_range(10000, 10500);
7214 		r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status);
7215 		timeout -= 10;
7216 	} while (timeout > 0 && (status != PCI_VENDOR_ID_ATI) &&
7217 		 (status != PCI_VENDOR_ID_AMD));
7218 
7219 	if ((status != PCI_VENDOR_ID_ATI) && (status != PCI_VENDOR_ID_AMD)) {
7220 		r = -ETIME;
7221 		goto out;
7222 	}
7223 
7224 	amdgpu_device_load_switch_state(adev);
7225 	/* Restore PCI confspace */
7226 	amdgpu_device_load_pci_state(pdev);
7227 
7228 	/* confirm  ASIC came out of reset */
7229 	for (i = 0; i < adev->usec_timeout; i++) {
7230 		memsize = amdgpu_asic_get_config_memsize(adev);
7231 
7232 		if (memsize != 0xffffffff)
7233 			break;
7234 		udelay(1);
7235 	}
7236 	if (memsize == 0xffffffff) {
7237 		r = -ETIME;
7238 		goto out;
7239 	}
7240 
7241 	reset_context.method = AMD_RESET_METHOD_NONE;
7242 	reset_context.reset_req_dev = adev;
7243 	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
7244 	set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
7245 	INIT_LIST_HEAD(&device_list);
7246 
7247 	hive = amdgpu_get_xgmi_hive(adev);
7248 	if (hive) {
7249 		mutex_lock(&hive->hive_lock);
7250 		reset_context.hive = hive;
7251 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
7252 			tmp_adev->pcie_reset_ctx.in_link_reset = true;
7253 			list_add_tail(&tmp_adev->reset_list, &device_list);
7254 		}
7255 	} else {
7256 		set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
7257 		list_add_tail(&adev->reset_list, &device_list);
7258 	}
7259 
7260 	r = amdgpu_device_asic_reset(adev, &device_list, &reset_context);
7261 out:
7262 	if (!r) {
7263 		if (amdgpu_device_cache_pci_state(adev->pdev))
7264 			pci_restore_state(adev->pdev);
7265 		dev_info(adev->dev, "PCIe error recovery succeeded\n");
7266 	} else {
7267 		dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r);
7268 		if (hive) {
7269 			list_for_each_entry(tmp_adev, &device_list, reset_list)
7270 				amdgpu_device_unset_mp1_state(tmp_adev);
7271 		}
7272 		amdgpu_device_recovery_put_reset_lock(adev, &device_list);
7273 	}
7274 
7275 	if (hive) {
7276 		mutex_unlock(&hive->hive_lock);
7277 		amdgpu_put_xgmi_hive(hive);
7278 	}
7279 
7280 	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
7281 }
7282 
7283 /**
7284  * amdgpu_pci_resume() - resume normal ops after PCI reset
7285  * @pdev: pointer to PCI device
7286  *
7287  * Called when the error recovery driver tells us that its
7288  * OK to resume normal operation.
7289  */
amdgpu_pci_resume(struct pci_dev * pdev)7290 void amdgpu_pci_resume(struct pci_dev *pdev)
7291 {
7292 	struct drm_device *dev = pci_get_drvdata(pdev);
7293 	struct amdgpu_device *adev = drm_to_adev(dev);
7294 	struct list_head device_list;
7295 	struct amdgpu_hive_info *hive = NULL;
7296 	struct amdgpu_device *tmp_adev = NULL;
7297 
7298 	dev_info(adev->dev, "PCI error: resume callback!!\n");
7299 
7300 	/* Only continue execution for the case of pci_channel_io_frozen */
7301 	if (adev->pci_channel_state != pci_channel_io_frozen)
7302 		return;
7303 
7304 	INIT_LIST_HEAD(&device_list);
7305 
7306 	hive = amdgpu_get_xgmi_hive(adev);
7307 	if (hive) {
7308 		mutex_lock(&hive->hive_lock);
7309 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
7310 			tmp_adev->pcie_reset_ctx.in_link_reset = false;
7311 			list_add_tail(&tmp_adev->reset_list, &device_list);
7312 		}
7313 	} else
7314 		list_add_tail(&adev->reset_list, &device_list);
7315 
7316 	amdgpu_device_sched_resume(&device_list, NULL, NULL);
7317 	amdgpu_device_gpu_resume(adev, &device_list, false);
7318 	amdgpu_device_recovery_put_reset_lock(adev, &device_list);
7319 
7320 	if (hive) {
7321 		mutex_unlock(&hive->hive_lock);
7322 		amdgpu_put_xgmi_hive(hive);
7323 	}
7324 }
7325 
amdgpu_device_cache_switch_state(struct amdgpu_device * adev)7326 static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev)
7327 {
7328 	struct pci_dev *swus, *swds;
7329 	int r;
7330 
7331 	swds = pci_upstream_bridge(adev->pdev);
7332 	if (!swds || swds->vendor != PCI_VENDOR_ID_ATI ||
7333 	    pci_pcie_type(swds) != PCI_EXP_TYPE_DOWNSTREAM)
7334 		return;
7335 	swus = pci_upstream_bridge(swds);
7336 	if (!swus ||
7337 	    (swus->vendor != PCI_VENDOR_ID_ATI &&
7338 	     swus->vendor != PCI_VENDOR_ID_AMD) ||
7339 	    pci_pcie_type(swus) != PCI_EXP_TYPE_UPSTREAM)
7340 		return;
7341 
7342 	/* If already saved, return */
7343 	if (adev->pcie_reset_ctx.swus)
7344 		return;
7345 	/* Upstream bridge is ATI, assume it's SWUS/DS architecture */
7346 	r = pci_save_state(swds);
7347 	if (r)
7348 		return;
7349 	adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(swds);
7350 
7351 	r = pci_save_state(swus);
7352 	if (r)
7353 		return;
7354 	adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(swus);
7355 
7356 	adev->pcie_reset_ctx.swus = swus;
7357 }
7358 
amdgpu_device_load_switch_state(struct amdgpu_device * adev)7359 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev)
7360 {
7361 	struct pci_dev *pdev;
7362 	int r;
7363 
7364 	if (!adev->pcie_reset_ctx.swds_pcistate ||
7365 	    !adev->pcie_reset_ctx.swus_pcistate)
7366 		return;
7367 
7368 	pdev = adev->pcie_reset_ctx.swus;
7369 	r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate);
7370 	if (!r) {
7371 		pci_restore_state(pdev);
7372 	} else {
7373 		dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r);
7374 		return;
7375 	}
7376 
7377 	pdev = pci_upstream_bridge(adev->pdev);
7378 	r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate);
7379 	if (!r)
7380 		pci_restore_state(pdev);
7381 	else
7382 		dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r);
7383 }
7384 
amdgpu_device_cache_pci_state(struct pci_dev * pdev)7385 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
7386 {
7387 	struct drm_device *dev = pci_get_drvdata(pdev);
7388 	struct amdgpu_device *adev = drm_to_adev(dev);
7389 	int r;
7390 
7391 	if (amdgpu_sriov_vf(adev))
7392 		return false;
7393 
7394 	r = pci_save_state(pdev);
7395 	if (!r) {
7396 		kfree(adev->pci_state);
7397 
7398 		adev->pci_state = pci_store_saved_state(pdev);
7399 
7400 		if (!adev->pci_state) {
7401 			dev_err(adev->dev, "Failed to store PCI saved state");
7402 			return false;
7403 		}
7404 	} else {
7405 		dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r);
7406 		return false;
7407 	}
7408 
7409 	amdgpu_device_cache_switch_state(adev);
7410 
7411 	return true;
7412 }
7413 
amdgpu_device_load_pci_state(struct pci_dev * pdev)7414 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
7415 {
7416 	struct drm_device *dev = pci_get_drvdata(pdev);
7417 	struct amdgpu_device *adev = drm_to_adev(dev);
7418 	int r;
7419 
7420 	if (!adev->pci_state)
7421 		return false;
7422 
7423 	r = pci_load_saved_state(pdev, adev->pci_state);
7424 
7425 	if (!r) {
7426 		pci_restore_state(pdev);
7427 	} else {
7428 		dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r);
7429 		return false;
7430 	}
7431 
7432 	return true;
7433 }
7434 
amdgpu_device_flush_hdp(struct amdgpu_device * adev,struct amdgpu_ring * ring)7435 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
7436 		struct amdgpu_ring *ring)
7437 {
7438 #ifdef CONFIG_X86_64
7439 	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
7440 		return;
7441 #endif
7442 	if (adev->gmc.xgmi.connected_to_cpu)
7443 		return;
7444 
7445 	if (ring && ring->funcs->emit_hdp_flush) {
7446 		amdgpu_ring_emit_hdp_flush(ring);
7447 		return;
7448 	}
7449 
7450 	if (!ring && amdgpu_sriov_runtime(adev)) {
7451 		if (!amdgpu_kiq_hdp_flush(adev))
7452 			return;
7453 	}
7454 
7455 	amdgpu_hdp_flush(adev, ring);
7456 }
7457 
amdgpu_device_invalidate_hdp(struct amdgpu_device * adev,struct amdgpu_ring * ring)7458 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
7459 		struct amdgpu_ring *ring)
7460 {
7461 #ifdef CONFIG_X86_64
7462 	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
7463 		return;
7464 #endif
7465 	if (adev->gmc.xgmi.connected_to_cpu)
7466 		return;
7467 
7468 	amdgpu_hdp_invalidate(adev, ring);
7469 }
7470 
amdgpu_in_reset(struct amdgpu_device * adev)7471 int amdgpu_in_reset(struct amdgpu_device *adev)
7472 {
7473 	return atomic_read(&adev->reset_domain->in_gpu_reset);
7474 }
7475 
7476 /**
7477  * amdgpu_device_halt() - bring hardware to some kind of halt state
7478  *
7479  * @adev: amdgpu_device pointer
7480  *
7481  * Bring hardware to some kind of halt state so that no one can touch it
7482  * any more. It will help to maintain error context when error occurred.
7483  * Compare to a simple hang, the system will keep stable at least for SSH
7484  * access. Then it should be trivial to inspect the hardware state and
7485  * see what's going on. Implemented as following:
7486  *
7487  * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc),
7488  *    clears all CPU mappings to device, disallows remappings through page faults
7489  * 2. amdgpu_irq_disable_all() disables all interrupts
7490  * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
7491  * 4. set adev->no_hw_access to avoid potential crashes after setp 5
7492  * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
7493  * 6. pci_disable_device() and pci_wait_for_pending_transaction()
7494  *    flush any in flight DMA operations
7495  */
amdgpu_device_halt(struct amdgpu_device * adev)7496 void amdgpu_device_halt(struct amdgpu_device *adev)
7497 {
7498 	struct pci_dev *pdev = adev->pdev;
7499 	struct drm_device *ddev = adev_to_drm(adev);
7500 
7501 	amdgpu_xcp_dev_unplug(adev);
7502 	drm_dev_unplug(ddev);
7503 
7504 	amdgpu_irq_disable_all(adev);
7505 
7506 	amdgpu_fence_driver_hw_fini(adev);
7507 
7508 	adev->no_hw_access = true;
7509 
7510 	amdgpu_device_unmap_mmio(adev);
7511 
7512 	pci_disable_device(pdev);
7513 	pci_wait_for_pending_transaction(pdev);
7514 }
7515 
amdgpu_device_pcie_port_rreg(struct amdgpu_device * adev,u32 reg)7516 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
7517 				u32 reg)
7518 {
7519 	unsigned long flags, address, data;
7520 	u32 r;
7521 
7522 	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
7523 	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
7524 
7525 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
7526 	WREG32(address, reg * 4);
7527 	(void)RREG32(address);
7528 	r = RREG32(data);
7529 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
7530 	return r;
7531 }
7532 
amdgpu_device_pcie_port_wreg(struct amdgpu_device * adev,u32 reg,u32 v)7533 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
7534 				u32 reg, u32 v)
7535 {
7536 	unsigned long flags, address, data;
7537 
7538 	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
7539 	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
7540 
7541 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
7542 	WREG32(address, reg * 4);
7543 	(void)RREG32(address);
7544 	WREG32(data, v);
7545 	(void)RREG32(data);
7546 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
7547 }
7548 
7549 /**
7550  * amdgpu_device_get_gang - return a reference to the current gang
7551  * @adev: amdgpu_device pointer
7552  *
7553  * Returns: A new reference to the current gang leader.
7554  */
amdgpu_device_get_gang(struct amdgpu_device * adev)7555 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
7556 {
7557 	struct dma_fence *fence;
7558 
7559 	rcu_read_lock();
7560 	fence = dma_fence_get_rcu_safe(&adev->gang_submit);
7561 	rcu_read_unlock();
7562 	return fence;
7563 }
7564 
7565 /**
7566  * amdgpu_device_switch_gang - switch to a new gang
7567  * @adev: amdgpu_device pointer
7568  * @gang: the gang to switch to
7569  *
7570  * Try to switch to a new gang.
7571  * Returns: NULL if we switched to the new gang or a reference to the current
7572  * gang leader.
7573  */
amdgpu_device_switch_gang(struct amdgpu_device * adev,struct dma_fence * gang)7574 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
7575 					    struct dma_fence *gang)
7576 {
7577 	struct dma_fence *old = NULL;
7578 
7579 	dma_fence_get(gang);
7580 	do {
7581 		dma_fence_put(old);
7582 		old = amdgpu_device_get_gang(adev);
7583 		if (old == gang)
7584 			break;
7585 
7586 		if (!dma_fence_is_signaled(old)) {
7587 			dma_fence_put(gang);
7588 			return old;
7589 		}
7590 
7591 	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
7592 			 old, gang) != old);
7593 
7594 	/*
7595 	 * Drop it once for the exchanged reference in adev and once for the
7596 	 * thread local reference acquired in amdgpu_device_get_gang().
7597 	 */
7598 	dma_fence_put(old);
7599 	dma_fence_put(old);
7600 	return NULL;
7601 }
7602 
7603 /**
7604  * amdgpu_device_enforce_isolation - enforce HW isolation
7605  * @adev: the amdgpu device pointer
7606  * @ring: the HW ring the job is supposed to run on
7607  * @job: the job which is about to be pushed to the HW ring
7608  *
7609  * Makes sure that only one client at a time can use the GFX block.
7610  * Returns: The dependency to wait on before the job can be pushed to the HW.
7611  * The function is called multiple times until NULL is returned.
7612  */
amdgpu_device_enforce_isolation(struct amdgpu_device * adev,struct amdgpu_ring * ring,struct amdgpu_job * job)7613 struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
7614 						  struct amdgpu_ring *ring,
7615 						  struct amdgpu_job *job)
7616 {
7617 	struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id];
7618 	struct drm_sched_fence *f = job->base.s_fence;
7619 	struct dma_fence *dep;
7620 	void *owner;
7621 	int r;
7622 
7623 	/*
7624 	 * For now enforce isolation only for the GFX block since we only need
7625 	 * the cleaner shader on those rings.
7626 	 */
7627 	if (ring->funcs->type != AMDGPU_RING_TYPE_GFX &&
7628 	    ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
7629 		return NULL;
7630 
7631 	/*
7632 	 * All submissions where enforce isolation is false are handled as if
7633 	 * they come from a single client. Use ~0l as the owner to distinct it
7634 	 * from kernel submissions where the owner is NULL.
7635 	 */
7636 	owner = job->enforce_isolation ? f->owner : (void *)~0l;
7637 
7638 	mutex_lock(&adev->enforce_isolation_mutex);
7639 
7640 	/*
7641 	 * The "spearhead" submission is the first one which changes the
7642 	 * ownership to its client. We always need to wait for it to be
7643 	 * pushed to the HW before proceeding with anything.
7644 	 */
7645 	if (&f->scheduled != isolation->spearhead &&
7646 	    !dma_fence_is_signaled(isolation->spearhead)) {
7647 		dep = isolation->spearhead;
7648 		goto out_grab_ref;
7649 	}
7650 
7651 	if (isolation->owner != owner) {
7652 
7653 		/*
7654 		 * Wait for any gang to be assembled before switching to a
7655 		 * different owner or otherwise we could deadlock the
7656 		 * submissions.
7657 		 */
7658 		if (!job->gang_submit) {
7659 			dep = amdgpu_device_get_gang(adev);
7660 			if (!dma_fence_is_signaled(dep))
7661 				goto out_return_dep;
7662 			dma_fence_put(dep);
7663 		}
7664 
7665 		dma_fence_put(isolation->spearhead);
7666 		isolation->spearhead = dma_fence_get(&f->scheduled);
7667 		amdgpu_sync_move(&isolation->active, &isolation->prev);
7668 		trace_amdgpu_isolation(isolation->owner, owner);
7669 		isolation->owner = owner;
7670 	}
7671 
7672 	/*
7673 	 * Specifying the ring here helps to pipeline submissions even when
7674 	 * isolation is enabled. If that is not desired for testing NULL can be
7675 	 * used instead of the ring to enforce a CPU round trip while switching
7676 	 * between clients.
7677 	 */
7678 	dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
7679 	r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
7680 	if (r)
7681 		dev_warn(adev->dev, "OOM tracking isolation\n");
7682 
7683 out_grab_ref:
7684 	dma_fence_get(dep);
7685 out_return_dep:
7686 	mutex_unlock(&adev->enforce_isolation_mutex);
7687 	return dep;
7688 }
7689 
amdgpu_device_has_display_hardware(struct amdgpu_device * adev)7690 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
7691 {
7692 	switch (adev->asic_type) {
7693 #ifdef CONFIG_DRM_AMDGPU_SI
7694 	case CHIP_HAINAN:
7695 #endif
7696 	case CHIP_TOPAZ:
7697 		/* chips with no display hardware */
7698 		return false;
7699 #ifdef CONFIG_DRM_AMDGPU_SI
7700 	case CHIP_TAHITI:
7701 	case CHIP_PITCAIRN:
7702 	case CHIP_VERDE:
7703 	case CHIP_OLAND:
7704 #endif
7705 #ifdef CONFIG_DRM_AMDGPU_CIK
7706 	case CHIP_BONAIRE:
7707 	case CHIP_HAWAII:
7708 	case CHIP_KAVERI:
7709 	case CHIP_KABINI:
7710 	case CHIP_MULLINS:
7711 #endif
7712 	case CHIP_TONGA:
7713 	case CHIP_FIJI:
7714 	case CHIP_POLARIS10:
7715 	case CHIP_POLARIS11:
7716 	case CHIP_POLARIS12:
7717 	case CHIP_VEGAM:
7718 	case CHIP_CARRIZO:
7719 	case CHIP_STONEY:
7720 		/* chips with display hardware */
7721 		return true;
7722 	default:
7723 		/* IP discovery */
7724 		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
7725 		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
7726 			return false;
7727 		return true;
7728 	}
7729 }
7730 
amdgpu_device_wait_on_rreg(struct amdgpu_device * adev,uint32_t inst,uint32_t reg_addr,char reg_name[],uint32_t expected_value,uint32_t mask)7731 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
7732 		uint32_t inst, uint32_t reg_addr, char reg_name[],
7733 		uint32_t expected_value, uint32_t mask)
7734 {
7735 	uint32_t ret = 0;
7736 	uint32_t old_ = 0;
7737 	uint32_t tmp_ = RREG32(reg_addr);
7738 	uint32_t loop = adev->usec_timeout;
7739 
7740 	while ((tmp_ & (mask)) != (expected_value)) {
7741 		if (old_ != tmp_) {
7742 			loop = adev->usec_timeout;
7743 			old_ = tmp_;
7744 		} else
7745 			udelay(1);
7746 		tmp_ = RREG32(reg_addr);
7747 		loop--;
7748 		if (!loop) {
7749 			dev_warn(
7750 				adev->dev,
7751 				"Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn",
7752 				inst, reg_name, (uint32_t)expected_value,
7753 				(uint32_t)(tmp_ & (mask)));
7754 			ret = -ETIMEDOUT;
7755 			break;
7756 		}
7757 	}
7758 	return ret;
7759 }
7760 
amdgpu_get_soft_full_reset_mask(struct amdgpu_ring * ring)7761 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
7762 {
7763 	ssize_t size = 0;
7764 
7765 	if (!ring || !ring->adev)
7766 		return size;
7767 
7768 	if (amdgpu_device_should_recover_gpu(ring->adev))
7769 		size |= AMDGPU_RESET_TYPE_FULL;
7770 
7771 	if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
7772 	    !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
7773 		size |= AMDGPU_RESET_TYPE_SOFT_RESET;
7774 
7775 	return size;
7776 }
7777 
amdgpu_show_reset_mask(char * buf,uint32_t supported_reset)7778 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
7779 {
7780 	ssize_t size = 0;
7781 
7782 	if (supported_reset == 0) {
7783 		size += sysfs_emit_at(buf, size, "unsupported");
7784 		size += sysfs_emit_at(buf, size, "\n");
7785 		return size;
7786 
7787 	}
7788 
7789 	if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
7790 		size += sysfs_emit_at(buf, size, "soft ");
7791 
7792 	if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
7793 		size += sysfs_emit_at(buf, size, "queue ");
7794 
7795 	if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
7796 		size += sysfs_emit_at(buf, size, "pipe ");
7797 
7798 	if (supported_reset & AMDGPU_RESET_TYPE_FULL)
7799 		size += sysfs_emit_at(buf, size, "full ");
7800 
7801 	size += sysfs_emit_at(buf, size, "\n");
7802 	return size;
7803 }
7804 
amdgpu_device_set_uid(struct amdgpu_uid * uid_info,enum amdgpu_uid_type type,uint8_t inst,uint64_t uid)7805 void amdgpu_device_set_uid(struct amdgpu_uid *uid_info,
7806 			   enum amdgpu_uid_type type, uint8_t inst,
7807 			   uint64_t uid)
7808 {
7809 	if (!uid_info)
7810 		return;
7811 
7812 	if (type >= AMDGPU_UID_TYPE_MAX) {
7813 		dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
7814 			     type);
7815 		return;
7816 	}
7817 
7818 	if (inst >= AMDGPU_UID_INST_MAX) {
7819 		dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
7820 			     inst);
7821 		return;
7822 	}
7823 
7824 	if (uid_info->uid[type][inst] != 0) {
7825 		dev_warn_once(
7826 			uid_info->adev->dev,
7827 			"Overwriting existing UID %llu for type %d instance %d\n",
7828 			uid_info->uid[type][inst], type, inst);
7829 	}
7830 
7831 	uid_info->uid[type][inst] = uid;
7832 }
7833 
amdgpu_device_get_uid(struct amdgpu_uid * uid_info,enum amdgpu_uid_type type,uint8_t inst)7834 u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info,
7835 			  enum amdgpu_uid_type type, uint8_t inst)
7836 {
7837 	if (!uid_info)
7838 		return 0;
7839 
7840 	if (type >= AMDGPU_UID_TYPE_MAX) {
7841 		dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
7842 			     type);
7843 		return 0;
7844 	}
7845 
7846 	if (inst >= AMDGPU_UID_INST_MAX) {
7847 		dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
7848 			     inst);
7849 		return 0;
7850 	}
7851 
7852 	return uid_info->uid[type][inst];
7853 }
7854