xref: /linux/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c (revision 25fae0b93d1d7ddb25958bcb90c3c0e5e0e202bd)
1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 
29 #include <linux/aperture.h>
30 #include <linux/power_supply.h>
31 #include <linux/kthread.h>
32 #include <linux/module.h>
33 #include <linux/console.h>
34 #include <linux/slab.h>
35 #include <linux/iommu.h>
36 #include <linux/pci.h>
37 #include <linux/pci-p2pdma.h>
38 #include <linux/apple-gmux.h>
39 
40 #include <drm/drm_atomic_helper.h>
41 #include <drm/drm_client_event.h>
42 #include <drm/drm_crtc_helper.h>
43 #include <drm/drm_probe_helper.h>
44 #include <drm/amdgpu_drm.h>
45 #include <linux/device.h>
46 #include <linux/vgaarb.h>
47 #include <linux/vga_switcheroo.h>
48 #include <linux/efi.h>
49 #include "amdgpu.h"
50 #include "amdgpu_trace.h"
51 #include "amdgpu_i2c.h"
52 #include "atom.h"
53 #include "amdgpu_atombios.h"
54 #include "amdgpu_atomfirmware.h"
55 #include "amd_pcie.h"
56 #ifdef CONFIG_DRM_AMDGPU_SI
57 #include "si.h"
58 #endif
59 #ifdef CONFIG_DRM_AMDGPU_CIK
60 #include "cik.h"
61 #endif
62 #include "vi.h"
63 #include "soc15.h"
64 #include "nv.h"
65 #include "bif/bif_4_1_d.h"
66 #include <linux/firmware.h>
67 #include "amdgpu_vf_error.h"
68 
69 #include "amdgpu_amdkfd.h"
70 #include "amdgpu_pm.h"
71 
72 #include "amdgpu_xgmi.h"
73 #include "amdgpu_ras.h"
74 #include "amdgpu_pmu.h"
75 #include "amdgpu_fru_eeprom.h"
76 #include "amdgpu_reset.h"
77 #include "amdgpu_virt.h"
78 #include "amdgpu_dev_coredump.h"
79 
80 #include <linux/suspend.h>
81 #include <drm/task_barrier.h>
82 #include <linux/pm_runtime.h>
83 
84 #include <drm/drm_drv.h>
85 
86 #if IS_ENABLED(CONFIG_X86)
87 #include <asm/intel-family.h>
88 #include <asm/cpu_device_id.h>
89 #endif
90 
91 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
92 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
93 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
94 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
95 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
96 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
97 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
98 
99 #define AMDGPU_RESUME_MS		2000
100 #define AMDGPU_MAX_RETRY_LIMIT		2
101 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
102 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
103 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
104 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)
105 
106 #define AMDGPU_VBIOS_SKIP (1U << 0)
107 #define AMDGPU_VBIOS_OPTIONAL (1U << 1)
108 
109 static const struct drm_driver amdgpu_kms_driver;
110 
111 const char *amdgpu_asic_name[] = {
112 	"TAHITI",
113 	"PITCAIRN",
114 	"VERDE",
115 	"OLAND",
116 	"HAINAN",
117 	"BONAIRE",
118 	"KAVERI",
119 	"KABINI",
120 	"HAWAII",
121 	"MULLINS",
122 	"TOPAZ",
123 	"TONGA",
124 	"FIJI",
125 	"CARRIZO",
126 	"STONEY",
127 	"POLARIS10",
128 	"POLARIS11",
129 	"POLARIS12",
130 	"VEGAM",
131 	"VEGA10",
132 	"VEGA12",
133 	"VEGA20",
134 	"RAVEN",
135 	"ARCTURUS",
136 	"RENOIR",
137 	"ALDEBARAN",
138 	"NAVI10",
139 	"CYAN_SKILLFISH",
140 	"NAVI14",
141 	"NAVI12",
142 	"SIENNA_CICHLID",
143 	"NAVY_FLOUNDER",
144 	"VANGOGH",
145 	"DIMGREY_CAVEFISH",
146 	"BEIGE_GOBY",
147 	"YELLOW_CARP",
148 	"IP DISCOVERY",
149 	"LAST",
150 };
151 
152 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM  - 1, 0)
153 /*
154  * Default init level where all blocks are expected to be initialized. This is
155  * the level of initialization expected by default and also after a full reset
156  * of the device.
157  */
158 struct amdgpu_init_level amdgpu_init_default = {
159 	.level = AMDGPU_INIT_LEVEL_DEFAULT,
160 	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
161 };
162 
163 struct amdgpu_init_level amdgpu_init_recovery = {
164 	.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
165 	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
166 };
167 
168 /*
169  * Minimal blocks needed to be initialized before an XGMI hive can be reset. This
170  * is used for cases like reset on initialization where the entire hive needs to
171  * be reset before first use.
172  */
173 struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
174 	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
175 	.hwini_ip_block_mask =
176 		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
177 		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
178 		BIT(AMD_IP_BLOCK_TYPE_PSP)
179 };
180 
181 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
182 					     enum amd_ip_block_type block)
183 {
184 	return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
185 }
186 
187 void amdgpu_set_init_level(struct amdgpu_device *adev,
188 			   enum amdgpu_init_lvl_id lvl)
189 {
190 	switch (lvl) {
191 	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
192 		adev->init_lvl = &amdgpu_init_minimal_xgmi;
193 		break;
194 	case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
195 		adev->init_lvl = &amdgpu_init_recovery;
196 		break;
197 	case AMDGPU_INIT_LEVEL_DEFAULT:
198 		fallthrough;
199 	default:
200 		adev->init_lvl = &amdgpu_init_default;
201 		break;
202 	}
203 }
204 
205 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
206 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
207 				     void *data);
208 
209 /**
210  * DOC: pcie_replay_count
211  *
212  * The amdgpu driver provides a sysfs API for reporting the total number
213  * of PCIe replays (NAKs).
214  * The file pcie_replay_count is used for this and returns the total
215  * number of replays as a sum of the NAKs generated and NAKs received.
216  */
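/*
 * A minimal usage sketch, not part of the driver: the sysfs path below is an
 * assumption based on the usual DRM layout (card0 may differ on multi-GPU
 * systems).
 *
 *   $ cat /sys/class/drm/card0/device/pcie_replay_count
 *   0
 */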
217 
218 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
219 		struct device_attribute *attr, char *buf)
220 {
221 	struct drm_device *ddev = dev_get_drvdata(dev);
222 	struct amdgpu_device *adev = drm_to_adev(ddev);
223 	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
224 
225 	return sysfs_emit(buf, "%llu\n", cnt);
226 }
227 
228 static DEVICE_ATTR(pcie_replay_count, 0444,
229 		amdgpu_device_get_pcie_replay_count, NULL);
230 
231 static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
232 {
233 	int ret = 0;
234 
235 	if (!amdgpu_sriov_vf(adev))
236 		ret = sysfs_create_file(&adev->dev->kobj,
237 					&dev_attr_pcie_replay_count.attr);
238 
239 	return ret;
240 }
241 
242 static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
243 {
244 	if (!amdgpu_sriov_vf(adev))
245 		sysfs_remove_file(&adev->dev->kobj,
246 				  &dev_attr_pcie_replay_count.attr);
247 }
248 
249 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
250 					  const struct bin_attribute *attr, char *buf,
251 					  loff_t ppos, size_t count)
252 {
253 	struct device *dev = kobj_to_dev(kobj);
254 	struct drm_device *ddev = dev_get_drvdata(dev);
255 	struct amdgpu_device *adev = drm_to_adev(ddev);
256 	ssize_t bytes_read;
257 
258 	switch (ppos) {
259 	case AMDGPU_SYS_REG_STATE_XGMI:
260 		bytes_read = amdgpu_asic_get_reg_state(
261 			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
262 		break;
263 	case AMDGPU_SYS_REG_STATE_WAFL:
264 		bytes_read = amdgpu_asic_get_reg_state(
265 			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
266 		break;
267 	case AMDGPU_SYS_REG_STATE_PCIE:
268 		bytes_read = amdgpu_asic_get_reg_state(
269 			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
270 		break;
271 	case AMDGPU_SYS_REG_STATE_USR:
272 		bytes_read = amdgpu_asic_get_reg_state(
273 			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
274 		break;
275 	case AMDGPU_SYS_REG_STATE_USR_1:
276 		bytes_read = amdgpu_asic_get_reg_state(
277 			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
278 		break;
279 	default:
280 		return -EINVAL;
281 	}
282 
283 	return bytes_read;
284 }
285 
286 static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
287 		      AMDGPU_SYS_REG_STATE_END);
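/*
 * Userspace sketch (illustrative only, sysfs path assumed): the bin attribute
 * is indexed by the AMDGPU_SYS_REG_STATE_* byte offsets, so a reader seeks to
 * the section it wants before reading.
 *
 *	int fd = open("/sys/class/drm/card0/device/reg_state", O_RDONLY);
 *	ssize_t n = pread(fd, buf, sizeof(buf), AMDGPU_SYS_REG_STATE_XGMI);
 */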
288 
289 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
290 {
291 	int ret;
292 
293 	if (!amdgpu_asic_get_reg_state_supported(adev))
294 		return 0;
295 
296 	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
297 
298 	return ret;
299 }
300 
301 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
302 {
303 	if (!amdgpu_asic_get_reg_state_supported(adev))
304 		return;
305 	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
306 }
307 
308 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
309 {
310 	int r;
311 
312 	if (ip_block->version->funcs->suspend) {
313 		r = ip_block->version->funcs->suspend(ip_block);
314 		if (r) {
315 			dev_err(ip_block->adev->dev,
316 				"suspend of IP block <%s> failed %d\n",
317 				ip_block->version->funcs->name, r);
318 			return r;
319 		}
320 	}
321 
322 	ip_block->status.hw = false;
323 	return 0;
324 }
325 
326 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
327 {
328 	int r;
329 
330 	if (ip_block->version->funcs->resume) {
331 		r = ip_block->version->funcs->resume(ip_block);
332 		if (r) {
333 			dev_err(ip_block->adev->dev,
334 				"resume of IP block <%s> failed %d\n",
335 				ip_block->version->funcs->name, r);
336 			return r;
337 		}
338 	}
339 
340 	ip_block->status.hw = true;
341 	return 0;
342 }
343 
344 /**
345  * DOC: board_info
346  *
347  * The amdgpu driver provides a sysfs API for reporting board-related information.
348  * It provides the form factor information in the format
349  *
350  *   type : form factor
351  *
352  * Possible form factor values
353  *
354  * - "cem"		- PCIE CEM card
355  * - "oam"		- Open Compute Accelerator Module
356  * - "unknown"	- Not known
357  *
358  */
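/*
 * A minimal usage sketch, not part of the driver: the sysfs path is assumed,
 * and the value reported depends on the package type returned by the SMUIO
 * block.
 *
 *   $ cat /sys/class/drm/card0/device/board_info
 *   type : oam
 */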
359 
360 static ssize_t amdgpu_device_get_board_info(struct device *dev,
361 					    struct device_attribute *attr,
362 					    char *buf)
363 {
364 	struct drm_device *ddev = dev_get_drvdata(dev);
365 	struct amdgpu_device *adev = drm_to_adev(ddev);
366 	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
367 	const char *pkg;
368 
369 	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
370 		pkg_type = adev->smuio.funcs->get_pkg_type(adev);
371 
372 	switch (pkg_type) {
373 	case AMDGPU_PKG_TYPE_CEM:
374 		pkg = "cem";
375 		break;
376 	case AMDGPU_PKG_TYPE_OAM:
377 		pkg = "oam";
378 		break;
379 	default:
380 		pkg = "unknown";
381 		break;
382 	}
383 
384 	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
385 }
386 
387 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);
388 
389 static struct attribute *amdgpu_board_attrs[] = {
390 	&dev_attr_board_info.attr,
391 	NULL,
392 };
393 
394 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
395 					     struct attribute *attr, int n)
396 {
397 	struct device *dev = kobj_to_dev(kobj);
398 	struct drm_device *ddev = dev_get_drvdata(dev);
399 	struct amdgpu_device *adev = drm_to_adev(ddev);
400 
401 	if (adev->flags & AMD_IS_APU)
402 		return 0;
403 
404 	return attr->mode;
405 }
406 
407 static const struct attribute_group amdgpu_board_attrs_group = {
408 	.attrs = amdgpu_board_attrs,
409 	.is_visible = amdgpu_board_attrs_is_visible
410 };
411 
412 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
413 
414 
415 /**
416  * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
417  *
418  * @dev: drm_device pointer
419  *
420  * Returns true if the device is a dGPU with ATPX power control,
421  * otherwise returns false.
422  */
423 bool amdgpu_device_supports_px(struct drm_device *dev)
424 {
425 	struct amdgpu_device *adev = drm_to_adev(dev);
426 
427 	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
428 		return true;
429 	return false;
430 }
431 
432 /**
433  * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
434  *
435  * @dev: drm_device pointer
436  *
437  * Returns true if the device is a dGPU with ACPI power control,
438  * otherwise returns false.
439  */
440 bool amdgpu_device_supports_boco(struct drm_device *dev)
441 {
442 	struct amdgpu_device *adev = drm_to_adev(dev);
443 
444 	if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
445 		return false;
446 
447 	if (adev->has_pr3 ||
448 	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
449 		return true;
450 	return false;
451 }
452 
453 /**
454  * amdgpu_device_supports_baco - Does the device support BACO
455  *
456  * @dev: drm_device pointer
457  *
458  * Return:
459  * 1 if the device supports BACO;
460  * 3 if the device supports MACO (only works if BACO is supported)
461  * otherwise returns 0.
462  */
463 int amdgpu_device_supports_baco(struct drm_device *dev)
464 {
465 	struct amdgpu_device *adev = drm_to_adev(dev);
466 
467 	return amdgpu_asic_supports_baco(adev);
468 }
469 
470 void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
471 {
472 	struct drm_device *dev;
473 	int bamaco_support;
474 
475 	dev = adev_to_drm(adev);
476 
477 	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
478 	bamaco_support = amdgpu_device_supports_baco(dev);
479 
480 	switch (amdgpu_runtime_pm) {
481 	case 2:
482 		if (bamaco_support & MACO_SUPPORT) {
483 			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
484 			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
485 		} else if (bamaco_support == BACO_SUPPORT) {
486 			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
487 			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
488 		}
489 		break;
490 	case 1:
491 		if (bamaco_support & BACO_SUPPORT) {
492 			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
493 			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
494 		}
495 		break;
496 	case -1:
497 	case -2:
498 		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
499 			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
500 			dev_info(adev->dev, "Using ATPX for runtime pm\n");
501 		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
502 			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
503 			dev_info(adev->dev, "Using BOCO for runtime pm\n");
504 		} else {
505 			if (!bamaco_support)
506 				goto no_runtime_pm;
507 
508 			switch (adev->asic_type) {
509 			case CHIP_VEGA20:
510 			case CHIP_ARCTURUS:
511 				/* BACO is not supported on vega20 and arcturus */
512 				break;
513 			case CHIP_VEGA10:
514 				/* enable BACO as runpm mode if noretry=0 */
515 				if (!adev->gmc.noretry && !amdgpu_passthrough(adev))
516 					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
517 				break;
518 			default:
519 				/* enable BACO as runpm mode on CI+ */
520 				if (!amdgpu_passthrough(adev))
521 					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
522 				break;
523 			}
524 
525 			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
526 				if (bamaco_support & MACO_SUPPORT) {
527 					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
528 					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
529 				} else {
530 					dev_info(adev->dev, "Using BACO for runtime pm\n");
531 				}
532 			}
533 		}
534 		break;
535 	case 0:
536 		dev_info(adev->dev, "runtime pm is manually disabled\n");
537 		break;
538 	default:
539 		break;
540 	}
541 
542 no_runtime_pm:
543 	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
544 		dev_info(adev->dev, "Runtime PM not available\n");
545 }
546 /**
547  * amdgpu_device_supports_smart_shift - Is the device a dGPU with
548  * smart shift support
549  *
550  * @dev: drm_device pointer
551  *
552  * Returns true if the device is a dGPU with Smart Shift support,
553  * otherwise returns false.
554  */
555 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
556 {
557 	return (amdgpu_device_supports_boco(dev) &&
558 		amdgpu_acpi_is_power_shift_control_supported());
559 }
560 
561 /*
562  * VRAM access helper functions
563  */
564 
565 /**
566  * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
567  *
568  * @adev: amdgpu_device pointer
569  * @pos: offset of the buffer in vram
570  * @buf: virtual address of the buffer in system memory
571  * @size: read/write size, the buffer at @buf must be at least @size bytes
572  * @write: true - write to vram, otherwise - read from vram
573  */
574 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
575 			     void *buf, size_t size, bool write)
576 {
577 	unsigned long flags;
578 	uint32_t hi = ~0, tmp = 0;
579 	uint32_t *data = buf;
580 	uint64_t last;
581 	int idx;
582 
583 	if (!drm_dev_enter(adev_to_drm(adev), &idx))
584 		return;
585 
586 	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
587 
588 	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
589 	for (last = pos + size; pos < last; pos += 4) {
590 		tmp = pos >> 31;
591 
592 		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
593 		if (tmp != hi) {
594 			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
595 			hi = tmp;
596 		}
597 		if (write)
598 			WREG32_NO_KIQ(mmMM_DATA, *data++);
599 		else
600 			*data++ = RREG32_NO_KIQ(mmMM_DATA);
601 	}
602 
603 	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
604 	drm_dev_exit(idx);
605 }
606 
607 /**
608  * amdgpu_device_aper_access - access vram by vram aperture
609  *
610  * @adev: amdgpu_device pointer
611  * @pos: offset of the buffer in vram
612  * @buf: virtual address of the buffer in system memory
613  * @size: read/write size, the buffer at @buf must be at least @size bytes
614  * @write: true - write to vram, otherwise - read from vram
615  *
616  * The return value means how many bytes have been transferred.
617  */
618 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
619 				 void *buf, size_t size, bool write)
620 {
621 #ifdef CONFIG_64BIT
622 	void __iomem *addr;
623 	size_t count = 0;
624 	uint64_t last;
625 
626 	if (!adev->mman.aper_base_kaddr)
627 		return 0;
628 
629 	last = min(pos + size, adev->gmc.visible_vram_size);
630 	if (last > pos) {
631 		addr = adev->mman.aper_base_kaddr + pos;
632 		count = last - pos;
633 
634 		if (write) {
635 			memcpy_toio(addr, buf, count);
636 			/* Make sure HDP write cache flush happens without any reordering
637 			 * after the system memory contents are sent over PCIe device
638 			 */
639 			mb();
640 			amdgpu_device_flush_hdp(adev, NULL);
641 		} else {
642 			amdgpu_device_invalidate_hdp(adev, NULL);
643 			/* Make sure HDP read cache is invalidated before issuing a read
644 			 * to the PCIe device
645 			 */
646 			mb();
647 			memcpy_fromio(buf, addr, count);
648 		}
649 
650 	}
651 
652 	return count;
653 #else
654 	return 0;
655 #endif
656 }
657 
658 /**
659  * amdgpu_device_vram_access - read/write a buffer in vram
660  *
661  * @adev: amdgpu_device pointer
662  * @pos: offset of the buffer in vram
663  * @buf: virtual address of the buffer in system memory
664  * @size: read/write size, the buffer at @buf must be at least @size bytes
665  * @write: true - write to vram, otherwise - read from vram
666  */
667 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
668 			       void *buf, size_t size, bool write)
669 {
670 	size_t count;
671 
672 	/* try to use the VRAM aperture to access VRAM first */
673 	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
674 	size -= count;
675 	if (size) {
676 		/* use MM access for the rest of the VRAM */
677 		pos += count;
678 		buf += count;
679 		amdgpu_device_mm_access(adev, pos, buf, size, write);
680 	}
681 }
682 
683 /*
684  * register access helper functions.
685  */
686 
687 /* Check if hw access should be skipped because of hotplug or device error */
688 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
689 {
690 	if (adev->no_hw_access)
691 		return true;
692 
693 #ifdef CONFIG_LOCKDEP
694 	/*
695 	 * This is a bit complicated to understand, so worth a comment. What we assert
696 	 * here is that the GPU reset is not running on another thread in parallel.
697 	 *
698 	 * For this we trylock the read side of the reset semaphore, if that succeeds
699 	 * we know that the reset is not running in parallel.
700 	 *
701 	 * If the trylock fails we assert that we are either already holding the read
702 	 * side of the lock or are the reset thread itself and hold the write side of
703 	 * the lock.
704 	 */
705 	if (in_task()) {
706 		if (down_read_trylock(&adev->reset_domain->sem))
707 			up_read(&adev->reset_domain->sem);
708 		else
709 			lockdep_assert_held(&adev->reset_domain->sem);
710 	}
711 #endif
712 	return false;
713 }
714 
715 /**
716  * amdgpu_device_rreg - read a memory mapped IO or indirect register
717  *
718  * @adev: amdgpu_device pointer
719  * @reg: dword aligned register offset
720  * @acc_flags: access flags which require special behavior
721  *
722  * Returns the 32 bit value from the offset specified.
723  */
724 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
725 			    uint32_t reg, uint32_t acc_flags)
726 {
727 	uint32_t ret;
728 
729 	if (amdgpu_device_skip_hw_access(adev))
730 		return 0;
731 
732 	if ((reg * 4) < adev->rmmio_size) {
733 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
734 		    amdgpu_sriov_runtime(adev) &&
735 		    down_read_trylock(&adev->reset_domain->sem)) {
736 			ret = amdgpu_kiq_rreg(adev, reg, 0);
737 			up_read(&adev->reset_domain->sem);
738 		} else {
739 			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
740 		}
741 	} else {
742 		ret = adev->pcie_rreg(adev, reg * 4);
743 	}
744 
745 	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
746 
747 	return ret;
748 }
749 
750 /*
751  * MMIO register read with bytes helper functions
752  * @offset: byte offset from MMIO start
753  */
754 
755 /**
756  * amdgpu_mm_rreg8 - read a memory mapped IO register
757  *
758  * @adev: amdgpu_device pointer
759  * @offset: byte aligned register offset
760  *
761  * Returns the 8 bit value from the offset specified.
762  */
763 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
764 {
765 	if (amdgpu_device_skip_hw_access(adev))
766 		return 0;
767 
768 	if (offset < adev->rmmio_size)
769 		return (readb(adev->rmmio + offset));
770 	BUG();
771 }
772 
773 
774 /**
775  * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
776  *
777  * @adev: amdgpu_device pointer
778  * @reg: dword aligned register offset
779  * @acc_flags: access flags which require special behavior
780  * @xcc_id: xcc accelerated compute core id
781  *
782  * Returns the 32 bit value from the offset specified.
783  */
784 uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
785 				uint32_t reg, uint32_t acc_flags,
786 				uint32_t xcc_id)
787 {
788 	uint32_t ret, rlcg_flag;
789 
790 	if (amdgpu_device_skip_hw_access(adev))
791 		return 0;
792 
793 	if ((reg * 4) < adev->rmmio_size) {
794 		if (amdgpu_sriov_vf(adev) &&
795 		    !amdgpu_sriov_runtime(adev) &&
796 		    adev->gfx.rlc.rlcg_reg_access_supported &&
797 		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
798 							 GC_HWIP, false,
799 							 &rlcg_flag)) {
800 			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
801 		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
802 		    amdgpu_sriov_runtime(adev) &&
803 		    down_read_trylock(&adev->reset_domain->sem)) {
804 			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
805 			up_read(&adev->reset_domain->sem);
806 		} else {
807 			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
808 		}
809 	} else {
810 		ret = adev->pcie_rreg(adev, reg * 4);
811 	}
812 
813 	return ret;
814 }
815 
816 /*
817  * MMIO register write with bytes helper functions
818  * @offset: byte offset from MMIO start
819  * @value: the value to be written to the register
820  */
821 
822 /**
823  * amdgpu_mm_wreg8 - write a memory mapped IO register
824  *
825  * @adev: amdgpu_device pointer
826  * @offset: byte aligned register offset
827  * @value: 8 bit value to write
828  *
829  * Writes the value specified to the offset specified.
830  */
831 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
832 {
833 	if (amdgpu_device_skip_hw_access(adev))
834 		return;
835 
836 	if (offset < adev->rmmio_size)
837 		writeb(value, adev->rmmio + offset);
838 	else
839 		BUG();
840 }
841 
842 /**
843  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
844  *
845  * @adev: amdgpu_device pointer
846  * @reg: dword aligned register offset
847  * @v: 32 bit value to write to the register
848  * @acc_flags: access flags which require special behavior
849  *
850  * Writes the value specified to the offset specified.
851  */
852 void amdgpu_device_wreg(struct amdgpu_device *adev,
853 			uint32_t reg, uint32_t v,
854 			uint32_t acc_flags)
855 {
856 	if (amdgpu_device_skip_hw_access(adev))
857 		return;
858 
859 	if ((reg * 4) < adev->rmmio_size) {
860 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
861 		    amdgpu_sriov_runtime(adev) &&
862 		    down_read_trylock(&adev->reset_domain->sem)) {
863 			amdgpu_kiq_wreg(adev, reg, v, 0);
864 			up_read(&adev->reset_domain->sem);
865 		} else {
866 			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
867 		}
868 	} else {
869 		adev->pcie_wreg(adev, reg * 4, v);
870 	}
871 
872 	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
873 }
874 
875 /**
876  * amdgpu_mm_wreg_mmio_rlc - write a register either via direct/indirect MMIO or via the RLC path if in range
877  *
878  * @adev: amdgpu_device pointer
879  * @reg: mmio/rlc register
880  * @v: value to write
881  * @xcc_id: xcc accelerated compute core id
882  *
883  * This function is invoked only for debugfs register access.
884  */
885 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
886 			     uint32_t reg, uint32_t v,
887 			     uint32_t xcc_id)
888 {
889 	if (amdgpu_device_skip_hw_access(adev))
890 		return;
891 
892 	if (amdgpu_sriov_fullaccess(adev) &&
893 	    adev->gfx.rlc.funcs &&
894 	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
895 		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
896 			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
897 	} else if ((reg * 4) >= adev->rmmio_size) {
898 		adev->pcie_wreg(adev, reg * 4, v);
899 	} else {
900 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
901 	}
902 }
903 
904 /**
905  * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
906  *
907  * @adev: amdgpu_device pointer
908  * @reg: dword aligned register offset
909  * @v: 32 bit value to write to the register
910  * @acc_flags: access flags which require special behavior
911  * @xcc_id: xcc accelerated compute core id
912  *
913  * Writes the value specified to the offset specified.
914  */
915 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
916 			uint32_t reg, uint32_t v,
917 			uint32_t acc_flags, uint32_t xcc_id)
918 {
919 	uint32_t rlcg_flag;
920 
921 	if (amdgpu_device_skip_hw_access(adev))
922 		return;
923 
924 	if ((reg * 4) < adev->rmmio_size) {
925 		if (amdgpu_sriov_vf(adev) &&
926 		    !amdgpu_sriov_runtime(adev) &&
927 		    adev->gfx.rlc.rlcg_reg_access_supported &&
928 		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
929 							 GC_HWIP, true,
930 							 &rlcg_flag)) {
931 			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
932 		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
933 		    amdgpu_sriov_runtime(adev) &&
934 		    down_read_trylock(&adev->reset_domain->sem)) {
935 			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
936 			up_read(&adev->reset_domain->sem);
937 		} else {
938 			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
939 		}
940 	} else {
941 		adev->pcie_wreg(adev, reg * 4, v);
942 	}
943 }
944 
945 /**
946  * amdgpu_device_indirect_rreg - read an indirect register
947  *
948  * @adev: amdgpu_device pointer
949  * @reg_addr: indirect register address to read from
950  *
951  * Returns the value of indirect register @reg_addr
952  */
953 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
954 				u32 reg_addr)
955 {
956 	unsigned long flags, pcie_index, pcie_data;
957 	void __iomem *pcie_index_offset;
958 	void __iomem *pcie_data_offset;
959 	u32 r;
960 
961 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
962 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
963 
964 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
965 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
966 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
967 
968 	writel(reg_addr, pcie_index_offset);
969 	readl(pcie_index_offset);
970 	r = readl(pcie_data_offset);
971 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
972 
973 	return r;
974 }
975 
976 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
977 				    u64 reg_addr)
978 {
979 	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
980 	u32 r;
981 	void __iomem *pcie_index_offset;
982 	void __iomem *pcie_index_hi_offset;
983 	void __iomem *pcie_data_offset;
984 
985 	if (unlikely(!adev->nbio.funcs)) {
986 		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
987 		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
988 	} else {
989 		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
990 		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
991 	}
992 
993 	if (reg_addr >> 32) {
994 		if (unlikely(!adev->nbio.funcs))
995 			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
996 		else
997 			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
998 	} else {
999 		pcie_index_hi = 0;
1000 	}
1001 
1002 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1003 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1004 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1005 	if (pcie_index_hi != 0)
1006 		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1007 				pcie_index_hi * 4;
1008 
1009 	writel(reg_addr, pcie_index_offset);
1010 	readl(pcie_index_offset);
1011 	if (pcie_index_hi != 0) {
1012 		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1013 		readl(pcie_index_hi_offset);
1014 	}
1015 	r = readl(pcie_data_offset);
1016 
1017 	/* clear the high bits */
1018 	if (pcie_index_hi != 0) {
1019 		writel(0, pcie_index_hi_offset);
1020 		readl(pcie_index_hi_offset);
1021 	}
1022 
1023 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1024 
1025 	return r;
1026 }
1027 
1028 /**
1029  * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
1030  *
1031  * @adev: amdgpu_device pointer
1032  * @reg_addr: indirect register address to read from
1033  *
1034  * Returns the value of indirect register @reg_addr
1035  */
1036 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
1037 				  u32 reg_addr)
1038 {
1039 	unsigned long flags, pcie_index, pcie_data;
1040 	void __iomem *pcie_index_offset;
1041 	void __iomem *pcie_data_offset;
1042 	u64 r;
1043 
1044 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1045 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1046 
1047 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1048 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1049 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1050 
1051 	/* read low 32 bits */
1052 	writel(reg_addr, pcie_index_offset);
1053 	readl(pcie_index_offset);
1054 	r = readl(pcie_data_offset);
1055 	/* read high 32 bits */
1056 	writel(reg_addr + 4, pcie_index_offset);
1057 	readl(pcie_index_offset);
1058 	r |= ((u64)readl(pcie_data_offset) << 32);
1059 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1060 
1061 	return r;
1062 }
1063 
1064 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
1065 				  u64 reg_addr)
1066 {
1067 	unsigned long flags, pcie_index, pcie_data;
1068 	unsigned long pcie_index_hi = 0;
1069 	void __iomem *pcie_index_offset;
1070 	void __iomem *pcie_index_hi_offset;
1071 	void __iomem *pcie_data_offset;
1072 	u64 r;
1073 
1074 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1075 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1076 	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1077 		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1078 
1079 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1080 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1081 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1082 	if (pcie_index_hi != 0)
1083 		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1084 			pcie_index_hi * 4;
1085 
1086 	/* read low 32 bits */
1087 	writel(reg_addr, pcie_index_offset);
1088 	readl(pcie_index_offset);
1089 	if (pcie_index_hi != 0) {
1090 		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1091 		readl(pcie_index_hi_offset);
1092 	}
1093 	r = readl(pcie_data_offset);
1094 	/* read high 32 bits */
1095 	writel(reg_addr + 4, pcie_index_offset);
1096 	readl(pcie_index_offset);
1097 	if (pcie_index_hi != 0) {
1098 		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1099 		readl(pcie_index_hi_offset);
1100 	}
1101 	r |= ((u64)readl(pcie_data_offset) << 32);
1102 
1103 	/* clear the high bits */
1104 	if (pcie_index_hi != 0) {
1105 		writel(0, pcie_index_hi_offset);
1106 		readl(pcie_index_hi_offset);
1107 	}
1108 
1109 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1110 
1111 	return r;
1112 }
1113 
1114 /**
1115  * amdgpu_device_indirect_wreg - write an indirect register address
1116  *
1117  * @adev: amdgpu_device pointer
1118  * @reg_addr: indirect register offset
1119  * @reg_data: indirect register data
1120  *
1121  */
1122 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
1123 				 u32 reg_addr, u32 reg_data)
1124 {
1125 	unsigned long flags, pcie_index, pcie_data;
1126 	void __iomem *pcie_index_offset;
1127 	void __iomem *pcie_data_offset;
1128 
1129 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1130 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1131 
1132 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1133 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1134 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1135 
1136 	writel(reg_addr, pcie_index_offset);
1137 	readl(pcie_index_offset);
1138 	writel(reg_data, pcie_data_offset);
1139 	readl(pcie_data_offset);
1140 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1141 }
1142 
1143 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
1144 				     u64 reg_addr, u32 reg_data)
1145 {
1146 	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
1147 	void __iomem *pcie_index_offset;
1148 	void __iomem *pcie_index_hi_offset;
1149 	void __iomem *pcie_data_offset;
1150 
1151 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1152 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1153 	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1154 		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1155 	else
1156 		pcie_index_hi = 0;
1157 
1158 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1159 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1160 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1161 	if (pcie_index_hi != 0)
1162 		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1163 				pcie_index_hi * 4;
1164 
1165 	writel(reg_addr, pcie_index_offset);
1166 	readl(pcie_index_offset);
1167 	if (pcie_index_hi != 0) {
1168 		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1169 		readl(pcie_index_hi_offset);
1170 	}
1171 	writel(reg_data, pcie_data_offset);
1172 	readl(pcie_data_offset);
1173 
1174 	/* clear the high bits */
1175 	if (pcie_index_hi != 0) {
1176 		writel(0, pcie_index_hi_offset);
1177 		readl(pcie_index_hi_offset);
1178 	}
1179 
1180 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1181 }
1182 
1183 /**
1184  * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register address
1185  *
1186  * @adev: amdgpu_device pointer
1187  * @reg_addr: indirect register offset
1188  * @reg_data: indirect register data
1189  *
1190  */
1191 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
1192 				   u32 reg_addr, u64 reg_data)
1193 {
1194 	unsigned long flags, pcie_index, pcie_data;
1195 	void __iomem *pcie_index_offset;
1196 	void __iomem *pcie_data_offset;
1197 
1198 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1199 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1200 
1201 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1202 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1203 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1204 
1205 	/* write low 32 bits */
1206 	writel(reg_addr, pcie_index_offset);
1207 	readl(pcie_index_offset);
1208 	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1209 	readl(pcie_data_offset);
1210 	/* write high 32 bits */
1211 	writel(reg_addr + 4, pcie_index_offset);
1212 	readl(pcie_index_offset);
1213 	writel((u32)(reg_data >> 32), pcie_data_offset);
1214 	readl(pcie_data_offset);
1215 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1216 }
1217 
1218 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
1219 				   u64 reg_addr, u64 reg_data)
1220 {
1221 	unsigned long flags, pcie_index, pcie_data;
1222 	unsigned long pcie_index_hi = 0;
1223 	void __iomem *pcie_index_offset;
1224 	void __iomem *pcie_index_hi_offset;
1225 	void __iomem *pcie_data_offset;
1226 
1227 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1228 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1229 	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1230 		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1231 
1232 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1233 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1234 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1235 	if (pcie_index_hi != 0)
1236 		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1237 				pcie_index_hi * 4;
1238 
1239 	/* write low 32 bits */
1240 	writel(reg_addr, pcie_index_offset);
1241 	readl(pcie_index_offset);
1242 	if (pcie_index_hi != 0) {
1243 		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1244 		readl(pcie_index_hi_offset);
1245 	}
1246 	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1247 	readl(pcie_data_offset);
1248 	/* write high 32 bits */
1249 	writel(reg_addr + 4, pcie_index_offset);
1250 	readl(pcie_index_offset);
1251 	if (pcie_index_hi != 0) {
1252 		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1253 		readl(pcie_index_hi_offset);
1254 	}
1255 	writel((u32)(reg_data >> 32), pcie_data_offset);
1256 	readl(pcie_data_offset);
1257 
1258 	/* clear the high bits */
1259 	if (pcie_index_hi != 0) {
1260 		writel(0, pcie_index_hi_offset);
1261 		readl(pcie_index_hi_offset);
1262 	}
1263 
1264 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1265 }
1266 
1267 /**
1268  * amdgpu_device_get_rev_id - query device rev_id
1269  *
1270  * @adev: amdgpu_device pointer
1271  *
1272  * Return device rev_id
1273  */
1274 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
1275 {
1276 	return adev->nbio.funcs->get_rev_id(adev);
1277 }
1278 
1279 /**
1280  * amdgpu_invalid_rreg - dummy reg read function
1281  *
1282  * @adev: amdgpu_device pointer
1283  * @reg: offset of register
1284  *
1285  * Dummy register read function.  Used for register blocks
1286  * that certain asics don't have (all asics).
1287  * Returns the value in the register.
1288  */
1289 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
1290 {
1291 	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
1292 	BUG();
1293 	return 0;
1294 }
1295 
1296 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
1297 {
1298 	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1299 	BUG();
1300 	return 0;
1301 }
1302 
1303 /**
1304  * amdgpu_invalid_wreg - dummy reg write function
1305  *
1306  * @adev: amdgpu_device pointer
1307  * @reg: offset of register
1308  * @v: value to write to the register
1309  *
1310  * Dummy register write function.  Used for register blocks
1311  * that certain asics don't have (all asics).
1312  */
1313 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
1314 {
1315 	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
1316 		  reg, v);
1317 	BUG();
1318 }
1319 
1320 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
1321 {
1322 	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
1323 		  reg, v);
1324 	BUG();
1325 }
1326 
1327 /**
1328  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
1329  *
1330  * @adev: amdgpu_device pointer
1331  * @reg: offset of register
1332  *
1333  * Dummy register read function.  Used for register blocks
1334  * that certain asics don't have (all asics).
1335  * Returns the value in the register.
1336  */
1337 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
1338 {
1339 	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
1340 	BUG();
1341 	return 0;
1342 }
1343 
1344 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
1345 {
1346 	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1347 	BUG();
1348 	return 0;
1349 }
1350 
1351 /**
1352  * amdgpu_invalid_wreg64 - dummy reg write function
1353  *
1354  * @adev: amdgpu_device pointer
1355  * @reg: offset of register
1356  * @v: value to write to the register
1357  *
1358  * Dummy register write function.  Used for register blocks
1359  * that certain asics don't have (all asics).
1360  */
1361 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
1362 {
1363 	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
1364 		  reg, v);
1365 	BUG();
1366 }
1367 
1368 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
1369 {
1370 	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
1371 		  reg, v);
1372 	BUG();
1373 }
1374 
1375 /**
1376  * amdgpu_block_invalid_rreg - dummy reg read function
1377  *
1378  * @adev: amdgpu_device pointer
1379  * @block: offset of instance
1380  * @reg: offset of register
1381  *
1382  * Dummy register read function.  Used for register blocks
1383  * that certain asics don't have (all asics).
1384  * Returns the value in the register.
1385  */
1386 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
1387 					  uint32_t block, uint32_t reg)
1388 {
1389 	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
1390 		  reg, block);
1391 	BUG();
1392 	return 0;
1393 }
1394 
1395 /**
1396  * amdgpu_block_invalid_wreg - dummy reg write function
1397  *
1398  * @adev: amdgpu_device pointer
1399  * @block: offset of instance
1400  * @reg: offset of register
1401  * @v: value to write to the register
1402  *
1403  * Dummy register write function.  Used for register blocks
1404  * that certain asics don't have (all asics).
1405  */
1406 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
1407 				      uint32_t block,
1408 				      uint32_t reg, uint32_t v)
1409 {
1410 	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
1411 		  reg, block, v);
1412 	BUG();
1413 }
1414 
1415 static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
1416 {
1417 	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1418 		return AMDGPU_VBIOS_SKIP;
1419 
1420 	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
1421 		return AMDGPU_VBIOS_OPTIONAL;
1422 
1423 	return 0;
1424 }
1425 
1426 /**
1427  * amdgpu_device_asic_init - Wrapper for atom asic_init
1428  *
1429  * @adev: amdgpu_device pointer
1430  *
1431  * Does any asic specific work and then calls atom asic init.
1432  */
1433 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
1434 {
1435 	uint32_t flags;
1436 	bool optional;
1437 	int ret;
1438 
1439 	amdgpu_asic_pre_asic_init(adev);
1440 	flags = amdgpu_device_get_vbios_flags(adev);
1441 	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));
1442 
1443 	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
1444 	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
1445 	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
1446 	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
1447 		amdgpu_psp_wait_for_bootloader(adev);
1448 		if (optional && !adev->bios)
1449 			return 0;
1450 
1451 		ret = amdgpu_atomfirmware_asic_init(adev, true);
1452 		return ret;
1453 	} else {
1454 		if (optional && !adev->bios)
1455 			return 0;
1456 
1457 		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
1458 	}
1459 
1460 	return 0;
1461 }
1462 
1463 /**
1464  * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
1465  *
1466  * @adev: amdgpu_device pointer
1467  *
1468  * Allocates a scratch page of VRAM for use by various things in the
1469  * driver.
1470  */
1471 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
1472 {
1473 	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
1474 				       AMDGPU_GEM_DOMAIN_VRAM |
1475 				       AMDGPU_GEM_DOMAIN_GTT,
1476 				       &adev->mem_scratch.robj,
1477 				       &adev->mem_scratch.gpu_addr,
1478 				       (void **)&adev->mem_scratch.ptr);
1479 }
1480 
1481 /**
1482  * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
1483  *
1484  * @adev: amdgpu_device pointer
1485  *
1486  * Frees the VRAM scratch page.
1487  */
1488 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
1489 {
1490 	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
1491 }
1492 
1493 /**
1494  * amdgpu_device_program_register_sequence - program an array of registers.
1495  *
1496  * @adev: amdgpu_device pointer
1497  * @registers: pointer to the register array
1498  * @array_size: size of the register array
1499  *
1500  * Programs an array of registers with and/or masks.
1501  * This is a helper for setting golden registers.
1502  */
1503 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
1504 					     const u32 *registers,
1505 					     const u32 array_size)
1506 {
1507 	u32 tmp, reg, and_mask, or_mask;
1508 	int i;
1509 
1510 	if (array_size % 3)
1511 		return;
1512 
1513 	for (i = 0; i < array_size; i += 3) {
1514 		reg = registers[i + 0];
1515 		and_mask = registers[i + 1];
1516 		or_mask = registers[i + 2];
1517 
1518 		if (and_mask == 0xffffffff) {
1519 			tmp = or_mask;
1520 		} else {
1521 			tmp = RREG32(reg);
1522 			tmp &= ~and_mask;
1523 			if (adev->family >= AMDGPU_FAMILY_AI)
1524 				tmp |= (or_mask & and_mask);
1525 			else
1526 				tmp |= or_mask;
1527 		}
1528 		WREG32(reg, tmp);
1529 	}
1530 }
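/*
 * Illustrative sketch only: the offsets and masks below are made up, not real
 * golden settings. The array is consumed as {offset, and_mask, or_mask}
 * triplets:
 *
 *	static const u32 example_golden_regs[] = {
 *		0x1234, 0xffffffff, 0x00000001,	(and_mask 0xffffffff: or_mask is written directly)
 *		0x5678, 0x0000000f, 0x00000004,	(read-modify-write of the low nibble)
 *	};
 *	amdgpu_device_program_register_sequence(adev, example_golden_regs,
 *						ARRAY_SIZE(example_golden_regs));
 */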
1531 
1532 /**
1533  * amdgpu_device_pci_config_reset - reset the GPU
1534  *
1535  * @adev: amdgpu_device pointer
1536  *
1537  * Resets the GPU using the pci config reset sequence.
1538  * Only applicable to asics prior to vega10.
1539  */
1540 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
1541 {
1542 	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1543 }
1544 
1545 /**
1546  * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1547  *
1548  * @adev: amdgpu_device pointer
1549  *
1550  * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1551  */
1552 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1553 {
1554 	return pci_reset_function(adev->pdev);
1555 }
1556 
1557 /*
1558  * amdgpu_device_wb_*()
1559  * Writeback is the method by which the GPU updates special pages in memory
1560  * with the status of certain GPU events (fences, ring pointers, etc.).
1561  */
1562 
1563 /**
1564  * amdgpu_device_wb_fini - Disable Writeback and free memory
1565  *
1566  * @adev: amdgpu_device pointer
1567  *
1568  * Disables Writeback and frees the Writeback memory (all asics).
1569  * Used at driver shutdown.
1570  */
1571 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1572 {
1573 	if (adev->wb.wb_obj) {
1574 		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1575 				      &adev->wb.gpu_addr,
1576 				      (void **)&adev->wb.wb);
1577 		adev->wb.wb_obj = NULL;
1578 	}
1579 }
1580 
1581 /**
1582  * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1583  *
1584  * @adev: amdgpu_device pointer
1585  *
1586  * Initializes writeback and allocates writeback memory (all asics).
1587  * Used at driver startup.
1588  * Returns 0 on success or a negative error code on failure.
1589  */
1590 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1591 {
1592 	int r;
1593 
1594 	if (adev->wb.wb_obj == NULL) {
1595 		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1596 		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1597 					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1598 					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
1599 					    (void **)&adev->wb.wb);
1600 		if (r) {
1601 			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1602 			return r;
1603 		}
1604 
1605 		adev->wb.num_wb = AMDGPU_MAX_WB;
1606 		memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1607 
1608 		/* clear wb memory */
1609 		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1610 	}
1611 
1612 	return 0;
1613 }
1614 
1615 /**
1616  * amdgpu_device_wb_get - Allocate a wb entry
1617  *
1618  * @adev: amdgpu_device pointer
1619  * @wb: wb index
1620  *
1621  * Allocate a wb slot for use by the driver (all asics).
1622  * Returns 0 on success or -EINVAL on failure.
1623  */
1624 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1625 {
1626 	unsigned long flags, offset;
1627 
1628 	spin_lock_irqsave(&adev->wb.lock, flags);
1629 	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1630 	if (offset < adev->wb.num_wb) {
1631 		__set_bit(offset, adev->wb.used);
1632 		spin_unlock_irqrestore(&adev->wb.lock, flags);
1633 		*wb = offset << 3; /* convert to dw offset */
1634 		return 0;
1635 	} else {
1636 		spin_unlock_irqrestore(&adev->wb.lock, flags);
1637 		return -EINVAL;
1638 	}
1639 }
1640 
1641 /**
1642  * amdgpu_device_wb_free - Free a wb entry
1643  *
1644  * @adev: amdgpu_device pointer
1645  * @wb: wb index
1646  *
1647  * Free a wb slot allocated for use by the driver (all asics)
1648  */
1649 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1650 {
1651 	unsigned long flags;
1652 
1653 	wb >>= 3;
1654 	spin_lock_irqsave(&adev->wb.lock, flags);
1655 	if (wb < adev->wb.num_wb)
1656 		__clear_bit(wb, adev->wb.used);
1657 	spin_unlock_irqrestore(&adev->wb.lock, flags);
1658 }
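/*
 * Minimal usage sketch (illustrative, not taken from this file): a caller
 * allocates a slot, derives the CPU/GPU addresses from the dword offset
 * returned in wb, and frees the slot when done.
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		volatile u32 *cpu_ptr = &adev->wb.wb[wb];
 *		u64 gpu_addr = adev->wb.gpu_addr + wb * 4;
 *
 *		... have the GPU write a status dword to gpu_addr,
 *		    poll it through cpu_ptr ...
 *
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */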
1659 
1660 /**
1661  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1662  *
1663  * @adev: amdgpu_device pointer
1664  *
1665  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1666  * to fail, but if any of the BARs is not accessible after the resize we abort
1667  * driver loading by returning -ENODEV.
1668  */
1669 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1670 {
1671 	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1672 	struct pci_bus *root;
1673 	struct resource *res;
1674 	unsigned int i;
1675 	u16 cmd;
1676 	int r;
1677 
1678 	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1679 		return 0;
1680 
1681 	/* Bypass for VF */
1682 	if (amdgpu_sriov_vf(adev))
1683 		return 0;
1684 
1685 	if (!amdgpu_rebar)
1686 		return 0;
1687 
1688 	/* resizing on Dell G5 SE platforms causes problems with runtime pm */
1689 	if ((amdgpu_runtime_pm != 0) &&
1690 	    adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
1691 	    adev->pdev->device == 0x731f &&
1692 	    adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
1693 		return 0;
1694 
1695 	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
1696 	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
1697 		DRM_WARN("System can't access extended configuration space, please check!!\n");
1698 
1699 	/* skip if the bios has already enabled large BAR */
1700 	if (adev->gmc.real_vram_size &&
1701 	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1702 		return 0;
1703 
1704 	/* Check if the root BUS has 64bit memory resources */
1705 	root = adev->pdev->bus;
1706 	while (root->parent)
1707 		root = root->parent;
1708 
1709 	pci_bus_for_each_resource(root, res, i) {
1710 		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1711 		    res->start > 0x100000000ull)
1712 			break;
1713 	}
1714 
1715 	/* Trying to resize is pointless without a root hub window above 4GB */
1716 	if (!res)
1717 		return 0;
1718 
1719 	/* Limit the BAR size to what is available */
1720 	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1721 			rbar_size);
1722 
1723 	/* Disable memory decoding while we change the BAR addresses and size */
1724 	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1725 	pci_write_config_word(adev->pdev, PCI_COMMAND,
1726 			      cmd & ~PCI_COMMAND_MEMORY);
1727 
1728 	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
1729 	amdgpu_doorbell_fini(adev);
1730 	if (adev->asic_type >= CHIP_BONAIRE)
1731 		pci_release_resource(adev->pdev, 2);
1732 
1733 	pci_release_resource(adev->pdev, 0);
1734 
1735 	r = pci_resize_resource(adev->pdev, 0, rbar_size);
1736 	if (r == -ENOSPC)
1737 		DRM_INFO("Not enough PCI address space for a large BAR.");
1738 	else if (r && r != -ENOTSUPP)
1739 		DRM_ERROR("Problem resizing BAR0 (%d).", r);
1740 
1741 	pci_assign_unassigned_bus_resources(adev->pdev->bus);
1742 
1743 	/* When the doorbell or fb BAR isn't available we have no chance of
1744 	 * using the device.
1745 	 */
1746 	r = amdgpu_doorbell_init(adev);
1747 	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1748 		return -ENODEV;
1749 
1750 	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1751 
1752 	return 0;
1753 }
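
/*
 * Editor's note (not in the original file): the resizable-BAR "size" values
 * used above are log2 of the BAR size in MiB under the standard PCIe ReBAR
 * encoding, so e.g. 8 GiB of VRAM maps to pci_rebar_bytes_to_size() == 13,
 * and the fls() expression simply clamps the request to the largest size
 * advertised by the device's ReBAR capability.
 */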
1754 
1755 /*
1756  * GPU helpers function.
1757  */
1758 /**
1759  * amdgpu_device_need_post - check if the hw need post or not
1760  *
1761  * @adev: amdgpu_device pointer
1762  *
1763  * Check if the asic has been initialized (all asics) at driver startup
1764  * or if a post is needed because a hw reset was performed.
1765  * Returns true if post is needed, false if not.
1766  */
1767 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1768 {
1769 	uint32_t reg, flags;
1770 
1771 	if (amdgpu_sriov_vf(adev))
1772 		return false;
1773 
1774 	flags = amdgpu_device_get_vbios_flags(adev);
1775 	if (flags & AMDGPU_VBIOS_SKIP)
1776 		return false;
1777 	if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
1778 		return false;
1779 
1780 	if (amdgpu_passthrough(adev)) {
1781 		/* for FIJI: In the whole-GPU pass-through virtualization case, after a VM
1782 		 * reboot some old smc fw still needs the driver to do a vPost, otherwise the
1783 		 * gpu hangs; smc fw versions above 22.15 don't have this flaw, so we force
1784 		 * a vPost for smc versions below 22.15
1785 		 */
1786 		if (adev->asic_type == CHIP_FIJI) {
1787 			int err;
1788 			uint32_t fw_ver;
1789 
1790 			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1791 			/* force vPost if error occurred */
1792 			if (err)
1793 				return true;
1794 
1795 			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1796 			release_firmware(adev->pm.fw);
1797 			if (fw_ver < 0x00160e00)
1798 				return true;
1799 		}
1800 	}
1801 
1802 	/* Don't post if we need to reset whole hive on init */
1803 	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
1804 		return false;
1805 
1806 	if (adev->has_hw_reset) {
1807 		adev->has_hw_reset = false;
1808 		return true;
1809 	}
1810 
1811 	/* bios scratch used on CIK+ */
1812 	if (adev->asic_type >= CHIP_BONAIRE)
1813 		return amdgpu_atombios_scratch_need_asic_init(adev);
1814 
1815 	/* check MEM_SIZE for older asics */
1816 	reg = amdgpu_asic_get_config_memsize(adev);
1817 
1818 	if ((reg != 0) && (reg != 0xffffffff))
1819 		return false;
1820 
1821 	return true;
1822 }
1823 
1824 /*
1825  * Check whether seamless boot is supported.
1826  *
1827  * So far we only support seamless boot on DCE 3.0 or later.
1828  * If users report that it works on older ASICS as well, we may
1829  * loosen this.
1830  */
1831 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
1832 {
1833 	switch (amdgpu_seamless) {
1834 	case -1:
1835 		break;
1836 	case 1:
1837 		return true;
1838 	case 0:
1839 		return false;
1840 	default:
1841 		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
1842 			  amdgpu_seamless);
1843 		return false;
1844 	}
1845 
1846 	if (!(adev->flags & AMD_IS_APU))
1847 		return false;
1848 
1849 	if (adev->mman.keep_stolen_vga_memory)
1850 		return false;
1851 
1852 	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
1853 }
1854 
1855 /*
1856  * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
1857  * don't support dynamic speed switching. Until we have confirmation from Intel
1858  * that a specific host supports it, it's safer that we keep it disabled for all.
1859  *
1860  * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1861  * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1862  */
1863 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
1864 {
1865 #if IS_ENABLED(CONFIG_X86)
1866 	struct cpuinfo_x86 *c = &cpu_data(0);
1867 
1868 	/* eGPU change speeds based on USB4 fabric conditions */
1869 	if (dev_is_removable(adev->dev))
1870 		return true;
1871 
1872 	if (c->x86_vendor == X86_VENDOR_INTEL)
1873 		return false;
1874 #endif
1875 	return true;
1876 }
1877 
1878 static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
1879 {
1880 #if IS_ENABLED(CONFIG_X86)
1881 	struct cpuinfo_x86 *c = &cpu_data(0);
1882 
1883 	if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) ||
1884 		  amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1)))
1885 		return false;
1886 
1887 	if (c->x86 == 6 &&
1888 		adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) {
1889 		switch (c->x86_model) {
1890 		case VFM_MODEL(INTEL_ALDERLAKE):
1891 		case VFM_MODEL(INTEL_ALDERLAKE_L):
1892 		case VFM_MODEL(INTEL_RAPTORLAKE):
1893 		case VFM_MODEL(INTEL_RAPTORLAKE_P):
1894 		case VFM_MODEL(INTEL_RAPTORLAKE_S):
1895 			return true;
1896 		default:
1897 			return false;
1898 		}
1899 	} else {
1900 		return false;
1901 	}
1902 #else
1903 	return false;
1904 #endif
1905 }
1906 
1907 /**
1908  * amdgpu_device_should_use_aspm - check if the device should program ASPM
1909  *
1910  * @adev: amdgpu_device pointer
1911  *
1912  * Confirm whether the module parameter and pcie bridge agree that ASPM should
1913  * be set for this device.
1914  *
1915  * Returns true if it should be used or false if not.
1916  */
1917 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1918 {
1919 	switch (amdgpu_aspm) {
1920 	case -1:
1921 		break;
1922 	case 0:
1923 		return false;
1924 	case 1:
1925 		return true;
1926 	default:
1927 		return false;
1928 	}
1929 	if (adev->flags & AMD_IS_APU)
1930 		return false;
1931 	if (amdgpu_device_aspm_support_quirk(adev))
1932 		return false;
1933 	return pcie_aspm_enabled(adev->pdev);
1934 }
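
/*
 * Editor's note (not in the original file): amdgpu.aspm follows the usual
 * tri-state convention handled above: 0 forces ASPM off, 1 forces it on,
 * and the default of -1 defers to the platform, enabling ASPM only when it
 * is not quirked off for this device and the PCIe bridge already has it
 * enabled.
 */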
1935 
1936 /* if we get transitioned to only one device, take VGA back */
1937 /**
1938  * amdgpu_device_vga_set_decode - enable/disable vga decode
1939  *
1940  * @pdev: PCI device pointer
1941  * @state: enable/disable vga decode
1942  *
1943  * Enable/disable vga decode (all asics).
1944  * Returns VGA resource flags.
1945  */
1946 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1947 		bool state)
1948 {
1949 	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1950 
1951 	amdgpu_asic_set_vga_state(adev, state);
1952 	if (state)
1953 		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1954 		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1955 	else
1956 		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1957 }
1958 
1959 /**
1960  * amdgpu_device_check_block_size - validate the vm block size
1961  *
1962  * @adev: amdgpu_device pointer
1963  *
1964  * Validates the vm block size specified via module parameter.
1965  * The vm block size defines number of bits in page table versus page directory,
1966  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1967  * page table and the remaining bits are in the page directory.
1968  */
1969 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1970 {
1971 	/* defines number of bits in page table versus page directory,
1972 	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1973 	 * page table and the remaining bits are in the page directory
1974 	 */
1975 	if (amdgpu_vm_block_size == -1)
1976 		return;
1977 
1978 	if (amdgpu_vm_block_size < 9) {
1979 		dev_warn(adev->dev, "VM page table size (%d) too small\n",
1980 			 amdgpu_vm_block_size);
1981 		amdgpu_vm_block_size = -1;
1982 	}
1983 }
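
/*
 * Editor's worked example (not in the original file): with 4 KiB pages the
 * low 12 bits address bytes within a page, so the minimum vm_block_size of
 * 9 enforced above yields 2^9 = 512 page-table entries per block; larger
 * values move more translation bits into the page table and leave fewer
 * for the page directory.
 */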
1984 
1985 /**
1986  * amdgpu_device_check_vm_size - validate the vm size
1987  *
1988  * @adev: amdgpu_device pointer
1989  *
1990  * Validates the vm size in GB specified via module parameter.
1991  * The VM size is the size of the GPU virtual memory space in GB.
1992  */
1993 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1994 {
1995 	/* no need to check the default value */
1996 	if (amdgpu_vm_size == -1)
1997 		return;
1998 
1999 	if (amdgpu_vm_size < 1) {
2000 		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
2001 			 amdgpu_vm_size);
2002 		amdgpu_vm_size = -1;
2003 	}
2004 }
2005 
2006 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
2007 {
2008 	struct sysinfo si;
2009 	bool is_os_64 = (sizeof(void *) == 8);
2010 	uint64_t total_memory;
2011 	uint64_t dram_size_seven_GB = 0x1B8000000;
2012 	uint64_t dram_size_three_GB = 0xB8000000;
2013 
2014 	if (amdgpu_smu_memory_pool_size == 0)
2015 		return;
2016 
2017 	if (!is_os_64) {
2018 		DRM_WARN("Not 64-bit OS, feature not supported\n");
2019 		goto def_value;
2020 	}
2021 	si_meminfo(&si);
2022 	total_memory = (uint64_t)si.totalram * si.mem_unit;
2023 
2024 	if ((amdgpu_smu_memory_pool_size == 1) ||
2025 		(amdgpu_smu_memory_pool_size == 2)) {
2026 		if (total_memory < dram_size_three_GB)
2027 			goto def_value1;
2028 	} else if ((amdgpu_smu_memory_pool_size == 4) ||
2029 		(amdgpu_smu_memory_pool_size == 8)) {
2030 		if (total_memory < dram_size_seven_GB)
2031 			goto def_value1;
2032 	} else {
2033 		DRM_WARN("Smu memory pool size not supported\n");
2034 		goto def_value;
2035 	}
2036 	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
2037 
2038 	return;
2039 
2040 def_value1:
2041 	DRM_WARN("No enough system memory\n");
2042 def_value:
2043 	adev->pm.smu_prv_buffer_size = 0;
2044 }
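
/*
 * Editor's worked example (not in the original file): the module parameter
 * is expressed in units of 256 MiB (hence the "<< 28"), so the accepted
 * values 1/2/4/8 request a 256 MiB, 512 MiB, 1 GiB or 2 GiB SMU pool, gated
 * on roughly 3 GiB or 7 GiB of installed system memory respectively.
 */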
2045 
2046 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
2047 {
2048 	if (!(adev->flags & AMD_IS_APU) ||
2049 	    adev->asic_type < CHIP_RAVEN)
2050 		return 0;
2051 
2052 	switch (adev->asic_type) {
2053 	case CHIP_RAVEN:
2054 		if (adev->pdev->device == 0x15dd)
2055 			adev->apu_flags |= AMD_APU_IS_RAVEN;
2056 		if (adev->pdev->device == 0x15d8)
2057 			adev->apu_flags |= AMD_APU_IS_PICASSO;
2058 		break;
2059 	case CHIP_RENOIR:
2060 		if ((adev->pdev->device == 0x1636) ||
2061 		    (adev->pdev->device == 0x164c))
2062 			adev->apu_flags |= AMD_APU_IS_RENOIR;
2063 		else
2064 			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
2065 		break;
2066 	case CHIP_VANGOGH:
2067 		adev->apu_flags |= AMD_APU_IS_VANGOGH;
2068 		break;
2069 	case CHIP_YELLOW_CARP:
2070 		break;
2071 	case CHIP_CYAN_SKILLFISH:
2072 		if ((adev->pdev->device == 0x13FE) ||
2073 		    (adev->pdev->device == 0x143F))
2074 			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
2075 		break;
2076 	default:
2077 		break;
2078 	}
2079 
2080 	return 0;
2081 }
2082 
2083 /**
2084  * amdgpu_device_check_arguments - validate module params
2085  *
2086  * @adev: amdgpu_device pointer
2087  *
2088  * Validates certain module parameters and updates
2089  * the associated values used by the driver (all asics).
2090  */
2091 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
2092 {
2093 	int i;
2094 
2095 	if (amdgpu_sched_jobs < 4) {
2096 		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
2097 			 amdgpu_sched_jobs);
2098 		amdgpu_sched_jobs = 4;
2099 	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
2100 		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
2101 			 amdgpu_sched_jobs);
2102 		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
2103 	}
2104 
2105 	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
2106 		/* gart size must be greater or equal to 32M */
2107 		dev_warn(adev->dev, "gart size (%d) too small\n",
2108 			 amdgpu_gart_size);
2109 		amdgpu_gart_size = -1;
2110 	}
2111 
2112 	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
2113 		/* gtt size must be greater or equal to 32M */
2114 		dev_warn(adev->dev, "gtt size (%d) too small\n",
2115 				 amdgpu_gtt_size);
2116 		amdgpu_gtt_size = -1;
2117 	}
2118 
2119 	/* valid range is between 4 and 9 inclusive */
2120 	if (amdgpu_vm_fragment_size != -1 &&
2121 	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
2122 		dev_warn(adev->dev, "valid range is between 4 and 9\n");
2123 		amdgpu_vm_fragment_size = -1;
2124 	}
2125 
2126 	if (amdgpu_sched_hw_submission < 2) {
2127 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
2128 			 amdgpu_sched_hw_submission);
2129 		amdgpu_sched_hw_submission = 2;
2130 	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
2131 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
2132 			 amdgpu_sched_hw_submission);
2133 		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
2134 	}
2135 
2136 	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
2137 		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
2138 		amdgpu_reset_method = -1;
2139 	}
2140 
2141 	amdgpu_device_check_smu_prv_buffer_size(adev);
2142 
2143 	amdgpu_device_check_vm_size(adev);
2144 
2145 	amdgpu_device_check_block_size(adev);
2146 
2147 	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
2148 
2149 	for (i = 0; i < MAX_XCP; i++) {
2150 		switch (amdgpu_enforce_isolation) {
2151 		case -1:
2152 		case 0:
2153 		default:
2154 			/* disable */
2155 			adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE;
2156 			break;
2157 		case 1:
2158 			/* enable */
2159 			adev->enforce_isolation[i] =
2160 				AMDGPU_ENFORCE_ISOLATION_ENABLE;
2161 			break;
2162 		case 2:
2163 			/* enable legacy mode */
2164 			adev->enforce_isolation[i] =
2165 				AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY;
2166 			break;
2167 		case 3:
2168 			/* enable only process isolation without submitting cleaner shader */
2169 			adev->enforce_isolation[i] =
2170 				AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER;
2171 			break;
2172 		}
2173 	}
2174 
2175 	return 0;
2176 }
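
/*
 * Editor's note (not in the original file): non-power-of-two submission
 * counts are rounded up rather than rejected, e.g. amdgpu.sched_jobs=6
 * becomes 8 and amdgpu.sched_hw_submission=3 becomes 4, while out-of-range
 * values such as amdgpu.gart_size=16 simply fall back to the -1 "auto"
 * default.
 */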
2177 
2178 /**
2179  * amdgpu_switcheroo_set_state - set switcheroo state
2180  *
2181  * @pdev: pci dev pointer
2182  * @state: vga_switcheroo state
2183  *
2184  * Callback for the switcheroo driver.  Suspends or resumes
2185  * the asics before or after it is powered up using ACPI methods.
2186  */
2187 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
2188 					enum vga_switcheroo_state state)
2189 {
2190 	struct drm_device *dev = pci_get_drvdata(pdev);
2191 	int r;
2192 
2193 	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
2194 		return;
2195 
2196 	if (state == VGA_SWITCHEROO_ON) {
2197 		pr_info("switched on\n");
2198 		/* don't suspend or resume card normally */
2199 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2200 
2201 		pci_set_power_state(pdev, PCI_D0);
2202 		amdgpu_device_load_pci_state(pdev);
2203 		r = pci_enable_device(pdev);
2204 		if (r)
2205 			DRM_WARN("pci_enable_device failed (%d)\n", r);
2206 		amdgpu_device_resume(dev, true);
2207 
2208 		dev->switch_power_state = DRM_SWITCH_POWER_ON;
2209 	} else {
2210 		pr_info("switched off\n");
2211 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2212 		amdgpu_device_prepare(dev);
2213 		amdgpu_device_suspend(dev, true);
2214 		amdgpu_device_cache_pci_state(pdev);
2215 		/* Shut down the device */
2216 		pci_disable_device(pdev);
2217 		pci_set_power_state(pdev, PCI_D3cold);
2218 		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
2219 	}
2220 }
2221 
2222 /**
2223  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
2224  *
2225  * @pdev: pci dev pointer
2226  *
2227  * Callback for the switcheroo driver.  Check if the switcheroo
2228  * state can be changed.
2229  * Returns true if the state can be changed, false if not.
2230  */
2231 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
2232 {
2233 	struct drm_device *dev = pci_get_drvdata(pdev);
2234 
2235        /*
2236 	* FIXME: open_count is protected by drm_global_mutex but that would lead to
2237 	* locking inversion with the driver load path. And the access here is
2238 	* completely racy anyway. So don't bother with locking for now.
2239 	*/
2240 	return atomic_read(&dev->open_count) == 0;
2241 }
2242 
2243 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
2244 	.set_gpu_state = amdgpu_switcheroo_set_state,
2245 	.reprobe = NULL,
2246 	.can_switch = amdgpu_switcheroo_can_switch,
2247 };
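
/*
 * Editor's note (not in the original file): these ops are registered with
 * the vga_switcheroo framework elsewhere during device init (presumably via
 * vga_switcheroo_register_client()) so that hybrid-graphics platforms can
 * power the dGPU on and off using the suspend/resume paths above.
 */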
2248 
2249 /**
2250  * amdgpu_device_ip_set_clockgating_state - set the CG state
2251  *
2252  * @dev: amdgpu_device pointer
2253  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2254  * @state: clockgating state (gate or ungate)
2255  *
2256  * Sets the requested clockgating state for all instances of
2257  * the hardware IP specified.
2258  * Returns the error code from the last instance.
2259  */
2260 int amdgpu_device_ip_set_clockgating_state(void *dev,
2261 					   enum amd_ip_block_type block_type,
2262 					   enum amd_clockgating_state state)
2263 {
2264 	struct amdgpu_device *adev = dev;
2265 	int i, r = 0;
2266 
2267 	for (i = 0; i < adev->num_ip_blocks; i++) {
2268 		if (!adev->ip_blocks[i].status.valid)
2269 			continue;
2270 		if (adev->ip_blocks[i].version->type != block_type)
2271 			continue;
2272 		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
2273 			continue;
2274 		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
2275 			&adev->ip_blocks[i], state);
2276 		if (r)
2277 			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
2278 				  adev->ip_blocks[i].version->funcs->name, r);
2279 	}
2280 	return r;
2281 }
2282 
2283 /**
2284  * amdgpu_device_ip_set_powergating_state - set the PG state
2285  *
2286  * @dev: amdgpu_device pointer
2287  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2288  * @state: powergating state (gate or ungate)
2289  *
2290  * Sets the requested powergating state for all instances of
2291  * the hardware IP specified.
2292  * Returns the error code from the last instance.
2293  */
2294 int amdgpu_device_ip_set_powergating_state(void *dev,
2295 					   enum amd_ip_block_type block_type,
2296 					   enum amd_powergating_state state)
2297 {
2298 	struct amdgpu_device *adev = dev;
2299 	int i, r = 0;
2300 
2301 	for (i = 0; i < adev->num_ip_blocks; i++) {
2302 		if (!adev->ip_blocks[i].status.valid)
2303 			continue;
2304 		if (adev->ip_blocks[i].version->type != block_type)
2305 			continue;
2306 		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
2307 			continue;
2308 		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
2309 			&adev->ip_blocks[i], state);
2310 		if (r)
2311 			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
2312 				  adev->ip_blocks[i].version->funcs->name, r);
2313 	}
2314 	return r;
2315 }
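
/*
 * Editor's illustrative sketch (not part of the original file): power
 * management code typically gates or ungates an engine type through this
 * helper, e.g. gating every VCN instance at once.  The wrapper name
 * example_gate_vcn() is hypothetical.
 */
static int __maybe_unused example_gate_vcn(struct amdgpu_device *adev)
{
	return amdgpu_device_ip_set_powergating_state(adev,
						      AMD_IP_BLOCK_TYPE_VCN,
						      AMD_PG_STATE_GATE);
}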
2316 
2317 /**
2318  * amdgpu_device_ip_get_clockgating_state - get the CG state
2319  *
2320  * @adev: amdgpu_device pointer
2321  * @flags: clockgating feature flags
2322  *
2323  * Walks the list of IPs on the device and updates the clockgating
2324  * flags for each IP.
2325  * Updates @flags with the feature flags for each hardware IP where
2326  * clockgating is enabled.
2327  */
2328 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
2329 					    u64 *flags)
2330 {
2331 	int i;
2332 
2333 	for (i = 0; i < adev->num_ip_blocks; i++) {
2334 		if (!adev->ip_blocks[i].status.valid)
2335 			continue;
2336 		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
2337 			adev->ip_blocks[i].version->funcs->get_clockgating_state(
2338 				&adev->ip_blocks[i], flags);
2339 	}
2340 }
2341 
2342 /**
2343  * amdgpu_device_ip_wait_for_idle - wait for idle
2344  *
2345  * @adev: amdgpu_device pointer
2346  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2347  *
2348  * Waits for the requested hardware IP to be idle.
2349  * Returns 0 for success or a negative error code on failure.
2350  */
2351 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
2352 				   enum amd_ip_block_type block_type)
2353 {
2354 	int i, r;
2355 
2356 	for (i = 0; i < adev->num_ip_blocks; i++) {
2357 		if (!adev->ip_blocks[i].status.valid)
2358 			continue;
2359 		if (adev->ip_blocks[i].version->type == block_type) {
2360 			if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
2361 				r = adev->ip_blocks[i].version->funcs->wait_for_idle(
2362 								&adev->ip_blocks[i]);
2363 				if (r)
2364 					return r;
2365 			}
2366 			break;
2367 		}
2368 	}
2369 	return 0;
2370 
2371 }
2372 
2373 /**
2374  * amdgpu_device_ip_is_valid - is the hardware IP enabled
2375  *
2376  * @adev: amdgpu_device pointer
2377  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2378  *
2379  * Check if the hardware IP is enabled or not.
2380  * Returns true if the IP is enabled, false if not.
2381  */
2382 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
2383 			       enum amd_ip_block_type block_type)
2384 {
2385 	int i;
2386 
2387 	for (i = 0; i < adev->num_ip_blocks; i++) {
2388 		if (adev->ip_blocks[i].version->type == block_type)
2389 			return adev->ip_blocks[i].status.valid;
2390 	}
2391 	return false;
2392 
2393 }
2394 
2395 /**
2396  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
2397  *
2398  * @adev: amdgpu_device pointer
2399  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
2400  *
2401  * Returns a pointer to the hardware IP block structure
2402  * if it exists for the asic, otherwise NULL.
2403  */
2404 struct amdgpu_ip_block *
2405 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
2406 			      enum amd_ip_block_type type)
2407 {
2408 	int i;
2409 
2410 	for (i = 0; i < adev->num_ip_blocks; i++)
2411 		if (adev->ip_blocks[i].version->type == type)
2412 			return &adev->ip_blocks[i];
2413 
2414 	return NULL;
2415 }
2416 
2417 /**
2418  * amdgpu_device_ip_block_version_cmp
2419  *
2420  * @adev: amdgpu_device pointer
2421  * @type: enum amd_ip_block_type
2422  * @major: major version
2423  * @minor: minor version
2424  *
2425  * return 0 if equal or greater
2426  * return 1 if smaller or the ip_block doesn't exist
2427  */
2428 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
2429 				       enum amd_ip_block_type type,
2430 				       u32 major, u32 minor)
2431 {
2432 	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
2433 
2434 	if (ip_block && ((ip_block->version->major > major) ||
2435 			((ip_block->version->major == major) &&
2436 			(ip_block->version->minor >= minor))))
2437 		return 0;
2438 
2439 	return 1;
2440 }
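
/*
 * Editor's illustrative sketch (not part of the original file): callers use
 * the comparison helper to gate features on a minimum IP version, e.g.
 * requiring GFX 8.1 or newer; 0 means "new enough".  The wrapper name
 * example_has_gfx_8_1() is hypothetical.
 */
static bool __maybe_unused example_has_gfx_8_1(struct amdgpu_device *adev)
{
	return amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX,
						  8, 1) == 0;
}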
2441 
2442 /**
2443  * amdgpu_device_ip_block_add
2444  *
2445  * @adev: amdgpu_device pointer
2446  * @ip_block_version: pointer to the IP to add
2447  *
2448  * Adds the IP block driver information to the collection of IPs
2449  * on the asic.
2450  */
2451 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
2452 			       const struct amdgpu_ip_block_version *ip_block_version)
2453 {
2454 	if (!ip_block_version)
2455 		return -EINVAL;
2456 
2457 	switch (ip_block_version->type) {
2458 	case AMD_IP_BLOCK_TYPE_VCN:
2459 		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
2460 			return 0;
2461 		break;
2462 	case AMD_IP_BLOCK_TYPE_JPEG:
2463 		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
2464 			return 0;
2465 		break;
2466 	default:
2467 		break;
2468 	}
2469 
2470 	dev_info(adev->dev, "detected ip block number %d <%s>\n",
2471 		 adev->num_ip_blocks, ip_block_version->funcs->name);
2472 
2473 	adev->ip_blocks[adev->num_ip_blocks].adev = adev;
2474 
2475 	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
2476 
2477 	return 0;
2478 }
2479 
2480 /**
2481  * amdgpu_device_enable_virtual_display - enable virtual display feature
2482  *
2483  * @adev: amdgpu_device pointer
2484  *
2485  * Enables the virtual display feature if the user has enabled it via
2486  * the module parameter virtual_display.  This feature provides a virtual
2487  * display hardware on headless boards or in virtualized environments.
2488  * This function parses and validates the configuration string specified by
2489  * the user and configures the virtual display configuration (number of
2490  * virtual connectors, crtcs, etc.) specified.
2491  */
2492 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
2493 {
2494 	adev->enable_virtual_display = false;
2495 
2496 	if (amdgpu_virtual_display) {
2497 		const char *pci_address_name = pci_name(adev->pdev);
2498 		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
2499 
2500 		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
2501 		pciaddstr_tmp = pciaddstr;
2502 		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
2503 			pciaddname = strsep(&pciaddname_tmp, ",");
2504 			if (!strcmp("all", pciaddname)
2505 			    || !strcmp(pci_address_name, pciaddname)) {
2506 				long num_crtc;
2507 				int res = -1;
2508 
2509 				adev->enable_virtual_display = true;
2510 
2511 				if (pciaddname_tmp)
2512 					res = kstrtol(pciaddname_tmp, 10,
2513 						      &num_crtc);
2514 
2515 				if (!res) {
2516 					if (num_crtc < 1)
2517 						num_crtc = 1;
2518 					if (num_crtc > 6)
2519 						num_crtc = 6;
2520 					adev->mode_info.num_crtc = num_crtc;
2521 				} else {
2522 					adev->mode_info.num_crtc = 1;
2523 				}
2524 				break;
2525 			}
2526 		}
2527 
2528 		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
2529 			 amdgpu_virtual_display, pci_address_name,
2530 			 adev->enable_virtual_display, adev->mode_info.num_crtc);
2531 
2532 		kfree(pciaddstr);
2533 	}
2534 }
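
/*
 * Editor's note (not in the original file): the parser above expects
 * amdgpu.virtual_display to be a ';'-separated list of
 * "<pci-address>[,<num_crtc>]" entries, e.g. "0000:03:00.0,2" for two
 * virtual CRTCs on one device or "all,1" for a single virtual CRTC on every
 * amdgpu device; the CRTC count is clamped to the 1..6 range handled above.
 */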
2535 
2536 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
2537 {
2538 	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
2539 		adev->mode_info.num_crtc = 1;
2540 		adev->enable_virtual_display = true;
2541 		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
2542 			 adev->enable_virtual_display, adev->mode_info.num_crtc);
2543 	}
2544 }
2545 
2546 /**
2547  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
2548  *
2549  * @adev: amdgpu_device pointer
2550  *
2551  * Parses the asic configuration parameters specified in the gpu info
2552  * firmware and makes them available to the driver for use in configuring
2553  * the asic.
2554  * Returns 0 on success, -EINVAL on failure.
2555  */
2556 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
2557 {
2558 	const char *chip_name;
2559 	int err;
2560 	const struct gpu_info_firmware_header_v1_0 *hdr;
2561 
2562 	adev->firmware.gpu_info_fw = NULL;
2563 
2564 	if (adev->mman.discovery_bin)
2565 		return 0;
2566 
2567 	switch (adev->asic_type) {
2568 	default:
2569 		return 0;
2570 	case CHIP_VEGA10:
2571 		chip_name = "vega10";
2572 		break;
2573 	case CHIP_VEGA12:
2574 		chip_name = "vega12";
2575 		break;
2576 	case CHIP_RAVEN:
2577 		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
2578 			chip_name = "raven2";
2579 		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
2580 			chip_name = "picasso";
2581 		else
2582 			chip_name = "raven";
2583 		break;
2584 	case CHIP_ARCTURUS:
2585 		chip_name = "arcturus";
2586 		break;
2587 	case CHIP_NAVI12:
2588 		chip_name = "navi12";
2589 		break;
2590 	}
2591 
2592 	err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
2593 				   AMDGPU_UCODE_OPTIONAL,
2594 				   "amdgpu/%s_gpu_info.bin", chip_name);
2595 	if (err) {
2596 		dev_err(adev->dev,
2597 			"Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n",
2598 			chip_name);
2599 		goto out;
2600 	}
2601 
2602 	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
2603 	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2604 
2605 	switch (hdr->version_major) {
2606 	case 1:
2607 	{
2608 		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2609 			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2610 								le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2611 
2612 		/*
2613 		 * Should be dropped when DAL no longer needs it.
2614 		 */
2615 		if (adev->asic_type == CHIP_NAVI12)
2616 			goto parse_soc_bounding_box;
2617 
2618 		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2619 		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2620 		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2621 		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2622 		adev->gfx.config.max_texture_channel_caches =
2623 			le32_to_cpu(gpu_info_fw->gc_num_tccs);
2624 		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2625 		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2626 		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2627 		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2628 		adev->gfx.config.double_offchip_lds_buf =
2629 			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2630 		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2631 		adev->gfx.cu_info.max_waves_per_simd =
2632 			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2633 		adev->gfx.cu_info.max_scratch_slots_per_cu =
2634 			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2635 		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2636 		if (hdr->version_minor >= 1) {
2637 			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2638 				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2639 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2640 			adev->gfx.config.num_sc_per_sh =
2641 				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2642 			adev->gfx.config.num_packer_per_sc =
2643 				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2644 		}
2645 
2646 parse_soc_bounding_box:
2647 		/*
2648 		 * soc bounding box info is not integrated into the discovery table,
2649 		 * so we always need to parse it from the gpu info firmware if needed.
2650 		 */
2651 		if (hdr->version_minor == 2) {
2652 			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2653 				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2654 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2655 			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2656 		}
2657 		break;
2658 	}
2659 	default:
2660 		dev_err(adev->dev,
2661 			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2662 		err = -EINVAL;
2663 		goto out;
2664 	}
2665 out:
2666 	return err;
2667 }
2668 
2669 /**
2670  * amdgpu_device_ip_early_init - run early init for hardware IPs
2671  *
2672  * @adev: amdgpu_device pointer
2673  *
2674  * Early initialization pass for hardware IPs.  The hardware IPs that make
2675  * up each asic are discovered and each IP's early_init callback is run.  This
2676  * is the first stage in initializing the asic.
2677  * Returns 0 on success, negative error code on failure.
2678  */
2679 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2680 {
2681 	struct amdgpu_ip_block *ip_block;
2682 	struct pci_dev *parent;
2683 	bool total, skip_bios;
2684 	uint32_t bios_flags;
2685 	int i, r;
2686 
2687 	amdgpu_device_enable_virtual_display(adev);
2688 
2689 	if (amdgpu_sriov_vf(adev)) {
2690 		r = amdgpu_virt_request_full_gpu(adev, true);
2691 		if (r)
2692 			return r;
2693 	}
2694 
2695 	switch (adev->asic_type) {
2696 #ifdef CONFIG_DRM_AMDGPU_SI
2697 	case CHIP_VERDE:
2698 	case CHIP_TAHITI:
2699 	case CHIP_PITCAIRN:
2700 	case CHIP_OLAND:
2701 	case CHIP_HAINAN:
2702 		adev->family = AMDGPU_FAMILY_SI;
2703 		r = si_set_ip_blocks(adev);
2704 		if (r)
2705 			return r;
2706 		break;
2707 #endif
2708 #ifdef CONFIG_DRM_AMDGPU_CIK
2709 	case CHIP_BONAIRE:
2710 	case CHIP_HAWAII:
2711 	case CHIP_KAVERI:
2712 	case CHIP_KABINI:
2713 	case CHIP_MULLINS:
2714 		if (adev->flags & AMD_IS_APU)
2715 			adev->family = AMDGPU_FAMILY_KV;
2716 		else
2717 			adev->family = AMDGPU_FAMILY_CI;
2718 
2719 		r = cik_set_ip_blocks(adev);
2720 		if (r)
2721 			return r;
2722 		break;
2723 #endif
2724 	case CHIP_TOPAZ:
2725 	case CHIP_TONGA:
2726 	case CHIP_FIJI:
2727 	case CHIP_POLARIS10:
2728 	case CHIP_POLARIS11:
2729 	case CHIP_POLARIS12:
2730 	case CHIP_VEGAM:
2731 	case CHIP_CARRIZO:
2732 	case CHIP_STONEY:
2733 		if (adev->flags & AMD_IS_APU)
2734 			adev->family = AMDGPU_FAMILY_CZ;
2735 		else
2736 			adev->family = AMDGPU_FAMILY_VI;
2737 
2738 		r = vi_set_ip_blocks(adev);
2739 		if (r)
2740 			return r;
2741 		break;
2742 	default:
2743 		r = amdgpu_discovery_set_ip_blocks(adev);
2744 		if (r)
2745 			return r;
2746 		break;
2747 	}
2748 
2749 	/* Check for IP version 9.4.3 with A0 hardware */
2750 	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) &&
2751 	    !amdgpu_device_get_rev_id(adev)) {
2752 		dev_err(adev->dev, "Unsupported A0 hardware\n");
2753 		return -ENODEV;	/* device unsupported - no device error */
2754 	}
2755 
2756 	if (amdgpu_has_atpx() &&
2757 	    (amdgpu_is_atpx_hybrid() ||
2758 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
2759 	    ((adev->flags & AMD_IS_APU) == 0) &&
2760 	    !dev_is_removable(&adev->pdev->dev))
2761 		adev->flags |= AMD_IS_PX;
2762 
2763 	if (!(adev->flags & AMD_IS_APU)) {
2764 		parent = pcie_find_root_port(adev->pdev);
2765 		adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2766 	}
2767 
2768 	adev->pm.pp_feature = amdgpu_pp_feature_mask;
2769 	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2770 		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2771 	if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2772 		adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2773 	if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
2774 		adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2775 
2776 	total = true;
2777 	for (i = 0; i < adev->num_ip_blocks; i++) {
2778 		ip_block = &adev->ip_blocks[i];
2779 
2780 		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2781 			DRM_WARN("disabled ip block: %d <%s>\n",
2782 				  i, adev->ip_blocks[i].version->funcs->name);
2783 			adev->ip_blocks[i].status.valid = false;
2784 		} else if (ip_block->version->funcs->early_init) {
2785 			r = ip_block->version->funcs->early_init(ip_block);
2786 			if (r == -ENOENT) {
2787 				adev->ip_blocks[i].status.valid = false;
2788 			} else if (r) {
2789 				DRM_ERROR("early_init of IP block <%s> failed %d\n",
2790 					  adev->ip_blocks[i].version->funcs->name, r);
2791 				total = false;
2792 			} else {
2793 				adev->ip_blocks[i].status.valid = true;
2794 			}
2795 		} else {
2796 			adev->ip_blocks[i].status.valid = true;
2797 		}
2798 		/* get the vbios after the asic_funcs are set up */
2799 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2800 			r = amdgpu_device_parse_gpu_info_fw(adev);
2801 			if (r)
2802 				return r;
2803 
2804 			bios_flags = amdgpu_device_get_vbios_flags(adev);
2805 			skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP);
2806 			/* Read BIOS */
2807 			if (!skip_bios) {
2808 				bool optional =
2809 					!!(bios_flags & AMDGPU_VBIOS_OPTIONAL);
2810 				if (!amdgpu_get_bios(adev) && !optional)
2811 					return -EINVAL;
2812 
2813 				if (optional && !adev->bios)
2814 					dev_info(
2815 						adev->dev,
2816 						"VBIOS image optional, proceeding without VBIOS image");
2817 
2818 				if (adev->bios) {
2819 					r = amdgpu_atombios_init(adev);
2820 					if (r) {
2821 						dev_err(adev->dev,
2822 							"amdgpu_atombios_init failed\n");
2823 						amdgpu_vf_error_put(
2824 							adev,
2825 							AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL,
2826 							0, 0);
2827 						return r;
2828 					}
2829 				}
2830 			}
2831 
2832 			/* get pf2vf msg info at its earliest time */
2833 			if (amdgpu_sriov_vf(adev))
2834 				amdgpu_virt_init_data_exchange(adev);
2835 
2836 		}
2837 	}
2838 	if (!total)
2839 		return -ENODEV;
2840 
2841 	if (adev->gmc.xgmi.supported)
2842 		amdgpu_xgmi_early_init(adev);
2843 
2844 	ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
2845 	if (ip_block->status.valid != false)
2846 		amdgpu_amdkfd_device_probe(adev);
2847 
2848 	adev->cg_flags &= amdgpu_cg_mask;
2849 	adev->pg_flags &= amdgpu_pg_mask;
2850 
2851 	return 0;
2852 }
2853 
2854 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2855 {
2856 	int i, r;
2857 
2858 	for (i = 0; i < adev->num_ip_blocks; i++) {
2859 		if (!adev->ip_blocks[i].status.sw)
2860 			continue;
2861 		if (adev->ip_blocks[i].status.hw)
2862 			continue;
2863 		if (!amdgpu_ip_member_of_hwini(
2864 			    adev, adev->ip_blocks[i].version->type))
2865 			continue;
2866 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2867 		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2868 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2869 			r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2870 			if (r) {
2871 				DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2872 					  adev->ip_blocks[i].version->funcs->name, r);
2873 				return r;
2874 			}
2875 			adev->ip_blocks[i].status.hw = true;
2876 		}
2877 	}
2878 
2879 	return 0;
2880 }
2881 
2882 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2883 {
2884 	int i, r;
2885 
2886 	for (i = 0; i < adev->num_ip_blocks; i++) {
2887 		if (!adev->ip_blocks[i].status.sw)
2888 			continue;
2889 		if (adev->ip_blocks[i].status.hw)
2890 			continue;
2891 		if (!amdgpu_ip_member_of_hwini(
2892 			    adev, adev->ip_blocks[i].version->type))
2893 			continue;
2894 		r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2895 		if (r) {
2896 			DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2897 				  adev->ip_blocks[i].version->funcs->name, r);
2898 			return r;
2899 		}
2900 		adev->ip_blocks[i].status.hw = true;
2901 	}
2902 
2903 	return 0;
2904 }
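
/*
 * Editor's note (not in the original file): hw init is split in two phases
 * so that firmware loading can happen in between.  Phase 1 above brings up
 * only the COMMON and IH blocks (plus PSP when running as an SR-IOV VF);
 * phase 2 then runs hw_init for every remaining block once the required
 * microcode has been loaded by amdgpu_device_fw_loading().
 */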
2905 
2906 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2907 {
2908 	int r = 0;
2909 	int i;
2910 	uint32_t smu_version;
2911 
2912 	if (adev->asic_type >= CHIP_VEGA10) {
2913 		for (i = 0; i < adev->num_ip_blocks; i++) {
2914 			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2915 				continue;
2916 
2917 			if (!amdgpu_ip_member_of_hwini(adev,
2918 						       AMD_IP_BLOCK_TYPE_PSP))
2919 				break;
2920 
2921 			if (!adev->ip_blocks[i].status.sw)
2922 				continue;
2923 
2924 			/* no need to do the fw loading again if already done*/
2925 			if (adev->ip_blocks[i].status.hw == true)
2926 				break;
2927 
2928 			if (amdgpu_in_reset(adev) || adev->in_suspend) {
2929 				r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
2930 				if (r)
2931 					return r;
2932 			} else {
2933 				r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2934 				if (r) {
2935 					DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2936 							  adev->ip_blocks[i].version->funcs->name, r);
2937 					return r;
2938 				}
2939 				adev->ip_blocks[i].status.hw = true;
2940 			}
2941 			break;
2942 		}
2943 	}
2944 
2945 	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2946 		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2947 
2948 	return r;
2949 }
2950 
2951 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2952 {
2953 	struct drm_sched_init_args args = {
2954 		.ops = &amdgpu_sched_ops,
2955 		.num_rqs = DRM_SCHED_PRIORITY_COUNT,
2956 		.timeout_wq = adev->reset_domain->wq,
2957 		.dev = adev->dev,
2958 	};
2959 	long timeout;
2960 	int r, i;
2961 
2962 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2963 		struct amdgpu_ring *ring = adev->rings[i];
2964 
2965 		/* No need to setup the GPU scheduler for rings that don't need it */
2966 		if (!ring || ring->no_scheduler)
2967 			continue;
2968 
2969 		switch (ring->funcs->type) {
2970 		case AMDGPU_RING_TYPE_GFX:
2971 			timeout = adev->gfx_timeout;
2972 			break;
2973 		case AMDGPU_RING_TYPE_COMPUTE:
2974 			timeout = adev->compute_timeout;
2975 			break;
2976 		case AMDGPU_RING_TYPE_SDMA:
2977 			timeout = adev->sdma_timeout;
2978 			break;
2979 		default:
2980 			timeout = adev->video_timeout;
2981 			break;
2982 		}
2983 
2984 		args.timeout = timeout;
2985 		args.credit_limit = ring->num_hw_submission;
2986 		args.score = ring->sched_score;
2987 		args.name = ring->name;
2988 
2989 		r = drm_sched_init(&ring->sched, &args);
2990 		if (r) {
2991 			DRM_ERROR("Failed to create scheduler on ring %s.\n",
2992 				  ring->name);
2993 			return r;
2994 		}
2995 		r = amdgpu_uvd_entity_init(adev, ring);
2996 		if (r) {
2997 			DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n",
2998 				  ring->name);
2999 			return r;
3000 		}
3001 		r = amdgpu_vce_entity_init(adev, ring);
3002 		if (r) {
3003 			DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n",
3004 				  ring->name);
3005 			return r;
3006 		}
3007 	}
3008 
3009 	amdgpu_xcp_update_partition_sched_list(adev);
3010 
3011 	return 0;
3012 }
3013 
3014 
3015 /**
3016  * amdgpu_device_ip_init - run init for hardware IPs
3017  *
3018  * @adev: amdgpu_device pointer
3019  *
3020  * Main initialization pass for hardware IPs.  The list of all the hardware
3021  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
3022  * are run.  sw_init initializes the software state associated with each IP
3023  * and hw_init initializes the hardware associated with each IP.
3024  * Returns 0 on success, negative error code on failure.
3025  */
3026 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
3027 {
3028 	bool init_badpage;
3029 	int i, r;
3030 
3031 	r = amdgpu_ras_init(adev);
3032 	if (r)
3033 		return r;
3034 
3035 	for (i = 0; i < adev->num_ip_blocks; i++) {
3036 		if (!adev->ip_blocks[i].status.valid)
3037 			continue;
3038 		if (adev->ip_blocks[i].version->funcs->sw_init) {
3039 			r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]);
3040 			if (r) {
3041 				DRM_ERROR("sw_init of IP block <%s> failed %d\n",
3042 					  adev->ip_blocks[i].version->funcs->name, r);
3043 				goto init_failed;
3044 			}
3045 		}
3046 		adev->ip_blocks[i].status.sw = true;
3047 
3048 		if (!amdgpu_ip_member_of_hwini(
3049 			    adev, adev->ip_blocks[i].version->type))
3050 			continue;
3051 
3052 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
3053 			/* need to do common hw init early so everything is set up for gmc */
3054 			r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
3055 			if (r) {
3056 				DRM_ERROR("hw_init %d failed %d\n", i, r);
3057 				goto init_failed;
3058 			}
3059 			adev->ip_blocks[i].status.hw = true;
3060 		} else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
3061 			/* need to do gmc hw init early so we can allocate gpu mem */
3062 			/* Try to reserve bad pages early */
3063 			if (amdgpu_sriov_vf(adev))
3064 				amdgpu_virt_exchange_data(adev);
3065 
3066 			r = amdgpu_device_mem_scratch_init(adev);
3067 			if (r) {
3068 				DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
3069 				goto init_failed;
3070 			}
3071 			r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
3072 			if (r) {
3073 				DRM_ERROR("hw_init %d failed %d\n", i, r);
3074 				goto init_failed;
3075 			}
3076 			r = amdgpu_device_wb_init(adev);
3077 			if (r) {
3078 				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
3079 				goto init_failed;
3080 			}
3081 			adev->ip_blocks[i].status.hw = true;
3082 
3083 			/* right after GMC hw init, we create CSA */
3084 			if (adev->gfx.mcbp) {
3085 				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
3086 							       AMDGPU_GEM_DOMAIN_VRAM |
3087 							       AMDGPU_GEM_DOMAIN_GTT,
3088 							       AMDGPU_CSA_SIZE);
3089 				if (r) {
3090 					DRM_ERROR("allocate CSA failed %d\n", r);
3091 					goto init_failed;
3092 				}
3093 			}
3094 
3095 			r = amdgpu_seq64_init(adev);
3096 			if (r) {
3097 				DRM_ERROR("allocate seq64 failed %d\n", r);
3098 				goto init_failed;
3099 			}
3100 		}
3101 	}
3102 
3103 	if (amdgpu_sriov_vf(adev))
3104 		amdgpu_virt_init_data_exchange(adev);
3105 
3106 	r = amdgpu_ib_pool_init(adev);
3107 	if (r) {
3108 		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
3109 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
3110 		goto init_failed;
3111 	}
3112 
3113 	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
3114 	if (r)
3115 		goto init_failed;
3116 
3117 	r = amdgpu_device_ip_hw_init_phase1(adev);
3118 	if (r)
3119 		goto init_failed;
3120 
3121 	r = amdgpu_device_fw_loading(adev);
3122 	if (r)
3123 		goto init_failed;
3124 
3125 	r = amdgpu_device_ip_hw_init_phase2(adev);
3126 	if (r)
3127 		goto init_failed;
3128 
3129 	/*
3130 	 * Retired pages will be loaded from eeprom and reserved here;
3131 	 * this should be called after amdgpu_device_ip_hw_init_phase2 since
3132 	 * for some ASICs the RAS EEPROM code relies on the SMU being fully
3133 	 * functional for I2C communication, which is only true at this point.
3134 	 *
3135 	 * amdgpu_ras_recovery_init may fail, but the upper level only cares
3136 	 * about failures caused by a bad gpu situation and stops the amdgpu
3137 	 * init process accordingly. For other failure cases it still releases
3138 	 * all the resources and prints an error message rather than returning
3139 	 * a negative value to the upper level.
3140 	 *
3141 	 * Note: theoretically, this should be called before all vram
3142 	 * allocations to protect retired pages from being abused.
3143 	 */
3144 	init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
3145 	r = amdgpu_ras_recovery_init(adev, init_badpage);
3146 	if (r)
3147 		goto init_failed;
3148 
3149 	/**
3150 	 * In case of XGMI grab extra reference for reset domain for this device
3151 	 */
3152 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
3153 		if (amdgpu_xgmi_add_device(adev) == 0) {
3154 			if (!amdgpu_sriov_vf(adev)) {
3155 				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3156 
3157 				if (WARN_ON(!hive)) {
3158 					r = -ENOENT;
3159 					goto init_failed;
3160 				}
3161 
3162 				if (!hive->reset_domain ||
3163 				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
3164 					r = -ENOENT;
3165 					amdgpu_put_xgmi_hive(hive);
3166 					goto init_failed;
3167 				}
3168 
3169 				/* Drop the early temporary reset domain we created for device */
3170 				amdgpu_reset_put_reset_domain(adev->reset_domain);
3171 				adev->reset_domain = hive->reset_domain;
3172 				amdgpu_put_xgmi_hive(hive);
3173 			}
3174 		}
3175 	}
3176 
3177 	r = amdgpu_device_init_schedulers(adev);
3178 	if (r)
3179 		goto init_failed;
3180 
3181 	if (adev->mman.buffer_funcs_ring->sched.ready)
3182 		amdgpu_ttm_set_buffer_funcs_status(adev, true);
3183 
3184 	/* Don't init kfd if whole hive need to be reset during init */
3185 	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
3186 		kgd2kfd_init_zone_device(adev);
3187 		amdgpu_amdkfd_device_init(adev);
3188 	}
3189 
3190 	amdgpu_fru_get_product_info(adev);
3191 
3192 	if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev))
3193 		r = amdgpu_cper_init(adev);
3194 
3195 init_failed:
3196 
3197 	return r;
3198 }
3199 
3200 /**
3201  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
3202  *
3203  * @adev: amdgpu_device pointer
3204  *
3205  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
3206  * this function before a GPU reset.  If the value is retained after a
3207  * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
3208  */
3209 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
3210 {
3211 	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
3212 }
3213 
3214 /**
3215  * amdgpu_device_check_vram_lost - check if vram is valid
3216  *
3217  * @adev: amdgpu_device pointer
3218  *
3219  * Checks the reset magic value written to the gart pointer in VRAM.
3220  * The driver calls this after a GPU reset to see if the contents of
3221  * VRAM are lost or not.
3222  * returns true if vram is lost, false if not.
3223  */
3224 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
3225 {
3226 	if (memcmp(adev->gart.ptr, adev->reset_magic,
3227 			AMDGPU_RESET_MAGIC_NUM))
3228 		return true;
3229 
3230 	if (!amdgpu_in_reset(adev))
3231 		return false;
3232 
3233 	/*
3234 	 * For all ASICs with baco/mode1 reset, the VRAM is
3235 	 * always assumed to be lost.
3236 	 */
3237 	switch (amdgpu_asic_reset_method(adev)) {
3238 	case AMD_RESET_METHOD_LINK:
3239 	case AMD_RESET_METHOD_BACO:
3240 	case AMD_RESET_METHOD_MODE1:
3241 		return true;
3242 	default:
3243 		return false;
3244 	}
3245 }
3246 
3247 /**
3248  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
3249  *
3250  * @adev: amdgpu_device pointer
3251  * @state: clockgating state (gate or ungate)
3252  *
3253  * The list of all the hardware IPs that make up the asic is walked and the
3254  * set_clockgating_state callbacks are run.
3255  * During late init this pass enables clockgating for hardware IPs;
3256  * during fini or suspend it disables clockgating.
3257  * Returns 0 on success, negative error code on failure.
3258  */
3259 
3260 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
3261 			       enum amd_clockgating_state state)
3262 {
3263 	int i, j, r;
3264 
3265 	if (amdgpu_emu_mode == 1)
3266 		return 0;
3267 
3268 	for (j = 0; j < adev->num_ip_blocks; j++) {
3269 		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3270 		if (!adev->ip_blocks[i].status.late_initialized)
3271 			continue;
3272 		/* skip CG for GFX, SDMA on S0ix */
3273 		if (adev->in_s0ix &&
3274 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3275 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3276 			continue;
3277 		/* skip CG for VCE/UVD, it's handled specially */
3278 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3279 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3280 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3281 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3282 		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
3283 			/* enable clockgating to save power */
3284 			r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i],
3285 										     state);
3286 			if (r) {
3287 				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
3288 					  adev->ip_blocks[i].version->funcs->name, r);
3289 				return r;
3290 			}
3291 		}
3292 	}
3293 
3294 	return 0;
3295 }
3296 
3297 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
3298 			       enum amd_powergating_state state)
3299 {
3300 	int i, j, r;
3301 
3302 	if (amdgpu_emu_mode == 1)
3303 		return 0;
3304 
3305 	for (j = 0; j < adev->num_ip_blocks; j++) {
3306 		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3307 		if (!adev->ip_blocks[i].status.late_initialized)
3308 			continue;
3309 		/* skip PG for GFX, SDMA on S0ix */
3310 		if (adev->in_s0ix &&
3311 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3312 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3313 			continue;
3314 		/* skip PG for VCE/UVD, it's handled specially */
3315 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3316 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3317 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3318 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3319 		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
3320 			/* enable powergating to save power */
3321 			r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i],
3322 											state);
3323 			if (r) {
3324 				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
3325 					  adev->ip_blocks[i].version->funcs->name, r);
3326 				return r;
3327 			}
3328 		}
3329 	}
3330 	return 0;
3331 }
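
/*
 * Editor's note (not in the original file): both walkers above index the IP
 * list with "i = gate ? j : num_ip_blocks - j - 1", i.e. clockgating and
 * powergating are applied front-to-back when gating and in reverse order
 * when ungating, mirroring the usual init/fini symmetry.
 */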
3332 
3333 static int amdgpu_device_enable_mgpu_fan_boost(void)
3334 {
3335 	struct amdgpu_gpu_instance *gpu_ins;
3336 	struct amdgpu_device *adev;
3337 	int i, ret = 0;
3338 
3339 	mutex_lock(&mgpu_info.mutex);
3340 
3341 	/*
3342 	 * MGPU fan boost feature should be enabled
3343 	 * only when there are two or more dGPUs in
3344 	 * the system
3345 	 */
3346 	if (mgpu_info.num_dgpu < 2)
3347 		goto out;
3348 
3349 	for (i = 0; i < mgpu_info.num_dgpu; i++) {
3350 		gpu_ins = &(mgpu_info.gpu_ins[i]);
3351 		adev = gpu_ins->adev;
3352 		if (!(adev->flags & AMD_IS_APU) &&
3353 		    !gpu_ins->mgpu_fan_enabled) {
3354 			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
3355 			if (ret)
3356 				break;
3357 
3358 			gpu_ins->mgpu_fan_enabled = 1;
3359 		}
3360 	}
3361 
3362 out:
3363 	mutex_unlock(&mgpu_info.mutex);
3364 
3365 	return ret;
3366 }
3367 
3368 /**
3369  * amdgpu_device_ip_late_init - run late init for hardware IPs
3370  *
3371  * @adev: amdgpu_device pointer
3372  *
3373  * Late initialization pass for hardware IPs.  The list of all the hardware
3374  * IPs that make up the asic is walked and the late_init callbacks are run.
3375  * late_init covers any special initialization that an IP requires
3376  * after all of the IP blocks have been initialized or something that needs
3377  * to happen late in the init process.
3378  * Returns 0 on success, negative error code on failure.
3379  */
3380 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
3381 {
3382 	struct amdgpu_gpu_instance *gpu_instance;
3383 	int i = 0, r;
3384 
3385 	for (i = 0; i < adev->num_ip_blocks; i++) {
3386 		if (!adev->ip_blocks[i].status.hw)
3387 			continue;
3388 		if (adev->ip_blocks[i].version->funcs->late_init) {
3389 			r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
3390 			if (r) {
3391 				DRM_ERROR("late_init of IP block <%s> failed %d\n",
3392 					  adev->ip_blocks[i].version->funcs->name, r);
3393 				return r;
3394 			}
3395 		}
3396 		adev->ip_blocks[i].status.late_initialized = true;
3397 	}
3398 
3399 	r = amdgpu_ras_late_init(adev);
3400 	if (r) {
3401 		DRM_ERROR("amdgpu_ras_late_init failed %d", r);
3402 		return r;
3403 	}
3404 
3405 	if (!amdgpu_reset_in_recovery(adev))
3406 		amdgpu_ras_set_error_query_ready(adev, true);
3407 
3408 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
3409 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
3410 
3411 	amdgpu_device_fill_reset_magic(adev);
3412 
3413 	r = amdgpu_device_enable_mgpu_fan_boost();
3414 	if (r)
3415 		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
3416 
3417 	/* For passthrough configurations on Arcturus and Aldebaran, enable special SBR handling */
3418 	if (amdgpu_passthrough(adev) &&
3419 	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
3420 	     adev->asic_type == CHIP_ALDEBARAN))
3421 		amdgpu_dpm_handle_passthrough_sbr(adev, true);
3422 
3423 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
3424 		mutex_lock(&mgpu_info.mutex);
3425 
3426 		/*
3427 		 * Reset the device p-state to low, as it was booted with high.
3428 		 *
3429 		 * This should be performed only after all devices from the same
3430 		 * hive have been initialized.
3431 		 *
3432 		 * However, the number of devices in the hive is not known in
3433 		 * advance; it is counted one by one as the devices initialize.
3434 		 *
3435 		 * So, we wait for all XGMI interlinked devices to be initialized.
3436 		 * This may add some delay as those devices may come from
3437 		 * different hives. But that should be OK.
3438 		 */
3439 		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
3440 			for (i = 0; i < mgpu_info.num_gpu; i++) {
3441 				gpu_instance = &(mgpu_info.gpu_ins[i]);
3442 				if (gpu_instance->adev->flags & AMD_IS_APU)
3443 					continue;
3444 
3445 				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
3446 						AMDGPU_XGMI_PSTATE_MIN);
3447 				if (r) {
3448 					DRM_ERROR("pstate setting failed (%d).\n", r);
3449 					break;
3450 				}
3451 			}
3452 		}
3453 
3454 		mutex_unlock(&mgpu_info.mutex);
3455 	}
3456 
3457 	return 0;
3458 }
3459 
3460 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block)
3461 {
3462 	int r;
3463 
3464 	if (!ip_block->version->funcs->hw_fini) {
3465 		DRM_ERROR("hw_fini of IP block <%s> not defined\n",
3466 			  ip_block->version->funcs->name);
3467 	} else {
3468 		r = ip_block->version->funcs->hw_fini(ip_block);
3469 		/* XXX handle errors */
3470 		if (r) {
3471 			DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
3472 				  ip_block->version->funcs->name, r);
3473 		}
3474 	}
3475 
3476 	ip_block->status.hw = false;
3477 }
3478 
3479 /**
3480  * amdgpu_device_smu_fini_early - smu hw_fini wrapper
3481  *
3482  * @adev: amdgpu_device pointer
3483  *
3484  * For ASICs that need to disable the SMC first
3485  */
3486 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
3487 {
3488 	int i;
3489 
3490 	if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
3491 		return;
3492 
3493 	for (i = 0; i < adev->num_ip_blocks; i++) {
3494 		if (!adev->ip_blocks[i].status.hw)
3495 			continue;
3496 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3497 			amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
3498 			break;
3499 		}
3500 	}
3501 }
3502 
3503 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
3504 {
3505 	int i, r;
3506 
3507 	for (i = 0; i < adev->num_ip_blocks; i++) {
3508 		if (!adev->ip_blocks[i].version->funcs->early_fini)
3509 			continue;
3510 
3511 		r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
3512 		if (r) {
3513 			DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
3514 				  adev->ip_blocks[i].version->funcs->name, r);
3515 		}
3516 	}
3517 
3518 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3519 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3520 
3521 	amdgpu_amdkfd_suspend(adev, false);
3522 	amdgpu_userq_suspend(adev);
3523 
3524 	/* Workaround for ASICs that need to disable the SMC first */
3525 	amdgpu_device_smu_fini_early(adev);
3526 
3527 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3528 		if (!adev->ip_blocks[i].status.hw)
3529 			continue;
3530 
3531 		amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
3532 	}
3533 
3534 	if (amdgpu_sriov_vf(adev)) {
3535 		if (amdgpu_virt_release_full_gpu(adev, false))
3536 			DRM_ERROR("failed to release exclusive mode on fini\n");
3537 	}
3538 
3539 	return 0;
3540 }
3541 
3542 /**
3543  * amdgpu_device_ip_fini - run fini for hardware IPs
3544  *
3545  * @adev: amdgpu_device pointer
3546  *
3547  * Main teardown pass for hardware IPs.  The list of all the hardware
3548  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
3549  * are run.  hw_fini tears down the hardware associated with each IP
3550  * and sw_fini tears down any software state associated with each IP.
3551  * Returns 0 on success, negative error code on failure.
3552  */
3553 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
3554 {
3555 	int i, r;
3556 
3557 	amdgpu_cper_fini(adev);
3558 
3559 	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
3560 		amdgpu_virt_release_ras_err_handler_data(adev);
3561 
3562 	if (adev->gmc.xgmi.num_physical_nodes > 1)
3563 		amdgpu_xgmi_remove_device(adev);
3564 
3565 	amdgpu_amdkfd_device_fini_sw(adev);
3566 
3567 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3568 		if (!adev->ip_blocks[i].status.sw)
3569 			continue;
3570 
3571 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
3572 			amdgpu_ucode_free_bo(adev);
3573 			amdgpu_free_static_csa(&adev->virt.csa_obj);
3574 			amdgpu_device_wb_fini(adev);
3575 			amdgpu_device_mem_scratch_fini(adev);
3576 			amdgpu_ib_pool_fini(adev);
3577 			amdgpu_seq64_fini(adev);
3578 			amdgpu_doorbell_fini(adev);
3579 		}
3580 		if (adev->ip_blocks[i].version->funcs->sw_fini) {
3581 			r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]);
3582 			/* XXX handle errors */
3583 			if (r) {
3584 				DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
3585 					  adev->ip_blocks[i].version->funcs->name, r);
3586 			}
3587 		}
3588 		adev->ip_blocks[i].status.sw = false;
3589 		adev->ip_blocks[i].status.valid = false;
3590 	}
3591 
3592 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3593 		if (!adev->ip_blocks[i].status.late_initialized)
3594 			continue;
3595 		if (adev->ip_blocks[i].version->funcs->late_fini)
3596 			adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]);
3597 		adev->ip_blocks[i].status.late_initialized = false;
3598 	}
3599 
3600 	amdgpu_ras_fini(adev);
3601 
3602 	return 0;
3603 }
3604 
3605 /**
3606  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
3607  *
3608  * @work: work_struct.
3609  */
3610 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
3611 {
3612 	struct amdgpu_device *adev =
3613 		container_of(work, struct amdgpu_device, delayed_init_work.work);
3614 	int r;
3615 
3616 	r = amdgpu_ib_ring_tests(adev);
3617 	if (r)
3618 		DRM_ERROR("ib ring test failed (%d).\n", r);
3619 }
3620 
3621 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
3622 {
3623 	struct amdgpu_device *adev =
3624 		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
3625 
3626 	WARN_ON_ONCE(adev->gfx.gfx_off_state);
3627 	WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
3628 
3629 	if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0))
3630 		adev->gfx.gfx_off_state = true;
3631 }
3632 
3633 /**
3634  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
3635  *
3636  * @adev: amdgpu_device pointer
3637  *
3638  * First suspend pass for hardware IPs.  Clockgating and powergating are
3639  * disabled, and the suspend callbacks are run for the display (DCE) blocks
3640  * only; all other hardware IPs are handled in phase 2.  suspend puts the
3641  * hardware and software state of each IP into a state suitable for suspend.
3642  * Returns 0 on success, negative error code on failure.
3643  */
3644 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
3645 {
3646 	int i, r;
3647 
3648 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3649 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3650 
3651 	/*
3652 	 * Per the PMFW team's suggestion, the driver needs to disable the
3653 	 * gfxoff and df cstate features in the gpu reset (e.g. Mode1Reset)
3654 	 * scenario. Add the missing df cstate disablement here.
3655 	 */
3656 	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
3657 		dev_warn(adev->dev, "Failed to disallow df cstate");
3658 
3659 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3660 		if (!adev->ip_blocks[i].status.valid)
3661 			continue;
3662 
3663 		/* displays are handled separately */
3664 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
3665 			continue;
3666 
3667 		/* XXX handle errors */
3668 		r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
3669 		if (r)
3670 			return r;
3671 	}
3672 
3673 	return 0;
3674 }
3675 
3676 /**
3677  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
3678  *
3679  * @adev: amdgpu_device pointer
3680  *
3681  * Second suspend pass for hardware IPs.  The suspend callbacks are run for
3682  * every hardware IP except the display (DCE) blocks, which were handled in
3683  * phase 1.  suspend puts the hardware and software state in each IP into
3684  * a state suitable for suspend.
3685  * Returns 0 on success, negative error code on failure.
3686  */
3687 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
3688 {
3689 	int i, r;
3690 
3691 	if (adev->in_s0ix)
3692 		amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
3693 
3694 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3695 		if (!adev->ip_blocks[i].status.valid)
3696 			continue;
3697 		/* displays are handled in phase1 */
3698 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
3699 			continue;
3700 		/* PSP lost connection when err_event_athub occurs */
3701 		if (amdgpu_ras_intr_triggered() &&
3702 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3703 			adev->ip_blocks[i].status.hw = false;
3704 			continue;
3705 		}
3706 
3707 		/* skip unnecessary suspend if we have not initialized them yet */
3708 		if (!amdgpu_ip_member_of_hwini(
3709 			    adev, adev->ip_blocks[i].version->type))
3710 			continue;
3711 
3712 		/* Since we skip suspend for S0i3, we need to cancel the delayed
3713 		 * idle work here as the suspend callback never gets called.
3714 		 */
3715 		if (adev->in_s0ix &&
3716 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX &&
3717 		    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0))
3718 			cancel_delayed_work_sync(&adev->gfx.idle_work);
3719 		/* skip suspend of gfx/mes and psp for S0ix
3720 		 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3721 		 * like at runtime. PSP is also part of the always on hardware
3722 		 * so no need to suspend it.
3723 		 */
3724 		if (adev->in_s0ix &&
3725 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3726 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3727 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3728 			continue;
3729 
3730 		/* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3731 		if (adev->in_s0ix &&
3732 		    (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >=
3733 		     IP_VERSION(5, 0, 0)) &&
3734 		    (adev->ip_blocks[i].version->type ==
3735 		     AMD_IP_BLOCK_TYPE_SDMA))
3736 			continue;
3737 
3738 		/* During cold boot, swPSP provides the IMU and RLC FW binaries to TOS.
3739 		 * These live in the TMR and are expected to be reused by PSP-TOS to
3740 		 * reload from that location; RLC autoload is also triggered from there
3741 		 * based on a PMFW -> PSP message during the re-init sequence.
3742 		 * Therefore, PSP suspend & resume should be skipped to avoid destroying
3743 		 * the TMR and reloading the FWs again on IMU-enabled APU ASICs.
3744 		 */
3745 		if (amdgpu_in_reset(adev) &&
3746 		    (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3747 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3748 			continue;
3749 
3750 		/* XXX handle errors */
3751 		r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
3752 		adev->ip_blocks[i].status.hw = false;
3753 
3754 		/* handle putting the SMC in the appropriate state */
3755 		if (!amdgpu_sriov_vf(adev)) {
3756 			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3757 				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3758 				if (r) {
3759 					DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3760 							adev->mp1_state, r);
3761 					return r;
3762 				}
3763 			}
3764 		}
3765 	}
3766 
3767 	return 0;
3768 }
3769 
3770 /**
3771  * amdgpu_device_ip_suspend - run suspend for hardware IPs
3772  *
3773  * @adev: amdgpu_device pointer
3774  *
3775  * Main suspend function for hardware IPs.  The list of all the hardware
3776  * IPs that make up the asic is walked, clockgating is disabled and the
3777  * suspend callbacks are run.  suspend puts the hardware and software state
3778  * in each IP into a state suitable for suspend.
3779  * Returns 0 on success, negative error code on failure.
3780  */
3781 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3782 {
3783 	int r;
3784 
3785 	if (amdgpu_sriov_vf(adev)) {
3786 		amdgpu_virt_fini_data_exchange(adev);
3787 		amdgpu_virt_request_full_gpu(adev, false);
3788 	}
3789 
3790 	amdgpu_ttm_set_buffer_funcs_status(adev, false);
3791 
3792 	r = amdgpu_device_ip_suspend_phase1(adev);
3793 	if (r)
3794 		return r;
3795 	r = amdgpu_device_ip_suspend_phase2(adev);
3796 
3797 	if (amdgpu_sriov_vf(adev))
3798 		amdgpu_virt_release_full_gpu(adev, false);
3799 
3800 	return r;
3801 }
3802 
3803 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3804 {
3805 	int i, r;
3806 
3807 	static enum amd_ip_block_type ip_order[] = {
3808 		AMD_IP_BLOCK_TYPE_COMMON,
3809 		AMD_IP_BLOCK_TYPE_GMC,
3810 		AMD_IP_BLOCK_TYPE_PSP,
3811 		AMD_IP_BLOCK_TYPE_IH,
3812 	};
3813 
3814 	for (i = 0; i < adev->num_ip_blocks; i++) {
3815 		int j;
3816 		struct amdgpu_ip_block *block;
3817 
3818 		block = &adev->ip_blocks[i];
3819 		block->status.hw = false;
3820 
3821 		for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3822 
3823 			if (block->version->type != ip_order[j] ||
3824 				!block->status.valid)
3825 				continue;
3826 
3827 			r = block->version->funcs->hw_init(&adev->ip_blocks[i]);
3828 			if (r) {
3829 				dev_err(adev->dev, "RE-INIT-early: %s failed\n",
3830 					 block->version->funcs->name);
3831 				return r;
3832 			}
3833 			block->status.hw = true;
3834 		}
3835 	}
3836 
3837 	return 0;
3838 }
3839 
3840 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3841 {
3842 	struct amdgpu_ip_block *block;
3843 	int i, r = 0;
3844 
3845 	static enum amd_ip_block_type ip_order[] = {
3846 		AMD_IP_BLOCK_TYPE_SMC,
3847 		AMD_IP_BLOCK_TYPE_DCE,
3848 		AMD_IP_BLOCK_TYPE_GFX,
3849 		AMD_IP_BLOCK_TYPE_SDMA,
3850 		AMD_IP_BLOCK_TYPE_MES,
3851 		AMD_IP_BLOCK_TYPE_UVD,
3852 		AMD_IP_BLOCK_TYPE_VCE,
3853 		AMD_IP_BLOCK_TYPE_VCN,
3854 		AMD_IP_BLOCK_TYPE_JPEG
3855 	};
3856 
3857 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3858 		block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]);
3859 
3860 		if (!block)
3861 			continue;
3862 
3863 		if (block->status.valid && !block->status.hw) {
3864 			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) {
3865 				r = amdgpu_ip_block_resume(block);
3866 			} else {
3867 				r = block->version->funcs->hw_init(block);
3868 			}
3869 
3870 			if (r) {
3871 				dev_err(adev->dev, "RE-INIT-late: %s failed\n",
3872 					 block->version->funcs->name);
3873 				break;
3874 			}
3875 			block->status.hw = true;
3876 		}
3877 	}
3878 
3879 	return r;
3880 }
3881 
3882 /**
3883  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3884  *
3885  * @adev: amdgpu_device pointer
3886  *
3887  * First resume function for hardware IPs.  The list of all the hardware
3888  * IPs that make up the asic is walked and the resume callbacks are run for
3889  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
3890  * after a suspend and updates the software state as necessary.  This
3891  * function is also used for restoring the GPU after a GPU reset.
3892  * Returns 0 on success, negative error code on failure.
3893  */
3894 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3895 {
3896 	int i, r;
3897 
3898 	for (i = 0; i < adev->num_ip_blocks; i++) {
3899 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3900 			continue;
3901 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3902 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3903 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3904 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3905 
3906 			r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3907 			if (r)
3908 				return r;
3909 		}
3910 	}
3911 
3912 	return 0;
3913 }
3914 
3915 /**
3916  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3917  *
3918  * @adev: amdgpu_device pointer
3919  *
3920  * Second resume function for hardware IPs.  The list of all the hardware
3921  * IPs that make up the asic is walked and the resume callbacks are run for
3922  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
3923  * functional state after a suspend and updates the software state as
3924  * necessary.  This function is also used for restoring the GPU after a GPU
3925  * reset.
3926  * Returns 0 on success, negative error code on failure.
3927  */
3928 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3929 {
3930 	int i, r;
3931 
3932 	for (i = 0; i < adev->num_ip_blocks; i++) {
3933 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3934 			continue;
3935 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3936 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3937 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3938 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE ||
3939 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3940 			continue;
3941 		r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3942 		if (r)
3943 			return r;
3944 	}
3945 
3946 	return 0;
3947 }
3948 
3949 /**
3950  * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs
3951  *
3952  * @adev: amdgpu_device pointer
3953  *
3954  * Third resume function for hardware IPs.  The list of all the hardware
3955  * IPs that make up the asic is walked and the resume callbacks are run for
3956  * all DCE.  resume puts the hardware into a functional state after a suspend
3957  * and updates the software state as necessary.  This function is also used
3958  * for restoring the GPU after a GPU reset.
3959  *
3960  * Returns 0 on success, negative error code on failure.
3961  */
3962 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
3963 {
3964 	int i, r;
3965 
3966 	for (i = 0; i < adev->num_ip_blocks; i++) {
3967 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3968 			continue;
3969 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
3970 			r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3971 			if (r)
3972 				return r;
3973 		}
3974 	}
3975 
3976 	return 0;
3977 }
3978 
3979 /**
3980  * amdgpu_device_ip_resume - run resume for hardware IPs
3981  *
3982  * @adev: amdgpu_device pointer
3983  *
3984  * Main resume function for hardware IPs.  The hardware IPs
3985  * are split into multiple resume phases because they are
3986  * also used in recovering from a GPU reset and some additional
3987  * steps need to be taken between them.  In this case (S3/S4) they are
3988  * run sequentially.
3989  * Returns 0 on success, negative error code on failure.
3990  */
3991 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3992 {
3993 	int r;
3994 
3995 	r = amdgpu_device_ip_resume_phase1(adev);
3996 	if (r)
3997 		return r;
3998 
3999 	r = amdgpu_device_fw_loading(adev);
4000 	if (r)
4001 		return r;
4002 
4003 	r = amdgpu_device_ip_resume_phase2(adev);
4004 
4005 	if (adev->mman.buffer_funcs_ring->sched.ready)
4006 		amdgpu_ttm_set_buffer_funcs_status(adev, true);
4007 
4008 	if (r)
4009 		return r;
4010 
4011 	amdgpu_fence_driver_hw_init(adev);
4012 
4013 	r = amdgpu_device_ip_resume_phase3(adev);
4014 
4015 	return r;
4016 }
4017 
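/*
 * Editorial summary of the resume ordering above (illustrative, not part of
 * the driver):
 *
 *	phase1: COMMON, GMC, IH (plus PSP when running as an SR-IOV VF)
 *	amdgpu_device_fw_loading()
 *	phase2: every remaining IP except DCE and PSP
 *	buffer funcs re-enabled (if ready) and amdgpu_fence_driver_hw_init()
 *	phase3: DCE (display)
 */
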
4018 /**
4019  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
4020  *
4021  * @adev: amdgpu_device pointer
4022  *
4023  * Query the VBIOS data tables to determine if the board supports SR-IOV.
4024  */
4025 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
4026 {
4027 	if (amdgpu_sriov_vf(adev)) {
4028 		if (adev->is_atom_fw) {
4029 			if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
4030 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
4031 		} else {
4032 			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
4033 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
4034 		}
4035 
4036 		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
4037 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
4038 	}
4039 }
4040 
4041 /**
4042  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
4043  *
4044  * @asic_type: AMD asic type
4045  *
4046  * Check if there is DC (new modesetting infrastructure) support for an asic.
4047  * returns true if DC has support, false if not.
4048  */
4049 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
4050 {
4051 	switch (asic_type) {
4052 #ifdef CONFIG_DRM_AMDGPU_SI
4053 	case CHIP_HAINAN:
4054 #endif
4055 	case CHIP_TOPAZ:
4056 		/* chips with no display hardware */
4057 		return false;
4058 #if defined(CONFIG_DRM_AMD_DC)
4059 	case CHIP_TAHITI:
4060 	case CHIP_PITCAIRN:
4061 	case CHIP_VERDE:
4062 	case CHIP_OLAND:
4063 		/*
4064 		 * We have systems in the wild with these ASICs that require
4065 		 * LVDS and VGA support which is not supported with DC.
4066 		 *
4067 		 * Fallback to the non-DC driver here by default so as not to
4068 		 * cause regressions.
4069 		 */
4070 #if defined(CONFIG_DRM_AMD_DC_SI)
4071 		return amdgpu_dc > 0;
4072 #else
4073 		return false;
4074 #endif
4075 	case CHIP_BONAIRE:
4076 	case CHIP_KAVERI:
4077 	case CHIP_KABINI:
4078 	case CHIP_MULLINS:
4079 		/*
4080 		 * We have systems in the wild with these ASICs that require
4081 		 * VGA support which is not supported with DC.
4082 		 *
4083 		 * Fallback to the non-DC driver here by default so as not to
4084 		 * cause regressions.
4085 		 */
4086 		return amdgpu_dc > 0;
4087 	default:
4088 		return amdgpu_dc != 0;
4089 #else
4090 	default:
4091 		if (amdgpu_dc > 0)
4092 			DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
4093 		return false;
4094 #endif
4095 	}
4096 }
4097 
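/*
 * Editorial note (illustrative): assuming the usual auto default of
 * amdgpu.dc=-1, the default case above enables DC (amdgpu_dc != 0), while the
 * SI/CIK ASICs listed explicitly require an opt-in amdgpu.dc=1 (and
 * CONFIG_DRM_AMD_DC_SI for the SI parts).
 */
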
4098 /**
4099  * amdgpu_device_has_dc_support - check if dc is supported
4100  *
4101  * @adev: amdgpu_device pointer
4102  *
4103  * Returns true for supported, false for not supported
4104  */
4105 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
4106 {
4107 	if (adev->enable_virtual_display ||
4108 	    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
4109 		return false;
4110 
4111 	return amdgpu_device_asic_has_dc_support(adev->asic_type);
4112 }
4113 
4114 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
4115 {
4116 	struct amdgpu_device *adev =
4117 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
4118 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
4119 
4120 	/* It's a bug to not have a hive within this function */
4121 	if (WARN_ON(!hive))
4122 		return;
4123 
4124 	/*
4125 	 * Use task barrier to synchronize all xgmi reset works across the
4126 	 * hive. task_barrier_enter and task_barrier_exit will block
4127 	 * until all the threads running the xgmi reset works reach
4128 	 * those points. task_barrier_full will do both blocks.
4129 	 */
4130 	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
4131 
4132 		task_barrier_enter(&hive->tb);
4133 		adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
4134 
4135 		if (adev->asic_reset_res)
4136 			goto fail;
4137 
4138 		task_barrier_exit(&hive->tb);
4139 		adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
4140 
4141 		if (adev->asic_reset_res)
4142 			goto fail;
4143 
4144 		amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
4145 	} else {
4146 
4147 		task_barrier_full(&hive->tb);
4148 		adev->asic_reset_res =  amdgpu_asic_reset(adev);
4149 	}
4150 
4151 fail:
4152 	if (adev->asic_reset_res)
4153 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
4154 			 adev->asic_reset_res, adev_to_drm(adev)->unique);
4155 	amdgpu_put_xgmi_hive(hive);
4156 }
4157 
4158 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
4159 {
4160 	char *input = amdgpu_lockup_timeout;
4161 	char *timeout_setting = NULL;
4162 	int index = 0;
4163 	long timeout;
4164 	int ret = 0;
4165 
4166 	/*
4167 	 * By default, the timeout for non-compute jobs is 10000 ms
4168 	 * and 60000 ms for compute jobs.
4169 	 * In SR-IOV or passthrough mode, the timeout for compute
4170 	 * jobs is 60000 ms by default.
4171 	 */
4172 	adev->gfx_timeout = msecs_to_jiffies(10000);
4173 	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
4174 	if (amdgpu_sriov_vf(adev))
4175 		adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
4176 					msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
4177 	else
4178 		adev->compute_timeout =  msecs_to_jiffies(60000);
4179 
4180 	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
4181 		while ((timeout_setting = strsep(&input, ",")) &&
4182 				strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
4183 			ret = kstrtol(timeout_setting, 0, &timeout);
4184 			if (ret)
4185 				return ret;
4186 
4187 			if (timeout == 0) {
4188 				index++;
4189 				continue;
4190 			} else if (timeout < 0) {
4191 				timeout = MAX_SCHEDULE_TIMEOUT;
4192 				dev_warn(adev->dev, "lockup timeout disabled");
4193 				add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
4194 			} else {
4195 				timeout = msecs_to_jiffies(timeout);
4196 			}
4197 
4198 			switch (index++) {
4199 			case 0:
4200 				adev->gfx_timeout = timeout;
4201 				break;
4202 			case 1:
4203 				adev->compute_timeout = timeout;
4204 				break;
4205 			case 2:
4206 				adev->sdma_timeout = timeout;
4207 				break;
4208 			case 3:
4209 				adev->video_timeout = timeout;
4210 				break;
4211 			default:
4212 				break;
4213 			}
4214 		}
4215 		/*
4216 		 * There is only one value specified and
4217 		 * it should apply to all non-compute jobs.
4218 		 */
4219 		if (index == 1) {
4220 			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
4221 			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
4222 				adev->compute_timeout = adev->gfx_timeout;
4223 		}
4224 	}
4225 
4226 	return ret;
4227 }
4228 
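/*
 * Illustrative example (editorial note): the lockup_timeout module parameter
 * is a comma-separated list applied in the order gfx, compute, sdma, video,
 * in milliseconds, e.g. on the kernel command line:
 *
 *	amdgpu.lockup_timeout=10000,60000,10000,10000
 *
 * A value of 0 keeps the default for that queue type, a negative value
 * disables the timeout, and a single value applies to all non-compute jobs.
 */
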
4229 /**
4230  * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
4231  *
4232  * @adev: amdgpu_device pointer
4233  *
4234  * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode
4235  */
4236 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
4237 {
4238 	struct iommu_domain *domain;
4239 
4240 	domain = iommu_get_domain_for_dev(adev->dev);
4241 	if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
4242 		adev->ram_is_direct_mapped = true;
4243 }
4244 
4245 #if defined(CONFIG_HSA_AMD_P2P)
4246 /**
4247  * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
4248  *
4249  * @adev: amdgpu_device pointer
4250  *
4251  * Returns true if the IOMMU is remapping the BAR address
4252  */
4253 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
4254 {
4255 	struct iommu_domain *domain;
4256 
4257 	domain = iommu_get_domain_for_dev(adev->dev);
4258 	if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
4259 		domain->type ==	IOMMU_DOMAIN_DMA_FQ))
4260 		return true;
4261 
4262 	return false;
4263 }
4264 #endif
4265 
4266 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
4267 {
4268 	if (amdgpu_mcbp == 1)
4269 		adev->gfx.mcbp = true;
4270 	else if (amdgpu_mcbp == 0)
4271 		adev->gfx.mcbp = false;
4272 
4273 	if (amdgpu_sriov_vf(adev))
4274 		adev->gfx.mcbp = true;
4275 
4276 	if (adev->gfx.mcbp)
4277 		DRM_INFO("MCBP is enabled\n");
4278 }
4279 
4280 /**
4281  * amdgpu_device_init - initialize the driver
4282  *
4283  * @adev: amdgpu_device pointer
4284  * @flags: driver flags
4285  *
4286  * Initializes the driver info and hw (all asics).
4287  * Returns 0 for success or an error on failure.
4288  * Called at driver startup.
4289  */
4290 int amdgpu_device_init(struct amdgpu_device *adev,
4291 		       uint32_t flags)
4292 {
4293 	struct drm_device *ddev = adev_to_drm(adev);
4294 	struct pci_dev *pdev = adev->pdev;
4295 	int r, i;
4296 	bool px = false;
4297 	u32 max_MBps;
4298 	int tmp;
4299 
4300 	adev->shutdown = false;
4301 	adev->flags = flags;
4302 
4303 	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
4304 		adev->asic_type = amdgpu_force_asic_type;
4305 	else
4306 		adev->asic_type = flags & AMD_ASIC_MASK;
4307 
4308 	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
4309 	if (amdgpu_emu_mode == 1)
4310 		adev->usec_timeout *= 10;
4311 	adev->gmc.gart_size = 512 * 1024 * 1024;
4312 	adev->accel_working = false;
4313 	adev->num_rings = 0;
4314 	RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
4315 	adev->mman.buffer_funcs = NULL;
4316 	adev->mman.buffer_funcs_ring = NULL;
4317 	adev->vm_manager.vm_pte_funcs = NULL;
4318 	adev->vm_manager.vm_pte_num_scheds = 0;
4319 	adev->gmc.gmc_funcs = NULL;
4320 	adev->harvest_ip_mask = 0x0;
4321 	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
4322 	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
4323 
4324 	adev->smc_rreg = &amdgpu_invalid_rreg;
4325 	adev->smc_wreg = &amdgpu_invalid_wreg;
4326 	adev->pcie_rreg = &amdgpu_invalid_rreg;
4327 	adev->pcie_wreg = &amdgpu_invalid_wreg;
4328 	adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
4329 	adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
4330 	adev->pciep_rreg = &amdgpu_invalid_rreg;
4331 	adev->pciep_wreg = &amdgpu_invalid_wreg;
4332 	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
4333 	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
4334 	adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext;
4335 	adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;
4336 	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
4337 	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
4338 	adev->didt_rreg = &amdgpu_invalid_rreg;
4339 	adev->didt_wreg = &amdgpu_invalid_wreg;
4340 	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
4341 	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
4342 	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
4343 	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
4344 
4345 	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
4346 		 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
4347 		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
4348 
4349 	/* mutex initializations are all done here so we
4350 	 * can call these functions without locking issues
4351 	 */
4352 	mutex_init(&adev->firmware.mutex);
4353 	mutex_init(&adev->pm.mutex);
4354 	mutex_init(&adev->gfx.gpu_clock_mutex);
4355 	mutex_init(&adev->srbm_mutex);
4356 	mutex_init(&adev->gfx.pipe_reserve_mutex);
4357 	mutex_init(&adev->gfx.gfx_off_mutex);
4358 	mutex_init(&adev->gfx.partition_mutex);
4359 	mutex_init(&adev->grbm_idx_mutex);
4360 	mutex_init(&adev->mn_lock);
4361 	mutex_init(&adev->virt.vf_errors.lock);
4362 	hash_init(adev->mn_hash);
4363 	mutex_init(&adev->psp.mutex);
4364 	mutex_init(&adev->notifier_lock);
4365 	mutex_init(&adev->pm.stable_pstate_ctx_lock);
4366 	mutex_init(&adev->benchmark_mutex);
4367 	mutex_init(&adev->gfx.reset_sem_mutex);
4368 	/* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
4369 	mutex_init(&adev->enforce_isolation_mutex);
4370 	for (i = 0; i < MAX_XCP; ++i) {
4371 		adev->isolation[i].spearhead = dma_fence_get_stub();
4372 		amdgpu_sync_create(&adev->isolation[i].active);
4373 		amdgpu_sync_create(&adev->isolation[i].prev);
4374 	}
4375 	mutex_init(&adev->gfx.userq_sch_mutex);
4376 	mutex_init(&adev->gfx.workload_profile_mutex);
4377 	mutex_init(&adev->vcn.workload_profile_mutex);
4378 	mutex_init(&adev->userq_mutex);
4379 
4380 	amdgpu_device_init_apu_flags(adev);
4381 
4382 	r = amdgpu_device_check_arguments(adev);
4383 	if (r)
4384 		return r;
4385 
4386 	spin_lock_init(&adev->mmio_idx_lock);
4387 	spin_lock_init(&adev->smc_idx_lock);
4388 	spin_lock_init(&adev->pcie_idx_lock);
4389 	spin_lock_init(&adev->uvd_ctx_idx_lock);
4390 	spin_lock_init(&adev->didt_idx_lock);
4391 	spin_lock_init(&adev->gc_cac_idx_lock);
4392 	spin_lock_init(&adev->se_cac_idx_lock);
4393 	spin_lock_init(&adev->audio_endpt_idx_lock);
4394 	spin_lock_init(&adev->mm_stats.lock);
4395 	spin_lock_init(&adev->virt.rlcg_reg_lock);
4396 	spin_lock_init(&adev->wb.lock);
4397 
4398 	xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ);
4399 
4400 	INIT_LIST_HEAD(&adev->reset_list);
4401 
4402 	INIT_LIST_HEAD(&adev->ras_list);
4403 
4404 	INIT_LIST_HEAD(&adev->pm.od_kobj_list);
4405 
4406 	INIT_LIST_HEAD(&adev->userq_mgr_list);
4407 
4408 	INIT_DELAYED_WORK(&adev->delayed_init_work,
4409 			  amdgpu_device_delayed_init_work_handler);
4410 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
4411 			  amdgpu_device_delay_enable_gfx_off);
4412 	/*
4413 	 * Initialize the enforce_isolation work structures for each XCP
4414 	 * partition.  This work handler is responsible for enforcing shader
4415 	 * isolation on AMD GPUs.  It counts the number of emitted fences for
4416 	 * each GFX and compute ring.  If there are any fences, it schedules
4417 	 * the `enforce_isolation_work` to be run after a delay.  If there are
4418 	 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
4419 	 * runqueue.
4420 	 */
4421 	for (i = 0; i < MAX_XCP; i++) {
4422 		INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
4423 				  amdgpu_gfx_enforce_isolation_handler);
4424 		adev->gfx.enforce_isolation[i].adev = adev;
4425 		adev->gfx.enforce_isolation[i].xcp_id = i;
4426 	}
4427 
4428 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
4429 
4430 	adev->gfx.gfx_off_req_count = 1;
4431 	adev->gfx.gfx_off_residency = 0;
4432 	adev->gfx.gfx_off_entrycount = 0;
4433 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
4434 
4435 	atomic_set(&adev->throttling_logging_enabled, 1);
4436 	/*
4437 	 * If throttling continues, logging will be performed every minute
4438 	 * to avoid log flooding. "-1" is subtracted since the thermal
4439 	 * throttling interrupt comes every second. Thus, the total logging
4440 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
4441 	 * for throttling interrupt) = 60 seconds.
4442 	 */
4443 	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
4444 
4445 	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
4446 
4447 	/* Registers mapping */
4448 	/* TODO: block userspace mapping of io register */
4449 	if (adev->asic_type >= CHIP_BONAIRE) {
4450 		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
4451 		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
4452 	} else {
4453 		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
4454 		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
4455 	}
4456 
4457 	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
4458 		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
4459 
4460 	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
4461 	if (!adev->rmmio)
4462 		return -ENOMEM;
4463 
4464 	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
4465 	DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
4466 
4467 	/*
4468 	 * The reset domain needs to be present early, before any XGMI hive is
4469 	 * discovered and initialized, so that the reset sem and in_gpu reset flag
4470 	 * can be used early on during init and before calling RREG32.
4471 	 */
4472 	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
4473 	if (!adev->reset_domain)
4474 		return -ENOMEM;
4475 
4476 	/* detect hw virtualization here */
4477 	amdgpu_virt_init(adev);
4478 
4479 	amdgpu_device_get_pcie_info(adev);
4480 
4481 	r = amdgpu_device_get_job_timeout_settings(adev);
4482 	if (r) {
4483 		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4484 		return r;
4485 	}
4486 
4487 	amdgpu_device_set_mcbp(adev);
4488 
4489 	/*
4490 	 * By default, use the default mode in which all blocks are expected to be
4491 	 * initialized. At present, a 'swinit' of the blocks is required to be
4492 	 * completed before the need for a different level is detected.
4493 	 */
4494 	amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
4495 	/* early init functions */
4496 	r = amdgpu_device_ip_early_init(adev);
4497 	if (r)
4498 		return r;
4499 
4500 	/*
4501 	 * No need to remove conflicting FBs for non-display class devices.
4502 	 * This prevents the sysfb from being freed accidentally.
4503 	 */
4504 	if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA ||
4505 	    (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) {
4506 		/* Get rid of things like offb */
4507 		r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
4508 		if (r)
4509 			return r;
4510 	}
4511 
4512 	/* Enable TMZ based on IP_VERSION */
4513 	amdgpu_gmc_tmz_set(adev);
4514 
4515 	if (amdgpu_sriov_vf(adev) &&
4516 	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
4517 		/* VF MMIO access (except mailbox range) from CPU
4518 		 * will be blocked during sriov runtime
4519 		 */
4520 		adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
4521 
4522 	amdgpu_gmc_noretry_set(adev);
4523 	/* Need to get xgmi info early to decide the reset behavior*/
4524 	if (adev->gmc.xgmi.supported) {
4525 		r = adev->gfxhub.funcs->get_xgmi_info(adev);
4526 		if (r)
4527 			return r;
4528 	}
4529 
4530 	/* enable PCIE atomic ops */
4531 	if (amdgpu_sriov_vf(adev)) {
4532 		if (adev->virt.fw_reserve.p_pf2vf)
4533 			adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
4534 						      adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
4535 				(PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4536 	/* APUs with gfx9 onwards don't rely on PCIe atomics; their internal
4537 	 * path natively supports atomics, so set have_atomics_support to true.
4538 	 */
4539 	} else if ((adev->flags & AMD_IS_APU) &&
4540 		   (amdgpu_ip_version(adev, GC_HWIP, 0) >
4541 		    IP_VERSION(9, 0, 0))) {
4542 		adev->have_atomics_support = true;
4543 	} else {
4544 		adev->have_atomics_support =
4545 			!pci_enable_atomic_ops_to_root(adev->pdev,
4546 					  PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
4547 					  PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4548 	}
4549 
4550 	if (!adev->have_atomics_support)
4551 		dev_info(adev->dev, "PCIE atomic ops is not supported\n");
4552 
4553 	/* doorbell bar mapping and doorbell index init*/
4554 	amdgpu_doorbell_init(adev);
4555 
4556 	if (amdgpu_emu_mode == 1) {
4557 		/* post the asic on emulation mode */
4558 		emu_soc_asic_init(adev);
4559 		goto fence_driver_init;
4560 	}
4561 
4562 	amdgpu_reset_init(adev);
4563 
4564 	/* detect if we are with an SRIOV vbios */
4565 	if (adev->bios)
4566 		amdgpu_device_detect_sriov_bios(adev);
4567 
4568 	/* check if we need to reset the asic
4569 	 *  E.g., driver was not cleanly unloaded previously, etc.
4570 	 */
4571 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
4572 		if (adev->gmc.xgmi.num_physical_nodes) {
4573 			dev_info(adev->dev, "Pending hive reset.\n");
4574 			amdgpu_set_init_level(adev,
4575 					      AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
4576 		} else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
4577 				   !amdgpu_device_has_display_hardware(adev)) {
4578 					r = psp_gpu_reset(adev);
4579 		} else {
4580 				tmp = amdgpu_reset_method;
4581 				/* It should do a default reset when loading or reloading the driver,
4582 				 * regardless of the module parameter reset_method.
4583 				 */
4584 				amdgpu_reset_method = AMD_RESET_METHOD_NONE;
4585 				r = amdgpu_asic_reset(adev);
4586 				amdgpu_reset_method = tmp;
4587 		}
4588 
4589 		if (r) {
4590 			dev_err(adev->dev, "asic reset on init failed\n");
4591 			goto failed;
4592 		}
4593 	}
4594 
4595 	/* Post card if necessary */
4596 	if (amdgpu_device_need_post(adev)) {
4597 		if (!adev->bios) {
4598 			dev_err(adev->dev, "no vBIOS found\n");
4599 			r = -EINVAL;
4600 			goto failed;
4601 		}
4602 		DRM_INFO("GPU posting now...\n");
4603 		r = amdgpu_device_asic_init(adev);
4604 		if (r) {
4605 			dev_err(adev->dev, "gpu post error!\n");
4606 			goto failed;
4607 		}
4608 	}
4609 
4610 	if (adev->bios) {
4611 		if (adev->is_atom_fw) {
4612 			/* Initialize clocks */
4613 			r = amdgpu_atomfirmware_get_clock_info(adev);
4614 			if (r) {
4615 				dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
4616 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4617 				goto failed;
4618 			}
4619 		} else {
4620 			/* Initialize clocks */
4621 			r = amdgpu_atombios_get_clock_info(adev);
4622 			if (r) {
4623 				dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
4624 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4625 				goto failed;
4626 			}
4627 			/* init i2c buses */
4628 			amdgpu_i2c_init(adev);
4629 		}
4630 	}
4631 
4632 fence_driver_init:
4633 	/* Fence driver */
4634 	r = amdgpu_fence_driver_sw_init(adev);
4635 	if (r) {
4636 		dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
4637 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
4638 		goto failed;
4639 	}
4640 
4641 	/* init the mode config */
4642 	drm_mode_config_init(adev_to_drm(adev));
4643 
4644 	r = amdgpu_device_ip_init(adev);
4645 	if (r) {
4646 		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
4647 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
4648 		goto release_ras_con;
4649 	}
4650 
4651 	amdgpu_fence_driver_hw_init(adev);
4652 
4653 	dev_info(adev->dev,
4654 		"SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
4655 			adev->gfx.config.max_shader_engines,
4656 			adev->gfx.config.max_sh_per_se,
4657 			adev->gfx.config.max_cu_per_sh,
4658 			adev->gfx.cu_info.number);
4659 
4660 	adev->accel_working = true;
4661 
4662 	amdgpu_vm_check_compute_bug(adev);
4663 
4664 	/* Initialize the buffer migration limit. */
4665 	if (amdgpu_moverate >= 0)
4666 		max_MBps = amdgpu_moverate;
4667 	else
4668 		max_MBps = 8; /* Allow 8 MB/s. */
4669 	/* Get a log2 for easy divisions. */
4670 	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
4671 
4672 	/*
4673 	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
4674 	 * Otherwise the mgpu fan boost feature will be skipped because the
4675 	 * gpu instance count would be too low.
4676 	 */
4677 	amdgpu_register_gpu_instance(adev);
4678 
4679 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
4680 	 * explicit gating rather than handling it automatically.
4681 	 */
4682 	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
4683 		r = amdgpu_device_ip_late_init(adev);
4684 		if (r) {
4685 			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
4686 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
4687 			goto release_ras_con;
4688 		}
4689 		/* must succeed. */
4690 		amdgpu_ras_resume(adev);
4691 		queue_delayed_work(system_wq, &adev->delayed_init_work,
4692 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
4693 	}
4694 
4695 	if (amdgpu_sriov_vf(adev)) {
4696 		amdgpu_virt_release_full_gpu(adev, true);
4697 		flush_delayed_work(&adev->delayed_init_work);
4698 	}
4699 
4700 	/*
4701 	 * Register these sysfs interfaces after `late_init`, since some of the
4702 	 * operations performed in `late_init` might affect how the sysfs
4703 	 * interfaces are created.
4704 	 */
4705 	r = amdgpu_atombios_sysfs_init(adev);
4706 	if (r)
4707 		drm_err(&adev->ddev,
4708 			"registering atombios sysfs failed (%d).\n", r);
4709 
4710 	r = amdgpu_pm_sysfs_init(adev);
4711 	if (r)
4712 		DRM_ERROR("registering pm sysfs failed (%d).\n", r);
4713 
4714 	r = amdgpu_ucode_sysfs_init(adev);
4715 	if (r) {
4716 		adev->ucode_sysfs_en = false;
4717 		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
4718 	} else
4719 		adev->ucode_sysfs_en = true;
4720 
4721 	r = amdgpu_device_attr_sysfs_init(adev);
4722 	if (r)
4723 		dev_err(adev->dev, "Could not create amdgpu device attr\n");
4724 
4725 	r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
4726 	if (r)
4727 		dev_err(adev->dev,
4728 			"Could not create amdgpu board attributes\n");
4729 
4730 	amdgpu_fru_sysfs_init(adev);
4731 	amdgpu_reg_state_sysfs_init(adev);
4732 	amdgpu_xcp_sysfs_init(adev);
4733 
4734 	if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
4735 		r = amdgpu_pmu_init(adev);
4736 		if (r)
4737 			dev_err(adev->dev, "amdgpu_pmu_init failed\n");
	}
4738 
4739 	/* Keep the stored PCI config space at hand for restore after a sudden PCI error */
4740 	if (amdgpu_device_cache_pci_state(adev->pdev))
4741 		pci_restore_state(pdev);
4742 
4743 	/* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
4744 	/* this will fail for cards that aren't VGA class devices, just
4745 	 * ignore it
4746 	 */
4747 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4748 		vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
4749 
4750 	px = amdgpu_device_supports_px(ddev);
4751 
4752 	if (px || (!dev_is_removable(&adev->pdev->dev) &&
4753 				apple_gmux_detect(NULL, NULL)))
4754 		vga_switcheroo_register_client(adev->pdev,
4755 					       &amdgpu_switcheroo_ops, px);
4756 
4757 	if (px)
4758 		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
4759 
4760 	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
4761 		amdgpu_xgmi_reset_on_init(adev);
4762 
4763 	amdgpu_device_check_iommu_direct_map(adev);
4764 
4765 	adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
4766 	r = register_pm_notifier(&adev->pm_nb);
4767 	if (r)
4768 		goto failed;
4769 
4770 	return 0;
4771 
4772 release_ras_con:
4773 	if (amdgpu_sriov_vf(adev))
4774 		amdgpu_virt_release_full_gpu(adev, true);
4775 
4776 	/* failed in exclusive mode due to timeout */
4777 	if (amdgpu_sriov_vf(adev) &&
4778 		!amdgpu_sriov_runtime(adev) &&
4779 		amdgpu_virt_mmio_blocked(adev) &&
4780 		!amdgpu_virt_wait_reset(adev)) {
4781 		dev_err(adev->dev, "VF exclusive mode timeout\n");
4782 		/* Don't send request since VF is inactive. */
4783 		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
4784 		adev->virt.ops = NULL;
4785 		r = -EAGAIN;
4786 	}
4787 	amdgpu_release_ras_context(adev);
4788 
4789 failed:
4790 	amdgpu_vf_error_trans_all(adev);
4791 
4792 	return r;
4793 }
4794 
4795 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4796 {
4797 
4798 	/* Clear all CPU mappings pointing to this device */
4799 	unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4800 
4801 	/* Unmap all mapped bars - Doorbell, registers and VRAM */
4802 	amdgpu_doorbell_fini(adev);
4803 
4804 	iounmap(adev->rmmio);
4805 	adev->rmmio = NULL;
4806 	if (adev->mman.aper_base_kaddr)
4807 		iounmap(adev->mman.aper_base_kaddr);
4808 	adev->mman.aper_base_kaddr = NULL;
4809 
4810 	/* Memory manager related */
4811 	if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4812 		arch_phys_wc_del(adev->gmc.vram_mtrr);
4813 		arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4814 	}
4815 }
4816 
4817 /**
4818  * amdgpu_device_fini_hw - tear down the driver
4819  *
4820  * @adev: amdgpu_device pointer
4821  *
4822  * Tear down the driver info (all asics).
4823  * Called at driver shutdown.
4824  */
4825 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4826 {
4827 	dev_info(adev->dev, "amdgpu: finishing device.\n");
4828 	flush_delayed_work(&adev->delayed_init_work);
4829 
4830 	if (adev->mman.initialized)
4831 		drain_workqueue(adev->mman.bdev.wq);
4832 	adev->shutdown = true;
4833 
4834 	unregister_pm_notifier(&adev->pm_nb);
4835 
4836 	/* make sure the IB tests have finished before entering exclusive mode
4837 	 * to avoid preemption during the IB tests
4838 	 */
4839 	if (amdgpu_sriov_vf(adev)) {
4840 		amdgpu_virt_request_full_gpu(adev, false);
4841 		amdgpu_virt_fini_data_exchange(adev);
4842 	}
4843 
4844 	/* disable all interrupts */
4845 	amdgpu_irq_disable_all(adev);
4846 	if (adev->mode_info.mode_config_initialized) {
4847 		if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4848 			drm_helper_force_disable_all(adev_to_drm(adev));
4849 		else
4850 			drm_atomic_helper_shutdown(adev_to_drm(adev));
4851 	}
4852 	amdgpu_fence_driver_hw_fini(adev);
4853 
4854 	if (adev->pm.sysfs_initialized)
4855 		amdgpu_pm_sysfs_fini(adev);
4856 	if (adev->ucode_sysfs_en)
4857 		amdgpu_ucode_sysfs_fini(adev);
4858 	amdgpu_device_attr_sysfs_fini(adev);
4859 	amdgpu_fru_sysfs_fini(adev);
4860 
4861 	amdgpu_reg_state_sysfs_fini(adev);
4862 	amdgpu_xcp_sysfs_fini(adev);
4863 
4864 	/* disable ras feature must before hw fini */
4865 	amdgpu_ras_pre_fini(adev);
4866 
4867 	amdgpu_ttm_set_buffer_funcs_status(adev, false);
4868 
4869 	amdgpu_device_ip_fini_early(adev);
4870 
4871 	amdgpu_irq_fini_hw(adev);
4872 
4873 	if (adev->mman.initialized)
4874 		ttm_device_clear_dma_mappings(&adev->mman.bdev);
4875 
4876 	amdgpu_gart_dummy_page_fini(adev);
4877 
4878 	if (drm_dev_is_unplugged(adev_to_drm(adev)))
4879 		amdgpu_device_unmap_mmio(adev);
4880 
4881 }
4882 
4883 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4884 {
4885 	int i, idx;
4886 	bool px;
4887 
4888 	amdgpu_device_ip_fini(adev);
4889 	amdgpu_fence_driver_sw_fini(adev);
4890 	amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4891 	adev->accel_working = false;
4892 	dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4893 	for (i = 0; i < MAX_XCP; ++i) {
4894 		dma_fence_put(adev->isolation[i].spearhead);
4895 		amdgpu_sync_free(&adev->isolation[i].active);
4896 		amdgpu_sync_free(&adev->isolation[i].prev);
4897 	}
4898 
4899 	amdgpu_reset_fini(adev);
4900 
4901 	/* free i2c buses */
4902 	amdgpu_i2c_fini(adev);
4903 
4904 	if (adev->bios) {
4905 		if (amdgpu_emu_mode != 1)
4906 			amdgpu_atombios_fini(adev);
4907 		amdgpu_bios_release(adev);
4908 	}
4909 
4910 	kfree(adev->fru_info);
4911 	adev->fru_info = NULL;
4912 
4913 	kfree(adev->xcp_mgr);
4914 	adev->xcp_mgr = NULL;
4915 
4916 	px = amdgpu_device_supports_px(adev_to_drm(adev));
4917 
4918 	if (px || (!dev_is_removable(&adev->pdev->dev) &&
4919 				apple_gmux_detect(NULL, NULL)))
4920 		vga_switcheroo_unregister_client(adev->pdev);
4921 
4922 	if (px)
4923 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
4924 
4925 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4926 		vga_client_unregister(adev->pdev);
4927 
4928 	if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4929 
4930 		iounmap(adev->rmmio);
4931 		adev->rmmio = NULL;
4932 		drm_dev_exit(idx);
4933 	}
4934 
4935 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
4936 		amdgpu_pmu_fini(adev);
4937 	if (adev->mman.discovery_bin)
4938 		amdgpu_discovery_fini(adev);
4939 
4940 	amdgpu_reset_put_reset_domain(adev->reset_domain);
4941 	adev->reset_domain = NULL;
4942 
4943 	kfree(adev->pci_state);
4944 
4945 }
4946 
4947 /**
4948  * amdgpu_device_evict_resources - evict device resources
4949  * @adev: amdgpu device object
4950  *
4951  * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4952  * of the vram memory type. Mainly used for evicting device resources
4953  * at suspend time.
4954  *
4955  */
4956 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4957 {
4958 	int ret;
4959 
4960 	/* No need to evict vram on APUs unless going to S4 */
4961 	if (!adev->in_s4 && (adev->flags & AMD_IS_APU))
4962 		return 0;
4963 
4964 	ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4965 	if (ret)
4966 		DRM_WARN("evicting device resources failed\n");
4967 	return ret;
4968 }
4969 
4970 /*
4971  * Suspend & resume.
4972  */
4973 /**
4974  * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events
4975  * @nb: notifier block
4976  * @mode: suspend mode
4977  * @data: unused
4978  *
4979  * This function is called when the system is about to suspend or hibernate.
4980  * It is used to set the appropriate flags so that eviction can be optimized
4981  * in the pm prepare callback.
4982  */
4983 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
4984 				     void *data)
4985 {
4986 	struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb);
4987 
4988 	switch (mode) {
4989 	case PM_HIBERNATION_PREPARE:
4990 		adev->in_s4 = true;
4991 		break;
4992 	case PM_POST_HIBERNATION:
4993 		adev->in_s4 = false;
4994 		break;
4995 	}
4996 
4997 	return NOTIFY_DONE;
4998 }
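
/*
 * Editorial note, not part of the driver source: the function above is a
 * standard PM notifier_block callback. A minimal sketch of how such a
 * notifier is typically wired up (amdgpu registers adev->pm_nb during device
 * init and removes it again on fini):
 *
 *	adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
 *	register_pm_notifier(&adev->pm_nb);
 *	...
 *	unregister_pm_notifier(&adev->pm_nb);
 */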
4999 
5000 /**
5001  * amdgpu_device_prepare - prepare for device suspend
5002  *
5003  * @dev: drm dev pointer
5004  *
5005  * Prepare to put the hw in the suspend state (all asics).
5006  * Returns 0 for success or an error on failure.
5007  * Called at driver suspend.
5008  */
5009 int amdgpu_device_prepare(struct drm_device *dev)
5010 {
5011 	struct amdgpu_device *adev = drm_to_adev(dev);
5012 	int i, r;
5013 
5014 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
5015 		return 0;
5016 
5017 	/* Evict the majority of BOs before starting suspend sequence */
5018 	r = amdgpu_device_evict_resources(adev);
5019 	if (r)
5020 		return r;
5021 
5022 	flush_delayed_work(&adev->gfx.gfx_off_delay_work);
5023 
5024 	for (i = 0; i < adev->num_ip_blocks; i++) {
5025 		if (!adev->ip_blocks[i].status.valid)
5026 			continue;
5027 		if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
5028 			continue;
5029 		r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]);
5030 		if (r)
5031 			return r;
5032 	}
5033 
5034 	return 0;
5035 }
5036 
5037 /**
5038  * amdgpu_device_suspend - initiate device suspend
5039  *
5040  * @dev: drm dev pointer
5041  * @notify_clients: notify in-kernel DRM clients
5042  *
5043  * Puts the hw in the suspend state (all asics).
5044  * Returns 0 for success or an error on failure.
5045  * Called at driver suspend.
5046  */
5047 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
5048 {
5049 	struct amdgpu_device *adev = drm_to_adev(dev);
5050 	int r = 0;
5051 
5052 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
5053 		return 0;
5054 
5055 	adev->in_suspend = true;
5056 
5057 	if (amdgpu_sriov_vf(adev)) {
5058 		amdgpu_virt_fini_data_exchange(adev);
5059 		r = amdgpu_virt_request_full_gpu(adev, false);
5060 		if (r)
5061 			return r;
5062 	}
5063 
5064 	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
5065 		DRM_WARN("smart shift update failed\n");
5066 
5067 	if (notify_clients)
5068 		drm_client_dev_suspend(adev_to_drm(adev), false);
5069 
5070 	cancel_delayed_work_sync(&adev->delayed_init_work);
5071 
5072 	amdgpu_ras_suspend(adev);
5073 
5074 	amdgpu_device_ip_suspend_phase1(adev);
5075 
5076 	if (!adev->in_s0ix) {
5077 		amdgpu_amdkfd_suspend(adev, adev->in_runpm);
5078 		amdgpu_userq_suspend(adev);
5079 	}
5080 
5081 	r = amdgpu_device_evict_resources(adev);
5082 	if (r)
5083 		return r;
5084 
5085 	amdgpu_ttm_set_buffer_funcs_status(adev, false);
5086 
5087 	amdgpu_fence_driver_hw_fini(adev);
5088 
5089 	amdgpu_device_ip_suspend_phase2(adev);
5090 
5091 	if (amdgpu_sriov_vf(adev))
5092 		amdgpu_virt_release_full_gpu(adev, false);
5093 
5094 	r = amdgpu_dpm_notify_rlc_state(adev, false);
5095 	if (r)
5096 		return r;
5097 
5098 	return 0;
5099 }
5100 
5101 /**
5102  * amdgpu_device_resume - initiate device resume
5103  *
5104  * @dev: drm dev pointer
5105  * @notify_clients: notify in-kernel DRM clients
5106  *
5107  * Bring the hw back to operating state (all asics).
5108  * Returns 0 for success or an error on failure.
5109  * Called at driver resume.
5110  */
5111 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
5112 {
5113 	struct amdgpu_device *adev = drm_to_adev(dev);
5114 	int r = 0;
5115 
5116 	if (amdgpu_sriov_vf(adev)) {
5117 		r = amdgpu_virt_request_full_gpu(adev, true);
5118 		if (r)
5119 			return r;
5120 	}
5121 
5122 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
5123 		return 0;
5124 
5125 	if (adev->in_s0ix)
5126 		amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
5127 
5128 	/* post card */
5129 	if (amdgpu_device_need_post(adev)) {
5130 		r = amdgpu_device_asic_init(adev);
5131 		if (r)
5132 			dev_err(adev->dev, "amdgpu asic init failed\n");
5133 	}
5134 
5135 	r = amdgpu_device_ip_resume(adev);
5136 
5137 	if (r) {
5138 		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
5139 		goto exit;
5140 	}
5141 
5142 	if (!adev->in_s0ix) {
5143 		r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
5144 		if (r)
5145 			goto exit;
5146 
5147 		r = amdgpu_userq_resume(adev);
5148 		if (r)
5149 			goto exit;
5150 	}
5151 
5152 	r = amdgpu_device_ip_late_init(adev);
5153 	if (r)
5154 		goto exit;
5155 
5156 	queue_delayed_work(system_wq, &adev->delayed_init_work,
5157 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
5158 exit:
5159 	if (amdgpu_sriov_vf(adev)) {
5160 		amdgpu_virt_init_data_exchange(adev);
5161 		amdgpu_virt_release_full_gpu(adev, true);
5162 	}
5163 
5164 	if (r)
5165 		return r;
5166 
5167 	/* Make sure IB tests flushed */
5168 	flush_delayed_work(&adev->delayed_init_work);
5169 
5170 	if (notify_clients)
5171 		drm_client_dev_resume(adev_to_drm(adev), false);
5172 
5173 	amdgpu_ras_resume(adev);
5174 
5175 	if (adev->mode_info.num_crtc) {
5176 		/*
5177 		 * Most of the connector probing functions try to acquire runtime pm
5178 		 * refs to ensure that the GPU is powered on when connector polling is
5179 		 * performed. Since we're calling this from a runtime PM callback,
5180 		 * trying to acquire rpm refs will cause us to deadlock.
5181 		 *
5182 		 * Since we're guaranteed to be holding the rpm lock, it's safe to
5183 		 * temporarily disable the rpm helpers so this doesn't deadlock us.
5184 		 */
5185 #ifdef CONFIG_PM
5186 		dev->dev->power.disable_depth++;
5187 #endif
5188 		if (!adev->dc_enabled)
5189 			drm_helper_hpd_irq_event(dev);
5190 		else
5191 			drm_kms_helper_hotplug_event(dev);
5192 #ifdef CONFIG_PM
5193 		dev->dev->power.disable_depth--;
5194 #endif
5195 	}
5196 
5197 	amdgpu_vram_mgr_clear_reset_blocks(adev);
5198 	adev->in_suspend = false;
5199 
5200 	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
5201 		DRM_WARN("smart shift update failed\n");
5202 
5203 	return 0;
5204 }
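
/*
 * Editorial sketch, not part of the driver source: the prepare/suspend/resume
 * helpers above are normally driven by the PCI driver's dev_pm_ops glue in
 * amdgpu_drv.c, roughly along these lines (the real callbacks also set
 * adev->in_s0ix/in_s3 and handle runtime-PM corner cases first):
 *
 *	static int amdgpu_pmops_suspend(struct device *dev)
 *	{
 *		struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *		return amdgpu_device_suspend(drm_dev, true);
 *	}
 *
 *	static int amdgpu_pmops_resume(struct device *dev)
 *	{
 *		struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *		return amdgpu_device_resume(drm_dev, true);
 *	}
 */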
5205 
5206 /**
5207  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
5208  *
5209  * @adev: amdgpu_device pointer
5210  *
5211  * The list of all the hardware IPs that make up the asic is walked and
5212  * the check_soft_reset callbacks are run.  check_soft_reset determines
5213  * if the asic is still hung or not.
5214  * Returns true if any of the IPs are still in a hung state, false if not.
5215  */
5216 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
5217 {
5218 	int i;
5219 	bool asic_hang = false;
5220 
5221 	if (amdgpu_sriov_vf(adev))
5222 		return true;
5223 
5224 	if (amdgpu_asic_need_full_reset(adev))
5225 		return true;
5226 
5227 	for (i = 0; i < adev->num_ip_blocks; i++) {
5228 		if (!adev->ip_blocks[i].status.valid)
5229 			continue;
5230 		if (adev->ip_blocks[i].version->funcs->check_soft_reset)
5231 			adev->ip_blocks[i].status.hang =
5232 				adev->ip_blocks[i].version->funcs->check_soft_reset(
5233 					&adev->ip_blocks[i]);
5234 		if (adev->ip_blocks[i].status.hang) {
5235 			dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
5236 			asic_hang = true;
5237 		}
5238 	}
5239 	return asic_hang;
5240 }
5241 
5242 /**
5243  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
5244  *
5245  * @adev: amdgpu_device pointer
5246  *
5247  * The list of all the hardware IPs that make up the asic is walked and the
5248  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
5249  * handles any IP specific hardware or software state changes that are
5250  * necessary for a soft reset to succeed.
5251  * Returns 0 on success, negative error code on failure.
5252  */
5253 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
5254 {
5255 	int i, r = 0;
5256 
5257 	for (i = 0; i < adev->num_ip_blocks; i++) {
5258 		if (!adev->ip_blocks[i].status.valid)
5259 			continue;
5260 		if (adev->ip_blocks[i].status.hang &&
5261 		    adev->ip_blocks[i].version->funcs->pre_soft_reset) {
5262 			r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]);
5263 			if (r)
5264 				return r;
5265 		}
5266 	}
5267 
5268 	return 0;
5269 }
5270 
5271 /**
5272  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
5273  *
5274  * @adev: amdgpu_device pointer
5275  *
5276  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
5277  * reset is necessary to recover.
5278  * Returns true if a full asic reset is required, false if not.
5279  */
5280 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
5281 {
5282 	int i;
5283 
5284 	if (amdgpu_asic_need_full_reset(adev))
5285 		return true;
5286 
5287 	for (i = 0; i < adev->num_ip_blocks; i++) {
5288 		if (!adev->ip_blocks[i].status.valid)
5289 			continue;
5290 		if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
5291 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
5292 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
5293 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
5294 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
5295 			if (adev->ip_blocks[i].status.hang) {
5296 				dev_info(adev->dev, "Some block need full reset!\n");
5297 				return true;
5298 			}
5299 		}
5300 	}
5301 	return false;
5302 }
5303 
5304 /**
5305  * amdgpu_device_ip_soft_reset - do a soft reset
5306  *
5307  * @adev: amdgpu_device pointer
5308  *
5309  * The list of all the hardware IPs that make up the asic is walked and the
5310  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
5311  * IP specific hardware or software state changes that are necessary to soft
5312  * reset the IP.
5313  * Returns 0 on success, negative error code on failure.
5314  */
5315 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
5316 {
5317 	int i, r = 0;
5318 
5319 	for (i = 0; i < adev->num_ip_blocks; i++) {
5320 		if (!adev->ip_blocks[i].status.valid)
5321 			continue;
5322 		if (adev->ip_blocks[i].status.hang &&
5323 		    adev->ip_blocks[i].version->funcs->soft_reset) {
5324 			r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]);
5325 			if (r)
5326 				return r;
5327 		}
5328 	}
5329 
5330 	return 0;
5331 }
5332 
5333 /**
5334  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
5335  *
5336  * @adev: amdgpu_device pointer
5337  *
5338  * The list of all the hardware IPs that make up the asic is walked and the
5339  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
5340  * handles any IP specific hardware or software state changes that are
5341  * necessary after the IP has been soft reset.
5342  * Returns 0 on success, negative error code on failure.
5343  */
5344 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
5345 {
5346 	int i, r = 0;
5347 
5348 	for (i = 0; i < adev->num_ip_blocks; i++) {
5349 		if (!adev->ip_blocks[i].status.valid)
5350 			continue;
5351 		if (adev->ip_blocks[i].status.hang &&
5352 		    adev->ip_blocks[i].version->funcs->post_soft_reset)
5353 			r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]);
5354 		if (r)
5355 			return r;
5356 	}
5357 
5358 	return 0;
5359 }
5360 
5361 /**
5362  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5363  *
5364  * @adev: amdgpu_device pointer
5365  * @reset_context: amdgpu reset context pointer
5366  *
5367  * do VF FLR and reinitialize Asic
5368  * return 0 means succeeded otherwise failed
5369  */
5370 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
5371 				     struct amdgpu_reset_context *reset_context)
5372 {
5373 	int r;
5374 	struct amdgpu_hive_info *hive = NULL;
5375 
5376 	if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
5377 		if (!amdgpu_ras_get_fed_status(adev))
5378 			amdgpu_virt_ready_to_reset(adev);
5379 		amdgpu_virt_wait_reset(adev);
5380 		clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5381 		r = amdgpu_virt_request_full_gpu(adev, true);
5382 	} else {
5383 		r = amdgpu_virt_reset_gpu(adev);
5384 	}
5385 	if (r)
5386 		return r;
5387 
5388 	amdgpu_ras_clear_err_state(adev);
5389 	amdgpu_irq_gpu_reset_resume_helper(adev);
5390 
5391 	/* some SW cleanup the VF needs to do before recovery */
5392 	amdgpu_virt_post_reset(adev);
5393 
5394 	/* Resume IP prior to SMC */
5395 	r = amdgpu_device_ip_reinit_early_sriov(adev);
5396 	if (r)
5397 		return r;
5398 
5399 	amdgpu_virt_init_data_exchange(adev);
5400 
5401 	r = amdgpu_device_fw_loading(adev);
5402 	if (r)
5403 		return r;
5404 
5405 	/* now we are okay to resume SMC/CP/SDMA */
5406 	r = amdgpu_device_ip_reinit_late_sriov(adev);
5407 	if (r)
5408 		return r;
5409 
5410 	hive = amdgpu_get_xgmi_hive(adev);
5411 	/* Update PSP FW topology after reset */
5412 	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
5413 		r = amdgpu_xgmi_update_topology(hive, adev);
5414 	if (hive)
5415 		amdgpu_put_xgmi_hive(hive);
5416 	if (r)
5417 		return r;
5418 
5419 	r = amdgpu_ib_ring_tests(adev);
5420 	if (r)
5421 		return r;
5422 
5423 	if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
5424 		amdgpu_inc_vram_lost(adev);
5425 
5426 	/* needs to be called during full access, so we can't do it later like
5427 	 * bare-metal does.
5428 	 */
5429 	amdgpu_amdkfd_post_reset(adev);
5430 	amdgpu_virt_release_full_gpu(adev, true);
5431 
5432 	/* Aldebaran and gfx_11_0_3 support RAS in SRIOV, so we need to resume RAS during reset */
5433 	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
5434 	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
5435 	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
5436 	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
5437 	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
5438 		amdgpu_ras_resume(adev);
5439 
5440 	amdgpu_virt_ras_telemetry_post_reset(adev);
5441 
5442 	return 0;
5443 }
5444 
5445 /**
5446  * amdgpu_device_has_job_running - check if there is any unfinished job
5447  *
5448  * @adev: amdgpu_device pointer
5449  *
5450  * check if there is any job running on the device when guest driver receives
5451  * FLR notification from host driver. If there are still jobs running, then
5452  * the guest driver will not respond the FLR reset. Instead, let the job hit
5453  * the timeout and guest driver then issue the reset request.
5454  */
5455 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
5456 {
5457 	int i;
5458 
5459 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5460 		struct amdgpu_ring *ring = adev->rings[i];
5461 
5462 		if (!amdgpu_ring_sched_ready(ring))
5463 			continue;
5464 
5465 		if (amdgpu_fence_count_emitted(ring))
5466 			return true;
5467 	}
5468 	return false;
5469 }
5470 
5471 /**
5472  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
5473  *
5474  * @adev: amdgpu_device pointer
5475  *
5476  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
5477  * a hung GPU.
5478  */
5479 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
5480 {
5481 
5482 	if (amdgpu_gpu_recovery == 0)
5483 		goto disabled;
5484 
5485 	/* Skip soft reset check in fatal error mode */
5486 	if (!amdgpu_ras_is_poison_mode_supported(adev))
5487 		return true;
5488 
5489 	if (amdgpu_sriov_vf(adev))
5490 		return true;
5491 
5492 	if (amdgpu_gpu_recovery == -1) {
5493 		switch (adev->asic_type) {
5494 #ifdef CONFIG_DRM_AMDGPU_SI
5495 		case CHIP_VERDE:
5496 		case CHIP_TAHITI:
5497 		case CHIP_PITCAIRN:
5498 		case CHIP_OLAND:
5499 		case CHIP_HAINAN:
5500 #endif
5501 #ifdef CONFIG_DRM_AMDGPU_CIK
5502 		case CHIP_KAVERI:
5503 		case CHIP_KABINI:
5504 		case CHIP_MULLINS:
5505 #endif
5506 		case CHIP_CARRIZO:
5507 		case CHIP_STONEY:
5508 		case CHIP_CYAN_SKILLFISH:
5509 			goto disabled;
5510 		default:
5511 			break;
5512 		}
5513 	}
5514 
5515 	return true;
5516 
5517 disabled:
5518 	dev_info(adev->dev, "GPU recovery disabled.\n");
5519 	return false;
5520 }
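
/*
 * Editorial sketch, not part of the driver source: callers such as the job
 * timeout handler typically gate recovery on the helper above, roughly:
 *
 *	if (amdgpu_device_should_recover_gpu(ring->adev)) {
 *		...
 *		r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
 *	}
 *
 * The amdgpu_gpu_recovery module parameter that feeds this check uses -1 for
 * the per-ASIC default, 0 to disable recovery and positive values to enable it.
 */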
5521 
5522 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
5523 {
5524 	u32 i;
5525 	int ret = 0;
5526 
5527 	if (adev->bios)
5528 		amdgpu_atombios_scratch_regs_engine_hung(adev, true);
5529 
5530 	dev_info(adev->dev, "GPU mode1 reset\n");
5531 
5532 	/* Cache the state before bus master disable. The saved config space
5533 	 * values are used in other cases like restore after mode-2 reset.
5534 	 */
5535 	amdgpu_device_cache_pci_state(adev->pdev);
5536 
5537 	/* disable BM */
5538 	pci_clear_master(adev->pdev);
5539 
5540 	if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
5541 		dev_info(adev->dev, "GPU smu mode1 reset\n");
5542 		ret = amdgpu_dpm_mode1_reset(adev);
5543 	} else {
5544 		dev_info(adev->dev, "GPU psp mode1 reset\n");
5545 		ret = psp_gpu_reset(adev);
5546 	}
5547 
5548 	if (ret)
5549 		goto mode1_reset_failed;
5550 
5551 	amdgpu_device_load_pci_state(adev->pdev);
5552 	ret = amdgpu_psp_wait_for_bootloader(adev);
5553 	if (ret)
5554 		goto mode1_reset_failed;
5555 
5556 	/* wait for asic to come out of reset */
5557 	for (i = 0; i < adev->usec_timeout; i++) {
5558 		u32 memsize = adev->nbio.funcs->get_memsize(adev);
5559 
5560 		if (memsize != 0xffffffff)
5561 			break;
5562 		udelay(1);
5563 	}
5564 
5565 	if (i >= adev->usec_timeout) {
5566 		ret = -ETIMEDOUT;
5567 		goto mode1_reset_failed;
5568 	}
5569 
5570 	if (adev->bios)
5571 		amdgpu_atombios_scratch_regs_engine_hung(adev, false);
5572 
5573 	return 0;
5574 
5575 mode1_reset_failed:
5576 	dev_err(adev->dev, "GPU mode1 reset failed\n");
5577 	return ret;
5578 }
5579 
5580 int amdgpu_device_link_reset(struct amdgpu_device *adev)
5581 {
5582 	int ret = 0;
5583 
5584 	dev_info(adev->dev, "GPU link reset\n");
5585 
5586 	if (!adev->pcie_reset_ctx.occurs_dpc)
5587 		ret = amdgpu_dpm_link_reset(adev);
5588 
5589 	if (ret)
5590 		goto link_reset_failed;
5591 
5592 	ret = amdgpu_psp_wait_for_bootloader(adev);
5593 	if (ret)
5594 		goto link_reset_failed;
5595 
5596 	return 0;
5597 
5598 link_reset_failed:
5599 	dev_err(adev->dev, "GPU link reset failed\n");
5600 	return ret;
5601 }
5602 
5603 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
5604 				 struct amdgpu_reset_context *reset_context)
5605 {
5606 	int i, r = 0;
5607 	struct amdgpu_job *job = NULL;
5608 	struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
5609 	bool need_full_reset =
5610 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5611 
5612 	if (reset_context->reset_req_dev == adev)
5613 		job = reset_context->job;
5614 
5615 	if (amdgpu_sriov_vf(adev))
5616 		amdgpu_virt_pre_reset(adev);
5617 
5618 	amdgpu_fence_driver_isr_toggle(adev, true);
5619 
5620 	/* block all schedulers and reset given job's ring */
5621 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5622 		struct amdgpu_ring *ring = adev->rings[i];
5623 
5624 		if (!amdgpu_ring_sched_ready(ring))
5625 			continue;
5626 
5627 		/* Clear the job fences from the fence driver to avoid
5628 		 * force_completion leaving NULL and VM flush fences in the fence driver.
5629 		 */
5630 		amdgpu_fence_driver_clear_job_fences(ring);
5631 
5632 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
5633 		amdgpu_fence_driver_force_completion(ring);
5634 	}
5635 
5636 	amdgpu_fence_driver_isr_toggle(adev, false);
5637 
5638 	if (job && job->vm)
5639 		drm_sched_increase_karma(&job->base);
5640 
5641 	r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
5642 	/* If reset handler not implemented, continue; otherwise return */
5643 	if (r == -EOPNOTSUPP)
5644 		r = 0;
5645 	else
5646 		return r;
5647 
5648 	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
5649 	if (!amdgpu_sriov_vf(adev)) {
5650 
5651 		if (!need_full_reset)
5652 			need_full_reset = amdgpu_device_ip_need_full_reset(adev);
5653 
5654 		if (!need_full_reset && amdgpu_gpu_recovery &&
5655 		    amdgpu_device_ip_check_soft_reset(adev)) {
5656 			amdgpu_device_ip_pre_soft_reset(adev);
5657 			r = amdgpu_device_ip_soft_reset(adev);
5658 			amdgpu_device_ip_post_soft_reset(adev);
5659 			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
5660 				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
5661 				need_full_reset = true;
5662 			}
5663 		}
5664 
5665 		if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
5666 			dev_info(tmp_adev->dev, "Dumping IP State\n");
5667 			/* Trigger ip dump before we reset the asic */
5668 			for (i = 0; i < tmp_adev->num_ip_blocks; i++)
5669 				if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
5670 					tmp_adev->ip_blocks[i].version->funcs
5671 						->dump_ip_state((void *)&tmp_adev->ip_blocks[i]);
5672 			dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
5673 		}
5674 
5675 		if (need_full_reset)
5676 			r = amdgpu_device_ip_suspend(adev);
5677 		if (need_full_reset)
5678 			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5679 		else
5680 			clear_bit(AMDGPU_NEED_FULL_RESET,
5681 				  &reset_context->flags);
5682 	}
5683 
5684 	return r;
5685 }
5686 
5687 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
5688 {
5689 	struct list_head *device_list_handle;
5690 	bool full_reset, vram_lost = false;
5691 	struct amdgpu_device *tmp_adev;
5692 	int r, init_level;
5693 
5694 	device_list_handle = reset_context->reset_device_list;
5695 
5696 	if (!device_list_handle)
5697 		return -EINVAL;
5698 
5699 	full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5700 
5701 	/*
5702 	 * If this is a reset on init, use the default init level; otherwise
5703 	 * keep the level as the recovery level.
5704 	 */
5705 	if (reset_context->method == AMD_RESET_METHOD_ON_INIT)
5706 		init_level = AMDGPU_INIT_LEVEL_DEFAULT;
5707 	else
5708 		init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY;
5709 
5710 	r = 0;
5711 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5712 		amdgpu_set_init_level(tmp_adev, init_level);
5713 		if (full_reset) {
5714 			/* post card */
5715 			amdgpu_ras_clear_err_state(tmp_adev);
5716 			r = amdgpu_device_asic_init(tmp_adev);
5717 			if (r) {
5718 				dev_warn(tmp_adev->dev, "asic atom init failed!");
5719 			} else {
5720 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
5721 
5722 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
5723 				if (r)
5724 					goto out;
5725 
5726 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
5727 
5728 				if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
5729 					amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);
5730 
5731 				if (vram_lost) {
5732 					DRM_INFO("VRAM is lost due to GPU reset!\n");
5733 					amdgpu_inc_vram_lost(tmp_adev);
5734 				}
5735 
5736 				r = amdgpu_device_fw_loading(tmp_adev);
5737 				if (r)
5738 					return r;
5739 
5740 				r = amdgpu_xcp_restore_partition_mode(
5741 					tmp_adev->xcp_mgr);
5742 				if (r)
5743 					goto out;
5744 
5745 				r = amdgpu_device_ip_resume_phase2(tmp_adev);
5746 				if (r)
5747 					goto out;
5748 
5749 				if (tmp_adev->mman.buffer_funcs_ring->sched.ready)
5750 					amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);
5751 
5752 				r = amdgpu_device_ip_resume_phase3(tmp_adev);
5753 				if (r)
5754 					goto out;
5755 
5756 				if (vram_lost)
5757 					amdgpu_device_fill_reset_magic(tmp_adev);
5758 
5759 				/*
5760 				 * Add this ASIC back as tracked since the reset
5761 				 * has already completed successfully.
5762 				 */
5763 				amdgpu_register_gpu_instance(tmp_adev);
5764 
5765 				if (!reset_context->hive &&
5766 				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5767 					amdgpu_xgmi_add_device(tmp_adev);
5768 
5769 				r = amdgpu_device_ip_late_init(tmp_adev);
5770 				if (r)
5771 					goto out;
5772 
5773 				drm_client_dev_resume(adev_to_drm(tmp_adev), false);
5774 
5775 				/*
5776 				 * The GPU enters a bad state once the number of
5777 				 * faulty pages detected by ECC reaches the
5778 				 * threshold, and RAS recovery is scheduled next.
5779 				 * So check here whether the bad page threshold
5780 				 * has been exceeded and, if so, break recovery
5781 				 * and remind the user to retire this GPU or set
5782 				 * a bigger bad_page_threshold the next time the
5783 				 * driver is probed.
5784 				 */
5785 				if (!amdgpu_ras_is_rma(tmp_adev)) {
5786 					/* must succeed. */
5787 					amdgpu_ras_resume(tmp_adev);
5788 				} else {
5789 					r = -EINVAL;
5790 					goto out;
5791 				}
5792 
5793 				/* Update PSP FW topology after reset */
5794 				if (reset_context->hive &&
5795 				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5796 					r = amdgpu_xgmi_update_topology(
5797 						reset_context->hive, tmp_adev);
5798 			}
5799 		}
5800 
5801 out:
5802 		if (!r) {
5803 			/* IP init is complete now, set level as default */
5804 			amdgpu_set_init_level(tmp_adev,
5805 					      AMDGPU_INIT_LEVEL_DEFAULT);
5806 			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5807 			r = amdgpu_ib_ring_tests(tmp_adev);
5808 			if (r) {
5809 				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5810 				r = -EAGAIN;
5811 				goto end;
5812 			}
5813 		}
5814 
5815 		if (r)
5816 			tmp_adev->asic_reset_res = r;
5817 	}
5818 
5819 end:
5820 	return r;
5821 }
5822 
5823 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
5824 			 struct amdgpu_reset_context *reset_context)
5825 {
5826 	struct amdgpu_device *tmp_adev = NULL;
5827 	bool need_full_reset, skip_hw_reset;
5828 	int r = 0;
5829 
5830 	/* Try reset handler method first */
5831 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5832 				    reset_list);
5833 
5834 	reset_context->reset_device_list = device_list_handle;
5835 	r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
5836 	/* If reset handler not implemented, continue; otherwise return */
5837 	if (r == -EOPNOTSUPP)
5838 		r = 0;
5839 	else
5840 		return r;
5841 
5842 	/* Reset handler not implemented, use the default method */
5843 	need_full_reset =
5844 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5845 	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
5846 
5847 	/*
5848 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
5849 	 * to allow proper link negotiation in FW (within 1 sec)
5850 	 */
5851 	if (!skip_hw_reset && need_full_reset) {
5852 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5853 			/* For XGMI run all resets in parallel to speed up the process */
5854 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5855 				if (!queue_work(system_unbound_wq,
5856 						&tmp_adev->xgmi_reset_work))
5857 					r = -EALREADY;
5858 			} else
5859 				r = amdgpu_asic_reset(tmp_adev);
5860 
5861 			if (r) {
5862 				dev_err(tmp_adev->dev,
5863 					"ASIC reset failed with error, %d for drm dev, %s",
5864 					r, adev_to_drm(tmp_adev)->unique);
5865 				goto out;
5866 			}
5867 		}
5868 
5869 		/* For XGMI wait for all resets to complete before proceed */
5870 		if (!r) {
5871 			list_for_each_entry(tmp_adev, device_list_handle,
5872 					    reset_list) {
5873 				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5874 					flush_work(&tmp_adev->xgmi_reset_work);
5875 					r = tmp_adev->asic_reset_res;
5876 					if (r)
5877 						break;
5878 				}
5879 			}
5880 		}
5881 	}
5882 
5883 	if (!r && amdgpu_ras_intr_triggered()) {
5884 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5885 			amdgpu_ras_reset_error_count(tmp_adev,
5886 						     AMDGPU_RAS_BLOCK__MMHUB);
5887 		}
5888 
5889 		amdgpu_ras_intr_cleared();
5890 	}
5891 
5892 	r = amdgpu_device_reinit_after_reset(reset_context);
5893 	if (r == -EAGAIN)
5894 		set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5895 	else
5896 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5897 
5898 out:
5899 	return r;
5900 }
5901 
5902 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5903 {
5904 
5905 	switch (amdgpu_asic_reset_method(adev)) {
5906 	case AMD_RESET_METHOD_MODE1:
5907 	case AMD_RESET_METHOD_LINK:
5908 		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5909 		break;
5910 	case AMD_RESET_METHOD_MODE2:
5911 		adev->mp1_state = PP_MP1_STATE_RESET;
5912 		break;
5913 	default:
5914 		adev->mp1_state = PP_MP1_STATE_NONE;
5915 		break;
5916 	}
5917 }
5918 
5919 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5920 {
5921 	amdgpu_vf_error_trans_all(adev);
5922 	adev->mp1_state = PP_MP1_STATE_NONE;
5923 }
5924 
5925 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5926 {
5927 	struct pci_dev *p = NULL;
5928 
5929 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5930 			adev->pdev->bus->number, 1);
5931 	if (p) {
5932 		pm_runtime_enable(&(p->dev));
5933 		pm_runtime_resume(&(p->dev));
5934 	}
5935 
5936 	pci_dev_put(p);
5937 }
5938 
5939 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5940 {
5941 	enum amd_reset_method reset_method;
5942 	struct pci_dev *p = NULL;
5943 	u64 expires;
5944 
5945 	/*
5946 	 * For now, only BACO and mode1 reset are confirmed to
5947 	 * suffer the audio issue if the codec is not properly suspended.
5948 	 */
5949 	reset_method = amdgpu_asic_reset_method(adev);
5950 	if ((reset_method != AMD_RESET_METHOD_BACO) &&
5951 	     (reset_method != AMD_RESET_METHOD_MODE1))
5952 		return -EINVAL;
5953 
5954 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5955 			adev->pdev->bus->number, 1);
5956 	if (!p)
5957 		return -ENODEV;
5958 
5959 	expires = pm_runtime_autosuspend_expiration(&(p->dev));
5960 	if (!expires)
5961 		/*
5962 		 * If we cannot get the audio device autosuspend delay,
5963 		 * use a fixed 4s interval. Since 3s is the audio
5964 		 * controller's default autosuspend delay, the 4s used
5965 		 * here is guaranteed to cover it.
5966 		 */
5967 		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5968 
5969 	while (!pm_runtime_status_suspended(&(p->dev))) {
5970 		if (!pm_runtime_suspend(&(p->dev)))
5971 			break;
5972 
5973 		if (expires < ktime_get_mono_fast_ns()) {
5974 			dev_warn(adev->dev, "failed to suspend display audio\n");
5975 			pci_dev_put(p);
5976 			/* TODO: abort the succeeding gpu reset? */
5977 			return -ETIMEDOUT;
5978 		}
5979 	}
5980 
5981 	pm_runtime_disable(&(p->dev));
5982 
5983 	pci_dev_put(p);
5984 	return 0;
5985 }
5986 
5987 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5988 {
5989 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5990 
5991 #if defined(CONFIG_DEBUG_FS)
5992 	if (!amdgpu_sriov_vf(adev))
5993 		cancel_work(&adev->reset_work);
5994 #endif
5995 
5996 	if (adev->kfd.dev)
5997 		cancel_work(&adev->kfd.reset_work);
5998 
5999 	if (amdgpu_sriov_vf(adev))
6000 		cancel_work(&adev->virt.flr_work);
6001 
6002 	if (con && adev->ras_enabled)
6003 		cancel_work(&con->recovery_work);
6004 
6005 }
6006 
6007 static int amdgpu_device_health_check(struct list_head *device_list_handle)
6008 {
6009 	struct amdgpu_device *tmp_adev;
6010 	int ret = 0;
6011 	u32 status;
6012 
6013 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6014 		pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
6015 		if (PCI_POSSIBLE_ERROR(status)) {
6016 			dev_err(tmp_adev->dev, "device lost from bus!");
6017 			ret = -ENODEV;
6018 		}
6019 	}
6020 
6021 	return ret;
6022 }
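
/*
 * Editorial note, not part of the driver source: a config space read from a
 * device that has dropped off the bus returns all ones, which is what
 * PCI_POSSIBLE_ERROR() tests for, e.g. PCI_POSSIBLE_ERROR(0xffffffff) is true
 * while any sane read of the command/status dword is not. The loop above is
 * therefore a cheap "is the device still on the bus" probe before recovery.
 */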
6023 
6024 static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
6025 					  struct list_head *device_list,
6026 					  struct amdgpu_hive_info *hive)
6027 {
6028 	struct amdgpu_device *tmp_adev = NULL;
6029 	int r;
6030 
6031 	/*
6032 	 * Build list of devices to reset.
6033 	 * In XGMI hive mode, re-order the device list so that
6034 	 * adev is in the first position.
6035 	 */
6036 	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
6037 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
6038 			list_add_tail(&tmp_adev->reset_list, device_list);
6039 			if (adev->shutdown)
6040 				tmp_adev->shutdown = true;
6041 			if (adev->pcie_reset_ctx.occurs_dpc)
6042 				tmp_adev->pcie_reset_ctx.in_link_reset = true;
6043 		}
6044 		if (!list_is_first(&adev->reset_list, device_list))
6045 			list_rotate_to_front(&adev->reset_list, device_list);
6046 	} else {
6047 		list_add_tail(&adev->reset_list, device_list);
6048 	}
6049 
6050 	if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) {
6051 		r = amdgpu_device_health_check(device_list);
6052 		if (r)
6053 			return r;
6054 	}
6055 
6056 	return 0;
6057 }
6058 
6059 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev,
6060 						  struct list_head *device_list)
6061 {
6062 	struct amdgpu_device *tmp_adev = NULL;
6063 
6064 	if (list_empty(device_list))
6065 		return;
6066 	tmp_adev =
6067 		list_first_entry(device_list, struct amdgpu_device, reset_list);
6068 	amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
6069 }
6070 
6071 static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev,
6072 						  struct list_head *device_list)
6073 {
6074 	struct amdgpu_device *tmp_adev = NULL;
6075 
6076 	if (list_empty(device_list))
6077 		return;
6078 	tmp_adev =
6079 		list_first_entry(device_list, struct amdgpu_device, reset_list);
6080 	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
6081 }
6082 
6083 static int amdgpu_device_halt_activities(
6084 	struct amdgpu_device *adev, struct amdgpu_job *job,
6085 	struct amdgpu_reset_context *reset_context,
6086 	struct list_head *device_list, struct amdgpu_hive_info *hive,
6087 	bool need_emergency_restart)
6088 {
6089 	struct amdgpu_device *tmp_adev = NULL;
6090 	int i, r = 0;
6091 
6092 	/* block all schedulers and reset given job's ring */
6093 	list_for_each_entry(tmp_adev, device_list, reset_list) {
6094 		amdgpu_device_set_mp1_state(tmp_adev);
6095 
6096 		/*
6097 		 * Try to put the audio codec into the suspend state
6098 		 * before the GPU reset starts.
6099 		 *
6100 		 * The power domain of the graphics device is shared
6101 		 * with the AZ (audio) power domain. Without this,
6102 		 * we may change the audio hardware from behind
6103 		 * the audio driver's back, which triggers
6104 		 * audio codec errors.
6105 		 */
6106 		if (!amdgpu_device_suspend_display_audio(tmp_adev))
6107 			tmp_adev->pcie_reset_ctx.audio_suspended = true;
6108 
6109 		amdgpu_ras_set_error_query_ready(tmp_adev, false);
6110 
6111 		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
6112 
6113 		amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
6114 
6115 		/*
6116 		 * Mark the ASICs to be reset as untracked first,
6117 		 * and add them back after the reset completes.
6118 		 */
6119 		amdgpu_unregister_gpu_instance(tmp_adev);
6120 
6121 		drm_client_dev_suspend(adev_to_drm(tmp_adev), false);
6122 
6123 		/* disable ras on ALL IPs */
6124 		if (!need_emergency_restart &&
6125 		      (!adev->pcie_reset_ctx.occurs_dpc) &&
6126 		      amdgpu_device_ip_need_full_reset(tmp_adev))
6127 			amdgpu_ras_suspend(tmp_adev);
6128 
6129 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6130 			struct amdgpu_ring *ring = tmp_adev->rings[i];
6131 
6132 			if (!amdgpu_ring_sched_ready(ring))
6133 				continue;
6134 
6135 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
6136 
6137 			if (need_emergency_restart)
6138 				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
6139 		}
6140 		atomic_inc(&tmp_adev->gpu_reset_counter);
6141 	}
6142 
6143 	return r;
6144 }
6145 
6146 static int amdgpu_device_asic_reset(struct amdgpu_device *adev,
6147 			      struct list_head *device_list,
6148 			      struct amdgpu_reset_context *reset_context)
6149 {
6150 	struct amdgpu_device *tmp_adev = NULL;
6151 	int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
6152 	int r = 0;
6153 
6154 retry:	/* Rest of adevs pre asic reset from XGMI hive. */
6155 	list_for_each_entry(tmp_adev, device_list, reset_list) {
6156 		if (adev->pcie_reset_ctx.occurs_dpc)
6157 			tmp_adev->no_hw_access = true;
6158 		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
6159 		if (adev->pcie_reset_ctx.occurs_dpc)
6160 			tmp_adev->no_hw_access = false;
6161 		/* TODO: should we stop? */
6162 		if (r) {
6163 			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
6164 				  r, adev_to_drm(tmp_adev)->unique);
6165 			tmp_adev->asic_reset_res = r;
6166 		}
6167 	}
6168 
6169 	/* Actual ASIC resets if needed. */
6170 	/* Host driver will handle XGMI hive reset for SRIOV */
6171 	if (amdgpu_sriov_vf(adev)) {
6172 
6173 		/* Bail out of reset early */
6174 		if (amdgpu_ras_is_rma(adev))
6175 			return -ENODEV;
6176 
6177 		if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
6178 			dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
6179 			amdgpu_ras_set_fed(adev, true);
6180 			set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
6181 		}
6182 
6183 		r = amdgpu_device_reset_sriov(adev, reset_context);
6184 		if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
6185 			amdgpu_virt_release_full_gpu(adev, true);
6186 			goto retry;
6187 		}
6188 		if (r)
6189 			adev->asic_reset_res = r;
6190 	} else {
6191 		r = amdgpu_do_asic_reset(device_list, reset_context);
6192 		if (r && r == -EAGAIN)
6193 			goto retry;
6194 	}
6195 
6196 	list_for_each_entry(tmp_adev, device_list, reset_list) {
6197 		/*
6198 		 * Drop any pending non-scheduler resets queued before the reset is done.
6199 		 * Any reset scheduled after this point would be valid. Scheduler resets
6200 		 * were already dropped during drm_sched_stop and no new ones can come
6201 		 * in before drm_sched_start.
6202 		 */
6203 		amdgpu_device_stop_pending_resets(tmp_adev);
6204 	}
6205 
6206 	return r;
6207 }
6208 
6209 static int amdgpu_device_sched_resume(struct list_head *device_list,
6210 			      struct amdgpu_reset_context *reset_context,
6211 			      bool   job_signaled)
6212 {
6213 	struct amdgpu_device *tmp_adev = NULL;
6214 	int i, r = 0;
6215 
6216 	/* Post ASIC reset for all devs. */
6217 	list_for_each_entry(tmp_adev, device_list, reset_list) {
6218 
6219 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6220 			struct amdgpu_ring *ring = tmp_adev->rings[i];
6221 
6222 			if (!amdgpu_ring_sched_ready(ring))
6223 				continue;
6224 
6225 			drm_sched_start(&ring->sched, 0);
6226 		}
6227 
6228 		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
6229 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
6230 
6231 		if (tmp_adev->asic_reset_res)
6232 			r = tmp_adev->asic_reset_res;
6233 
6234 		tmp_adev->asic_reset_res = 0;
6235 
6236 		if (r) {
6237 			/* Bad news: how do we tell userspace?
6238 			 * For a RAS error, report that the GPU is in a bad state
6239 			 * rather than a reset failure.
6240 			 */
6241 			if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
6242 			    !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
6243 				dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
6244 					atomic_read(&tmp_adev->gpu_reset_counter));
6245 			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
6246 		} else {
6247 			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
6248 			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
6249 				DRM_WARN("smart shift update failed\n");
6250 		}
6251 	}
6252 
6253 	return r;
6254 }
6255 
6256 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev,
6257 			      struct list_head *device_list,
6258 			      bool   need_emergency_restart)
6259 {
6260 	struct amdgpu_device *tmp_adev = NULL;
6261 
6262 	list_for_each_entry(tmp_adev, device_list, reset_list) {
6263 		/* unlock kfd: SRIOV would do it separately */
6264 		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
6265 			amdgpu_amdkfd_post_reset(tmp_adev);
6266 
6267 		/* kfd_post_reset will do nothing if the kfd device is not initialized,
6268 		 * so bring up kfd here if it was not initialized before.
6269 		 */
6270 		if (!adev->kfd.init_complete)
6271 			amdgpu_amdkfd_device_init(adev);
6272 
6273 		if (tmp_adev->pcie_reset_ctx.audio_suspended)
6274 			amdgpu_device_resume_display_audio(tmp_adev);
6275 
6276 		amdgpu_device_unset_mp1_state(tmp_adev);
6277 
6278 		amdgpu_ras_set_error_query_ready(tmp_adev, true);
6279 
6280 	}
6281 }
6282 
6283 
6284 /**
6285  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
6286  *
6287  * @adev: amdgpu_device pointer
6288  * @job: which job trigger hang
6289  * @reset_context: amdgpu reset context pointer
6290  *
6291  * Attempt to reset the GPU if it has hung (all asics).
6292  * Attempt to do soft-reset or full-reset and reinitialize Asic
6293  * Returns 0 for success or an error on failure.
6294  */
6295 
6296 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
6297 			      struct amdgpu_job *job,
6298 			      struct amdgpu_reset_context *reset_context)
6299 {
6300 	struct list_head device_list;
6301 	bool job_signaled = false;
6302 	struct amdgpu_hive_info *hive = NULL;
6303 	int r = 0;
6304 	bool need_emergency_restart = false;
6305 
6306 	/*
6307 	 * If it reaches here because of hang/timeout and a RAS error is
6308 	 * detected at the same time, let RAS recovery take care of it.
6309 	 */
6310 	if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) &&
6311 	    !amdgpu_sriov_vf(adev) &&
6312 	    reset_context->src != AMDGPU_RESET_SRC_RAS) {
6313 		dev_dbg(adev->dev,
6314 			"Gpu recovery from source: %d yielding to RAS error recovery handling",
6315 			reset_context->src);
6316 		return 0;
6317 	}
6318 
6319 	/*
6320 	 * Special case: RAS triggered and full reset isn't supported
6321 	 */
6322 	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
6323 
6324 	/*
6325 	 * Flush RAM to disk so that after reboot
6326 	 * the user can read the log and see why the system rebooted.
6327 	 */
6328 	if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
6329 		amdgpu_ras_get_context(adev)->reboot) {
6330 		DRM_WARN("Emergency reboot.");
6331 
6332 		ksys_sync_helper();
6333 		emergency_restart();
6334 	}
6335 
6336 	dev_info(adev->dev, "GPU %s begin!\n",
6337 		need_emergency_restart ? "jobs stop":"reset");
6338 
6339 	if (!amdgpu_sriov_vf(adev))
6340 		hive = amdgpu_get_xgmi_hive(adev);
6341 	if (hive)
6342 		mutex_lock(&hive->hive_lock);
6343 
6344 	reset_context->job = job;
6345 	reset_context->hive = hive;
6346 	INIT_LIST_HEAD(&device_list);
6347 
6348 	if (amdgpu_device_recovery_prepare(adev, &device_list, hive))
6349 		goto end_reset;
6350 
6351 	/* We need to lock reset domain only once both for XGMI and single device */
6352 	amdgpu_device_recovery_get_reset_lock(adev, &device_list);
6353 
6354 	r = amdgpu_device_halt_activities(adev, job, reset_context, &device_list,
6355 					 hive, need_emergency_restart);
6356 	if (r)
6357 		goto reset_unlock;
6358 
6359 	if (need_emergency_restart)
6360 		goto skip_sched_resume;
6361 	/*
6362 	 * Must check guilty signal here since after this point all old
6363 	 * HW fences are force signaled.
6364 	 *
6365 	 * job->base holds a reference to parent fence
6366 	 */
6367 	if (job && dma_fence_is_signaled(&job->hw_fence.base)) {
6368 		job_signaled = true;
6369 		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
6370 		goto skip_hw_reset;
6371 	}
6372 
6373 	r = amdgpu_device_asic_reset(adev, &device_list, reset_context);
6374 	if (r)
6375 		goto reset_unlock;
6376 skip_hw_reset:
6377 	r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled);
6378 	if (r)
6379 		goto reset_unlock;
6380 skip_sched_resume:
6381 	amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart);
6382 reset_unlock:
6383 	amdgpu_device_recovery_put_reset_lock(adev, &device_list);
6384 end_reset:
6385 	if (hive) {
6386 		mutex_unlock(&hive->hive_lock);
6387 		amdgpu_put_xgmi_hive(hive);
6388 	}
6389 
6390 	if (r)
6391 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
6392 
6393 	atomic_set(&adev->reset_domain->reset_res, r);
6394 
6395 	if (!r)
6396 		drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
6397 
6398 	return r;
6399 }
6400 
6401 /**
6402  * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner
6403  *
6404  * @adev: amdgpu_device pointer
6405  * @speed: pointer to the speed of the link
6406  * @width: pointer to the width of the link
6407  *
6408  * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
6409  * first physical partner to an AMD dGPU.
6410  * This will exclude any virtual switches and links.
6411  */
6412 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
6413 					    enum pci_bus_speed *speed,
6414 					    enum pcie_link_width *width)
6415 {
6416 	struct pci_dev *parent = adev->pdev;
6417 
6418 	if (!speed || !width)
6419 		return;
6420 
6421 	*speed = PCI_SPEED_UNKNOWN;
6422 	*width = PCIE_LNK_WIDTH_UNKNOWN;
6423 
6424 	if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
6425 		while ((parent = pci_upstream_bridge(parent))) {
6426 			/* skip upstream/downstream switches internal to dGPU */
6427 			if (parent->vendor == PCI_VENDOR_ID_ATI)
6428 				continue;
6429 			*speed = pcie_get_speed_cap(parent);
6430 			*width = pcie_get_width_cap(parent);
6431 			break;
6432 		}
6433 	} else {
6434 		/* use the current speeds rather than max if switching is not supported */
6435 		pcie_bandwidth_available(adev->pdev, NULL, speed, width);
6436 	}
6437 }
6438 
6439 /**
6440  * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
6441  *
6442  * @adev: amdgpu_device pointer
6443  * @speed: pointer to the speed of the link
6444  * @width: pointer to the width of the link
6445  *
6446  * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
6447  * AMD dGPU which may be a virtual upstream bridge.
6448  */
6449 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
6450 					enum pci_bus_speed *speed,
6451 					enum pcie_link_width *width)
6452 {
6453 	struct pci_dev *parent = adev->pdev;
6454 
6455 	if (!speed || !width)
6456 		return;
6457 
6458 	parent = pci_upstream_bridge(parent);
6459 	if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
6460 		/* use the upstream/downstream switches internal to dGPU */
6461 		*speed = pcie_get_speed_cap(parent);
6462 		*width = pcie_get_width_cap(parent);
6463 		while ((parent = pci_upstream_bridge(parent))) {
6464 			if (parent->vendor == PCI_VENDOR_ID_ATI) {
6465 				/* use the upstream/downstream switches internal to dGPU */
6466 				*speed = pcie_get_speed_cap(parent);
6467 				*width = pcie_get_width_cap(parent);
6468 			}
6469 		}
6470 	} else {
6471 		/* use the device itself */
6472 		*speed = pcie_get_speed_cap(adev->pdev);
6473 		*width = pcie_get_width_cap(adev->pdev);
6474 	}
6475 }
6476 
6477 /**
6478  * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
6479  *
6480  * @adev: amdgpu_device pointer
6481  *
6482  * Fetches and stores in the driver the PCIE capabilities (gen speed
6483  * and lanes) of the slot the device is in. Handles APUs and
6484  * virtualized environments where PCIE config space may not be available.
6485  */
6486 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
6487 {
6488 	enum pci_bus_speed speed_cap, platform_speed_cap;
6489 	enum pcie_link_width platform_link_width, link_width;
6490 
6491 	if (amdgpu_pcie_gen_cap)
6492 		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
6493 
6494 	if (amdgpu_pcie_lane_cap)
6495 		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
6496 
6497 	/* covers APUs as well */
6498 	if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
6499 		if (adev->pm.pcie_gen_mask == 0)
6500 			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
6501 		if (adev->pm.pcie_mlw_mask == 0)
6502 			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
6503 		return;
6504 	}
6505 
6506 	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
6507 		return;
6508 
6509 	amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
6510 					&platform_link_width);
6511 	amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width);
6512 
6513 	if (adev->pm.pcie_gen_mask == 0) {
6514 		/* asic caps */
6515 		if (speed_cap == PCI_SPEED_UNKNOWN) {
6516 			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6517 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6518 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6519 		} else {
6520 			if (speed_cap == PCIE_SPEED_32_0GT)
6521 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6522 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6523 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6524 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6525 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
6526 			else if (speed_cap == PCIE_SPEED_16_0GT)
6527 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6528 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6529 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6530 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
6531 			else if (speed_cap == PCIE_SPEED_8_0GT)
6532 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6533 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6534 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6535 			else if (speed_cap == PCIE_SPEED_5_0GT)
6536 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6537 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
6538 			else
6539 				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
6540 		}
6541 		/* platform caps */
6542 		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
6543 			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6544 						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6545 		} else {
6546 			if (platform_speed_cap == PCIE_SPEED_32_0GT)
6547 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6548 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6549 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6550 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6551 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
6552 			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
6553 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6554 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6555 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6556 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
6557 			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
6558 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6559 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6560 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
6561 			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
6562 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6563 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6564 			else
6565 				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
6566 
6567 		}
6568 	}
6569 	if (adev->pm.pcie_mlw_mask == 0) {
6570 		/* asic caps */
6571 		if (link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6572 			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK;
6573 		} else {
6574 			switch (link_width) {
6575 			case PCIE_LNK_X32:
6576 				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 |
6577 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 |
6578 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
6579 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6580 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6581 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6582 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6583 				break;
6584 			case PCIE_LNK_X16:
6585 				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 |
6586 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
6587 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6588 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6589 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6590 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6591 				break;
6592 			case PCIE_LNK_X12:
6593 				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
6594 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6595 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6596 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6597 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6598 				break;
6599 			case PCIE_LNK_X8:
6600 				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6601 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6602 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6603 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6604 				break;
6605 			case PCIE_LNK_X4:
6606 				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6607 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6608 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6609 				break;
6610 			case PCIE_LNK_X2:
6611 				adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6612 							   CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6613 				break;
6614 			case PCIE_LNK_X1:
6615 				adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1;
6616 				break;
6617 			default:
6618 				break;
6619 			}
6620 		}
6621 		/* platform caps */
6622 		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6623 			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
6624 		} else {
6625 			switch (platform_link_width) {
6626 			case PCIE_LNK_X32:
6627 				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
6628 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6629 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6630 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6631 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6632 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6633 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6634 				break;
6635 			case PCIE_LNK_X16:
6636 				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6637 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6638 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6639 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6640 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6641 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6642 				break;
6643 			case PCIE_LNK_X12:
6644 				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6645 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6646 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6647 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6648 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6649 				break;
6650 			case PCIE_LNK_X8:
6651 				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6652 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6653 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6654 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6655 				break;
6656 			case PCIE_LNK_X4:
6657 				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6658 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6659 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6660 				break;
6661 			case PCIE_LNK_X2:
6662 				adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6663 							   CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6664 				break;
6665 			case PCIE_LNK_X1:
6666 				adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
6667 				break;
6668 			default:
6669 				break;
6670 			}
6671 		}
6672 	}
6673 }
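
/*
 * Illustrative sketch, not part of the driver: one way a consumer of the
 * masks built above could pick the highest PCIe gen advertised in the
 * platform half of pcie_gen_mask. The helper name is hypothetical; the
 * CAIL_* bits are the ones set above.
 */
static unsigned int amdgpu_example_max_platform_pcie_gen(struct amdgpu_device *adev)
{
	if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4)
		return 4;
	if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)
		return 3;
	if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2)
		return 2;
	return 1;
}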
6674 
6675 /**
6676  * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
6677  *
6678  * @adev: amdgpu_device pointer
6679  * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
6680  *
6681  * Return true if @peer_adev can access (DMA) @adev through the PCIe
6682  * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
6683  * @peer_adev.
6684  */
6685 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
6686 				      struct amdgpu_device *peer_adev)
6687 {
6688 #ifdef CONFIG_HSA_AMD_P2P
6689 	bool p2p_access =
6690 		!adev->gmc.xgmi.connected_to_cpu &&
6691 		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
6692 	if (!p2p_access)
6693 		dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n",
6694 			pci_name(peer_adev->pdev));
6695 
6696 	bool is_large_bar = adev->gmc.visible_vram_size &&
6697 		adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
6698 	bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev);
6699 
6700 	if (!p2p_addressable) {
6701 		uint64_t address_mask = peer_adev->dev->dma_mask ?
6702 			~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
6703 		resource_size_t aper_limit =
6704 			adev->gmc.aper_base + adev->gmc.aper_size - 1;
6705 
6706 		p2p_addressable = !(adev->gmc.aper_base & address_mask ||
6707 				     aper_limit & address_mask);
6708 	}
6709 	return pcie_p2p && is_large_bar && p2p_access && p2p_addressable;
6710 #else
6711 	return false;
6712 #endif
6713 }
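
/*
 * Illustrative sketch, not part of the driver: a hypothetical caller setting
 * up a peer-to-peer mapping would normally require accessibility in both
 * directions before committing to P2P DMA.
 */
static bool amdgpu_example_can_use_p2p(struct amdgpu_device *a,
				       struct amdgpu_device *b)
{
	return amdgpu_device_is_peer_accessible(a, b) &&
	       amdgpu_device_is_peer_accessible(b, a);
}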
6714 
6715 int amdgpu_device_baco_enter(struct drm_device *dev)
6716 {
6717 	struct amdgpu_device *adev = drm_to_adev(dev);
6718 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6719 
6720 	if (!amdgpu_device_supports_baco(dev))
6721 		return -ENOTSUPP;
6722 
6723 	if (ras && adev->ras_enabled &&
6724 	    adev->nbio.funcs->enable_doorbell_interrupt)
6725 		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
6726 
6727 	return amdgpu_dpm_baco_enter(adev);
6728 }
6729 
6730 int amdgpu_device_baco_exit(struct drm_device *dev)
6731 {
6732 	struct amdgpu_device *adev = drm_to_adev(dev);
6733 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6734 	int ret = 0;
6735 
6736 	if (!amdgpu_device_supports_baco(dev))
6737 		return -ENOTSUPP;
6738 
6739 	ret = amdgpu_dpm_baco_exit(adev);
6740 	if (ret)
6741 		return ret;
6742 
6743 	if (ras && adev->ras_enabled &&
6744 	    adev->nbio.funcs->enable_doorbell_interrupt)
6745 		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
6746 
6747 	if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
6748 	    adev->nbio.funcs->clear_doorbell_interrupt)
6749 		adev->nbio.funcs->clear_doorbell_interrupt(adev);
6750 
6751 	return 0;
6752 }
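
/*
 * Illustrative sketch, not part of the driver: BACO entry and exit are used
 * as a pair around a power-down window, e.g. from a runtime-PM style path.
 * The surrounding flow here is hypothetical and simplified.
 */
static int amdgpu_example_baco_cycle(struct drm_device *ddev)
{
	int r;

	r = amdgpu_device_baco_enter(ddev);
	if (r)
		return r;

	/* ... device stays in BACO until it is needed again ... */

	return amdgpu_device_baco_exit(ddev);
}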
6753 
6754 /**
6755  * amdgpu_pci_error_detected - Called when a PCI error is detected.
6756  * @pdev: PCI device struct
6757  * @state: PCI channel state
6758  *
6759  * Description: Called when a PCI error is detected.
6760  *
6761  * Return: PCI_ERS_RESULT_CAN_RECOVER, PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
6762  */
6763 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
6764 {
6765 	struct drm_device *dev = pci_get_drvdata(pdev);
6766 	struct amdgpu_device *adev = drm_to_adev(dev);
6767 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
6768 	struct amdgpu_reset_context reset_context;
6769 	struct list_head device_list;
6770 	int r = 0;
6771 
6772 	dev_info(adev->dev, "PCI error: detected callback!!\n");
6773 
6774 	if (adev->gmc.xgmi.num_physical_nodes > 1 && !amdgpu_dpm_is_link_reset_supported(adev)) {
6775 		dev_warn(adev->dev, "No support for XGMI hive yet...\n");
6776 		return PCI_ERS_RESULT_DISCONNECT;
6777 	}
6778 
6779 	adev->pci_channel_state = state;
6780 
6781 	switch (state) {
6782 	case pci_channel_io_normal:
6783 		dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state);
6784 		return PCI_ERS_RESULT_CAN_RECOVER;
6785 	case pci_channel_io_frozen:
6786 		/* Fatal error, prepare for slot reset */
6787 		dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state);
6788 
6789 		if (hive)
6790 			mutex_lock(&hive->hive_lock);
6791 		adev->pcie_reset_ctx.occurs_dpc = true;
6792 		memset(&reset_context, 0, sizeof(reset_context));
6793 		INIT_LIST_HEAD(&device_list);
6794 
6795 		amdgpu_device_recovery_prepare(adev, &device_list, hive);
6796 		amdgpu_device_recovery_get_reset_lock(adev, &device_list);
6797 		r = amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list,
6798 					 hive, false);
6799 		if (hive) {
6800 			mutex_unlock(&hive->hive_lock);
6801 			amdgpu_put_xgmi_hive(hive);
6802 		}
6803 		if (r)
6804 			return PCI_ERS_RESULT_DISCONNECT;
6805 		return PCI_ERS_RESULT_NEED_RESET;
6806 	case pci_channel_io_perm_failure:
6807 		/* Permanent error, prepare for device removal */
6808 		dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state);
6809 		return PCI_ERS_RESULT_DISCONNECT;
6810 	}
6811 
6812 	return PCI_ERS_RESULT_NEED_RESET;
6813 }
6814 
6815 /**
6816  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
6817  * @pdev: pointer to PCI device
6818  */
6819 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
6820 {
6821 	struct drm_device *dev = pci_get_drvdata(pdev);
6822 	struct amdgpu_device *adev = drm_to_adev(dev);
6823 
6824 	dev_info(adev->dev, "PCI error: mmio enabled callback!!\n");
6825 
6826 	/* TODO - dump whatever for debugging purposes */
6827 
6828 	/* This is called only if amdgpu_pci_error_detected returns
6829 	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
6830 	 * works, no need to reset slot.
6831 	 */
6832 
6833 	return PCI_ERS_RESULT_RECOVERED;
6834 }
6835 
6836 /**
6837  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
6838  * @pdev: PCI device struct
6839  *
6840  * Description: This routine is called by the pci error recovery
6841  * code after the PCI slot has been reset, just before we
6842  * should resume normal operations.
6843  */
6844 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
6845 {
6846 	struct drm_device *dev = pci_get_drvdata(pdev);
6847 	struct amdgpu_device *adev = drm_to_adev(dev);
6848 	struct amdgpu_reset_context reset_context;
6849 	struct amdgpu_device *tmp_adev;
6850 	struct amdgpu_hive_info *hive;
6851 	struct list_head device_list;
6852 	int r = 0, i;
6853 	u32 memsize;
6854 
6855 	/* PCI error slot reset should be skipped during RAS recovery */
6856 	if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
6857 	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
6858 	    amdgpu_ras_in_recovery(adev))
6859 		return PCI_ERS_RESULT_RECOVERED;
6860 
6861 	dev_info(adev->dev, "PCI error: slot reset callback!!\n");
6862 
6863 	memset(&reset_context, 0, sizeof(reset_context));
6864 
6865 	/* wait for asic to come out of reset */
6866 	msleep(700);
6867 
6868 	/* Restore PCI confspace */
6869 	amdgpu_device_load_pci_state(pdev);
6870 
6871 	/* confirm  ASIC came out of reset */
6872 	for (i = 0; i < adev->usec_timeout; i++) {
6873 		memsize = amdgpu_asic_get_config_memsize(adev);
6874 
6875 		if (memsize != 0xffffffff)
6876 			break;
6877 		udelay(1);
6878 	}
6879 	if (memsize == 0xffffffff) {
6880 		r = -ETIME;
6881 		goto out;
6882 	}
6883 
6884 	reset_context.method = AMD_RESET_METHOD_NONE;
6885 	reset_context.reset_req_dev = adev;
6886 	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
6887 	set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
6888 	INIT_LIST_HEAD(&device_list);
6889 
6890 	hive = amdgpu_get_xgmi_hive(adev);
6891 	if (hive) {
6892 		mutex_lock(&hive->hive_lock);
6893 		reset_context.hive = hive;
6894 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
6895 			tmp_adev->pcie_reset_ctx.in_link_reset = true;
6896 			list_add_tail(&tmp_adev->reset_list, &device_list);
6897 		}
6898 	} else {
6899 		set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
6900 		list_add_tail(&adev->reset_list, &device_list);
6901 	}
6902 
6903 	r = amdgpu_device_asic_reset(adev, &device_list, &reset_context);
6904 out:
6905 	if (!r) {
6906 		if (amdgpu_device_cache_pci_state(adev->pdev))
6907 			pci_restore_state(adev->pdev);
6908 		dev_info(adev->dev, "PCIe error recovery succeeded\n");
6909 	} else {
6910 		dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r);
6911 		if (hive) {
6912 			list_for_each_entry(tmp_adev, &device_list, reset_list)
6913 				amdgpu_device_unset_mp1_state(tmp_adev);
6914 		}
6915 		amdgpu_device_recovery_put_reset_lock(adev, &device_list);
6916 	}
6917 
6918 	if (hive) {
6919 		mutex_unlock(&hive->hive_lock);
6920 		amdgpu_put_xgmi_hive(hive);
6921 	}
6922 
6923 	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
6924 }
6925 
6926 /**
6927  * amdgpu_pci_resume() - resume normal ops after PCI reset
6928  * @pdev: pointer to PCI device
6929  *
6930  * Called when the error recovery driver tells us that it's
6931  * OK to resume normal operation.
6932  */
6933 void amdgpu_pci_resume(struct pci_dev *pdev)
6934 {
6935 	struct drm_device *dev = pci_get_drvdata(pdev);
6936 	struct amdgpu_device *adev = drm_to_adev(dev);
6937 	struct list_head device_list;
6938 	struct amdgpu_hive_info *hive = NULL;
6939 	struct amdgpu_device *tmp_adev = NULL;
6940 
6941 	dev_info(adev->dev, "PCI error: resume callback!!\n");
6942 
6943 	/* Only continue execution for the case of pci_channel_io_frozen */
6944 	if (adev->pci_channel_state != pci_channel_io_frozen)
6945 		return;
6946 
6947 	INIT_LIST_HEAD(&device_list);
6948 
6949 	hive = amdgpu_get_xgmi_hive(adev);
6950 	if (hive) {
6951 		mutex_lock(&hive->hive_lock);
6952 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
6953 			tmp_adev->pcie_reset_ctx.in_link_reset = false;
6954 			list_add_tail(&tmp_adev->reset_list, &device_list);
6955 		}
6956 	} else
6957 		list_add_tail(&adev->reset_list, &device_list);
6958 
6959 	amdgpu_device_sched_resume(&device_list, NULL, NULL);
6960 	amdgpu_device_gpu_resume(adev, &device_list, false);
6961 	amdgpu_device_recovery_put_reset_lock(adev, &device_list);
6962 	adev->pcie_reset_ctx.occurs_dpc = false;
6963 
6964 	if (hive) {
6965 		mutex_unlock(&hive->hive_lock);
6966 		amdgpu_put_xgmi_hive(hive);
6967 	}
6968 }
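
/*
 * Illustrative sketch, not part of this file: the four callbacks above are
 * hooked into the PCI core through a struct pci_error_handlers referenced
 * from the pci_driver. The actual table lives in amdgpu_drv.c and may differ
 * in detail.
 */
static const struct pci_error_handlers amdgpu_example_pci_err_handler = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};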
6969 
6970 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
6971 {
6972 	struct drm_device *dev = pci_get_drvdata(pdev);
6973 	struct amdgpu_device *adev = drm_to_adev(dev);
6974 	int r;
6975 
6976 	if (amdgpu_sriov_vf(adev))
6977 		return false;
6978 
6979 	r = pci_save_state(pdev);
6980 	if (!r) {
6981 		kfree(adev->pci_state);
6982 
6983 		adev->pci_state = pci_store_saved_state(pdev);
6984 
6985 		if (!adev->pci_state) {
6986 			DRM_ERROR("Failed to store PCI saved state");
6987 			return false;
6988 		}
6989 	} else {
6990 		DRM_WARN("Failed to save PCI state, err:%d\n", r);
6991 		return false;
6992 	}
6993 
6994 	return true;
6995 }
6996 
6997 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
6998 {
6999 	struct drm_device *dev = pci_get_drvdata(pdev);
7000 	struct amdgpu_device *adev = drm_to_adev(dev);
7001 	int r;
7002 
7003 	if (!adev->pci_state)
7004 		return false;
7005 
7006 	r = pci_load_saved_state(pdev, adev->pci_state);
7007 
7008 	if (!r) {
7009 		pci_restore_state(pdev);
7010 	} else {
7011 		DRM_WARN("Failed to load PCI state, err:%d\n", r);
7012 		return false;
7013 	}
7014 
7015 	return true;
7016 }
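
/*
 * Illustrative sketch, not part of the driver: the two helpers above are used
 * as a pair, caching config space while the device is healthy and restoring
 * it after an event that clobbers it (as in the slot reset path above). The
 * trigger shown here is hypothetical.
 */
static void amdgpu_example_pci_state_roundtrip(struct amdgpu_device *adev)
{
	if (!amdgpu_device_cache_pci_state(adev->pdev))
		return;

	/* ... a reset or error clobbers PCI config space ... */

	amdgpu_device_load_pci_state(adev->pdev);
}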
7017 
7018 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
7019 		struct amdgpu_ring *ring)
7020 {
7021 #ifdef CONFIG_X86_64
7022 	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
7023 		return;
7024 #endif
7025 	if (adev->gmc.xgmi.connected_to_cpu)
7026 		return;
7027 
7028 	if (ring && ring->funcs->emit_hdp_flush)
7029 		amdgpu_ring_emit_hdp_flush(ring);
7030 	else
7031 		amdgpu_asic_flush_hdp(adev, ring);
7032 }
7033 
7034 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
7035 		struct amdgpu_ring *ring)
7036 {
7037 #ifdef CONFIG_X86_64
7038 	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
7039 		return;
7040 #endif
7041 	if (adev->gmc.xgmi.connected_to_cpu)
7042 		return;
7043 
7044 	amdgpu_asic_invalidate_hdp(adev, ring);
7045 }
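
/*
 * Illustrative sketch, not part of the driver: a hypothetical CPU-side access
 * pattern pairing the two HDP helpers above. Passing a NULL ring performs the
 * flush/invalidate through the ASIC callbacks instead of a ring packet.
 */
static void amdgpu_example_hdp_sync(struct amdgpu_device *adev)
{
	/* make prior CPU writes through HDP visible to the GPU */
	amdgpu_device_flush_hdp(adev, NULL);
	/* drop stale HDP read cache before the CPU reads GPU output */
	amdgpu_device_invalidate_hdp(adev, NULL);
}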
7046 
7047 int amdgpu_in_reset(struct amdgpu_device *adev)
7048 {
7049 	return atomic_read(&adev->reset_domain->in_gpu_reset);
7050 }
7051 
7052 /**
7053  * amdgpu_device_halt() - bring hardware to some kind of halt state
7054  *
7055  * @adev: amdgpu_device pointer
7056  *
7057  * Bring hardware to some kind of halt state so that no one can touch it
7058  * any more. This helps preserve the error context when an error has
7059  * occurred. Compared to a simple hang, the system stays stable enough for
7060  * at least SSH access, so it should be trivial to inspect the hardware
7061  * state and see what's going on. Implemented as follows:
7062  *
7063  * 1. drm_dev_unplug() makes device inaccessible to user space (IOCTLs, etc.),
7064  *    clears all CPU mappings to device, disallows remappings through page faults
7065  * 2. amdgpu_irq_disable_all() disables all interrupts
7066  * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
7067  * 4. set adev->no_hw_access to avoid potential crashes after step 5
7068  * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
7069  * 6. pci_disable_device() and pci_wait_for_pending_transaction()
7070  *    flush any in flight DMA operations
7071  */
7072 void amdgpu_device_halt(struct amdgpu_device *adev)
7073 {
7074 	struct pci_dev *pdev = adev->pdev;
7075 	struct drm_device *ddev = adev_to_drm(adev);
7076 
7077 	amdgpu_xcp_dev_unplug(adev);
7078 	drm_dev_unplug(ddev);
7079 
7080 	amdgpu_irq_disable_all(adev);
7081 
7082 	amdgpu_fence_driver_hw_fini(adev);
7083 
7084 	adev->no_hw_access = true;
7085 
7086 	amdgpu_device_unmap_mmio(adev);
7087 
7088 	pci_disable_device(pdev);
7089 	pci_wait_for_pending_transaction(pdev);
7090 }
7091 
7092 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
7093 				u32 reg)
7094 {
7095 	unsigned long flags, address, data;
7096 	u32 r;
7097 
7098 	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
7099 	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
7100 
7101 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
7102 	WREG32(address, reg * 4);
7103 	(void)RREG32(address);
7104 	r = RREG32(data);
7105 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
7106 	return r;
7107 }
7108 
7109 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
7110 				u32 reg, u32 v)
7111 {
7112 	unsigned long flags, address, data;
7113 
7114 	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
7115 	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
7116 
7117 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
7118 	WREG32(address, reg * 4);
7119 	(void)RREG32(address);
7120 	WREG32(data, v);
7121 	(void)RREG32(data);
7122 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
7123 }
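
/*
 * Illustrative sketch, not part of the driver: the indirect PCIe port
 * accessors above are typically combined into read-modify-write sequences.
 * The helper and its parameters are hypothetical.
 */
static void amdgpu_example_pcie_port_rmw(struct amdgpu_device *adev,
					 u32 reg, u32 clr, u32 set)
{
	u32 v = amdgpu_device_pcie_port_rreg(adev, reg);

	v = (v & ~clr) | set;
	amdgpu_device_pcie_port_wreg(adev, reg, v);
}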
7124 
7125 /**
7126  * amdgpu_device_get_gang - return a reference to the current gang
7127  * @adev: amdgpu_device pointer
7128  *
7129  * Returns: A new reference to the current gang leader.
7130  */
7131 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
7132 {
7133 	struct dma_fence *fence;
7134 
7135 	rcu_read_lock();
7136 	fence = dma_fence_get_rcu_safe(&adev->gang_submit);
7137 	rcu_read_unlock();
7138 	return fence;
7139 }
7140 
7141 /**
7142  * amdgpu_device_switch_gang - switch to a new gang
7143  * @adev: amdgpu_device pointer
7144  * @gang: the gang to switch to
7145  *
7146  * Try to switch to a new gang.
7147  * Returns: NULL if we switched to the new gang or a reference to the current
7148  * gang leader.
7149  */
7150 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
7151 					    struct dma_fence *gang)
7152 {
7153 	struct dma_fence *old = NULL;
7154 
7155 	dma_fence_get(gang);
7156 	do {
7157 		dma_fence_put(old);
7158 		old = amdgpu_device_get_gang(adev);
7159 		if (old == gang)
7160 			break;
7161 
7162 		if (!dma_fence_is_signaled(old)) {
7163 			dma_fence_put(gang);
7164 			return old;
7165 		}
7166 
7167 	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
7168 			 old, gang) != old);
7169 
7170 	/*
7171 	 * Drop it once for the exchanged reference in adev and once for the
7172 	 * thread local reference acquired in amdgpu_device_get_gang().
7173 	 */
7174 	dma_fence_put(old);
7175 	dma_fence_put(old);
7176 	return NULL;
7177 }
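
/*
 * Illustrative sketch, not part of the driver: a hypothetical caller retries
 * the gang switch, waiting on the previous leader returned by
 * amdgpu_device_switch_gang() until the switch succeeds (NULL is returned).
 */
static int amdgpu_example_switch_gang_sync(struct amdgpu_device *adev,
					   struct dma_fence *gang)
{
	struct dma_fence *old;

	while ((old = amdgpu_device_switch_gang(adev, gang))) {
		long r = dma_fence_wait(old, false);

		dma_fence_put(old);
		if (r < 0)
			return r;
	}
	return 0;
}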
7178 
7179 /**
7180  * amdgpu_device_enforce_isolation - enforce HW isolation
7181  * @adev: the amdgpu device pointer
7182  * @ring: the HW ring the job is supposed to run on
7183  * @job: the job which is about to be pushed to the HW ring
7184  *
7185  * Makes sure that only one client at a time can use the GFX block.
7186  * Returns: The dependency to wait on before the job can be pushed to the HW.
7187  * The function is called multiple times until NULL is returned.
7188  */
7189 struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
7190 						  struct amdgpu_ring *ring,
7191 						  struct amdgpu_job *job)
7192 {
7193 	struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id];
7194 	struct drm_sched_fence *f = job->base.s_fence;
7195 	struct dma_fence *dep;
7196 	void *owner;
7197 	int r;
7198 
7199 	/*
7200 	 * For now enforce isolation only for the GFX block since we only need
7201 	 * the cleaner shader on those rings.
7202 	 */
7203 	if (ring->funcs->type != AMDGPU_RING_TYPE_GFX &&
7204 	    ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
7205 		return NULL;
7206 
7207 	/*
7208 	 * All submissions where enforce isolation is false are handled as if
7209 	 * they come from a single client. Use ~0l as the owner to distinguish it
7210 	 * from kernel submissions where the owner is NULL.
7211 	 */
7212 	owner = job->enforce_isolation ? f->owner : (void *)~0l;
7213 
7214 	mutex_lock(&adev->enforce_isolation_mutex);
7215 
7216 	/*
7217 	 * The "spearhead" submission is the first one which changes the
7218 	 * ownership to its client. We always need to wait for it to be
7219 	 * pushed to the HW before proceeding with anything.
7220 	 */
7221 	if (&f->scheduled != isolation->spearhead &&
7222 	    !dma_fence_is_signaled(isolation->spearhead)) {
7223 		dep = isolation->spearhead;
7224 		goto out_grab_ref;
7225 	}
7226 
7227 	if (isolation->owner != owner) {
7228 
7229 		/*
7230 		 * Wait for any gang to be assembled before switching to a
7231 		 * different owner or otherwise we could deadlock the
7232 		 * submissions.
7233 		 */
7234 		if (!job->gang_submit) {
7235 			dep = amdgpu_device_get_gang(adev);
7236 			if (!dma_fence_is_signaled(dep))
7237 				goto out_return_dep;
7238 			dma_fence_put(dep);
7239 		}
7240 
7241 		dma_fence_put(isolation->spearhead);
7242 		isolation->spearhead = dma_fence_get(&f->scheduled);
7243 		amdgpu_sync_move(&isolation->active, &isolation->prev);
7244 		trace_amdgpu_isolation(isolation->owner, owner);
7245 		isolation->owner = owner;
7246 	}
7247 
7248 	/*
7249 	 * Specifying the ring here helps to pipeline submissions even when
7250 	 * isolation is enabled. If that is not desired for testing NULL can be
7251 	 * used instead of the ring to enforce a CPU round trip while switching
7252 	 * between clients.
7253 	 */
7254 	dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
7255 	r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
7256 	if (r)
7257 		DRM_WARN("OOM tracking isolation\n");
7258 
7259 out_grab_ref:
7260 	dma_fence_get(dep);
7261 out_return_dep:
7262 	mutex_unlock(&adev->enforce_isolation_mutex);
7263 	return dep;
7264 }
7265 
7266 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
7267 {
7268 	switch (adev->asic_type) {
7269 #ifdef CONFIG_DRM_AMDGPU_SI
7270 	case CHIP_HAINAN:
7271 #endif
7272 	case CHIP_TOPAZ:
7273 		/* chips with no display hardware */
7274 		return false;
7275 #ifdef CONFIG_DRM_AMDGPU_SI
7276 	case CHIP_TAHITI:
7277 	case CHIP_PITCAIRN:
7278 	case CHIP_VERDE:
7279 	case CHIP_OLAND:
7280 #endif
7281 #ifdef CONFIG_DRM_AMDGPU_CIK
7282 	case CHIP_BONAIRE:
7283 	case CHIP_HAWAII:
7284 	case CHIP_KAVERI:
7285 	case CHIP_KABINI:
7286 	case CHIP_MULLINS:
7287 #endif
7288 	case CHIP_TONGA:
7289 	case CHIP_FIJI:
7290 	case CHIP_POLARIS10:
7291 	case CHIP_POLARIS11:
7292 	case CHIP_POLARIS12:
7293 	case CHIP_VEGAM:
7294 	case CHIP_CARRIZO:
7295 	case CHIP_STONEY:
7296 		/* chips with display hardware */
7297 		return true;
7298 	default:
7299 		/* IP discovery */
7300 		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
7301 		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
7302 			return false;
7303 		return true;
7304 	}
7305 }
7306 
7307 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
7308 		uint32_t inst, uint32_t reg_addr, char reg_name[],
7309 		uint32_t expected_value, uint32_t mask)
7310 {
7311 	uint32_t ret = 0;
7312 	uint32_t old_ = 0;
7313 	uint32_t tmp_ = RREG32(reg_addr);
7314 	uint32_t loop = adev->usec_timeout;
7315 
7316 	while ((tmp_ & (mask)) != (expected_value)) {
7317 		if (old_ != tmp_) {
7318 			loop = adev->usec_timeout;
7319 			old_ = tmp_;
7320 		} else
7321 			udelay(1);
7322 		tmp_ = RREG32(reg_addr);
7323 		loop--;
7324 		if (!loop) {
7325 			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn",
7326 				  inst, reg_name, (uint32_t)expected_value,
7327 				  (uint32_t)(tmp_ & (mask)));
7328 			ret = -ETIMEDOUT;
7329 			break;
7330 		}
7331 	}
7332 	return ret;
7333 }
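
/*
 * Illustrative sketch, not part of the driver: typical use of the polling
 * helper above. The register offset, name, mask and expected value are all
 * hypothetical.
 */
static uint32_t amdgpu_example_wait_status_idle(struct amdgpu_device *adev,
						uint32_t reg)
{
	/* wait for bit 0 of a (hypothetical) status register to clear */
	return amdgpu_device_wait_on_rreg(adev, 0, reg, "EXAMPLE_STATUS",
					  0x0, 0x1);
}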
7334 
7335 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
7336 {
7337 	ssize_t size = 0;
7338 
7339 	if (!ring || !ring->adev)
7340 		return size;
7341 
7342 	if (amdgpu_device_should_recover_gpu(ring->adev))
7343 		size |= AMDGPU_RESET_TYPE_FULL;
7344 
7345 	if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
7346 	    !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
7347 		size |= AMDGPU_RESET_TYPE_SOFT_RESET;
7348 
7349 	return size;
7350 }
7351 
7352 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
7353 {
7354 	ssize_t size = 0;
7355 
7356 	if (supported_reset == 0) {
7357 		size += sysfs_emit_at(buf, size, "unsupported");
7358 		size += sysfs_emit_at(buf, size, "\n");
7359 		return size;
7360 
7361 	}
7362 
7363 	if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
7364 		size += sysfs_emit_at(buf, size, "soft ");
7365 
7366 	if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
7367 		size += sysfs_emit_at(buf, size, "queue ");
7368 
7369 	if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
7370 		size += sysfs_emit_at(buf, size, "pipe ");
7371 
7372 	if (supported_reset & AMDGPU_RESET_TYPE_FULL)
7373 		size += sysfs_emit_at(buf, size, "full ");
7374 
7375 	size += sysfs_emit_at(buf, size, "\n");
7376 	return size;
7377 }
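
/*
 * Illustrative sketch, not part of this file: a sysfs "show" callback for a
 * reset-mask attribute simply forwards a cached mask to the helper above.
 * The attribute and the gfx_supported_reset field used here are assumptions
 * about how an IP block might consume this helper.
 */
static ssize_t amdgpu_example_reset_mask_show(struct device *dev,
					      struct device_attribute *attr,
					      char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset);
}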
7378