xref: /linux/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c (revision 9c39c6ffe0c2945c7cf814814c096bc23b63f53d)
1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 
34 #include <drm/drm_atomic_helper.h>
35 #include <drm/drm_probe_helper.h>
36 #include <drm/amdgpu_drm.h>
37 #include <linux/vgaarb.h>
38 #include <linux/vga_switcheroo.h>
39 #include <linux/efi.h>
40 #include "amdgpu.h"
41 #include "amdgpu_trace.h"
42 #include "amdgpu_i2c.h"
43 #include "atom.h"
44 #include "amdgpu_atombios.h"
45 #include "amdgpu_atomfirmware.h"
46 #include "amd_pcie.h"
47 #ifdef CONFIG_DRM_AMDGPU_SI
48 #include "si.h"
49 #endif
50 #ifdef CONFIG_DRM_AMDGPU_CIK
51 #include "cik.h"
52 #endif
53 #include "vi.h"
54 #include "soc15.h"
55 #include "nv.h"
56 #include "bif/bif_4_1_d.h"
57 #include <linux/pci.h>
58 #include <linux/firmware.h>
59 #include "amdgpu_vf_error.h"
60 
61 #include "amdgpu_amdkfd.h"
62 #include "amdgpu_pm.h"
63 
64 #include "amdgpu_xgmi.h"
65 #include "amdgpu_ras.h"
66 #include "amdgpu_pmu.h"
67 #include "amdgpu_fru_eeprom.h"
68 #include "amdgpu_reset.h"
69 
70 #include <linux/suspend.h>
71 #include <drm/task_barrier.h>
72 #include <linux/pm_runtime.h>
73 
74 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
75 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
76 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
77 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
78 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
83 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
84 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
85 
86 #define AMDGPU_RESUME_MS		2000
87 
/*
 * Printable names of the ASICs known to the driver.
 *
 * NOTE(review): presumably indexed by the ASIC type enumeration, in which
 * case the order below must stay in sync with it - confirm before editing.
 */
const char *amdgpu_asic_name[] = {
	"TAHITI", "PITCAIRN", "VERDE", "OLAND", "HAINAN",
	"BONAIRE", "KAVERI", "KABINI", "HAWAII", "MULLINS",
	"TOPAZ", "TONGA", "FIJI", "CARRIZO", "STONEY",
	"POLARIS10", "POLARIS11", "POLARIS12", "VEGAM",
	"VEGA10", "VEGA12", "VEGA20", "RAVEN", "ARCTURUS",
	"RENOIR", "ALDEBARAN",
	"NAVI10", "NAVI14", "NAVI12",
	"SIENNA_CICHLID", "NAVY_FLOUNDER", "VANGOGH", "DIMGREY_CAVEFISH",
	"LAST",
};
124 
125 /**
126  * DOC: pcie_replay_count
127  *
128  * The amdgpu driver provides a sysfs API for reporting the total number
129  * of PCIe replays (NAKs)
130  * The file pcie_replay_count is used for this and returns the total
131  * number of replays as a sum of the NAKs generated and NAKs received
132  */
133 
134 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
135 		struct device_attribute *attr, char *buf)
136 {
137 	struct drm_device *ddev = dev_get_drvdata(dev);
138 	struct amdgpu_device *adev = drm_to_adev(ddev);
139 	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
140 
141 	return sysfs_emit(buf, "%llu\n", cnt);
142 }
143 
144 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
145 		amdgpu_device_get_pcie_replay_count, NULL);
146 
147 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
148 
149 /**
150  * DOC: product_name
151  *
152  * The amdgpu driver provides a sysfs API for reporting the product name
153  * for the device
154  * The file serial_number is used for this and returns the product name
155  * as returned from the FRU.
156  * NOTE: This is only available for certain server cards
157  */
158 
159 static ssize_t amdgpu_device_get_product_name(struct device *dev,
160 		struct device_attribute *attr, char *buf)
161 {
162 	struct drm_device *ddev = dev_get_drvdata(dev);
163 	struct amdgpu_device *adev = drm_to_adev(ddev);
164 
165 	return sysfs_emit(buf, "%s\n", adev->product_name);
166 }
167 
168 static DEVICE_ATTR(product_name, S_IRUGO,
169 		amdgpu_device_get_product_name, NULL);
170 
171 /**
172  * DOC: product_number
173  *
174  * The amdgpu driver provides a sysfs API for reporting the part number
175  * for the device
176  * The file serial_number is used for this and returns the part number
177  * as returned from the FRU.
178  * NOTE: This is only available for certain server cards
179  */
180 
181 static ssize_t amdgpu_device_get_product_number(struct device *dev,
182 		struct device_attribute *attr, char *buf)
183 {
184 	struct drm_device *ddev = dev_get_drvdata(dev);
185 	struct amdgpu_device *adev = drm_to_adev(ddev);
186 
187 	return sysfs_emit(buf, "%s\n", adev->product_number);
188 }
189 
190 static DEVICE_ATTR(product_number, S_IRUGO,
191 		amdgpu_device_get_product_number, NULL);
192 
193 /**
194  * DOC: serial_number
195  *
196  * The amdgpu driver provides a sysfs API for reporting the serial number
197  * for the device
198  * The file serial_number is used for this and returns the serial number
199  * as returned from the FRU.
200  * NOTE: This is only available for certain server cards
201  */
202 
203 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
204 		struct device_attribute *attr, char *buf)
205 {
206 	struct drm_device *ddev = dev_get_drvdata(dev);
207 	struct amdgpu_device *adev = drm_to_adev(ddev);
208 
209 	return sysfs_emit(buf, "%s\n", adev->serial);
210 }
211 
212 static DEVICE_ATTR(serial_number, S_IRUGO,
213 		amdgpu_device_get_serial_number, NULL);
214 
215 /**
216  * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
217  *
218  * @dev: drm_device pointer
219  *
220  * Returns true if the device is a dGPU with ATPX power control,
221  * otherwise return false.
222  */
223 bool amdgpu_device_supports_px(struct drm_device *dev)
224 {
225 	struct amdgpu_device *adev = drm_to_adev(dev);
226 
227 	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
228 		return true;
229 	return false;
230 }
231 
232 /**
233  * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
234  *
235  * @dev: drm_device pointer
236  *
237  * Returns true if the device is a dGPU with ACPI power control,
238  * otherwise return false.
239  */
240 bool amdgpu_device_supports_boco(struct drm_device *dev)
241 {
242 	struct amdgpu_device *adev = drm_to_adev(dev);
243 
244 	if (adev->has_pr3 ||
245 	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
246 		return true;
247 	return false;
248 }
249 
250 /**
251  * amdgpu_device_supports_baco - Does the device support BACO
252  *
253  * @dev: drm_device pointer
254  *
255  * Returns true if the device supporte BACO,
256  * otherwise return false.
257  */
258 bool amdgpu_device_supports_baco(struct drm_device *dev)
259 {
260 	struct amdgpu_device *adev = drm_to_adev(dev);
261 
262 	return amdgpu_asic_supports_baco(adev);
263 }
264 
265 /*
266  * VRAM access helper functions
267  */
268 
269 /**
270  * amdgpu_device_vram_access - read/write a buffer in vram
271  *
272  * @adev: amdgpu_device pointer
273  * @pos: offset of the buffer in vram
274  * @buf: virtual address of the buffer in system memory
275  * @size: read/write size, sizeof(@buf) must > @size
276  * @write: true - write to vram, otherwise - read from vram
277  */
278 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
279 			       uint32_t *buf, size_t size, bool write)
280 {
281 	unsigned long flags;
282 	uint32_t hi = ~0;
283 	uint64_t last;
284 
285 
286 #ifdef CONFIG_64BIT
287 	last = min(pos + size, adev->gmc.visible_vram_size);
288 	if (last > pos) {
289 		void __iomem *addr = adev->mman.aper_base_kaddr + pos;
290 		size_t count = last - pos;
291 
292 		if (write) {
293 			memcpy_toio(addr, buf, count);
294 			mb();
295 			amdgpu_asic_flush_hdp(adev, NULL);
296 		} else {
297 			amdgpu_asic_invalidate_hdp(adev, NULL);
298 			mb();
299 			memcpy_fromio(buf, addr, count);
300 		}
301 
302 		if (count == size)
303 			return;
304 
305 		pos += count;
306 		buf += count / 4;
307 		size -= count;
308 	}
309 #endif
310 
311 	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
312 	for (last = pos + size; pos < last; pos += 4) {
313 		uint32_t tmp = pos >> 31;
314 
315 		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
316 		if (tmp != hi) {
317 			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
318 			hi = tmp;
319 		}
320 		if (write)
321 			WREG32_NO_KIQ(mmMM_DATA, *buf++);
322 		else
323 			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
324 	}
325 	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
326 }
327 
328 /*
329  * register access helper functions.
330  */
331 
332 /* Check if hw access should be skipped because of hotplug or device error */
333 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
334 {
335 	if (adev->in_pci_err_recovery)
336 		return true;
337 
338 #ifdef CONFIG_LOCKDEP
339 	/*
340 	 * This is a bit complicated to understand, so worth a comment. What we assert
341 	 * here is that the GPU reset is not running on another thread in parallel.
342 	 *
343 	 * For this we trylock the read side of the reset semaphore, if that succeeds
344 	 * we know that the reset is not running in paralell.
345 	 *
346 	 * If the trylock fails we assert that we are either already holding the read
347 	 * side of the lock or are the reset thread itself and hold the write side of
348 	 * the lock.
349 	 */
350 	if (in_task()) {
351 		if (down_read_trylock(&adev->reset_sem))
352 			up_read(&adev->reset_sem);
353 		else
354 			lockdep_assert_held(&adev->reset_sem);
355 	}
356 #endif
357 	return false;
358 }
359 
360 /**
361  * amdgpu_device_rreg - read a memory mapped IO or indirect register
362  *
363  * @adev: amdgpu_device pointer
364  * @reg: dword aligned register offset
365  * @acc_flags: access flags which require special behavior
366  *
367  * Returns the 32 bit value from the offset specified.
368  */
369 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
370 			    uint32_t reg, uint32_t acc_flags)
371 {
372 	uint32_t ret;
373 
374 	if (amdgpu_device_skip_hw_access(adev))
375 		return 0;
376 
377 	if ((reg * 4) < adev->rmmio_size) {
378 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
379 		    amdgpu_sriov_runtime(adev) &&
380 		    down_read_trylock(&adev->reset_sem)) {
381 			ret = amdgpu_kiq_rreg(adev, reg);
382 			up_read(&adev->reset_sem);
383 		} else {
384 			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
385 		}
386 	} else {
387 		ret = adev->pcie_rreg(adev, reg * 4);
388 	}
389 
390 	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
391 
392 	return ret;
393 }
394 
395 /*
396  * MMIO register read with bytes helper functions
397  * @offset:bytes offset from MMIO start
398  *
399 */
400 
401 /**
402  * amdgpu_mm_rreg8 - read a memory mapped IO register
403  *
404  * @adev: amdgpu_device pointer
405  * @offset: byte aligned register offset
406  *
407  * Returns the 8 bit value from the offset specified.
408  */
409 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
410 {
411 	if (amdgpu_device_skip_hw_access(adev))
412 		return 0;
413 
414 	if (offset < adev->rmmio_size)
415 		return (readb(adev->rmmio + offset));
416 	BUG();
417 }
418 
419 /*
420  * MMIO register write with bytes helper functions
421  * @offset:bytes offset from MMIO start
422  * @value: the value want to be written to the register
423  *
424 */
425 /**
426  * amdgpu_mm_wreg8 - read a memory mapped IO register
427  *
428  * @adev: amdgpu_device pointer
429  * @offset: byte aligned register offset
430  * @value: 8 bit value to write
431  *
432  * Writes the value specified to the offset specified.
433  */
434 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
435 {
436 	if (amdgpu_device_skip_hw_access(adev))
437 		return;
438 
439 	if (offset < adev->rmmio_size)
440 		writeb(value, adev->rmmio + offset);
441 	else
442 		BUG();
443 }
444 
445 /**
446  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
447  *
448  * @adev: amdgpu_device pointer
449  * @reg: dword aligned register offset
450  * @v: 32 bit value to write to the register
451  * @acc_flags: access flags which require special behavior
452  *
453  * Writes the value specified to the offset specified.
454  */
455 void amdgpu_device_wreg(struct amdgpu_device *adev,
456 			uint32_t reg, uint32_t v,
457 			uint32_t acc_flags)
458 {
459 	if (amdgpu_device_skip_hw_access(adev))
460 		return;
461 
462 	if ((reg * 4) < adev->rmmio_size) {
463 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
464 		    amdgpu_sriov_runtime(adev) &&
465 		    down_read_trylock(&adev->reset_sem)) {
466 			amdgpu_kiq_wreg(adev, reg, v);
467 			up_read(&adev->reset_sem);
468 		} else {
469 			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
470 		}
471 	} else {
472 		adev->pcie_wreg(adev, reg * 4, v);
473 	}
474 
475 	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
476 }
477 
478 /*
479  * amdgpu_mm_wreg_mmio_rlc -  write register either with mmio or with RLC path if in range
480  *
481  * this function is invoked only the debugfs register access
482  * */
483 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
484 			     uint32_t reg, uint32_t v)
485 {
486 	if (amdgpu_device_skip_hw_access(adev))
487 		return;
488 
489 	if (amdgpu_sriov_fullaccess(adev) &&
490 	    adev->gfx.rlc.funcs &&
491 	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
492 		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
493 			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v, 0);
494 	} else {
495 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
496 	}
497 }
498 
499 /**
500  * amdgpu_mm_rdoorbell - read a doorbell dword
501  *
502  * @adev: amdgpu_device pointer
503  * @index: doorbell index
504  *
505  * Returns the value in the doorbell aperture at the
506  * requested doorbell index (CIK).
507  */
508 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
509 {
510 	if (amdgpu_device_skip_hw_access(adev))
511 		return 0;
512 
513 	if (index < adev->doorbell.num_doorbells) {
514 		return readl(adev->doorbell.ptr + index);
515 	} else {
516 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
517 		return 0;
518 	}
519 }
520 
521 /**
522  * amdgpu_mm_wdoorbell - write a doorbell dword
523  *
524  * @adev: amdgpu_device pointer
525  * @index: doorbell index
526  * @v: value to write
527  *
528  * Writes @v to the doorbell aperture at the
529  * requested doorbell index (CIK).
530  */
531 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
532 {
533 	if (amdgpu_device_skip_hw_access(adev))
534 		return;
535 
536 	if (index < adev->doorbell.num_doorbells) {
537 		writel(v, adev->doorbell.ptr + index);
538 	} else {
539 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
540 	}
541 }
542 
543 /**
544  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
545  *
546  * @adev: amdgpu_device pointer
547  * @index: doorbell index
548  *
549  * Returns the value in the doorbell aperture at the
550  * requested doorbell index (VEGA10+).
551  */
552 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
553 {
554 	if (amdgpu_device_skip_hw_access(adev))
555 		return 0;
556 
557 	if (index < adev->doorbell.num_doorbells) {
558 		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
559 	} else {
560 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
561 		return 0;
562 	}
563 }
564 
565 /**
566  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
567  *
568  * @adev: amdgpu_device pointer
569  * @index: doorbell index
570  * @v: value to write
571  *
572  * Writes @v to the doorbell aperture at the
573  * requested doorbell index (VEGA10+).
574  */
575 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
576 {
577 	if (amdgpu_device_skip_hw_access(adev))
578 		return;
579 
580 	if (index < adev->doorbell.num_doorbells) {
581 		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
582 	} else {
583 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
584 	}
585 }
586 
587 /**
588  * amdgpu_device_indirect_rreg - read an indirect register
589  *
590  * @adev: amdgpu_device pointer
591  * @pcie_index: mmio register offset
592  * @pcie_data: mmio register offset
593  * @reg_addr: indirect register address to read from
594  *
595  * Returns the value of indirect register @reg_addr
596  */
597 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
598 				u32 pcie_index, u32 pcie_data,
599 				u32 reg_addr)
600 {
601 	unsigned long flags;
602 	u32 r;
603 	void __iomem *pcie_index_offset;
604 	void __iomem *pcie_data_offset;
605 
606 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
607 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
608 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
609 
610 	writel(reg_addr, pcie_index_offset);
611 	readl(pcie_index_offset);
612 	r = readl(pcie_data_offset);
613 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
614 
615 	return r;
616 }
617 
618 /**
619  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
620  *
621  * @adev: amdgpu_device pointer
622  * @pcie_index: mmio register offset
623  * @pcie_data: mmio register offset
624  * @reg_addr: indirect register address to read from
625  *
626  * Returns the value of indirect register @reg_addr
627  */
628 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
629 				  u32 pcie_index, u32 pcie_data,
630 				  u32 reg_addr)
631 {
632 	unsigned long flags;
633 	u64 r;
634 	void __iomem *pcie_index_offset;
635 	void __iomem *pcie_data_offset;
636 
637 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
638 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
639 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
640 
641 	/* read low 32 bits */
642 	writel(reg_addr, pcie_index_offset);
643 	readl(pcie_index_offset);
644 	r = readl(pcie_data_offset);
645 	/* read high 32 bits */
646 	writel(reg_addr + 4, pcie_index_offset);
647 	readl(pcie_index_offset);
648 	r |= ((u64)readl(pcie_data_offset) << 32);
649 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
650 
651 	return r;
652 }
653 
654 /**
655  * amdgpu_device_indirect_wreg - write an indirect register address
656  *
657  * @adev: amdgpu_device pointer
658  * @pcie_index: mmio register offset
659  * @pcie_data: mmio register offset
660  * @reg_addr: indirect register offset
661  * @reg_data: indirect register data
662  *
663  */
664 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
665 				 u32 pcie_index, u32 pcie_data,
666 				 u32 reg_addr, u32 reg_data)
667 {
668 	unsigned long flags;
669 	void __iomem *pcie_index_offset;
670 	void __iomem *pcie_data_offset;
671 
672 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
673 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
674 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
675 
676 	writel(reg_addr, pcie_index_offset);
677 	readl(pcie_index_offset);
678 	writel(reg_data, pcie_data_offset);
679 	readl(pcie_data_offset);
680 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
681 }
682 
683 /**
684  * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
685  *
686  * @adev: amdgpu_device pointer
687  * @pcie_index: mmio register offset
688  * @pcie_data: mmio register offset
689  * @reg_addr: indirect register offset
690  * @reg_data: indirect register data
691  *
692  */
693 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
694 				   u32 pcie_index, u32 pcie_data,
695 				   u32 reg_addr, u64 reg_data)
696 {
697 	unsigned long flags;
698 	void __iomem *pcie_index_offset;
699 	void __iomem *pcie_data_offset;
700 
701 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
702 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
703 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
704 
705 	/* write low 32 bits */
706 	writel(reg_addr, pcie_index_offset);
707 	readl(pcie_index_offset);
708 	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
709 	readl(pcie_data_offset);
710 	/* write high 32 bits */
711 	writel(reg_addr + 4, pcie_index_offset);
712 	readl(pcie_index_offset);
713 	writel((u32)(reg_data >> 32), pcie_data_offset);
714 	readl(pcie_data_offset);
715 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
716 }
717 
718 /**
719  * amdgpu_invalid_rreg - dummy reg read function
720  *
721  * @adev: amdgpu_device pointer
722  * @reg: offset of register
723  *
724  * Dummy register read function.  Used for register blocks
725  * that certain asics don't have (all asics).
726  * Returns the value in the register.
727  */
728 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
729 {
730 	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
731 	BUG();
732 	return 0;
733 }
734 
735 /**
736  * amdgpu_invalid_wreg - dummy reg write function
737  *
738  * @adev: amdgpu_device pointer
739  * @reg: offset of register
740  * @v: value to write to the register
741  *
742  * Dummy register read function.  Used for register blocks
743  * that certain asics don't have (all asics).
744  */
745 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
746 {
747 	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
748 		  reg, v);
749 	BUG();
750 }
751 
752 /**
753  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
754  *
755  * @adev: amdgpu_device pointer
756  * @reg: offset of register
757  *
758  * Dummy register read function.  Used for register blocks
759  * that certain asics don't have (all asics).
760  * Returns the value in the register.
761  */
762 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
763 {
764 	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
765 	BUG();
766 	return 0;
767 }
768 
769 /**
770  * amdgpu_invalid_wreg64 - dummy reg write function
771  *
772  * @adev: amdgpu_device pointer
773  * @reg: offset of register
774  * @v: value to write to the register
775  *
776  * Dummy register read function.  Used for register blocks
777  * that certain asics don't have (all asics).
778  */
779 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
780 {
781 	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
782 		  reg, v);
783 	BUG();
784 }
785 
786 /**
787  * amdgpu_block_invalid_rreg - dummy reg read function
788  *
789  * @adev: amdgpu_device pointer
790  * @block: offset of instance
791  * @reg: offset of register
792  *
793  * Dummy register read function.  Used for register blocks
794  * that certain asics don't have (all asics).
795  * Returns the value in the register.
796  */
797 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
798 					  uint32_t block, uint32_t reg)
799 {
800 	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
801 		  reg, block);
802 	BUG();
803 	return 0;
804 }
805 
806 /**
807  * amdgpu_block_invalid_wreg - dummy reg write function
808  *
809  * @adev: amdgpu_device pointer
810  * @block: offset of instance
811  * @reg: offset of register
812  * @v: value to write to the register
813  *
814  * Dummy register read function.  Used for register blocks
815  * that certain asics don't have (all asics).
816  */
817 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
818 				      uint32_t block,
819 				      uint32_t reg, uint32_t v)
820 {
821 	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
822 		  reg, block, v);
823 	BUG();
824 }
825 
826 /**
827  * amdgpu_device_asic_init - Wrapper for atom asic_init
828  *
829  * @adev: amdgpu_device pointer
830  *
831  * Does any asic specific work and then calls atom asic init.
832  */
833 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
834 {
835 	amdgpu_asic_pre_asic_init(adev);
836 
837 	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
838 }
839 
840 /**
841  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
842  *
843  * @adev: amdgpu_device pointer
844  *
845  * Allocates a scratch page of VRAM for use by various things in the
846  * driver.
847  */
848 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
849 {
850 	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
851 				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
852 				       &adev->vram_scratch.robj,
853 				       &adev->vram_scratch.gpu_addr,
854 				       (void **)&adev->vram_scratch.ptr);
855 }
856 
857 /**
858  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
859  *
860  * @adev: amdgpu_device pointer
861  *
862  * Frees the VRAM scratch page.
863  */
864 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
865 {
866 	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
867 }
868 
869 /**
870  * amdgpu_device_program_register_sequence - program an array of registers.
871  *
872  * @adev: amdgpu_device pointer
873  * @registers: pointer to the register array
874  * @array_size: size of the register array
875  *
876  * Programs an array or registers with and and or masks.
877  * This is a helper for setting golden registers.
878  */
879 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
880 					     const u32 *registers,
881 					     const u32 array_size)
882 {
883 	u32 tmp, reg, and_mask, or_mask;
884 	int i;
885 
886 	if (array_size % 3)
887 		return;
888 
889 	for (i = 0; i < array_size; i +=3) {
890 		reg = registers[i + 0];
891 		and_mask = registers[i + 1];
892 		or_mask = registers[i + 2];
893 
894 		if (and_mask == 0xffffffff) {
895 			tmp = or_mask;
896 		} else {
897 			tmp = RREG32(reg);
898 			tmp &= ~and_mask;
899 			if (adev->family >= AMDGPU_FAMILY_AI)
900 				tmp |= (or_mask & and_mask);
901 			else
902 				tmp |= or_mask;
903 		}
904 		WREG32(reg, tmp);
905 	}
906 }
907 
908 /**
909  * amdgpu_device_pci_config_reset - reset the GPU
910  *
911  * @adev: amdgpu_device pointer
912  *
913  * Resets the GPU using the pci config reset sequence.
914  * Only applicable to asics prior to vega10.
915  */
916 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
917 {
918 	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
919 }
920 
921 /**
922  * amdgpu_device_pci_reset - reset the GPU using generic PCI means
923  *
924  * @adev: amdgpu_device pointer
925  *
926  * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
927  */
928 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
929 {
930 	return pci_reset_function(adev->pdev);
931 }
932 
933 /*
934  * GPU doorbell aperture helpers function.
935  */
936 /**
937  * amdgpu_device_doorbell_init - Init doorbell driver information.
938  *
939  * @adev: amdgpu_device pointer
940  *
941  * Init doorbell driver information (CIK)
942  * Returns 0 on success, error on failure.
943  */
944 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
945 {
946 
947 	/* No doorbell on SI hardware generation */
948 	if (adev->asic_type < CHIP_BONAIRE) {
949 		adev->doorbell.base = 0;
950 		adev->doorbell.size = 0;
951 		adev->doorbell.num_doorbells = 0;
952 		adev->doorbell.ptr = NULL;
953 		return 0;
954 	}
955 
956 	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
957 		return -EINVAL;
958 
959 	amdgpu_asic_init_doorbell_index(adev);
960 
961 	/* doorbell bar mapping */
962 	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
963 	adev->doorbell.size = pci_resource_len(adev->pdev, 2);
964 
965 	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
966 					     adev->doorbell_index.max_assignment+1);
967 	if (adev->doorbell.num_doorbells == 0)
968 		return -EINVAL;
969 
970 	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
971 	 * paging queue doorbell use the second page. The
972 	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
973 	 * doorbells are in the first page. So with paging queue enabled,
974 	 * the max num_doorbells should + 1 page (0x400 in dword)
975 	 */
976 	if (adev->asic_type >= CHIP_VEGA10)
977 		adev->doorbell.num_doorbells += 0x400;
978 
979 	adev->doorbell.ptr = ioremap(adev->doorbell.base,
980 				     adev->doorbell.num_doorbells *
981 				     sizeof(u32));
982 	if (adev->doorbell.ptr == NULL)
983 		return -ENOMEM;
984 
985 	return 0;
986 }
987 
988 /**
989  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
990  *
991  * @adev: amdgpu_device pointer
992  *
993  * Tear down doorbell driver information (CIK)
994  */
995 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
996 {
997 	iounmap(adev->doorbell.ptr);
998 	adev->doorbell.ptr = NULL;
999 }
1000 
1001 
1002 
1003 /*
1004  * amdgpu_device_wb_*()
1005  * Writeback is the method by which the GPU updates special pages in memory
1006  * with the status of certain GPU events (fences, ring pointers,etc.).
1007  */
1008 
1009 /**
1010  * amdgpu_device_wb_fini - Disable Writeback and free memory
1011  *
1012  * @adev: amdgpu_device pointer
1013  *
1014  * Disables Writeback and frees the Writeback memory (all asics).
1015  * Used at driver shutdown.
1016  */
1017 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1018 {
1019 	if (adev->wb.wb_obj) {
1020 		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1021 				      &adev->wb.gpu_addr,
1022 				      (void **)&adev->wb.wb);
1023 		adev->wb.wb_obj = NULL;
1024 	}
1025 }
1026 
1027 /**
1028  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
1029  *
1030  * @adev: amdgpu_device pointer
1031  *
1032  * Initializes writeback and allocates writeback memory (all asics).
1033  * Used at driver startup.
1034  * Returns 0 on success or an -error on failure.
1035  */
1036 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1037 {
1038 	int r;
1039 
1040 	if (adev->wb.wb_obj == NULL) {
1041 		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1042 		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1043 					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1044 					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
1045 					    (void **)&adev->wb.wb);
1046 		if (r) {
1047 			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1048 			return r;
1049 		}
1050 
1051 		adev->wb.num_wb = AMDGPU_MAX_WB;
1052 		memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1053 
1054 		/* clear wb memory */
1055 		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1056 	}
1057 
1058 	return 0;
1059 }
1060 
1061 /**
1062  * amdgpu_device_wb_get - Allocate a wb entry
1063  *
1064  * @adev: amdgpu_device pointer
1065  * @wb: wb index
1066  *
1067  * Allocate a wb slot for use by the driver (all asics).
1068  * Returns 0 on success or -EINVAL on failure.
1069  */
1070 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1071 {
1072 	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1073 
1074 	if (offset < adev->wb.num_wb) {
1075 		__set_bit(offset, adev->wb.used);
1076 		*wb = offset << 3; /* convert to dw offset */
1077 		return 0;
1078 	} else {
1079 		return -EINVAL;
1080 	}
1081 }
1082 
1083 /**
1084  * amdgpu_device_wb_free - Free a wb entry
1085  *
1086  * @adev: amdgpu_device pointer
1087  * @wb: wb index
1088  *
1089  * Free a wb slot allocated for use by the driver (all asics)
1090  */
1091 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1092 {
1093 	wb >>= 3;
1094 	if (wb < adev->wb.num_wb)
1095 		__clear_bit(wb, adev->wb.used);
1096 }
1097 
1098 /**
1099  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1100  *
1101  * @adev: amdgpu_device pointer
1102  *
1103  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1104  * to fail, but if any of the BARs is not accessible after the size we abort
1105  * driver loading by returning -ENODEV.
1106  */
1107 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1108 {
1109 	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1110 	struct pci_bus *root;
1111 	struct resource *res;
1112 	unsigned i;
1113 	u16 cmd;
1114 	int r;
1115 
1116 	/* Bypass for VF */
1117 	if (amdgpu_sriov_vf(adev))
1118 		return 0;
1119 
1120 	/* skip if the bios has already enabled large BAR */
1121 	if (adev->gmc.real_vram_size &&
1122 	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1123 		return 0;
1124 
1125 	/* Check if the root BUS has 64bit memory resources */
1126 	root = adev->pdev->bus;
1127 	while (root->parent)
1128 		root = root->parent;
1129 
1130 	pci_bus_for_each_resource(root, res, i) {
1131 		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1132 		    res->start > 0x100000000ull)
1133 			break;
1134 	}
1135 
1136 	/* Trying to resize is pointless without a root hub window above 4GB */
1137 	if (!res)
1138 		return 0;
1139 
1140 	/* Limit the BAR size to what is available */
1141 	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1142 			rbar_size);
1143 
1144 	/* Disable memory decoding while we change the BAR addresses and size */
1145 	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1146 	pci_write_config_word(adev->pdev, PCI_COMMAND,
1147 			      cmd & ~PCI_COMMAND_MEMORY);
1148 
1149 	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
1150 	amdgpu_device_doorbell_fini(adev);
1151 	if (adev->asic_type >= CHIP_BONAIRE)
1152 		pci_release_resource(adev->pdev, 2);
1153 
1154 	pci_release_resource(adev->pdev, 0);
1155 
1156 	r = pci_resize_resource(adev->pdev, 0, rbar_size);
1157 	if (r == -ENOSPC)
1158 		DRM_INFO("Not enough PCI address space for a large BAR.");
1159 	else if (r && r != -ENOTSUPP)
1160 		DRM_ERROR("Problem resizing BAR0 (%d).", r);
1161 
1162 	pci_assign_unassigned_bus_resources(adev->pdev->bus);
1163 
1164 	/* When the doorbell or fb BAR isn't available we have no chance of
1165 	 * using the device.
1166 	 */
1167 	r = amdgpu_device_doorbell_init(adev);
1168 	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1169 		return -ENODEV;
1170 
1171 	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1172 
1173 	return 0;
1174 }
1175 
/*
 * GPU helper functions.
 */
1179 /**
1180  * amdgpu_device_need_post - check if the hw need post or not
1181  *
1182  * @adev: amdgpu_device pointer
1183  *
1184  * Check if the asic has been initialized (all asics) at driver startup
1185  * or post is needed if  hw reset is performed.
1186  * Returns true if need or false if not.
1187  */
1188 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1189 {
1190 	uint32_t reg;
1191 
1192 	if (amdgpu_sriov_vf(adev))
1193 		return false;
1194 
1195 	if (amdgpu_passthrough(adev)) {
1196 		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
1197 		 * some old smc fw still need driver do vPost otherwise gpu hang, while
1198 		 * those smc fw version above 22.15 doesn't have this flaw, so we force
1199 		 * vpost executed for smc version below 22.15
1200 		 */
1201 		if (adev->asic_type == CHIP_FIJI) {
1202 			int err;
1203 			uint32_t fw_ver;
1204 			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1205 			/* force vPost if error occured */
1206 			if (err)
1207 				return true;
1208 
1209 			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1210 			if (fw_ver < 0x00160e00)
1211 				return true;
1212 		}
1213 	}
1214 
1215 	/* Don't post if we need to reset whole hive on init */
1216 	if (adev->gmc.xgmi.pending_reset)
1217 		return false;
1218 
1219 	if (adev->has_hw_reset) {
1220 		adev->has_hw_reset = false;
1221 		return true;
1222 	}
1223 
1224 	/* bios scratch used on CIK+ */
1225 	if (adev->asic_type >= CHIP_BONAIRE)
1226 		return amdgpu_atombios_scratch_need_asic_init(adev);
1227 
1228 	/* check MEM_SIZE for older asics */
1229 	reg = amdgpu_asic_get_config_memsize(adev);
1230 
1231 	if ((reg != 0) && (reg != 0xffffffff))
1232 		return false;
1233 
1234 	return true;
1235 }
1236 
1237 /* if we get transitioned to only one device, take VGA back */
1238 /**
1239  * amdgpu_device_vga_set_decode - enable/disable vga decode
1240  *
1241  * @cookie: amdgpu_device pointer
1242  * @state: enable/disable vga decode
1243  *
1244  * Enable/disable vga decode (all asics).
1245  * Returns VGA resource flags.
1246  */
1247 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1248 {
1249 	struct amdgpu_device *adev = cookie;
1250 	amdgpu_asic_set_vga_state(adev, state);
1251 	if (state)
1252 		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1253 		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1254 	else
1255 		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1256 }
1257 
1258 /**
1259  * amdgpu_device_check_block_size - validate the vm block size
1260  *
1261  * @adev: amdgpu_device pointer
1262  *
1263  * Validates the vm block size specified via module parameter.
1264  * The vm block size defines number of bits in page table versus page directory,
1265  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1266  * page table and the remaining bits are in the page directory.
1267  */
1268 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1269 {
1270 	/* defines number of bits in page table versus page directory,
1271 	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1272 	 * page table and the remaining bits are in the page directory */
1273 	if (amdgpu_vm_block_size == -1)
1274 		return;
1275 
1276 	if (amdgpu_vm_block_size < 9) {
1277 		dev_warn(adev->dev, "VM page table size (%d) too small\n",
1278 			 amdgpu_vm_block_size);
1279 		amdgpu_vm_block_size = -1;
1280 	}
1281 }
1282 
1283 /**
1284  * amdgpu_device_check_vm_size - validate the vm size
1285  *
1286  * @adev: amdgpu_device pointer
1287  *
1288  * Validates the vm size in GB specified via module parameter.
1289  * The VM size is the size of the GPU virtual memory space in GB.
1290  */
1291 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1292 {
1293 	/* no need to check the default value */
1294 	if (amdgpu_vm_size == -1)
1295 		return;
1296 
1297 	if (amdgpu_vm_size < 1) {
1298 		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1299 			 amdgpu_vm_size);
1300 		amdgpu_vm_size = -1;
1301 	}
1302 }
1303 
1304 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1305 {
1306 	struct sysinfo si;
1307 	bool is_os_64 = (sizeof(void *) == 8);
1308 	uint64_t total_memory;
1309 	uint64_t dram_size_seven_GB = 0x1B8000000;
1310 	uint64_t dram_size_three_GB = 0xB8000000;
1311 
1312 	if (amdgpu_smu_memory_pool_size == 0)
1313 		return;
1314 
1315 	if (!is_os_64) {
1316 		DRM_WARN("Not 64-bit OS, feature not supported\n");
1317 		goto def_value;
1318 	}
1319 	si_meminfo(&si);
1320 	total_memory = (uint64_t)si.totalram * si.mem_unit;
1321 
1322 	if ((amdgpu_smu_memory_pool_size == 1) ||
1323 		(amdgpu_smu_memory_pool_size == 2)) {
1324 		if (total_memory < dram_size_three_GB)
1325 			goto def_value1;
1326 	} else if ((amdgpu_smu_memory_pool_size == 4) ||
1327 		(amdgpu_smu_memory_pool_size == 8)) {
1328 		if (total_memory < dram_size_seven_GB)
1329 			goto def_value1;
1330 	} else {
1331 		DRM_WARN("Smu memory pool size not supported\n");
1332 		goto def_value;
1333 	}
1334 	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1335 
1336 	return;
1337 
1338 def_value1:
1339 	DRM_WARN("No enough system memory\n");
1340 def_value:
1341 	adev->pm.smu_prv_buffer_size = 0;
1342 }
1343 
1344 /**
1345  * amdgpu_device_check_arguments - validate module params
1346  *
1347  * @adev: amdgpu_device pointer
1348  *
1349  * Validates certain module parameters and updates
1350  * the associated values used by the driver (all asics).
1351  */
1352 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1353 {
1354 	if (amdgpu_sched_jobs < 4) {
1355 		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1356 			 amdgpu_sched_jobs);
1357 		amdgpu_sched_jobs = 4;
1358 	} else if (!is_power_of_2(amdgpu_sched_jobs)){
1359 		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1360 			 amdgpu_sched_jobs);
1361 		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1362 	}
1363 
1364 	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1365 		/* gart size must be greater or equal to 32M */
1366 		dev_warn(adev->dev, "gart size (%d) too small\n",
1367 			 amdgpu_gart_size);
1368 		amdgpu_gart_size = -1;
1369 	}
1370 
1371 	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1372 		/* gtt size must be greater or equal to 32M */
1373 		dev_warn(adev->dev, "gtt size (%d) too small\n",
1374 				 amdgpu_gtt_size);
1375 		amdgpu_gtt_size = -1;
1376 	}
1377 
1378 	/* valid range is between 4 and 9 inclusive */
1379 	if (amdgpu_vm_fragment_size != -1 &&
1380 	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1381 		dev_warn(adev->dev, "valid range is between 4 and 9\n");
1382 		amdgpu_vm_fragment_size = -1;
1383 	}
1384 
1385 	if (amdgpu_sched_hw_submission < 2) {
1386 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1387 			 amdgpu_sched_hw_submission);
1388 		amdgpu_sched_hw_submission = 2;
1389 	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1390 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1391 			 amdgpu_sched_hw_submission);
1392 		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1393 	}
1394 
1395 	amdgpu_device_check_smu_prv_buffer_size(adev);
1396 
1397 	amdgpu_device_check_vm_size(adev);
1398 
1399 	amdgpu_device_check_block_size(adev);
1400 
1401 	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1402 
1403 	amdgpu_gmc_tmz_set(adev);
1404 
1405 	amdgpu_gmc_noretry_set(adev);
1406 
1407 	return 0;
1408 }
1409 
1410 /**
1411  * amdgpu_switcheroo_set_state - set switcheroo state
1412  *
1413  * @pdev: pci dev pointer
1414  * @state: vga_switcheroo state
1415  *
1416  * Callback for the switcheroo driver.  Suspends or resumes the
1417  * the asics before or after it is powered up using ACPI methods.
1418  */
1419 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1420 					enum vga_switcheroo_state state)
1421 {
1422 	struct drm_device *dev = pci_get_drvdata(pdev);
1423 	int r;
1424 
1425 	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1426 		return;
1427 
1428 	if (state == VGA_SWITCHEROO_ON) {
1429 		pr_info("switched on\n");
1430 		/* don't suspend or resume card normally */
1431 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1432 
1433 		pci_set_power_state(pdev, PCI_D0);
1434 		amdgpu_device_load_pci_state(pdev);
1435 		r = pci_enable_device(pdev);
1436 		if (r)
1437 			DRM_WARN("pci_enable_device failed (%d)\n", r);
1438 		amdgpu_device_resume(dev, true);
1439 
1440 		dev->switch_power_state = DRM_SWITCH_POWER_ON;
1441 	} else {
1442 		pr_info("switched off\n");
1443 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1444 		amdgpu_device_suspend(dev, true);
1445 		amdgpu_device_cache_pci_state(pdev);
1446 		/* Shut down the device */
1447 		pci_disable_device(pdev);
1448 		pci_set_power_state(pdev, PCI_D3cold);
1449 		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1450 	}
1451 }
1452 
1453 /**
1454  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1455  *
1456  * @pdev: pci dev pointer
1457  *
1458  * Callback for the switcheroo driver.  Check of the switcheroo
1459  * state can be changed.
1460  * Returns true if the state can be changed, false if not.
1461  */
1462 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1463 {
1464 	struct drm_device *dev = pci_get_drvdata(pdev);
1465 
1466 	/*
1467 	* FIXME: open_count is protected by drm_global_mutex but that would lead to
1468 	* locking inversion with the driver load path. And the access here is
1469 	* completely racy anyway. So don't bother with locking for now.
1470 	*/
1471 	return atomic_read(&dev->open_count) == 0;
1472 }
1473 
1474 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1475 	.set_gpu_state = amdgpu_switcheroo_set_state,
1476 	.reprobe = NULL,
1477 	.can_switch = amdgpu_switcheroo_can_switch,
1478 };
1479 
1480 /**
1481  * amdgpu_device_ip_set_clockgating_state - set the CG state
1482  *
1483  * @dev: amdgpu_device pointer
1484  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1485  * @state: clockgating state (gate or ungate)
1486  *
1487  * Sets the requested clockgating state for all instances of
1488  * the hardware IP specified.
1489  * Returns the error code from the last instance.
1490  */
1491 int amdgpu_device_ip_set_clockgating_state(void *dev,
1492 					   enum amd_ip_block_type block_type,
1493 					   enum amd_clockgating_state state)
1494 {
1495 	struct amdgpu_device *adev = dev;
1496 	int i, r = 0;
1497 
1498 	for (i = 0; i < adev->num_ip_blocks; i++) {
1499 		if (!adev->ip_blocks[i].status.valid)
1500 			continue;
1501 		if (adev->ip_blocks[i].version->type != block_type)
1502 			continue;
1503 		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1504 			continue;
1505 		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1506 			(void *)adev, state);
1507 		if (r)
1508 			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1509 				  adev->ip_blocks[i].version->funcs->name, r);
1510 	}
1511 	return r;
1512 }
1513 
1514 /**
1515  * amdgpu_device_ip_set_powergating_state - set the PG state
1516  *
1517  * @dev: amdgpu_device pointer
1518  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1519  * @state: powergating state (gate or ungate)
1520  *
1521  * Sets the requested powergating state for all instances of
1522  * the hardware IP specified.
1523  * Returns the error code from the last instance.
1524  */
1525 int amdgpu_device_ip_set_powergating_state(void *dev,
1526 					   enum amd_ip_block_type block_type,
1527 					   enum amd_powergating_state state)
1528 {
1529 	struct amdgpu_device *adev = dev;
1530 	int i, r = 0;
1531 
1532 	for (i = 0; i < adev->num_ip_blocks; i++) {
1533 		if (!adev->ip_blocks[i].status.valid)
1534 			continue;
1535 		if (adev->ip_blocks[i].version->type != block_type)
1536 			continue;
1537 		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1538 			continue;
1539 		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1540 			(void *)adev, state);
1541 		if (r)
1542 			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1543 				  adev->ip_blocks[i].version->funcs->name, r);
1544 	}
1545 	return r;
1546 }
1547 
1548 /**
1549  * amdgpu_device_ip_get_clockgating_state - get the CG state
1550  *
1551  * @adev: amdgpu_device pointer
1552  * @flags: clockgating feature flags
1553  *
1554  * Walks the list of IPs on the device and updates the clockgating
1555  * flags for each IP.
1556  * Updates @flags with the feature flags for each hardware IP where
1557  * clockgating is enabled.
1558  */
1559 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1560 					    u32 *flags)
1561 {
1562 	int i;
1563 
1564 	for (i = 0; i < adev->num_ip_blocks; i++) {
1565 		if (!adev->ip_blocks[i].status.valid)
1566 			continue;
1567 		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1568 			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1569 	}
1570 }
1571 
1572 /**
1573  * amdgpu_device_ip_wait_for_idle - wait for idle
1574  *
1575  * @adev: amdgpu_device pointer
1576  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1577  *
1578  * Waits for the request hardware IP to be idle.
1579  * Returns 0 for success or a negative error code on failure.
1580  */
1581 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1582 				   enum amd_ip_block_type block_type)
1583 {
1584 	int i, r;
1585 
1586 	for (i = 0; i < adev->num_ip_blocks; i++) {
1587 		if (!adev->ip_blocks[i].status.valid)
1588 			continue;
1589 		if (adev->ip_blocks[i].version->type == block_type) {
1590 			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1591 			if (r)
1592 				return r;
1593 			break;
1594 		}
1595 	}
1596 	return 0;
1597 
1598 }
1599 
1600 /**
1601  * amdgpu_device_ip_is_idle - is the hardware IP idle
1602  *
1603  * @adev: amdgpu_device pointer
1604  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1605  *
1606  * Check if the hardware IP is idle or not.
1607  * Returns true if it the IP is idle, false if not.
1608  */
1609 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1610 			      enum amd_ip_block_type block_type)
1611 {
1612 	int i;
1613 
1614 	for (i = 0; i < adev->num_ip_blocks; i++) {
1615 		if (!adev->ip_blocks[i].status.valid)
1616 			continue;
1617 		if (adev->ip_blocks[i].version->type == block_type)
1618 			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1619 	}
1620 	return true;
1621 
1622 }
1623 
1624 /**
1625  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1626  *
1627  * @adev: amdgpu_device pointer
1628  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1629  *
1630  * Returns a pointer to the hardware IP block structure
1631  * if it exists for the asic, otherwise NULL.
1632  */
1633 struct amdgpu_ip_block *
1634 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1635 			      enum amd_ip_block_type type)
1636 {
1637 	int i;
1638 
1639 	for (i = 0; i < adev->num_ip_blocks; i++)
1640 		if (adev->ip_blocks[i].version->type == type)
1641 			return &adev->ip_blocks[i];
1642 
1643 	return NULL;
1644 }
1645 
1646 /**
1647  * amdgpu_device_ip_block_version_cmp
1648  *
1649  * @adev: amdgpu_device pointer
1650  * @type: enum amd_ip_block_type
1651  * @major: major version
1652  * @minor: minor version
1653  *
1654  * return 0 if equal or greater
1655  * return 1 if smaller or the ip_block doesn't exist
1656  */
1657 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1658 				       enum amd_ip_block_type type,
1659 				       u32 major, u32 minor)
1660 {
1661 	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1662 
1663 	if (ip_block && ((ip_block->version->major > major) ||
1664 			((ip_block->version->major == major) &&
1665 			(ip_block->version->minor >= minor))))
1666 		return 0;
1667 
1668 	return 1;
1669 }
1670 
1671 /**
1672  * amdgpu_device_ip_block_add
1673  *
1674  * @adev: amdgpu_device pointer
1675  * @ip_block_version: pointer to the IP to add
1676  *
1677  * Adds the IP block driver information to the collection of IPs
1678  * on the asic.
1679  */
1680 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1681 			       const struct amdgpu_ip_block_version *ip_block_version)
1682 {
1683 	if (!ip_block_version)
1684 		return -EINVAL;
1685 
1686 	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1687 		  ip_block_version->funcs->name);
1688 
1689 	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1690 
1691 	return 0;
1692 }
1693 
1694 /**
1695  * amdgpu_device_enable_virtual_display - enable virtual display feature
1696  *
1697  * @adev: amdgpu_device pointer
1698  *
1699  * Enabled the virtual display feature if the user has enabled it via
1700  * the module parameter virtual_display.  This feature provides a virtual
1701  * display hardware on headless boards or in virtualized environments.
1702  * This function parses and validates the configuration string specified by
1703  * the user and configues the virtual display configuration (number of
1704  * virtual connectors, crtcs, etc.) specified.
1705  */
1706 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1707 {
1708 	adev->enable_virtual_display = false;
1709 
1710 	if (amdgpu_virtual_display) {
1711 		const char *pci_address_name = pci_name(adev->pdev);
1712 		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1713 
1714 		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1715 		pciaddstr_tmp = pciaddstr;
1716 		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1717 			pciaddname = strsep(&pciaddname_tmp, ",");
1718 			if (!strcmp("all", pciaddname)
1719 			    || !strcmp(pci_address_name, pciaddname)) {
1720 				long num_crtc;
1721 				int res = -1;
1722 
1723 				adev->enable_virtual_display = true;
1724 
1725 				if (pciaddname_tmp)
1726 					res = kstrtol(pciaddname_tmp, 10,
1727 						      &num_crtc);
1728 
1729 				if (!res) {
1730 					if (num_crtc < 1)
1731 						num_crtc = 1;
1732 					if (num_crtc > 6)
1733 						num_crtc = 6;
1734 					adev->mode_info.num_crtc = num_crtc;
1735 				} else {
1736 					adev->mode_info.num_crtc = 1;
1737 				}
1738 				break;
1739 			}
1740 		}
1741 
1742 		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1743 			 amdgpu_virtual_display, pci_address_name,
1744 			 adev->enable_virtual_display, adev->mode_info.num_crtc);
1745 
1746 		kfree(pciaddstr);
1747 	}
1748 }
1749 
1750 /**
1751  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1752  *
1753  * @adev: amdgpu_device pointer
1754  *
1755  * Parses the asic configuration parameters specified in the gpu info
1756  * firmware and makes them availale to the driver for use in configuring
1757  * the asic.
1758  * Returns 0 on success, -EINVAL on failure.
1759  */
1760 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1761 {
1762 	const char *chip_name;
1763 	char fw_name[40];
1764 	int err;
1765 	const struct gpu_info_firmware_header_v1_0 *hdr;
1766 
1767 	adev->firmware.gpu_info_fw = NULL;
1768 
1769 	if (adev->mman.discovery_bin) {
1770 		amdgpu_discovery_get_gfx_info(adev);
1771 
1772 		/*
1773 		 * FIXME: The bounding box is still needed by Navi12, so
1774 		 * temporarily read it from gpu_info firmware. Should be droped
1775 		 * when DAL no longer needs it.
1776 		 */
1777 		if (adev->asic_type != CHIP_NAVI12)
1778 			return 0;
1779 	}
1780 
1781 	switch (adev->asic_type) {
1782 #ifdef CONFIG_DRM_AMDGPU_SI
1783 	case CHIP_VERDE:
1784 	case CHIP_TAHITI:
1785 	case CHIP_PITCAIRN:
1786 	case CHIP_OLAND:
1787 	case CHIP_HAINAN:
1788 #endif
1789 #ifdef CONFIG_DRM_AMDGPU_CIK
1790 	case CHIP_BONAIRE:
1791 	case CHIP_HAWAII:
1792 	case CHIP_KAVERI:
1793 	case CHIP_KABINI:
1794 	case CHIP_MULLINS:
1795 #endif
1796 	case CHIP_TOPAZ:
1797 	case CHIP_TONGA:
1798 	case CHIP_FIJI:
1799 	case CHIP_POLARIS10:
1800 	case CHIP_POLARIS11:
1801 	case CHIP_POLARIS12:
1802 	case CHIP_VEGAM:
1803 	case CHIP_CARRIZO:
1804 	case CHIP_STONEY:
1805 	case CHIP_VEGA20:
1806 	case CHIP_ALDEBARAN:
1807 	case CHIP_SIENNA_CICHLID:
1808 	case CHIP_NAVY_FLOUNDER:
1809 	case CHIP_DIMGREY_CAVEFISH:
1810 	default:
1811 		return 0;
1812 	case CHIP_VEGA10:
1813 		chip_name = "vega10";
1814 		break;
1815 	case CHIP_VEGA12:
1816 		chip_name = "vega12";
1817 		break;
1818 	case CHIP_RAVEN:
1819 		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1820 			chip_name = "raven2";
1821 		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1822 			chip_name = "picasso";
1823 		else
1824 			chip_name = "raven";
1825 		break;
1826 	case CHIP_ARCTURUS:
1827 		chip_name = "arcturus";
1828 		break;
1829 	case CHIP_RENOIR:
1830 		if (adev->apu_flags & AMD_APU_IS_RENOIR)
1831 			chip_name = "renoir";
1832 		else
1833 			chip_name = "green_sardine";
1834 		break;
1835 	case CHIP_NAVI10:
1836 		chip_name = "navi10";
1837 		break;
1838 	case CHIP_NAVI14:
1839 		chip_name = "navi14";
1840 		break;
1841 	case CHIP_NAVI12:
1842 		chip_name = "navi12";
1843 		break;
1844 	case CHIP_VANGOGH:
1845 		chip_name = "vangogh";
1846 		break;
1847 	}
1848 
1849 	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1850 	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1851 	if (err) {
1852 		dev_err(adev->dev,
1853 			"Failed to load gpu_info firmware \"%s\"\n",
1854 			fw_name);
1855 		goto out;
1856 	}
1857 	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1858 	if (err) {
1859 		dev_err(adev->dev,
1860 			"Failed to validate gpu_info firmware \"%s\"\n",
1861 			fw_name);
1862 		goto out;
1863 	}
1864 
1865 	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1866 	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1867 
1868 	switch (hdr->version_major) {
1869 	case 1:
1870 	{
1871 		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1872 			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1873 								le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1874 
1875 		/*
1876 		 * Should be droped when DAL no longer needs it.
1877 		 */
1878 		if (adev->asic_type == CHIP_NAVI12)
1879 			goto parse_soc_bounding_box;
1880 
1881 		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1882 		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1883 		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1884 		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1885 		adev->gfx.config.max_texture_channel_caches =
1886 			le32_to_cpu(gpu_info_fw->gc_num_tccs);
1887 		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1888 		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1889 		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1890 		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1891 		adev->gfx.config.double_offchip_lds_buf =
1892 			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1893 		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1894 		adev->gfx.cu_info.max_waves_per_simd =
1895 			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1896 		adev->gfx.cu_info.max_scratch_slots_per_cu =
1897 			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1898 		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1899 		if (hdr->version_minor >= 1) {
1900 			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1901 				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1902 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1903 			adev->gfx.config.num_sc_per_sh =
1904 				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1905 			adev->gfx.config.num_packer_per_sc =
1906 				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1907 		}
1908 
1909 parse_soc_bounding_box:
1910 		/*
1911 		 * soc bounding box info is not integrated in disocovery table,
1912 		 * we always need to parse it from gpu info firmware if needed.
1913 		 */
1914 		if (hdr->version_minor == 2) {
1915 			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1916 				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1917 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1918 			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1919 		}
1920 		break;
1921 	}
1922 	default:
1923 		dev_err(adev->dev,
1924 			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1925 		err = -EINVAL;
1926 		goto out;
1927 	}
1928 out:
1929 	return err;
1930 }
1931 
1932 /**
1933  * amdgpu_device_ip_early_init - run early init for hardware IPs
1934  *
1935  * @adev: amdgpu_device pointer
1936  *
1937  * Early initialization pass for hardware IPs.  The hardware IPs that make
1938  * up each asic are discovered each IP's early_init callback is run.  This
1939  * is the first stage in initializing the asic.
1940  * Returns 0 on success, negative error code on failure.
1941  */
1942 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1943 {
1944 	int i, r;
1945 
1946 	amdgpu_device_enable_virtual_display(adev);
1947 
1948 	if (amdgpu_sriov_vf(adev)) {
1949 		r = amdgpu_virt_request_full_gpu(adev, true);
1950 		if (r)
1951 			return r;
1952 	}
1953 
1954 	switch (adev->asic_type) {
1955 #ifdef CONFIG_DRM_AMDGPU_SI
1956 	case CHIP_VERDE:
1957 	case CHIP_TAHITI:
1958 	case CHIP_PITCAIRN:
1959 	case CHIP_OLAND:
1960 	case CHIP_HAINAN:
1961 		adev->family = AMDGPU_FAMILY_SI;
1962 		r = si_set_ip_blocks(adev);
1963 		if (r)
1964 			return r;
1965 		break;
1966 #endif
1967 #ifdef CONFIG_DRM_AMDGPU_CIK
1968 	case CHIP_BONAIRE:
1969 	case CHIP_HAWAII:
1970 	case CHIP_KAVERI:
1971 	case CHIP_KABINI:
1972 	case CHIP_MULLINS:
1973 		if (adev->flags & AMD_IS_APU)
1974 			adev->family = AMDGPU_FAMILY_KV;
1975 		else
1976 			adev->family = AMDGPU_FAMILY_CI;
1977 
1978 		r = cik_set_ip_blocks(adev);
1979 		if (r)
1980 			return r;
1981 		break;
1982 #endif
1983 	case CHIP_TOPAZ:
1984 	case CHIP_TONGA:
1985 	case CHIP_FIJI:
1986 	case CHIP_POLARIS10:
1987 	case CHIP_POLARIS11:
1988 	case CHIP_POLARIS12:
1989 	case CHIP_VEGAM:
1990 	case CHIP_CARRIZO:
1991 	case CHIP_STONEY:
1992 		if (adev->flags & AMD_IS_APU)
1993 			adev->family = AMDGPU_FAMILY_CZ;
1994 		else
1995 			adev->family = AMDGPU_FAMILY_VI;
1996 
1997 		r = vi_set_ip_blocks(adev);
1998 		if (r)
1999 			return r;
2000 		break;
2001 	case CHIP_VEGA10:
2002 	case CHIP_VEGA12:
2003 	case CHIP_VEGA20:
2004 	case CHIP_RAVEN:
2005 	case CHIP_ARCTURUS:
2006 	case CHIP_RENOIR:
2007 	case CHIP_ALDEBARAN:
2008 		if (adev->flags & AMD_IS_APU)
2009 			adev->family = AMDGPU_FAMILY_RV;
2010 		else
2011 			adev->family = AMDGPU_FAMILY_AI;
2012 
2013 		r = soc15_set_ip_blocks(adev);
2014 		if (r)
2015 			return r;
2016 		break;
2017 	case  CHIP_NAVI10:
2018 	case  CHIP_NAVI14:
2019 	case  CHIP_NAVI12:
2020 	case  CHIP_SIENNA_CICHLID:
2021 	case  CHIP_NAVY_FLOUNDER:
2022 	case  CHIP_DIMGREY_CAVEFISH:
2023 	case CHIP_VANGOGH:
2024 		if (adev->asic_type == CHIP_VANGOGH)
2025 			adev->family = AMDGPU_FAMILY_VGH;
2026 		else
2027 			adev->family = AMDGPU_FAMILY_NV;
2028 
2029 		r = nv_set_ip_blocks(adev);
2030 		if (r)
2031 			return r;
2032 		break;
2033 	default:
2034 		/* FIXME: not supported yet */
2035 		return -EINVAL;
2036 	}
2037 
2038 	amdgpu_amdkfd_device_probe(adev);
2039 
2040 	adev->pm.pp_feature = amdgpu_pp_feature_mask;
2041 	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2042 		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2043 	if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2044 		adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2045 
2046 	for (i = 0; i < adev->num_ip_blocks; i++) {
2047 		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2048 			DRM_ERROR("disabled ip block: %d <%s>\n",
2049 				  i, adev->ip_blocks[i].version->funcs->name);
2050 			adev->ip_blocks[i].status.valid = false;
2051 		} else {
2052 			if (adev->ip_blocks[i].version->funcs->early_init) {
2053 				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2054 				if (r == -ENOENT) {
2055 					adev->ip_blocks[i].status.valid = false;
2056 				} else if (r) {
2057 					DRM_ERROR("early_init of IP block <%s> failed %d\n",
2058 						  adev->ip_blocks[i].version->funcs->name, r);
2059 					return r;
2060 				} else {
2061 					adev->ip_blocks[i].status.valid = true;
2062 				}
2063 			} else {
2064 				adev->ip_blocks[i].status.valid = true;
2065 			}
2066 		}
2067 		/* get the vbios after the asic_funcs are set up */
2068 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2069 			r = amdgpu_device_parse_gpu_info_fw(adev);
2070 			if (r)
2071 				return r;
2072 
2073 			/* Read BIOS */
2074 			if (!amdgpu_get_bios(adev))
2075 				return -EINVAL;
2076 
2077 			r = amdgpu_atombios_init(adev);
2078 			if (r) {
2079 				dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2080 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2081 				return r;
2082 			}
2083 
2084 			/*get pf2vf msg info at it's earliest time*/
2085 			if (amdgpu_sriov_vf(adev))
2086 				amdgpu_virt_init_data_exchange(adev);
2087 
2088 		}
2089 	}
2090 
2091 	adev->cg_flags &= amdgpu_cg_mask;
2092 	adev->pg_flags &= amdgpu_pg_mask;
2093 
2094 	return 0;
2095 }
2096 
2097 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2098 {
2099 	int i, r;
2100 
2101 	for (i = 0; i < adev->num_ip_blocks; i++) {
2102 		if (!adev->ip_blocks[i].status.sw)
2103 			continue;
2104 		if (adev->ip_blocks[i].status.hw)
2105 			continue;
2106 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2107 		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2108 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2109 			r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2110 			if (r) {
2111 				DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2112 					  adev->ip_blocks[i].version->funcs->name, r);
2113 				return r;
2114 			}
2115 			adev->ip_blocks[i].status.hw = true;
2116 		}
2117 	}
2118 
2119 	return 0;
2120 }
2121 
2122 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2123 {
2124 	int i, r;
2125 
2126 	for (i = 0; i < adev->num_ip_blocks; i++) {
2127 		if (!adev->ip_blocks[i].status.sw)
2128 			continue;
2129 		if (adev->ip_blocks[i].status.hw)
2130 			continue;
2131 		r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2132 		if (r) {
2133 			DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2134 				  adev->ip_blocks[i].version->funcs->name, r);
2135 			return r;
2136 		}
2137 		adev->ip_blocks[i].status.hw = true;
2138 	}
2139 
2140 	return 0;
2141 }
2142 
2143 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2144 {
2145 	int r = 0;
2146 	int i;
2147 	uint32_t smu_version;
2148 
2149 	if (adev->asic_type >= CHIP_VEGA10) {
2150 		for (i = 0; i < adev->num_ip_blocks; i++) {
2151 			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2152 				continue;
2153 
2154 			if (!adev->ip_blocks[i].status.sw)
2155 				continue;
2156 
2157 			/* no need to do the fw loading again if already done*/
2158 			if (adev->ip_blocks[i].status.hw == true)
2159 				break;
2160 
2161 			if (amdgpu_in_reset(adev) || adev->in_suspend) {
2162 				r = adev->ip_blocks[i].version->funcs->resume(adev);
2163 				if (r) {
2164 					DRM_ERROR("resume of IP block <%s> failed %d\n",
2165 							  adev->ip_blocks[i].version->funcs->name, r);
2166 					return r;
2167 				}
2168 			} else {
2169 				r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2170 				if (r) {
2171 					DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2172 							  adev->ip_blocks[i].version->funcs->name, r);
2173 					return r;
2174 				}
2175 			}
2176 
2177 			adev->ip_blocks[i].status.hw = true;
2178 			break;
2179 		}
2180 	}
2181 
2182 	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2183 		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2184 
2185 	return r;
2186 }
2187 
2188 /**
2189  * amdgpu_device_ip_init - run init for hardware IPs
2190  *
2191  * @adev: amdgpu_device pointer
2192  *
2193  * Main initialization pass for hardware IPs.  The list of all the hardware
2194  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2195  * are run.  sw_init initializes the software state associated with each IP
2196  * and hw_init initializes the hardware associated with each IP.
2197  * Returns 0 on success, negative error code on failure.
2198  */
2199 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2200 {
2201 	int i, r;
2202 
2203 	r = amdgpu_ras_init(adev);
2204 	if (r)
2205 		return r;
2206 
2207 	for (i = 0; i < adev->num_ip_blocks; i++) {
2208 		if (!adev->ip_blocks[i].status.valid)
2209 			continue;
2210 		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2211 		if (r) {
2212 			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2213 				  adev->ip_blocks[i].version->funcs->name, r);
2214 			goto init_failed;
2215 		}
2216 		adev->ip_blocks[i].status.sw = true;
2217 
2218 		/* need to do gmc hw init early so we can allocate gpu mem */
2219 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2220 			r = amdgpu_device_vram_scratch_init(adev);
2221 			if (r) {
2222 				DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2223 				goto init_failed;
2224 			}
2225 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2226 			if (r) {
2227 				DRM_ERROR("hw_init %d failed %d\n", i, r);
2228 				goto init_failed;
2229 			}
2230 			r = amdgpu_device_wb_init(adev);
2231 			if (r) {
2232 				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2233 				goto init_failed;
2234 			}
2235 			adev->ip_blocks[i].status.hw = true;
2236 
2237 			/* right after GMC hw init, we create CSA */
2238 			if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2239 				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2240 								AMDGPU_GEM_DOMAIN_VRAM,
2241 								AMDGPU_CSA_SIZE);
2242 				if (r) {
2243 					DRM_ERROR("allocate CSA failed %d\n", r);
2244 					goto init_failed;
2245 				}
2246 			}
2247 		}
2248 	}
2249 
2250 	if (amdgpu_sriov_vf(adev))
2251 		amdgpu_virt_init_data_exchange(adev);
2252 
2253 	r = amdgpu_ib_pool_init(adev);
2254 	if (r) {
2255 		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2256 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2257 		goto init_failed;
2258 	}
2259 
2260 	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2261 	if (r)
2262 		goto init_failed;
2263 
2264 	r = amdgpu_device_ip_hw_init_phase1(adev);
2265 	if (r)
2266 		goto init_failed;
2267 
2268 	r = amdgpu_device_fw_loading(adev);
2269 	if (r)
2270 		goto init_failed;
2271 
2272 	r = amdgpu_device_ip_hw_init_phase2(adev);
2273 	if (r)
2274 		goto init_failed;
2275 
2276 	/*
2277 	 * retired pages will be loaded from eeprom and reserved here,
2278 	 * it should be called after amdgpu_device_ip_hw_init_phase2  since
2279 	 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2280 	 * for I2C communication which only true at this point.
2281 	 *
2282 	 * amdgpu_ras_recovery_init may fail, but the upper only cares the
2283 	 * failure from bad gpu situation and stop amdgpu init process
2284 	 * accordingly. For other failed cases, it will still release all
2285 	 * the resource and print error message, rather than returning one
2286 	 * negative value to upper level.
2287 	 *
2288 	 * Note: theoretically, this should be called before all vram allocations
2289 	 * to protect retired page from abusing
2290 	 */
2291 	r = amdgpu_ras_recovery_init(adev);
2292 	if (r)
2293 		goto init_failed;
2294 
2295 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2296 		amdgpu_xgmi_add_device(adev);
2297 
2298 	/* Don't init kfd if whole hive need to be reset during init */
2299 	if (!adev->gmc.xgmi.pending_reset)
2300 		amdgpu_amdkfd_device_init(adev);
2301 
2302 	amdgpu_fru_get_product_info(adev);
2303 
2304 init_failed:
2305 	if (amdgpu_sriov_vf(adev))
2306 		amdgpu_virt_release_full_gpu(adev, true);
2307 
2308 	return r;
2309 }
2310 
2311 /**
2312  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2313  *
2314  * @adev: amdgpu_device pointer
2315  *
2316  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2317  * this function before a GPU reset.  If the value is retained after a
2318  * GPU reset, VRAM has not been lost.  Some GPU resets may destry VRAM contents.
2319  */
2320 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2321 {
2322 	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2323 }
2324 
2325 /**
2326  * amdgpu_device_check_vram_lost - check if vram is valid
2327  *
2328  * @adev: amdgpu_device pointer
2329  *
2330  * Checks the reset magic value written to the gart pointer in VRAM.
2331  * The driver calls this after a GPU reset to see if the contents of
2332  * VRAM is lost or now.
2333  * returns true if vram is lost, false if not.
2334  */
2335 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2336 {
2337 	if (memcmp(adev->gart.ptr, adev->reset_magic,
2338 			AMDGPU_RESET_MAGIC_NUM))
2339 		return true;
2340 
2341 	if (!amdgpu_in_reset(adev))
2342 		return false;
2343 
2344 	/*
2345 	 * For all ASICs with baco/mode1 reset, the VRAM is
2346 	 * always assumed to be lost.
2347 	 */
2348 	switch (amdgpu_asic_reset_method(adev)) {
2349 	case AMD_RESET_METHOD_BACO:
2350 	case AMD_RESET_METHOD_MODE1:
2351 		return true;
2352 	default:
2353 		return false;
2354 	}
2355 }
2356 
2357 /**
2358  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2359  *
2360  * @adev: amdgpu_device pointer
2361  * @state: clockgating state (gate or ungate)
2362  *
2363  * The list of all the hardware IPs that make up the asic is walked and the
2364  * set_clockgating_state callbacks are run.
2365  * Late initialization pass enabling clockgating for hardware IPs.
2366  * Fini or suspend, pass disabling clockgating for hardware IPs.
2367  * Returns 0 on success, negative error code on failure.
2368  */
2369 
2370 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2371 			       enum amd_clockgating_state state)
2372 {
2373 	int i, j, r;
2374 
2375 	if (amdgpu_emu_mode == 1)
2376 		return 0;
2377 
2378 	for (j = 0; j < adev->num_ip_blocks; j++) {
2379 		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2380 		if (!adev->ip_blocks[i].status.late_initialized)
2381 			continue;
2382 		/* skip CG for GFX on S0ix */
2383 		if (adev->in_s0ix &&
2384 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2385 			continue;
2386 		/* skip CG for VCE/UVD, it's handled specially */
2387 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2388 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2389 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2390 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2391 		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2392 			/* enable clockgating to save power */
2393 			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2394 										     state);
2395 			if (r) {
2396 				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2397 					  adev->ip_blocks[i].version->funcs->name, r);
2398 				return r;
2399 			}
2400 		}
2401 	}
2402 
2403 	return 0;
2404 }
2405 
2406 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2407 			       enum amd_powergating_state state)
2408 {
2409 	int i, j, r;
2410 
2411 	if (amdgpu_emu_mode == 1)
2412 		return 0;
2413 
2414 	for (j = 0; j < adev->num_ip_blocks; j++) {
2415 		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2416 		if (!adev->ip_blocks[i].status.late_initialized)
2417 			continue;
2418 		/* skip PG for GFX on S0ix */
2419 		if (adev->in_s0ix &&
2420 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2421 			continue;
2422 		/* skip CG for VCE/UVD, it's handled specially */
2423 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2424 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2425 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2426 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2427 		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
2428 			/* enable powergating to save power */
2429 			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2430 											state);
2431 			if (r) {
2432 				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2433 					  adev->ip_blocks[i].version->funcs->name, r);
2434 				return r;
2435 			}
2436 		}
2437 	}
2438 	return 0;
2439 }
2440 
2441 static int amdgpu_device_enable_mgpu_fan_boost(void)
2442 {
2443 	struct amdgpu_gpu_instance *gpu_ins;
2444 	struct amdgpu_device *adev;
2445 	int i, ret = 0;
2446 
2447 	mutex_lock(&mgpu_info.mutex);
2448 
2449 	/*
2450 	 * MGPU fan boost feature should be enabled
2451 	 * only when there are two or more dGPUs in
2452 	 * the system
2453 	 */
2454 	if (mgpu_info.num_dgpu < 2)
2455 		goto out;
2456 
2457 	for (i = 0; i < mgpu_info.num_dgpu; i++) {
2458 		gpu_ins = &(mgpu_info.gpu_ins[i]);
2459 		adev = gpu_ins->adev;
2460 		if (!(adev->flags & AMD_IS_APU) &&
2461 		    !gpu_ins->mgpu_fan_enabled) {
2462 			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2463 			if (ret)
2464 				break;
2465 
2466 			gpu_ins->mgpu_fan_enabled = 1;
2467 		}
2468 	}
2469 
2470 out:
2471 	mutex_unlock(&mgpu_info.mutex);
2472 
2473 	return ret;
2474 }
2475 
2476 /**
2477  * amdgpu_device_ip_late_init - run late init for hardware IPs
2478  *
2479  * @adev: amdgpu_device pointer
2480  *
2481  * Late initialization pass for hardware IPs.  The list of all the hardware
2482  * IPs that make up the asic is walked and the late_init callbacks are run.
2483  * late_init covers any special initialization that an IP requires
2484  * after all of the have been initialized or something that needs to happen
2485  * late in the init process.
2486  * Returns 0 on success, negative error code on failure.
2487  */
2488 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2489 {
2490 	struct amdgpu_gpu_instance *gpu_instance;
2491 	int i = 0, r;
2492 
2493 	for (i = 0; i < adev->num_ip_blocks; i++) {
2494 		if (!adev->ip_blocks[i].status.hw)
2495 			continue;
2496 		if (adev->ip_blocks[i].version->funcs->late_init) {
2497 			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2498 			if (r) {
2499 				DRM_ERROR("late_init of IP block <%s> failed %d\n",
2500 					  adev->ip_blocks[i].version->funcs->name, r);
2501 				return r;
2502 			}
2503 		}
2504 		adev->ip_blocks[i].status.late_initialized = true;
2505 	}
2506 
2507 	amdgpu_ras_set_error_query_ready(adev, true);
2508 
2509 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2510 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2511 
2512 	amdgpu_device_fill_reset_magic(adev);
2513 
2514 	r = amdgpu_device_enable_mgpu_fan_boost();
2515 	if (r)
2516 		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2517 
2518 	/* For XGMI + passthrough configuration on arcturus, enable light SBR */
2519 	if (adev->asic_type == CHIP_ARCTURUS &&
2520 	    amdgpu_passthrough(adev) &&
2521 	    adev->gmc.xgmi.num_physical_nodes > 1)
2522 		smu_set_light_sbr(&adev->smu, true);
2523 
2524 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
2525 		mutex_lock(&mgpu_info.mutex);
2526 
2527 		/*
2528 		 * Reset device p-state to low as this was booted with high.
2529 		 *
2530 		 * This should be performed only after all devices from the same
2531 		 * hive get initialized.
2532 		 *
2533 		 * However, it's unknown how many device in the hive in advance.
2534 		 * As this is counted one by one during devices initializations.
2535 		 *
2536 		 * So, we wait for all XGMI interlinked devices initialized.
2537 		 * This may bring some delays as those devices may come from
2538 		 * different hives. But that should be OK.
2539 		 */
2540 		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2541 			for (i = 0; i < mgpu_info.num_gpu; i++) {
2542 				gpu_instance = &(mgpu_info.gpu_ins[i]);
2543 				if (gpu_instance->adev->flags & AMD_IS_APU)
2544 					continue;
2545 
2546 				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2547 						AMDGPU_XGMI_PSTATE_MIN);
2548 				if (r) {
2549 					DRM_ERROR("pstate setting failed (%d).\n", r);
2550 					break;
2551 				}
2552 			}
2553 		}
2554 
2555 		mutex_unlock(&mgpu_info.mutex);
2556 	}
2557 
2558 	return 0;
2559 }
2560 
2561 /**
2562  * amdgpu_device_ip_fini - run fini for hardware IPs
2563  *
2564  * @adev: amdgpu_device pointer
2565  *
2566  * Main teardown pass for hardware IPs.  The list of all the hardware
2567  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2568  * are run.  hw_fini tears down the hardware associated with each IP
2569  * and sw_fini tears down any software state associated with each IP.
2570  * Returns 0 on success, negative error code on failure.
2571  */
2572 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2573 {
2574 	int i, r;
2575 
2576 	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2577 		amdgpu_virt_release_ras_err_handler_data(adev);
2578 
2579 	amdgpu_ras_pre_fini(adev);
2580 
2581 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2582 		amdgpu_xgmi_remove_device(adev);
2583 
2584 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2585 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2586 
2587 	amdgpu_amdkfd_device_fini(adev);
2588 
2589 	/* need to disable SMC first */
2590 	for (i = 0; i < adev->num_ip_blocks; i++) {
2591 		if (!adev->ip_blocks[i].status.hw)
2592 			continue;
2593 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2594 			r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2595 			/* XXX handle errors */
2596 			if (r) {
2597 				DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2598 					  adev->ip_blocks[i].version->funcs->name, r);
2599 			}
2600 			adev->ip_blocks[i].status.hw = false;
2601 			break;
2602 		}
2603 	}
2604 
2605 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2606 		if (!adev->ip_blocks[i].status.hw)
2607 			continue;
2608 
2609 		r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2610 		/* XXX handle errors */
2611 		if (r) {
2612 			DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2613 				  adev->ip_blocks[i].version->funcs->name, r);
2614 		}
2615 
2616 		adev->ip_blocks[i].status.hw = false;
2617 	}
2618 
2619 
2620 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2621 		if (!adev->ip_blocks[i].status.sw)
2622 			continue;
2623 
2624 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2625 			amdgpu_ucode_free_bo(adev);
2626 			amdgpu_free_static_csa(&adev->virt.csa_obj);
2627 			amdgpu_device_wb_fini(adev);
2628 			amdgpu_device_vram_scratch_fini(adev);
2629 			amdgpu_ib_pool_fini(adev);
2630 		}
2631 
2632 		r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2633 		/* XXX handle errors */
2634 		if (r) {
2635 			DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2636 				  adev->ip_blocks[i].version->funcs->name, r);
2637 		}
2638 		adev->ip_blocks[i].status.sw = false;
2639 		adev->ip_blocks[i].status.valid = false;
2640 	}
2641 
2642 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2643 		if (!adev->ip_blocks[i].status.late_initialized)
2644 			continue;
2645 		if (adev->ip_blocks[i].version->funcs->late_fini)
2646 			adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2647 		adev->ip_blocks[i].status.late_initialized = false;
2648 	}
2649 
2650 	amdgpu_ras_fini(adev);
2651 
2652 	if (amdgpu_sriov_vf(adev))
2653 		if (amdgpu_virt_release_full_gpu(adev, false))
2654 			DRM_ERROR("failed to release exclusive mode on fini\n");
2655 
2656 	return 0;
2657 }
2658 
2659 /**
2660  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2661  *
2662  * @work: work_struct.
2663  */
2664 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2665 {
2666 	struct amdgpu_device *adev =
2667 		container_of(work, struct amdgpu_device, delayed_init_work.work);
2668 	int r;
2669 
2670 	r = amdgpu_ib_ring_tests(adev);
2671 	if (r)
2672 		DRM_ERROR("ib ring test failed (%d).\n", r);
2673 }
2674 
2675 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2676 {
2677 	struct amdgpu_device *adev =
2678 		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2679 
2680 	mutex_lock(&adev->gfx.gfx_off_mutex);
2681 	if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2682 		if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2683 			adev->gfx.gfx_off_state = true;
2684 	}
2685 	mutex_unlock(&adev->gfx.gfx_off_mutex);
2686 }
2687 
2688 /**
2689  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2690  *
2691  * @adev: amdgpu_device pointer
2692  *
2693  * Main suspend function for hardware IPs.  The list of all the hardware
2694  * IPs that make up the asic is walked, clockgating is disabled and the
2695  * suspend callbacks are run.  suspend puts the hardware and software state
2696  * in each IP into a state suitable for suspend.
2697  * Returns 0 on success, negative error code on failure.
2698  */
2699 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2700 {
2701 	int i, r;
2702 
2703 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2704 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2705 
2706 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2707 		if (!adev->ip_blocks[i].status.valid)
2708 			continue;
2709 
2710 		/* displays are handled separately */
2711 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2712 			continue;
2713 
2714 		/* XXX handle errors */
2715 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2716 		/* XXX handle errors */
2717 		if (r) {
2718 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2719 				  adev->ip_blocks[i].version->funcs->name, r);
2720 			return r;
2721 		}
2722 
2723 		adev->ip_blocks[i].status.hw = false;
2724 	}
2725 
2726 	return 0;
2727 }
2728 
2729 /**
2730  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2731  *
2732  * @adev: amdgpu_device pointer
2733  *
2734  * Main suspend function for hardware IPs.  The list of all the hardware
2735  * IPs that make up the asic is walked, clockgating is disabled and the
2736  * suspend callbacks are run.  suspend puts the hardware and software state
2737  * in each IP into a state suitable for suspend.
2738  * Returns 0 on success, negative error code on failure.
2739  */
2740 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2741 {
2742 	int i, r;
2743 
2744 	if (adev->in_s0ix)
2745 		amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry);
2746 
2747 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2748 		if (!adev->ip_blocks[i].status.valid)
2749 			continue;
2750 		/* displays are handled in phase1 */
2751 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2752 			continue;
2753 		/* PSP lost connection when err_event_athub occurs */
2754 		if (amdgpu_ras_intr_triggered() &&
2755 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2756 			adev->ip_blocks[i].status.hw = false;
2757 			continue;
2758 		}
2759 
2760 		/* skip unnecessary suspend if we do not initialize them yet */
2761 		if (adev->gmc.xgmi.pending_reset &&
2762 		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2763 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
2764 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2765 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
2766 			adev->ip_blocks[i].status.hw = false;
2767 			continue;
2768 		}
2769 
2770 		/* skip suspend of gfx and psp for S0ix
2771 		 * gfx is in gfxoff state, so on resume it will exit gfxoff just
2772 		 * like at runtime. PSP is also part of the always on hardware
2773 		 * so no need to suspend it.
2774 		 */
2775 		if (adev->in_s0ix &&
2776 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
2777 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX))
2778 			continue;
2779 
2780 		/* XXX handle errors */
2781 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2782 		/* XXX handle errors */
2783 		if (r) {
2784 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2785 				  adev->ip_blocks[i].version->funcs->name, r);
2786 		}
2787 		adev->ip_blocks[i].status.hw = false;
2788 		/* handle putting the SMC in the appropriate state */
2789 		if(!amdgpu_sriov_vf(adev)){
2790 			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2791 				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2792 				if (r) {
2793 					DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2794 							adev->mp1_state, r);
2795 					return r;
2796 				}
2797 			}
2798 		}
2799 	}
2800 
2801 	return 0;
2802 }
2803 
/**
 * amdgpu_device_ip_suspend - run suspend for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main suspend entry point for hardware IPs.  Runs suspend phase 1 (gating
 * off, display blocks) followed by suspend phase 2 (all other blocks).
 * Phase 2 is skipped when phase 1 fails.
 *
 * When running as an SR-IOV VF, data exchange with the host is shut down and
 * full GPU access is requested up front; that access is released again
 * before returning on every path, including a phase 1 failure.
 *
 * Returns 0 on success, negative error code on failure.
 */
int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
{
	int r;

	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_fini_data_exchange(adev);
		amdgpu_virt_request_full_gpu(adev, false);
	}

	r = amdgpu_device_ip_suspend_phase1(adev);
	if (!r)
		r = amdgpu_device_ip_suspend_phase2(adev);

	/*
	 * Always pair the request above with a release, even when phase 1
	 * failed; otherwise the VF would keep holding full GPU access.
	 */
	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, false);

	return r;
}
2834 
2835 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2836 {
2837 	int i, r;
2838 
2839 	static enum amd_ip_block_type ip_order[] = {
2840 		AMD_IP_BLOCK_TYPE_GMC,
2841 		AMD_IP_BLOCK_TYPE_COMMON,
2842 		AMD_IP_BLOCK_TYPE_PSP,
2843 		AMD_IP_BLOCK_TYPE_IH,
2844 	};
2845 
2846 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2847 		int j;
2848 		struct amdgpu_ip_block *block;
2849 
2850 		block = &adev->ip_blocks[i];
2851 		block->status.hw = false;
2852 
2853 		for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2854 
2855 			if (block->version->type != ip_order[j] ||
2856 				!block->status.valid)
2857 				continue;
2858 
2859 			r = block->version->funcs->hw_init(adev);
2860 			DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2861 			if (r)
2862 				return r;
2863 			block->status.hw = true;
2864 		}
2865 	}
2866 
2867 	return 0;
2868 }
2869 
2870 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2871 {
2872 	int i, r;
2873 
2874 	static enum amd_ip_block_type ip_order[] = {
2875 		AMD_IP_BLOCK_TYPE_SMC,
2876 		AMD_IP_BLOCK_TYPE_DCE,
2877 		AMD_IP_BLOCK_TYPE_GFX,
2878 		AMD_IP_BLOCK_TYPE_SDMA,
2879 		AMD_IP_BLOCK_TYPE_UVD,
2880 		AMD_IP_BLOCK_TYPE_VCE,
2881 		AMD_IP_BLOCK_TYPE_VCN
2882 	};
2883 
2884 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2885 		int j;
2886 		struct amdgpu_ip_block *block;
2887 
2888 		for (j = 0; j < adev->num_ip_blocks; j++) {
2889 			block = &adev->ip_blocks[j];
2890 
2891 			if (block->version->type != ip_order[i] ||
2892 				!block->status.valid ||
2893 				block->status.hw)
2894 				continue;
2895 
2896 			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2897 				r = block->version->funcs->resume(adev);
2898 			else
2899 				r = block->version->funcs->hw_init(adev);
2900 
2901 			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2902 			if (r)
2903 				return r;
2904 			block->status.hw = true;
2905 		}
2906 	}
2907 
2908 	return 0;
2909 }
2910 
2911 /**
2912  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2913  *
2914  * @adev: amdgpu_device pointer
2915  *
2916  * First resume function for hardware IPs.  The list of all the hardware
2917  * IPs that make up the asic is walked and the resume callbacks are run for
2918  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2919  * after a suspend and updates the software state as necessary.  This
2920  * function is also used for restoring the GPU after a GPU reset.
2921  * Returns 0 on success, negative error code on failure.
2922  */
2923 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2924 {
2925 	int i, r;
2926 
2927 	for (i = 0; i < adev->num_ip_blocks; i++) {
2928 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2929 			continue;
2930 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2931 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2932 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2933 
2934 			r = adev->ip_blocks[i].version->funcs->resume(adev);
2935 			if (r) {
2936 				DRM_ERROR("resume of IP block <%s> failed %d\n",
2937 					  adev->ip_blocks[i].version->funcs->name, r);
2938 				return r;
2939 			}
2940 			adev->ip_blocks[i].status.hw = true;
2941 		}
2942 	}
2943 
2944 	return 0;
2945 }
2946 
2947 /**
2948  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2949  *
2950  * @adev: amdgpu_device pointer
2951  *
2952  * First resume function for hardware IPs.  The list of all the hardware
2953  * IPs that make up the asic is walked and the resume callbacks are run for
2954  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2955  * functional state after a suspend and updates the software state as
2956  * necessary.  This function is also used for restoring the GPU after a GPU
2957  * reset.
2958  * Returns 0 on success, negative error code on failure.
2959  */
2960 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2961 {
2962 	int i, r;
2963 
2964 	for (i = 0; i < adev->num_ip_blocks; i++) {
2965 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2966 			continue;
2967 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2968 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2969 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2970 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2971 			continue;
2972 		r = adev->ip_blocks[i].version->funcs->resume(adev);
2973 		if (r) {
2974 			DRM_ERROR("resume of IP block <%s> failed %d\n",
2975 				  adev->ip_blocks[i].version->funcs->name, r);
2976 			return r;
2977 		}
2978 		adev->ip_blocks[i].status.hw = true;
2979 	}
2980 
2981 	return 0;
2982 }
2983 
/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs.  Resuming is split into two phase
 * functions because they are also used when recovering from a GPU reset,
 * where additional steps need to be taken between them.  In this case
 * (S3/S4) they are run back to back, with firmware loading in between.
 *
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
	int ret;

	ret = amdgpu_device_ip_resume_phase1(adev);
	if (ret)
		return ret;

	ret = amdgpu_device_fw_loading(adev);
	if (ret)
		return ret;

	return amdgpu_device_ip_resume_phase2(adev);
}
3012 
3013 /**
3014  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3015  *
3016  * @adev: amdgpu_device pointer
3017  *
3018  * Query the VBIOS data tables to determine if the board supports SR-IOV.
3019  */
3020 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3021 {
3022 	if (amdgpu_sriov_vf(adev)) {
3023 		if (adev->is_atom_fw) {
3024 			if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
3025 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3026 		} else {
3027 			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3028 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3029 		}
3030 
3031 		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3032 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3033 	}
3034 }
3035 
/**
 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
 *
 * @asic_type: AMD asic type
 *
 * Check if there is DC (new modesetting infrastructure) support for an asic.
 * The answer depends on the kernel configuration (CONFIG_DRM_AMD_DC,
 * CONFIG_DRM_AMD_DC_SI, CONFIG_DRM_AMD_DC_DCN) and on the amdgpu_dc
 * setting:
 *  - the first group of ASICs only uses DC when amdgpu_dc > 0,
 *  - the second group uses DC unless amdgpu_dc == 0,
 *  - everything else never uses DC.
 * Returns true if DC has support, false if not.
 */
bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
{
	switch (asic_type) {
#if defined(CONFIG_DRM_AMD_DC)
#if defined(CONFIG_DRM_AMD_DC_SI)
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
	case CHIP_BONAIRE:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		/*
		 * We have systems in the wild with these ASICs that require
		 * LVDS and VGA support which is not supported with DC.
		 *
		 * Fallback to the non-DC driver here by default so as not to
		 * cause regressions.
		 */
		return amdgpu_dc > 0;
	case CHIP_HAWAII:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_VEGA10:
	case CHIP_VEGA12:
	case CHIP_VEGA20:
#if defined(CONFIG_DRM_AMD_DC_DCN)
	case CHIP_RAVEN:
	case CHIP_NAVI10:
	case CHIP_NAVI14:
	case CHIP_NAVI12:
	case CHIP_RENOIR:
	case CHIP_SIENNA_CICHLID:
	case CHIP_NAVY_FLOUNDER:
	case CHIP_DIMGREY_CAVEFISH:
	case CHIP_VANGOGH:
#endif
		/* DC is used for these ASICs unless amdgpu_dc is explicitly 0. */
		return amdgpu_dc != 0;
#endif
	default:
		/*
		 * ASIC not listed above (or DC not built in): DC cannot be
		 * used.  Tell the user once if DC was explicitly requested.
		 */
		if (amdgpu_dc > 0)
			DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
					 "but isn't supported by ASIC, ignoring\n");
		return false;
	}
}
3098 
3099 /**
3100  * amdgpu_device_has_dc_support - check if dc is supported
3101  *
3102  * @adev: amdgpu_device pointer
3103  *
3104  * Returns true for supported, false for not supported
3105  */
3106 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3107 {
3108 	if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
3109 		return false;
3110 
3111 	return amdgpu_device_asic_has_dc_support(adev->asic_type);
3112 }
3113 
3114 
3115 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3116 {
3117 	struct amdgpu_device *adev =
3118 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
3119 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3120 
3121 	/* It's a bug to not have a hive within this function */
3122 	if (WARN_ON(!hive))
3123 		return;
3124 
3125 	/*
3126 	 * Use task barrier to synchronize all xgmi reset works across the
3127 	 * hive. task_barrier_enter and task_barrier_exit will block
3128 	 * until all the threads running the xgmi reset works reach
3129 	 * those points. task_barrier_full will do both blocks.
3130 	 */
3131 	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3132 
3133 		task_barrier_enter(&hive->tb);
3134 		adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3135 
3136 		if (adev->asic_reset_res)
3137 			goto fail;
3138 
3139 		task_barrier_exit(&hive->tb);
3140 		adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3141 
3142 		if (adev->asic_reset_res)
3143 			goto fail;
3144 
3145 		if (adev->mmhub.ras_funcs &&
3146 		    adev->mmhub.ras_funcs->reset_ras_error_count)
3147 			adev->mmhub.ras_funcs->reset_ras_error_count(adev);
3148 	} else {
3149 
3150 		task_barrier_full(&hive->tb);
3151 		adev->asic_reset_res =  amdgpu_asic_reset(adev);
3152 	}
3153 
3154 fail:
3155 	if (adev->asic_reset_res)
3156 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3157 			 adev->asic_reset_res, adev_to_drm(adev)->unique);
3158 	amdgpu_put_xgmi_hive(hive);
3159 }
3160 
3161 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3162 {
3163 	char *input = amdgpu_lockup_timeout;
3164 	char *timeout_setting = NULL;
3165 	int index = 0;
3166 	long timeout;
3167 	int ret = 0;
3168 
3169 	/*
3170 	 * By default timeout for non compute jobs is 10000.
3171 	 * And there is no timeout enforced on compute jobs.
3172 	 * In SR-IOV or passthrough mode, timeout for compute
3173 	 * jobs are 60000 by default.
3174 	 */
3175 	adev->gfx_timeout = msecs_to_jiffies(10000);
3176 	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3177 	if (amdgpu_sriov_vf(adev))
3178 		adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3179 					msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3180 	else if (amdgpu_passthrough(adev))
3181 		adev->compute_timeout =  msecs_to_jiffies(60000);
3182 	else
3183 		adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
3184 
3185 	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3186 		while ((timeout_setting = strsep(&input, ",")) &&
3187 				strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3188 			ret = kstrtol(timeout_setting, 0, &timeout);
3189 			if (ret)
3190 				return ret;
3191 
3192 			if (timeout == 0) {
3193 				index++;
3194 				continue;
3195 			} else if (timeout < 0) {
3196 				timeout = MAX_SCHEDULE_TIMEOUT;
3197 			} else {
3198 				timeout = msecs_to_jiffies(timeout);
3199 			}
3200 
3201 			switch (index++) {
3202 			case 0:
3203 				adev->gfx_timeout = timeout;
3204 				break;
3205 			case 1:
3206 				adev->compute_timeout = timeout;
3207 				break;
3208 			case 2:
3209 				adev->sdma_timeout = timeout;
3210 				break;
3211 			case 3:
3212 				adev->video_timeout = timeout;
3213 				break;
3214 			default:
3215 				break;
3216 			}
3217 		}
3218 		/*
3219 		 * There is only one value specified and
3220 		 * it should apply to all non-compute jobs.
3221 		 */
3222 		if (index == 1) {
3223 			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3224 			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3225 				adev->compute_timeout = adev->gfx_timeout;
3226 		}
3227 	}
3228 
3229 	return ret;
3230 }
3231 
/*
 * Device attributes that amdgpu_device_init() creates on the device kobject
 * with sysfs_create_files() and amdgpu_device_fini() removes again with
 * sysfs_remove_files().  The list is NULL terminated.
 */
static const struct attribute *amdgpu_dev_attributes[] = {
	&dev_attr_product_name.attr,
	&dev_attr_product_number.attr,
	&dev_attr_serial_number.attr,
	&dev_attr_pcie_replay_count.attr,
	NULL
};
3239 
3240 
3241 /**
3242  * amdgpu_device_init - initialize the driver
3243  *
3244  * @adev: amdgpu_device pointer
3245  * @flags: driver flags
3246  *
3247  * Initializes the driver info and hw (all asics).
3248  * Returns 0 for success or an error on failure.
3249  * Called at driver startup.
3250  */
3251 int amdgpu_device_init(struct amdgpu_device *adev,
3252 		       uint32_t flags)
3253 {
3254 	struct drm_device *ddev = adev_to_drm(adev);
3255 	struct pci_dev *pdev = adev->pdev;
3256 	int r, i;
3257 	bool px = false;
3258 	u32 max_MBps;
3259 
3260 	adev->shutdown = false;
3261 	adev->flags = flags;
3262 
3263 	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3264 		adev->asic_type = amdgpu_force_asic_type;
3265 	else
3266 		adev->asic_type = flags & AMD_ASIC_MASK;
3267 
3268 	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3269 	if (amdgpu_emu_mode == 1)
3270 		adev->usec_timeout *= 10;
3271 	adev->gmc.gart_size = 512 * 1024 * 1024;
3272 	adev->accel_working = false;
3273 	adev->num_rings = 0;
3274 	adev->mman.buffer_funcs = NULL;
3275 	adev->mman.buffer_funcs_ring = NULL;
3276 	adev->vm_manager.vm_pte_funcs = NULL;
3277 	adev->vm_manager.vm_pte_num_scheds = 0;
3278 	adev->gmc.gmc_funcs = NULL;
3279 	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3280 	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3281 
3282 	adev->smc_rreg = &amdgpu_invalid_rreg;
3283 	adev->smc_wreg = &amdgpu_invalid_wreg;
3284 	adev->pcie_rreg = &amdgpu_invalid_rreg;
3285 	adev->pcie_wreg = &amdgpu_invalid_wreg;
3286 	adev->pciep_rreg = &amdgpu_invalid_rreg;
3287 	adev->pciep_wreg = &amdgpu_invalid_wreg;
3288 	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3289 	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3290 	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3291 	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3292 	adev->didt_rreg = &amdgpu_invalid_rreg;
3293 	adev->didt_wreg = &amdgpu_invalid_wreg;
3294 	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3295 	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3296 	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3297 	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3298 
3299 	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3300 		 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3301 		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3302 
3303 	/* mutex initialization are all done here so we
3304 	 * can recall function without having locking issues */
3305 	mutex_init(&adev->firmware.mutex);
3306 	mutex_init(&adev->pm.mutex);
3307 	mutex_init(&adev->gfx.gpu_clock_mutex);
3308 	mutex_init(&adev->srbm_mutex);
3309 	mutex_init(&adev->gfx.pipe_reserve_mutex);
3310 	mutex_init(&adev->gfx.gfx_off_mutex);
3311 	mutex_init(&adev->grbm_idx_mutex);
3312 	mutex_init(&adev->mn_lock);
3313 	mutex_init(&adev->virt.vf_errors.lock);
3314 	hash_init(adev->mn_hash);
3315 	atomic_set(&adev->in_gpu_reset, 0);
3316 	init_rwsem(&adev->reset_sem);
3317 	mutex_init(&adev->psp.mutex);
3318 	mutex_init(&adev->notifier_lock);
3319 
3320 	r = amdgpu_device_check_arguments(adev);
3321 	if (r)
3322 		return r;
3323 
3324 	spin_lock_init(&adev->mmio_idx_lock);
3325 	spin_lock_init(&adev->smc_idx_lock);
3326 	spin_lock_init(&adev->pcie_idx_lock);
3327 	spin_lock_init(&adev->uvd_ctx_idx_lock);
3328 	spin_lock_init(&adev->didt_idx_lock);
3329 	spin_lock_init(&adev->gc_cac_idx_lock);
3330 	spin_lock_init(&adev->se_cac_idx_lock);
3331 	spin_lock_init(&adev->audio_endpt_idx_lock);
3332 	spin_lock_init(&adev->mm_stats.lock);
3333 
3334 	INIT_LIST_HEAD(&adev->shadow_list);
3335 	mutex_init(&adev->shadow_list_lock);
3336 
3337 	INIT_LIST_HEAD(&adev->reset_list);
3338 
3339 	INIT_DELAYED_WORK(&adev->delayed_init_work,
3340 			  amdgpu_device_delayed_init_work_handler);
3341 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3342 			  amdgpu_device_delay_enable_gfx_off);
3343 
3344 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3345 
3346 	adev->gfx.gfx_off_req_count = 1;
3347 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3348 
3349 	atomic_set(&adev->throttling_logging_enabled, 1);
3350 	/*
3351 	 * If throttling continues, logging will be performed every minute
3352 	 * to avoid log flooding. "-1" is subtracted since the thermal
3353 	 * throttling interrupt comes every second. Thus, the total logging
3354 	 * interval is 59 seconds(retelimited printk interval) + 1(waiting
3355 	 * for throttling interrupt) = 60 seconds.
3356 	 */
3357 	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3358 	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3359 
3360 	/* Registers mapping */
3361 	/* TODO: block userspace mapping of io register */
3362 	if (adev->asic_type >= CHIP_BONAIRE) {
3363 		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3364 		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3365 	} else {
3366 		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3367 		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3368 	}
3369 
3370 	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3371 	if (adev->rmmio == NULL) {
3372 		return -ENOMEM;
3373 	}
3374 	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3375 	DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3376 
3377 	/* enable PCIE atomic ops */
3378 	r = pci_enable_atomic_ops_to_root(adev->pdev,
3379 					  PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3380 					  PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3381 	if (r) {
3382 		adev->have_atomics_support = false;
3383 		DRM_INFO("PCIE atomic ops is not supported\n");
3384 	} else {
3385 		adev->have_atomics_support = true;
3386 	}
3387 
3388 	amdgpu_device_get_pcie_info(adev);
3389 
3390 	if (amdgpu_mcbp)
3391 		DRM_INFO("MCBP is enabled\n");
3392 
3393 	if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3394 		adev->enable_mes = true;
3395 
3396 	/* detect hw virtualization here */
3397 	amdgpu_detect_virtualization(adev);
3398 
3399 	r = amdgpu_device_get_job_timeout_settings(adev);
3400 	if (r) {
3401 		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3402 		goto failed_unmap;
3403 	}
3404 
3405 	/* early init functions */
3406 	r = amdgpu_device_ip_early_init(adev);
3407 	if (r)
3408 		goto failed_unmap;
3409 
3410 	/* doorbell bar mapping and doorbell index init*/
3411 	amdgpu_device_doorbell_init(adev);
3412 
3413 	/* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3414 	/* this will fail for cards that aren't VGA class devices, just
3415 	 * ignore it */
3416 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3417 		vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3418 
3419 	if (amdgpu_device_supports_px(ddev)) {
3420 		px = true;
3421 		vga_switcheroo_register_client(adev->pdev,
3422 					       &amdgpu_switcheroo_ops, px);
3423 		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3424 	}
3425 
3426 	if (amdgpu_emu_mode == 1) {
3427 		/* post the asic on emulation mode */
3428 		emu_soc_asic_init(adev);
3429 		goto fence_driver_init;
3430 	}
3431 
3432 	amdgpu_reset_init(adev);
3433 
3434 	/* detect if we are with an SRIOV vbios */
3435 	amdgpu_device_detect_sriov_bios(adev);
3436 
3437 	/* check if we need to reset the asic
3438 	 *  E.g., driver was not cleanly unloaded previously, etc.
3439 	 */
3440 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3441 		if (adev->gmc.xgmi.num_physical_nodes) {
3442 			dev_info(adev->dev, "Pending hive reset.\n");
3443 			adev->gmc.xgmi.pending_reset = true;
3444 			/* Only need to init necessary block for SMU to handle the reset */
3445 			for (i = 0; i < adev->num_ip_blocks; i++) {
3446 				if (!adev->ip_blocks[i].status.valid)
3447 					continue;
3448 				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3449 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3450 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3451 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3452 					DRM_DEBUG("IP %s disabled for hw_init.\n",
3453 						adev->ip_blocks[i].version->funcs->name);
3454 					adev->ip_blocks[i].status.hw = true;
3455 				}
3456 			}
3457 		} else {
3458 			r = amdgpu_asic_reset(adev);
3459 			if (r) {
3460 				dev_err(adev->dev, "asic reset on init failed\n");
3461 				goto failed;
3462 			}
3463 		}
3464 	}
3465 
3466 	pci_enable_pcie_error_reporting(adev->pdev);
3467 
3468 	/* Post card if necessary */
3469 	if (amdgpu_device_need_post(adev)) {
3470 		if (!adev->bios) {
3471 			dev_err(adev->dev, "no vBIOS found\n");
3472 			r = -EINVAL;
3473 			goto failed;
3474 		}
3475 		DRM_INFO("GPU posting now...\n");
3476 		r = amdgpu_device_asic_init(adev);
3477 		if (r) {
3478 			dev_err(adev->dev, "gpu post error!\n");
3479 			goto failed;
3480 		}
3481 	}
3482 
3483 	if (adev->is_atom_fw) {
3484 		/* Initialize clocks */
3485 		r = amdgpu_atomfirmware_get_clock_info(adev);
3486 		if (r) {
3487 			dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3488 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3489 			goto failed;
3490 		}
3491 	} else {
3492 		/* Initialize clocks */
3493 		r = amdgpu_atombios_get_clock_info(adev);
3494 		if (r) {
3495 			dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3496 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3497 			goto failed;
3498 		}
3499 		/* init i2c buses */
3500 		if (!amdgpu_device_has_dc_support(adev))
3501 			amdgpu_atombios_i2c_init(adev);
3502 	}
3503 
3504 fence_driver_init:
3505 	/* Fence driver */
3506 	r = amdgpu_fence_driver_init(adev);
3507 	if (r) {
3508 		dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3509 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3510 		goto failed;
3511 	}
3512 
3513 	/* init the mode config */
3514 	drm_mode_config_init(adev_to_drm(adev));
3515 
3516 	r = amdgpu_device_ip_init(adev);
3517 	if (r) {
3518 		/* failed in exclusive mode due to timeout */
3519 		if (amdgpu_sriov_vf(adev) &&
3520 		    !amdgpu_sriov_runtime(adev) &&
3521 		    amdgpu_virt_mmio_blocked(adev) &&
3522 		    !amdgpu_virt_wait_reset(adev)) {
3523 			dev_err(adev->dev, "VF exclusive mode timeout\n");
3524 			/* Don't send request since VF is inactive. */
3525 			adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3526 			adev->virt.ops = NULL;
3527 			r = -EAGAIN;
3528 			goto release_ras_con;
3529 		}
3530 		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3531 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3532 		goto release_ras_con;
3533 	}
3534 
3535 	dev_info(adev->dev,
3536 		"SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3537 			adev->gfx.config.max_shader_engines,
3538 			adev->gfx.config.max_sh_per_se,
3539 			adev->gfx.config.max_cu_per_sh,
3540 			adev->gfx.cu_info.number);
3541 
3542 	adev->accel_working = true;
3543 
3544 	amdgpu_vm_check_compute_bug(adev);
3545 
3546 	/* Initialize the buffer migration limit. */
3547 	if (amdgpu_moverate >= 0)
3548 		max_MBps = amdgpu_moverate;
3549 	else
3550 		max_MBps = 8; /* Allow 8 MB/s. */
3551 	/* Get a log2 for easy divisions. */
3552 	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3553 
3554 	amdgpu_fbdev_init(adev);
3555 
3556 	r = amdgpu_pm_sysfs_init(adev);
3557 	if (r) {
3558 		adev->pm_sysfs_en = false;
3559 		DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3560 	} else
3561 		adev->pm_sysfs_en = true;
3562 
3563 	r = amdgpu_ucode_sysfs_init(adev);
3564 	if (r) {
3565 		adev->ucode_sysfs_en = false;
3566 		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3567 	} else
3568 		adev->ucode_sysfs_en = true;
3569 
3570 	if ((amdgpu_testing & 1)) {
3571 		if (adev->accel_working)
3572 			amdgpu_test_moves(adev);
3573 		else
3574 			DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3575 	}
3576 	if (amdgpu_benchmarking) {
3577 		if (adev->accel_working)
3578 			amdgpu_benchmark(adev, amdgpu_benchmarking);
3579 		else
3580 			DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3581 	}
3582 
3583 	/*
3584 	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3585 	 * Otherwise the mgpu fan boost feature will be skipped due to the
3586 	 * gpu instance is counted less.
3587 	 */
3588 	amdgpu_register_gpu_instance(adev);
3589 
3590 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
3591 	 * explicit gating rather than handling it automatically.
3592 	 */
3593 	if (!adev->gmc.xgmi.pending_reset) {
3594 		r = amdgpu_device_ip_late_init(adev);
3595 		if (r) {
3596 			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3597 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3598 			goto release_ras_con;
3599 		}
3600 		/* must succeed. */
3601 		amdgpu_ras_resume(adev);
3602 		queue_delayed_work(system_wq, &adev->delayed_init_work,
3603 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
3604 	}
3605 
3606 	if (amdgpu_sriov_vf(adev))
3607 		flush_delayed_work(&adev->delayed_init_work);
3608 
3609 	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3610 	if (r)
3611 		dev_err(adev->dev, "Could not create amdgpu device attr\n");
3612 
3613 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
3614 		r = amdgpu_pmu_init(adev);
3615 	if (r)
3616 		dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3617 
3618 	/* Have stored pci confspace at hand for restore in sudden PCI error */
3619 	if (amdgpu_device_cache_pci_state(adev->pdev))
3620 		pci_restore_state(pdev);
3621 
3622 	if (adev->gmc.xgmi.pending_reset)
3623 		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3624 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
3625 
3626 	return 0;
3627 
3628 release_ras_con:
3629 	amdgpu_release_ras_context(adev);
3630 
3631 failed:
3632 	amdgpu_vf_error_trans_all(adev);
3633 	if (px)
3634 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
3635 
3636 failed_unmap:
3637 	iounmap(adev->rmmio);
3638 	adev->rmmio = NULL;
3639 
3640 	return r;
3641 }
3642 
3643 /**
3644  * amdgpu_device_fini - tear down the driver
3645  *
3646  * @adev: amdgpu_device pointer
3647  *
3648  * Tear down the driver info (all asics).
3649  * Called at driver shutdown.
3650  */
3651 void amdgpu_device_fini(struct amdgpu_device *adev)
3652 {
3653 	dev_info(adev->dev, "amdgpu: finishing device.\n");
3654 	flush_delayed_work(&adev->delayed_init_work);
3655 	ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
3656 	adev->shutdown = true;
3657 
3658 	kfree(adev->pci_state);
3659 
3660 	/* make sure IB test finished before entering exclusive mode
3661 	 * to avoid preemption on IB test
3662 	 * */
3663 	if (amdgpu_sriov_vf(adev)) {
3664 		amdgpu_virt_request_full_gpu(adev, false);
3665 		amdgpu_virt_fini_data_exchange(adev);
3666 	}
3667 
3668 	/* disable all interrupts */
3669 	amdgpu_irq_disable_all(adev);
3670 	if (adev->mode_info.mode_config_initialized){
3671 		if (!amdgpu_device_has_dc_support(adev))
3672 			drm_helper_force_disable_all(adev_to_drm(adev));
3673 		else
3674 			drm_atomic_helper_shutdown(adev_to_drm(adev));
3675 	}
3676 	amdgpu_fence_driver_fini(adev);
3677 	if (adev->pm_sysfs_en)
3678 		amdgpu_pm_sysfs_fini(adev);
3679 	amdgpu_fbdev_fini(adev);
3680 	amdgpu_device_ip_fini(adev);
3681 	release_firmware(adev->firmware.gpu_info_fw);
3682 	adev->firmware.gpu_info_fw = NULL;
3683 	adev->accel_working = false;
3684 
3685 	amdgpu_reset_fini(adev);
3686 
3687 	/* free i2c buses */
3688 	if (!amdgpu_device_has_dc_support(adev))
3689 		amdgpu_i2c_fini(adev);
3690 
3691 	if (amdgpu_emu_mode != 1)
3692 		amdgpu_atombios_fini(adev);
3693 
3694 	kfree(adev->bios);
3695 	adev->bios = NULL;
3696 	if (amdgpu_device_supports_px(adev_to_drm(adev))) {
3697 		vga_switcheroo_unregister_client(adev->pdev);
3698 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
3699 	}
3700 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3701 		vga_client_register(adev->pdev, NULL, NULL, NULL);
3702 	iounmap(adev->rmmio);
3703 	adev->rmmio = NULL;
3704 	amdgpu_device_doorbell_fini(adev);
3705 
3706 	if (adev->ucode_sysfs_en)
3707 		amdgpu_ucode_sysfs_fini(adev);
3708 
3709 	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3710 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
3711 		amdgpu_pmu_fini(adev);
3712 	if (adev->mman.discovery_bin)
3713 		amdgpu_discovery_fini(adev);
3714 }
3715 
3716 
3717 /*
3718  * Suspend & resume.
3719  */
3720 /**
3721  * amdgpu_device_suspend - initiate device suspend
3722  *
3723  * @dev: drm dev pointer
3724  * @fbcon : notify the fbdev of suspend
3725  *
3726  * Puts the hw in the suspend state (all asics).
3727  * Returns 0 for success or an error on failure.
3728  * Called at driver suspend.
3729  */
3730 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3731 {
3732 	struct amdgpu_device *adev = drm_to_adev(dev);
3733 	int r;
3734 
3735 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3736 		return 0;
3737 
3738 	adev->in_suspend = true;
3739 	drm_kms_helper_poll_disable(dev);
3740 
3741 	if (fbcon)
3742 		amdgpu_fbdev_set_suspend(adev, 1);
3743 
3744 	cancel_delayed_work_sync(&adev->delayed_init_work);
3745 
3746 	amdgpu_ras_suspend(adev);
3747 
3748 	r = amdgpu_device_ip_suspend_phase1(adev);
3749 
3750 	if (!adev->in_s0ix)
3751 		amdgpu_amdkfd_suspend(adev, adev->in_runpm);
3752 
3753 	/* evict vram memory */
3754 	amdgpu_bo_evict_vram(adev);
3755 
3756 	amdgpu_fence_driver_suspend(adev);
3757 
3758 	r = amdgpu_device_ip_suspend_phase2(adev);
3759 	/* evict remaining vram memory
3760 	 * This second call to evict vram is to evict the gart page table
3761 	 * using the CPU.
3762 	 */
3763 	amdgpu_bo_evict_vram(adev);
3764 
3765 	return 0;
3766 }
3767 
3768 /**
3769  * amdgpu_device_resume - initiate device resume
3770  *
3771  * @dev: drm dev pointer
3772  * @fbcon : notify the fbdev of resume
3773  *
3774  * Bring the hw back to operating state (all asics).
3775  * Returns 0 for success or an error on failure.
3776  * Called at driver resume.
3777  */
3778 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3779 {
3780 	struct amdgpu_device *adev = drm_to_adev(dev);
3781 	int r = 0;
3782 
3783 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3784 		return 0;
3785 
3786 	if (adev->in_s0ix)
3787 		amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry);
3788 
3789 	/* post card */
3790 	if (amdgpu_device_need_post(adev)) {
3791 		r = amdgpu_device_asic_init(adev);
3792 		if (r)
3793 			dev_err(adev->dev, "amdgpu asic init failed\n");
3794 	}
3795 
3796 	r = amdgpu_device_ip_resume(adev);
3797 	if (r) {
3798 		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3799 		return r;
3800 	}
3801 	amdgpu_fence_driver_resume(adev);
3802 
3803 
3804 	r = amdgpu_device_ip_late_init(adev);
3805 	if (r)
3806 		return r;
3807 
3808 	queue_delayed_work(system_wq, &adev->delayed_init_work,
3809 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
3810 
3811 	if (!adev->in_s0ix) {
3812 		r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
3813 		if (r)
3814 			return r;
3815 	}
3816 
3817 	/* Make sure IB tests flushed */
3818 	flush_delayed_work(&adev->delayed_init_work);
3819 
3820 	if (fbcon)
3821 		amdgpu_fbdev_set_suspend(adev, 0);
3822 
3823 	drm_kms_helper_poll_enable(dev);
3824 
3825 	amdgpu_ras_resume(adev);
3826 
3827 	/*
3828 	 * Most of the connector probing functions try to acquire runtime pm
3829 	 * refs to ensure that the GPU is powered on when connector polling is
3830 	 * performed. Since we're calling this from a runtime PM callback,
3831 	 * trying to acquire rpm refs will cause us to deadlock.
3832 	 *
3833 	 * Since we're guaranteed to be holding the rpm lock, it's safe to
3834 	 * temporarily disable the rpm helpers so this doesn't deadlock us.
3835 	 */
3836 #ifdef CONFIG_PM
3837 	dev->dev->power.disable_depth++;
3838 #endif
3839 	if (!amdgpu_device_has_dc_support(adev))
3840 		drm_helper_hpd_irq_event(dev);
3841 	else
3842 		drm_kms_helper_hotplug_event(dev);
3843 #ifdef CONFIG_PM
3844 	dev->dev->power.disable_depth--;
3845 #endif
3846 	adev->in_suspend = false;
3847 
3848 	return 0;
3849 }
3850 
3851 /**
3852  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3853  *
3854  * @adev: amdgpu_device pointer
3855  *
3856  * The list of all the hardware IPs that make up the asic is walked and
3857  * the check_soft_reset callbacks are run.  check_soft_reset determines
3858  * if the asic is still hung or not.
3859  * Returns true if any of the IPs are still in a hung state, false if not.
3860  */
3861 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3862 {
3863 	int i;
3864 	bool asic_hang = false;
3865 
3866 	if (amdgpu_sriov_vf(adev))
3867 		return true;
3868 
3869 	if (amdgpu_asic_need_full_reset(adev))
3870 		return true;
3871 
3872 	for (i = 0; i < adev->num_ip_blocks; i++) {
3873 		if (!adev->ip_blocks[i].status.valid)
3874 			continue;
3875 		if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3876 			adev->ip_blocks[i].status.hang =
3877 				adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3878 		if (adev->ip_blocks[i].status.hang) {
3879 			dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3880 			asic_hang = true;
3881 		}
3882 	}
3883 	return asic_hang;
3884 }
3885 
3886 /**
3887  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3888  *
3889  * @adev: amdgpu_device pointer
3890  *
3891  * The list of all the hardware IPs that make up the asic is walked and the
3892  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3893  * handles any IP specific hardware or software state changes that are
3894  * necessary for a soft reset to succeed.
3895  * Returns 0 on success, negative error code on failure.
3896  */
3897 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3898 {
3899 	int i, r = 0;
3900 
3901 	for (i = 0; i < adev->num_ip_blocks; i++) {
3902 		if (!adev->ip_blocks[i].status.valid)
3903 			continue;
3904 		if (adev->ip_blocks[i].status.hang &&
3905 		    adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3906 			r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3907 			if (r)
3908 				return r;
3909 		}
3910 	}
3911 
3912 	return 0;
3913 }
3914 
3915 /**
3916  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3917  *
3918  * @adev: amdgpu_device pointer
3919  *
3920  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3921  * reset is necessary to recover.
3922  * Returns true if a full asic reset is required, false if not.
3923  */
3924 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3925 {
3926 	int i;
3927 
3928 	if (amdgpu_asic_need_full_reset(adev))
3929 		return true;
3930 
3931 	for (i = 0; i < adev->num_ip_blocks; i++) {
3932 		if (!adev->ip_blocks[i].status.valid)
3933 			continue;
3934 		if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3935 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3936 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3937 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3938 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3939 			if (adev->ip_blocks[i].status.hang) {
3940 				dev_info(adev->dev, "Some block need full reset!\n");
3941 				return true;
3942 			}
3943 		}
3944 	}
3945 	return false;
3946 }
3947 
3948 /**
3949  * amdgpu_device_ip_soft_reset - do a soft reset
3950  *
3951  * @adev: amdgpu_device pointer
3952  *
3953  * The list of all the hardware IPs that make up the asic is walked and the
3954  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3955  * IP specific hardware or software state changes that are necessary to soft
3956  * reset the IP.
3957  * Returns 0 on success, negative error code on failure.
3958  */
3959 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3960 {
3961 	int i, r = 0;
3962 
3963 	for (i = 0; i < adev->num_ip_blocks; i++) {
3964 		if (!adev->ip_blocks[i].status.valid)
3965 			continue;
3966 		if (adev->ip_blocks[i].status.hang &&
3967 		    adev->ip_blocks[i].version->funcs->soft_reset) {
3968 			r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3969 			if (r)
3970 				return r;
3971 		}
3972 	}
3973 
3974 	return 0;
3975 }
3976 
3977 /**
3978  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3979  *
3980  * @adev: amdgpu_device pointer
3981  *
3982  * The list of all the hardware IPs that make up the asic is walked and the
3983  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3984  * handles any IP specific hardware or software state changes that are
3985  * necessary after the IP has been soft reset.
3986  * Returns 0 on success, negative error code on failure.
3987  */
3988 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3989 {
3990 	int i, r = 0;
3991 
3992 	for (i = 0; i < adev->num_ip_blocks; i++) {
3993 		if (!adev->ip_blocks[i].status.valid)
3994 			continue;
3995 		if (adev->ip_blocks[i].status.hang &&
3996 		    adev->ip_blocks[i].version->funcs->post_soft_reset)
3997 			r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3998 		if (r)
3999 			return r;
4000 	}
4001 
4002 	return 0;
4003 }
4004 
4005 /**
4006  * amdgpu_device_recover_vram - Recover some VRAM contents
4007  *
4008  * @adev: amdgpu_device pointer
4009  *
4010  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
4011  * restore things like GPUVM page tables after a GPU reset where
4012  * the contents of VRAM might be lost.
4013  *
4014  * Returns:
4015  * 0 on success, negative error code on failure.
4016  */
4017 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4018 {
4019 	struct dma_fence *fence = NULL, *next = NULL;
4020 	struct amdgpu_bo *shadow;
4021 	long r = 1, tmo;
4022 
4023 	if (amdgpu_sriov_runtime(adev))
4024 		tmo = msecs_to_jiffies(8000);
4025 	else
4026 		tmo = msecs_to_jiffies(100);
4027 
4028 	dev_info(adev->dev, "recover vram bo from shadow start\n");
4029 	mutex_lock(&adev->shadow_list_lock);
4030 	list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4031 
4032 		/* No need to recover an evicted BO */
4033 		if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
4034 		    shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
4035 		    shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4036 			continue;
4037 
4038 		r = amdgpu_bo_restore_shadow(shadow, &next);
4039 		if (r)
4040 			break;
4041 
4042 		if (fence) {
4043 			tmo = dma_fence_wait_timeout(fence, false, tmo);
4044 			dma_fence_put(fence);
4045 			fence = next;
4046 			if (tmo == 0) {
4047 				r = -ETIMEDOUT;
4048 				break;
4049 			} else if (tmo < 0) {
4050 				r = tmo;
4051 				break;
4052 			}
4053 		} else {
4054 			fence = next;
4055 		}
4056 	}
4057 	mutex_unlock(&adev->shadow_list_lock);
4058 
4059 	if (fence)
4060 		tmo = dma_fence_wait_timeout(fence, false, tmo);
4061 	dma_fence_put(fence);
4062 
4063 	if (r < 0 || tmo <= 0) {
4064 		dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4065 		return -EIO;
4066 	}
4067 
4068 	dev_info(adev->dev, "recover vram bo from shadow done\n");
4069 	return 0;
4070 }
4071 
4072 
4073 /**
4074  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4075  *
4076  * @adev: amdgpu_device pointer
4077  * @from_hypervisor: request from hypervisor
4078  *
4079  * do VF FLR and reinitialize Asic
4080  * return 0 means succeeded otherwise failed
4081  */
4082 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4083 				     bool from_hypervisor)
4084 {
4085 	int r;
4086 
4087 	if (from_hypervisor)
4088 		r = amdgpu_virt_request_full_gpu(adev, true);
4089 	else
4090 		r = amdgpu_virt_reset_gpu(adev);
4091 	if (r)
4092 		return r;
4093 
4094 	amdgpu_amdkfd_pre_reset(adev);
4095 
4096 	/* Resume IP prior to SMC */
4097 	r = amdgpu_device_ip_reinit_early_sriov(adev);
4098 	if (r)
4099 		goto error;
4100 
4101 	amdgpu_virt_init_data_exchange(adev);
4102 	/* we need recover gart prior to run SMC/CP/SDMA resume */
4103 	amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4104 
4105 	r = amdgpu_device_fw_loading(adev);
4106 	if (r)
4107 		return r;
4108 
4109 	/* now we are okay to resume SMC/CP/SDMA */
4110 	r = amdgpu_device_ip_reinit_late_sriov(adev);
4111 	if (r)
4112 		goto error;
4113 
4114 	amdgpu_irq_gpu_reset_resume_helper(adev);
4115 	r = amdgpu_ib_ring_tests(adev);
4116 	amdgpu_amdkfd_post_reset(adev);
4117 
4118 error:
4119 	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4120 		amdgpu_inc_vram_lost(adev);
4121 		r = amdgpu_device_recover_vram(adev);
4122 	}
4123 	amdgpu_virt_release_full_gpu(adev, true);
4124 
4125 	return r;
4126 }
4127 
4128 /**
4129  * amdgpu_device_has_job_running - check if there is any job in mirror list
4130  *
4131  * @adev: amdgpu_device pointer
4132  *
4133  * check if there is any job in mirror list
4134  */
4135 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4136 {
4137 	int i;
4138 	struct drm_sched_job *job;
4139 
4140 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4141 		struct amdgpu_ring *ring = adev->rings[i];
4142 
4143 		if (!ring || !ring->sched.thread)
4144 			continue;
4145 
4146 		spin_lock(&ring->sched.job_list_lock);
4147 		job = list_first_entry_or_null(&ring->sched.pending_list,
4148 					       struct drm_sched_job, list);
4149 		spin_unlock(&ring->sched.job_list_lock);
4150 		if (job)
4151 			return true;
4152 	}
4153 	return false;
4154 }
4155 
4156 /**
4157  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4158  *
4159  * @adev: amdgpu_device pointer
4160  *
4161  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4162  * a hung GPU.
4163  */
4164 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4165 {
4166 	if (!amdgpu_device_ip_check_soft_reset(adev)) {
4167 		dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4168 		return false;
4169 	}
4170 
4171 	if (amdgpu_gpu_recovery == 0)
4172 		goto disabled;
4173 
4174 	if (amdgpu_sriov_vf(adev))
4175 		return true;
4176 
4177 	if (amdgpu_gpu_recovery == -1) {
4178 		switch (adev->asic_type) {
4179 		case CHIP_BONAIRE:
4180 		case CHIP_HAWAII:
4181 		case CHIP_TOPAZ:
4182 		case CHIP_TONGA:
4183 		case CHIP_FIJI:
4184 		case CHIP_POLARIS10:
4185 		case CHIP_POLARIS11:
4186 		case CHIP_POLARIS12:
4187 		case CHIP_VEGAM:
4188 		case CHIP_VEGA20:
4189 		case CHIP_VEGA10:
4190 		case CHIP_VEGA12:
4191 		case CHIP_RAVEN:
4192 		case CHIP_ARCTURUS:
4193 		case CHIP_RENOIR:
4194 		case CHIP_NAVI10:
4195 		case CHIP_NAVI14:
4196 		case CHIP_NAVI12:
4197 		case CHIP_SIENNA_CICHLID:
4198 		case CHIP_NAVY_FLOUNDER:
4199 		case CHIP_DIMGREY_CAVEFISH:
4200 		case CHIP_VANGOGH:
4201 		case CHIP_ALDEBARAN:
4202 			break;
4203 		default:
4204 			goto disabled;
4205 		}
4206 	}
4207 
4208 	return true;
4209 
4210 disabled:
4211 		dev_info(adev->dev, "GPU recovery disabled.\n");
4212 		return false;
4213 }
4214 
4215 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4216 {
4217         u32 i;
4218         int ret = 0;
4219 
4220         amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4221 
4222         dev_info(adev->dev, "GPU mode1 reset\n");
4223 
4224         /* disable BM */
4225         pci_clear_master(adev->pdev);
4226 
4227         amdgpu_device_cache_pci_state(adev->pdev);
4228 
4229         if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4230                 dev_info(adev->dev, "GPU smu mode1 reset\n");
4231                 ret = amdgpu_dpm_mode1_reset(adev);
4232         } else {
4233                 dev_info(adev->dev, "GPU psp mode1 reset\n");
4234                 ret = psp_gpu_reset(adev);
4235         }
4236 
4237         if (ret)
4238                 dev_err(adev->dev, "GPU mode1 reset failed\n");
4239 
4240         amdgpu_device_load_pci_state(adev->pdev);
4241 
4242         /* wait for asic to come out of reset */
4243         for (i = 0; i < adev->usec_timeout; i++) {
4244                 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4245 
4246                 if (memsize != 0xffffffff)
4247                         break;
4248                 udelay(1);
4249         }
4250 
4251         amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4252         return ret;
4253 }
4254 
4255 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4256 				 struct amdgpu_reset_context *reset_context)
4257 {
4258 	int i, r = 0;
4259 	struct amdgpu_job *job = NULL;
4260 	bool need_full_reset =
4261 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4262 
4263 	if (reset_context->reset_req_dev == adev)
4264 		job = reset_context->job;
4265 
4266 	/* no need to dump if device is not in good state during probe period */
4267 	if (!adev->gmc.xgmi.pending_reset)
4268 		amdgpu_debugfs_wait_dump(adev);
4269 
4270 	if (amdgpu_sriov_vf(adev)) {
4271 		/* stop the data exchange thread */
4272 		amdgpu_virt_fini_data_exchange(adev);
4273 	}
4274 
4275 	/* block all schedulers and reset given job's ring */
4276 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4277 		struct amdgpu_ring *ring = adev->rings[i];
4278 
4279 		if (!ring || !ring->sched.thread)
4280 			continue;
4281 
4282 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4283 		amdgpu_fence_driver_force_completion(ring);
4284 	}
4285 
4286 	if(job)
4287 		drm_sched_increase_karma(&job->base);
4288 
4289 	r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4290 	/* If reset handler not implemented, continue; otherwise return */
4291 	if (r == -ENOSYS)
4292 		r = 0;
4293 	else
4294 		return r;
4295 
4296 	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4297 	if (!amdgpu_sriov_vf(adev)) {
4298 
4299 		if (!need_full_reset)
4300 			need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4301 
4302 		if (!need_full_reset) {
4303 			amdgpu_device_ip_pre_soft_reset(adev);
4304 			r = amdgpu_device_ip_soft_reset(adev);
4305 			amdgpu_device_ip_post_soft_reset(adev);
4306 			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4307 				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4308 				need_full_reset = true;
4309 			}
4310 		}
4311 
4312 		if (need_full_reset)
4313 			r = amdgpu_device_ip_suspend(adev);
4314 		if (need_full_reset)
4315 			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4316 		else
4317 			clear_bit(AMDGPU_NEED_FULL_RESET,
4318 				  &reset_context->flags);
4319 	}
4320 
4321 	return r;
4322 }
4323 
4324 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4325 			 struct amdgpu_reset_context *reset_context)
4326 {
4327 	struct amdgpu_device *tmp_adev = NULL;
4328 	bool need_full_reset, skip_hw_reset, vram_lost = false;
4329 	int r = 0;
4330 
4331 	/* Try reset handler method first */
4332 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4333 				    reset_list);
4334 	r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
4335 	/* If reset handler not implemented, continue; otherwise return */
4336 	if (r == -ENOSYS)
4337 		r = 0;
4338 	else
4339 		return r;
4340 
4341 	/* Reset handler not implemented, use the default method */
4342 	need_full_reset =
4343 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4344 	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4345 
4346 	/*
4347 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
4348 	 * to allow proper links negotiation in FW (within 1 sec)
4349 	 */
4350 	if (!skip_hw_reset && need_full_reset) {
4351 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4352 			/* For XGMI run all resets in parallel to speed up the process */
4353 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4354 				tmp_adev->gmc.xgmi.pending_reset = false;
4355 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4356 					r = -EALREADY;
4357 			} else
4358 				r = amdgpu_asic_reset(tmp_adev);
4359 
4360 			if (r) {
4361 				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4362 					 r, adev_to_drm(tmp_adev)->unique);
4363 				break;
4364 			}
4365 		}
4366 
4367 		/* For XGMI wait for all resets to complete before proceed */
4368 		if (!r) {
4369 			list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4370 				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4371 					flush_work(&tmp_adev->xgmi_reset_work);
4372 					r = tmp_adev->asic_reset_res;
4373 					if (r)
4374 						break;
4375 				}
4376 			}
4377 		}
4378 	}
4379 
4380 	if (!r && amdgpu_ras_intr_triggered()) {
4381 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4382 			if (tmp_adev->mmhub.ras_funcs &&
4383 			    tmp_adev->mmhub.ras_funcs->reset_ras_error_count)
4384 				tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev);
4385 		}
4386 
4387 		amdgpu_ras_intr_cleared();
4388 	}
4389 
4390 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4391 		if (need_full_reset) {
4392 			/* post card */
4393 			r = amdgpu_device_asic_init(tmp_adev);
4394 			if (r) {
4395 				dev_warn(tmp_adev->dev, "asic atom init failed!");
4396 			} else {
4397 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4398 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
4399 				if (r)
4400 					goto out;
4401 
4402 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4403 				if (vram_lost) {
4404 					DRM_INFO("VRAM is lost due to GPU reset!\n");
4405 					amdgpu_inc_vram_lost(tmp_adev);
4406 				}
4407 
4408 				r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4409 				if (r)
4410 					goto out;
4411 
4412 				r = amdgpu_device_fw_loading(tmp_adev);
4413 				if (r)
4414 					return r;
4415 
4416 				r = amdgpu_device_ip_resume_phase2(tmp_adev);
4417 				if (r)
4418 					goto out;
4419 
4420 				if (vram_lost)
4421 					amdgpu_device_fill_reset_magic(tmp_adev);
4422 
4423 				/*
4424 				 * Add this ASIC as tracked as reset was already
4425 				 * complete successfully.
4426 				 */
4427 				amdgpu_register_gpu_instance(tmp_adev);
4428 
4429 				if (!reset_context->hive &&
4430 				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4431 					amdgpu_xgmi_add_device(tmp_adev);
4432 
4433 				r = amdgpu_device_ip_late_init(tmp_adev);
4434 				if (r)
4435 					goto out;
4436 
4437 				amdgpu_fbdev_set_suspend(tmp_adev, 0);
4438 
4439 				/*
4440 				 * The GPU enters bad state once faulty pages
4441 				 * by ECC has reached the threshold, and ras
4442 				 * recovery is scheduled next. So add one check
4443 				 * here to break recovery if it indeed exceeds
4444 				 * bad page threshold, and remind user to
4445 				 * retire this GPU or setting one bigger
4446 				 * bad_page_threshold value to fix this once
4447 				 * probing driver again.
4448 				 */
4449 				if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
4450 					/* must succeed. */
4451 					amdgpu_ras_resume(tmp_adev);
4452 				} else {
4453 					r = -EINVAL;
4454 					goto out;
4455 				}
4456 
4457 				/* Update PSP FW topology after reset */
4458 				if (reset_context->hive &&
4459 				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4460 					r = amdgpu_xgmi_update_topology(
4461 						reset_context->hive, tmp_adev);
4462 			}
4463 		}
4464 
4465 out:
4466 		if (!r) {
4467 			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4468 			r = amdgpu_ib_ring_tests(tmp_adev);
4469 			if (r) {
4470 				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4471 				r = amdgpu_device_ip_suspend(tmp_adev);
4472 				need_full_reset = true;
4473 				r = -EAGAIN;
4474 				goto end;
4475 			}
4476 		}
4477 
4478 		if (!r)
4479 			r = amdgpu_device_recover_vram(tmp_adev);
4480 		else
4481 			tmp_adev->asic_reset_res = r;
4482 	}
4483 
4484 end:
4485 	if (need_full_reset)
4486 		set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4487 	else
4488 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4489 	return r;
4490 }
4491 
4492 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4493 				struct amdgpu_hive_info *hive)
4494 {
4495 	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4496 		return false;
4497 
4498 	if (hive) {
4499 		down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4500 	} else {
4501 		down_write(&adev->reset_sem);
4502 	}
4503 
4504 	switch (amdgpu_asic_reset_method(adev)) {
4505 	case AMD_RESET_METHOD_MODE1:
4506 		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4507 		break;
4508 	case AMD_RESET_METHOD_MODE2:
4509 		adev->mp1_state = PP_MP1_STATE_RESET;
4510 		break;
4511 	default:
4512 		adev->mp1_state = PP_MP1_STATE_NONE;
4513 		break;
4514 	}
4515 
4516 	return true;
4517 }
4518 
4519 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4520 {
4521 	amdgpu_vf_error_trans_all(adev);
4522 	adev->mp1_state = PP_MP1_STATE_NONE;
4523 	atomic_set(&adev->in_gpu_reset, 0);
4524 	up_write(&adev->reset_sem);
4525 }
4526 
4527 /*
4528  * to lockup a list of amdgpu devices in a hive safely, if not a hive
4529  * with multiple nodes, it will be similar as amdgpu_device_lock_adev.
4530  *
4531  * unlock won't require roll back.
4532  */
4533 static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
4534 {
4535 	struct amdgpu_device *tmp_adev = NULL;
4536 
4537 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
4538 		if (!hive) {
4539 			dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
4540 			return -ENODEV;
4541 		}
4542 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4543 			if (!amdgpu_device_lock_adev(tmp_adev, hive))
4544 				goto roll_back;
4545 		}
4546 	} else if (!amdgpu_device_lock_adev(adev, hive))
4547 		return -EAGAIN;
4548 
4549 	return 0;
4550 roll_back:
4551 	if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
4552 		/*
4553 		 * if the lockup iteration break in the middle of a hive,
4554 		 * it may means there may has a race issue,
4555 		 * or a hive device locked up independently.
4556 		 * we may be in trouble and may not, so will try to roll back
4557 		 * the lock and give out a warnning.
4558 		 */
4559 		dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
4560 		list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4561 			amdgpu_device_unlock_adev(tmp_adev);
4562 		}
4563 	}
4564 	return -EAGAIN;
4565 }
4566 
4567 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4568 {
4569 	struct pci_dev *p = NULL;
4570 
4571 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4572 			adev->pdev->bus->number, 1);
4573 	if (p) {
4574 		pm_runtime_enable(&(p->dev));
4575 		pm_runtime_resume(&(p->dev));
4576 	}
4577 }
4578 
4579 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4580 {
4581 	enum amd_reset_method reset_method;
4582 	struct pci_dev *p = NULL;
4583 	u64 expires;
4584 
4585 	/*
4586 	 * For now, only BACO and mode1 reset are confirmed
4587 	 * to suffer the audio issue without proper suspended.
4588 	 */
4589 	reset_method = amdgpu_asic_reset_method(adev);
4590 	if ((reset_method != AMD_RESET_METHOD_BACO) &&
4591 	     (reset_method != AMD_RESET_METHOD_MODE1))
4592 		return -EINVAL;
4593 
4594 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4595 			adev->pdev->bus->number, 1);
4596 	if (!p)
4597 		return -ENODEV;
4598 
4599 	expires = pm_runtime_autosuspend_expiration(&(p->dev));
4600 	if (!expires)
4601 		/*
4602 		 * If we cannot get the audio device autosuspend delay,
4603 		 * a fixed 4S interval will be used. Considering 3S is
4604 		 * the audio controller default autosuspend delay setting.
4605 		 * 4S used here is guaranteed to cover that.
4606 		 */
4607 		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4608 
4609 	while (!pm_runtime_status_suspended(&(p->dev))) {
4610 		if (!pm_runtime_suspend(&(p->dev)))
4611 			break;
4612 
4613 		if (expires < ktime_get_mono_fast_ns()) {
4614 			dev_warn(adev->dev, "failed to suspend display audio\n");
4615 			/* TODO: abort the succeeding gpu reset? */
4616 			return -ETIMEDOUT;
4617 		}
4618 	}
4619 
4620 	pm_runtime_disable(&(p->dev));
4621 
4622 	return 0;
4623 }
4624 
4625 void amdgpu_device_recheck_guilty_jobs(
4626 	struct amdgpu_device *adev, struct list_head *device_list_handle,
4627 	struct amdgpu_reset_context *reset_context)
4628 {
4629 	int i, r = 0;
4630 
4631 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4632 		struct amdgpu_ring *ring = adev->rings[i];
4633 		int ret = 0;
4634 		struct drm_sched_job *s_job;
4635 
4636 		if (!ring || !ring->sched.thread)
4637 			continue;
4638 
4639 		s_job = list_first_entry_or_null(&ring->sched.pending_list,
4640 				struct drm_sched_job, list);
4641 		if (s_job == NULL)
4642 			continue;
4643 
4644 		/* clear job's guilty and depend the folowing step to decide the real one */
4645 		drm_sched_reset_karma(s_job);
4646 		drm_sched_resubmit_jobs_ext(&ring->sched, 1);
4647 
4648 		ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
4649 		if (ret == 0) { /* timeout */
4650 			DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
4651 						ring->sched.name, s_job->id);
4652 
4653 			/* set guilty */
4654 			drm_sched_increase_karma(s_job);
4655 retry:
4656 			/* do hw reset */
4657 			if (amdgpu_sriov_vf(adev)) {
4658 				amdgpu_virt_fini_data_exchange(adev);
4659 				r = amdgpu_device_reset_sriov(adev, false);
4660 				if (r)
4661 					adev->asic_reset_res = r;
4662 			} else {
4663 				clear_bit(AMDGPU_SKIP_HW_RESET,
4664 					  &reset_context->flags);
4665 				r = amdgpu_do_asic_reset(device_list_handle,
4666 							 reset_context);
4667 				if (r && r == -EAGAIN)
4668 					goto retry;
4669 			}
4670 
4671 			/*
4672 			 * add reset counter so that the following
4673 			 * resubmitted job could flush vmid
4674 			 */
4675 			atomic_inc(&adev->gpu_reset_counter);
4676 			continue;
4677 		}
4678 
4679 		/* got the hw fence, signal finished fence */
4680 		atomic_dec(ring->sched.score);
4681 		dma_fence_get(&s_job->s_fence->finished);
4682 		dma_fence_signal(&s_job->s_fence->finished);
4683 		dma_fence_put(&s_job->s_fence->finished);
4684 
4685 		/* remove node from list and free the job */
4686 		spin_lock(&ring->sched.job_list_lock);
4687 		list_del_init(&s_job->list);
4688 		spin_unlock(&ring->sched.job_list_lock);
4689 		ring->sched.ops->free_job(s_job);
4690 	}
4691 }
4692 
4693 /**
4694  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4695  *
4696  * @adev: amdgpu_device pointer
4697  * @job: which job trigger hang
4698  *
4699  * Attempt to reset the GPU if it has hung (all asics).
4700  * Attempt to do soft-reset or full-reset and reinitialize Asic
4701  * Returns 0 for success or an error on failure.
4702  */
4703 
4704 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4705 			      struct amdgpu_job *job)
4706 {
4707 	struct list_head device_list, *device_list_handle =  NULL;
4708 	bool job_signaled = false;
4709 	struct amdgpu_hive_info *hive = NULL;
4710 	struct amdgpu_device *tmp_adev = NULL;
4711 	int i, r = 0;
4712 	bool need_emergency_restart = false;
4713 	bool audio_suspended = false;
4714 	int tmp_vram_lost_counter;
4715 	struct amdgpu_reset_context reset_context;
4716 
4717 	memset(&reset_context, 0, sizeof(reset_context));
4718 
4719 	/*
4720 	 * Special case: RAS triggered and full reset isn't supported
4721 	 */
4722 	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4723 
4724 	/*
4725 	 * Flush RAM to disk so that after reboot
4726 	 * the user can read log and see why the system rebooted.
4727 	 */
4728 	if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4729 		DRM_WARN("Emergency reboot.");
4730 
4731 		ksys_sync_helper();
4732 		emergency_restart();
4733 	}
4734 
4735 	dev_info(adev->dev, "GPU %s begin!\n",
4736 		need_emergency_restart ? "jobs stop":"reset");
4737 
4738 	/*
4739 	 * Here we trylock to avoid chain of resets executing from
4740 	 * either trigger by jobs on different adevs in XGMI hive or jobs on
4741 	 * different schedulers for same device while this TO handler is running.
4742 	 * We always reset all schedulers for device and all devices for XGMI
4743 	 * hive so that should take care of them too.
4744 	 */
4745 	hive = amdgpu_get_xgmi_hive(adev);
4746 	if (hive) {
4747 		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4748 			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4749 				job ? job->base.id : -1, hive->hive_id);
4750 			amdgpu_put_xgmi_hive(hive);
4751 			if (job)
4752 				drm_sched_increase_karma(&job->base);
4753 			return 0;
4754 		}
4755 		mutex_lock(&hive->hive_lock);
4756 	}
4757 
4758 	reset_context.method = AMD_RESET_METHOD_NONE;
4759 	reset_context.reset_req_dev = adev;
4760 	reset_context.job = job;
4761 	reset_context.hive = hive;
4762 	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
4763 
4764 	/*
4765 	 * lock the device before we try to operate the linked list
4766 	 * if didn't get the device lock, don't touch the linked list since
4767 	 * others may iterating it.
4768 	 */
4769 	r = amdgpu_device_lock_hive_adev(adev, hive);
4770 	if (r) {
4771 		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4772 					job ? job->base.id : -1);
4773 
4774 		/* even we skipped this reset, still need to set the job to guilty */
4775 		if (job)
4776 			drm_sched_increase_karma(&job->base);
4777 		goto skip_recovery;
4778 	}
4779 
4780 	/*
4781 	 * Build list of devices to reset.
4782 	 * In case we are in XGMI hive mode, resort the device list
4783 	 * to put adev in the 1st position.
4784 	 */
4785 	INIT_LIST_HEAD(&device_list);
4786 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
4787 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
4788 			list_add_tail(&tmp_adev->reset_list, &device_list);
4789 		if (!list_is_first(&adev->reset_list, &device_list))
4790 			list_rotate_to_front(&adev->reset_list, &device_list);
4791 		device_list_handle = &device_list;
4792 	} else {
4793 		list_add_tail(&adev->reset_list, &device_list);
4794 		device_list_handle = &device_list;
4795 	}
4796 
4797 	/* block all schedulers and reset given job's ring */
4798 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4799 		/*
4800 		 * Try to put the audio codec into suspend state
4801 		 * before gpu reset started.
4802 		 *
4803 		 * Due to the power domain of the graphics device
4804 		 * is shared with AZ power domain. Without this,
4805 		 * we may change the audio hardware from behind
4806 		 * the audio driver's back. That will trigger
4807 		 * some audio codec errors.
4808 		 */
4809 		if (!amdgpu_device_suspend_display_audio(tmp_adev))
4810 			audio_suspended = true;
4811 
4812 		amdgpu_ras_set_error_query_ready(tmp_adev, false);
4813 
4814 		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4815 
4816 		if (!amdgpu_sriov_vf(tmp_adev))
4817 			amdgpu_amdkfd_pre_reset(tmp_adev);
4818 
4819 		/*
4820 		 * Mark these ASICs to be reseted as untracked first
4821 		 * And add them back after reset completed
4822 		 */
4823 		amdgpu_unregister_gpu_instance(tmp_adev);
4824 
4825 		amdgpu_fbdev_set_suspend(tmp_adev, 1);
4826 
4827 		/* disable ras on ALL IPs */
4828 		if (!need_emergency_restart &&
4829 		      amdgpu_device_ip_need_full_reset(tmp_adev))
4830 			amdgpu_ras_suspend(tmp_adev);
4831 
4832 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4833 			struct amdgpu_ring *ring = tmp_adev->rings[i];
4834 
4835 			if (!ring || !ring->sched.thread)
4836 				continue;
4837 
4838 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4839 
4840 			if (need_emergency_restart)
4841 				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4842 		}
4843 		atomic_inc(&tmp_adev->gpu_reset_counter);
4844 	}
4845 
4846 	if (need_emergency_restart)
4847 		goto skip_sched_resume;
4848 
4849 	/*
4850 	 * Must check guilty signal here since after this point all old
4851 	 * HW fences are force signaled.
4852 	 *
4853 	 * job->base holds a reference to parent fence
4854 	 */
4855 	if (job && job->base.s_fence->parent &&
4856 	    dma_fence_is_signaled(job->base.s_fence->parent)) {
4857 		job_signaled = true;
4858 		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4859 		goto skip_hw_reset;
4860 	}
4861 
4862 retry:	/* Rest of adevs pre asic reset from XGMI hive. */
4863 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4864 		r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);
4865 		/*TODO Should we stop ?*/
4866 		if (r) {
4867 			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4868 				  r, adev_to_drm(tmp_adev)->unique);
4869 			tmp_adev->asic_reset_res = r;
4870 		}
4871 	}
4872 
4873 	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
4874 	/* Actual ASIC resets if needed.*/
4875 	/* TODO Implement XGMI hive reset logic for SRIOV */
4876 	if (amdgpu_sriov_vf(adev)) {
4877 		r = amdgpu_device_reset_sriov(adev, job ? false : true);
4878 		if (r)
4879 			adev->asic_reset_res = r;
4880 	} else {
4881 		r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
4882 		if (r && r == -EAGAIN)
4883 			goto retry;
4884 	}
4885 
4886 skip_hw_reset:
4887 
4888 	/* Post ASIC reset for all devs .*/
4889 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4890 
4891 		/*
4892 		 * Sometimes a later bad compute job can block a good gfx job as gfx
4893 		 * and compute ring share internal GC HW mutually. We add an additional
4894 		 * guilty jobs recheck step to find the real guilty job, it synchronously
4895 		 * submits and pends for the first job being signaled. If it gets timeout,
4896 		 * we identify it as a real guilty job.
4897 		 */
4898 		if (amdgpu_gpu_recovery == 2 &&
4899 			!(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
4900 			amdgpu_device_recheck_guilty_jobs(
4901 				tmp_adev, device_list_handle, &reset_context);
4902 
4903 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4904 			struct amdgpu_ring *ring = tmp_adev->rings[i];
4905 
4906 			if (!ring || !ring->sched.thread)
4907 				continue;
4908 
4909 			/* No point to resubmit jobs if we didn't HW reset*/
4910 			if (!tmp_adev->asic_reset_res && !job_signaled)
4911 				drm_sched_resubmit_jobs(&ring->sched);
4912 
4913 			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4914 		}
4915 
4916 		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4917 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4918 		}
4919 
4920 		tmp_adev->asic_reset_res = 0;
4921 
4922 		if (r) {
4923 			/* bad news, how to tell it to userspace ? */
4924 			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4925 			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4926 		} else {
4927 			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4928 		}
4929 	}
4930 
4931 skip_sched_resume:
4932 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4933 		/* unlock kfd: SRIOV would do it separately */
4934 		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4935 	                amdgpu_amdkfd_post_reset(tmp_adev);
4936 
4937 		/* kfd_post_reset will do nothing if kfd device is not initialized,
4938 		 * need to bring up kfd here if it's not be initialized before
4939 		 */
4940 		if (!adev->kfd.init_complete)
4941 			amdgpu_amdkfd_device_init(adev);
4942 
4943 		if (audio_suspended)
4944 			amdgpu_device_resume_display_audio(tmp_adev);
4945 		amdgpu_device_unlock_adev(tmp_adev);
4946 	}
4947 
4948 skip_recovery:
4949 	if (hive) {
4950 		atomic_set(&hive->in_reset, 0);
4951 		mutex_unlock(&hive->hive_lock);
4952 		amdgpu_put_xgmi_hive(hive);
4953 	}
4954 
4955 	if (r && r != -EAGAIN)
4956 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4957 	return r;
4958 }
4959 
4960 /**
4961  * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
4962  *
4963  * @adev: amdgpu_device pointer
4964  *
4965  * Fetchs and stores in the driver the PCIE capabilities (gen speed
4966  * and lanes) of the slot the device is in. Handles APUs and
4967  * virtualized environments where PCIE config space may not be available.
4968  */
4969 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4970 {
4971 	struct pci_dev *pdev;
4972 	enum pci_bus_speed speed_cap, platform_speed_cap;
4973 	enum pcie_link_width platform_link_width;
4974 
4975 	if (amdgpu_pcie_gen_cap)
4976 		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4977 
4978 	if (amdgpu_pcie_lane_cap)
4979 		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4980 
4981 	/* covers APUs as well */
4982 	if (pci_is_root_bus(adev->pdev->bus)) {
4983 		if (adev->pm.pcie_gen_mask == 0)
4984 			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4985 		if (adev->pm.pcie_mlw_mask == 0)
4986 			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4987 		return;
4988 	}
4989 
4990 	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4991 		return;
4992 
4993 	pcie_bandwidth_available(adev->pdev, NULL,
4994 				 &platform_speed_cap, &platform_link_width);
4995 
4996 	if (adev->pm.pcie_gen_mask == 0) {
4997 		/* asic caps */
4998 		pdev = adev->pdev;
4999 		speed_cap = pcie_get_speed_cap(pdev);
5000 		if (speed_cap == PCI_SPEED_UNKNOWN) {
5001 			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5002 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5003 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5004 		} else {
5005 			if (speed_cap == PCIE_SPEED_32_0GT)
5006 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5007 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5008 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5009 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5010 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5011 			else if (speed_cap == PCIE_SPEED_16_0GT)
5012 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5013 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5014 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5015 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5016 			else if (speed_cap == PCIE_SPEED_8_0GT)
5017 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5018 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5019 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5020 			else if (speed_cap == PCIE_SPEED_5_0GT)
5021 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5022 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5023 			else
5024 				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5025 		}
5026 		/* platform caps */
5027 		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5028 			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5029 						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5030 		} else {
5031 			if (platform_speed_cap == PCIE_SPEED_32_0GT)
5032 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5033 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5034 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5035 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5036 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5037 			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5038 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5039 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5040 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5041 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5042 			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5043 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5044 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5045 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5046 			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5047 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5048 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5049 			else
5050 				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5051 
5052 		}
5053 	}
5054 	if (adev->pm.pcie_mlw_mask == 0) {
5055 		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5056 			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5057 		} else {
5058 			switch (platform_link_width) {
5059 			case PCIE_LNK_X32:
5060 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5061 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5062 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5063 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5064 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5065 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5066 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5067 				break;
5068 			case PCIE_LNK_X16:
5069 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5070 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5071 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5072 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5073 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5074 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5075 				break;
5076 			case PCIE_LNK_X12:
5077 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5078 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5079 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5080 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5081 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5082 				break;
5083 			case PCIE_LNK_X8:
5084 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5085 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5086 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5087 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5088 				break;
5089 			case PCIE_LNK_X4:
5090 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5091 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5092 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5093 				break;
5094 			case PCIE_LNK_X2:
5095 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5096 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5097 				break;
5098 			case PCIE_LNK_X1:
5099 				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5100 				break;
5101 			default:
5102 				break;
5103 			}
5104 		}
5105 	}
5106 }
5107 
5108 int amdgpu_device_baco_enter(struct drm_device *dev)
5109 {
5110 	struct amdgpu_device *adev = drm_to_adev(dev);
5111 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5112 
5113 	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5114 		return -ENOTSUPP;
5115 
5116 	if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
5117 		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5118 
5119 	return amdgpu_dpm_baco_enter(adev);
5120 }
5121 
5122 int amdgpu_device_baco_exit(struct drm_device *dev)
5123 {
5124 	struct amdgpu_device *adev = drm_to_adev(dev);
5125 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5126 	int ret = 0;
5127 
5128 	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5129 		return -ENOTSUPP;
5130 
5131 	ret = amdgpu_dpm_baco_exit(adev);
5132 	if (ret)
5133 		return ret;
5134 
5135 	if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
5136 		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5137 
5138 	return 0;
5139 }
5140 
5141 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
5142 {
5143 	int i;
5144 
5145 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5146 		struct amdgpu_ring *ring = adev->rings[i];
5147 
5148 		if (!ring || !ring->sched.thread)
5149 			continue;
5150 
5151 		cancel_delayed_work_sync(&ring->sched.work_tdr);
5152 	}
5153 }
5154 
5155 /**
5156  * amdgpu_pci_error_detected - Called when a PCI error is detected.
5157  * @pdev: PCI device struct
5158  * @state: PCI channel state
5159  *
5160  * Description: Called when a PCI error is detected.
5161  *
5162  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5163  */
5164 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5165 {
5166 	struct drm_device *dev = pci_get_drvdata(pdev);
5167 	struct amdgpu_device *adev = drm_to_adev(dev);
5168 	int i;
5169 
5170 	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5171 
5172 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
5173 		DRM_WARN("No support for XGMI hive yet...");
5174 		return PCI_ERS_RESULT_DISCONNECT;
5175 	}
5176 
5177 	switch (state) {
5178 	case pci_channel_io_normal:
5179 		return PCI_ERS_RESULT_CAN_RECOVER;
5180 	/* Fatal error, prepare for slot reset */
5181 	case pci_channel_io_frozen:
5182 		/*
5183 		 * Cancel and wait for all TDRs in progress if failing to
5184 		 * set  adev->in_gpu_reset in amdgpu_device_lock_adev
5185 		 *
5186 		 * Locking adev->reset_sem will prevent any external access
5187 		 * to GPU during PCI error recovery
5188 		 */
5189 		while (!amdgpu_device_lock_adev(adev, NULL))
5190 			amdgpu_cancel_all_tdr(adev);
5191 
5192 		/*
5193 		 * Block any work scheduling as we do for regular GPU reset
5194 		 * for the duration of the recovery
5195 		 */
5196 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5197 			struct amdgpu_ring *ring = adev->rings[i];
5198 
5199 			if (!ring || !ring->sched.thread)
5200 				continue;
5201 
5202 			drm_sched_stop(&ring->sched, NULL);
5203 		}
5204 		atomic_inc(&adev->gpu_reset_counter);
5205 		return PCI_ERS_RESULT_NEED_RESET;
5206 	case pci_channel_io_perm_failure:
5207 		/* Permanent error, prepare for device removal */
5208 		return PCI_ERS_RESULT_DISCONNECT;
5209 	}
5210 
5211 	return PCI_ERS_RESULT_NEED_RESET;
5212 }
5213 
5214 /**
5215  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5216  * @pdev: pointer to PCI device
5217  */
5218 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5219 {
5220 
5221 	DRM_INFO("PCI error: mmio enabled callback!!\n");
5222 
5223 	/* TODO - dump whatever for debugging purposes */
5224 
5225 	/* This called only if amdgpu_pci_error_detected returns
5226 	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5227 	 * works, no need to reset slot.
5228 	 */
5229 
5230 	return PCI_ERS_RESULT_RECOVERED;
5231 }
5232 
5233 /**
5234  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5235  * @pdev: PCI device struct
5236  *
5237  * Description: This routine is called by the pci error recovery
5238  * code after the PCI slot has been reset, just before we
5239  * should resume normal operations.
5240  */
5241 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5242 {
5243 	struct drm_device *dev = pci_get_drvdata(pdev);
5244 	struct amdgpu_device *adev = drm_to_adev(dev);
5245 	int r, i;
5246 	struct amdgpu_reset_context reset_context;
5247 	u32 memsize;
5248 	struct list_head device_list;
5249 
5250 	DRM_INFO("PCI error: slot reset callback!!\n");
5251 
5252 	memset(&reset_context, 0, sizeof(reset_context));
5253 
5254 	INIT_LIST_HEAD(&device_list);
5255 	list_add_tail(&adev->reset_list, &device_list);
5256 
5257 	/* wait for asic to come out of reset */
5258 	msleep(500);
5259 
5260 	/* Restore PCI confspace */
5261 	amdgpu_device_load_pci_state(pdev);
5262 
5263 	/* confirm  ASIC came out of reset */
5264 	for (i = 0; i < adev->usec_timeout; i++) {
5265 		memsize = amdgpu_asic_get_config_memsize(adev);
5266 
5267 		if (memsize != 0xffffffff)
5268 			break;
5269 		udelay(1);
5270 	}
5271 	if (memsize == 0xffffffff) {
5272 		r = -ETIME;
5273 		goto out;
5274 	}
5275 
5276 	reset_context.method = AMD_RESET_METHOD_NONE;
5277 	reset_context.reset_req_dev = adev;
5278 	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5279 	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5280 
5281 	adev->in_pci_err_recovery = true;
5282 	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5283 	adev->in_pci_err_recovery = false;
5284 	if (r)
5285 		goto out;
5286 
5287 	r = amdgpu_do_asic_reset(&device_list, &reset_context);
5288 
5289 out:
5290 	if (!r) {
5291 		if (amdgpu_device_cache_pci_state(adev->pdev))
5292 			pci_restore_state(adev->pdev);
5293 
5294 		DRM_INFO("PCIe error recovery succeeded\n");
5295 	} else {
5296 		DRM_ERROR("PCIe error recovery failed, err:%d", r);
5297 		amdgpu_device_unlock_adev(adev);
5298 	}
5299 
5300 	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5301 }
5302 
5303 /**
5304  * amdgpu_pci_resume() - resume normal ops after PCI reset
5305  * @pdev: pointer to PCI device
5306  *
5307  * Called when the error recovery driver tells us that its
5308  * OK to resume normal operation.
5309  */
5310 void amdgpu_pci_resume(struct pci_dev *pdev)
5311 {
5312 	struct drm_device *dev = pci_get_drvdata(pdev);
5313 	struct amdgpu_device *adev = drm_to_adev(dev);
5314 	int i;
5315 
5316 
5317 	DRM_INFO("PCI error: resume callback!!\n");
5318 
5319 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5320 		struct amdgpu_ring *ring = adev->rings[i];
5321 
5322 		if (!ring || !ring->sched.thread)
5323 			continue;
5324 
5325 
5326 		drm_sched_resubmit_jobs(&ring->sched);
5327 		drm_sched_start(&ring->sched, true);
5328 	}
5329 
5330 	amdgpu_device_unlock_adev(adev);
5331 }
5332 
5333 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5334 {
5335 	struct drm_device *dev = pci_get_drvdata(pdev);
5336 	struct amdgpu_device *adev = drm_to_adev(dev);
5337 	int r;
5338 
5339 	r = pci_save_state(pdev);
5340 	if (!r) {
5341 		kfree(adev->pci_state);
5342 
5343 		adev->pci_state = pci_store_saved_state(pdev);
5344 
5345 		if (!adev->pci_state) {
5346 			DRM_ERROR("Failed to store PCI saved state");
5347 			return false;
5348 		}
5349 	} else {
5350 		DRM_WARN("Failed to save PCI state, err:%d\n", r);
5351 		return false;
5352 	}
5353 
5354 	return true;
5355 }
5356 
5357 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5358 {
5359 	struct drm_device *dev = pci_get_drvdata(pdev);
5360 	struct amdgpu_device *adev = drm_to_adev(dev);
5361 	int r;
5362 
5363 	if (!adev->pci_state)
5364 		return false;
5365 
5366 	r = pci_load_saved_state(pdev, adev->pci_state);
5367 
5368 	if (!r) {
5369 		pci_restore_state(pdev);
5370 	} else {
5371 		DRM_WARN("Failed to load PCI state, err:%d\n", r);
5372 		return false;
5373 	}
5374 
5375 	return true;
5376 }
5377 
5378 
5379