xref: /linux/drivers/gpu/drm/xe/xe_hw_error.c (revision 0a2a873d615a39e8a87d3f15285ed888341ddce8)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2025 Intel Corporation
4  */
5 
6 #include "regs/xe_hw_error_regs.h"
7 #include "regs/xe_irq_regs.h"
8 
9 #include "xe_device.h"
10 #include "xe_hw_error.h"
11 #include "xe_mmio.h"
12 
/*
 * Error categories reported by hardware.
 *
 * The numeric value of each category is passed as the argument to the
 * DEV_ERR_STAT_REG() and ERROR_IRQ() macros in this file, so the values are
 * spelled out explicitly and must not be reordered.
 */
enum hardware_error {
	HARDWARE_ERROR_CORRECTABLE = 0,	/* correctable error class */
	HARDWARE_ERROR_NONFATAL = 1,	/* non-fatal error class */
	HARDWARE_ERROR_FATAL = 2,	/* fatal error class */
	HARDWARE_ERROR_MAX,		/* number of classes; loop bound only */
};
20 
21 static const char *hw_error_to_str(const enum hardware_error hw_err)
22 {
23 	switch (hw_err) {
24 	case HARDWARE_ERROR_CORRECTABLE:
25 		return "CORRECTABLE";
26 	case HARDWARE_ERROR_NONFATAL:
27 		return "NONFATAL";
28 	case HARDWARE_ERROR_FATAL:
29 		return "FATAL";
30 	default:
31 		return "UNKNOWN";
32 	}
33 }
34 
35 static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
36 {
37 	const char *hw_err_str = hw_error_to_str(hw_err);
38 	struct xe_device *xe = tile_to_xe(tile);
39 	unsigned long flags;
40 	u32 err_src;
41 
42 	if (xe->info.platform != XE_BATTLEMAGE)
43 		return;
44 
45 	spin_lock_irqsave(&xe->irq.lock, flags);
46 	err_src = xe_mmio_read32(&tile->mmio, DEV_ERR_STAT_REG(hw_err));
47 	if (!err_src) {
48 		drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported DEV_ERR_STAT_%s blank!\n",
49 				    tile->id, hw_err_str);
50 		goto unlock;
51 	}
52 
53 	/* TODO: Process errrors per source */
54 
55 	xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(hw_err), err_src);
56 
57 unlock:
58 	spin_unlock_irqrestore(&xe->irq.lock, flags);
59 }
60 
61 /**
62  * xe_hw_error_irq_handler - irq handling for hw errors
63  * @tile: tile instance
64  * @master_ctl: value read from master interrupt register
65  *
66  * Xe platforms add three error bits to the master interrupt register to support error handling.
67  * These three bits are used to convey the class of error FATAL, NONFATAL, or CORRECTABLE.
68  * To process the interrupt, determine the source of error by reading the Device Error Source
69  * Register that corresponds to the class of error being serviced.
70  */
71 void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
72 {
73 	enum hardware_error hw_err;
74 
75 	for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++)
76 		if (master_ctl & ERROR_IRQ(hw_err))
77 			hw_error_source_handler(tile, hw_err);
78 }
79 
80 /*
81  * Process hardware errors during boot
82  */
83 static void process_hw_errors(struct xe_device *xe)
84 {
85 	struct xe_tile *tile;
86 	u32 master_ctl;
87 	u8 id;
88 
89 	for_each_tile(tile, xe, id) {
90 		master_ctl = xe_mmio_read32(&tile->mmio, GFX_MSTR_IRQ);
91 		xe_hw_error_irq_handler(tile, master_ctl);
92 		xe_mmio_write32(&tile->mmio, GFX_MSTR_IRQ, master_ctl);
93 	}
94 }
95 
/**
 * xe_hw_error_init - Initialize hw errors
 * @xe: xe device instance
 *
 * Check for, and process, errors that occurred during boot before the driver
 * was loaded. Nothing is done unless the device is a discrete GPU that is not
 * running as an SR-IOV VF.
 */
void xe_hw_error_init(struct xe_device *xe)
{
	if (IS_DGFX(xe) && !IS_SRIOV_VF(xe))
		process_hw_errors(xe);
}
110