xref: /linux/drivers/gpu/drm/xe/xe_survivability_mode.c (revision 8a5f956a9fb7d74fff681145082acfad5afa6bb8)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2025 Intel Corporation
4  */
5 
6 #include "xe_survivability_mode.h"
7 #include "xe_survivability_mode_types.h"
8 
9 #include <linux/kobject.h>
10 #include <linux/pci.h>
11 #include <linux/sysfs.h>
12 
13 #include "xe_configfs.h"
14 #include "xe_device.h"
15 #include "xe_gt.h"
16 #include "xe_heci_gsc.h"
17 #include "xe_i2c.h"
18 #include "xe_mmio.h"
19 #include "xe_pcode_api.h"
20 #include "xe_vsec.h"
21 
22 #define MAX_SCRATCH_MMIO 8
23 
24 /**
25  * DOC: Xe Boot Survivability
26  *
27  * Boot Survivability is a software based workflow for recovering a system in a failed boot state.
28  * Here system recoverability is concerned with recovering the firmware responsible for boot.
29  *
30  * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware
31  * to be flashed through mei and collect telemetry. The driver's probe flow is modified
32  * such that it enters survivability mode when pcode initialization is incomplete and boot status
33  * denotes a failure.
34  *
35  * Survivability mode can also be entered manually using the survivability mode attribute available
36  * through configfs which is beneficial in several usecases. It can be used to address scenarios
37  * where pcode does not detect failure or for validation purposes. It can also be used in
38  * In-Field-Repair (IFR) to repair a single card without impacting the other cards in a node.
39  *
40  * Use the below command to enable survivability mode manually::
41  *
42  *	# echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode
43  *
44  * It is the responsibility of the user to clear the mode once firmware flash is complete.
45  *
46  * Refer :ref:`xe_configfs` for more details on how to use configfs
47  *
48  * Survivability mode is indicated by the below admin-only readable sysfs which provides additional
49  * debug information::
50  *
51  *	/sys/bus/pci/devices/<device>/survivability_mode
52  *
53  * Capability Information:
54  *	Provides boot status
55  * Postcode Information:
56  *	Provides information about the failure
57  * Overflow Information:
58  *	Provides history of previous failures
59  * Auxiliary Information:
60  *	Certain failures may have information in addition to postcode information
61  */
62 
63 static u32 aux_history_offset(u32 reg_value)
64 {
65 	return REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, reg_value);
66 }
67 
68 static void set_survivability_info(struct xe_mmio *mmio, struct xe_survivability_info *info,
69 				   int id, char *name)
70 {
71 	strscpy(info[id].name, name, sizeof(info[id].name));
72 	info[id].reg = PCODE_SCRATCH(id).raw;
73 	info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH(id));
74 }
75 
76 static void populate_survivability_info(struct xe_device *xe)
77 {
78 	struct xe_survivability *survivability = &xe->survivability;
79 	struct xe_survivability_info *info = survivability->info;
80 	struct xe_mmio *mmio;
81 	u32 id = 0, reg_value;
82 	char name[NAME_MAX];
83 	int index;
84 
85 	mmio = xe_root_tile_mmio(xe);
86 	set_survivability_info(mmio, info, id, "Capability Info");
87 	reg_value = info[id].value;
88 
89 	if (reg_value & HISTORY_TRACKING) {
90 		id++;
91 		set_survivability_info(mmio, info, id, "Postcode Info");
92 
93 		if (reg_value & OVERFLOW_SUPPORT) {
94 			id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, reg_value);
95 			set_survivability_info(mmio, info, id, "Overflow Info");
96 		}
97 	}
98 
99 	if (reg_value & AUXINFO_SUPPORT) {
100 		id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value);
101 
102 		for (index = 0; id && reg_value; index++, reg_value = info[id].value,
103 		     id = aux_history_offset(reg_value)) {
104 			snprintf(name, NAME_MAX, "Auxiliary Info %d", index);
105 			set_survivability_info(mmio, info, id, name);
106 		}
107 	}
108 }
109 
110 static void log_survivability_info(struct pci_dev *pdev)
111 {
112 	struct xe_device *xe = pdev_to_xe_device(pdev);
113 	struct xe_survivability *survivability = &xe->survivability;
114 	struct xe_survivability_info *info = survivability->info;
115 	int id;
116 
117 	dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n",
118 		 survivability->boot_status);
119 	for (id = 0; id < MAX_SCRATCH_MMIO; id++) {
120 		if (info[id].reg)
121 			dev_info(&pdev->dev, "%s: 0x%x - 0x%x\n", info[id].name,
122 				 info[id].reg, info[id].value);
123 	}
124 }
125 
126 static ssize_t survivability_mode_show(struct device *dev,
127 				       struct device_attribute *attr, char *buff)
128 {
129 	struct pci_dev *pdev = to_pci_dev(dev);
130 	struct xe_device *xe = pdev_to_xe_device(pdev);
131 	struct xe_survivability *survivability = &xe->survivability;
132 	struct xe_survivability_info *info = survivability->info;
133 	int index = 0, count = 0;
134 
135 	for (index = 0; index < MAX_SCRATCH_MMIO; index++) {
136 		if (info[index].reg)
137 			count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name,
138 					       info[index].reg, info[index].value);
139 	}
140 
141 	return count;
142 }
143 
144 static DEVICE_ATTR_ADMIN_RO(survivability_mode);
145 
146 static void xe_survivability_mode_fini(void *arg)
147 {
148 	struct xe_device *xe = arg;
149 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
150 	struct device *dev = &pdev->dev;
151 
152 	sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
153 }
154 
155 static int enable_survivability_mode(struct pci_dev *pdev)
156 {
157 	struct device *dev = &pdev->dev;
158 	struct xe_device *xe = pdev_to_xe_device(pdev);
159 	struct xe_survivability *survivability = &xe->survivability;
160 	int ret = 0;
161 
162 	/* create survivability mode sysfs */
163 	ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr);
164 	if (ret) {
165 		dev_warn(dev, "Failed to create survivability sysfs files\n");
166 		return ret;
167 	}
168 
169 	ret = devm_add_action_or_reset(xe->drm.dev,
170 				       xe_survivability_mode_fini, xe);
171 	if (ret)
172 		return ret;
173 
174 	/* Make sure xe_heci_gsc_init() knows about survivability mode */
175 	survivability->mode = true;
176 
177 	ret = xe_heci_gsc_init(xe);
178 	if (ret)
179 		goto err;
180 
181 	xe_vsec_init(xe);
182 
183 	ret = xe_i2c_probe(xe);
184 	if (ret)
185 		goto err;
186 
187 	dev_err(dev, "In Survivability Mode\n");
188 
189 	return 0;
190 
191 err:
192 	survivability->mode = false;
193 	return ret;
194 }
195 
196 /**
197  * xe_survivability_mode_is_enabled - check if survivability mode is enabled
198  * @xe: xe device instance
199  *
200  * Returns true if in survivability mode, false otherwise
201  */
202 bool xe_survivability_mode_is_enabled(struct xe_device *xe)
203 {
204 	return xe->survivability.mode;
205 }
206 
207 /**
208  * xe_survivability_mode_is_requested - check if it's possible to enable survivability
209  *					mode that was requested by firmware or userspace
210  * @xe: xe device instance
211  *
212  * This function reads configfs and  boot status from Pcode.
213  *
214  * Return: true if platform support is available and boot status indicates
215  * failure or if survivability mode is requested, false otherwise.
216  */
217 bool xe_survivability_mode_is_requested(struct xe_device *xe)
218 {
219 	struct xe_survivability *survivability = &xe->survivability;
220 	struct xe_mmio *mmio = xe_root_tile_mmio(xe);
221 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
222 	u32 data;
223 	bool survivability_mode;
224 
225 	if (!IS_DGFX(xe) || IS_SRIOV_VF(xe))
226 		return false;
227 
228 	survivability_mode = xe_configfs_get_survivability_mode(pdev);
229 
230 	if (xe->info.platform < XE_BATTLEMAGE) {
231 		if (survivability_mode) {
232 			dev_err(&pdev->dev, "Survivability Mode is not supported on this card\n");
233 			xe_configfs_clear_survivability_mode(pdev);
234 		}
235 		return false;
236 	}
237 
238 	/* Enable survivability mode if set via configfs */
239 	if (survivability_mode)
240 		return true;
241 
242 	data = xe_mmio_read32(mmio, PCODE_SCRATCH(0));
243 	survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data);
244 
245 	return survivability->boot_status == NON_CRITICAL_FAILURE ||
246 		survivability->boot_status == CRITICAL_FAILURE;
247 }
248 
249 /**
250  * xe_survivability_mode_enable - Initialize and enable the survivability mode
251  * @xe: xe device instance
252  *
253  * Initialize survivability information and enable survivability mode
254  *
255  * Return: 0 if survivability mode is enabled or not requested; negative error
256  * code otherwise.
257  */
258 int xe_survivability_mode_enable(struct xe_device *xe)
259 {
260 	struct xe_survivability *survivability = &xe->survivability;
261 	struct xe_survivability_info *info;
262 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
263 
264 	if (!xe_survivability_mode_is_requested(xe))
265 		return 0;
266 
267 	survivability->size = MAX_SCRATCH_MMIO;
268 
269 	info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info),
270 			    GFP_KERNEL);
271 	if (!info)
272 		return -ENOMEM;
273 
274 	survivability->info = info;
275 
276 	populate_survivability_info(xe);
277 
278 	/* Only log debug information and exit if it is a critical failure */
279 	if (survivability->boot_status == CRITICAL_FAILURE) {
280 		log_survivability_info(pdev);
281 		return -ENXIO;
282 	}
283 
284 	return enable_survivability_mode(pdev);
285 }
286