xref: /linux/drivers/gpu/drm/xe/xe_survivability_mode.c (revision fb7399cf2d0b33825b8039f95c45395c7deba25c)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2025 Intel Corporation
4  */
5 
6 #include "xe_survivability_mode.h"
7 #include "xe_survivability_mode_types.h"
8 
9 #include <linux/kobject.h>
10 #include <linux/pci.h>
11 #include <linux/sysfs.h>
12 
13 #include "xe_configfs.h"
14 #include "xe_device.h"
15 #include "xe_gt.h"
16 #include "xe_heci_gsc.h"
17 #include "xe_mmio.h"
18 #include "xe_pcode_api.h"
19 #include "xe_vsec.h"
20 
/* Number of entries in the survivability info table, which is indexed by scratch register id */
#define MAX_SCRATCH_MMIO 8
22 
23 /**
24  * DOC: Xe Boot Survivability
25  *
 * Boot Survivability is a software based workflow for recovering a system in a failed boot state.
27  * Here system recoverability is concerned with recovering the firmware responsible for boot.
28  *
29  * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware
30  * to be flashed through mei and collect telemetry. The driver's probe flow is modified
31  * such that it enters survivability mode when pcode initialization is incomplete and boot status
32  * denotes a failure.
33  *
34  * Survivability mode can also be entered manually using the survivability mode attribute available
35  * through configfs which is beneficial in several usecases. It can be used to address scenarios
36  * where pcode does not detect failure or for validation purposes. It can also be used in
37  * In-Field-Repair (IFR) to repair a single card without impacting the other cards in a node.
38  *
 * Use the below command to enable survivability mode manually::
40  *
41  *	# echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode
42  *
43  * Refer :ref:`xe_configfs` for more details on how to use configfs
44  *
45  * Survivability mode is indicated by the below admin-only readable sysfs which provides additional
46  * debug information::
47  *
 *	/sys/bus/pci/devices/<device>/survivability_mode
49  *
50  * Capability Information:
51  *	Provides boot status
52  * Postcode Information:
53  *	Provides information about the failure
 * Overflow Information:
 *	Provides history of previous failures
 * Auxiliary Information:
57  *	Certain failures may have information in addition to postcode information
58  */
59 
60 static u32 aux_history_offset(u32 reg_value)
61 {
62 	return REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, reg_value);
63 }
64 
65 static void set_survivability_info(struct xe_mmio *mmio, struct xe_survivability_info *info,
66 				   int id, char *name)
67 {
68 	strscpy(info[id].name, name, sizeof(info[id].name));
69 	info[id].reg = PCODE_SCRATCH(id).raw;
70 	info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH(id));
71 }
72 
73 static void populate_survivability_info(struct xe_device *xe)
74 {
75 	struct xe_survivability *survivability = &xe->survivability;
76 	struct xe_survivability_info *info = survivability->info;
77 	struct xe_mmio *mmio;
78 	u32 id = 0, reg_value;
79 	char name[NAME_MAX];
80 	int index;
81 
82 	mmio = xe_root_tile_mmio(xe);
83 	set_survivability_info(mmio, info, id, "Capability Info");
84 	reg_value = info[id].value;
85 
86 	if (reg_value & HISTORY_TRACKING) {
87 		id++;
88 		set_survivability_info(mmio, info, id, "Postcode Info");
89 
90 		if (reg_value & OVERFLOW_SUPPORT) {
91 			id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, reg_value);
92 			set_survivability_info(mmio, info, id, "Overflow Info");
93 		}
94 	}
95 
96 	if (reg_value & AUXINFO_SUPPORT) {
97 		id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value);
98 
99 		for (index = 0; id && reg_value; index++, reg_value = info[id].value,
100 		     id = aux_history_offset(reg_value)) {
101 			snprintf(name, NAME_MAX, "Auxiliary Info %d", index);
102 			set_survivability_info(mmio, info, id, name);
103 		}
104 	}
105 }
106 
107 static void log_survivability_info(struct pci_dev *pdev)
108 {
109 	struct xe_device *xe = pdev_to_xe_device(pdev);
110 	struct xe_survivability *survivability = &xe->survivability;
111 	struct xe_survivability_info *info = survivability->info;
112 	int id;
113 
114 	dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n",
115 		 survivability->boot_status);
116 	for (id = 0; id < MAX_SCRATCH_MMIO; id++) {
117 		if (info[id].reg)
118 			dev_info(&pdev->dev, "%s: 0x%x - 0x%x\n", info[id].name,
119 				 info[id].reg, info[id].value);
120 	}
121 }
122 
123 static ssize_t survivability_mode_show(struct device *dev,
124 				       struct device_attribute *attr, char *buff)
125 {
126 	struct pci_dev *pdev = to_pci_dev(dev);
127 	struct xe_device *xe = pdev_to_xe_device(pdev);
128 	struct xe_survivability *survivability = &xe->survivability;
129 	struct xe_survivability_info *info = survivability->info;
130 	int index = 0, count = 0;
131 
132 	for (index = 0; index < MAX_SCRATCH_MMIO; index++) {
133 		if (info[index].reg)
134 			count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name,
135 					       info[index].reg, info[index].value);
136 	}
137 
138 	return count;
139 }
140 
141 static DEVICE_ATTR_ADMIN_RO(survivability_mode);
142 
143 static void xe_survivability_mode_fini(void *arg)
144 {
145 	struct xe_device *xe = arg;
146 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
147 	struct device *dev = &pdev->dev;
148 
149 	xe_configfs_clear_survivability_mode(pdev);
150 	sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
151 }
152 
153 static int enable_survivability_mode(struct pci_dev *pdev)
154 {
155 	struct device *dev = &pdev->dev;
156 	struct xe_device *xe = pdev_to_xe_device(pdev);
157 	struct xe_survivability *survivability = &xe->survivability;
158 	int ret = 0;
159 
160 	/* create survivability mode sysfs */
161 	ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr);
162 	if (ret) {
163 		dev_warn(dev, "Failed to create survivability sysfs files\n");
164 		return ret;
165 	}
166 
167 	ret = devm_add_action_or_reset(xe->drm.dev,
168 				       xe_survivability_mode_fini, xe);
169 	if (ret)
170 		return ret;
171 
172 	/* Make sure xe_heci_gsc_init() knows about survivability mode */
173 	survivability->mode = true;
174 
175 	ret = xe_heci_gsc_init(xe);
176 	if (ret) {
177 		/*
178 		 * But if it fails, device can't enter survivability
179 		 * so move it back for correct error handling
180 		 */
181 		survivability->mode = false;
182 		return ret;
183 	}
184 
185 	xe_vsec_init(xe);
186 
187 	dev_err(dev, "In Survivability Mode\n");
188 
189 	return 0;
190 }
191 
192 /**
193  * xe_survivability_mode_is_enabled - check if survivability mode is enabled
194  * @xe: xe device instance
195  *
196  * Returns true if in survivability mode, false otherwise
197  */
198 bool xe_survivability_mode_is_enabled(struct xe_device *xe)
199 {
200 	return xe->survivability.mode;
201 }
202 
203 /**
204  * xe_survivability_mode_is_requested - check if it's possible to enable survivability
205  *					mode that was requested by firmware or userspace
206  * @xe: xe device instance
207  *
208  * This function reads configfs and  boot status from Pcode.
209  *
210  * Return: true if platform support is available and boot status indicates
211  * failure or if survivability mode is requested, false otherwise.
212  */
213 bool xe_survivability_mode_is_requested(struct xe_device *xe)
214 {
215 	struct xe_survivability *survivability = &xe->survivability;
216 	struct xe_mmio *mmio = xe_root_tile_mmio(xe);
217 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
218 	u32 data;
219 	bool survivability_mode;
220 
221 	if (!IS_DGFX(xe) || IS_SRIOV_VF(xe))
222 		return false;
223 
224 	survivability_mode = xe_configfs_get_survivability_mode(pdev);
225 
226 	if (xe->info.platform < XE_BATTLEMAGE) {
227 		if (survivability_mode) {
228 			dev_err(&pdev->dev, "Survivability Mode is not supported on this card\n");
229 			xe_configfs_clear_survivability_mode(pdev);
230 		}
231 		return false;
232 	}
233 
234 	/* Enable survivability mode if set via configfs */
235 	if (survivability_mode)
236 		return true;
237 
238 	data = xe_mmio_read32(mmio, PCODE_SCRATCH(0));
239 	survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data);
240 
241 	return survivability->boot_status == NON_CRITICAL_FAILURE ||
242 		survivability->boot_status == CRITICAL_FAILURE;
243 }
244 
245 /**
246  * xe_survivability_mode_enable - Initialize and enable the survivability mode
247  * @xe: xe device instance
248  *
249  * Initialize survivability information and enable survivability mode
250  *
251  * Return: 0 if survivability mode is enabled or not requested; negative error
252  * code otherwise.
253  */
254 int xe_survivability_mode_enable(struct xe_device *xe)
255 {
256 	struct xe_survivability *survivability = &xe->survivability;
257 	struct xe_survivability_info *info;
258 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
259 
260 	if (!xe_survivability_mode_is_requested(xe))
261 		return 0;
262 
263 	survivability->size = MAX_SCRATCH_MMIO;
264 
265 	info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info),
266 			    GFP_KERNEL);
267 	if (!info)
268 		return -ENOMEM;
269 
270 	survivability->info = info;
271 
272 	populate_survivability_info(xe);
273 
274 	/* Only log debug information and exit if it is a critical failure */
275 	if (survivability->boot_status == CRITICAL_FAILURE) {
276 		log_survivability_info(pdev);
277 		return -ENXIO;
278 	}
279 
280 	return enable_survivability_mode(pdev);
281 }
282