xref: /linux/drivers/crypto/intel/qat/qat_common/adf_aer.c (revision 7255fcc80d4b525cc10cfaaf7f485830d4ed2000)
1 // SPDX-License-Identifier: (BSD-3-Clause OR GPL-2.0-only)
2 /* Copyright(c) 2014 - 2020 Intel Corporation */
3 #include <linux/kernel.h>
4 #include <linux/pci.h>
5 #include <linux/completion.h>
6 #include <linux/workqueue.h>
7 #include <linux/delay.h>
8 #include "adf_accel_devices.h"
9 #include "adf_common_drv.h"
10 #include "adf_pfvf_pf_msg.h"
11 
/*
 * Context handed to adf_notify_fatal_error_worker(). It is allocated by
 * adf_notify_fatal_error() and freed by the worker once it has run.
 */
struct adf_fatal_error_data {
	/* Device the fatal error notification is about */
	struct adf_accel_dev *accel_dev;
	/* Embedded work item; the worker recovers this struct via container_of() */
	struct work_struct work;
};
16 
/* Workqueue that runs adf_device_reset_worker(); created in adf_init_aer() */
static struct workqueue_struct *device_reset_wq;
/* Workqueue that runs adf_device_sriov_worker(); created in adf_init_aer() */
static struct workqueue_struct *device_sriov_wq;
19 
20 static pci_ers_result_t adf_error_detected(struct pci_dev *pdev,
21 					   pci_channel_state_t state)
22 {
23 	struct adf_accel_dev *accel_dev = adf_devmgr_pci_to_accel_dev(pdev);
24 
25 	dev_info(&pdev->dev, "Acceleration driver hardware error detected.\n");
26 	if (!accel_dev) {
27 		dev_err(&pdev->dev, "Can't find acceleration device\n");
28 		return PCI_ERS_RESULT_DISCONNECT;
29 	}
30 
31 	if (state == pci_channel_io_perm_failure) {
32 		dev_err(&pdev->dev, "Can't recover from device error\n");
33 		return PCI_ERS_RESULT_DISCONNECT;
34 	}
35 
36 	set_bit(ADF_STATUS_RESTARTING, &accel_dev->status);
37 	if (accel_dev->hw_device->exit_arb) {
38 		dev_dbg(&pdev->dev, "Disabling arbitration\n");
39 		accel_dev->hw_device->exit_arb(accel_dev);
40 	}
41 	adf_error_notifier(accel_dev);
42 	adf_pf2vf_notify_fatal_error(accel_dev);
43 	adf_dev_restarting_notify(accel_dev);
44 	adf_pf2vf_notify_restarting(accel_dev);
45 	adf_pf2vf_wait_for_restarting_complete(accel_dev);
46 	pci_clear_master(pdev);
47 	adf_dev_down(accel_dev, false);
48 
49 	return PCI_ERS_RESULT_NEED_RESET;
50 }
51 
/*
 * Per-request context for a device reset scheduled through
 * adf_dev_aer_schedule_reset() and executed by adf_device_reset_worker().
 */
struct adf_reset_dev_data {
	/* Reset mode requested by the scheduler (ADF_DEV_RESET_SYNC or _ASYNC) */
	int mode;
	/* Device to restart */
	struct adf_accel_dev *accel_dev;
	/* Completion a synchronous requester waits on */
	struct completion compl;
	/* Work item queued on device_reset_wq */
	struct work_struct reset_work;
};
59 
/*
 * Context for the SR-IOV re-enable step executed by
 * adf_device_sriov_worker() on behalf of the reset worker.
 */
struct adf_sriov_dev_data {
	/* Device passed to adf_reenable_sriov() */
	struct adf_accel_dev *accel_dev;
	/* Completed by the SR-IOV worker once adf_reenable_sriov() returned */
	struct completion compl;
	/* Work item queued on device_sriov_wq */
	struct work_struct sriov_work;
};
66 
67 void adf_reset_sbr(struct adf_accel_dev *accel_dev)
68 {
69 	struct pci_dev *pdev = accel_to_pci_dev(accel_dev);
70 	struct pci_dev *parent = pdev->bus->self;
71 	u16 bridge_ctl = 0;
72 
73 	if (!parent)
74 		parent = pdev;
75 
76 	if (!pci_wait_for_pending_transaction(pdev))
77 		dev_info(&GET_DEV(accel_dev),
78 			 "Transaction still in progress. Proceeding\n");
79 
80 	dev_info(&GET_DEV(accel_dev), "Secondary bus reset\n");
81 
82 	pci_read_config_word(parent, PCI_BRIDGE_CONTROL, &bridge_ctl);
83 	bridge_ctl |= PCI_BRIDGE_CTL_BUS_RESET;
84 	pci_write_config_word(parent, PCI_BRIDGE_CONTROL, bridge_ctl);
85 	msleep(100);
86 	bridge_ctl &= ~PCI_BRIDGE_CTL_BUS_RESET;
87 	pci_write_config_word(parent, PCI_BRIDGE_CONTROL, bridge_ctl);
88 	msleep(100);
89 }
90 EXPORT_SYMBOL_GPL(adf_reset_sbr);
91 
/*
 * adf_reset_flr() - reset the device with a PCIe function level reset
 * @accel_dev: acceleration device to reset
 *
 * The return value of pcie_flr() is not checked.
 */
void adf_reset_flr(struct adf_accel_dev *accel_dev)
{
	struct pci_dev *pdev = accel_to_pci_dev(accel_dev);

	pcie_flr(pdev);
}
EXPORT_SYMBOL_GPL(adf_reset_flr);
97 
98 void adf_dev_restore(struct adf_accel_dev *accel_dev)
99 {
100 	struct adf_hw_device_data *hw_device = accel_dev->hw_device;
101 	struct pci_dev *pdev = accel_to_pci_dev(accel_dev);
102 
103 	if (hw_device->reset_device) {
104 		dev_info(&GET_DEV(accel_dev), "Resetting device qat_dev%d\n",
105 			 accel_dev->accel_id);
106 		hw_device->reset_device(accel_dev);
107 		pci_restore_state(pdev);
108 		pci_save_state(pdev);
109 	}
110 }
111 
112 static void adf_device_sriov_worker(struct work_struct *work)
113 {
114 	struct adf_sriov_dev_data *sriov_data =
115 		container_of(work, struct adf_sriov_dev_data, sriov_work);
116 
117 	adf_reenable_sriov(sriov_data->accel_dev);
118 	complete(&sriov_data->compl);
119 }
120 
121 static void adf_device_reset_worker(struct work_struct *work)
122 {
123 	struct adf_reset_dev_data *reset_data =
124 		  container_of(work, struct adf_reset_dev_data, reset_work);
125 	struct adf_accel_dev *accel_dev = reset_data->accel_dev;
126 	unsigned long wait_jiffies = msecs_to_jiffies(10000);
127 	struct adf_sriov_dev_data sriov_data;
128 
129 	adf_dev_restarting_notify(accel_dev);
130 	if (adf_dev_restart(accel_dev)) {
131 		/* The device hanged and we can't restart it so stop here */
132 		dev_err(&GET_DEV(accel_dev), "Restart device failed\n");
133 		if (reset_data->mode == ADF_DEV_RESET_ASYNC ||
134 		    completion_done(&reset_data->compl))
135 			kfree(reset_data);
136 		WARN(1, "QAT: device restart failed. Device is unusable\n");
137 		return;
138 	}
139 
140 	sriov_data.accel_dev = accel_dev;
141 	init_completion(&sriov_data.compl);
142 	INIT_WORK(&sriov_data.sriov_work, adf_device_sriov_worker);
143 	queue_work(device_sriov_wq, &sriov_data.sriov_work);
144 	if (wait_for_completion_timeout(&sriov_data.compl, wait_jiffies))
145 		adf_pf2vf_notify_restarted(accel_dev);
146 
147 	adf_dev_restarted_notify(accel_dev);
148 	clear_bit(ADF_STATUS_RESTARTING, &accel_dev->status);
149 
150 	/*
151 	 * The dev is back alive. Notify the caller if in sync mode
152 	 *
153 	 * If device restart will take a more time than expected,
154 	 * the schedule_reset() function can timeout and exit. This can be
155 	 * detected by calling the completion_done() function. In this case
156 	 * the reset_data structure needs to be freed here.
157 	 */
158 	if (reset_data->mode == ADF_DEV_RESET_ASYNC ||
159 	    completion_done(&reset_data->compl))
160 		kfree(reset_data);
161 	else
162 		complete(&reset_data->compl);
163 }
164 
165 static int adf_dev_aer_schedule_reset(struct adf_accel_dev *accel_dev,
166 				      enum adf_dev_reset_mode mode)
167 {
168 	struct adf_reset_dev_data *reset_data;
169 
170 	if (!adf_dev_started(accel_dev) ||
171 	    test_bit(ADF_STATUS_RESTARTING, &accel_dev->status))
172 		return 0;
173 
174 	set_bit(ADF_STATUS_RESTARTING, &accel_dev->status);
175 	reset_data = kzalloc(sizeof(*reset_data), GFP_KERNEL);
176 	if (!reset_data)
177 		return -ENOMEM;
178 	reset_data->accel_dev = accel_dev;
179 	init_completion(&reset_data->compl);
180 	reset_data->mode = mode;
181 	INIT_WORK(&reset_data->reset_work, adf_device_reset_worker);
182 	queue_work(device_reset_wq, &reset_data->reset_work);
183 
184 	/* If in sync mode wait for the result */
185 	if (mode == ADF_DEV_RESET_SYNC) {
186 		int ret = 0;
187 		/* Maximum device reset time is 10 seconds */
188 		unsigned long wait_jiffies = msecs_to_jiffies(10000);
189 		unsigned long timeout = wait_for_completion_timeout(
190 				   &reset_data->compl, wait_jiffies);
191 		if (!timeout) {
192 			dev_err(&GET_DEV(accel_dev),
193 				"Reset device timeout expired\n");
194 			ret = -EFAULT;
195 		} else {
196 			kfree(reset_data);
197 		}
198 		return ret;
199 	}
200 	return 0;
201 }
202 
203 static pci_ers_result_t adf_slot_reset(struct pci_dev *pdev)
204 {
205 	struct adf_accel_dev *accel_dev = adf_devmgr_pci_to_accel_dev(pdev);
206 	int res = 0;
207 
208 	if (!accel_dev) {
209 		pr_err("QAT: Can't find acceleration device\n");
210 		return PCI_ERS_RESULT_DISCONNECT;
211 	}
212 
213 	if (!pdev->is_busmaster)
214 		pci_set_master(pdev);
215 	pci_restore_state(pdev);
216 	pci_save_state(pdev);
217 	res = adf_dev_up(accel_dev, false);
218 	if (res && res != -EALREADY)
219 		return PCI_ERS_RESULT_DISCONNECT;
220 
221 	adf_reenable_sriov(accel_dev);
222 	adf_pf2vf_notify_restarted(accel_dev);
223 	adf_dev_restarted_notify(accel_dev);
224 	clear_bit(ADF_STATUS_RESTARTING, &accel_dev->status);
225 	return PCI_ERS_RESULT_RECOVERED;
226 }
227 
228 static void adf_resume(struct pci_dev *pdev)
229 {
230 	dev_info(&pdev->dev, "Acceleration driver reset completed\n");
231 	dev_info(&pdev->dev, "Device is up and running\n");
232 }
233 
/*
 * PCI error handler callbacks implemented in this file. The symbol is
 * exported (GPL only) so that it can be referenced from other modules.
 */
const struct pci_error_handlers adf_err_handler = {
	.error_detected = adf_error_detected,
	.slot_reset = adf_slot_reset,
	.resume = adf_resume,
};
EXPORT_SYMBOL_GPL(adf_err_handler);
240 
241 int adf_dev_autoreset(struct adf_accel_dev *accel_dev)
242 {
243 	if (accel_dev->autoreset_on_error)
244 		return adf_dev_aer_schedule_reset(accel_dev, ADF_DEV_RESET_ASYNC);
245 
246 	return 0;
247 }
248 
249 static void adf_notify_fatal_error_worker(struct work_struct *work)
250 {
251 	struct adf_fatal_error_data *wq_data =
252 			container_of(work, struct adf_fatal_error_data, work);
253 	struct adf_accel_dev *accel_dev = wq_data->accel_dev;
254 	struct adf_hw_device_data *hw_device = accel_dev->hw_device;
255 
256 	adf_error_notifier(accel_dev);
257 
258 	if (!accel_dev->is_vf) {
259 		/* Disable arbitration to stop processing of new requests */
260 		if (accel_dev->autoreset_on_error && hw_device->exit_arb)
261 			hw_device->exit_arb(accel_dev);
262 		if (accel_dev->pf.vf_info)
263 			adf_pf2vf_notify_fatal_error(accel_dev);
264 		adf_dev_autoreset(accel_dev);
265 	}
266 
267 	kfree(wq_data);
268 }
269 
270 int adf_notify_fatal_error(struct adf_accel_dev *accel_dev)
271 {
272 	struct adf_fatal_error_data *wq_data;
273 
274 	wq_data = kzalloc(sizeof(*wq_data), GFP_ATOMIC);
275 	if (!wq_data)
276 		return -ENOMEM;
277 
278 	wq_data->accel_dev = accel_dev;
279 	INIT_WORK(&wq_data->work, adf_notify_fatal_error_worker);
280 	adf_misc_wq_queue_work(&wq_data->work);
281 
282 	return 0;
283 }
284 
285 int adf_init_aer(void)
286 {
287 	device_reset_wq = alloc_workqueue("qat_device_reset_wq",
288 					  WQ_MEM_RECLAIM, 0);
289 	if (!device_reset_wq)
290 		return -EFAULT;
291 
292 	device_sriov_wq = alloc_workqueue("qat_device_sriov_wq", 0, 0);
293 	if (!device_sriov_wq)
294 		return -EFAULT;
295 
296 	return 0;
297 }
298 
299 void adf_exit_aer(void)
300 {
301 	if (device_reset_wq)
302 		destroy_workqueue(device_reset_wq);
303 	device_reset_wq = NULL;
304 
305 	if (device_sriov_wq)
306 		destroy_workqueue(device_sriov_wq);
307 	device_sriov_wq = NULL;
308 }
309