1 // SPDX-License-Identifier: (BSD-3-Clause OR GPL-2.0-only) 2 /* Copyright(c) 2014 - 2020 Intel Corporation */ 3 #include <linux/kernel.h> 4 #include <linux/pci.h> 5 #include <linux/completion.h> 6 #include <linux/workqueue.h> 7 #include <linux/delay.h> 8 #include "adf_accel_devices.h" 9 #include "adf_common_drv.h" 10 #include "adf_pfvf_pf_msg.h" 11 12 struct adf_fatal_error_data { 13 struct adf_accel_dev *accel_dev; 14 struct work_struct work; 15 }; 16 17 static struct workqueue_struct *device_reset_wq; 18 static struct workqueue_struct *device_sriov_wq; 19 20 static pci_ers_result_t adf_error_detected(struct pci_dev *pdev, 21 pci_channel_state_t state) 22 { 23 struct adf_accel_dev *accel_dev = adf_devmgr_pci_to_accel_dev(pdev); 24 25 dev_info(&pdev->dev, "Acceleration driver hardware error detected.\n"); 26 if (!accel_dev) { 27 dev_err(&pdev->dev, "Can't find acceleration device\n"); 28 return PCI_ERS_RESULT_DISCONNECT; 29 } 30 31 if (state == pci_channel_io_perm_failure) { 32 dev_err(&pdev->dev, "Can't recover from device error\n"); 33 return PCI_ERS_RESULT_DISCONNECT; 34 } 35 36 set_bit(ADF_STATUS_RESTARTING, &accel_dev->status); 37 if (accel_dev->hw_device->exit_arb) { 38 dev_dbg(&pdev->dev, "Disabling arbitration\n"); 39 accel_dev->hw_device->exit_arb(accel_dev); 40 } 41 adf_error_notifier(accel_dev); 42 adf_pf2vf_notify_fatal_error(accel_dev); 43 adf_dev_restarting_notify(accel_dev); 44 adf_pf2vf_notify_restarting(accel_dev); 45 adf_pf2vf_wait_for_restarting_complete(accel_dev); 46 pci_clear_master(pdev); 47 adf_dev_down(accel_dev, false); 48 49 return PCI_ERS_RESULT_NEED_RESET; 50 } 51 52 /* reset dev data */ 53 struct adf_reset_dev_data { 54 int mode; 55 struct adf_accel_dev *accel_dev; 56 struct completion compl; 57 struct work_struct reset_work; 58 }; 59 60 /* sriov dev data */ 61 struct adf_sriov_dev_data { 62 struct adf_accel_dev *accel_dev; 63 struct completion compl; 64 struct work_struct sriov_work; 65 }; 66 67 void adf_reset_sbr(struct adf_accel_dev *accel_dev) 68 { 69 struct pci_dev *pdev = accel_to_pci_dev(accel_dev); 70 struct pci_dev *parent = pdev->bus->self; 71 u16 bridge_ctl = 0; 72 73 if (!parent) 74 parent = pdev; 75 76 if (!pci_wait_for_pending_transaction(pdev)) 77 dev_info(&GET_DEV(accel_dev), 78 "Transaction still in progress. Proceeding\n"); 79 80 dev_info(&GET_DEV(accel_dev), "Secondary bus reset\n"); 81 82 pci_read_config_word(parent, PCI_BRIDGE_CONTROL, &bridge_ctl); 83 bridge_ctl |= PCI_BRIDGE_CTL_BUS_RESET; 84 pci_write_config_word(parent, PCI_BRIDGE_CONTROL, bridge_ctl); 85 msleep(100); 86 bridge_ctl &= ~PCI_BRIDGE_CTL_BUS_RESET; 87 pci_write_config_word(parent, PCI_BRIDGE_CONTROL, bridge_ctl); 88 msleep(100); 89 } 90 EXPORT_SYMBOL_GPL(adf_reset_sbr); 91 92 void adf_reset_flr(struct adf_accel_dev *accel_dev) 93 { 94 pcie_flr(accel_to_pci_dev(accel_dev)); 95 } 96 EXPORT_SYMBOL_GPL(adf_reset_flr); 97 98 void adf_dev_restore(struct adf_accel_dev *accel_dev) 99 { 100 struct adf_hw_device_data *hw_device = accel_dev->hw_device; 101 struct pci_dev *pdev = accel_to_pci_dev(accel_dev); 102 103 if (hw_device->reset_device) { 104 dev_info(&GET_DEV(accel_dev), "Resetting device qat_dev%d\n", 105 accel_dev->accel_id); 106 hw_device->reset_device(accel_dev); 107 pci_restore_state(pdev); 108 pci_save_state(pdev); 109 } 110 } 111 112 static void adf_device_sriov_worker(struct work_struct *work) 113 { 114 struct adf_sriov_dev_data *sriov_data = 115 container_of(work, struct adf_sriov_dev_data, sriov_work); 116 117 adf_reenable_sriov(sriov_data->accel_dev); 118 complete(&sriov_data->compl); 119 } 120 121 static void adf_device_reset_worker(struct work_struct *work) 122 { 123 struct adf_reset_dev_data *reset_data = 124 container_of(work, struct adf_reset_dev_data, reset_work); 125 struct adf_accel_dev *accel_dev = reset_data->accel_dev; 126 unsigned long wait_jiffies = msecs_to_jiffies(10000); 127 struct adf_sriov_dev_data sriov_data; 128 129 adf_dev_restarting_notify(accel_dev); 130 if (adf_dev_restart(accel_dev)) { 131 /* The device hanged and we can't restart it so stop here */ 132 dev_err(&GET_DEV(accel_dev), "Restart device failed\n"); 133 if (reset_data->mode == ADF_DEV_RESET_ASYNC || 134 completion_done(&reset_data->compl)) 135 kfree(reset_data); 136 WARN(1, "QAT: device restart failed. Device is unusable\n"); 137 return; 138 } 139 140 sriov_data.accel_dev = accel_dev; 141 init_completion(&sriov_data.compl); 142 INIT_WORK(&sriov_data.sriov_work, adf_device_sriov_worker); 143 queue_work(device_sriov_wq, &sriov_data.sriov_work); 144 if (wait_for_completion_timeout(&sriov_data.compl, wait_jiffies)) 145 adf_pf2vf_notify_restarted(accel_dev); 146 147 adf_dev_restarted_notify(accel_dev); 148 clear_bit(ADF_STATUS_RESTARTING, &accel_dev->status); 149 150 /* 151 * The dev is back alive. Notify the caller if in sync mode 152 * 153 * If device restart will take a more time than expected, 154 * the schedule_reset() function can timeout and exit. This can be 155 * detected by calling the completion_done() function. In this case 156 * the reset_data structure needs to be freed here. 157 */ 158 if (reset_data->mode == ADF_DEV_RESET_ASYNC || 159 completion_done(&reset_data->compl)) 160 kfree(reset_data); 161 else 162 complete(&reset_data->compl); 163 } 164 165 static int adf_dev_aer_schedule_reset(struct adf_accel_dev *accel_dev, 166 enum adf_dev_reset_mode mode) 167 { 168 struct adf_reset_dev_data *reset_data; 169 170 if (!adf_dev_started(accel_dev) || 171 test_bit(ADF_STATUS_RESTARTING, &accel_dev->status)) 172 return 0; 173 174 set_bit(ADF_STATUS_RESTARTING, &accel_dev->status); 175 reset_data = kzalloc(sizeof(*reset_data), GFP_KERNEL); 176 if (!reset_data) 177 return -ENOMEM; 178 reset_data->accel_dev = accel_dev; 179 init_completion(&reset_data->compl); 180 reset_data->mode = mode; 181 INIT_WORK(&reset_data->reset_work, adf_device_reset_worker); 182 queue_work(device_reset_wq, &reset_data->reset_work); 183 184 /* If in sync mode wait for the result */ 185 if (mode == ADF_DEV_RESET_SYNC) { 186 int ret = 0; 187 /* Maximum device reset time is 10 seconds */ 188 unsigned long wait_jiffies = msecs_to_jiffies(10000); 189 unsigned long timeout = wait_for_completion_timeout( 190 &reset_data->compl, wait_jiffies); 191 if (!timeout) { 192 dev_err(&GET_DEV(accel_dev), 193 "Reset device timeout expired\n"); 194 ret = -EFAULT; 195 } else { 196 kfree(reset_data); 197 } 198 return ret; 199 } 200 return 0; 201 } 202 203 static pci_ers_result_t adf_slot_reset(struct pci_dev *pdev) 204 { 205 struct adf_accel_dev *accel_dev = adf_devmgr_pci_to_accel_dev(pdev); 206 int res = 0; 207 208 if (!accel_dev) { 209 pr_err("QAT: Can't find acceleration device\n"); 210 return PCI_ERS_RESULT_DISCONNECT; 211 } 212 213 if (!pdev->is_busmaster) 214 pci_set_master(pdev); 215 pci_restore_state(pdev); 216 pci_save_state(pdev); 217 res = adf_dev_up(accel_dev, false); 218 if (res && res != -EALREADY) 219 return PCI_ERS_RESULT_DISCONNECT; 220 221 adf_reenable_sriov(accel_dev); 222 adf_pf2vf_notify_restarted(accel_dev); 223 adf_dev_restarted_notify(accel_dev); 224 clear_bit(ADF_STATUS_RESTARTING, &accel_dev->status); 225 return PCI_ERS_RESULT_RECOVERED; 226 } 227 228 static void adf_resume(struct pci_dev *pdev) 229 { 230 dev_info(&pdev->dev, "Acceleration driver reset completed\n"); 231 dev_info(&pdev->dev, "Device is up and running\n"); 232 } 233 234 const struct pci_error_handlers adf_err_handler = { 235 .error_detected = adf_error_detected, 236 .slot_reset = adf_slot_reset, 237 .resume = adf_resume, 238 }; 239 EXPORT_SYMBOL_GPL(adf_err_handler); 240 241 int adf_dev_autoreset(struct adf_accel_dev *accel_dev) 242 { 243 if (accel_dev->autoreset_on_error) 244 return adf_dev_aer_schedule_reset(accel_dev, ADF_DEV_RESET_ASYNC); 245 246 return 0; 247 } 248 249 static void adf_notify_fatal_error_worker(struct work_struct *work) 250 { 251 struct adf_fatal_error_data *wq_data = 252 container_of(work, struct adf_fatal_error_data, work); 253 struct adf_accel_dev *accel_dev = wq_data->accel_dev; 254 struct adf_hw_device_data *hw_device = accel_dev->hw_device; 255 256 adf_error_notifier(accel_dev); 257 258 if (!accel_dev->is_vf) { 259 /* Disable arbitration to stop processing of new requests */ 260 if (accel_dev->autoreset_on_error && hw_device->exit_arb) 261 hw_device->exit_arb(accel_dev); 262 if (accel_dev->pf.vf_info) 263 adf_pf2vf_notify_fatal_error(accel_dev); 264 adf_dev_autoreset(accel_dev); 265 } 266 267 kfree(wq_data); 268 } 269 270 int adf_notify_fatal_error(struct adf_accel_dev *accel_dev) 271 { 272 struct adf_fatal_error_data *wq_data; 273 274 wq_data = kzalloc(sizeof(*wq_data), GFP_ATOMIC); 275 if (!wq_data) 276 return -ENOMEM; 277 278 wq_data->accel_dev = accel_dev; 279 INIT_WORK(&wq_data->work, adf_notify_fatal_error_worker); 280 adf_misc_wq_queue_work(&wq_data->work); 281 282 return 0; 283 } 284 285 int adf_init_aer(void) 286 { 287 device_reset_wq = alloc_workqueue("qat_device_reset_wq", 288 WQ_MEM_RECLAIM, 0); 289 if (!device_reset_wq) 290 return -EFAULT; 291 292 device_sriov_wq = alloc_workqueue("qat_device_sriov_wq", 0, 0); 293 if (!device_sriov_wq) 294 return -EFAULT; 295 296 return 0; 297 } 298 299 void adf_exit_aer(void) 300 { 301 if (device_reset_wq) 302 destroy_workqueue(device_reset_wq); 303 device_reset_wq = NULL; 304 305 if (device_sriov_wq) 306 destroy_workqueue(device_sriov_wq); 307 device_sriov_wq = NULL; 308 } 309