1 // SPDX-License-Identifier: (BSD-3-Clause OR GPL-2.0-only) 2 /* Copyright(c) 2014 - 2020 Intel Corporation */ 3 #include <linux/kernel.h> 4 #include <linux/pci.h> 5 #include <linux/completion.h> 6 #include <linux/workqueue.h> 7 #include <linux/delay.h> 8 #include "adf_accel_devices.h" 9 #include "adf_common_drv.h" 10 #include "adf_pfvf_pf_msg.h" 11 12 struct adf_fatal_error_data { 13 struct adf_accel_dev *accel_dev; 14 struct work_struct work; 15 }; 16 17 static struct workqueue_struct *device_reset_wq; 18 static struct workqueue_struct *device_sriov_wq; 19 20 static pci_ers_result_t reset_prepare(struct pci_dev *pdev) 21 { 22 struct adf_accel_dev *accel_dev = adf_devmgr_pci_to_accel_dev(pdev); 23 24 if (!accel_dev) { 25 pci_err(pdev, "Can't find acceleration device\n"); 26 return PCI_ERS_RESULT_DISCONNECT; 27 } 28 29 if (!adf_dev_started(accel_dev)) 30 return PCI_ERS_RESULT_CAN_RECOVER; 31 32 set_bit(ADF_STATUS_RESTARTING, &accel_dev->status); 33 if (accel_dev->hw_device->exit_arb) { 34 dev_dbg(&pdev->dev, "Disabling arbitration\n"); 35 accel_dev->hw_device->exit_arb(accel_dev); 36 } 37 adf_dev_restarting_notify(accel_dev); 38 adf_dev_down(accel_dev); 39 40 return PCI_ERS_RESULT_NEED_RESET; 41 } 42 43 static pci_ers_result_t reset_done(struct pci_dev *pdev) 44 { 45 struct adf_accel_dev *accel_dev = adf_devmgr_pci_to_accel_dev(pdev); 46 int res; 47 48 if (!accel_dev) { 49 pci_err(pdev, "Can't find acceleration device\n"); 50 return PCI_ERS_RESULT_DISCONNECT; 51 } 52 53 if (!adf_devmgr_in_reset(accel_dev)) 54 goto reset_complete; 55 56 pci_restore_state(pdev); 57 res = adf_dev_up(accel_dev, false); 58 if (res && res != -EALREADY) 59 return PCI_ERS_RESULT_DISCONNECT; 60 61 adf_reenable_sriov(accel_dev); 62 adf_pf2vf_notify_restarted(accel_dev); 63 adf_dev_restarted_notify(accel_dev); 64 clear_bit(ADF_STATUS_RESTARTING, &accel_dev->status); 65 66 reset_complete: 67 pci_info(pdev, "Device reset completed successfully\n"); 68 69 return PCI_ERS_RESULT_RECOVERED; 70 } 71 72 static pci_ers_result_t adf_error_detected(struct pci_dev *pdev, 73 pci_channel_state_t state) 74 { 75 struct adf_accel_dev *accel_dev = adf_devmgr_pci_to_accel_dev(pdev); 76 77 pci_info(pdev, "Acceleration driver hardware error detected.\n"); 78 if (!accel_dev) { 79 pci_err(pdev, "Can't find acceleration device\n"); 80 return PCI_ERS_RESULT_DISCONNECT; 81 } 82 83 if (state == pci_channel_io_perm_failure) { 84 pci_err(pdev, "Can't recover from device error\n"); 85 return PCI_ERS_RESULT_DISCONNECT; 86 } 87 88 adf_error_notifier(accel_dev); 89 adf_pf2vf_notify_fatal_error(accel_dev); 90 91 return reset_prepare(pdev); 92 } 93 94 /* reset dev data */ 95 struct adf_reset_dev_data { 96 int mode; 97 struct adf_accel_dev *accel_dev; 98 struct completion compl; 99 struct work_struct reset_work; 100 }; 101 102 /* sriov dev data */ 103 struct adf_sriov_dev_data { 104 struct adf_accel_dev *accel_dev; 105 struct completion compl; 106 struct work_struct sriov_work; 107 }; 108 109 void adf_reset_sbr(struct adf_accel_dev *accel_dev) 110 { 111 struct pci_dev *pdev = accel_to_pci_dev(accel_dev); 112 struct pci_dev *parent = pdev->bus->self; 113 u16 bridge_ctl = 0; 114 115 if (!parent) 116 parent = pdev; 117 118 if (!pci_wait_for_pending_transaction(pdev)) 119 pci_info(pdev, "Transaction still in progress. Proceeding\n"); 120 121 pci_info(pdev, "Secondary bus reset\n"); 122 123 pci_read_config_word(parent, PCI_BRIDGE_CONTROL, &bridge_ctl); 124 bridge_ctl |= PCI_BRIDGE_CTL_BUS_RESET; 125 pci_write_config_word(parent, PCI_BRIDGE_CONTROL, bridge_ctl); 126 msleep(100); 127 bridge_ctl &= ~PCI_BRIDGE_CTL_BUS_RESET; 128 pci_write_config_word(parent, PCI_BRIDGE_CONTROL, bridge_ctl); 129 msleep(100); 130 } 131 EXPORT_SYMBOL_GPL(adf_reset_sbr); 132 133 void adf_reset_flr(struct adf_accel_dev *accel_dev) 134 { 135 pcie_flr(accel_to_pci_dev(accel_dev)); 136 } 137 EXPORT_SYMBOL_GPL(adf_reset_flr); 138 139 void adf_dev_restore(struct adf_accel_dev *accel_dev) 140 { 141 struct adf_hw_device_data *hw_device = accel_dev->hw_device; 142 struct pci_dev *pdev = accel_to_pci_dev(accel_dev); 143 144 if (hw_device->reset_device) { 145 dev_info(&GET_DEV(accel_dev), "Resetting device qat_dev%d\n", 146 accel_dev->accel_id); 147 hw_device->reset_device(accel_dev); 148 pci_restore_state(pdev); 149 } 150 } 151 152 void adf_set_bme(struct adf_accel_dev *accel_dev) 153 { 154 struct pci_dev *pdev = accel_to_pci_dev(accel_dev); 155 156 pci_set_master(pdev); 157 } 158 159 static void adf_device_sriov_worker(struct work_struct *work) 160 { 161 struct adf_sriov_dev_data *sriov_data = 162 container_of(work, struct adf_sriov_dev_data, sriov_work); 163 164 adf_reenable_sriov(sriov_data->accel_dev); 165 complete(&sriov_data->compl); 166 } 167 168 static void adf_device_reset_worker(struct work_struct *work) 169 { 170 struct adf_reset_dev_data *reset_data = 171 container_of(work, struct adf_reset_dev_data, reset_work); 172 struct adf_accel_dev *accel_dev = reset_data->accel_dev; 173 unsigned long wait_jiffies = msecs_to_jiffies(10000); 174 struct adf_sriov_dev_data sriov_data; 175 176 adf_dev_restarting_notify(accel_dev); 177 if (adf_dev_restart(accel_dev)) { 178 /* The device hanged and we can't restart it so stop here */ 179 dev_err(&GET_DEV(accel_dev), "Restart device failed\n"); 180 if (reset_data->mode == ADF_DEV_RESET_ASYNC) 181 kfree(reset_data); 182 WARN(1, "QAT: device restart failed. Device is unusable\n"); 183 return; 184 } 185 186 sriov_data.accel_dev = accel_dev; 187 init_completion(&sriov_data.compl); 188 INIT_WORK(&sriov_data.sriov_work, adf_device_sriov_worker); 189 queue_work(device_sriov_wq, &sriov_data.sriov_work); 190 if (wait_for_completion_timeout(&sriov_data.compl, wait_jiffies)) 191 adf_pf2vf_notify_restarted(accel_dev); 192 193 adf_dev_restarted_notify(accel_dev); 194 clear_bit(ADF_STATUS_RESTARTING, &accel_dev->status); 195 196 /* The dev is back alive. Notify the caller if in sync mode */ 197 if (reset_data->mode == ADF_DEV_RESET_ASYNC) 198 kfree(reset_data); 199 else 200 complete(&reset_data->compl); 201 } 202 203 static int adf_dev_aer_schedule_reset(struct adf_accel_dev *accel_dev, 204 enum adf_dev_reset_mode mode) 205 { 206 struct adf_reset_dev_data *reset_data; 207 208 if (!adf_dev_started(accel_dev) || 209 test_and_set_bit(ADF_STATUS_RESTARTING, &accel_dev->status)) 210 return 0; 211 212 reset_data = kzalloc_obj(*reset_data); 213 if (!reset_data) { 214 clear_bit(ADF_STATUS_RESTARTING, &accel_dev->status); 215 return -ENOMEM; 216 } 217 reset_data->accel_dev = accel_dev; 218 init_completion(&reset_data->compl); 219 reset_data->mode = mode; 220 INIT_WORK(&reset_data->reset_work, adf_device_reset_worker); 221 queue_work(device_reset_wq, &reset_data->reset_work); 222 223 /* If in sync mode wait for the result */ 224 if (mode == ADF_DEV_RESET_SYNC) { 225 int ret = 0; 226 /* Maximum device reset time is 10 seconds */ 227 unsigned long wait_jiffies = msecs_to_jiffies(10000); 228 unsigned long timeout = wait_for_completion_timeout( 229 &reset_data->compl, wait_jiffies); 230 if (!timeout) { 231 dev_err(&GET_DEV(accel_dev), 232 "Reset device timeout expired\n"); 233 cancel_work_sync(&reset_data->reset_work); 234 ret = -EFAULT; 235 } 236 kfree(reset_data); 237 return ret; 238 } 239 return 0; 240 } 241 242 static pci_ers_result_t adf_slot_reset(struct pci_dev *pdev) 243 { 244 return reset_done(pdev); 245 } 246 247 static void adf_resume(struct pci_dev *pdev) 248 { 249 pci_info(pdev, "Acceleration driver reset completed\n"); 250 pci_info(pdev, "Device is up and running\n"); 251 } 252 253 static void adf_reset_prepare(struct pci_dev *pdev) 254 { 255 reset_prepare(pdev); 256 } 257 258 static void adf_reset_done(struct pci_dev *pdev) 259 { 260 reset_done(pdev); 261 } 262 263 const struct pci_error_handlers adf_err_handler = { 264 .error_detected = adf_error_detected, 265 .slot_reset = adf_slot_reset, 266 .resume = adf_resume, 267 .reset_prepare = adf_reset_prepare, 268 .reset_done = adf_reset_done, 269 }; 270 EXPORT_SYMBOL_GPL(adf_err_handler); 271 272 static int adf_dev_autoreset(struct adf_accel_dev *accel_dev) 273 { 274 if (accel_dev->autoreset_on_error) 275 return adf_dev_aer_schedule_reset(accel_dev, ADF_DEV_RESET_ASYNC); 276 277 return 0; 278 } 279 280 static void adf_notify_fatal_error_worker(struct work_struct *work) 281 { 282 struct adf_fatal_error_data *wq_data = 283 container_of(work, struct adf_fatal_error_data, work); 284 struct adf_accel_dev *accel_dev = wq_data->accel_dev; 285 struct adf_hw_device_data *hw_device = accel_dev->hw_device; 286 287 adf_error_notifier(accel_dev); 288 289 if (!accel_dev->is_vf) { 290 /* Disable arbitration to stop processing of new requests */ 291 if (accel_dev->autoreset_on_error && hw_device->exit_arb) 292 hw_device->exit_arb(accel_dev); 293 if (accel_dev->pf.vf_info) 294 adf_pf2vf_notify_fatal_error(accel_dev); 295 adf_dev_autoreset(accel_dev); 296 } 297 298 kfree(wq_data); 299 } 300 301 int adf_notify_fatal_error(struct adf_accel_dev *accel_dev) 302 { 303 struct adf_fatal_error_data *wq_data; 304 305 wq_data = kzalloc_obj(*wq_data, GFP_ATOMIC); 306 if (!wq_data) 307 return -ENOMEM; 308 309 wq_data->accel_dev = accel_dev; 310 INIT_WORK(&wq_data->work, adf_notify_fatal_error_worker); 311 adf_misc_wq_queue_work(&wq_data->work); 312 313 return 0; 314 } 315 316 int adf_init_aer(void) 317 { 318 device_reset_wq = alloc_workqueue("qat_device_reset_wq", 319 WQ_MEM_RECLAIM | WQ_PERCPU, 0); 320 if (!device_reset_wq) 321 return -EFAULT; 322 323 device_sriov_wq = alloc_workqueue("qat_device_sriov_wq", WQ_PERCPU, 0); 324 if (!device_sriov_wq) { 325 destroy_workqueue(device_reset_wq); 326 device_reset_wq = NULL; 327 return -EFAULT; 328 } 329 330 return 0; 331 } 332 333 void adf_exit_aer(void) 334 { 335 if (device_reset_wq) 336 destroy_workqueue(device_reset_wq); 337 device_reset_wq = NULL; 338 339 if (device_sriov_wq) 340 destroy_workqueue(device_sriov_wq); 341 device_sriov_wq = NULL; 342 } 343