xref: /linux/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c (revision 8f7aa3d3c7323f4ca2768a9e74ebbe359c4f8f88)
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2020, Mellanox Technologies inc.  All rights reserved. */

#include "devlink.h"

#include "fw_reset.h"
#include "diag/fw_tracer.h"
#include "lib/tout.h"
#include "sf/sf.h"

enum {
	MLX5_FW_RESET_FLAGS_RESET_REQUESTED,
	MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST,
	MLX5_FW_RESET_FLAGS_PENDING_COMP,
	MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS,
	MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED,
	MLX5_FW_RESET_FLAGS_UNLOAD_EVENT,
};

struct mlx5_fw_reset {
	struct mlx5_core_dev *dev;
	struct mlx5_nb nb;
	struct workqueue_struct *wq;
	struct work_struct fw_live_patch_work;
	struct work_struct reset_request_work;
	struct work_struct reset_unload_work;
	struct work_struct reset_reload_work;
	struct work_struct reset_now_work;
	struct work_struct reset_abort_work;
	struct delayed_work reset_timeout_work;
	unsigned long reset_flags;
	u8 reset_method;
	struct timer_list timer;
	struct completion done;
	int ret;
};

enum {
	MLX5_FW_RST_STATE_IDLE = 0,
	MLX5_FW_RST_STATE_TOGGLE_REQ = 4,
	MLX5_FW_RST_STATE_DROP_MODE = 5,
};

enum {
	MLX5_RST_STATE_BIT_NUM = 12,
	MLX5_RST_ACK_BIT_NUM = 22,
};

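/* Sync reset state is exchanged through the initialization segment:
 * firmware reports its reset state in bits [15:12] of the 'initializing'
 * word, and the driver acknowledges by writing bit 22.
 */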
static u8 mlx5_get_fw_rst_state(struct mlx5_core_dev *dev)
{
	return (ioread32be(&dev->iseg->initializing) >> MLX5_RST_STATE_BIT_NUM) & 0xF;
}

static void mlx5_set_fw_rst_ack(struct mlx5_core_dev *dev)
{
	iowrite32be(BIT(MLX5_RST_ACK_BIT_NUM), &dev->iseg->initializing);
}

static int mlx5_fw_reset_enable_remote_dev_reset_set(struct devlink *devlink, u32 id,
						     struct devlink_param_gset_ctx *ctx,
						     struct netlink_ext_ack *extack)
{
	struct mlx5_core_dev *dev = devlink_priv(devlink);
	struct mlx5_fw_reset *fw_reset;

	fw_reset = dev->priv.fw_reset;

	if (ctx->val.vbool)
		clear_bit(MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, &fw_reset->reset_flags);
	else
		set_bit(MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, &fw_reset->reset_flags);
	return 0;
}

static int mlx5_fw_reset_enable_remote_dev_reset_get(struct devlink *devlink, u32 id,
						     struct devlink_param_gset_ctx *ctx,
						     struct netlink_ext_ack *extack)
{
	struct mlx5_core_dev *dev = devlink_priv(devlink);
	struct mlx5_fw_reset *fw_reset;

	fw_reset = dev->priv.fw_reset;

	ctx->val.vbool = !test_bit(MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST,
				   &fw_reset->reset_flags);
	return 0;
}

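/* Helpers for the MFRL (Management Firmware Reset Level) register, which
 * carries both the requested reset level/type and the PCI
 * sync-for-fw-update handshake fields (response and start).
 */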
static int mlx5_reg_mfrl_set(struct mlx5_core_dev *dev, u8 reset_level,
			     u8 reset_type_sel, u8 sync_resp, bool sync_start)
{
	u32 out[MLX5_ST_SZ_DW(mfrl_reg)] = {};
	u32 in[MLX5_ST_SZ_DW(mfrl_reg)] = {};

	MLX5_SET(mfrl_reg, in, reset_level, reset_level);
	MLX5_SET(mfrl_reg, in, rst_type_sel, reset_type_sel);
	MLX5_SET(mfrl_reg, in, pci_sync_for_fw_update_resp, sync_resp);
	MLX5_SET(mfrl_reg, in, pci_sync_for_fw_update_start, sync_start);

	return mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_MFRL, 0, 1);
}

static int mlx5_reg_mfrl_query(struct mlx5_core_dev *dev, u8 *reset_level,
			       u8 *reset_type, u8 *reset_state, u8 *reset_method)
{
	u32 out[MLX5_ST_SZ_DW(mfrl_reg)] = {};
	u32 in[MLX5_ST_SZ_DW(mfrl_reg)] = {};
	int err;

	err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_MFRL, 0, 0);
	if (err)
		return err;

	if (reset_level)
		*reset_level = MLX5_GET(mfrl_reg, out, reset_level);
	if (reset_type)
		*reset_type = MLX5_GET(mfrl_reg, out, reset_type);
	if (reset_state)
		*reset_state = MLX5_GET(mfrl_reg, out, reset_state);
	if (reset_method)
		*reset_method = MLX5_GET(mfrl_reg, out, pci_reset_req_method);

	return 0;
}

int mlx5_fw_reset_query(struct mlx5_core_dev *dev, u8 *reset_level, u8 *reset_type)
{
	return mlx5_reg_mfrl_query(dev, reset_level, reset_type, NULL, NULL);
}

static int mlx5_fw_reset_get_reset_method(struct mlx5_core_dev *dev,
					  u8 *reset_method)
{
	if (!MLX5_CAP_GEN(dev, pcie_reset_using_hotreset_method)) {
		*reset_method = MLX5_MFRL_REG_PCI_RESET_METHOD_LINK_TOGGLE;
		return 0;
	}

	return mlx5_reg_mfrl_query(dev, NULL, NULL, NULL, reset_method);
}

static int mlx5_fw_reset_get_reset_state_err(struct mlx5_core_dev *dev,
					     struct netlink_ext_ack *extack)
{
	u8 reset_state;

	if (mlx5_reg_mfrl_query(dev, NULL, NULL, &reset_state, NULL))
		goto out;

	if (!reset_state)
		return 0;

	switch (reset_state) {
	case MLX5_MFRL_REG_RESET_STATE_IN_NEGOTIATION:
	case MLX5_MFRL_REG_RESET_STATE_RESET_IN_PROGRESS:
		NL_SET_ERR_MSG_MOD(extack, "Sync reset still in progress");
		return -EBUSY;
	case MLX5_MFRL_REG_RESET_STATE_NEG_TIMEOUT:
		NL_SET_ERR_MSG_MOD(extack, "Sync reset negotiation timeout");
		return -ETIMEDOUT;
	case MLX5_MFRL_REG_RESET_STATE_NACK:
		NL_SET_ERR_MSG_MOD(extack, "One of the hosts disabled reset");
		return -EPERM;
	case MLX5_MFRL_REG_RESET_STATE_UNLOAD_TIMEOUT:
		NL_SET_ERR_MSG_MOD(extack, "Sync reset unload timeout");
		return -ETIMEDOUT;
	}

out:
	NL_SET_ERR_MSG_MOD(extack, "Sync reset failed");
	return -EIO;
}

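/* Issue a level-3 sync reset request and mark a completion as pending for
 * mlx5_fw_reset_wait_reset_done(). mlx5_access_reg() is used directly so
 * that a raw -EREMOTEIO can be translated into a more specific
 * reset_state error before falling back to mlx5_cmd_check().
 */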
int mlx5_fw_reset_set_reset_sync(struct mlx5_core_dev *dev, u8 reset_type_sel,
				 struct netlink_ext_ack *extack)
{
	struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
	u32 out[MLX5_ST_SZ_DW(mfrl_reg)] = {};
	u32 in[MLX5_ST_SZ_DW(mfrl_reg)] = {};
	int err, rst_res;

	set_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags);

	MLX5_SET(mfrl_reg, in, reset_level, MLX5_MFRL_REG_RESET_LEVEL3);
	MLX5_SET(mfrl_reg, in, rst_type_sel, reset_type_sel);
	MLX5_SET(mfrl_reg, in, pci_sync_for_fw_update_start, 1);
	err = mlx5_access_reg(dev, in, sizeof(in), out, sizeof(out),
			      MLX5_REG_MFRL, 0, 1, false);
	if (!err)
		return 0;

	clear_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags);
	if (err == -EREMOTEIO && MLX5_CAP_MCAM_FEATURE(dev, reset_state)) {
		rst_res = mlx5_fw_reset_get_reset_state_err(dev, extack);
		return rst_res ? rst_res : err;
	}

	NL_SET_ERR_MSG_MOD(extack, "Sync reset command failed");
	return mlx5_cmd_check(dev, err, in, out);
}

int mlx5_fw_reset_verify_fw_complete(struct mlx5_core_dev *dev,
				     struct netlink_ext_ack *extack)
{
	u8 rst_state;
	int err;

	err = mlx5_fw_reset_get_reset_state_err(dev, extack);
	if (err)
		return err;

	rst_state = mlx5_get_fw_rst_state(dev);
	if (!rst_state)
		return 0;

	mlx5_core_err(dev, "Sync reset did not complete, state=%d\n", rst_state);
	NL_SET_ERR_MSG_MOD(extack, "Sync reset did not complete successfully");
	return rst_state;
}

int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev)
{
	return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL0, 0, 0, false);
}

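/* Finish the sync reset flow: if this host initiated the reset, the
 * initiator thread blocked in mlx5_fw_reset_wait_reset_done() owns the
 * devlink reload, so only the completion is signaled here. Otherwise the
 * reset was remotely initiated and the driver reloads itself.
 */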
static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev)
{
	struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
	struct devlink *devlink = priv_to_devlink(dev);

	/* If this driver initiated the fw reset, the devlink reload flow
	 * completes it; just signal the waiter.
	 */
	if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) {
		complete(&fw_reset->done);
	} else {
		mlx5_sync_reset_unload_flow(dev, false);
		if (mlx5_health_wait_pci_up(dev))
			mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n");
		else
			mlx5_load_one(dev, true);
		devl_lock(devlink);
		devlink_remote_reload_actions_performed(devlink, 0,
							BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) |
							BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE));
		devl_unlock(devlink);
	}
}

static void mlx5_stop_sync_reset_poll(struct mlx5_core_dev *dev)
{
	struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;

	timer_delete_sync(&fw_reset->timer);
}

static int mlx5_sync_reset_clear_reset_requested(struct mlx5_core_dev *dev, bool poll_health)
{
	struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;

	if (!test_and_clear_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags)) {
		mlx5_core_warn(dev, "Reset request was already cleared\n");
		return -EALREADY;
	}

	if (current_work() != &fw_reset->reset_timeout_work.work)
		cancel_delayed_work(&fw_reset->reset_timeout_work);
	mlx5_stop_sync_reset_poll(dev);
	if (poll_health)
		mlx5_start_health_poll(dev);
	return 0;
}

static void mlx5_sync_reset_reload_work(struct work_struct *work)
{
	struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
						      reset_reload_work);
	struct mlx5_core_dev *dev = fw_reset->dev;

	mlx5_sync_reset_clear_reset_requested(dev, false);
	mlx5_enter_error_state(dev, true);
	mlx5_fw_reset_complete_reload(dev);
}

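/* While a reset request is pending, poll the fatal-error sensors so the
 * expected device reset is detected promptly and the reload work can be
 * queued.
 */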
#define MLX5_RESET_POLL_INTERVAL	(HZ / 10)
static void poll_sync_reset(struct timer_list *t)
{
	struct mlx5_fw_reset *fw_reset = timer_container_of(fw_reset, t,
							    timer);
	struct mlx5_core_dev *dev = fw_reset->dev;
	u32 fatal_error;

	if (!test_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags))
		return;

	fatal_error = mlx5_health_check_fatal_sensors(dev);

	if (fatal_error) {
		mlx5_core_warn(dev, "Got Device Reset\n");
		if (!test_bit(MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, &fw_reset->reset_flags))
			queue_work(fw_reset->wq, &fw_reset->reset_reload_work);
		else
			mlx5_core_err(dev, "Device is being removed, dropping new reset work\n");
		return;
	}

	mod_timer(&fw_reset->timer, round_jiffies(jiffies + MLX5_RESET_POLL_INTERVAL));
}

static void mlx5_start_sync_reset_poll(struct mlx5_core_dev *dev)
{
	struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;

	timer_setup(&fw_reset->timer, poll_sync_reset, 0);
	fw_reset->timer.expires = round_jiffies(jiffies + MLX5_RESET_POLL_INTERVAL);
	add_timer(&fw_reset->timer);
}

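/* Reply to a pending sync reset request through MFRL:
 * pci_sync_for_fw_update_resp = 1 acks the request, 2 nacks it.
 */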
static int mlx5_fw_reset_set_reset_sync_ack(struct mlx5_core_dev *dev)
{
	return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL3, 0, 1, false);
}

static int mlx5_fw_reset_set_reset_sync_nack(struct mlx5_core_dev *dev)
{
	return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL3, 0, 2, false);
}

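/* Mark a reset as requested: regular health polling is replaced with the
 * faster sync reset poll timer, and a timeout work is armed in case the
 * reset request is never followed through.
 */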
static int mlx5_sync_reset_set_reset_requested(struct mlx5_core_dev *dev)
{
	struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;

	if (test_and_set_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags)) {
		mlx5_core_warn(dev, "Reset request was already set\n");
		return -EALREADY;
	}
	mlx5_stop_health_poll(dev, true);
	mlx5_start_sync_reset_poll(dev);

	if (!test_bit(MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS,
		      &fw_reset->reset_flags))
		schedule_delayed_work(&fw_reset->reset_timeout_work,
				      msecs_to_jiffies(mlx5_tout_ms(dev, PCI_SYNC_UPDATE)));
	return 0;
}

static void mlx5_fw_live_patch_event(struct work_struct *work)
{
	struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
						      fw_live_patch_work);
	struct mlx5_core_dev *dev = fw_reset->dev;

	mlx5_core_info(dev, "Live patch updated firmware version: %d.%d.%d\n", fw_rev_maj(dev),
		       fw_rev_min(dev), fw_rev_sub(dev));

	if (mlx5_fw_tracer_reload(dev->tracer))
		mlx5_core_err(dev, "Failed to reload FW tracer\n");
}

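/* If the PCIe hotplug driver has both hotplug and link-state-change
 * interrupts armed on the bridge, a link toggle would be handled as a
 * surprise removal, so refuse the reset in that case.
 */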
#if IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)
static int mlx5_check_hotplug_interrupt(struct mlx5_core_dev *dev,
					struct pci_dev *bridge)
{
	u16 reg16;
	int err;

	err = pcie_capability_read_word(bridge, PCI_EXP_SLTCTL, &reg16);
	if (err)
		return err;

	if ((reg16 & PCI_EXP_SLTCTL_HPIE) && (reg16 & PCI_EXP_SLTCTL_DLLSCE)) {
		mlx5_core_warn(dev, "FW reset is not supported as HotPlug is enabled\n");
		return -EOPNOTSUPP;
	}

	return 0;
}
#endif

static const struct pci_device_id mgt_ifc_device_ids[] = {
	{ PCI_VDEVICE(MELLANOX, 0xc2d2) }, /* BlueField1 MGT interface device ID */
	{ PCI_VDEVICE(MELLANOX, 0xc2d3) }, /* BlueField2 MGT interface device ID */
	{ PCI_VDEVICE(MELLANOX, 0xc2d4) }, /* BlueField3-Lx MGT interface device ID */
	{ PCI_VDEVICE(MELLANOX, 0xc2d5) }, /* BlueField3 MGT interface device ID */
	{ PCI_VDEVICE(MELLANOX, 0xc2d6) }, /* BlueField4 MGT interface device ID */
};

static bool mlx5_is_mgt_ifc_pci_device(struct mlx5_core_dev *dev, u16 dev_id)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mgt_ifc_device_ids); ++i)
		if (mgt_ifc_device_ids[i].device == dev_id)
			return true;

	return false;
}

static int mlx5_check_dev_ids(struct mlx5_core_dev *dev, u16 dev_id)
{
	struct pci_bus *bridge_bus = dev->pdev->bus;
	struct pci_dev *sdev;
	u16 sdev_id;
	int err;

	/* Check that all functions under the PCI bridge are either PFs of
	 * this device or known management interface devices; otherwise fail.
	 */
	list_for_each_entry(sdev, &bridge_bus->devices, bus_list) {
		err = pci_read_config_word(sdev, PCI_DEVICE_ID, &sdev_id);
		if (err)
			return pcibios_err_to_errno(err);

		if (sdev_id == dev_id)
			continue;

		if (mlx5_is_mgt_ifc_pci_device(dev, sdev_id))
			continue;

		mlx5_core_warn(dev, "unrecognized dev_id (0x%x)\n", sdev_id);
		return -EPERM;
	}
	return 0;
}

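/* Best-effort check that a reset-now can be handled safely: a reachable
 * bridge, firmware fast teardown support, no SFs left behind, no
 * conflicting hotplug interrupts, and only recognized functions under
 * the same bridge.
 */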
static bool mlx5_is_reset_now_capable(struct mlx5_core_dev *dev,
				      u8 reset_method)
{
	struct pci_dev *bridge = dev->pdev->bus->self;
	u16 dev_id;
	int err;

	if (!bridge) {
		mlx5_core_warn(dev, "PCI bus bridge is not accessible\n");
		return false;
	}

	if (!MLX5_CAP_GEN(dev, fast_teardown)) {
		mlx5_core_warn(dev, "fast teardown is not supported by firmware\n");
		return false;
	}

	if (!mlx5_core_is_ecpf(dev) && !mlx5_sf_table_empty(dev)) {
		mlx5_core_warn(dev, "SFs should be removed before reset\n");
		return false;
	}

#if IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)
	if (reset_method != MLX5_MFRL_REG_PCI_RESET_METHOD_HOT_RESET) {
		err = mlx5_check_hotplug_interrupt(dev, bridge);
		if (err)
			return false;
	}
#endif

	err = pci_read_config_word(dev->pdev, PCI_DEVICE_ID, &dev_id);
	if (err)
		return false;
	return (!mlx5_check_dev_ids(dev, dev_id));
}

static void mlx5_sync_reset_request_event(struct work_struct *work)
{
	struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
						      reset_request_work);
	struct mlx5_core_dev *dev = fw_reset->dev;
	int err;

	err = mlx5_fw_reset_get_reset_method(dev, &fw_reset->reset_method);
	if (err)
		mlx5_core_warn(dev, "Failed reading MFRL, err %d\n", err);

	if (err || test_bit(MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, &fw_reset->reset_flags) ||
	    !mlx5_is_reset_now_capable(dev, fw_reset->reset_method)) {
		err = mlx5_fw_reset_set_reset_sync_nack(dev);
		mlx5_core_warn(dev, "PCI Sync FW Update Reset Nack %s\n",
			       err ? "Failed" : "Sent");
		return;
	}
	if (mlx5_sync_reset_set_reset_requested(dev))
		return;

	err = mlx5_fw_reset_set_reset_sync_ack(dev);
	if (err)
		mlx5_core_warn(dev, "PCI Sync FW Update Reset Ack Failed. Error code: %d\n", err);
	else
		mlx5_core_warn(dev, "PCI Sync FW Update Reset Ack. Device reset is expected.\n");
}

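/* Reset the device by toggling the secondary bus link from the bridge
 * above it. Config space of every function on the bus is saved and
 * locked first, then the link is disabled and retrained via Link
 * Control, and the function waits for the link to come back and for
 * firmware to answer config cycles before restoring state.
 */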
static int mlx5_pci_link_toggle(struct mlx5_core_dev *dev, u16 dev_id)
{
	struct pci_bus *bridge_bus = dev->pdev->bus;
	struct pci_dev *bridge = bridge_bus->self;
	unsigned long timeout;
	struct pci_dev *sdev;
	int cap, err;
	u16 reg16;

	cap = pci_find_capability(bridge, PCI_CAP_ID_EXP);
	if (!cap)
		return -EOPNOTSUPP;

	list_for_each_entry(sdev, &bridge_bus->devices, bus_list) {
		pci_save_state(sdev);
		pci_cfg_access_lock(sdev);
	}
	/* PCI link toggle */
	err = pcie_capability_set_word(bridge, PCI_EXP_LNKCTL, PCI_EXP_LNKCTL_LD);
	if (err) {
		err = pcibios_err_to_errno(err);
		goto restore;
	}
	msleep(500);
	err = pcie_capability_clear_word(bridge, PCI_EXP_LNKCTL, PCI_EXP_LNKCTL_LD);
	if (err) {
		err = pcibios_err_to_errno(err);
		goto restore;
	}

	/* Check link */
	if (!bridge->link_active_reporting) {
		mlx5_core_warn(dev, "No PCI link reporting capability\n");
		msleep(1000);
		goto restore;
	}

	timeout = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, PCI_TOGGLE));
	do {
		err = pci_read_config_word(bridge, cap + PCI_EXP_LNKSTA, &reg16);
		if (err) {
			err = pcibios_err_to_errno(err);
			goto restore;
		}
		if (reg16 & PCI_EXP_LNKSTA_DLLLA)
			break;
		msleep(20);
	} while (!time_after(jiffies, timeout));

	if (reg16 & PCI_EXP_LNKSTA_DLLLA) {
		mlx5_core_info(dev, "PCI Link up\n");
	} else {
		mlx5_core_err(dev, "PCI link not ready (0x%04x) after %llu ms\n",
			      reg16, mlx5_tout_ms(dev, PCI_TOGGLE));
		err = -ETIMEDOUT;
		goto restore;
	}

	do {
		err = pci_read_config_word(dev->pdev, PCI_DEVICE_ID, &reg16);
		if (err) {
			err = pcibios_err_to_errno(err);
			goto restore;
		}
		if (reg16 == dev_id)
			break;
		msleep(20);
	} while (!time_after(jiffies, timeout));

	if (reg16 == dev_id) {
		mlx5_core_info(dev, "Firmware responds to PCI config cycles again\n");
	} else {
		mlx5_core_err(dev, "Firmware is not responsive (0x%04x) after %llu ms\n",
			      reg16, mlx5_tout_ms(dev, PCI_TOGGLE));
		err = -ETIMEDOUT;
	}

restore:
	list_for_each_entry(sdev, &bridge_bus->devices, bus_list) {
		pci_cfg_access_unlock(sdev);
		pci_restore_state(sdev);
	}

	return err;
}

static int mlx5_pci_reset_bus(struct mlx5_core_dev *dev)
{
	if (!MLX5_CAP_GEN(dev, pcie_reset_using_hotreset_method))
		return -EOPNOTSUPP;

	return pci_reset_bus(dev->pdev);
}

static int mlx5_sync_pci_reset(struct mlx5_core_dev *dev, u8 reset_method)
{
	u16 dev_id;
	int err;

	err = pci_read_config_word(dev->pdev, PCI_DEVICE_ID, &dev_id);
	if (err)
		return pcibios_err_to_errno(err);
	err = mlx5_check_dev_ids(dev, dev_id);
	if (err)
		return err;

	switch (reset_method) {
	case MLX5_MFRL_REG_PCI_RESET_METHOD_LINK_TOGGLE:
		err = mlx5_pci_link_toggle(dev, dev_id);
		if (err)
			mlx5_core_warn(dev, "mlx5_pci_link_toggle failed\n");
		break;
	case MLX5_MFRL_REG_PCI_RESET_METHOD_HOT_RESET:
		err = mlx5_pci_reset_bus(dev);
		if (err)
			mlx5_core_warn(dev, "mlx5_pci_reset_bus failed\n");
		break;
	default:
		return -EOPNOTSUPP;
	}

	return err;
}

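/* Unload stage of the sync reset flow. After the unload, the driver acks
 * through the initialization segment and polls the firmware reset state:
 * TOGGLE_REQ means this host must perform the PCI reset itself, IDLE
 * means another agent has already done so, and DROP_MODE is acked while
 * polling continues at a lower frequency.
 */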
void mlx5_sync_reset_unload_flow(struct mlx5_core_dev *dev, bool locked)
{
	struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
	unsigned long timeout;
	int poll_freq = 20;
	bool reset_action;
	u8 rst_state;
	int err;

	if (locked)
		mlx5_unload_one_devl_locked(dev, false);
	else
		mlx5_unload_one(dev, false);

	if (!test_bit(MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, &fw_reset->reset_flags))
		return;

	mlx5_set_fw_rst_ack(dev);
	mlx5_core_warn(dev, "Sync Reset Unload done, device reset expected\n");

	reset_action = false;
	timeout = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, RESET_UNLOAD));
	do {
		rst_state = mlx5_get_fw_rst_state(dev);
		if (rst_state == MLX5_FW_RST_STATE_TOGGLE_REQ ||
		    rst_state == MLX5_FW_RST_STATE_IDLE) {
			reset_action = true;
			break;
		}
		if (rst_state == MLX5_FW_RST_STATE_DROP_MODE) {
			mlx5_core_info(dev, "Sync Reset Drop mode ack\n");
			mlx5_set_fw_rst_ack(dev);
			poll_freq = 1000;
		}
		msleep(poll_freq);
	} while (!time_after(jiffies, timeout));

	if (!reset_action) {
		mlx5_core_err(dev, "Got timeout waiting for sync reset action, state = %u\n",
			      rst_state);
		fw_reset->ret = -ETIMEDOUT;
		goto done;
	}

	mlx5_core_warn(dev, "Sync Reset, got reset action. rst_state = %u\n",
		       rst_state);
	if (rst_state == MLX5_FW_RST_STATE_TOGGLE_REQ) {
		err = mlx5_sync_pci_reset(dev, fw_reset->reset_method);
		if (err) {
			mlx5_core_warn(dev, "mlx5_sync_pci_reset failed, err %d\n",
				       err);
			fw_reset->ret = err;
		}
	}

done:
	clear_bit(MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, &fw_reset->reset_flags);
}

static void mlx5_sync_reset_now_event(struct work_struct *work)
{
	struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
						      reset_now_work);
	struct mlx5_core_dev *dev = fw_reset->dev;
	int err;

	if (mlx5_sync_reset_clear_reset_requested(dev, false))
		return;

	mlx5_core_warn(dev, "Sync Reset now. Device is going to reset.\n");

	err = mlx5_cmd_fast_teardown_hca(dev);
	if (err) {
		mlx5_core_warn(dev, "Fast teardown failed, no reset done, err %d\n", err);
		goto done;
	}

	err = mlx5_sync_pci_reset(dev, fw_reset->reset_method);
	if (err) {
		mlx5_core_warn(dev, "mlx5_sync_pci_reset failed, no reset done, err %d\n", err);
		set_bit(MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED, &fw_reset->reset_flags);
	}

	mlx5_enter_error_state(dev, true);
done:
	fw_reset->ret = err;
	mlx5_fw_reset_complete_reload(dev);
}

static void mlx5_sync_reset_unload_event(struct work_struct *work)
{
	struct mlx5_fw_reset *fw_reset;
	struct mlx5_core_dev *dev;
	int err;

	fw_reset = container_of(work, struct mlx5_fw_reset, reset_unload_work);
	dev = fw_reset->dev;

	if (mlx5_sync_reset_clear_reset_requested(dev, false))
		return;

	set_bit(MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, &fw_reset->reset_flags);
	mlx5_core_warn(dev, "Sync Reset Unload. Function is forced down.\n");

	err = mlx5_cmd_fast_teardown_hca(dev);
	if (err)
		mlx5_core_warn(dev, "Fast teardown failed, unloading, err %d\n", err);
	else
		mlx5_enter_error_state(dev, true);

	mlx5_fw_reset_complete_reload(dev);
}

static void mlx5_sync_reset_abort_event(struct work_struct *work)
{
	struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
						      reset_abort_work);
	struct mlx5_core_dev *dev = fw_reset->dev;

	if (mlx5_sync_reset_clear_reset_requested(dev, true))
		return;
	mlx5_core_warn(dev, "PCI Sync FW Update Reset Aborted.\n");
}

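/* Demultiplex the PCI sync-for-fw-update EQE: each reset stage reported
 * by firmware (request/unload/now/abort) runs in its own work item so
 * the EQ notifier path never blocks.
 */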
static void mlx5_sync_reset_events_handle(struct mlx5_fw_reset *fw_reset, struct mlx5_eqe *eqe)
{
	struct mlx5_eqe_sync_fw_update *sync_fw_update_eqe;
	u8 sync_event_rst_type;

	sync_fw_update_eqe = &eqe->data.sync_fw_update;
	sync_event_rst_type = sync_fw_update_eqe->sync_rst_state & SYNC_RST_STATE_MASK;
	switch (sync_event_rst_type) {
	case MLX5_SYNC_RST_STATE_RESET_REQUEST:
		queue_work(fw_reset->wq, &fw_reset->reset_request_work);
		break;
	case MLX5_SYNC_RST_STATE_RESET_UNLOAD:
		queue_work(fw_reset->wq, &fw_reset->reset_unload_work);
		break;
	case MLX5_SYNC_RST_STATE_RESET_NOW:
		queue_work(fw_reset->wq, &fw_reset->reset_now_work);
		break;
	case MLX5_SYNC_RST_STATE_RESET_ABORT:
		queue_work(fw_reset->wq, &fw_reset->reset_abort_work);
		break;
	}
}

static void mlx5_sync_reset_timeout_work(struct work_struct *work)
{
	struct delayed_work *dwork = container_of(work, struct delayed_work,
						  work);
	struct mlx5_fw_reset *fw_reset =
		container_of(dwork, struct mlx5_fw_reset, reset_timeout_work);
	struct mlx5_core_dev *dev = fw_reset->dev;

	if (mlx5_sync_reset_clear_reset_requested(dev, true))
		return;
	mlx5_core_warn(dev, "PCI Sync FW Update Reset Timeout.\n");
}

static int fw_reset_event_notifier(struct notifier_block *nb, unsigned long action, void *data)
{
	struct mlx5_fw_reset *fw_reset = mlx5_nb_cof(nb, struct mlx5_fw_reset, nb);
	struct mlx5_eqe *eqe = data;

	if (test_bit(MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, &fw_reset->reset_flags))
		return NOTIFY_DONE;

	switch (eqe->sub_type) {
	case MLX5_GENERAL_SUBTYPE_FW_LIVE_PATCH_EVENT:
		queue_work(fw_reset->wq, &fw_reset->fw_live_patch_work);
		break;
	case MLX5_GENERAL_SUBTYPE_PCI_SYNC_FOR_FW_UPDATE_EVENT:
		mlx5_sync_reset_events_handle(fw_reset, eqe);
		break;
	default:
		return NOTIFY_DONE;
	}

	return NOTIFY_OK;
}

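/* Wait for a sync reset initiated by this host to complete. The wait
 * budget covers the PCI_SYNC_UPDATE timeout, extended by RESET_UNLOAD
 * when firmware supports the driver-unload handshake.
 */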
int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev)
{
	unsigned long pci_sync_update_timeout = mlx5_tout_ms(dev, PCI_SYNC_UPDATE);
	struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
	unsigned long timeout;
	int err;

	if (MLX5_CAP_GEN(dev, pci_sync_for_fw_update_with_driver_unload))
		pci_sync_update_timeout += mlx5_tout_ms(dev, RESET_UNLOAD);
	timeout = msecs_to_jiffies(pci_sync_update_timeout);
	if (!wait_for_completion_timeout(&fw_reset->done, timeout)) {
		mlx5_core_warn(dev, "FW sync reset timeout after %lu seconds\n",
			       pci_sync_update_timeout / 1000);
		err = -ETIMEDOUT;
		goto out;
	}
	err = fw_reset->ret;
	if (test_and_clear_bit(MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED, &fw_reset->reset_flags)) {
		mlx5_unload_one_devl_locked(dev, false);
		mlx5_load_one_devl_locked(dev, true);
	}
out:
	clear_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags);
	return err;
}

void mlx5_fw_reset_events_start(struct mlx5_core_dev *dev)
{
	struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;

	if (!fw_reset)
		return;

	MLX5_NB_INIT(&fw_reset->nb, fw_reset_event_notifier, GENERAL_EVENT);
	mlx5_eq_notifier_register(dev, &fw_reset->nb);
}

void mlx5_fw_reset_events_stop(struct mlx5_core_dev *dev)
{
	struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;

	if (!fw_reset)
		return;

	mlx5_eq_notifier_unregister(dev, &fw_reset->nb);
}

void mlx5_drain_fw_reset(struct mlx5_core_dev *dev)
{
	struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;

	if (!fw_reset)
		return;

	set_bit(MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, &fw_reset->reset_flags);
	cancel_work_sync(&fw_reset->fw_live_patch_work);
	cancel_work_sync(&fw_reset->reset_request_work);
	cancel_work_sync(&fw_reset->reset_unload_work);
	cancel_work_sync(&fw_reset->reset_reload_work);
	cancel_work_sync(&fw_reset->reset_now_work);
	cancel_work_sync(&fw_reset->reset_abort_work);
	cancel_delayed_work(&fw_reset->reset_timeout_work);
}

static const struct devlink_param mlx5_fw_reset_devlink_params[] = {
	DEVLINK_PARAM_GENERIC(ENABLE_REMOTE_DEV_RESET, BIT(DEVLINK_PARAM_CMODE_RUNTIME),
			      mlx5_fw_reset_enable_remote_dev_reset_get,
			      mlx5_fw_reset_enable_remote_dev_reset_set, NULL),
};

int mlx5_fw_reset_init(struct mlx5_core_dev *dev)
{
	struct mlx5_fw_reset *fw_reset;
	int err;

	if (!MLX5_CAP_MCAM_REG(dev, mfrl))
		return 0;

	fw_reset = kzalloc(sizeof(*fw_reset), GFP_KERNEL);
	if (!fw_reset)
		return -ENOMEM;
	fw_reset->wq = create_singlethread_workqueue("mlx5_fw_reset_events");
	if (!fw_reset->wq) {
		kfree(fw_reset);
		return -ENOMEM;
	}

	fw_reset->dev = dev;
	dev->priv.fw_reset = fw_reset;

	err = devl_params_register(priv_to_devlink(dev),
				   mlx5_fw_reset_devlink_params,
				   ARRAY_SIZE(mlx5_fw_reset_devlink_params));
	if (err) {
		destroy_workqueue(fw_reset->wq);
		kfree(fw_reset);
		return err;
	}

	INIT_WORK(&fw_reset->fw_live_patch_work, mlx5_fw_live_patch_event);
	INIT_WORK(&fw_reset->reset_request_work, mlx5_sync_reset_request_event);
	INIT_WORK(&fw_reset->reset_unload_work, mlx5_sync_reset_unload_event);
	INIT_WORK(&fw_reset->reset_reload_work, mlx5_sync_reset_reload_work);
	INIT_WORK(&fw_reset->reset_now_work, mlx5_sync_reset_now_event);
	INIT_WORK(&fw_reset->reset_abort_work, mlx5_sync_reset_abort_event);
	INIT_DELAYED_WORK(&fw_reset->reset_timeout_work,
			  mlx5_sync_reset_timeout_work);

	init_completion(&fw_reset->done);
	return 0;
}

void mlx5_fw_reset_cleanup(struct mlx5_core_dev *dev)
{
	struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;

	if (!fw_reset)
		return;

	devl_params_unregister(priv_to_devlink(dev),
			       mlx5_fw_reset_devlink_params,
			       ARRAY_SIZE(mlx5_fw_reset_devlink_params));
	destroy_workqueue(fw_reset->wq);
	kfree(dev->priv.fw_reset);
}