1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB 2 /* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. */ 3 4 #include <devlink.h> 5 6 #include "fw_reset.h" 7 #include "diag/fw_tracer.h" 8 #include "lib/tout.h" 9 #include "sf/sf.h" 10 11 enum { 12 MLX5_FW_RESET_FLAGS_RESET_REQUESTED, 13 MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, 14 MLX5_FW_RESET_FLAGS_PENDING_COMP, 15 MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, 16 MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED, 17 MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, 18 }; 19 20 struct mlx5_fw_reset { 21 struct mlx5_core_dev *dev; 22 struct mlx5_nb nb; 23 struct workqueue_struct *wq; 24 struct work_struct fw_live_patch_work; 25 struct work_struct reset_request_work; 26 struct work_struct reset_unload_work; 27 struct work_struct reset_reload_work; 28 struct work_struct reset_now_work; 29 struct work_struct reset_abort_work; 30 struct delayed_work reset_timeout_work; 31 unsigned long reset_flags; 32 u8 reset_method; 33 struct timer_list timer; 34 struct completion done; 35 int ret; 36 }; 37 38 enum { 39 MLX5_FW_RST_STATE_IDLE = 0, 40 MLX5_FW_RST_STATE_TOGGLE_REQ = 4, 41 MLX5_FW_RST_STATE_DROP_MODE = 5, 42 }; 43 44 enum { 45 MLX5_RST_STATE_BIT_NUM = 12, 46 MLX5_RST_ACK_BIT_NUM = 22, 47 }; 48 49 static u8 mlx5_get_fw_rst_state(struct mlx5_core_dev *dev) 50 { 51 return (ioread32be(&dev->iseg->initializing) >> MLX5_RST_STATE_BIT_NUM) & 0xF; 52 } 53 54 static void mlx5_set_fw_rst_ack(struct mlx5_core_dev *dev) 55 { 56 iowrite32be(BIT(MLX5_RST_ACK_BIT_NUM), &dev->iseg->initializing); 57 } 58 59 static int mlx5_fw_reset_enable_remote_dev_reset_set(struct devlink *devlink, u32 id, 60 struct devlink_param_gset_ctx *ctx, 61 struct netlink_ext_ack *extack) 62 { 63 struct mlx5_core_dev *dev = devlink_priv(devlink); 64 struct mlx5_fw_reset *fw_reset; 65 66 fw_reset = dev->priv.fw_reset; 67 68 if (ctx->val.vbool) 69 clear_bit(MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, &fw_reset->reset_flags); 70 else 71 set_bit(MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, &fw_reset->reset_flags); 72 return 0; 73 } 74 75 static int mlx5_fw_reset_enable_remote_dev_reset_get(struct devlink *devlink, u32 id, 76 struct devlink_param_gset_ctx *ctx, 77 struct netlink_ext_ack *extack) 78 { 79 struct mlx5_core_dev *dev = devlink_priv(devlink); 80 struct mlx5_fw_reset *fw_reset; 81 82 fw_reset = dev->priv.fw_reset; 83 84 ctx->val.vbool = !test_bit(MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, 85 &fw_reset->reset_flags); 86 return 0; 87 } 88 89 static int mlx5_reg_mfrl_set(struct mlx5_core_dev *dev, u8 reset_level, 90 u8 reset_type_sel, u8 sync_resp, bool sync_start) 91 { 92 u32 out[MLX5_ST_SZ_DW(mfrl_reg)] = {}; 93 u32 in[MLX5_ST_SZ_DW(mfrl_reg)] = {}; 94 95 MLX5_SET(mfrl_reg, in, reset_level, reset_level); 96 MLX5_SET(mfrl_reg, in, rst_type_sel, reset_type_sel); 97 MLX5_SET(mfrl_reg, in, pci_sync_for_fw_update_resp, sync_resp); 98 MLX5_SET(mfrl_reg, in, pci_sync_for_fw_update_start, sync_start); 99 100 return mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_MFRL, 0, 1); 101 } 102 103 static int mlx5_reg_mfrl_query(struct mlx5_core_dev *dev, u8 *reset_level, 104 u8 *reset_type, u8 *reset_state, u8 *reset_method) 105 { 106 u32 out[MLX5_ST_SZ_DW(mfrl_reg)] = {}; 107 u32 in[MLX5_ST_SZ_DW(mfrl_reg)] = {}; 108 int err; 109 110 err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_MFRL, 0, 0); 111 if (err) 112 return err; 113 114 if (reset_level) 115 *reset_level = MLX5_GET(mfrl_reg, out, reset_level); 116 if (reset_type) 117 *reset_type = MLX5_GET(mfrl_reg, out, reset_type); 118 if (reset_state) 119 *reset_state = MLX5_GET(mfrl_reg, out, reset_state); 120 if (reset_method) 121 *reset_method = MLX5_GET(mfrl_reg, out, pci_reset_req_method); 122 123 return 0; 124 } 125 126 int mlx5_fw_reset_query(struct mlx5_core_dev *dev, u8 *reset_level, u8 *reset_type) 127 { 128 return mlx5_reg_mfrl_query(dev, reset_level, reset_type, NULL, NULL); 129 } 130 131 static int mlx5_fw_reset_get_reset_method(struct mlx5_core_dev *dev, 132 u8 *reset_method) 133 { 134 if (!MLX5_CAP_GEN(dev, pcie_reset_using_hotreset_method)) { 135 *reset_method = MLX5_MFRL_REG_PCI_RESET_METHOD_LINK_TOGGLE; 136 return 0; 137 } 138 139 return mlx5_reg_mfrl_query(dev, NULL, NULL, NULL, reset_method); 140 } 141 142 static int mlx5_fw_reset_get_reset_state_err(struct mlx5_core_dev *dev, 143 struct netlink_ext_ack *extack) 144 { 145 u8 reset_state; 146 147 if (mlx5_reg_mfrl_query(dev, NULL, NULL, &reset_state, NULL)) 148 goto out; 149 150 if (!reset_state) 151 return 0; 152 153 switch (reset_state) { 154 case MLX5_MFRL_REG_RESET_STATE_IN_NEGOTIATION: 155 case MLX5_MFRL_REG_RESET_STATE_RESET_IN_PROGRESS: 156 NL_SET_ERR_MSG_MOD(extack, "Sync reset still in progress"); 157 return -EBUSY; 158 case MLX5_MFRL_REG_RESET_STATE_NEG_TIMEOUT: 159 NL_SET_ERR_MSG_MOD(extack, "Sync reset negotiation timeout"); 160 return -ETIMEDOUT; 161 case MLX5_MFRL_REG_RESET_STATE_NACK: 162 NL_SET_ERR_MSG_MOD(extack, "One of the hosts disabled reset"); 163 return -EPERM; 164 case MLX5_MFRL_REG_RESET_STATE_UNLOAD_TIMEOUT: 165 NL_SET_ERR_MSG_MOD(extack, "Sync reset unload timeout"); 166 return -ETIMEDOUT; 167 } 168 169 out: 170 NL_SET_ERR_MSG_MOD(extack, "Sync reset failed"); 171 return -EIO; 172 } 173 174 int mlx5_fw_reset_set_reset_sync(struct mlx5_core_dev *dev, u8 reset_type_sel, 175 struct netlink_ext_ack *extack) 176 { 177 struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; 178 u32 out[MLX5_ST_SZ_DW(mfrl_reg)] = {}; 179 u32 in[MLX5_ST_SZ_DW(mfrl_reg)] = {}; 180 int err, rst_res; 181 182 set_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags); 183 184 MLX5_SET(mfrl_reg, in, reset_level, MLX5_MFRL_REG_RESET_LEVEL3); 185 MLX5_SET(mfrl_reg, in, rst_type_sel, reset_type_sel); 186 MLX5_SET(mfrl_reg, in, pci_sync_for_fw_update_start, 1); 187 err = mlx5_access_reg(dev, in, sizeof(in), out, sizeof(out), 188 MLX5_REG_MFRL, 0, 1, false); 189 if (!err) 190 return 0; 191 192 clear_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags); 193 if (err == -EREMOTEIO && MLX5_CAP_MCAM_FEATURE(dev, reset_state)) { 194 rst_res = mlx5_fw_reset_get_reset_state_err(dev, extack); 195 return rst_res ? rst_res : err; 196 } 197 198 NL_SET_ERR_MSG_MOD(extack, "Sync reset command failed"); 199 return mlx5_cmd_check(dev, err, in, out); 200 } 201 202 int mlx5_fw_reset_verify_fw_complete(struct mlx5_core_dev *dev, 203 struct netlink_ext_ack *extack) 204 { 205 u8 rst_state; 206 int err; 207 208 err = mlx5_fw_reset_get_reset_state_err(dev, extack); 209 if (err) 210 return err; 211 212 rst_state = mlx5_get_fw_rst_state(dev); 213 if (!rst_state) 214 return 0; 215 216 mlx5_core_err(dev, "Sync reset did not complete, state=%d\n", rst_state); 217 NL_SET_ERR_MSG_MOD(extack, "Sync reset did not complete successfully"); 218 return rst_state; 219 } 220 221 int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev) 222 { 223 return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL0, 0, 0, false); 224 } 225 226 static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev) 227 { 228 struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; 229 struct devlink *devlink = priv_to_devlink(dev); 230 231 /* if this is the driver that initiated the fw reset, devlink completed the reload */ 232 if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) { 233 complete(&fw_reset->done); 234 } else { 235 mlx5_sync_reset_unload_flow(dev, false); 236 if (mlx5_health_wait_pci_up(dev)) 237 mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n"); 238 else 239 mlx5_load_one(dev, true); 240 devl_lock(devlink); 241 devlink_remote_reload_actions_performed(devlink, 0, 242 BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) | 243 BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE)); 244 devl_unlock(devlink); 245 } 246 } 247 248 static void mlx5_stop_sync_reset_poll(struct mlx5_core_dev *dev) 249 { 250 struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; 251 252 timer_delete_sync(&fw_reset->timer); 253 } 254 255 static int mlx5_sync_reset_clear_reset_requested(struct mlx5_core_dev *dev, bool poll_health) 256 { 257 struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; 258 259 if (!test_and_clear_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags)) { 260 mlx5_core_warn(dev, "Reset request was already cleared\n"); 261 return -EALREADY; 262 } 263 264 if (current_work() != &fw_reset->reset_timeout_work.work) 265 cancel_delayed_work(&fw_reset->reset_timeout_work); 266 mlx5_stop_sync_reset_poll(dev); 267 if (poll_health) 268 mlx5_start_health_poll(dev); 269 return 0; 270 } 271 272 static void mlx5_sync_reset_reload_work(struct work_struct *work) 273 { 274 struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset, 275 reset_reload_work); 276 struct mlx5_core_dev *dev = fw_reset->dev; 277 278 mlx5_sync_reset_clear_reset_requested(dev, false); 279 mlx5_enter_error_state(dev, true); 280 mlx5_fw_reset_complete_reload(dev); 281 } 282 283 #define MLX5_RESET_POLL_INTERVAL (HZ / 10) 284 static void poll_sync_reset(struct timer_list *t) 285 { 286 struct mlx5_fw_reset *fw_reset = timer_container_of(fw_reset, t, 287 timer); 288 struct mlx5_core_dev *dev = fw_reset->dev; 289 u32 fatal_error; 290 291 if (!test_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags)) 292 return; 293 294 fatal_error = mlx5_health_check_fatal_sensors(dev); 295 296 if (fatal_error) { 297 mlx5_core_warn(dev, "Got Device Reset\n"); 298 if (!test_bit(MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, &fw_reset->reset_flags)) 299 queue_work(fw_reset->wq, &fw_reset->reset_reload_work); 300 else 301 mlx5_core_err(dev, "Device is being removed, Drop new reset work\n"); 302 return; 303 } 304 305 mod_timer(&fw_reset->timer, round_jiffies(jiffies + MLX5_RESET_POLL_INTERVAL)); 306 } 307 308 static void mlx5_start_sync_reset_poll(struct mlx5_core_dev *dev) 309 { 310 struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; 311 312 timer_setup(&fw_reset->timer, poll_sync_reset, 0); 313 fw_reset->timer.expires = round_jiffies(jiffies + MLX5_RESET_POLL_INTERVAL); 314 add_timer(&fw_reset->timer); 315 } 316 317 static int mlx5_fw_reset_set_reset_sync_ack(struct mlx5_core_dev *dev) 318 { 319 return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL3, 0, 1, false); 320 } 321 322 static int mlx5_fw_reset_set_reset_sync_nack(struct mlx5_core_dev *dev) 323 { 324 return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL3, 0, 2, false); 325 } 326 327 static int mlx5_sync_reset_set_reset_requested(struct mlx5_core_dev *dev) 328 { 329 struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; 330 331 if (test_and_set_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags)) { 332 mlx5_core_warn(dev, "Reset request was already set\n"); 333 return -EALREADY; 334 } 335 mlx5_stop_health_poll(dev, true); 336 mlx5_start_sync_reset_poll(dev); 337 338 if (!test_bit(MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, 339 &fw_reset->reset_flags)) 340 schedule_delayed_work(&fw_reset->reset_timeout_work, 341 msecs_to_jiffies(mlx5_tout_ms(dev, PCI_SYNC_UPDATE))); 342 return 0; 343 } 344 345 static void mlx5_fw_live_patch_event(struct work_struct *work) 346 { 347 struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset, 348 fw_live_patch_work); 349 struct mlx5_core_dev *dev = fw_reset->dev; 350 351 mlx5_core_info(dev, "Live patch updated firmware version: %d.%d.%d\n", fw_rev_maj(dev), 352 fw_rev_min(dev), fw_rev_sub(dev)); 353 354 if (mlx5_fw_tracer_reload(dev->tracer)) 355 mlx5_core_err(dev, "Failed to reload FW tracer\n"); 356 } 357 358 #if IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE) 359 static int mlx5_check_hotplug_interrupt(struct mlx5_core_dev *dev, 360 struct pci_dev *bridge) 361 { 362 u16 reg16; 363 int err; 364 365 err = pcie_capability_read_word(bridge, PCI_EXP_SLTCTL, ®16); 366 if (err) 367 return err; 368 369 if ((reg16 & PCI_EXP_SLTCTL_HPIE) && (reg16 & PCI_EXP_SLTCTL_DLLSCE)) { 370 mlx5_core_warn(dev, "FW reset is not supported as HotPlug is enabled\n"); 371 return -EOPNOTSUPP; 372 } 373 374 return 0; 375 } 376 #endif 377 378 static const struct pci_device_id mgt_ifc_device_ids[] = { 379 { PCI_VDEVICE(MELLANOX, 0xc2d2) }, /* BlueField1 MGT interface device ID */ 380 { PCI_VDEVICE(MELLANOX, 0xc2d3) }, /* BlueField2 MGT interface device ID */ 381 { PCI_VDEVICE(MELLANOX, 0xc2d4) }, /* BlueField3-Lx MGT interface device ID */ 382 { PCI_VDEVICE(MELLANOX, 0xc2d5) }, /* BlueField3 MGT interface device ID */ 383 { PCI_VDEVICE(MELLANOX, 0xc2d6) }, /* BlueField4 MGT interface device ID */ 384 }; 385 386 static bool mlx5_is_mgt_ifc_pci_device(struct mlx5_core_dev *dev, u16 dev_id) 387 { 388 int i; 389 390 for (i = 0; i < ARRAY_SIZE(mgt_ifc_device_ids); ++i) 391 if (mgt_ifc_device_ids[i].device == dev_id) 392 return true; 393 394 return false; 395 } 396 397 static int mlx5_check_dev_ids(struct mlx5_core_dev *dev, u16 dev_id) 398 { 399 struct pci_bus *bridge_bus = dev->pdev->bus; 400 struct pci_dev *sdev; 401 u16 sdev_id; 402 int err; 403 404 /* Check that all functions under the pci bridge are PFs of 405 * this device otherwise fail this function. 406 */ 407 list_for_each_entry(sdev, &bridge_bus->devices, bus_list) { 408 err = pci_read_config_word(sdev, PCI_DEVICE_ID, &sdev_id); 409 if (err) 410 return pcibios_err_to_errno(err); 411 412 if (sdev_id == dev_id) 413 continue; 414 415 if (mlx5_is_mgt_ifc_pci_device(dev, sdev_id)) 416 continue; 417 418 mlx5_core_warn(dev, "unrecognized dev_id (0x%x)\n", sdev_id); 419 return -EPERM; 420 } 421 return 0; 422 } 423 424 static bool mlx5_is_reset_now_capable(struct mlx5_core_dev *dev, 425 u8 reset_method) 426 { 427 struct pci_dev *bridge = dev->pdev->bus->self; 428 u16 dev_id; 429 int err; 430 431 if (!bridge) { 432 mlx5_core_warn(dev, "PCI bus bridge is not accessible\n"); 433 return false; 434 } 435 436 if (!MLX5_CAP_GEN(dev, fast_teardown)) { 437 mlx5_core_warn(dev, "fast teardown is not supported by firmware\n"); 438 return false; 439 } 440 441 if (!mlx5_core_is_ecpf(dev) && !mlx5_sf_table_empty(dev)) { 442 mlx5_core_warn(dev, "SFs should be removed before reset\n"); 443 return false; 444 } 445 446 #if IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE) 447 if (reset_method != MLX5_MFRL_REG_PCI_RESET_METHOD_HOT_RESET) { 448 err = mlx5_check_hotplug_interrupt(dev, bridge); 449 if (err) 450 return false; 451 } 452 #endif 453 454 err = pci_read_config_word(dev->pdev, PCI_DEVICE_ID, &dev_id); 455 if (err) 456 return false; 457 return (!mlx5_check_dev_ids(dev, dev_id)); 458 } 459 460 static void mlx5_sync_reset_request_event(struct work_struct *work) 461 { 462 struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset, 463 reset_request_work); 464 struct mlx5_core_dev *dev = fw_reset->dev; 465 int err; 466 467 err = mlx5_fw_reset_get_reset_method(dev, &fw_reset->reset_method); 468 if (err) 469 mlx5_core_warn(dev, "Failed reading MFRL, err %d\n", err); 470 471 if (err || test_bit(MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, &fw_reset->reset_flags) || 472 !mlx5_is_reset_now_capable(dev, fw_reset->reset_method)) { 473 err = mlx5_fw_reset_set_reset_sync_nack(dev); 474 mlx5_core_warn(dev, "PCI Sync FW Update Reset Nack %s", 475 err ? "Failed" : "Sent"); 476 return; 477 } 478 if (mlx5_sync_reset_set_reset_requested(dev)) 479 return; 480 481 err = mlx5_fw_reset_set_reset_sync_ack(dev); 482 if (err) 483 mlx5_core_warn(dev, "PCI Sync FW Update Reset Ack Failed. Error code: %d\n", err); 484 else 485 mlx5_core_warn(dev, "PCI Sync FW Update Reset Ack. Device reset is expected.\n"); 486 } 487 488 static int mlx5_pci_link_toggle(struct mlx5_core_dev *dev, u16 dev_id) 489 { 490 struct pci_bus *bridge_bus = dev->pdev->bus; 491 struct pci_dev *bridge = bridge_bus->self; 492 unsigned long timeout; 493 struct pci_dev *sdev; 494 int cap, err; 495 u16 reg16; 496 497 cap = pci_find_capability(bridge, PCI_CAP_ID_EXP); 498 if (!cap) 499 return -EOPNOTSUPP; 500 501 list_for_each_entry(sdev, &bridge_bus->devices, bus_list) { 502 pci_save_state(sdev); 503 pci_cfg_access_lock(sdev); 504 } 505 /* PCI link toggle */ 506 err = pcie_capability_set_word(bridge, PCI_EXP_LNKCTL, PCI_EXP_LNKCTL_LD); 507 if (err) 508 return pcibios_err_to_errno(err); 509 msleep(500); 510 err = pcie_capability_clear_word(bridge, PCI_EXP_LNKCTL, PCI_EXP_LNKCTL_LD); 511 if (err) 512 return pcibios_err_to_errno(err); 513 514 /* Check link */ 515 if (!bridge->link_active_reporting) { 516 mlx5_core_warn(dev, "No PCI link reporting capability\n"); 517 msleep(1000); 518 goto restore; 519 } 520 521 timeout = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, PCI_TOGGLE)); 522 do { 523 err = pci_read_config_word(bridge, cap + PCI_EXP_LNKSTA, ®16); 524 if (err) 525 return pcibios_err_to_errno(err); 526 if (reg16 & PCI_EXP_LNKSTA_DLLLA) 527 break; 528 msleep(20); 529 } while (!time_after(jiffies, timeout)); 530 531 if (reg16 & PCI_EXP_LNKSTA_DLLLA) { 532 mlx5_core_info(dev, "PCI Link up\n"); 533 } else { 534 mlx5_core_err(dev, "PCI link not ready (0x%04x) after %llu ms\n", 535 reg16, mlx5_tout_ms(dev, PCI_TOGGLE)); 536 err = -ETIMEDOUT; 537 goto restore; 538 } 539 540 do { 541 err = pci_read_config_word(dev->pdev, PCI_DEVICE_ID, ®16); 542 if (err) 543 return pcibios_err_to_errno(err); 544 if (reg16 == dev_id) 545 break; 546 msleep(20); 547 } while (!time_after(jiffies, timeout)); 548 549 if (reg16 == dev_id) { 550 mlx5_core_info(dev, "Firmware responds to PCI config cycles again\n"); 551 } else { 552 mlx5_core_err(dev, "Firmware is not responsive (0x%04x) after %llu ms\n", 553 reg16, mlx5_tout_ms(dev, PCI_TOGGLE)); 554 err = -ETIMEDOUT; 555 } 556 557 restore: 558 list_for_each_entry(sdev, &bridge_bus->devices, bus_list) { 559 pci_cfg_access_unlock(sdev); 560 pci_restore_state(sdev); 561 } 562 563 return err; 564 } 565 566 static int mlx5_pci_reset_bus(struct mlx5_core_dev *dev) 567 { 568 if (!MLX5_CAP_GEN(dev, pcie_reset_using_hotreset_method)) 569 return -EOPNOTSUPP; 570 571 return pci_reset_bus(dev->pdev); 572 } 573 574 static int mlx5_sync_pci_reset(struct mlx5_core_dev *dev, u8 reset_method) 575 { 576 u16 dev_id; 577 int err; 578 579 err = pci_read_config_word(dev->pdev, PCI_DEVICE_ID, &dev_id); 580 if (err) 581 return pcibios_err_to_errno(err); 582 err = mlx5_check_dev_ids(dev, dev_id); 583 if (err) 584 return err; 585 586 switch (reset_method) { 587 case MLX5_MFRL_REG_PCI_RESET_METHOD_LINK_TOGGLE: 588 err = mlx5_pci_link_toggle(dev, dev_id); 589 if (err) 590 mlx5_core_warn(dev, "mlx5_pci_link_toggle failed\n"); 591 break; 592 case MLX5_MFRL_REG_PCI_RESET_METHOD_HOT_RESET: 593 err = mlx5_pci_reset_bus(dev); 594 if (err) 595 mlx5_core_warn(dev, "mlx5_pci_reset_bus failed\n"); 596 break; 597 default: 598 return -EOPNOTSUPP; 599 } 600 601 return err; 602 } 603 604 void mlx5_sync_reset_unload_flow(struct mlx5_core_dev *dev, bool locked) 605 { 606 struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; 607 unsigned long timeout; 608 int poll_freq = 20; 609 bool reset_action; 610 u8 rst_state; 611 int err; 612 613 if (locked) 614 mlx5_unload_one_devl_locked(dev, false); 615 else 616 mlx5_unload_one(dev, false); 617 618 if (!test_bit(MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, &fw_reset->reset_flags)) 619 return; 620 621 mlx5_set_fw_rst_ack(dev); 622 mlx5_core_warn(dev, "Sync Reset Unload done, device reset expected\n"); 623 624 reset_action = false; 625 timeout = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, RESET_UNLOAD)); 626 do { 627 rst_state = mlx5_get_fw_rst_state(dev); 628 if (rst_state == MLX5_FW_RST_STATE_TOGGLE_REQ || 629 rst_state == MLX5_FW_RST_STATE_IDLE) { 630 reset_action = true; 631 break; 632 } 633 if (rst_state == MLX5_FW_RST_STATE_DROP_MODE) { 634 mlx5_core_info(dev, "Sync Reset Drop mode ack\n"); 635 mlx5_set_fw_rst_ack(dev); 636 poll_freq = 1000; 637 } 638 msleep(poll_freq); 639 } while (!time_after(jiffies, timeout)); 640 641 if (!reset_action) { 642 mlx5_core_err(dev, "Got timeout waiting for sync reset action, state = %u\n", 643 rst_state); 644 fw_reset->ret = -ETIMEDOUT; 645 goto done; 646 } 647 648 mlx5_core_warn(dev, "Sync Reset, got reset action. rst_state = %u\n", 649 rst_state); 650 if (rst_state == MLX5_FW_RST_STATE_TOGGLE_REQ) { 651 err = mlx5_sync_pci_reset(dev, fw_reset->reset_method); 652 if (err) { 653 mlx5_core_warn(dev, "mlx5_sync_pci_reset failed, err %d\n", 654 err); 655 fw_reset->ret = err; 656 } 657 } 658 659 done: 660 clear_bit(MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, &fw_reset->reset_flags); 661 } 662 663 static void mlx5_sync_reset_now_event(struct work_struct *work) 664 { 665 struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset, 666 reset_now_work); 667 struct mlx5_core_dev *dev = fw_reset->dev; 668 int err; 669 670 if (mlx5_sync_reset_clear_reset_requested(dev, false)) 671 return; 672 673 mlx5_core_warn(dev, "Sync Reset now. Device is going to reset.\n"); 674 675 err = mlx5_cmd_fast_teardown_hca(dev); 676 if (err) { 677 mlx5_core_warn(dev, "Fast teardown failed, no reset done, err %d\n", err); 678 goto done; 679 } 680 681 err = mlx5_sync_pci_reset(dev, fw_reset->reset_method); 682 if (err) { 683 mlx5_core_warn(dev, "mlx5_sync_pci_reset failed, no reset done, err %d\n", err); 684 set_bit(MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED, &fw_reset->reset_flags); 685 } 686 687 mlx5_enter_error_state(dev, true); 688 done: 689 fw_reset->ret = err; 690 mlx5_fw_reset_complete_reload(dev); 691 } 692 693 static void mlx5_sync_reset_unload_event(struct work_struct *work) 694 { 695 struct mlx5_fw_reset *fw_reset; 696 struct mlx5_core_dev *dev; 697 int err; 698 699 fw_reset = container_of(work, struct mlx5_fw_reset, reset_unload_work); 700 dev = fw_reset->dev; 701 702 if (mlx5_sync_reset_clear_reset_requested(dev, false)) 703 return; 704 705 set_bit(MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, &fw_reset->reset_flags); 706 mlx5_core_warn(dev, "Sync Reset Unload. Function is forced down.\n"); 707 708 err = mlx5_cmd_fast_teardown_hca(dev); 709 if (err) 710 mlx5_core_warn(dev, "Fast teardown failed, unloading, err %d\n", err); 711 else 712 mlx5_enter_error_state(dev, true); 713 714 mlx5_fw_reset_complete_reload(dev); 715 } 716 717 static void mlx5_sync_reset_abort_event(struct work_struct *work) 718 { 719 struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset, 720 reset_abort_work); 721 struct mlx5_core_dev *dev = fw_reset->dev; 722 723 if (mlx5_sync_reset_clear_reset_requested(dev, true)) 724 return; 725 mlx5_core_warn(dev, "PCI Sync FW Update Reset Aborted.\n"); 726 } 727 728 static void mlx5_sync_reset_events_handle(struct mlx5_fw_reset *fw_reset, struct mlx5_eqe *eqe) 729 { 730 struct mlx5_eqe_sync_fw_update *sync_fw_update_eqe; 731 u8 sync_event_rst_type; 732 733 sync_fw_update_eqe = &eqe->data.sync_fw_update; 734 sync_event_rst_type = sync_fw_update_eqe->sync_rst_state & SYNC_RST_STATE_MASK; 735 switch (sync_event_rst_type) { 736 case MLX5_SYNC_RST_STATE_RESET_REQUEST: 737 queue_work(fw_reset->wq, &fw_reset->reset_request_work); 738 break; 739 case MLX5_SYNC_RST_STATE_RESET_UNLOAD: 740 queue_work(fw_reset->wq, &fw_reset->reset_unload_work); 741 break; 742 case MLX5_SYNC_RST_STATE_RESET_NOW: 743 queue_work(fw_reset->wq, &fw_reset->reset_now_work); 744 break; 745 case MLX5_SYNC_RST_STATE_RESET_ABORT: 746 queue_work(fw_reset->wq, &fw_reset->reset_abort_work); 747 break; 748 } 749 } 750 751 static void mlx5_sync_reset_timeout_work(struct work_struct *work) 752 { 753 struct delayed_work *dwork = container_of(work, struct delayed_work, 754 work); 755 struct mlx5_fw_reset *fw_reset = 756 container_of(dwork, struct mlx5_fw_reset, reset_timeout_work); 757 struct mlx5_core_dev *dev = fw_reset->dev; 758 759 if (mlx5_sync_reset_clear_reset_requested(dev, true)) 760 return; 761 mlx5_core_warn(dev, "PCI Sync FW Update Reset Timeout.\n"); 762 } 763 764 static int fw_reset_event_notifier(struct notifier_block *nb, unsigned long action, void *data) 765 { 766 struct mlx5_fw_reset *fw_reset = mlx5_nb_cof(nb, struct mlx5_fw_reset, nb); 767 struct mlx5_eqe *eqe = data; 768 769 if (test_bit(MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, &fw_reset->reset_flags)) 770 return NOTIFY_DONE; 771 772 switch (eqe->sub_type) { 773 case MLX5_GENERAL_SUBTYPE_FW_LIVE_PATCH_EVENT: 774 queue_work(fw_reset->wq, &fw_reset->fw_live_patch_work); 775 break; 776 case MLX5_GENERAL_SUBTYPE_PCI_SYNC_FOR_FW_UPDATE_EVENT: 777 mlx5_sync_reset_events_handle(fw_reset, eqe); 778 break; 779 default: 780 return NOTIFY_DONE; 781 } 782 783 return NOTIFY_OK; 784 } 785 786 int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev) 787 { 788 unsigned long pci_sync_update_timeout = mlx5_tout_ms(dev, PCI_SYNC_UPDATE); 789 struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; 790 unsigned long timeout; 791 int err; 792 793 if (MLX5_CAP_GEN(dev, pci_sync_for_fw_update_with_driver_unload)) 794 pci_sync_update_timeout += mlx5_tout_ms(dev, RESET_UNLOAD); 795 timeout = msecs_to_jiffies(pci_sync_update_timeout); 796 if (!wait_for_completion_timeout(&fw_reset->done, timeout)) { 797 mlx5_core_warn(dev, "FW sync reset timeout after %lu seconds\n", 798 pci_sync_update_timeout / 1000); 799 err = -ETIMEDOUT; 800 goto out; 801 } 802 err = fw_reset->ret; 803 if (test_and_clear_bit(MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED, &fw_reset->reset_flags)) { 804 mlx5_unload_one_devl_locked(dev, false); 805 mlx5_load_one_devl_locked(dev, true); 806 } 807 out: 808 clear_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags); 809 return err; 810 } 811 812 void mlx5_fw_reset_events_start(struct mlx5_core_dev *dev) 813 { 814 struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; 815 816 if (!fw_reset) 817 return; 818 819 MLX5_NB_INIT(&fw_reset->nb, fw_reset_event_notifier, GENERAL_EVENT); 820 mlx5_eq_notifier_register(dev, &fw_reset->nb); 821 } 822 823 void mlx5_fw_reset_events_stop(struct mlx5_core_dev *dev) 824 { 825 struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; 826 827 if (!fw_reset) 828 return; 829 830 mlx5_eq_notifier_unregister(dev, &fw_reset->nb); 831 } 832 833 void mlx5_drain_fw_reset(struct mlx5_core_dev *dev) 834 { 835 struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; 836 837 if (!fw_reset) 838 return; 839 840 set_bit(MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, &fw_reset->reset_flags); 841 cancel_work_sync(&fw_reset->fw_live_patch_work); 842 cancel_work_sync(&fw_reset->reset_request_work); 843 cancel_work_sync(&fw_reset->reset_unload_work); 844 cancel_work_sync(&fw_reset->reset_reload_work); 845 cancel_work_sync(&fw_reset->reset_now_work); 846 cancel_work_sync(&fw_reset->reset_abort_work); 847 cancel_delayed_work(&fw_reset->reset_timeout_work); 848 } 849 850 static const struct devlink_param mlx5_fw_reset_devlink_params[] = { 851 DEVLINK_PARAM_GENERIC(ENABLE_REMOTE_DEV_RESET, BIT(DEVLINK_PARAM_CMODE_RUNTIME), 852 mlx5_fw_reset_enable_remote_dev_reset_get, 853 mlx5_fw_reset_enable_remote_dev_reset_set, NULL), 854 }; 855 856 int mlx5_fw_reset_init(struct mlx5_core_dev *dev) 857 { 858 struct mlx5_fw_reset *fw_reset; 859 int err; 860 861 if (!MLX5_CAP_MCAM_REG(dev, mfrl)) 862 return 0; 863 864 fw_reset = kzalloc(sizeof(*fw_reset), GFP_KERNEL); 865 if (!fw_reset) 866 return -ENOMEM; 867 fw_reset->wq = create_singlethread_workqueue("mlx5_fw_reset_events"); 868 if (!fw_reset->wq) { 869 kfree(fw_reset); 870 return -ENOMEM; 871 } 872 873 fw_reset->dev = dev; 874 dev->priv.fw_reset = fw_reset; 875 876 err = devl_params_register(priv_to_devlink(dev), 877 mlx5_fw_reset_devlink_params, 878 ARRAY_SIZE(mlx5_fw_reset_devlink_params)); 879 if (err) { 880 destroy_workqueue(fw_reset->wq); 881 kfree(fw_reset); 882 return err; 883 } 884 885 INIT_WORK(&fw_reset->fw_live_patch_work, mlx5_fw_live_patch_event); 886 INIT_WORK(&fw_reset->reset_request_work, mlx5_sync_reset_request_event); 887 INIT_WORK(&fw_reset->reset_unload_work, mlx5_sync_reset_unload_event); 888 INIT_WORK(&fw_reset->reset_reload_work, mlx5_sync_reset_reload_work); 889 INIT_WORK(&fw_reset->reset_now_work, mlx5_sync_reset_now_event); 890 INIT_WORK(&fw_reset->reset_abort_work, mlx5_sync_reset_abort_event); 891 INIT_DELAYED_WORK(&fw_reset->reset_timeout_work, 892 mlx5_sync_reset_timeout_work); 893 894 init_completion(&fw_reset->done); 895 return 0; 896 } 897 898 void mlx5_fw_reset_cleanup(struct mlx5_core_dev *dev) 899 { 900 struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; 901 902 if (!fw_reset) 903 return; 904 905 devl_params_unregister(priv_to_devlink(dev), 906 mlx5_fw_reset_devlink_params, 907 ARRAY_SIZE(mlx5_fw_reset_devlink_params)); 908 destroy_workqueue(fw_reset->wq); 909 kfree(dev->priv.fw_reset); 910 } 911