// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include "cmd.h"

enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };

static int mlx5vf_is_migratable(struct mlx5_core_dev *mdev, u16 func_id)
{
	int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
	void *query_cap = NULL, *cap;
	int ret;

	query_cap = kzalloc(query_sz, GFP_KERNEL);
	if (!query_cap)
		return -ENOMEM;

	ret = mlx5_vport_get_other_func_cap(mdev, func_id, query_cap,
					    MLX5_CAP_GENERAL_2);
	if (ret)
		goto out;

	cap = MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability);
	if (!MLX5_GET(cmd_hca_cap_2, cap, migratable))
		ret = -EOPNOTSUPP;
out:
	kfree(query_cap);
	return ret;
}

static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
				  u16 *vhca_id);
static void
_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);

int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
	u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
	u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/*
	 * In case PRE_COPY is used, saving_migf is exposed while the device is
	 * running. Make sure to run only once there is no active save command.
	 * Running both in parallel might end up failing the save command once
	 * it tries to turn on 'tracking' on a suspended device.
	 */
	if (migf) {
		err = wait_for_completion_interruptible(&migf->save_comp);
		if (err)
			return err;
	}

	MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
	MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);

	err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
	if (migf)
		complete(&migf->save_comp);

	return err;
}

int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
	u32 out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};
	u32 in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	MLX5_SET(resume_vhca_in, in, opcode, MLX5_CMD_OP_RESUME_VHCA);
	MLX5_SET(resume_vhca_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(resume_vhca_in, in, op_mod, op_mod);

	return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, in, out);
}

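/*
 * Query the size of the device state to be saved. With MLX5VF_QUERY_INC the
 * device reports only the incremental state since the last save; in chunk
 * mode the remaining total size is reported as well.
 */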
int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
					  size_t *state_size, u64 *total_size,
					  u8 query_flags)
{
	u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
	u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
	bool inc = query_flags & MLX5VF_QUERY_INC;
	int ret;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/*
	 * In case PRE_COPY is used, saving_migf is exposed while the device is
	 * running. Make sure to run only once there is no active save command.
	 * Running both in parallel might end up failing the incremental query
	 * command on an un-tracked vhca.
	 */
	if (inc) {
		ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
		if (ret)
			return ret;
		if (mvdev->saving_migf->state ==
		    MLX5_MIGF_STATE_PRE_COPY_ERROR) {
			/*
			 * In case we had a PRE_COPY error, only query the full
			 * image for the final image.
			 */
			if (!(query_flags & MLX5VF_QUERY_FINAL)) {
				*state_size = 0;
				complete(&mvdev->saving_migf->save_comp);
				return 0;
			}
			query_flags &= ~MLX5VF_QUERY_INC;
		}
	}

	MLX5_SET(query_vhca_migration_state_in, in, opcode,
		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
	MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
	MLX5_SET(query_vhca_migration_state_in, in, incremental,
		 query_flags & MLX5VF_QUERY_INC);
	MLX5_SET(query_vhca_migration_state_in, in, chunk, mvdev->chunk_mode);

	ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
				  out);
	if (inc)
		complete(&mvdev->saving_migf->save_comp);

	if (ret)
		return ret;

	*state_size = MLX5_GET(query_vhca_migration_state_out, out,
			       required_umem_size);
	if (total_size)
		*total_size = mvdev->chunk_mode ?
			MLX5_GET64(query_vhca_migration_state_out, out,
				   remaining_total_size) : *state_size;

	return 0;
}

static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
{
	/* Mark the tracker under an error and wake it up if it's running */
	mvdev->tracker.is_err = true;
	complete(&mvdev->tracker_comp);
}

static int mlx5fv_vf_event(struct notifier_block *nb,
			   unsigned long event, void *data)
{
	struct mlx5vf_pci_core_device *mvdev =
		container_of(nb, struct mlx5vf_pci_core_device, nb);

	switch (event) {
	case MLX5_PF_NOTIFY_ENABLE_VF:
		mutex_lock(&mvdev->state_mutex);
		mvdev->mdev_detach = false;
		mlx5vf_state_mutex_unlock(mvdev);
		break;
	case MLX5_PF_NOTIFY_DISABLE_VF:
		mlx5vf_cmd_close_migratable(mvdev);
		mutex_lock(&mvdev->state_mutex);
		mvdev->mdev_detach = true;
		mlx5vf_state_mutex_unlock(mvdev);
		break;
	default:
		break;
	}

	return 0;
}

void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
{
	if (!mvdev->migrate_cap)
		return;

	/* Must be done outside the lock to let it progress */
	set_tracker_error(mvdev);
	mutex_lock(&mvdev->state_mutex);
	mlx5vf_disable_fds(mvdev);
	_mlx5vf_free_page_tracker_resources(mvdev);
	mlx5vf_state_mutex_unlock(mvdev);
}

void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev)
{
	if (!mvdev->migrate_cap)
		return;

	mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id,
						&mvdev->nb);
	destroy_workqueue(mvdev->cb_wq);
}

void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
			       const struct vfio_migration_ops *mig_ops,
			       const struct vfio_log_ops *log_ops)
{
	struct pci_dev *pdev = mvdev->core_device.pdev;
	int ret;

	if (!pdev->is_virtfn)
		return;

	mvdev->mdev = mlx5_vf_get_core_dev(pdev);
	if (!mvdev->mdev)
		return;

	if (!MLX5_CAP_GEN(mvdev->mdev, migration))
		goto end;

	mvdev->vf_id = pci_iov_vf_id(pdev);
	if (mvdev->vf_id < 0)
		goto end;

	ret = mlx5vf_is_migratable(mvdev->mdev, mvdev->vf_id + 1);
	if (ret)
		goto end;

	if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1,
				   &mvdev->vhca_id))
		goto end;

	mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0);
	if (!mvdev->cb_wq)
		goto end;

	mutex_init(&mvdev->state_mutex);
	spin_lock_init(&mvdev->reset_lock);
	mvdev->nb.notifier_call = mlx5fv_vf_event;
	ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id,
						    &mvdev->nb);
	if (ret) {
		destroy_workqueue(mvdev->cb_wq);
		goto end;
	}

	mvdev->migrate_cap = 1;
	mvdev->core_device.vdev.migration_flags =
		VFIO_MIGRATION_STOP_COPY |
		VFIO_MIGRATION_P2P;
	mvdev->core_device.vdev.mig_ops = mig_ops;
	init_completion(&mvdev->tracker_comp);
	if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
		mvdev->core_device.vdev.log_ops = log_ops;

	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
	    MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))
		mvdev->core_device.vdev.migration_flags |=
			VFIO_MIGRATION_PRE_COPY;

	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks))
		mvdev->chunk_mode = 1;

end:
	mlx5_vf_put_core_dev(mvdev->mdev);
}

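/*
 * Retrieve the vhca_id of the VF (function_id) by querying its general HCA
 * capabilities on behalf of the other function.
 */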
static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
				  u16 *vhca_id)
{
	u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
	int out_size;
	void *out;
	int ret;

	out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out);
	out = kzalloc(out_size, GFP_KERNEL);
	if (!out)
		return -ENOMEM;

	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
	MLX5_SET(query_hca_cap_in, in, other_function, 1);
	MLX5_SET(query_hca_cap_in, in, function_id, function_id);
	MLX5_SET(query_hca_cap_in, in, op_mod,
		 MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 |
		 HCA_CAP_OPMOD_GET_CUR);

	ret = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
	if (ret)
		goto err_exec;

	*vhca_id = MLX5_GET(query_hca_cap_out, out,
			    capability.cmd_hca_cap.vhca_id);

err_exec:
	kfree(out);
	return ret;
}

static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
			struct mlx5_vhca_data_buffer *buf,
			struct mlx5_vhca_recv_buf *recv_buf,
			u32 *mkey)
{
	size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
				recv_buf->npages;
	int err = 0, inlen;
	__be64 *mtt;
	void *mkc;
	u32 *in;

	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
		sizeof(*mtt) * round_up(npages, 2);

	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
		 DIV_ROUND_UP(npages, 2));
	mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);

	if (buf) {
		struct sg_dma_page_iter dma_iter;

		for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
			*mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
	} else {
		int i;

		for (i = 0; i < npages; i++)
			*mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]);
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, lw, 1);
	MLX5_SET(mkc, mkc, rr, 1);
	MLX5_SET(mkc, mkc, rw, 1);
	MLX5_SET(mkc, mkc, pd, pdn);
	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
	MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
	MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
	err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
	kvfree(in);
	return err;
}

static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
	struct mlx5_core_dev *mdev = mvdev->mdev;
	int ret;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	if (buf->dmaed || !buf->allocated_length)
		return -EINVAL;

	ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
	if (ret)
		return ret;

	ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
	if (ret)
		goto err;

	buf->dmaed = true;

	return 0;
err:
	dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
	return ret;
}

void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	struct mlx5_vf_migration_file *migf = buf->migf;
	struct sg_page_iter sg_iter;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	WARN_ON(migf->mvdev->mdev_detach);

	if (buf->dmaed) {
		mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
		dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
				  buf->dma_dir, 0);
	}

	/* Undo alloc_pages_bulk_array() */
	for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
		__free_page(sg_page_iter_page(&sg_iter));
	sg_free_append_table(&buf->table);
	kfree(buf);
}

struct mlx5_vhca_data_buffer *
mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
			 size_t length,
			 enum dma_data_direction dma_dir)
{
	struct mlx5_vhca_data_buffer *buf;
	int ret;

	buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
	if (!buf)
		return ERR_PTR(-ENOMEM);

	buf->dma_dir = dma_dir;
	buf->migf = migf;
	if (length) {
		ret = mlx5vf_add_migration_pages(buf,
				DIV_ROUND_UP_ULL(length, PAGE_SIZE));
		if (ret)
			goto end;

		if (dma_dir != DMA_NONE) {
			ret = mlx5vf_dma_data_buffer(buf);
			if (ret)
				goto end;
		}
	}

	return buf;
end:
	mlx5vf_free_data_buffer(buf);
	return ERR_PTR(ret);
}

void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	spin_lock_irq(&buf->migf->list_lock);
	buf->stop_copy_chunk_num = 0;
	list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
	spin_unlock_irq(&buf->migf->list_lock);
}

struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
		       size_t length, enum dma_data_direction dma_dir)
{
	struct mlx5_vhca_data_buffer *buf, *temp_buf;
	struct list_head free_list;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	if (migf->mvdev->mdev_detach)
		return ERR_PTR(-ENOTCONN);

	INIT_LIST_HEAD(&free_list);

	spin_lock_irq(&migf->list_lock);
	list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
		if (buf->dma_dir == dma_dir) {
			list_del_init(&buf->buf_elm);
			if (buf->allocated_length >= length) {
				spin_unlock_irq(&migf->list_lock);
				goto found;
			}
			/*
			 * Prevent holding redundant buffers. Put them on a
			 * local free list and free them after dropping the
			 * spin lock (&migf->list_lock), since
			 * mlx5vf_free_data_buffer() might sleep.
			 */
			list_add(&buf->buf_elm, &free_list);
		}
	}
	spin_unlock_irq(&migf->list_lock);
	buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);

found:
	while ((temp_buf = list_first_entry_or_null(&free_list,
				struct mlx5_vhca_data_buffer, buf_elm))) {
		list_del(&temp_buf->buf_elm);
		mlx5vf_free_data_buffer(temp_buf);
	}

	return buf;
}

static void
mlx5vf_save_callback_complete(struct mlx5_vf_migration_file *migf,
			      struct mlx5vf_async_data *async_data)
{
	kvfree(async_data->out);
	complete(&migf->save_comp);
	fput(migf->filp);
}

void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
{
	struct mlx5vf_async_data *async_data = container_of(_work,
		struct mlx5vf_async_data, work);
	struct mlx5_vf_migration_file *migf = container_of(async_data,
		struct mlx5_vf_migration_file, async_data);

	mutex_lock(&migf->lock);
	if (async_data->status) {
		mlx5vf_put_data_buffer(async_data->buf);
		if (async_data->header_buf)
			mlx5vf_put_data_buffer(async_data->header_buf);
		if (!async_data->stop_copy_chunk &&
		    async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
			migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
		else
			migf->state = MLX5_MIGF_STATE_ERROR;
		wake_up_interruptible(&migf->poll_wait);
	}
	mutex_unlock(&migf->lock);
	mlx5vf_save_callback_complete(migf, async_data);
}

static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
			  size_t image_size, bool initial_pre_copy)
{
	struct mlx5_vf_migration_file *migf = header_buf->migf;
	struct mlx5_vf_migration_header header = {};
	unsigned long flags;
	struct page *page;
	u8 *to_buff;

	header.record_size = cpu_to_le64(image_size);
	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY);
	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA);
	page = mlx5vf_get_migration_page(header_buf, 0);
	if (!page)
		return -EINVAL;
	to_buff = kmap_local_page(page);
	memcpy(to_buff, &header, sizeof(header));
	kunmap_local(to_buff);
	header_buf->length = sizeof(header);
	header_buf->start_pos = header_buf->migf->max_pos;
	migf->max_pos += header_buf->length;
	spin_lock_irqsave(&migf->list_lock, flags);
	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
	spin_unlock_irqrestore(&migf->list_lock, flags);
	if (initial_pre_copy)
		migf->pre_copy_initial_bytes += sizeof(header);
	return 0;
}

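/*
 * Completion callback of the asynchronous SAVE_VHCA_STATE command. On success
 * it queues the produced data (and its header) on the migration file and, in
 * chunk mode, schedules the SAVE of the next chunk when the device reports
 * more data is pending. Errors are deferred to mlx5vf_mig_file_cleanup_cb()
 * since the error flow can't run from this context.
 */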
static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
{
	struct mlx5vf_async_data *async_data = container_of(context,
			struct mlx5vf_async_data, cb_work);
	struct mlx5_vf_migration_file *migf = container_of(async_data,
			struct mlx5_vf_migration_file, async_data);

	if (!status) {
		size_t next_required_umem_size = 0;
		bool stop_copy_last_chunk;
		size_t image_size;
		unsigned long flags;
		bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY &&
				!async_data->stop_copy_chunk;

		image_size = MLX5_GET(save_vhca_state_out, async_data->out,
				      actual_image_size);
		if (async_data->buf->stop_copy_chunk_num)
			next_required_umem_size = MLX5_GET(save_vhca_state_out,
					async_data->out, next_required_umem_size);
		stop_copy_last_chunk = async_data->stop_copy_chunk &&
				!next_required_umem_size;
		if (async_data->header_buf) {
			status = add_buf_header(async_data->header_buf, image_size,
						initial_pre_copy);
			if (status)
				goto err;
		}
		async_data->buf->length = image_size;
		async_data->buf->start_pos = migf->max_pos;
		migf->max_pos += async_data->buf->length;
		spin_lock_irqsave(&migf->list_lock, flags);
		list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
		if (async_data->buf->stop_copy_chunk_num) {
			migf->num_ready_chunks++;
			if (next_required_umem_size &&
			    migf->num_ready_chunks >= MAX_NUM_CHUNKS) {
				/* Delay the next SAVE until one chunk is consumed */
				migf->next_required_umem_size = next_required_umem_size;
				next_required_umem_size = 0;
			}
		}
		spin_unlock_irqrestore(&migf->list_lock, flags);
		if (initial_pre_copy) {
			migf->pre_copy_initial_bytes += image_size;
			migf->state = MLX5_MIGF_STATE_PRE_COPY;
		}
		if (stop_copy_last_chunk)
			migf->state = MLX5_MIGF_STATE_COMPLETE;
		wake_up_interruptible(&migf->poll_wait);
		if (next_required_umem_size)
			mlx5vf_mig_file_set_save_work(migf,
				/* Picking up the next chunk num */
				(async_data->buf->stop_copy_chunk_num % MAX_NUM_CHUNKS) + 1,
				next_required_umem_size);
		mlx5vf_save_callback_complete(migf, async_data);
		return;
	}

err:
	/* The error flow can't run from an interrupt context */
	if (status == -EREMOTEIO)
		status = MLX5_GET(save_vhca_state_out, async_data->out, status);
	async_data->status = status;
	queue_work(migf->mvdev->cb_wq, &async_data->work);
}

int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
			       struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *buf, bool inc,
			       bool track)
{
	u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
	u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
	struct mlx5_vhca_data_buffer *header_buf = NULL;
	struct mlx5vf_async_data *async_data;
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	err = wait_for_completion_interruptible(&migf->save_comp);
	if (err)
		return err;

	if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
		/*
		 * In case we had a PRE_COPY error, SAVE is triggered only for
		 * the final image, read the device's full image.
		 */
		inc = false;

	MLX5_SET(save_vhca_state_in, in, opcode,
		 MLX5_CMD_OP_SAVE_VHCA_STATE);
	MLX5_SET(save_vhca_state_in, in, op_mod, 0);
	MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
	MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
	MLX5_SET(save_vhca_state_in, in, incremental, inc);
	MLX5_SET(save_vhca_state_in, in, set_track, track);

	async_data = &migf->async_data;
	async_data->buf = buf;
	async_data->stop_copy_chunk = !track;
	async_data->out = kvzalloc(out_size, GFP_KERNEL);
	if (!async_data->out) {
		err = -ENOMEM;
		goto err_out;
	}

	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
		if (async_data->stop_copy_chunk) {
			u8 header_idx = buf->stop_copy_chunk_num ?
				buf->stop_copy_chunk_num - 1 : 0;

			header_buf = migf->buf_header[header_idx];
			migf->buf_header[header_idx] = NULL;
		}

		if (!header_buf) {
			header_buf = mlx5vf_get_data_buffer(migf,
				sizeof(struct mlx5_vf_migration_header), DMA_NONE);
			if (IS_ERR(header_buf)) {
				err = PTR_ERR(header_buf);
				goto err_free;
			}
		}
	}

	if (async_data->stop_copy_chunk)
		migf->state = MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK;

	async_data->header_buf = header_buf;
	get_file(migf->filp);
	err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
			       async_data->out,
			       out_size, mlx5vf_save_callback,
			       &async_data->cb_work);
	if (err)
		goto err_exec;

	return 0;

err_exec:
	if (header_buf)
		mlx5vf_put_data_buffer(header_buf);
	fput(migf->filp);
err_free:
	kvfree(async_data->out);
err_out:
	complete(&migf->save_comp);
	return err;
}

int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
			       struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *buf)
{
	u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
	u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	if (!buf->dmaed) {
		err = mlx5vf_dma_data_buffer(buf);
		if (err)
			return err;
	}

	MLX5_SET(load_vhca_state_in, in, opcode,
		 MLX5_CMD_OP_LOAD_VHCA_STATE);
	MLX5_SET(load_vhca_state_in, in, op_mod, 0);
	MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey);
	MLX5_SET(load_vhca_state_in, in, size, buf->length);
	return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out);
}

int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf)
{
	int err;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	if (migf->mvdev->mdev_detach)
		return -ENOTCONN;

	err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn);
	return err;
}

void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
{
	lockdep_assert_held(&migf->mvdev->state_mutex);
	if (migf->mvdev->mdev_detach)
		return;

	mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn);
}

void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
{
	struct mlx5_vhca_data_buffer *entry;
	int i;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	WARN_ON(migf->mvdev->mdev_detach);

	for (i = 0; i < MAX_NUM_CHUNKS; i++) {
		if (migf->buf[i]) {
			mlx5vf_free_data_buffer(migf->buf[i]);
			migf->buf[i] = NULL;
		}

		if (migf->buf_header[i]) {
			mlx5vf_free_data_buffer(migf->buf_header[i]);
			migf->buf_header[i] = NULL;
		}
	}

	list_splice(&migf->avail_list, &migf->buf_list);

	while ((entry = list_first_entry_or_null(&migf->buf_list,
				struct mlx5_vhca_data_buffer, buf_elm))) {
		list_del(&entry->buf_elm);
		mlx5vf_free_data_buffer(entry);
	}

	mlx5vf_cmd_dealloc_pd(migf);
}

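/*
 * Create the firmware page tracker object. If the caller passes more ranges
 * than the device supports, the ranges are first combined to fit
 * pg_track_max_num_range, and the total tracked address space is validated
 * against the device's min/max limits.
 */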
static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
				 struct mlx5vf_pci_core_device *mvdev,
				 struct rb_root_cached *ranges, u32 nnodes)
{
	int max_num_range =
		MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	int record_size = MLX5_ST_SZ_BYTES(page_track_range);
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	struct interval_tree_node *node = NULL;
	u64 total_ranges_len = 0;
	u32 num_ranges = nnodes;
	u8 log_addr_space_size;
	void *range_list_ptr;
	void *obj_context;
	void *cmd_hdr;
	int inlen;
	void *in;
	int err;
	int i;

	if (num_ranges > max_num_range) {
		vfio_combine_iova_ranges(ranges, nnodes, max_num_range);
		num_ranges = max_num_range;
	}

	inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) +
			record_size * num_ranges;
	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in,
			       general_obj_in_cmd_hdr);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,
		 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,
		 MLX5_OBJ_TYPE_PAGE_TRACK);
	obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context);
	MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id);
	MLX5_SET(page_track, obj_context, track_type, 1);
	MLX5_SET(page_track, obj_context, log_page_size,
		 ilog2(tracker->host_qp->tracked_page_size));
	MLX5_SET(page_track, obj_context, log_msg_size,
		 ilog2(tracker->host_qp->max_msg_size));
	MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn);
	MLX5_SET(page_track, obj_context, num_ranges, num_ranges);

	range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range);
	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
	for (i = 0; i < num_ranges; i++) {
		void *addr_range_i_base = range_list_ptr + record_size * i;
		unsigned long length = node->last - node->start + 1;

		MLX5_SET64(page_track_range, addr_range_i_base, start_address,
			   node->start);
		MLX5_SET64(page_track_range, addr_range_i_base, length, length);
		total_ranges_len += length;
		node = interval_tree_iter_next(node, 0, ULONG_MAX);
	}

	WARN_ON(node);
	log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len));
	if (log_addr_space_size <
	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
	    log_addr_space_size >
	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
		err = -EOPNOTSUPP;
		goto out;
	}

	MLX5_SET(page_track, obj_context, log_addr_space_size,
		 log_addr_space_size);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	if (err)
		goto out;

	tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
out:
	kfree(in);
	return err;
}

static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev,
				      u32 tracker_id)
{
	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};

	MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
	MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id);

	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
}

static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
				     u32 tracker_id, unsigned long iova,
				     unsigned long length, u32 tracker_state)
{
	u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {};
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	void *obj_context;
	void *cmd_hdr;

	cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker_id);

	obj_context = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context);
	MLX5_SET64(page_track, obj_context, modify_field_select, 0x3);
	MLX5_SET64(page_track, obj_context, range_start_address, iova);
	MLX5_SET64(page_track, obj_context, length, length);
	MLX5_SET(page_track, obj_context, state, tracker_state);

	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
}

static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
			     struct mlx5_vhca_cq_buf *buf, int nent,
			     int cqe_size)
{
	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
	u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0);
	u8 log_wq_sz = ilog2(cqe_size);
	int err;

	err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf,
				       mdev->priv.numa_node);
	if (err)
		return err;

	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
	buf->cqe_size = cqe_size;
	buf->nent = nent;
	return 0;
}

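/*
 * Mark all CQEs as invalid so that get_sw_cqe() treats entries that hardware
 * has not yet written as empty.
 */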
static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf)
{
	struct mlx5_cqe64 *cqe64;
	void *cqe;
	int i;

	for (i = 0; i < buf->nent; i++) {
		cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i);
		cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64;
		cqe64->op_own = MLX5_CQE_INVALID << 4;
	}
}

static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev,
			      struct mlx5_vhca_cq *cq)
{
	mlx5_core_destroy_cq(mdev, &cq->mcq);
	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
	mlx5_db_free(mdev, &cq->db);
}

static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
{
	if (type != MLX5_EVENT_TYPE_CQ_ERROR)
		return;

	set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device,
				       tracker.cq.mcq));
}

static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
				 void *data)
{
	struct mlx5_vhca_page_tracker *tracker =
		mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
	struct mlx5vf_pci_core_device *mvdev = container_of(
		tracker, struct mlx5vf_pci_core_device, tracker);
	struct mlx5_eqe *eqe = data;
	u8 event_type = (u8)type;
	u8 queue_type;
	int qp_num;

	switch (event_type) {
	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
		queue_type = eqe->data.qp_srq.type;
		if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP)
			break;
		qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
		if (qp_num != tracker->host_qp->qpn &&
		    qp_num != tracker->fw_qp->qpn)
			break;
		set_tracker_error(mvdev);
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}

static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq,
			       struct mlx5_eqe *eqe)
{
	struct mlx5vf_pci_core_device *mvdev =
		container_of(mcq, struct mlx5vf_pci_core_device,
			     tracker.cq.mcq);

	complete(&mvdev->tracker_comp);
}

static int mlx5vf_create_cq(struct mlx5_core_dev *mdev,
			    struct mlx5_vhca_page_tracker *tracker,
			    size_t ncqe)
{
	int cqe_size = cache_line_size() == 128 ? 128 : 64;
	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
	struct mlx5_vhca_cq *cq;
	int inlen, err, eqn;
	void *cqc, *in;
	__be64 *pas;
	int vector;

	cq = &tracker->cq;
	ncqe = roundup_pow_of_two(ncqe);
	err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node);
	if (err)
		return err;

	cq->ncqe = ncqe;
	cq->mcq.set_ci_db = cq->db.db;
	cq->mcq.arm_db = cq->db.db + 1;
	cq->mcq.cqe_sz = cqe_size;
	err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size);
	if (err)
		goto err_db_free;

	init_cq_frag_buf(&cq->buf);
	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
		cq->buf.frag_buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_buff;
	}

	vector = raw_smp_processor_id() % mlx5_comp_vectors_max(mdev);
	err = mlx5_comp_eqn_get(mdev, vector, &eqn);
	if (err)
		goto err_vec;

	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
	MLX5_SET(cqc, cqc, uar_page, tracker->uar->index);
	MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift -
		 MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
	mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas);
	cq->mcq.comp = mlx5vf_cq_complete;
	cq->mcq.event = mlx5vf_cq_event;
	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
	if (err)
		goto err_vec;

	mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
		    cq->mcq.cons_index);
	kvfree(in);
	return 0;

err_vec:
	kvfree(in);
err_buff:
	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
err_db_free:
	mlx5_db_free(mdev, &cq->db);
	return err;
}

static struct mlx5_vhca_qp *
mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev,
		    struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr)
{
	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
	struct mlx5_vhca_qp *qp;
	u8 log_rq_stride;
	u8 log_rq_sz;
	void *qpc;
	int inlen;
	void *in;
	int err;

	qp = kzalloc(sizeof(*qp), GFP_KERNEL_ACCOUNT);
	if (!qp)
		return ERR_PTR(-ENOMEM);

	err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node);
	if (err)
		goto err_free;

	if (max_recv_wr) {
		qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr);
		log_rq_stride = ilog2(MLX5_SEND_WQE_DS);
		log_rq_sz = ilog2(qp->rq.wqe_cnt);
		err = mlx5_frag_buf_alloc_node(mdev,
			wq_get_byte_sz(log_rq_sz, log_rq_stride),
			&qp->buf, mdev->priv.numa_node);
		if (err)
			goto err_db_free;
		mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc);
	}

	qp->rq.db = &qp->db.db[MLX5_RCV_DBR];
	inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
		MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
		qp->buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_in;
	}

	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
	MLX5_SET(qpc, qpc, pd, tracker->pdn);
	MLX5_SET(qpc, qpc, uar_page, tracker->uar->index);
	MLX5_SET(qpc, qpc, log_page_size,
		 qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
	MLX5_SET(qpc, qpc, no_sq, 1);
	if (max_recv_wr) {
		MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn);
		MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4);
		MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz);
		MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
		MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
		mlx5_fill_page_frag_array(&qp->buf,
					  (__be64 *)MLX5_ADDR_OF(create_qp_in,
								 in, pas));
	} else {
		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
	}

	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	kvfree(in);
	if (err)
		goto err_in;

	qp->qpn = MLX5_GET(create_qp_out, out, qpn);
	return qp;

err_in:
	if (max_recv_wr)
		mlx5_frag_buf_free(mdev, &qp->buf);
err_db_free:
	mlx5_db_free(mdev, &qp->db);
err_free:
	kfree(qp);
	return ERR_PTR(err);
}

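/*
 * Post a single receive WQE, covering one report message, at the current RQ
 * producer index and ring the RQ doorbell.
 */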
static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp)
{
	struct mlx5_wqe_data_seg *data;
	unsigned int ix;

	WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt);
	ix = qp->rq.pc & (qp->rq.wqe_cnt - 1);
	data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix);
	data->byte_count = cpu_to_be32(qp->max_msg_size);
	data->lkey = cpu_to_be32(qp->recv_buf.mkey);
	data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset);
	qp->rq.pc++;
	/* Make sure that descriptors are written before doorbell record. */
	dma_wmb();
	*qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff);
}

static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev,
			      struct mlx5_vhca_qp *qp, u32 remote_qpn,
			      bool host_qp)
{
	u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
	u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
	u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
	void *qpc;
	int ret;

	/* Init */
	qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc);
	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
	MLX5_SET(qpc, qpc, rre, 1);
	MLX5_SET(qpc, qpc, rwe, 1);
	MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP);
	MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn);
	ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in);
	if (ret)
		return ret;

	if (host_qp) {
		struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
		int i;

		for (i = 0; i < qp->rq.wqe_cnt; i++) {
			mlx5vf_post_recv(qp);
			recv_buf->next_rq_offset += qp->max_msg_size;
		}
	}

	/* RTR */
	qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc);
	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
	MLX5_SET(qpc, qpc, mtu, IB_MTU_4096);
	MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg));
	MLX5_SET(qpc, qpc, remote_qpn, remote_qpn);
	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
	MLX5_SET(qpc, qpc, primary_address_path.fl, 1);
	MLX5_SET(qpc, qpc, min_rnr_nak, 1);
	MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
	ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in);
	if (ret || host_qp)
		return ret;

	/* RTS */
	qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc);
	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
	MLX5_SET(qpc, qpc, retry_count, 7);
	MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */
	MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */
	MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);

	return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in);
}

static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
			      struct mlx5_vhca_qp *qp)
{
	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};

	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
	MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
	mlx5_cmd_exec_in(mdev, destroy_qp, in);

	mlx5_frag_buf_free(mdev, &qp->buf);
	mlx5_db_free(mdev, &qp->db);
	kfree(qp);
}

static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf)
{
	int i;

	/* Undo alloc_pages_bulk_array() */
	for (i = 0; i < recv_buf->npages; i++)
		__free_page(recv_buf->page_list[i]);

	kvfree(recv_buf->page_list);
}

static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
			    unsigned int npages)
{
	unsigned int filled = 0, done = 0;
	int i;

	recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list),
				       GFP_KERNEL_ACCOUNT);
	if (!recv_buf->page_list)
		return -ENOMEM;

	for (;;) {
		filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT,
						npages - done,
						recv_buf->page_list + done);
		if (!filled)
			goto err;

		done += filled;
		if (done == npages)
			break;
	}

	recv_buf->npages = npages;
	return 0;

err:
	for (i = 0; i < npages; i++) {
		if (recv_buf->page_list[i])
			__free_page(recv_buf->page_list[i]);
	}

	kvfree(recv_buf->page_list);
	return -ENOMEM;
}

static int register_dma_recv_pages(struct mlx5_core_dev *mdev,
				   struct mlx5_vhca_recv_buf *recv_buf)
{
	int i, j;

	recv_buf->dma_addrs = kvcalloc(recv_buf->npages,
				       sizeof(*recv_buf->dma_addrs),
				       GFP_KERNEL_ACCOUNT);
	if (!recv_buf->dma_addrs)
		return -ENOMEM;

	for (i = 0; i < recv_buf->npages; i++) {
		recv_buf->dma_addrs[i] = dma_map_page(mdev->device,
						      recv_buf->page_list[i],
						      0, PAGE_SIZE,
						      DMA_FROM_DEVICE);
		if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i]))
			goto error;
	}
	return 0;

error:
	for (j = 0; j < i; j++)
		dma_unmap_single(mdev->device, recv_buf->dma_addrs[j],
				 PAGE_SIZE, DMA_FROM_DEVICE);

	kvfree(recv_buf->dma_addrs);
	return -ENOMEM;
}

static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev,
				      struct mlx5_vhca_recv_buf *recv_buf)
{
	int i;

	for (i = 0; i < recv_buf->npages; i++)
		dma_unmap_single(mdev->device, recv_buf->dma_addrs[i],
				 PAGE_SIZE, DMA_FROM_DEVICE);

	kvfree(recv_buf->dma_addrs);
}

static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
					  struct mlx5_vhca_qp *qp)
{
	struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;

	mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
	unregister_dma_recv_pages(mdev, recv_buf);
	free_recv_pages(&qp->recv_buf);
}

static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
					  struct mlx5_vhca_qp *qp, u32 pdn,
					  u64 rq_size)
{
	unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE);
	struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
	int err;

	err = alloc_recv_pages(recv_buf, npages);
	if (err < 0)
		return err;

	err = register_dma_recv_pages(mdev, recv_buf);
	if (err)
		goto end;

	err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey);
	if (err)
		goto err_create_mkey;

	return 0;

err_create_mkey:
	unregister_dma_recv_pages(mdev, recv_buf);
end:
	free_recv_pages(recv_buf);
	return err;
}

static void
_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	struct mlx5_core_dev *mdev = mvdev->mdev;

	lockdep_assert_held(&mvdev->state_mutex);

	if (!mvdev->log_active)
		return;

	WARN_ON(mvdev->mdev_detach);

	mlx5_eq_notifier_unregister(mdev, &tracker->nb);
	mlx5vf_cmd_destroy_tracker(mdev, tracker->id);
	mlx5vf_destroy_qp(mdev, tracker->fw_qp);
	mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp);
	mlx5vf_destroy_qp(mdev, tracker->host_qp);
	mlx5vf_destroy_cq(mdev, &tracker->cq);
	mlx5_core_dealloc_pd(mdev, tracker->pdn);
	mlx5_put_uars_page(mdev, tracker->uar);
	mvdev->log_active = false;
}

int mlx5vf_stop_page_tracker(struct vfio_device *vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mutex_lock(&mvdev->state_mutex);
	if (!mvdev->log_active)
		goto end;

	_mlx5vf_free_page_tracker_resources(mvdev);
	mvdev->log_active = false;
end:
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;
}

int mlx5vf_start_page_tracker(struct vfio_device *vdev,
			      struct rb_root_cached *ranges, u32 nnodes,
			      u64 *page_size)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	u8 log_tracked_page = ilog2(*page_size);
	struct mlx5_vhca_qp *host_qp;
	struct mlx5_vhca_qp *fw_qp;
	struct mlx5_core_dev *mdev;
	u32 max_msg_size = PAGE_SIZE;
	u64 rq_size = SZ_2M;
	u32 max_recv_wr;
	int err;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mdev_detach) {
		err = -ENOTCONN;
		goto end;
	}

	if (mvdev->log_active) {
		err = -EINVAL;
		goto end;
	}

	mdev = mvdev->mdev;
	memset(tracker, 0, sizeof(*tracker));
	tracker->uar = mlx5_get_uars_page(mdev);
	if (IS_ERR(tracker->uar)) {
		err = PTR_ERR(tracker->uar);
		goto end;
	}

	err = mlx5_core_alloc_pd(mdev, &tracker->pdn);
	if (err)
		goto err_uar;

	max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size);
	err = mlx5vf_create_cq(mdev, tracker, max_recv_wr);
	if (err)
		goto err_dealloc_pd;

	host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr);
	if (IS_ERR(host_qp)) {
		err = PTR_ERR(host_qp);
		goto err_cq;
	}

	host_qp->max_msg_size = max_msg_size;
	if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_min_page_size)) {
		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_min_page_size);
	} else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_max_page_size)) {
		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_max_page_size);
	}

	host_qp->tracked_page_size = (1ULL << log_tracked_page);
	err = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn,
					     rq_size);
	if (err)
		goto err_host_qp;

	fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0);
	if (IS_ERR(fw_qp)) {
		err = PTR_ERR(fw_qp);
		goto err_recv_resources;
	}

	err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true);
	if (err)
		goto err_activate;

	err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false);
	if (err)
		goto err_activate;

	tracker->host_qp = host_qp;
	tracker->fw_qp = fw_qp;
	err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes);
	if (err)
		goto err_activate;

	MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY);
	mlx5_eq_notifier_register(mdev, &tracker->nb);
	*page_size = host_qp->tracked_page_size;
	mvdev->log_active = true;
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;

err_activate:
	mlx5vf_destroy_qp(mdev, fw_qp);
err_recv_resources:
	mlx5vf_free_qp_recv_resources(mdev, host_qp);
err_host_qp:
	mlx5vf_destroy_qp(mdev, host_qp);
err_cq:
	mlx5vf_destroy_cq(mdev, &tracker->cq);
err_dealloc_pd:
	mlx5_core_dealloc_pd(mdev, tracker->pdn);
err_uar:
	mlx5_put_uars_page(mdev, tracker->uar);
end:
	mlx5vf_state_mutex_unlock(mvdev);
	return err;
}

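/*
 * Parse one report message received from the device: each entry carries the
 * address of a dirty page, which is marked in the IOVA bitmap at the tracked
 * page size granularity.
 */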
static void
set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp,
		  struct iova_bitmap *dirty)
{
	u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry);
	u32 nent = size / entry_size;
	struct page *page;
	u64 addr;
	u64 *buf;
	int i;

	if (WARN_ON(index >= qp->recv_buf.npages ||
		    (nent > qp->max_msg_size / entry_size)))
		return;

	page = qp->recv_buf.page_list[index];
	buf = kmap_local_page(page);
	for (i = 0; i < nent; i++) {
		addr = MLX5_GET(page_track_report_entry, buf + i,
				dirty_address_low);
		addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
				      dirty_address_high) << 32;
		iova_bitmap_set(dirty, addr, qp->tracked_page_size);
	}
	kunmap_local(buf);
}

static void
mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe,
	      struct iova_bitmap *dirty, int *tracker_status)
{
	u32 size;
	int ix;

	qp->rq.cc++;
	*tracker_status = be32_to_cpu(cqe->immediate) >> 28;
	size = be32_to_cpu(cqe->byte_cnt);
	ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1);

	/* zero length CQE, no data */
	WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING);
	if (size)
		set_report_output(size, ix, qp, dirty);

	qp->recv_buf.next_rq_offset = ix * qp->max_msg_size;
	mlx5vf_post_recv(qp);
}

static void *get_cqe(struct mlx5_vhca_cq *cq, int n)
{
	return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n);
}

static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n)
{
	void *cqe = get_cqe(cq, n & (cq->ncqe - 1));
	struct mlx5_cqe64 *cqe64;

	cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;

	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) {
		return cqe64;
	} else {
		return NULL;
	}
}

static int
mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp,
		   struct iova_bitmap *dirty, int *tracker_status)
{
	struct mlx5_cqe64 *cqe;
	u8 opcode;

	cqe = get_sw_cqe(cq, cq->mcq.cons_index);
	if (!cqe)
		return CQ_EMPTY;

	++cq->mcq.cons_index;
	/*
	 * Make sure we read CQ entry contents after we've checked the
	 * ownership bit.
	 */
	rmb();
	opcode = get_cqe_opcode(cqe);
	switch (opcode) {
	case MLX5_CQE_RESP_SEND_IMM:
		mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status);
		return CQ_OK;
	default:
		return CQ_POLL_ERR;
	}
}

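/*
 * Move the tracker to the reporting state for the given IOVA range and drain
 * dirty-page reports from the CQ into the caller's bitmap until the device
 * leaves the reporting state. When the CQ runs empty, it is re-armed and
 * polled once more before sleeping on tracker_comp, to avoid missing a
 * completion that raced with the arm.
 */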
int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
				  unsigned long length,
				  struct iova_bitmap *dirty)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	struct mlx5_vhca_cq *cq = &tracker->cq;
	struct mlx5_core_dev *mdev;
	int poll_err, err;

	mutex_lock(&mvdev->state_mutex);
	if (!mvdev->log_active) {
		err = -EINVAL;
		goto end;
	}

	if (mvdev->mdev_detach) {
		err = -ENOTCONN;
		goto end;
	}

	mdev = mvdev->mdev;
	err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
					MLX5_PAGE_TRACK_STATE_REPORTING);
	if (err)
		goto end;

	tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING;
	while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING &&
	       !tracker->is_err) {
		poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty,
					      &tracker->status);
		if (poll_err == CQ_EMPTY) {
			mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
				    cq->mcq.cons_index);
			poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp,
						      dirty, &tracker->status);
			if (poll_err == CQ_EMPTY) {
				wait_for_completion(&mvdev->tracker_comp);
				continue;
			}
		}
		if (poll_err == CQ_POLL_ERR) {
			err = -EIO;
			goto end;
		}
		mlx5_cq_set_ci(&cq->mcq);
	}

	if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR)
		tracker->is_err = true;

	if (tracker->is_err)
		err = -EIO;
end:
	mlx5vf_state_mutex_unlock(mvdev);
	return err;
}