1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB 2 /* 3 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved 4 */ 5 6 #include "cmd.h" 7 8 enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 }; 9 10 static int mlx5vf_is_migratable(struct mlx5_core_dev *mdev, u16 func_id) 11 { 12 int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); 13 void *query_cap = NULL, *cap; 14 int ret; 15 16 query_cap = kzalloc(query_sz, GFP_KERNEL); 17 if (!query_cap) 18 return -ENOMEM; 19 20 ret = mlx5_vport_get_other_func_cap(mdev, func_id, query_cap, 21 MLX5_CAP_GENERAL_2); 22 if (ret) 23 goto out; 24 25 cap = MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability); 26 if (!MLX5_GET(cmd_hca_cap_2, cap, migratable)) 27 ret = -EOPNOTSUPP; 28 out: 29 kfree(query_cap); 30 return ret; 31 } 32 33 static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id, 34 u16 *vhca_id); 35 static void 36 _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev); 37 38 int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) 39 { 40 struct mlx5_vf_migration_file *migf = mvdev->saving_migf; 41 u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {}; 42 u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {}; 43 int err; 44 45 lockdep_assert_held(&mvdev->state_mutex); 46 if (mvdev->mdev_detach) 47 return -ENOTCONN; 48 49 /* 50 * In case PRE_COPY is used, saving_migf is exposed while the device is 51 * running. Make sure to run only once there is no active save command. 52 * Running both in parallel, might end-up with a failure in the save 53 * command once it will try to turn on 'tracking' on a suspended device. 
54 */ 55 if (migf) { 56 err = wait_for_completion_interruptible(&migf->save_comp); 57 if (err) 58 return err; 59 } 60 61 MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA); 62 MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id); 63 MLX5_SET(suspend_vhca_in, in, op_mod, op_mod); 64 65 err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out); 66 if (migf) 67 complete(&migf->save_comp); 68 69 return err; 70 } 71 72 int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) 73 { 74 u32 out[MLX5_ST_SZ_DW(resume_vhca_out)] = {}; 75 u32 in[MLX5_ST_SZ_DW(resume_vhca_in)] = {}; 76 77 lockdep_assert_held(&mvdev->state_mutex); 78 if (mvdev->mdev_detach) 79 return -ENOTCONN; 80 81 MLX5_SET(resume_vhca_in, in, opcode, MLX5_CMD_OP_RESUME_VHCA); 82 MLX5_SET(resume_vhca_in, in, vhca_id, mvdev->vhca_id); 83 MLX5_SET(resume_vhca_in, in, op_mod, op_mod); 84 85 return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, in, out); 86 } 87 88 int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, 89 size_t *state_size, u64 *total_size, 90 u8 query_flags) 91 { 92 u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {}; 93 u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {}; 94 bool inc = query_flags & MLX5VF_QUERY_INC; 95 int ret; 96 97 lockdep_assert_held(&mvdev->state_mutex); 98 if (mvdev->mdev_detach) 99 return -ENOTCONN; 100 101 /* 102 * In case PRE_COPY is used, saving_migf is exposed while device is 103 * running. Make sure to run only once there is no active save command. 104 * Running both in parallel, might end-up with a failure in the 105 * incremental query command on un-tracked vhca. 
106 */ 107 if (inc) { 108 ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp); 109 if (ret) 110 return ret; 111 /* Upon cleanup, ignore previous pre_copy error state */ 112 if (mvdev->saving_migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR && 113 !(query_flags & MLX5VF_QUERY_CLEANUP)) { 114 /* 115 * In case we had a PRE_COPY error, only query full 116 * image for final image 117 */ 118 if (!(query_flags & MLX5VF_QUERY_FINAL)) { 119 *state_size = 0; 120 complete(&mvdev->saving_migf->save_comp); 121 return 0; 122 } 123 query_flags &= ~MLX5VF_QUERY_INC; 124 } 125 /* Block incremental query which is state-dependent */ 126 if (mvdev->saving_migf->state == MLX5_MIGF_STATE_ERROR) { 127 complete(&mvdev->saving_migf->save_comp); 128 return -ENODEV; 129 } 130 } 131 132 MLX5_SET(query_vhca_migration_state_in, in, opcode, 133 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE); 134 MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id); 135 MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0); 136 MLX5_SET(query_vhca_migration_state_in, in, incremental, 137 query_flags & MLX5VF_QUERY_INC); 138 MLX5_SET(query_vhca_migration_state_in, in, chunk, mvdev->chunk_mode); 139 140 ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in, 141 out); 142 if (inc) 143 complete(&mvdev->saving_migf->save_comp); 144 145 if (ret) 146 return ret; 147 148 *state_size = MLX5_GET(query_vhca_migration_state_out, out, 149 required_umem_size); 150 if (total_size) 151 *total_size = mvdev->chunk_mode ? 
152 MLX5_GET64(query_vhca_migration_state_out, out, 153 remaining_total_size) : *state_size; 154 155 return 0; 156 } 157 158 static void set_tracker_change_event(struct mlx5vf_pci_core_device *mvdev) 159 { 160 mvdev->tracker.object_changed = true; 161 complete(&mvdev->tracker_comp); 162 } 163 164 static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev) 165 { 166 /* Mark the tracker under an error and wake it up if it's running */ 167 mvdev->tracker.is_err = true; 168 complete(&mvdev->tracker_comp); 169 } 170 171 static int mlx5fv_vf_event(struct notifier_block *nb, 172 unsigned long event, void *data) 173 { 174 struct mlx5vf_pci_core_device *mvdev = 175 container_of(nb, struct mlx5vf_pci_core_device, nb); 176 177 switch (event) { 178 case MLX5_PF_NOTIFY_ENABLE_VF: 179 mutex_lock(&mvdev->state_mutex); 180 mvdev->mdev_detach = false; 181 mlx5vf_state_mutex_unlock(mvdev); 182 break; 183 case MLX5_PF_NOTIFY_DISABLE_VF: 184 mlx5vf_cmd_close_migratable(mvdev); 185 mutex_lock(&mvdev->state_mutex); 186 mvdev->mdev_detach = true; 187 mlx5vf_state_mutex_unlock(mvdev); 188 break; 189 default: 190 break; 191 } 192 193 return 0; 194 } 195 196 void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev) 197 { 198 if (!mvdev->migrate_cap) 199 return; 200 201 /* Must be done outside the lock to let it progress */ 202 set_tracker_error(mvdev); 203 mutex_lock(&mvdev->state_mutex); 204 mlx5vf_disable_fds(mvdev, NULL); 205 _mlx5vf_free_page_tracker_resources(mvdev); 206 mlx5vf_state_mutex_unlock(mvdev); 207 } 208 209 void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev) 210 { 211 if (!mvdev->migrate_cap) 212 return; 213 214 mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id, 215 &mvdev->nb); 216 destroy_workqueue(mvdev->cb_wq); 217 } 218 219 void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, 220 const struct vfio_migration_ops *mig_ops, 221 const struct vfio_log_ops *log_ops) 222 { 223 struct pci_dev *pdev = 
mvdev->core_device.pdev; 224 int ret; 225 226 if (!pdev->is_virtfn) 227 return; 228 229 mvdev->mdev = mlx5_vf_get_core_dev(pdev); 230 if (!mvdev->mdev) 231 return; 232 233 if (!MLX5_CAP_GEN(mvdev->mdev, migration)) 234 goto end; 235 236 if (!(MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) && 237 MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))) 238 goto end; 239 240 mvdev->vf_id = pci_iov_vf_id(pdev); 241 if (mvdev->vf_id < 0) 242 goto end; 243 244 ret = mlx5vf_is_migratable(mvdev->mdev, mvdev->vf_id + 1); 245 if (ret) 246 goto end; 247 248 if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1, 249 &mvdev->vhca_id)) 250 goto end; 251 252 mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0); 253 if (!mvdev->cb_wq) 254 goto end; 255 256 mutex_init(&mvdev->state_mutex); 257 spin_lock_init(&mvdev->reset_lock); 258 mvdev->nb.notifier_call = mlx5fv_vf_event; 259 ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id, 260 &mvdev->nb); 261 if (ret) { 262 destroy_workqueue(mvdev->cb_wq); 263 goto end; 264 } 265 266 mvdev->migrate_cap = 1; 267 mvdev->core_device.vdev.migration_flags = 268 VFIO_MIGRATION_STOP_COPY | 269 VFIO_MIGRATION_P2P | 270 VFIO_MIGRATION_PRE_COPY; 271 272 mvdev->core_device.vdev.mig_ops = mig_ops; 273 init_completion(&mvdev->tracker_comp); 274 if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization)) 275 mvdev->core_device.vdev.log_ops = log_ops; 276 277 if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks)) 278 mvdev->chunk_mode = 1; 279 280 end: 281 mlx5_vf_put_core_dev(mvdev->mdev); 282 } 283 284 static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id, 285 u16 *vhca_id) 286 { 287 u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {}; 288 int out_size; 289 void *out; 290 int ret; 291 292 out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out); 293 out = kzalloc(out_size, GFP_KERNEL); 294 if (!out) 295 return -ENOMEM; 296 297 MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); 298 MLX5_SET(query_hca_cap_in, in, 
other_function, 1); 299 MLX5_SET(query_hca_cap_in, in, function_id, function_id); 300 MLX5_SET(query_hca_cap_in, in, op_mod, 301 MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 | 302 HCA_CAP_OPMOD_GET_CUR); 303 304 ret = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out); 305 if (ret) 306 goto err_exec; 307 308 *vhca_id = MLX5_GET(query_hca_cap_out, out, 309 capability.cmd_hca_cap.vhca_id); 310 311 err_exec: 312 kfree(out); 313 return ret; 314 } 315 316 static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, 317 struct mlx5_vhca_data_buffer *buf, 318 struct mlx5_vhca_recv_buf *recv_buf, 319 u32 *mkey) 320 { 321 size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) : 322 recv_buf->npages; 323 int err = 0, inlen; 324 __be64 *mtt; 325 void *mkc; 326 u32 *in; 327 328 inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + 329 sizeof(*mtt) * round_up(npages, 2); 330 331 in = kvzalloc(inlen, GFP_KERNEL); 332 if (!in) 333 return -ENOMEM; 334 335 MLX5_SET(create_mkey_in, in, translations_octword_actual_size, 336 DIV_ROUND_UP(npages, 2)); 337 mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); 338 339 if (buf) { 340 struct sg_dma_page_iter dma_iter; 341 342 for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0) 343 *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter)); 344 } else { 345 int i; 346 347 for (i = 0; i < npages; i++) 348 *mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]); 349 } 350 351 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 352 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); 353 MLX5_SET(mkc, mkc, lr, 1); 354 MLX5_SET(mkc, mkc, lw, 1); 355 MLX5_SET(mkc, mkc, rr, 1); 356 MLX5_SET(mkc, mkc, rw, 1); 357 MLX5_SET(mkc, mkc, pd, pdn); 358 MLX5_SET(mkc, mkc, bsf_octword_size, 0); 359 MLX5_SET(mkc, mkc, qpn, 0xffffff); 360 MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); 361 MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2)); 362 MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE); 363 err = 
mlx5_core_create_mkey(mdev, mkey, in, inlen); 364 kvfree(in); 365 return err; 366 } 367 368 static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) 369 { 370 struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev; 371 struct mlx5_core_dev *mdev = mvdev->mdev; 372 int ret; 373 374 lockdep_assert_held(&mvdev->state_mutex); 375 if (mvdev->mdev_detach) 376 return -ENOTCONN; 377 378 if (buf->dmaed || !buf->allocated_length) 379 return -EINVAL; 380 381 ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); 382 if (ret) 383 return ret; 384 385 ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey); 386 if (ret) 387 goto err; 388 389 buf->dmaed = true; 390 391 return 0; 392 err: 393 dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); 394 return ret; 395 } 396 397 void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) 398 { 399 struct mlx5_vf_migration_file *migf = buf->migf; 400 struct sg_page_iter sg_iter; 401 402 lockdep_assert_held(&migf->mvdev->state_mutex); 403 WARN_ON(migf->mvdev->mdev_detach); 404 405 if (buf->dmaed) { 406 mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey); 407 dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt, 408 buf->dma_dir, 0); 409 } 410 411 /* Undo alloc_pages_bulk_array() */ 412 for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) 413 __free_page(sg_page_iter_page(&sg_iter)); 414 sg_free_append_table(&buf->table); 415 kfree(buf); 416 } 417 418 static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, 419 unsigned int npages) 420 { 421 unsigned int to_alloc = npages; 422 struct page **page_list; 423 unsigned long filled; 424 unsigned int to_fill; 425 int ret; 426 int i; 427 428 to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list)); 429 page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT); 430 if (!page_list) 431 return -ENOMEM; 432 433 do { 434 filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill, 435 page_list); 
436 if (!filled) { 437 ret = -ENOMEM; 438 goto err; 439 } 440 to_alloc -= filled; 441 ret = sg_alloc_append_table_from_pages( 442 &buf->table, page_list, filled, 0, 443 filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, 444 GFP_KERNEL_ACCOUNT); 445 446 if (ret) 447 goto err_append; 448 buf->allocated_length += filled * PAGE_SIZE; 449 /* clean input for another bulk allocation */ 450 memset(page_list, 0, filled * sizeof(*page_list)); 451 to_fill = min_t(unsigned int, to_alloc, 452 PAGE_SIZE / sizeof(*page_list)); 453 } while (to_alloc > 0); 454 455 kvfree(page_list); 456 return 0; 457 458 err_append: 459 for (i = filled - 1; i >= 0; i--) 460 __free_page(page_list[i]); 461 err: 462 kvfree(page_list); 463 return ret; 464 } 465 466 struct mlx5_vhca_data_buffer * 467 mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, 468 size_t length, 469 enum dma_data_direction dma_dir) 470 { 471 struct mlx5_vhca_data_buffer *buf; 472 int ret; 473 474 buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); 475 if (!buf) 476 return ERR_PTR(-ENOMEM); 477 478 buf->dma_dir = dma_dir; 479 buf->migf = migf; 480 if (length) { 481 ret = mlx5vf_add_migration_pages(buf, 482 DIV_ROUND_UP_ULL(length, PAGE_SIZE)); 483 if (ret) 484 goto end; 485 486 if (dma_dir != DMA_NONE) { 487 ret = mlx5vf_dma_data_buffer(buf); 488 if (ret) 489 goto end; 490 } 491 } 492 493 return buf; 494 end: 495 mlx5vf_free_data_buffer(buf); 496 return ERR_PTR(ret); 497 } 498 499 void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf) 500 { 501 spin_lock_irq(&buf->migf->list_lock); 502 buf->stop_copy_chunk_num = 0; 503 list_add_tail(&buf->buf_elm, &buf->migf->avail_list); 504 spin_unlock_irq(&buf->migf->list_lock); 505 } 506 507 struct mlx5_vhca_data_buffer * 508 mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, 509 size_t length, enum dma_data_direction dma_dir) 510 { 511 struct mlx5_vhca_data_buffer *buf, *temp_buf; 512 struct list_head free_list; 513 514 
lockdep_assert_held(&migf->mvdev->state_mutex); 515 if (migf->mvdev->mdev_detach) 516 return ERR_PTR(-ENOTCONN); 517 518 INIT_LIST_HEAD(&free_list); 519 520 spin_lock_irq(&migf->list_lock); 521 list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) { 522 if (buf->dma_dir == dma_dir) { 523 list_del_init(&buf->buf_elm); 524 if (buf->allocated_length >= length) { 525 spin_unlock_irq(&migf->list_lock); 526 goto found; 527 } 528 /* 529 * Prevent holding redundant buffers. Put in a free 530 * list and call at the end not under the spin lock 531 * (&migf->list_lock) to mlx5vf_free_data_buffer which 532 * might sleep. 533 */ 534 list_add(&buf->buf_elm, &free_list); 535 } 536 } 537 spin_unlock_irq(&migf->list_lock); 538 buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir); 539 540 found: 541 while ((temp_buf = list_first_entry_or_null(&free_list, 542 struct mlx5_vhca_data_buffer, buf_elm))) { 543 list_del(&temp_buf->buf_elm); 544 mlx5vf_free_data_buffer(temp_buf); 545 } 546 547 return buf; 548 } 549 550 static void 551 mlx5vf_save_callback_complete(struct mlx5_vf_migration_file *migf, 552 struct mlx5vf_async_data *async_data) 553 { 554 kvfree(async_data->out); 555 complete(&migf->save_comp); 556 fput(migf->filp); 557 } 558 559 void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) 560 { 561 struct mlx5vf_async_data *async_data = container_of(_work, 562 struct mlx5vf_async_data, work); 563 struct mlx5_vf_migration_file *migf = container_of(async_data, 564 struct mlx5_vf_migration_file, async_data); 565 566 mutex_lock(&migf->lock); 567 if (async_data->status) { 568 mlx5vf_put_data_buffer(async_data->buf); 569 if (async_data->header_buf) 570 mlx5vf_put_data_buffer(async_data->header_buf); 571 if (!async_data->stop_copy_chunk && 572 async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR) 573 migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR; 574 else 575 migf->state = MLX5_MIGF_STATE_ERROR; 576 wake_up_interruptible(&migf->poll_wait); 577 } 578 
mutex_unlock(&migf->lock); 579 mlx5vf_save_callback_complete(migf, async_data); 580 } 581 582 static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf, 583 size_t image_size, bool initial_pre_copy) 584 { 585 struct mlx5_vf_migration_file *migf = header_buf->migf; 586 struct mlx5_vf_migration_header header = {}; 587 unsigned long flags; 588 struct page *page; 589 u8 *to_buff; 590 591 header.record_size = cpu_to_le64(image_size); 592 header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY); 593 header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA); 594 page = mlx5vf_get_migration_page(header_buf, 0); 595 if (!page) 596 return -EINVAL; 597 to_buff = kmap_local_page(page); 598 memcpy(to_buff, &header, sizeof(header)); 599 kunmap_local(to_buff); 600 header_buf->length = sizeof(header); 601 header_buf->start_pos = header_buf->migf->max_pos; 602 migf->max_pos += header_buf->length; 603 spin_lock_irqsave(&migf->list_lock, flags); 604 list_add_tail(&header_buf->buf_elm, &migf->buf_list); 605 spin_unlock_irqrestore(&migf->list_lock, flags); 606 if (initial_pre_copy) 607 migf->pre_copy_initial_bytes += sizeof(header); 608 return 0; 609 } 610 611 static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) 612 { 613 struct mlx5vf_async_data *async_data = container_of(context, 614 struct mlx5vf_async_data, cb_work); 615 struct mlx5_vf_migration_file *migf = container_of(async_data, 616 struct mlx5_vf_migration_file, async_data); 617 618 if (!status) { 619 size_t next_required_umem_size = 0; 620 bool stop_copy_last_chunk; 621 size_t image_size; 622 unsigned long flags; 623 bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY && 624 !async_data->stop_copy_chunk; 625 626 image_size = MLX5_GET(save_vhca_state_out, async_data->out, 627 actual_image_size); 628 if (async_data->buf->stop_copy_chunk_num) 629 next_required_umem_size = MLX5_GET(save_vhca_state_out, 630 async_data->out, next_required_umem_size); 631 stop_copy_last_chunk = 
async_data->stop_copy_chunk && 632 !next_required_umem_size; 633 if (async_data->header_buf) { 634 status = add_buf_header(async_data->header_buf, image_size, 635 initial_pre_copy); 636 if (status) 637 goto err; 638 } 639 async_data->buf->length = image_size; 640 async_data->buf->start_pos = migf->max_pos; 641 migf->max_pos += async_data->buf->length; 642 spin_lock_irqsave(&migf->list_lock, flags); 643 list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); 644 if (async_data->buf->stop_copy_chunk_num) { 645 migf->num_ready_chunks++; 646 if (next_required_umem_size && 647 migf->num_ready_chunks >= MAX_NUM_CHUNKS) { 648 /* Delay the next SAVE till one chunk be consumed */ 649 migf->next_required_umem_size = next_required_umem_size; 650 next_required_umem_size = 0; 651 } 652 } 653 spin_unlock_irqrestore(&migf->list_lock, flags); 654 if (initial_pre_copy) { 655 migf->pre_copy_initial_bytes += image_size; 656 migf->state = MLX5_MIGF_STATE_PRE_COPY; 657 } 658 if (stop_copy_last_chunk) 659 migf->state = MLX5_MIGF_STATE_COMPLETE; 660 wake_up_interruptible(&migf->poll_wait); 661 if (next_required_umem_size) 662 mlx5vf_mig_file_set_save_work(migf, 663 /* Picking up the next chunk num */ 664 (async_data->buf->stop_copy_chunk_num % MAX_NUM_CHUNKS) + 1, 665 next_required_umem_size); 666 mlx5vf_save_callback_complete(migf, async_data); 667 return; 668 } 669 670 err: 671 /* The error flow can't run from an interrupt context */ 672 if (status == -EREMOTEIO) { 673 status = MLX5_GET(save_vhca_state_out, async_data->out, status); 674 /* Failed in FW, print cmd out failure details */ 675 mlx5_cmd_out_err(migf->mvdev->mdev, MLX5_CMD_OP_SAVE_VHCA_STATE, 0, 676 async_data->out); 677 } 678 679 async_data->status = status; 680 queue_work(migf->mvdev->cb_wq, &async_data->work); 681 } 682 683 int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, 684 struct mlx5_vf_migration_file *migf, 685 struct mlx5_vhca_data_buffer *buf, bool inc, 686 bool track) 687 { 688 u32 out_size 
= MLX5_ST_SZ_BYTES(save_vhca_state_out); 689 u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; 690 struct mlx5_vhca_data_buffer *header_buf = NULL; 691 struct mlx5vf_async_data *async_data; 692 bool pre_copy_cleanup = false; 693 int err; 694 695 lockdep_assert_held(&mvdev->state_mutex); 696 if (mvdev->mdev_detach) 697 return -ENOTCONN; 698 699 err = wait_for_completion_interruptible(&migf->save_comp); 700 if (err) 701 return err; 702 703 if ((migf->state == MLX5_MIGF_STATE_PRE_COPY || 704 migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) && !track && !inc) 705 pre_copy_cleanup = true; 706 707 if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) 708 /* 709 * In case we had a PRE_COPY error, SAVE is triggered only for 710 * the final image, read device full image. 711 */ 712 inc = false; 713 714 MLX5_SET(save_vhca_state_in, in, opcode, 715 MLX5_CMD_OP_SAVE_VHCA_STATE); 716 MLX5_SET(save_vhca_state_in, in, op_mod, 0); 717 MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id); 718 MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey); 719 MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); 720 MLX5_SET(save_vhca_state_in, in, incremental, inc); 721 MLX5_SET(save_vhca_state_in, in, set_track, track); 722 723 async_data = &migf->async_data; 724 async_data->buf = buf; 725 async_data->stop_copy_chunk = (!track && !pre_copy_cleanup); 726 async_data->out = kvzalloc(out_size, GFP_KERNEL); 727 if (!async_data->out) { 728 err = -ENOMEM; 729 goto err_out; 730 } 731 732 if (async_data->stop_copy_chunk) { 733 u8 header_idx = buf->stop_copy_chunk_num ? 
734 buf->stop_copy_chunk_num - 1 : 0; 735 736 header_buf = migf->buf_header[header_idx]; 737 migf->buf_header[header_idx] = NULL; 738 } 739 740 if (!header_buf) { 741 header_buf = mlx5vf_get_data_buffer(migf, 742 sizeof(struct mlx5_vf_migration_header), DMA_NONE); 743 if (IS_ERR(header_buf)) { 744 err = PTR_ERR(header_buf); 745 goto err_free; 746 } 747 } 748 749 if (async_data->stop_copy_chunk) 750 migf->state = MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK; 751 752 async_data->header_buf = header_buf; 753 get_file(migf->filp); 754 err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in), 755 async_data->out, 756 out_size, mlx5vf_save_callback, 757 &async_data->cb_work); 758 if (err) 759 goto err_exec; 760 761 return 0; 762 763 err_exec: 764 if (header_buf) 765 mlx5vf_put_data_buffer(header_buf); 766 fput(migf->filp); 767 err_free: 768 kvfree(async_data->out); 769 err_out: 770 complete(&migf->save_comp); 771 return err; 772 } 773 774 int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, 775 struct mlx5_vf_migration_file *migf, 776 struct mlx5_vhca_data_buffer *buf) 777 { 778 u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {}; 779 u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {}; 780 int err; 781 782 lockdep_assert_held(&mvdev->state_mutex); 783 if (mvdev->mdev_detach) 784 return -ENOTCONN; 785 786 if (!buf->dmaed) { 787 err = mlx5vf_dma_data_buffer(buf); 788 if (err) 789 return err; 790 } 791 792 MLX5_SET(load_vhca_state_in, in, opcode, 793 MLX5_CMD_OP_LOAD_VHCA_STATE); 794 MLX5_SET(load_vhca_state_in, in, op_mod, 0); 795 MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id); 796 MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey); 797 MLX5_SET(load_vhca_state_in, in, size, buf->length); 798 return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out); 799 } 800 801 int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf) 802 { 803 int err; 804 805 lockdep_assert_held(&migf->mvdev->state_mutex); 806 if (migf->mvdev->mdev_detach) 807 return -ENOTCONN; 
808 809 err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn); 810 return err; 811 } 812 813 void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf) 814 { 815 lockdep_assert_held(&migf->mvdev->state_mutex); 816 if (migf->mvdev->mdev_detach) 817 return; 818 819 mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn); 820 } 821 822 void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) 823 { 824 struct mlx5_vhca_data_buffer *entry; 825 int i; 826 827 lockdep_assert_held(&migf->mvdev->state_mutex); 828 WARN_ON(migf->mvdev->mdev_detach); 829 830 for (i = 0; i < MAX_NUM_CHUNKS; i++) { 831 if (migf->buf[i]) { 832 mlx5vf_free_data_buffer(migf->buf[i]); 833 migf->buf[i] = NULL; 834 } 835 836 if (migf->buf_header[i]) { 837 mlx5vf_free_data_buffer(migf->buf_header[i]); 838 migf->buf_header[i] = NULL; 839 } 840 } 841 842 list_splice(&migf->avail_list, &migf->buf_list); 843 844 while ((entry = list_first_entry_or_null(&migf->buf_list, 845 struct mlx5_vhca_data_buffer, buf_elm))) { 846 list_del(&entry->buf_elm); 847 mlx5vf_free_data_buffer(entry); 848 } 849 850 mlx5vf_cmd_dealloc_pd(migf); 851 } 852 853 static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev, 854 struct mlx5vf_pci_core_device *mvdev, 855 struct rb_root_cached *ranges, u32 nnodes) 856 { 857 int max_num_range = 858 MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range); 859 struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker; 860 int record_size = MLX5_ST_SZ_BYTES(page_track_range); 861 u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; 862 struct interval_tree_node *node = NULL; 863 u64 total_ranges_len = 0; 864 u32 num_ranges = nnodes; 865 u8 log_addr_space_size; 866 void *range_list_ptr; 867 void *obj_context; 868 void *cmd_hdr; 869 int inlen; 870 void *in; 871 int err; 872 int i; 873 874 if (num_ranges > max_num_range) { 875 vfio_combine_iova_ranges(ranges, nnodes, max_num_range); 876 num_ranges = max_num_range; 877 } 878 879 inlen = 
MLX5_ST_SZ_BYTES(create_page_track_obj_in) + 880 record_size * num_ranges; 881 in = kzalloc(inlen, GFP_KERNEL); 882 if (!in) 883 return -ENOMEM; 884 885 cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in, 886 general_obj_in_cmd_hdr); 887 MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, 888 MLX5_CMD_OP_CREATE_GENERAL_OBJECT); 889 MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, 890 MLX5_OBJ_TYPE_PAGE_TRACK); 891 obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context); 892 MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id); 893 MLX5_SET(page_track, obj_context, track_type, 1); 894 MLX5_SET(page_track, obj_context, log_page_size, 895 ilog2(tracker->host_qp->tracked_page_size)); 896 MLX5_SET(page_track, obj_context, log_msg_size, 897 ilog2(tracker->host_qp->max_msg_size)); 898 MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn); 899 MLX5_SET(page_track, obj_context, num_ranges, num_ranges); 900 901 range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range); 902 node = interval_tree_iter_first(ranges, 0, ULONG_MAX); 903 for (i = 0; i < num_ranges; i++) { 904 void *addr_range_i_base = range_list_ptr + record_size * i; 905 unsigned long length = node->last - node->start + 1; 906 907 MLX5_SET64(page_track_range, addr_range_i_base, start_address, 908 node->start); 909 MLX5_SET64(page_track_range, addr_range_i_base, length, length); 910 total_ranges_len += length; 911 node = interval_tree_iter_next(node, 0, ULONG_MAX); 912 } 913 914 WARN_ON(node); 915 log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len)); 916 if (log_addr_space_size < 917 (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) || 918 log_addr_space_size > 919 (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) { 920 err = -EOPNOTSUPP; 921 goto out; 922 } 923 924 MLX5_SET(page_track, obj_context, log_addr_space_size, 925 log_addr_space_size); 926 err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); 927 if (err) 928 
goto out; 929 930 tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); 931 out: 932 kfree(in); 933 return err; 934 } 935 936 static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev, 937 u32 tracker_id) 938 { 939 u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; 940 u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; 941 942 MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); 943 MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK); 944 MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id); 945 946 return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); 947 } 948 949 static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev, 950 u32 tracker_id, unsigned long iova, 951 unsigned long length, u32 tracker_state) 952 { 953 u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {}; 954 u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; 955 void *obj_context; 956 void *cmd_hdr; 957 958 cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr); 959 MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT); 960 MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK); 961 MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker_id); 962 963 obj_context = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context); 964 MLX5_SET64(page_track, obj_context, modify_field_select, 0x3); 965 MLX5_SET64(page_track, obj_context, range_start_address, iova); 966 MLX5_SET64(page_track, obj_context, length, length); 967 MLX5_SET(page_track, obj_context, state, tracker_state); 968 969 return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); 970 } 971 972 static int mlx5vf_cmd_query_tracker(struct mlx5_core_dev *mdev, 973 struct mlx5_vhca_page_tracker *tracker) 974 { 975 u32 out[MLX5_ST_SZ_DW(query_page_track_obj_out)] = {}; 976 u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; 977 void *obj_context; 978 void *cmd_hdr; 979 int err; 980 
981 cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr); 982 MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT); 983 MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK); 984 MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker->id); 985 986 err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); 987 if (err) 988 return err; 989 990 obj_context = MLX5_ADDR_OF(query_page_track_obj_out, out, obj_context); 991 tracker->status = MLX5_GET(page_track, obj_context, state); 992 return 0; 993 } 994 995 static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev, 996 struct mlx5_vhca_cq_buf *buf, int nent, 997 int cqe_size) 998 { 999 struct mlx5_frag_buf *frag_buf = &buf->frag_buf; 1000 u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0); 1001 u8 log_wq_sz = ilog2(cqe_size); 1002 int err; 1003 1004 err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf, 1005 mdev->priv.numa_node); 1006 if (err) 1007 return err; 1008 1009 mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc); 1010 buf->cqe_size = cqe_size; 1011 buf->nent = nent; 1012 return 0; 1013 } 1014 1015 static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf) 1016 { 1017 struct mlx5_cqe64 *cqe64; 1018 void *cqe; 1019 int i; 1020 1021 for (i = 0; i < buf->nent; i++) { 1022 cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i); 1023 cqe64 = buf->cqe_size == 64 ? 
cqe : cqe + 64; 1024 cqe64->op_own = MLX5_CQE_INVALID << 4; 1025 } 1026 } 1027 1028 static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev, 1029 struct mlx5_vhca_cq *cq) 1030 { 1031 mlx5_core_destroy_cq(mdev, &cq->mcq); 1032 mlx5_frag_buf_free(mdev, &cq->buf.frag_buf); 1033 mlx5_db_free(mdev, &cq->db); 1034 } 1035 1036 static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type) 1037 { 1038 if (type != MLX5_EVENT_TYPE_CQ_ERROR) 1039 return; 1040 1041 set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device, 1042 tracker.cq.mcq)); 1043 } 1044 1045 static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type, 1046 void *data) 1047 { 1048 struct mlx5_vhca_page_tracker *tracker = 1049 mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb); 1050 struct mlx5vf_pci_core_device *mvdev = container_of( 1051 tracker, struct mlx5vf_pci_core_device, tracker); 1052 struct mlx5_eqe_obj_change *object; 1053 struct mlx5_eqe *eqe = data; 1054 u8 event_type = (u8)type; 1055 u8 queue_type; 1056 u32 obj_id; 1057 int qp_num; 1058 1059 switch (event_type) { 1060 case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: 1061 case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: 1062 case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: 1063 queue_type = eqe->data.qp_srq.type; 1064 if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP) 1065 break; 1066 qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; 1067 if (qp_num != tracker->host_qp->qpn && 1068 qp_num != tracker->fw_qp->qpn) 1069 break; 1070 set_tracker_error(mvdev); 1071 break; 1072 case MLX5_EVENT_TYPE_OBJECT_CHANGE: 1073 object = &eqe->data.obj_change; 1074 obj_id = be32_to_cpu(object->obj_id); 1075 if (obj_id == tracker->id) 1076 set_tracker_change_event(mvdev); 1077 break; 1078 default: 1079 break; 1080 } 1081 1082 return NOTIFY_OK; 1083 } 1084 1085 static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq, 1086 struct mlx5_eqe *eqe) 1087 { 1088 struct mlx5vf_pci_core_device *mvdev = 1089 container_of(mcq, struct mlx5vf_pci_core_device, 1090 
/*
 * Create and arm the completion queue embedded in @tracker (tracker->cq).
 *
 * @mdev:    mlx5 core device the CQ is created on.
 * @tracker: page tracker owning the CQ; its UAR (tracker->uar) must already
 *           be set, since the UAR index and mapping are used below.
 * @ncqe:    requested number of CQEs; rounded up to a power of two.
 *
 * Returns 0 on success or a negative errno.  On failure everything that was
 * allocated here (doorbell record, CQ buffer, command mailbox) is released.
 */
static int mlx5vf_create_cq(struct mlx5_core_dev *mdev,
			    struct mlx5_vhca_page_tracker *tracker,
			    size_t ncqe)
{
	/* 128-byte CQEs only when the cache line is 128 bytes, else 64. */
	int cqe_size = cache_line_size() == 128 ? 128 : 64;
	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
	struct mlx5_vhca_cq *cq;
	int inlen, err, eqn;
	void *cqc, *in;
	__be64 *pas;
	int vector;

	cq = &tracker->cq;
	ncqe = roundup_pow_of_two(ncqe);
	err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node);
	if (err)
		return err;

	cq->ncqe = ncqe;
	/* First doorbell word tracks the consumer index, the next one arming. */
	cq->mcq.set_ci_db = cq->db.db;
	cq->mcq.arm_db = cq->db.db + 1;
	cq->mcq.cqe_sz = cqe_size;
	err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size);
	if (err)
		goto err_db_free;

	init_cq_frag_buf(&cq->buf);
	/* Command input: fixed part plus one PAS entry per buffer page. */
	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
		cq->buf.frag_buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_buff;
	}

	/* Pick a completion vector based on the CPU we happen to run on. */
	vector = raw_smp_processor_id() % mlx5_comp_vectors_max(mdev);
	err = mlx5_comp_eqn_get(mdev, vector, &eqn);
	if (err)
		goto err_vec;

	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
	MLX5_SET(cqc, cqc, uar_page, tracker->uar->index);
	MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift -
		 MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
	mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas);
	/* Callbacks must be in place before the CQ exists and can fire. */
	cq->mcq.comp = mlx5vf_cq_complete;
	cq->mcq.event = mlx5vf_cq_event;
	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
	if (err)
		goto err_vec;

	/* Request a notification for the next completion. */
	mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
		    cq->mcq.cons_index);
	kvfree(in);
	return 0;

	/* Unwind in reverse order of acquisition. */
err_vec:
	kvfree(in);
err_buff:
	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
err_db_free:
	mlx5_db_free(mdev, &cq->db);
	return err;
}
1220 MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn); 1221 MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4); 1222 MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz); 1223 MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ); 1224 MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma); 1225 mlx5_fill_page_frag_array(&qp->buf, 1226 (__be64 *)MLX5_ADDR_OF(create_qp_in, 1227 in, pas)); 1228 } else { 1229 MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ); 1230 } 1231 1232 MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); 1233 err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); 1234 kvfree(in); 1235 if (err) 1236 goto err_in; 1237 1238 qp->qpn = MLX5_GET(create_qp_out, out, qpn); 1239 return qp; 1240 1241 err_in: 1242 if (max_recv_wr) 1243 mlx5_frag_buf_free(mdev, &qp->buf); 1244 err_db_free: 1245 mlx5_db_free(mdev, &qp->db); 1246 err_free: 1247 kfree(qp); 1248 return ERR_PTR(err); 1249 } 1250 1251 static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp) 1252 { 1253 struct mlx5_wqe_data_seg *data; 1254 unsigned int ix; 1255 1256 WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt); 1257 ix = qp->rq.pc & (qp->rq.wqe_cnt - 1); 1258 data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix); 1259 data->byte_count = cpu_to_be32(qp->max_msg_size); 1260 data->lkey = cpu_to_be32(qp->recv_buf.mkey); 1261 data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset); 1262 qp->rq.pc++; 1263 /* Make sure that descriptors are written before doorbell record. 
*/ 1264 dma_wmb(); 1265 *qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff); 1266 } 1267 1268 static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev, 1269 struct mlx5_vhca_qp *qp, u32 remote_qpn, 1270 bool host_qp) 1271 { 1272 u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {}; 1273 u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {}; 1274 u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {}; 1275 void *qpc; 1276 int ret; 1277 1278 /* Init */ 1279 qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc); 1280 MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1); 1281 MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED); 1282 MLX5_SET(qpc, qpc, rre, 1); 1283 MLX5_SET(qpc, qpc, rwe, 1); 1284 MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP); 1285 MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn); 1286 ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in); 1287 if (ret) 1288 return ret; 1289 1290 if (host_qp) { 1291 struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; 1292 int i; 1293 1294 for (i = 0; i < qp->rq.wqe_cnt; i++) { 1295 mlx5vf_post_recv(qp); 1296 recv_buf->next_rq_offset += qp->max_msg_size; 1297 } 1298 } 1299 1300 /* RTR */ 1301 qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc); 1302 MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn); 1303 MLX5_SET(qpc, qpc, mtu, IB_MTU_4096); 1304 MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg)); 1305 MLX5_SET(qpc, qpc, remote_qpn, remote_qpn); 1306 MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1); 1307 MLX5_SET(qpc, qpc, primary_address_path.fl, 1); 1308 MLX5_SET(qpc, qpc, min_rnr_nak, 1); 1309 MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP); 1310 MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn); 1311 ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in); 1312 if (ret || host_qp) 1313 return ret; 1314 1315 /* RTS */ 1316 qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc); 1317 MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn); 1318 MLX5_SET(qpc, qpc, retry_count, 7); 1319 MLX5_SET(qpc, qpc, rnr_retry, 
7); /* Infinite retry if RNR NACK */ 1320 MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */ 1321 MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP); 1322 MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn); 1323 1324 return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in); 1325 } 1326 1327 static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev, 1328 struct mlx5_vhca_qp *qp) 1329 { 1330 u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; 1331 1332 MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); 1333 MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); 1334 mlx5_cmd_exec_in(mdev, destroy_qp, in); 1335 1336 mlx5_frag_buf_free(mdev, &qp->buf); 1337 mlx5_db_free(mdev, &qp->db); 1338 kfree(qp); 1339 } 1340 1341 static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf) 1342 { 1343 int i; 1344 1345 /* Undo alloc_pages_bulk_array() */ 1346 for (i = 0; i < recv_buf->npages; i++) 1347 __free_page(recv_buf->page_list[i]); 1348 1349 kvfree(recv_buf->page_list); 1350 } 1351 1352 static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf, 1353 unsigned int npages) 1354 { 1355 unsigned int filled = 0, done = 0; 1356 int i; 1357 1358 recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list), 1359 GFP_KERNEL_ACCOUNT); 1360 if (!recv_buf->page_list) 1361 return -ENOMEM; 1362 1363 for (;;) { 1364 filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, 1365 npages - done, 1366 recv_buf->page_list + done); 1367 if (!filled) 1368 goto err; 1369 1370 done += filled; 1371 if (done == npages) 1372 break; 1373 } 1374 1375 recv_buf->npages = npages; 1376 return 0; 1377 1378 err: 1379 for (i = 0; i < npages; i++) { 1380 if (recv_buf->page_list[i]) 1381 __free_page(recv_buf->page_list[i]); 1382 } 1383 1384 kvfree(recv_buf->page_list); 1385 return -ENOMEM; 1386 } 1387 1388 static int register_dma_recv_pages(struct mlx5_core_dev *mdev, 1389 struct mlx5_vhca_recv_buf *recv_buf) 1390 { 1391 int i, j; 1392 1393 recv_buf->dma_addrs = kvcalloc(recv_buf->npages, 1394 
sizeof(*recv_buf->dma_addrs), 1395 GFP_KERNEL_ACCOUNT); 1396 if (!recv_buf->dma_addrs) 1397 return -ENOMEM; 1398 1399 for (i = 0; i < recv_buf->npages; i++) { 1400 recv_buf->dma_addrs[i] = dma_map_page(mdev->device, 1401 recv_buf->page_list[i], 1402 0, PAGE_SIZE, 1403 DMA_FROM_DEVICE); 1404 if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i])) 1405 goto error; 1406 } 1407 return 0; 1408 1409 error: 1410 for (j = 0; j < i; j++) 1411 dma_unmap_single(mdev->device, recv_buf->dma_addrs[j], 1412 PAGE_SIZE, DMA_FROM_DEVICE); 1413 1414 kvfree(recv_buf->dma_addrs); 1415 return -ENOMEM; 1416 } 1417 1418 static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev, 1419 struct mlx5_vhca_recv_buf *recv_buf) 1420 { 1421 int i; 1422 1423 for (i = 0; i < recv_buf->npages; i++) 1424 dma_unmap_single(mdev->device, recv_buf->dma_addrs[i], 1425 PAGE_SIZE, DMA_FROM_DEVICE); 1426 1427 kvfree(recv_buf->dma_addrs); 1428 } 1429 1430 static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev, 1431 struct mlx5_vhca_qp *qp) 1432 { 1433 struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; 1434 1435 mlx5_core_destroy_mkey(mdev, recv_buf->mkey); 1436 unregister_dma_recv_pages(mdev, recv_buf); 1437 free_recv_pages(&qp->recv_buf); 1438 } 1439 1440 static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev, 1441 struct mlx5_vhca_qp *qp, u32 pdn, 1442 u64 rq_size) 1443 { 1444 unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE); 1445 struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; 1446 int err; 1447 1448 err = alloc_recv_pages(recv_buf, npages); 1449 if (err < 0) 1450 return err; 1451 1452 err = register_dma_recv_pages(mdev, recv_buf); 1453 if (err) 1454 goto end; 1455 1456 err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey); 1457 if (err) 1458 goto err_create_mkey; 1459 1460 return 0; 1461 1462 err_create_mkey: 1463 unregister_dma_recv_pages(mdev, recv_buf); 1464 end: 1465 free_recv_pages(recv_buf); 1466 return err; 1467 } 1468 1469 static void 
/*
 * Start dirty-page tracking for the device behind @vdev.
 *
 * @vdev:      VFIO device embedded in a struct mlx5vf_pci_core_device.
 * @ranges:    ranges to track; passed unchanged to mlx5vf_create_tracker().
 * @nnodes:    number of nodes in @ranges; passed unchanged as well.
 * @page_size: in: requested tracking granularity in bytes;
 *             out (on success): granularity actually used, after clamping
 *             its log2 to the device's min/max page-size capabilities.
 *
 * Runs under mvdev->state_mutex, which is released through
 * mlx5vf_state_mutex_unlock() on every exit path.
 *
 * Returns 0 on success, -ENOTCONN if the mlx5 device is detached, -EINVAL if
 * tracking is already active, or the error of the step that failed.  On
 * failure every resource acquired so far is released in reverse order.
 */
int mlx5vf_start_page_tracker(struct vfio_device *vdev,
			      struct rb_root_cached *ranges, u32 nnodes,
			      u64 *page_size)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	u8 log_tracked_page = ilog2(*page_size);
	struct mlx5_vhca_qp *host_qp;
	struct mlx5_vhca_qp *fw_qp;
	struct mlx5_core_dev *mdev;
	u32 log_max_msg_size;
	u32 max_msg_size;
	/* Default size of the host receive buffer; may be raised below. */
	u64 rq_size = SZ_2M;
	u32 max_recv_wr;
	int err;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mdev_detach) {
		err = -ENOTCONN;
		goto end;
	}

	if (mvdev->log_active) {
		err = -EINVAL;
		goto end;
	}

	mdev = mvdev->mdev;
	log_max_msg_size = MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_msg_size);
	max_msg_size = (1ULL << log_max_msg_size);
	/* The RQ must hold at least 4 WQEs/messages for successful QP creation */
	if (rq_size < 4 * max_msg_size)
		rq_size = 4 * max_msg_size;

	/* Start from a clean tracker; state of a previous run is discarded. */
	memset(tracker, 0, sizeof(*tracker));
	tracker->uar = mlx5_get_uars_page(mdev);
	if (IS_ERR(tracker->uar)) {
		err = PTR_ERR(tracker->uar);
		goto end;
	}

	err = mlx5_core_alloc_pd(mdev, &tracker->pdn);
	if (err)
		goto err_uar;

	/* One receive WQE (and one CQE) per maximum-sized message. */
	max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size);
	err = mlx5vf_create_cq(mdev, tracker, max_recv_wr);
	if (err)
		goto err_dealloc_pd;

	/* Host side QP: owns the receive queue and the receive buffer. */
	host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr);
	if (IS_ERR(host_qp)) {
		err = PTR_ERR(host_qp);
		goto err_cq;
	}

	host_qp->max_msg_size = max_msg_size;
	/* Clamp the requested granularity into the range the device supports. */
	if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_min_page_size)) {
		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_min_page_size);
	} else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_max_page_size)) {
		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_max_page_size);
	}

	host_qp->tracked_page_size = (1ULL << log_tracked_page);
	err = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn,
					     rq_size);
	if (err)
		goto err_host_qp;

	/* Firmware side QP: created with max_recv_wr == 0, i.e. no RQ buffer. */
	fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0);
	if (IS_ERR(fw_qp)) {
		err = PTR_ERR(fw_qp);
		goto err_recv_resources;
	}

	/* Connect the two QPs to each other. */
	err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true);
	if (err)
		goto err_activate;

	err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false);
	if (err)
		goto err_activate;

	tracker->host_qp = host_qp;
	tracker->fw_qp = fw_qp;
	err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes);
	if (err)
		goto err_activate;

	MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY);
	mlx5_eq_notifier_register(mdev, &tracker->nb);
	/* Report back the granularity that is really in effect. */
	*page_size = host_qp->tracked_page_size;
	mvdev->log_active = true;
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;

	/* Error unwinding, in reverse order of acquisition. */
err_activate:
	mlx5vf_destroy_qp(mdev, fw_qp);
err_recv_resources:
	mlx5vf_free_qp_recv_resources(mdev, host_qp);
err_host_qp:
	mlx5vf_destroy_qp(mdev, host_qp);
err_cq:
	mlx5vf_destroy_cq(mdev, &tracker->cq);
err_dealloc_pd:
	mlx5_core_dealloc_pd(mdev, tracker->pdn);
err_uar:
	mlx5_put_uars_page(mdev, tracker->uar);
end:
	mlx5vf_state_mutex_unlock(mvdev);
	return err;
}
buf_offset += (nent_to_set * entry_size); 1666 nent -= nent_to_set; 1667 } while (nent); 1668 } 1669 1670 static void 1671 mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe, 1672 struct iova_bitmap *dirty, int *tracker_status) 1673 { 1674 u32 size; 1675 int ix; 1676 1677 qp->rq.cc++; 1678 *tracker_status = be32_to_cpu(cqe->immediate) >> 28; 1679 size = be32_to_cpu(cqe->byte_cnt); 1680 ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1); 1681 1682 /* zero length CQE, no data */ 1683 WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING); 1684 if (size) 1685 set_report_output(size, ix, qp, dirty); 1686 1687 qp->recv_buf.next_rq_offset = ix * qp->max_msg_size; 1688 mlx5vf_post_recv(qp); 1689 } 1690 1691 static void *get_cqe(struct mlx5_vhca_cq *cq, int n) 1692 { 1693 return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n); 1694 } 1695 1696 static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n) 1697 { 1698 void *cqe = get_cqe(cq, n & (cq->ncqe - 1)); 1699 struct mlx5_cqe64 *cqe64; 1700 1701 cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; 1702 1703 if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) && 1704 !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) { 1705 return cqe64; 1706 } else { 1707 return NULL; 1708 } 1709 } 1710 1711 static int 1712 mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp, 1713 struct iova_bitmap *dirty, int *tracker_status) 1714 { 1715 struct mlx5_cqe64 *cqe; 1716 u8 opcode; 1717 1718 cqe = get_sw_cqe(cq, cq->mcq.cons_index); 1719 if (!cqe) 1720 return CQ_EMPTY; 1721 1722 ++cq->mcq.cons_index; 1723 /* 1724 * Make sure we read CQ entry contents after we've checked the 1725 * ownership bit. 
/*
 * Collect the dirty pages reported by the device into @dirty.
 *
 * @vdev:   VFIO device embedded in a struct mlx5vf_pci_core_device.
 * @iova:   start of the range, forwarded to mlx5vf_cmd_modify_tracker().
 * @length: length of the range, forwarded to mlx5vf_cmd_modify_tracker().
 * @dirty:  bitmap that receives the reported dirty addresses.
 *
 * Runs under mvdev->state_mutex.  The tracker is switched to
 * MLX5_PAGE_TRACK_STATE_REPORTING and the tracker CQ is drained until a
 * completion moves tracker->status away from REPORTING or the tracker is
 * flagged as being in error.
 *
 * Returns 0 on success, -EINVAL if tracking is not active, -ENOTCONN if the
 * mlx5 device is detached, -EIO on a tracker/CQ error, or the error returned
 * by the modify/query tracker commands.
 */
int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
				  unsigned long length,
				  struct iova_bitmap *dirty)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	struct mlx5_vhca_cq *cq = &tracker->cq;
	struct mlx5_core_dev *mdev;
	int poll_err, err;

	mutex_lock(&mvdev->state_mutex);
	if (!mvdev->log_active) {
		err = -EINVAL;
		goto end;
	}

	if (mvdev->mdev_detach) {
		err = -ENOTCONN;
		goto end;
	}

	/* A tracker that already failed stays failed. */
	if (tracker->is_err) {
		err = -EIO;
		goto end;
	}

	mdev = mvdev->mdev;
	err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
					MLX5_PAGE_TRACK_STATE_REPORTING);
	if (err)
		goto end;

	tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING;
	while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING &&
	       !tracker->is_err) {
		poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty,
					      &tracker->status);
		if (poll_err == CQ_EMPTY) {
			/*
			 * Nothing pending: arm the CQ, then poll once more
			 * before sleeping - presumably so that a completion
			 * which arrived before the arm is not missed.
			 */
			mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
				    cq->mcq.cons_index);
			poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp,
						      dirty, &tracker->status);
			if (poll_err == CQ_EMPTY) {
				/* Sleep until tracker_comp is completed. */
				wait_for_completion(&mvdev->tracker_comp);
				if (tracker->object_changed) {
					/* Consume the flag, then re-query the tracker. */
					tracker->object_changed = false;
					err = mlx5vf_cmd_query_tracker(mdev, tracker);
					if (err)
						goto end;
				}
				/* Re-evaluate the loop condition and poll again. */
				continue;
			}
		}
		if (poll_err == CQ_POLL_ERR) {
			err = -EIO;
			goto end;
		}
		/* A CQE was consumed: publish the new consumer index. */
		mlx5_cq_set_ci(&cq->mcq);
	}

	/* An ERROR status reported by the device is sticky. */
	if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR)
		tracker->is_err = true;

	if (tracker->is_err)
		err = -EIO;
end:
	mlx5vf_state_mutex_unlock(mvdev);
	return err;
}