1 /* 2 * Copyright (c) 2005 Ammasso, Inc. All rights reserved. 3 * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 32 */ 33 34 #include <sys/cdefs.h> 35 __FBSDID("$FreeBSD$"); 36 37 #include <sys/ctype.h> 38 39 #include <sys/param.h> 40 #include <sys/condvar.h> 41 #include <sys/systm.h> 42 #include <sys/kernel.h> 43 #include <sys/socket.h> 44 #include <sys/endian.h> 45 #include <sys/limits.h> 46 #include <sys/proc.h> 47 #include <sys/signalvar.h> 48 49 #include <sys/lock.h> 50 #include <sys/mutex.h> 51 #include <sys/rwlock.h> 52 #include <sys/queue.h> 53 #include <sys/taskqueue.h> 54 #include <sys/syslog.h> 55 #include <netinet/in.h> 56 57 #include <vm/vm.h> 58 #include <vm/pmap.h> 59 60 #include <linux/types.h> 61 #include <rdma/rdma_cm.h> 62 63 #include "getopt.h" 64 #include "krping.h" 65 66 #define PFX "krping: " 67 68 static int debug = 0; 69 #define DEBUG_LOG if (debug) printf 70 71 static const struct krping_option krping_opts[] = { 72 {"count", OPT_INT, 'C'}, 73 {"size", OPT_INT, 'S'}, 74 {"addr", OPT_STRING, 'a'}, 75 {"port", OPT_INT, 'p'}, 76 {"verbose", OPT_NOPARAM, 'v'}, 77 {"validate", OPT_NOPARAM, 'V'}, 78 {"server", OPT_NOPARAM, 's'}, 79 {"client", OPT_NOPARAM, 'c'}, 80 {"dmamr", OPT_NOPARAM, 'D'}, 81 {"debug", OPT_NOPARAM, 'd'}, 82 {"wlat", OPT_NOPARAM, 'l'}, 83 {"rlat", OPT_NOPARAM, 'L'}, 84 {"bw", OPT_NOPARAM, 'B'}, 85 {"tx-depth", OPT_INT, 't'}, 86 {"poll", OPT_NOPARAM, 'P'}, 87 {"memlimit", OPT_INT, 'm'}, 88 {NULL, 0, 0} 89 }; 90 91 struct mtx krping_mutex; 92 93 /* 94 * List of running krping threads. 95 */ 96 struct krping_cb_list krping_cbs; 97 98 /* 99 * krping "ping/pong" loop: 100 * client sends source rkey/addr/len 101 * server receives source rkey/add/len 102 * server rdma reads "ping" data from source 103 * server sends "go ahead" on rdma read completion 104 * client sends sink rkey/addr/len 105 * server receives sink rkey/addr/len 106 * server rdma writes "pong" data to sink 107 * server sends "go ahead" on rdma write completion 108 * <repeat loop> 109 */ 110 111 /* 112 * Default max buffer size for IO... 113 */ 114 #define RPING_BUFSIZE 128*1024 115 #define RPING_SQ_DEPTH 32 116 117 static void krping_wait(struct krping_cb *cb, int state) 118 { 119 int rc; 120 mtx_lock(&cb->lock); 121 while (cb->state < state) { 122 rc = msleep(cb, &cb->lock, PCATCH, "krping", 0); 123 if (rc && rc != ERESTART) { 124 cb->state = ERROR; 125 break; 126 } 127 } 128 mtx_unlock(&cb->lock); 129 } 130 131 static int krping_cma_event_handler(struct rdma_cm_id *cma_id, 132 struct rdma_cm_event *event) 133 { 134 int ret; 135 struct krping_cb *cb = cma_id->context; 136 137 DEBUG_LOG(PFX "cma_event type %d cma_id %p (%s)\n", event->event, cma_id, 138 (cma_id == cb->cm_id) ? "parent" : "child"); 139 140 mtx_lock(&cb->lock); 141 switch (event->event) { 142 case RDMA_CM_EVENT_ADDR_RESOLVED: 143 cb->state = ADDR_RESOLVED; 144 ret = rdma_resolve_route(cma_id, 2000); 145 if (ret) { 146 log(LOG_ERR, "rdma_resolve_route error %d\n", 147 ret); 148 wakeup(cb); 149 } 150 break; 151 152 case RDMA_CM_EVENT_ROUTE_RESOLVED: 153 cb->state = ROUTE_RESOLVED; 154 wakeup(cb); 155 break; 156 157 case RDMA_CM_EVENT_CONNECT_REQUEST: 158 cb->state = CONNECT_REQUEST; 159 cb->child_cm_id = cma_id; 160 DEBUG_LOG(PFX "child cma %p\n", cb->child_cm_id); 161 wakeup(cb); 162 break; 163 164 case RDMA_CM_EVENT_ESTABLISHED: 165 DEBUG_LOG(PFX "ESTABLISHED\n"); 166 if (!cb->server) { 167 cb->state = CONNECTED; 168 wakeup(cb); 169 } 170 break; 171 172 case RDMA_CM_EVENT_ADDR_ERROR: 173 case RDMA_CM_EVENT_ROUTE_ERROR: 174 case RDMA_CM_EVENT_CONNECT_ERROR: 175 case RDMA_CM_EVENT_UNREACHABLE: 176 case RDMA_CM_EVENT_REJECTED: 177 log(LOG_ERR, "cma event %d, error %d\n", event->event, 178 event->status); 179 cb->state = ERROR; 180 wakeup(cb); 181 break; 182 183 case RDMA_CM_EVENT_DISCONNECTED: 184 DEBUG_LOG(PFX "DISCONNECT EVENT...\n"); 185 cb->state = ERROR; 186 wakeup(cb); 187 break; 188 189 case RDMA_CM_EVENT_DEVICE_REMOVAL: 190 DEBUG_LOG(PFX "cma detected device removal!!!!\n"); 191 cb->state = ERROR; 192 wakeup(cb); 193 mtx_unlock(&cb->lock); 194 krping_wait(cb, CLEANUP); 195 tsleep(cb, 0, "krping", 5000); 196 return 0; 197 198 default: 199 log(LOG_ERR, "oof bad type!\n"); 200 wakeup(cb); 201 break; 202 } 203 mtx_unlock(&cb->lock); 204 return 0; 205 } 206 207 static int server_recv(struct krping_cb *cb, struct ib_wc *wc) 208 { 209 if (wc->byte_len != sizeof(cb->recv_buf)) { 210 log(LOG_ERR, "Received bogus data, size %d\n", 211 wc->byte_len); 212 return -1; 213 } 214 215 cb->remote_rkey = ntohl(cb->recv_buf.rkey); 216 cb->remote_addr = ntohll(cb->recv_buf.buf); 217 cb->remote_len = ntohl(cb->recv_buf.size); 218 DEBUG_LOG(PFX "Received rkey %x addr %llx len %d from peer\n", 219 cb->remote_rkey, (unsigned long long)cb->remote_addr, 220 cb->remote_len); 221 222 if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE) 223 cb->state = RDMA_READ_ADV; 224 else 225 cb->state = RDMA_WRITE_ADV; 226 227 return 0; 228 } 229 230 static int client_recv(struct krping_cb *cb, struct ib_wc *wc) 231 { 232 if (wc->byte_len != sizeof(cb->recv_buf)) { 233 log(LOG_ERR, "Received bogus data, size %d\n", 234 wc->byte_len); 235 return -1; 236 } 237 238 if (cb->state == RDMA_READ_ADV) 239 cb->state = RDMA_WRITE_ADV; 240 else 241 cb->state = RDMA_WRITE_COMPLETE; 242 243 return 0; 244 } 245 246 static void krping_cq_event_handler(struct ib_cq *cq, void *ctx) 247 { 248 struct krping_cb *cb = ctx; 249 struct ib_wc wc; 250 struct ib_recv_wr *bad_wr; 251 int ret; 252 253 mtx_lock(&cb->lock); 254 KASSERT(cb->cq == cq, ("bad condition")); 255 if (cb->state == ERROR) { 256 log(LOG_ERR, "cq completion in ERROR state\n"); 257 mtx_unlock(&cb->lock); 258 return; 259 } 260 if (!cb->wlat && !cb->rlat && !cb->bw) 261 ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); 262 while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) { 263 if (wc.status) { 264 if (wc.status == IB_WC_WR_FLUSH_ERR) { 265 DEBUG_LOG("cq flushed\n"); 266 continue; 267 } else { 268 log(LOG_CRIT, "cq completion failed status %d\n", 269 wc.status); 270 goto error; 271 } 272 } 273 274 switch (wc.opcode) { 275 case IB_WC_SEND: 276 DEBUG_LOG(PFX "send completion\n"); 277 cb->stats.send_bytes += cb->send_sgl.length; 278 cb->stats.send_msgs++; 279 break; 280 281 case IB_WC_RDMA_WRITE: 282 DEBUG_LOG(PFX "rdma write completion\n"); 283 cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length; 284 cb->stats.write_msgs++; 285 cb->state = RDMA_WRITE_COMPLETE; 286 wakeup(cb); 287 break; 288 289 case IB_WC_RDMA_READ: 290 DEBUG_LOG(PFX "rdma read completion\n"); 291 cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length; 292 cb->stats.read_msgs++; 293 cb->state = RDMA_READ_COMPLETE; 294 wakeup(cb); 295 break; 296 297 case IB_WC_RECV: 298 DEBUG_LOG(PFX "recv completion\n"); 299 cb->stats.recv_bytes += sizeof(cb->recv_buf); 300 cb->stats.recv_msgs++; 301 if (cb->wlat || cb->rlat || cb->bw) 302 ret = server_recv(cb, &wc); 303 else 304 ret = cb->server ? server_recv(cb, &wc) : 305 client_recv(cb, &wc); 306 if (ret) { 307 log(LOG_ERR, "recv wc error: %d\n", ret); 308 goto error; 309 } 310 311 ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); 312 if (ret) { 313 log(LOG_ERR, "post recv error: %d\n", 314 ret); 315 goto error; 316 } 317 wakeup(cb); 318 break; 319 320 default: 321 log(LOG_ERR, "unknown!!!!! completion\n"); 322 goto error; 323 } 324 } 325 if (ret) { 326 log(LOG_ERR, "poll error %d\n", ret); 327 goto error; 328 } 329 mtx_unlock(&cb->lock); 330 return; 331 error: 332 cb->state = ERROR; 333 wakeup(cb); 334 mtx_unlock(&cb->lock); 335 } 336 337 static int krping_accept(struct krping_cb *cb) 338 { 339 struct rdma_conn_param conn_param; 340 int ret; 341 342 DEBUG_LOG(PFX "accepting client connection request\n"); 343 344 memset(&conn_param, 0, sizeof conn_param); 345 conn_param.responder_resources = 1; 346 conn_param.initiator_depth = 1; 347 348 ret = rdma_accept(cb->child_cm_id, &conn_param); 349 if (ret) { 350 log(LOG_ERR, "rdma_accept error: %d\n", ret); 351 return ret; 352 } 353 354 if (!cb->wlat && !cb->rlat && !cb->bw) { 355 krping_wait(cb, CONNECTED); 356 if (cb->state == ERROR) { 357 log(LOG_ERR, "wait for CONNECTED state %d\n", cb->state); 358 return -1; 359 } 360 } 361 return 0; 362 } 363 364 static void krping_setup_wr(struct krping_cb *cb) 365 { 366 /* XXX X86 only here... not mapping for dma! */ 367 cb->recv_sgl.addr = vtophys(&cb->recv_buf); 368 cb->recv_sgl.length = sizeof cb->recv_buf; 369 if (cb->use_dmamr) 370 cb->recv_sgl.lkey = cb->dma_mr->lkey; 371 else 372 cb->recv_sgl.lkey = cb->recv_mr->lkey; 373 cb->rq_wr.sg_list = &cb->recv_sgl; 374 cb->rq_wr.num_sge = 1; 375 376 cb->send_sgl.addr = vtophys(&cb->send_buf); 377 cb->send_sgl.length = sizeof cb->send_buf; 378 if (cb->use_dmamr) 379 cb->send_sgl.lkey = cb->dma_mr->lkey; 380 else 381 cb->send_sgl.lkey = cb->send_mr->lkey; 382 383 cb->sq_wr.opcode = IB_WR_SEND; 384 cb->sq_wr.send_flags = IB_SEND_SIGNALED; 385 cb->sq_wr.sg_list = &cb->send_sgl; 386 cb->sq_wr.num_sge = 1; 387 388 cb->rdma_addr = vtophys(cb->rdma_buf); 389 cb->rdma_sgl.addr = cb->rdma_addr; 390 if (cb->use_dmamr) 391 cb->rdma_sgl.lkey = cb->dma_mr->lkey; 392 else 393 cb->rdma_sgl.lkey = cb->rdma_mr->lkey; 394 cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED; 395 cb->rdma_sq_wr.sg_list = &cb->rdma_sgl; 396 cb->rdma_sq_wr.num_sge = 1; 397 398 if (!cb->server || cb->wlat || cb->rlat || cb->bw) { 399 cb->start_addr = vtophys(cb->start_buf); 400 } 401 } 402 403 static int krping_setup_buffers(struct krping_cb *cb) 404 { 405 int ret; 406 struct ib_phys_buf buf; 407 u64 iovbase; 408 409 DEBUG_LOG(PFX "krping_setup_buffers called on cb %p\n", cb); 410 411 if (cb->use_dmamr) { 412 cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE| 413 IB_ACCESS_REMOTE_READ| 414 IB_ACCESS_REMOTE_WRITE); 415 if (IS_ERR(cb->dma_mr)) { 416 log(LOG_ERR, "reg_dmamr failed\n"); 417 return PTR_ERR(cb->dma_mr); 418 } 419 } else { 420 421 buf.addr = vtophys(&cb->recv_buf); 422 buf.size = sizeof cb->recv_buf; 423 iovbase = vtophys(&cb->recv_buf); 424 cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 425 IB_ACCESS_LOCAL_WRITE, 426 &iovbase); 427 428 if (IS_ERR(cb->recv_mr)) { 429 log(LOG_ERR, "recv_buf reg_mr failed\n"); 430 return PTR_ERR(cb->recv_mr); 431 } 432 433 buf.addr = vtophys(&cb->send_buf); 434 buf.size = sizeof cb->send_buf; 435 iovbase = vtophys(&cb->send_buf); 436 cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 437 0, &iovbase); 438 439 if (IS_ERR(cb->send_mr)) { 440 log(LOG_ERR, "send_buf reg_mr failed\n"); 441 ib_dereg_mr(cb->recv_mr); 442 return PTR_ERR(cb->send_mr); 443 } 444 } 445 446 /* RNIC adapters have a limit upto which it can register physical memory 447 * If DMA-MR memory mode is set then normally driver registers maximum 448 * supported memory. After that if contigmalloc allocates memory beyond the 449 * specified RNIC limit then Krping may not work. 450 */ 451 if (cb->use_dmamr && cb->memlimit) 452 cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, cb->memlimit, 453 PAGE_SIZE, 0); 454 else 455 cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, -1UL, 456 PAGE_SIZE, 0); 457 458 if (!cb->rdma_buf) { 459 log(LOG_ERR, "rdma_buf malloc failed\n"); 460 ret = ENOMEM; 461 goto err1; 462 } 463 if (!cb->use_dmamr) { 464 465 buf.addr = vtophys(cb->rdma_buf); 466 buf.size = cb->size; 467 iovbase = vtophys(cb->rdma_buf); 468 cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 469 IB_ACCESS_REMOTE_READ| 470 IB_ACCESS_REMOTE_WRITE, 471 &iovbase); 472 473 if (IS_ERR(cb->rdma_mr)) { 474 log(LOG_ERR, "rdma_buf reg_mr failed\n"); 475 ret = PTR_ERR(cb->rdma_mr); 476 goto err2; 477 } 478 } 479 480 if (!cb->server || cb->wlat || cb->rlat || cb->bw) { 481 if (cb->use_dmamr && cb->memlimit) 482 cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 483 0, cb->memlimit, PAGE_SIZE, 0); 484 else 485 cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 486 0, -1UL, PAGE_SIZE, 0); 487 if (!cb->start_buf) { 488 log(LOG_ERR, "start_buf malloc failed\n"); 489 ret = ENOMEM; 490 goto err2; 491 } 492 if (!cb->use_dmamr) { 493 unsigned flags = IB_ACCESS_REMOTE_READ; 494 495 if (cb->wlat || cb->rlat || cb->bw) 496 flags |= IB_ACCESS_REMOTE_WRITE; 497 buf.addr = vtophys(cb->start_buf); 498 buf.size = cb->size; 499 iovbase = vtophys(cb->start_buf); 500 cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 501 flags, 502 &iovbase); 503 504 if (IS_ERR(cb->start_mr)) { 505 log(LOG_ERR, "start_buf reg_mr failed\n"); 506 ret = PTR_ERR(cb->start_mr); 507 goto err3; 508 } 509 } 510 } 511 512 krping_setup_wr(cb); 513 DEBUG_LOG(PFX "allocated & registered buffers...\n"); 514 return 0; 515 err3: 516 contigfree(cb->start_buf, cb->size, M_DEVBUF); 517 518 if (!cb->use_dmamr) 519 ib_dereg_mr(cb->rdma_mr); 520 err2: 521 contigfree(cb->rdma_buf, cb->size, M_DEVBUF); 522 err1: 523 if (cb->use_dmamr) 524 ib_dereg_mr(cb->dma_mr); 525 else { 526 ib_dereg_mr(cb->recv_mr); 527 ib_dereg_mr(cb->send_mr); 528 } 529 return ret; 530 } 531 532 static void krping_free_buffers(struct krping_cb *cb) 533 { 534 DEBUG_LOG(PFX "krping_free_buffers called on cb %p\n", cb); 535 536 #if 0 537 dma_unmap_single(cb->pd->device->dma_device, 538 pci_unmap_addr(cb, recv_mapping), 539 sizeof(cb->recv_buf), DMA_BIDIRECTIONAL); 540 dma_unmap_single(cb->pd->device->dma_device, 541 pci_unmap_addr(cb, send_mapping), 542 sizeof(cb->send_buf), DMA_BIDIRECTIONAL); 543 dma_unmap_single(cb->pd->device->dma_device, 544 pci_unmap_addr(cb, rdma_mapping), 545 cb->size, DMA_BIDIRECTIONAL); 546 #endif 547 contigfree(cb->rdma_buf, cb->size, M_DEVBUF); 548 if (!cb->server || cb->wlat || cb->rlat || cb->bw) { 549 #if 0 550 dma_unmap_single(cb->pd->device->dma_device, 551 pci_unmap_addr(cb, start_mapping), 552 cb->size, DMA_BIDIRECTIONAL); 553 #endif 554 contigfree(cb->start_buf, cb->size, M_DEVBUF); 555 } 556 if (cb->use_dmamr) 557 ib_dereg_mr(cb->dma_mr); 558 else { 559 ib_dereg_mr(cb->send_mr); 560 ib_dereg_mr(cb->recv_mr); 561 ib_dereg_mr(cb->rdma_mr); 562 if (!cb->server) 563 ib_dereg_mr(cb->start_mr); 564 } 565 } 566 567 static int krping_create_qp(struct krping_cb *cb) 568 { 569 struct ib_qp_init_attr init_attr; 570 int ret; 571 572 memset(&init_attr, 0, sizeof(init_attr)); 573 init_attr.cap.max_send_wr = cb->txdepth; 574 init_attr.cap.max_recv_wr = 2; 575 init_attr.cap.max_recv_sge = 1; 576 init_attr.cap.max_send_sge = 1; 577 init_attr.qp_type = IB_QPT_RC; 578 init_attr.send_cq = cb->cq; 579 init_attr.recv_cq = cb->cq; 580 581 if (cb->server) { 582 ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr); 583 if (!ret) 584 cb->qp = cb->child_cm_id->qp; 585 } else { 586 ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr); 587 if (!ret) 588 cb->qp = cb->cm_id->qp; 589 } 590 591 return ret; 592 } 593 594 static void krping_free_qp(struct krping_cb *cb) 595 { 596 ib_destroy_qp(cb->qp); 597 ib_destroy_cq(cb->cq); 598 ib_dealloc_pd(cb->pd); 599 } 600 601 static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id) 602 { 603 int ret; 604 cb->pd = ib_alloc_pd(cm_id->device); 605 if (IS_ERR(cb->pd)) { 606 log(LOG_ERR, "ib_alloc_pd failed\n"); 607 return PTR_ERR(cb->pd); 608 } 609 DEBUG_LOG(PFX "created pd %p\n", cb->pd); 610 611 strlcpy(cb->name, cb->pd->device->name, sizeof(cb->name)); 612 613 cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL, 614 cb, cb->txdepth * 2, 0); 615 if (IS_ERR(cb->cq)) { 616 log(LOG_ERR, "ib_create_cq failed\n"); 617 ret = PTR_ERR(cb->cq); 618 goto err1; 619 } 620 DEBUG_LOG(PFX "created cq %p\n", cb->cq); 621 622 if (!cb->wlat && !cb->rlat && !cb->bw) { 623 ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); 624 if (ret) { 625 log(LOG_ERR, "ib_create_cq failed\n"); 626 goto err2; 627 } 628 } 629 630 ret = krping_create_qp(cb); 631 if (ret) { 632 log(LOG_ERR, "krping_create_qp failed: %d\n", ret); 633 goto err2; 634 } 635 DEBUG_LOG(PFX "created qp %p\n", cb->qp); 636 return 0; 637 err2: 638 ib_destroy_cq(cb->cq); 639 err1: 640 ib_dealloc_pd(cb->pd); 641 return ret; 642 } 643 644 static void krping_format_send(struct krping_cb *cb, u64 buf, 645 struct ib_mr *mr) 646 { 647 struct krping_rdma_info *info = &cb->send_buf; 648 649 info->buf = htonll(buf); 650 info->rkey = htonl(mr->rkey); 651 info->size = htonl(cb->size); 652 653 DEBUG_LOG(PFX "RDMA addr %llx rkey %x len %d\n", 654 (unsigned long long)buf, mr->rkey, cb->size); 655 } 656 657 static void krping_test_server(struct krping_cb *cb) 658 { 659 struct ib_send_wr *bad_wr; 660 int ret; 661 662 while (1) { 663 /* Wait for client's Start STAG/TO/Len */ 664 krping_wait(cb, RDMA_READ_ADV); 665 if (cb->state != RDMA_READ_ADV) { 666 DEBUG_LOG(PFX "wait for RDMA_READ_ADV state %d\n", 667 cb->state); 668 break; 669 } 670 671 DEBUG_LOG(PFX "server received sink adv\n"); 672 673 /* Issue RDMA Read. */ 674 cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ; 675 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; 676 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; 677 cb->rdma_sq_wr.sg_list->length = cb->remote_len; 678 679 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); 680 if (ret) { 681 log(LOG_ERR, "post send error %d\n", ret); 682 break; 683 } 684 DEBUG_LOG(PFX "server posted rdma read req \n"); 685 686 /* Wait for read completion */ 687 krping_wait(cb, RDMA_READ_COMPLETE); 688 if (cb->state != RDMA_READ_COMPLETE) { 689 log(LOG_ERR, 690 "wait for RDMA_READ_COMPLETE state %d\n", 691 cb->state); 692 break; 693 } 694 DEBUG_LOG(PFX "server received read complete\n"); 695 696 /* Display data in recv buf */ 697 if (cb->verbose) 698 DEBUG_LOG("server ping data: %s\n", cb->rdma_buf); 699 700 /* Tell client to continue */ 701 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 702 if (ret) { 703 log(LOG_ERR, "post send error %d\n", ret); 704 break; 705 } 706 DEBUG_LOG(PFX "server posted go ahead\n"); 707 708 /* Wait for client's RDMA STAG/TO/Len */ 709 krping_wait(cb, RDMA_WRITE_ADV); 710 if (cb->state != RDMA_WRITE_ADV) { 711 log(LOG_ERR, 712 "wait for RDMA_WRITE_ADV state %d\n", 713 cb->state); 714 break; 715 } 716 DEBUG_LOG(PFX "server received sink adv\n"); 717 718 /* RDMA Write echo data */ 719 cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; 720 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; 721 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; 722 cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1; 723 DEBUG_LOG(PFX "rdma write from lkey %x laddr %llx len %d\n", 724 cb->rdma_sq_wr.sg_list->lkey, 725 (unsigned long long)cb->rdma_sq_wr.sg_list->addr, 726 cb->rdma_sq_wr.sg_list->length); 727 728 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); 729 if (ret) { 730 log(LOG_ERR, "post send error %d\n", ret); 731 break; 732 } 733 734 /* Wait for completion */ 735 krping_wait(cb, RDMA_WRITE_COMPLETE); 736 if (cb->state != RDMA_WRITE_COMPLETE) { 737 log(LOG_ERR, 738 "wait for RDMA_WRITE_COMPLETE state %d\n", 739 cb->state); 740 break; 741 } 742 DEBUG_LOG(PFX "server rdma write complete \n"); 743 744 cb->state = CONNECTED; 745 746 /* Tell client to begin again */ 747 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 748 if (ret) { 749 log(LOG_ERR, "post send error %d\n", ret); 750 break; 751 } 752 DEBUG_LOG(PFX "server posted go ahead\n"); 753 } 754 } 755 756 static void rlat_test(struct krping_cb *cb) 757 { 758 int scnt; 759 int iters = cb->count; 760 struct timeval start_tv, stop_tv; 761 int ret; 762 struct ib_wc wc; 763 struct ib_send_wr *bad_wr; 764 int ne; 765 766 scnt = 0; 767 cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ; 768 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; 769 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; 770 cb->rdma_sq_wr.sg_list->length = cb->size; 771 772 microtime(&start_tv); 773 if (!cb->poll) { 774 cb->state = RDMA_READ_ADV; 775 ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); 776 } 777 while (scnt < iters) { 778 779 cb->state = RDMA_READ_ADV; 780 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); 781 if (ret) { 782 log(LOG_ERR, 783 "Couldn't post send: ret=%d scnt %d\n", 784 ret, scnt); 785 return; 786 } 787 788 do { 789 if (!cb->poll) { 790 krping_wait(cb, RDMA_READ_COMPLETE); 791 if (cb->state == RDMA_READ_COMPLETE) { 792 ne = 1; 793 ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); 794 } else { 795 ne = -1; 796 } 797 } else 798 ne = ib_poll_cq(cb->cq, 1, &wc); 799 if (cb->state == ERROR) { 800 log(LOG_ERR, 801 "state == ERROR...bailing scnt %d\n", scnt); 802 return; 803 } 804 } while (ne == 0); 805 806 if (ne < 0) { 807 log(LOG_ERR, "poll CQ failed %d\n", ne); 808 return; 809 } 810 if (cb->poll && wc.status != IB_WC_SUCCESS) { 811 log(LOG_ERR, "Completion wth error at %s:\n", 812 cb->server ? "server" : "client"); 813 log(LOG_ERR, "Failed status %d: wr_id %d\n", 814 wc.status, (int) wc.wr_id); 815 return; 816 } 817 ++scnt; 818 } 819 microtime(&stop_tv); 820 821 if (stop_tv.tv_usec < start_tv.tv_usec) { 822 stop_tv.tv_usec += 1000000; 823 stop_tv.tv_sec -= 1; 824 } 825 826 log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d\n", 827 stop_tv.tv_sec - start_tv.tv_sec, 828 stop_tv.tv_usec - start_tv.tv_usec, 829 scnt, cb->size); 830 } 831 832 static int alloc_cycle_mem(int cycle_iters, 833 cycles_t **post_cycles_start, 834 cycles_t **post_cycles_stop, 835 cycles_t **poll_cycles_start, 836 cycles_t **poll_cycles_stop, 837 cycles_t **last_poll_cycles_start) 838 { 839 *post_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK); 840 if (!*post_cycles_start) { 841 goto fail1; 842 } 843 *post_cycles_stop = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK); 844 if (!*post_cycles_stop) { 845 goto fail2; 846 } 847 *poll_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK); 848 if (!*poll_cycles_start) { 849 goto fail3; 850 } 851 *poll_cycles_stop = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK); 852 if (!*poll_cycles_stop) { 853 goto fail4; 854 } 855 *last_poll_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK); 856 if (!*last_poll_cycles_start) { 857 goto fail5; 858 } 859 return 0; 860 fail5: 861 free(*poll_cycles_stop, M_DEVBUF); 862 fail4: 863 free(*poll_cycles_start, M_DEVBUF); 864 fail3: 865 free(*post_cycles_stop, M_DEVBUF); 866 fail2: 867 free(*post_cycles_start, M_DEVBUF); 868 fail1: 869 log(LOG_ERR, "%s malloc failed\n", __FUNCTION__); 870 return ENOMEM; 871 } 872 873 static void free_cycle_mem(cycles_t *post_cycles_start, 874 cycles_t *post_cycles_stop, 875 cycles_t *poll_cycles_start, 876 cycles_t *poll_cycles_stop, 877 cycles_t *last_poll_cycles_start) 878 { 879 free(last_poll_cycles_start, M_DEVBUF); 880 free(poll_cycles_stop, M_DEVBUF); 881 free(poll_cycles_start, M_DEVBUF); 882 free(post_cycles_stop, M_DEVBUF); 883 free(post_cycles_start, M_DEVBUF); 884 } 885 886 static void wlat_test(struct krping_cb *cb) 887 { 888 int ccnt, scnt, rcnt; 889 int iters=cb->count; 890 volatile char *poll_buf = (char *) cb->start_buf; 891 char *buf = (char *)cb->rdma_buf; 892 ccnt = 0; 893 scnt = 0; 894 rcnt = 0; 895 struct timeval start_tv, stop_tv; 896 cycles_t *post_cycles_start, *post_cycles_stop; 897 cycles_t *poll_cycles_start, *poll_cycles_stop; 898 cycles_t *last_poll_cycles_start; 899 cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0; 900 int i; 901 int cycle_iters = 1000; 902 int err; 903 904 err = alloc_cycle_mem(cycle_iters, &post_cycles_start, &post_cycles_stop, 905 &poll_cycles_start, &poll_cycles_stop, &last_poll_cycles_start); 906 907 if (err) { 908 log(LOG_ERR, "%s malloc failed\n", __FUNCTION__); 909 return; 910 } 911 912 cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; 913 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; 914 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; 915 cb->rdma_sq_wr.sg_list->length = cb->size; 916 917 if (cycle_iters > iters) 918 cycle_iters = iters; 919 microtime(&start_tv); 920 while (scnt < iters || ccnt < iters || rcnt < iters) { 921 922 /* Wait till buffer changes. */ 923 if (rcnt < iters && !(scnt < 1 && !cb->server)) { 924 ++rcnt; 925 while (*poll_buf != (char)rcnt) { 926 if (cb->state == ERROR) { 927 log(LOG_ERR, "state = ERROR, bailing\n"); 928 return; 929 } 930 } 931 } 932 933 if (scnt < iters) { 934 struct ib_send_wr *bad_wr; 935 936 *buf = (char)scnt+1; 937 if (scnt < cycle_iters) 938 post_cycles_start[scnt] = get_cycles(); 939 if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { 940 log(LOG_ERR, "Couldn't post send: scnt=%d\n", 941 scnt); 942 return; 943 } 944 if (scnt < cycle_iters) 945 post_cycles_stop[scnt] = get_cycles(); 946 scnt++; 947 } 948 949 if (ccnt < iters) { 950 struct ib_wc wc; 951 int ne; 952 953 if (ccnt < cycle_iters) 954 poll_cycles_start[ccnt] = get_cycles(); 955 do { 956 if (ccnt < cycle_iters) 957 last_poll_cycles_start[ccnt] = get_cycles(); 958 ne = ib_poll_cq(cb->cq, 1, &wc); 959 } while (ne == 0); 960 if (ccnt < cycle_iters) 961 poll_cycles_stop[ccnt] = get_cycles(); 962 ++ccnt; 963 964 if (ne < 0) { 965 log(LOG_ERR, "poll CQ failed %d\n", ne); 966 return; 967 } 968 if (wc.status != IB_WC_SUCCESS) { 969 log(LOG_ERR, "Completion wth error at %s:\n", 970 cb->server ? "server" : "client"); 971 log(LOG_ERR, "Failed status %d: wr_id %d\n", 972 wc.status, (int) wc.wr_id); 973 log(LOG_ERR, "scnt=%d, rcnt=%d, ccnt=%d\n", 974 scnt, rcnt, ccnt); 975 return; 976 } 977 } 978 } 979 microtime(&stop_tv); 980 981 if (stop_tv.tv_usec < start_tv.tv_usec) { 982 stop_tv.tv_usec += 1000000; 983 stop_tv.tv_sec -= 1; 984 } 985 986 for (i=0; i < cycle_iters; i++) { 987 sum_post += post_cycles_stop[i] - post_cycles_start[i]; 988 sum_poll += poll_cycles_stop[i] - poll_cycles_start[i]; 989 sum_last_poll += poll_cycles_stop[i] - last_poll_cycles_start[i]; 990 } 991 992 log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d cycle_iters %d sum_post %llu sum_poll %llu sum_last_poll %llu\n", 993 stop_tv.tv_sec - start_tv.tv_sec, 994 stop_tv.tv_usec - start_tv.tv_usec, 995 scnt, cb->size, cycle_iters, 996 (unsigned long long)sum_post, (unsigned long long)sum_poll, 997 (unsigned long long)sum_last_poll); 998 999 free_cycle_mem(post_cycles_start, post_cycles_stop, poll_cycles_start, 1000 poll_cycles_stop, last_poll_cycles_start); 1001 } 1002 1003 static void bw_test(struct krping_cb *cb) 1004 { 1005 int ccnt, scnt, rcnt; 1006 int iters=cb->count; 1007 ccnt = 0; 1008 scnt = 0; 1009 rcnt = 0; 1010 struct timeval start_tv, stop_tv; 1011 cycles_t *post_cycles_start, *post_cycles_stop; 1012 cycles_t *poll_cycles_start, *poll_cycles_stop; 1013 cycles_t *last_poll_cycles_start; 1014 cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0; 1015 int i; 1016 int cycle_iters = 1000; 1017 int err; 1018 1019 err = alloc_cycle_mem(cycle_iters, &post_cycles_start, &post_cycles_stop, 1020 &poll_cycles_start, &poll_cycles_stop, &last_poll_cycles_start); 1021 1022 if (err) { 1023 log(LOG_ERR, "%s kmalloc failed\n", __FUNCTION__); 1024 return; 1025 } 1026 1027 cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; 1028 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; 1029 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; 1030 cb->rdma_sq_wr.sg_list->length = cb->size; 1031 1032 if (cycle_iters > iters) 1033 cycle_iters = iters; 1034 microtime(&start_tv); 1035 while (scnt < iters || ccnt < iters) { 1036 1037 while (scnt < iters && scnt - ccnt < cb->txdepth) { 1038 struct ib_send_wr *bad_wr; 1039 1040 if (scnt < cycle_iters) 1041 post_cycles_start[scnt] = get_cycles(); 1042 if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { 1043 log(LOG_ERR, "Couldn't post send: scnt=%d\n", 1044 scnt); 1045 return; 1046 } 1047 if (scnt < cycle_iters) 1048 post_cycles_stop[scnt] = get_cycles(); 1049 ++scnt; 1050 } 1051 1052 if (ccnt < iters) { 1053 int ne; 1054 struct ib_wc wc; 1055 1056 if (ccnt < cycle_iters) 1057 poll_cycles_start[ccnt] = get_cycles(); 1058 do { 1059 if (ccnt < cycle_iters) 1060 last_poll_cycles_start[ccnt] = get_cycles(); 1061 ne = ib_poll_cq(cb->cq, 1, &wc); 1062 } while (ne == 0); 1063 if (ccnt < cycle_iters) 1064 poll_cycles_stop[ccnt] = get_cycles(); 1065 ccnt += 1; 1066 1067 if (ne < 0) { 1068 log(LOG_ERR, "poll CQ failed %d\n", ne); 1069 return; 1070 } 1071 if (wc.status != IB_WC_SUCCESS) { 1072 log(LOG_ERR, "Completion wth error at %s:\n", 1073 cb->server ? "server" : "client"); 1074 log(LOG_ERR, "Failed status %d: wr_id %d\n", 1075 wc.status, (int) wc.wr_id); 1076 return; 1077 } 1078 } 1079 } 1080 microtime(&stop_tv); 1081 1082 if (stop_tv.tv_usec < start_tv.tv_usec) { 1083 stop_tv.tv_usec += 1000000; 1084 stop_tv.tv_sec -= 1; 1085 } 1086 1087 for (i=0; i < cycle_iters; i++) { 1088 sum_post += post_cycles_stop[i] - post_cycles_start[i]; 1089 sum_poll += poll_cycles_stop[i] - poll_cycles_start[i]; 1090 sum_last_poll += poll_cycles_stop[i] - last_poll_cycles_start[i]; 1091 } 1092 1093 log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d cycle_iters %d sum_post %llu sum_poll %llu sum_last_poll %llu\n", 1094 stop_tv.tv_sec - start_tv.tv_sec, 1095 stop_tv.tv_usec - start_tv.tv_usec, 1096 scnt, cb->size, cycle_iters, 1097 (unsigned long long)sum_post, (unsigned long long)sum_poll, 1098 (unsigned long long)sum_last_poll); 1099 1100 free_cycle_mem(post_cycles_start, post_cycles_stop, poll_cycles_start, 1101 poll_cycles_stop, last_poll_cycles_start); 1102 } 1103 1104 static void krping_rlat_test_server(struct krping_cb *cb) 1105 { 1106 struct ib_send_wr *bad_wr; 1107 struct ib_wc wc; 1108 int ret; 1109 1110 /* Spin waiting for client's Start STAG/TO/Len */ 1111 while (cb->state < RDMA_READ_ADV) { 1112 krping_cq_event_handler(cb->cq, cb); 1113 } 1114 1115 /* Send STAG/TO/Len to client */ 1116 if (cb->dma_mr) 1117 krping_format_send(cb, cb->start_addr, cb->dma_mr); 1118 else 1119 krping_format_send(cb, cb->start_addr, cb->start_mr); 1120 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 1121 if (ret) { 1122 log(LOG_ERR, "post send error %d\n", ret); 1123 return; 1124 } 1125 1126 /* Spin waiting for send completion */ 1127 while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); 1128 if (ret < 0) { 1129 log(LOG_ERR, "poll error %d\n", ret); 1130 return; 1131 } 1132 if (wc.status) { 1133 log(LOG_ERR, "send completiong error %d\n", wc.status); 1134 return; 1135 } 1136 1137 krping_wait(cb, ERROR); 1138 } 1139 1140 static void krping_wlat_test_server(struct krping_cb *cb) 1141 { 1142 struct ib_send_wr *bad_wr; 1143 struct ib_wc wc; 1144 int ret; 1145 1146 /* Spin waiting for client's Start STAG/TO/Len */ 1147 while (cb->state < RDMA_READ_ADV) { 1148 krping_cq_event_handler(cb->cq, cb); 1149 } 1150 1151 /* Send STAG/TO/Len to client */ 1152 if (cb->dma_mr) 1153 krping_format_send(cb, cb->start_addr, cb->dma_mr); 1154 else 1155 krping_format_send(cb, cb->start_addr, cb->start_mr); 1156 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 1157 if (ret) { 1158 log(LOG_ERR, "post send error %d\n", ret); 1159 return; 1160 } 1161 1162 /* Spin waiting for send completion */ 1163 while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); 1164 if (ret < 0) { 1165 log(LOG_ERR, "poll error %d\n", ret); 1166 return; 1167 } 1168 if (wc.status) { 1169 log(LOG_ERR, "send completiong error %d\n", wc.status); 1170 return; 1171 } 1172 1173 wlat_test(cb); 1174 krping_wait(cb, ERROR); 1175 } 1176 1177 static void krping_bw_test_server(struct krping_cb *cb) 1178 { 1179 struct ib_send_wr *bad_wr; 1180 struct ib_wc wc; 1181 int ret; 1182 1183 /* Spin waiting for client's Start STAG/TO/Len */ 1184 while (cb->state < RDMA_READ_ADV) { 1185 krping_cq_event_handler(cb->cq, cb); 1186 } 1187 1188 /* Send STAG/TO/Len to client */ 1189 if (cb->dma_mr) 1190 krping_format_send(cb, cb->start_addr, cb->dma_mr); 1191 else 1192 krping_format_send(cb, cb->start_addr, cb->start_mr); 1193 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 1194 if (ret) { 1195 log(LOG_ERR, "post send error %d\n", ret); 1196 return; 1197 } 1198 1199 /* Spin waiting for send completion */ 1200 while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); 1201 if (ret < 0) { 1202 log(LOG_ERR, "poll error %d\n", ret); 1203 return; 1204 } 1205 if (wc.status) { 1206 log(LOG_ERR, "send completiong error %d\n", wc.status); 1207 return; 1208 } 1209 1210 if (cb->duplex) 1211 bw_test(cb); 1212 krping_wait(cb, ERROR); 1213 } 1214 1215 static int krping_bind_server(struct krping_cb *cb) 1216 { 1217 struct sockaddr_in sin; 1218 int ret; 1219 1220 memset(&sin, 0, sizeof(sin)); 1221 sin.sin_len = sizeof sin; 1222 sin.sin_family = AF_INET; 1223 sin.sin_addr.s_addr = cb->addr.s_addr; 1224 sin.sin_port = cb->port; 1225 1226 ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin); 1227 if (ret) { 1228 log(LOG_ERR, "rdma_bind_addr error %d\n", ret); 1229 return ret; 1230 } 1231 DEBUG_LOG(PFX "rdma_bind_addr successful\n"); 1232 1233 DEBUG_LOG(PFX "rdma_listen\n"); 1234 ret = rdma_listen(cb->cm_id, 3); 1235 if (ret) { 1236 log(LOG_ERR, "rdma_listen failed: %d\n", ret); 1237 return ret; 1238 } 1239 1240 krping_wait(cb, CONNECT_REQUEST); 1241 if (cb->state != CONNECT_REQUEST) { 1242 log(LOG_ERR, "wait for CONNECT_REQUEST state %d\n", 1243 cb->state); 1244 return -1; 1245 } 1246 1247 return 0; 1248 } 1249 1250 static void krping_run_server(struct krping_cb *cb) 1251 { 1252 struct ib_recv_wr *bad_wr; 1253 int ret; 1254 1255 ret = krping_bind_server(cb); 1256 if (ret) 1257 return; 1258 1259 ret = krping_setup_qp(cb, cb->child_cm_id); 1260 if (ret) { 1261 log(LOG_ERR, "setup_qp failed: %d\n", ret); 1262 return; 1263 } 1264 1265 ret = krping_setup_buffers(cb); 1266 if (ret) { 1267 log(LOG_ERR, "krping_setup_buffers failed: %d\n", ret); 1268 goto err1; 1269 } 1270 1271 ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); 1272 if (ret) { 1273 log(LOG_ERR, "ib_post_recv failed: %d\n", ret); 1274 goto err2; 1275 } 1276 1277 ret = krping_accept(cb); 1278 if (ret) { 1279 log(LOG_ERR, "connect error %d\n", ret); 1280 goto err2; 1281 } 1282 1283 if (cb->wlat) 1284 krping_wlat_test_server(cb); 1285 else if (cb->rlat) 1286 krping_rlat_test_server(cb); 1287 else if (cb->bw) 1288 krping_bw_test_server(cb); 1289 else 1290 krping_test_server(cb); 1291 1292 rdma_disconnect(cb->child_cm_id); 1293 rdma_destroy_id(cb->child_cm_id); 1294 err2: 1295 krping_free_buffers(cb); 1296 err1: 1297 krping_free_qp(cb); 1298 } 1299 1300 static void krping_test_client(struct krping_cb *cb) 1301 { 1302 int ping, start, cc, i, ret; 1303 struct ib_send_wr *bad_wr; 1304 unsigned char c; 1305 1306 start = 65; 1307 for (ping = 0; !cb->count || ping < cb->count; ping++) { 1308 cb->state = RDMA_READ_ADV; 1309 1310 /* Put some ascii text in the buffer. */ 1311 cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping); 1312 for (i = cc, c = start; i < cb->size; i++) { 1313 cb->start_buf[i] = c; 1314 c++; 1315 if (c > 122) 1316 c = 65; 1317 } 1318 start++; 1319 if (start > 122) 1320 start = 65; 1321 cb->start_buf[cb->size - 1] = 0; 1322 1323 if (cb->dma_mr) 1324 krping_format_send(cb, cb->start_addr, cb->dma_mr); 1325 else 1326 krping_format_send(cb, cb->start_addr, cb->start_mr); 1327 1328 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 1329 if (ret) { 1330 log(LOG_ERR, "post send error %d\n", ret); 1331 break; 1332 } 1333 1334 /* Wait for server to ACK */ 1335 krping_wait(cb, RDMA_WRITE_ADV); 1336 if (cb->state != RDMA_WRITE_ADV) { 1337 log(LOG_ERR, 1338 "wait for RDMA_WRITE_ADV state %d\n", 1339 cb->state); 1340 break; 1341 } 1342 1343 if (cb->dma_mr) 1344 krping_format_send(cb, cb->rdma_addr, cb->dma_mr); 1345 else 1346 krping_format_send(cb, cb->rdma_addr, cb->rdma_mr); 1347 1348 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 1349 if (ret) { 1350 log(LOG_ERR, "post send error %d\n", ret); 1351 break; 1352 } 1353 1354 /* Wait for the server to say the RDMA Write is complete. */ 1355 krping_wait(cb, RDMA_WRITE_COMPLETE); 1356 if (cb->state != RDMA_WRITE_COMPLETE) { 1357 log(LOG_ERR, 1358 "wait for RDMA_WRITE_COMPLETE state %d\n", 1359 cb->state); 1360 break; 1361 } 1362 1363 if (cb->validate) 1364 if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) { 1365 log(LOG_ERR, "data mismatch!\n"); 1366 break; 1367 } 1368 1369 if (cb->verbose) 1370 DEBUG_LOG("ping data: %s\n", cb->rdma_buf); 1371 } 1372 } 1373 1374 static void krping_rlat_test_client(struct krping_cb *cb) 1375 { 1376 struct ib_send_wr *bad_wr; 1377 struct ib_wc wc; 1378 int ret; 1379 1380 cb->state = RDMA_READ_ADV; 1381 1382 /* Send STAG/TO/Len to client */ 1383 if (cb->dma_mr) 1384 krping_format_send(cb, cb->start_addr, cb->dma_mr); 1385 else 1386 krping_format_send(cb, cb->start_addr, cb->rdma_mr); 1387 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 1388 if (ret) { 1389 log(LOG_ERR, "post send error %d\n", ret); 1390 return; 1391 } 1392 1393 /* Spin waiting for send completion */ 1394 while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); 1395 if (ret < 0) { 1396 log(LOG_ERR, "poll error %d\n", ret); 1397 return; 1398 } 1399 if (wc.status) { 1400 log(LOG_ERR, "send completion error %d\n", wc.status); 1401 return; 1402 } 1403 1404 /* Spin waiting for server's Start STAG/TO/Len */ 1405 while (cb->state < RDMA_WRITE_ADV) { 1406 krping_cq_event_handler(cb->cq, cb); 1407 } 1408 1409 #if 0 1410 { 1411 int i; 1412 struct timeval start, stop; 1413 time_t sec; 1414 suseconds_t usec; 1415 unsigned long long elapsed; 1416 struct ib_wc wc; 1417 struct ib_send_wr *bad_wr; 1418 int ne; 1419 1420 cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; 1421 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; 1422 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; 1423 cb->rdma_sq_wr.sg_list->length = 0; 1424 cb->rdma_sq_wr.num_sge = 0; 1425 1426 microtime(&start); 1427 for (i=0; i < 100000; i++) { 1428 if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { 1429 log(LOG_ERR, "Couldn't post send\n"); 1430 return; 1431 } 1432 do { 1433 ne = ib_poll_cq(cb->cq, 1, &wc); 1434 } while (ne == 0); 1435 if (ne < 0) { 1436 log(LOG_ERR, "poll CQ failed %d\n", ne); 1437 return; 1438 } 1439 if (wc.status != IB_WC_SUCCESS) { 1440 log(LOG_ERR, "Completion wth error at %s:\n", 1441 cb->server ? "server" : "client"); 1442 log(LOG_ERR, "Failed status %d: wr_id %d\n", 1443 wc.status, (int) wc.wr_id); 1444 return; 1445 } 1446 } 1447 microtime(&stop); 1448 1449 if (stop.tv_usec < start.tv_usec) { 1450 stop.tv_usec += 1000000; 1451 stop.tv_sec -= 1; 1452 } 1453 sec = stop.tv_sec - start.tv_sec; 1454 usec = stop.tv_usec - start.tv_usec; 1455 elapsed = sec * 1000000 + usec; 1456 log(LOG_ERR, "0B-write-lat iters 100000 usec %llu\n", elapsed); 1457 } 1458 #endif 1459 1460 rlat_test(cb); 1461 } 1462 1463 static void krping_wlat_test_client(struct krping_cb *cb) 1464 { 1465 struct ib_send_wr *bad_wr; 1466 struct ib_wc wc; 1467 int ret; 1468 1469 cb->state = RDMA_READ_ADV; 1470 1471 /* Send STAG/TO/Len to client */ 1472 if (cb->dma_mr) 1473 krping_format_send(cb, cb->start_addr, cb->dma_mr); 1474 else 1475 krping_format_send(cb, cb->start_addr, cb->start_mr); 1476 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 1477 if (ret) { 1478 log(LOG_ERR, "post send error %d\n", ret); 1479 return; 1480 } 1481 1482 /* Spin waiting for send completion */ 1483 while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); 1484 if (ret < 0) { 1485 log(LOG_ERR, "poll error %d\n", ret); 1486 return; 1487 } 1488 if (wc.status) { 1489 log(LOG_ERR, "send completion error %d\n", wc.status); 1490 return; 1491 } 1492 1493 /* Spin waiting for server's Start STAG/TO/Len */ 1494 while (cb->state < RDMA_WRITE_ADV) { 1495 krping_cq_event_handler(cb->cq, cb); 1496 } 1497 1498 wlat_test(cb); 1499 } 1500 1501 static void krping_bw_test_client(struct krping_cb *cb) 1502 { 1503 struct ib_send_wr *bad_wr; 1504 struct ib_wc wc; 1505 int ret; 1506 1507 cb->state = RDMA_READ_ADV; 1508 1509 /* Send STAG/TO/Len to client */ 1510 if (cb->dma_mr) 1511 krping_format_send(cb, cb->start_addr, cb->dma_mr); 1512 else 1513 krping_format_send(cb, cb->start_addr, cb->start_mr); 1514 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 1515 if (ret) { 1516 log(LOG_ERR, "post send error %d\n", ret); 1517 return; 1518 } 1519 1520 /* Spin waiting for send completion */ 1521 while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); 1522 if (ret < 0) { 1523 log(LOG_ERR, "poll error %d\n", ret); 1524 return; 1525 } 1526 if (wc.status) { 1527 log(LOG_ERR, "send completion error %d\n", wc.status); 1528 return; 1529 } 1530 1531 /* Spin waiting for server's Start STAG/TO/Len */ 1532 while (cb->state < RDMA_WRITE_ADV) { 1533 krping_cq_event_handler(cb->cq, cb); 1534 } 1535 1536 bw_test(cb); 1537 } 1538 1539 static int krping_connect_client(struct krping_cb *cb) 1540 { 1541 struct rdma_conn_param conn_param; 1542 int ret; 1543 1544 memset(&conn_param, 0, sizeof conn_param); 1545 conn_param.responder_resources = 1; 1546 conn_param.initiator_depth = 1; 1547 conn_param.retry_count = 10; 1548 1549 ret = rdma_connect(cb->cm_id, &conn_param); 1550 if (ret) { 1551 log(LOG_ERR, "rdma_connect error %d\n", ret); 1552 return ret; 1553 } 1554 1555 krping_wait(cb, CONNECTED); 1556 if (cb->state == ERROR) { 1557 log(LOG_ERR, "wait for CONNECTED state %d\n", cb->state); 1558 return -1; 1559 } 1560 1561 DEBUG_LOG(PFX "rdma_connect successful\n"); 1562 return 0; 1563 } 1564 1565 static int krping_bind_client(struct krping_cb *cb) 1566 { 1567 struct sockaddr_in sin; 1568 int ret; 1569 1570 memset(&sin, 0, sizeof(sin)); 1571 sin.sin_len = sizeof sin; 1572 sin.sin_family = AF_INET; 1573 sin.sin_addr.s_addr = cb->addr.s_addr; 1574 sin.sin_port = cb->port; 1575 1576 ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin, 1577 2000); 1578 if (ret) { 1579 log(LOG_ERR, "rdma_resolve_addr error %d\n", ret); 1580 return ret; 1581 } 1582 1583 krping_wait(cb, ROUTE_RESOLVED); 1584 if (cb->state != ROUTE_RESOLVED) { 1585 log(LOG_ERR, 1586 "addr/route resolution did not resolve: state %d\n", 1587 cb->state); 1588 return EINTR; 1589 } 1590 1591 DEBUG_LOG(PFX "rdma_resolve_addr - rdma_resolve_route successful\n"); 1592 return 0; 1593 } 1594 1595 static void krping_run_client(struct krping_cb *cb) 1596 { 1597 struct ib_recv_wr *bad_wr; 1598 int ret; 1599 1600 ret = krping_bind_client(cb); 1601 if (ret) 1602 return; 1603 1604 ret = krping_setup_qp(cb, cb->cm_id); 1605 if (ret) { 1606 log(LOG_ERR, "setup_qp failed: %d\n", ret); 1607 return; 1608 } 1609 1610 ret = krping_setup_buffers(cb); 1611 if (ret) { 1612 log(LOG_ERR, "krping_setup_buffers failed: %d\n", ret); 1613 goto err1; 1614 } 1615 1616 ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); 1617 if (ret) { 1618 log(LOG_ERR, "ib_post_recv failed: %d\n", ret); 1619 goto err2; 1620 } 1621 1622 ret = krping_connect_client(cb); 1623 if (ret) { 1624 log(LOG_ERR, "connect error %d\n", ret); 1625 goto err2; 1626 } 1627 1628 if (cb->wlat) 1629 krping_wlat_test_client(cb); 1630 else if (cb->rlat) 1631 krping_rlat_test_client(cb); 1632 else if (cb->bw) 1633 krping_bw_test_client(cb); 1634 else 1635 krping_test_client(cb); 1636 rdma_disconnect(cb->cm_id); 1637 err2: 1638 krping_free_buffers(cb); 1639 err1: 1640 krping_free_qp(cb); 1641 } 1642 1643 int krping_doit(char *cmd) 1644 { 1645 struct krping_cb *cb; 1646 int op; 1647 int ret = 0; 1648 char *optarg; 1649 unsigned long optint; 1650 debug = 0; 1651 1652 cb = malloc(sizeof(*cb), M_DEVBUF, M_WAITOK); 1653 if (!cb) 1654 return ENOMEM; 1655 bzero(cb, sizeof *cb); 1656 1657 mtx_lock(&krping_mutex); 1658 TAILQ_INSERT_TAIL(&krping_cbs, cb, list); 1659 mtx_unlock(&krping_mutex); 1660 1661 cb->server = -1; 1662 cb->state = IDLE; 1663 cb->size = 64; 1664 cb->txdepth = RPING_SQ_DEPTH; 1665 cb->use_dmamr = 1; 1666 cb->memlimit = 0; 1667 mtx_init(&cb->lock, "krping mtx", NULL, MTX_DUPOK|MTX_DEF); 1668 1669 while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg, 1670 &optint)) != 0) { 1671 switch (op) { 1672 case 'a': 1673 cb->addr_str = optarg; 1674 DEBUG_LOG(PFX "ipaddr (%s)\n", optarg); 1675 if (!inet_aton(optarg, &cb->addr)) { 1676 log(LOG_ERR, "bad addr string %s\n", optarg); 1677 ret = EINVAL; 1678 } 1679 break; 1680 case 'D': 1681 cb->use_dmamr = 1; 1682 DEBUG_LOG(PFX "using dma mr\n"); 1683 break; 1684 case 'p': 1685 cb->port = htons(optint); 1686 DEBUG_LOG(PFX "port %d\n", (int)optint); 1687 break; 1688 case 'P': 1689 cb->poll = 1; 1690 DEBUG_LOG("server\n"); 1691 break; 1692 case 's': 1693 cb->server = 1; 1694 DEBUG_LOG(PFX "server\n"); 1695 break; 1696 case 'c': 1697 cb->server = 0; 1698 DEBUG_LOG(PFX "client\n"); 1699 break; 1700 case 'S': 1701 cb->size = optint; 1702 if ((cb->size < 1) || 1703 (cb->size > RPING_BUFSIZE)) { 1704 log(LOG_ERR, "Invalid size %d " 1705 "(valid range is 1 to %d)\n", 1706 cb->size, RPING_BUFSIZE); 1707 ret = EINVAL; 1708 } else 1709 DEBUG_LOG(PFX "size %d\n", (int)optint); 1710 break; 1711 case 'C': 1712 cb->count = optint; 1713 if (cb->count < 0) { 1714 log(LOG_ERR, "Invalid count %d\n", 1715 cb->count); 1716 ret = EINVAL; 1717 } else 1718 DEBUG_LOG(PFX "count %d\n", (int) cb->count); 1719 break; 1720 case 'v': 1721 cb->verbose++; 1722 DEBUG_LOG(PFX "verbose\n"); 1723 break; 1724 case 'V': 1725 cb->validate++; 1726 DEBUG_LOG(PFX "validate data\n"); 1727 break; 1728 case 'L': 1729 cb->rlat++; 1730 break; 1731 case 'l': 1732 cb->wlat++; 1733 break; 1734 case 'B': 1735 cb->bw++; 1736 break; 1737 case 't': 1738 cb->txdepth = optint; 1739 DEBUG_LOG(PFX "txdepth %d\n", cb->txdepth); 1740 break; 1741 case 'd': 1742 debug++; 1743 break; 1744 case 'm': 1745 cb->memlimit = optint; 1746 if (cb->memlimit < 1) { 1747 log(LOG_ERR, "Invalid memory limit %ju\n", 1748 cb->memlimit); 1749 ret = EINVAL; 1750 } else 1751 DEBUG_LOG(PFX "memory limit %d\n", (int)optint); 1752 break; 1753 default: 1754 log(LOG_ERR, "unknown opt %s\n", optarg); 1755 ret = EINVAL; 1756 break; 1757 } 1758 } 1759 if (ret) 1760 goto out; 1761 1762 if (cb->server == -1) { 1763 log(LOG_ERR, "must be either client or server\n"); 1764 ret = EINVAL; 1765 goto out; 1766 } 1767 if ((cb->bw + cb->rlat + cb->wlat) > 1) { 1768 log(LOG_ERR, "Pick only one test: bw, rlat, wlat\n"); 1769 ret = EINVAL; 1770 goto out; 1771 } 1772 1773 1774 cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP); 1775 if (IS_ERR(cb->cm_id)) { 1776 ret = PTR_ERR(cb->cm_id); 1777 log(LOG_ERR, "rdma_create_id error %d\n", ret); 1778 goto out; 1779 } 1780 DEBUG_LOG(PFX "created cm_id %p\n", cb->cm_id); 1781 if (cb->server) 1782 krping_run_server(cb); 1783 else 1784 krping_run_client(cb); 1785 DEBUG_LOG(PFX "destroy cm_id %p\n", cb->cm_id); 1786 1787 mtx_lock(&cb->lock); 1788 cb->state = CLEANUP; 1789 wakeup(cb); 1790 mtx_unlock(&cb->lock); 1791 1792 rdma_destroy_id(cb->cm_id); 1793 out: 1794 mtx_lock(&krping_mutex); 1795 TAILQ_REMOVE(&krping_cbs, cb, list); 1796 mtx_unlock(&krping_mutex); 1797 free(cb, M_DEVBUF); 1798 return ret; 1799 } 1800 1801 void krping_init(void) 1802 { 1803 mtx_init(&krping_mutex, "krping lock", NULL, MTX_DEF); 1804 TAILQ_INIT(&krping_cbs); 1805 } 1806