1 /* 2 * Copyright (c) 2005 Ammasso, Inc. All rights reserved. 3 * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 32 */ 33 34 #include <sys/cdefs.h> 35 __FBSDID("$FreeBSD$"); 36 37 #include <sys/ctype.h> 38 39 #include <sys/param.h> 40 #include <sys/condvar.h> 41 #include <sys/systm.h> 42 #include <sys/kernel.h> 43 #include <sys/socket.h> 44 #include <sys/endian.h> 45 #include <sys/limits.h> 46 #include <sys/proc.h> 47 #include <sys/signalvar.h> 48 49 #include <sys/lock.h> 50 #include <sys/mutex.h> 51 #include <sys/rwlock.h> 52 #include <sys/queue.h> 53 #include <sys/taskqueue.h> 54 #include <sys/syslog.h> 55 #include <netinet/in.h> 56 57 #include <vm/vm.h> 58 #include <vm/pmap.h> 59 60 #include <linux/types.h> 61 #include <rdma/rdma_cm.h> 62 63 #include "getopt.h" 64 #include "krping.h" 65 66 #define PFX "krping: " 67 68 static int debug = 0; 69 #define DEBUG_LOG if (debug) printf 70 71 static const struct krping_option krping_opts[] = { 72 {"count", OPT_INT, 'C'}, 73 {"size", OPT_INT, 'S'}, 74 {"addr", OPT_STRING, 'a'}, 75 {"port", OPT_INT, 'p'}, 76 {"verbose", OPT_NOPARAM, 'v'}, 77 {"validate", OPT_NOPARAM, 'V'}, 78 {"server", OPT_NOPARAM, 's'}, 79 {"client", OPT_NOPARAM, 'c'}, 80 {"dmamr", OPT_NOPARAM, 'D'}, 81 {"debug", OPT_NOPARAM, 'd'}, 82 {"wlat", OPT_NOPARAM, 'l'}, 83 {"rlat", OPT_NOPARAM, 'L'}, 84 {"bw", OPT_NOPARAM, 'B'}, 85 {"tx-depth", OPT_INT, 't'}, 86 {"poll", OPT_NOPARAM, 'P'}, 87 {"memlimit", OPT_INT, 'm'}, 88 {NULL, 0, 0} 89 }; 90 91 struct mtx krping_mutex; 92 93 /* 94 * List of running krping threads. 95 */ 96 struct krping_cb_list krping_cbs; 97 98 /* 99 * krping "ping/pong" loop: 100 * client sends source rkey/addr/len 101 * server receives source rkey/add/len 102 * server rdma reads "ping" data from source 103 * server sends "go ahead" on rdma read completion 104 * client sends sink rkey/addr/len 105 * server receives sink rkey/addr/len 106 * server rdma writes "pong" data to sink 107 * server sends "go ahead" on rdma write completion 108 * <repeat loop> 109 */ 110 111 /* 112 * Default max buffer size for IO... 113 */ 114 #define RPING_BUFSIZE 128*1024 115 #define RPING_SQ_DEPTH 32 116 117 static void krping_wait(struct krping_cb *cb, int state) 118 { 119 int rc; 120 mtx_lock(&cb->lock); 121 while (cb->state < state) { 122 rc = msleep(cb, &cb->lock, 0, "krping", 0); 123 if (rc && rc != ERESTART) { 124 cb->state = ERROR; 125 break; 126 } 127 } 128 mtx_unlock(&cb->lock); 129 } 130 131 static int krping_cma_event_handler(struct rdma_cm_id *cma_id, 132 struct rdma_cm_event *event) 133 { 134 int ret; 135 struct krping_cb *cb = cma_id->context; 136 137 DEBUG_LOG(PFX "cma_event type %d cma_id %p (%s)\n", event->event, cma_id, 138 (cma_id == cb->cm_id) ? "parent" : "child"); 139 140 mtx_lock(&cb->lock); 141 switch (event->event) { 142 case RDMA_CM_EVENT_ADDR_RESOLVED: 143 cb->state = ADDR_RESOLVED; 144 ret = rdma_resolve_route(cma_id, 2000); 145 if (ret) { 146 log(LOG_ERR, "rdma_resolve_route error %d\n", 147 ret); 148 wakeup(cb); 149 } 150 break; 151 152 case RDMA_CM_EVENT_ROUTE_RESOLVED: 153 cb->state = ROUTE_RESOLVED; 154 wakeup(cb); 155 break; 156 157 case RDMA_CM_EVENT_CONNECT_REQUEST: 158 cb->state = CONNECT_REQUEST; 159 cb->child_cm_id = cma_id; 160 DEBUG_LOG(PFX "child cma %p\n", cb->child_cm_id); 161 wakeup(cb); 162 break; 163 164 case RDMA_CM_EVENT_ESTABLISHED: 165 DEBUG_LOG(PFX "ESTABLISHED\n"); 166 if (!cb->server) { 167 cb->state = CONNECTED; 168 wakeup(cb); 169 } 170 break; 171 172 case RDMA_CM_EVENT_ADDR_ERROR: 173 case RDMA_CM_EVENT_ROUTE_ERROR: 174 case RDMA_CM_EVENT_CONNECT_ERROR: 175 case RDMA_CM_EVENT_UNREACHABLE: 176 case RDMA_CM_EVENT_REJECTED: 177 log(LOG_ERR, "cma event %d, error %d\n", event->event, 178 event->status); 179 cb->state = ERROR; 180 wakeup(cb); 181 break; 182 183 case RDMA_CM_EVENT_DISCONNECTED: 184 DEBUG_LOG(PFX "DISCONNECT EVENT...\n"); 185 cb->state = ERROR; 186 wakeup(cb); 187 break; 188 189 case RDMA_CM_EVENT_DEVICE_REMOVAL: 190 DEBUG_LOG(PFX "cma detected device removal!!!!\n"); 191 break; 192 193 default: 194 log(LOG_ERR, "oof bad type!\n"); 195 wakeup(cb); 196 break; 197 } 198 mtx_unlock(&cb->lock); 199 return 0; 200 } 201 202 static int server_recv(struct krping_cb *cb, struct ib_wc *wc) 203 { 204 if (wc->byte_len != sizeof(cb->recv_buf)) { 205 log(LOG_ERR, "Received bogus data, size %d\n", 206 wc->byte_len); 207 return -1; 208 } 209 210 cb->remote_rkey = ntohl(cb->recv_buf.rkey); 211 cb->remote_addr = ntohll(cb->recv_buf.buf); 212 cb->remote_len = ntohl(cb->recv_buf.size); 213 DEBUG_LOG(PFX "Received rkey %x addr %llx len %d from peer\n", 214 cb->remote_rkey, (unsigned long long)cb->remote_addr, 215 cb->remote_len); 216 217 if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE) 218 cb->state = RDMA_READ_ADV; 219 else 220 cb->state = RDMA_WRITE_ADV; 221 222 return 0; 223 } 224 225 static int client_recv(struct krping_cb *cb, struct ib_wc *wc) 226 { 227 if (wc->byte_len != sizeof(cb->recv_buf)) { 228 log(LOG_ERR, "Received bogus data, size %d\n", 229 wc->byte_len); 230 return -1; 231 } 232 233 if (cb->state == RDMA_READ_ADV) 234 cb->state = RDMA_WRITE_ADV; 235 else 236 cb->state = RDMA_WRITE_COMPLETE; 237 238 return 0; 239 } 240 241 static void krping_cq_event_handler(struct ib_cq *cq, void *ctx) 242 { 243 struct krping_cb *cb = ctx; 244 struct ib_wc wc; 245 struct ib_recv_wr *bad_wr; 246 int ret; 247 248 mtx_lock(&cb->lock); 249 KASSERT(cb->cq == cq, ("bad condition")); 250 if (cb->state == ERROR) { 251 log(LOG_ERR, "cq completion in ERROR state\n"); 252 mtx_unlock(&cb->lock); 253 return; 254 } 255 if (!cb->wlat && !cb->rlat && !cb->bw) 256 ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); 257 while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) { 258 if (wc.status) { 259 if (wc.status == IB_WC_WR_FLUSH_ERR) { 260 DEBUG_LOG("cq flushed\n"); 261 continue; 262 } else { 263 log(LOG_CRIT, "cq completion failed status %d\n", 264 wc.status); 265 goto error; 266 } 267 } 268 269 switch (wc.opcode) { 270 case IB_WC_SEND: 271 DEBUG_LOG(PFX "send completion\n"); 272 cb->stats.send_bytes += cb->send_sgl.length; 273 cb->stats.send_msgs++; 274 break; 275 276 case IB_WC_RDMA_WRITE: 277 DEBUG_LOG(PFX "rdma write completion\n"); 278 cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length; 279 cb->stats.write_msgs++; 280 cb->state = RDMA_WRITE_COMPLETE; 281 wakeup(cb); 282 break; 283 284 case IB_WC_RDMA_READ: 285 DEBUG_LOG(PFX "rdma read completion\n"); 286 cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length; 287 cb->stats.read_msgs++; 288 cb->state = RDMA_READ_COMPLETE; 289 wakeup(cb); 290 break; 291 292 case IB_WC_RECV: 293 DEBUG_LOG(PFX "recv completion\n"); 294 cb->stats.recv_bytes += sizeof(cb->recv_buf); 295 cb->stats.recv_msgs++; 296 if (cb->wlat || cb->rlat || cb->bw) 297 ret = server_recv(cb, &wc); 298 else 299 ret = cb->server ? server_recv(cb, &wc) : 300 client_recv(cb, &wc); 301 if (ret) { 302 log(LOG_ERR, "recv wc error: %d\n", ret); 303 goto error; 304 } 305 306 ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); 307 if (ret) { 308 log(LOG_ERR, "post recv error: %d\n", 309 ret); 310 goto error; 311 } 312 wakeup(cb); 313 break; 314 315 default: 316 log(LOG_ERR, "unknown!!!!! completion\n"); 317 goto error; 318 } 319 } 320 if (ret) { 321 log(LOG_ERR, "poll error %d\n", ret); 322 goto error; 323 } 324 mtx_unlock(&cb->lock); 325 return; 326 error: 327 cb->state = ERROR; 328 wakeup(cb); 329 mtx_unlock(&cb->lock); 330 } 331 332 static int krping_accept(struct krping_cb *cb) 333 { 334 struct rdma_conn_param conn_param; 335 int ret; 336 337 DEBUG_LOG(PFX "accepting client connection request\n"); 338 339 memset(&conn_param, 0, sizeof conn_param); 340 conn_param.responder_resources = 1; 341 conn_param.initiator_depth = 1; 342 343 ret = rdma_accept(cb->child_cm_id, &conn_param); 344 if (ret) { 345 log(LOG_ERR, "rdma_accept error: %d\n", ret); 346 return ret; 347 } 348 349 if (!cb->wlat && !cb->rlat && !cb->bw) { 350 krping_wait(cb, CONNECTED); 351 if (cb->state == ERROR) { 352 log(LOG_ERR, "wait for CONNECTED state %d\n", cb->state); 353 return -1; 354 } 355 } 356 return 0; 357 } 358 359 static void krping_setup_wr(struct krping_cb *cb) 360 { 361 /* XXX X86 only here... not mapping for dma! */ 362 cb->recv_sgl.addr = vtophys(&cb->recv_buf); 363 cb->recv_sgl.length = sizeof cb->recv_buf; 364 if (cb->use_dmamr) 365 cb->recv_sgl.lkey = cb->dma_mr->lkey; 366 else 367 cb->recv_sgl.lkey = cb->recv_mr->lkey; 368 cb->rq_wr.sg_list = &cb->recv_sgl; 369 cb->rq_wr.num_sge = 1; 370 371 cb->send_sgl.addr = vtophys(&cb->send_buf); 372 cb->send_sgl.length = sizeof cb->send_buf; 373 if (cb->use_dmamr) 374 cb->send_sgl.lkey = cb->dma_mr->lkey; 375 else 376 cb->send_sgl.lkey = cb->send_mr->lkey; 377 378 cb->sq_wr.opcode = IB_WR_SEND; 379 cb->sq_wr.send_flags = IB_SEND_SIGNALED; 380 cb->sq_wr.sg_list = &cb->send_sgl; 381 cb->sq_wr.num_sge = 1; 382 383 cb->rdma_addr = vtophys(cb->rdma_buf); 384 cb->rdma_sgl.addr = cb->rdma_addr; 385 if (cb->use_dmamr) 386 cb->rdma_sgl.lkey = cb->dma_mr->lkey; 387 else 388 cb->rdma_sgl.lkey = cb->rdma_mr->lkey; 389 cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED; 390 cb->rdma_sq_wr.sg_list = &cb->rdma_sgl; 391 cb->rdma_sq_wr.num_sge = 1; 392 393 if (!cb->server || cb->wlat || cb->rlat || cb->bw) { 394 cb->start_addr = vtophys(cb->start_buf); 395 } 396 } 397 398 static int krping_setup_buffers(struct krping_cb *cb) 399 { 400 int ret; 401 struct ib_phys_buf buf; 402 u64 iovbase; 403 404 DEBUG_LOG(PFX "krping_setup_buffers called on cb %p\n", cb); 405 406 if (cb->use_dmamr) { 407 cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE| 408 IB_ACCESS_REMOTE_READ| 409 IB_ACCESS_REMOTE_WRITE); 410 if (IS_ERR(cb->dma_mr)) { 411 log(LOG_ERR, "reg_dmamr failed\n"); 412 return PTR_ERR(cb->dma_mr); 413 } 414 } else { 415 416 buf.addr = vtophys(&cb->recv_buf); 417 buf.size = sizeof cb->recv_buf; 418 iovbase = vtophys(&cb->recv_buf); 419 cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 420 IB_ACCESS_LOCAL_WRITE, 421 &iovbase); 422 423 if (IS_ERR(cb->recv_mr)) { 424 log(LOG_ERR, "recv_buf reg_mr failed\n"); 425 return PTR_ERR(cb->recv_mr); 426 } 427 428 buf.addr = vtophys(&cb->send_buf); 429 buf.size = sizeof cb->send_buf; 430 iovbase = vtophys(&cb->send_buf); 431 cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 432 0, &iovbase); 433 434 if (IS_ERR(cb->send_mr)) { 435 log(LOG_ERR, "send_buf reg_mr failed\n"); 436 ib_dereg_mr(cb->recv_mr); 437 return PTR_ERR(cb->send_mr); 438 } 439 } 440 441 /* RNIC adapters have a limit upto which it can register physical memory 442 * If DMA-MR memory mode is set then normally driver registers maximum 443 * supported memory. After that if contigmalloc allocates memory beyond the 444 * specified RNIC limit then Krping may not work. 445 */ 446 if (cb->use_dmamr && cb->memlimit) 447 cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, cb->memlimit, 448 PAGE_SIZE, 0); 449 else 450 cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, -1UL, 451 PAGE_SIZE, 0); 452 453 if (!cb->rdma_buf) { 454 log(LOG_ERR, "rdma_buf malloc failed\n"); 455 ret = ENOMEM; 456 goto err1; 457 } 458 if (!cb->use_dmamr) { 459 460 buf.addr = vtophys(cb->rdma_buf); 461 buf.size = cb->size; 462 iovbase = vtophys(cb->rdma_buf); 463 cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 464 IB_ACCESS_REMOTE_READ| 465 IB_ACCESS_REMOTE_WRITE, 466 &iovbase); 467 468 if (IS_ERR(cb->rdma_mr)) { 469 log(LOG_ERR, "rdma_buf reg_mr failed\n"); 470 ret = PTR_ERR(cb->rdma_mr); 471 goto err2; 472 } 473 } 474 475 if (!cb->server || cb->wlat || cb->rlat || cb->bw) { 476 if (cb->use_dmamr && cb->memlimit) 477 cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 478 0, cb->memlimit, PAGE_SIZE, 0); 479 else 480 cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 481 0, -1UL, PAGE_SIZE, 0); 482 if (!cb->start_buf) { 483 log(LOG_ERR, "start_buf malloc failed\n"); 484 ret = ENOMEM; 485 goto err2; 486 } 487 if (!cb->use_dmamr) { 488 unsigned flags = IB_ACCESS_REMOTE_READ; 489 490 if (cb->wlat || cb->rlat || cb->bw) 491 flags |= IB_ACCESS_REMOTE_WRITE; 492 buf.addr = vtophys(cb->start_buf); 493 buf.size = cb->size; 494 iovbase = vtophys(cb->start_buf); 495 cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 496 flags, 497 &iovbase); 498 499 if (IS_ERR(cb->start_mr)) { 500 log(LOG_ERR, "start_buf reg_mr failed\n"); 501 ret = PTR_ERR(cb->start_mr); 502 goto err3; 503 } 504 } 505 } 506 507 krping_setup_wr(cb); 508 DEBUG_LOG(PFX "allocated & registered buffers...\n"); 509 return 0; 510 err3: 511 contigfree(cb->start_buf, cb->size, M_DEVBUF); 512 513 if (!cb->use_dmamr) 514 ib_dereg_mr(cb->rdma_mr); 515 err2: 516 contigfree(cb->rdma_buf, cb->size, M_DEVBUF); 517 err1: 518 if (cb->use_dmamr) 519 ib_dereg_mr(cb->dma_mr); 520 else { 521 ib_dereg_mr(cb->recv_mr); 522 ib_dereg_mr(cb->send_mr); 523 } 524 return ret; 525 } 526 527 static void krping_free_buffers(struct krping_cb *cb) 528 { 529 DEBUG_LOG(PFX "krping_free_buffers called on cb %p\n", cb); 530 531 #if 0 532 dma_unmap_single(cb->pd->device->dma_device, 533 pci_unmap_addr(cb, recv_mapping), 534 sizeof(cb->recv_buf), DMA_BIDIRECTIONAL); 535 dma_unmap_single(cb->pd->device->dma_device, 536 pci_unmap_addr(cb, send_mapping), 537 sizeof(cb->send_buf), DMA_BIDIRECTIONAL); 538 dma_unmap_single(cb->pd->device->dma_device, 539 pci_unmap_addr(cb, rdma_mapping), 540 cb->size, DMA_BIDIRECTIONAL); 541 #endif 542 contigfree(cb->rdma_buf, cb->size, M_DEVBUF); 543 if (!cb->server || cb->wlat || cb->rlat || cb->bw) { 544 #if 0 545 dma_unmap_single(cb->pd->device->dma_device, 546 pci_unmap_addr(cb, start_mapping), 547 cb->size, DMA_BIDIRECTIONAL); 548 #endif 549 contigfree(cb->start_buf, cb->size, M_DEVBUF); 550 } 551 if (cb->use_dmamr) 552 ib_dereg_mr(cb->dma_mr); 553 else { 554 ib_dereg_mr(cb->send_mr); 555 ib_dereg_mr(cb->recv_mr); 556 ib_dereg_mr(cb->rdma_mr); 557 if (!cb->server) 558 ib_dereg_mr(cb->start_mr); 559 } 560 } 561 562 static int krping_create_qp(struct krping_cb *cb) 563 { 564 struct ib_qp_init_attr init_attr; 565 int ret; 566 567 memset(&init_attr, 0, sizeof(init_attr)); 568 init_attr.cap.max_send_wr = cb->txdepth; 569 init_attr.cap.max_recv_wr = 2; 570 init_attr.cap.max_recv_sge = 1; 571 init_attr.cap.max_send_sge = 1; 572 init_attr.qp_type = IB_QPT_RC; 573 init_attr.send_cq = cb->cq; 574 init_attr.recv_cq = cb->cq; 575 576 if (cb->server) { 577 ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr); 578 if (!ret) 579 cb->qp = cb->child_cm_id->qp; 580 } else { 581 ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr); 582 if (!ret) 583 cb->qp = cb->cm_id->qp; 584 } 585 586 return ret; 587 } 588 589 static void krping_free_qp(struct krping_cb *cb) 590 { 591 ib_destroy_qp(cb->qp); 592 ib_destroy_cq(cb->cq); 593 ib_dealloc_pd(cb->pd); 594 } 595 596 static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id) 597 { 598 int ret; 599 cb->pd = ib_alloc_pd(cm_id->device); 600 if (IS_ERR(cb->pd)) { 601 log(LOG_ERR, "ib_alloc_pd failed\n"); 602 return PTR_ERR(cb->pd); 603 } 604 DEBUG_LOG(PFX "created pd %p\n", cb->pd); 605 606 cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL, 607 cb, cb->txdepth * 2, 0); 608 if (IS_ERR(cb->cq)) { 609 log(LOG_ERR, "ib_create_cq failed\n"); 610 ret = PTR_ERR(cb->cq); 611 goto err1; 612 } 613 DEBUG_LOG(PFX "created cq %p\n", cb->cq); 614 615 if (!cb->wlat && !cb->rlat && !cb->bw) { 616 ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); 617 if (ret) { 618 log(LOG_ERR, "ib_create_cq failed\n"); 619 goto err2; 620 } 621 } 622 623 ret = krping_create_qp(cb); 624 if (ret) { 625 log(LOG_ERR, "krping_create_qp failed: %d\n", ret); 626 goto err2; 627 } 628 DEBUG_LOG(PFX "created qp %p\n", cb->qp); 629 return 0; 630 err2: 631 ib_destroy_cq(cb->cq); 632 err1: 633 ib_dealloc_pd(cb->pd); 634 return ret; 635 } 636 637 static void krping_format_send(struct krping_cb *cb, u64 buf, 638 struct ib_mr *mr) 639 { 640 struct krping_rdma_info *info = &cb->send_buf; 641 642 info->buf = htonll(buf); 643 info->rkey = htonl(mr->rkey); 644 info->size = htonl(cb->size); 645 646 DEBUG_LOG(PFX "RDMA addr %llx rkey %x len %d\n", 647 (unsigned long long)buf, mr->rkey, cb->size); 648 } 649 650 static void krping_test_server(struct krping_cb *cb) 651 { 652 struct ib_send_wr *bad_wr; 653 int ret; 654 655 while (1) { 656 /* Wait for client's Start STAG/TO/Len */ 657 krping_wait(cb, RDMA_READ_ADV); 658 if (cb->state != RDMA_READ_ADV) { 659 DEBUG_LOG(PFX "wait for RDMA_READ_ADV state %d\n", 660 cb->state); 661 break; 662 } 663 664 DEBUG_LOG(PFX "server received sink adv\n"); 665 666 /* Issue RDMA Read. */ 667 cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ; 668 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; 669 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; 670 cb->rdma_sq_wr.sg_list->length = cb->remote_len; 671 672 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); 673 if (ret) { 674 log(LOG_ERR, "post send error %d\n", ret); 675 break; 676 } 677 DEBUG_LOG(PFX "server posted rdma read req \n"); 678 679 /* Wait for read completion */ 680 krping_wait(cb, RDMA_READ_COMPLETE); 681 if (cb->state != RDMA_READ_COMPLETE) { 682 log(LOG_ERR, 683 "wait for RDMA_READ_COMPLETE state %d\n", 684 cb->state); 685 break; 686 } 687 DEBUG_LOG(PFX "server received read complete\n"); 688 689 /* Display data in recv buf */ 690 if (cb->verbose) 691 DEBUG_LOG("server ping data: %s\n", cb->rdma_buf); 692 693 /* Tell client to continue */ 694 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 695 if (ret) { 696 log(LOG_ERR, "post send error %d\n", ret); 697 break; 698 } 699 DEBUG_LOG(PFX "server posted go ahead\n"); 700 701 /* Wait for client's RDMA STAG/TO/Len */ 702 krping_wait(cb, RDMA_WRITE_ADV); 703 if (cb->state != RDMA_WRITE_ADV) { 704 log(LOG_ERR, 705 "wait for RDMA_WRITE_ADV state %d\n", 706 cb->state); 707 break; 708 } 709 DEBUG_LOG(PFX "server received sink adv\n"); 710 711 /* RDMA Write echo data */ 712 cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; 713 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; 714 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; 715 cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1; 716 DEBUG_LOG(PFX "rdma write from lkey %x laddr %llx len %d\n", 717 cb->rdma_sq_wr.sg_list->lkey, 718 (unsigned long long)cb->rdma_sq_wr.sg_list->addr, 719 cb->rdma_sq_wr.sg_list->length); 720 721 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); 722 if (ret) { 723 log(LOG_ERR, "post send error %d\n", ret); 724 break; 725 } 726 727 /* Wait for completion */ 728 krping_wait(cb, RDMA_WRITE_COMPLETE); 729 if (cb->state != RDMA_WRITE_COMPLETE) { 730 log(LOG_ERR, 731 "wait for RDMA_WRITE_COMPLETE state %d\n", 732 cb->state); 733 break; 734 } 735 DEBUG_LOG(PFX "server rdma write complete \n"); 736 737 cb->state = CONNECTED; 738 739 /* Tell client to begin again */ 740 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 741 if (ret) { 742 log(LOG_ERR, "post send error %d\n", ret); 743 break; 744 } 745 DEBUG_LOG(PFX "server posted go ahead\n"); 746 } 747 } 748 749 static void rlat_test(struct krping_cb *cb) 750 { 751 int scnt; 752 int iters = cb->count; 753 struct timeval start_tv, stop_tv; 754 int ret; 755 struct ib_wc wc; 756 struct ib_send_wr *bad_wr; 757 int ne; 758 759 scnt = 0; 760 cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ; 761 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; 762 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; 763 cb->rdma_sq_wr.sg_list->length = cb->size; 764 765 microtime(&start_tv); 766 if (!cb->poll) { 767 cb->state = RDMA_READ_ADV; 768 ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); 769 } 770 while (scnt < iters) { 771 772 cb->state = RDMA_READ_ADV; 773 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); 774 if (ret) { 775 log(LOG_ERR, 776 "Couldn't post send: ret=%d scnt %d\n", 777 ret, scnt); 778 return; 779 } 780 781 do { 782 if (!cb->poll) { 783 krping_wait(cb, RDMA_READ_COMPLETE); 784 if (cb->state == RDMA_READ_COMPLETE) { 785 ne = 1; 786 ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); 787 } else { 788 ne = -1; 789 } 790 } else 791 ne = ib_poll_cq(cb->cq, 1, &wc); 792 if (cb->state == ERROR) { 793 log(LOG_ERR, 794 "state == ERROR...bailing scnt %d\n", scnt); 795 return; 796 } 797 } while (ne == 0); 798 799 if (ne < 0) { 800 log(LOG_ERR, "poll CQ failed %d\n", ne); 801 return; 802 } 803 if (cb->poll && wc.status != IB_WC_SUCCESS) { 804 log(LOG_ERR, "Completion wth error at %s:\n", 805 cb->server ? "server" : "client"); 806 log(LOG_ERR, "Failed status %d: wr_id %d\n", 807 wc.status, (int) wc.wr_id); 808 return; 809 } 810 ++scnt; 811 } 812 microtime(&stop_tv); 813 814 if (stop_tv.tv_usec < start_tv.tv_usec) { 815 stop_tv.tv_usec += 1000000; 816 stop_tv.tv_sec -= 1; 817 } 818 819 log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d\n", 820 stop_tv.tv_sec - start_tv.tv_sec, 821 stop_tv.tv_usec - start_tv.tv_usec, 822 scnt, cb->size); 823 } 824 825 static int alloc_cycle_mem(int cycle_iters, 826 cycles_t **post_cycles_start, 827 cycles_t **post_cycles_stop, 828 cycles_t **poll_cycles_start, 829 cycles_t **poll_cycles_stop, 830 cycles_t **last_poll_cycles_start) 831 { 832 *post_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK); 833 if (!*post_cycles_start) { 834 goto fail1; 835 } 836 *post_cycles_stop = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK); 837 if (!*post_cycles_stop) { 838 goto fail2; 839 } 840 *poll_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK); 841 if (!*poll_cycles_start) { 842 goto fail3; 843 } 844 *poll_cycles_stop = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK); 845 if (!*poll_cycles_stop) { 846 goto fail4; 847 } 848 *last_poll_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK); 849 if (!*last_poll_cycles_start) { 850 goto fail5; 851 } 852 return 0; 853 fail5: 854 free(*poll_cycles_stop, M_DEVBUF); 855 fail4: 856 free(*poll_cycles_start, M_DEVBUF); 857 fail3: 858 free(*post_cycles_stop, M_DEVBUF); 859 fail2: 860 free(*post_cycles_start, M_DEVBUF); 861 fail1: 862 log(LOG_ERR, "%s malloc failed\n", __FUNCTION__); 863 return ENOMEM; 864 } 865 866 static void free_cycle_mem(cycles_t *post_cycles_start, 867 cycles_t *post_cycles_stop, 868 cycles_t *poll_cycles_start, 869 cycles_t *poll_cycles_stop, 870 cycles_t *last_poll_cycles_start) 871 { 872 free(last_poll_cycles_start, M_DEVBUF); 873 free(poll_cycles_stop, M_DEVBUF); 874 free(poll_cycles_start, M_DEVBUF); 875 free(post_cycles_stop, M_DEVBUF); 876 free(post_cycles_start, M_DEVBUF); 877 } 878 879 static void wlat_test(struct krping_cb *cb) 880 { 881 int ccnt, scnt, rcnt; 882 int iters=cb->count; 883 volatile char *poll_buf = (char *) cb->start_buf; 884 char *buf = (char *)cb->rdma_buf; 885 ccnt = 0; 886 scnt = 0; 887 rcnt = 0; 888 struct timeval start_tv, stop_tv; 889 cycles_t *post_cycles_start, *post_cycles_stop; 890 cycles_t *poll_cycles_start, *poll_cycles_stop; 891 cycles_t *last_poll_cycles_start; 892 cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0; 893 int i; 894 int cycle_iters = 1000; 895 int err; 896 897 err = alloc_cycle_mem(cycle_iters, &post_cycles_start, &post_cycles_stop, 898 &poll_cycles_start, &poll_cycles_stop, &last_poll_cycles_start); 899 900 if (err) { 901 log(LOG_ERR, "%s malloc failed\n", __FUNCTION__); 902 return; 903 } 904 905 cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; 906 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; 907 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; 908 cb->rdma_sq_wr.sg_list->length = cb->size; 909 910 if (cycle_iters > iters) 911 cycle_iters = iters; 912 microtime(&start_tv); 913 while (scnt < iters || ccnt < iters || rcnt < iters) { 914 915 /* Wait till buffer changes. */ 916 if (rcnt < iters && !(scnt < 1 && !cb->server)) { 917 ++rcnt; 918 while (*poll_buf != (char)rcnt) { 919 if (cb->state == ERROR) { 920 log(LOG_ERR, "state = ERROR, bailing\n"); 921 return; 922 } 923 } 924 } 925 926 if (scnt < iters) { 927 struct ib_send_wr *bad_wr; 928 929 *buf = (char)scnt+1; 930 if (scnt < cycle_iters) 931 post_cycles_start[scnt] = get_cycles(); 932 if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { 933 log(LOG_ERR, "Couldn't post send: scnt=%d\n", 934 scnt); 935 return; 936 } 937 if (scnt < cycle_iters) 938 post_cycles_stop[scnt] = get_cycles(); 939 scnt++; 940 } 941 942 if (ccnt < iters) { 943 struct ib_wc wc; 944 int ne; 945 946 if (ccnt < cycle_iters) 947 poll_cycles_start[ccnt] = get_cycles(); 948 do { 949 if (ccnt < cycle_iters) 950 last_poll_cycles_start[ccnt] = get_cycles(); 951 ne = ib_poll_cq(cb->cq, 1, &wc); 952 } while (ne == 0); 953 if (ccnt < cycle_iters) 954 poll_cycles_stop[ccnt] = get_cycles(); 955 ++ccnt; 956 957 if (ne < 0) { 958 log(LOG_ERR, "poll CQ failed %d\n", ne); 959 return; 960 } 961 if (wc.status != IB_WC_SUCCESS) { 962 log(LOG_ERR, "Completion wth error at %s:\n", 963 cb->server ? "server" : "client"); 964 log(LOG_ERR, "Failed status %d: wr_id %d\n", 965 wc.status, (int) wc.wr_id); 966 log(LOG_ERR, "scnt=%d, rcnt=%d, ccnt=%d\n", 967 scnt, rcnt, ccnt); 968 return; 969 } 970 } 971 } 972 microtime(&stop_tv); 973 974 if (stop_tv.tv_usec < start_tv.tv_usec) { 975 stop_tv.tv_usec += 1000000; 976 stop_tv.tv_sec -= 1; 977 } 978 979 for (i=0; i < cycle_iters; i++) { 980 sum_post += post_cycles_stop[i] - post_cycles_start[i]; 981 sum_poll += poll_cycles_stop[i] - poll_cycles_start[i]; 982 sum_last_poll += poll_cycles_stop[i] - last_poll_cycles_start[i]; 983 } 984 985 log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d cycle_iters %d sum_post %llu sum_poll %llu sum_last_poll %llu\n", 986 stop_tv.tv_sec - start_tv.tv_sec, 987 stop_tv.tv_usec - start_tv.tv_usec, 988 scnt, cb->size, cycle_iters, 989 (unsigned long long)sum_post, (unsigned long long)sum_poll, 990 (unsigned long long)sum_last_poll); 991 992 free_cycle_mem(post_cycles_start, post_cycles_stop, poll_cycles_start, 993 poll_cycles_stop, last_poll_cycles_start); 994 } 995 996 static void bw_test(struct krping_cb *cb) 997 { 998 int ccnt, scnt, rcnt; 999 int iters=cb->count; 1000 ccnt = 0; 1001 scnt = 0; 1002 rcnt = 0; 1003 struct timeval start_tv, stop_tv; 1004 cycles_t *post_cycles_start, *post_cycles_stop; 1005 cycles_t *poll_cycles_start, *poll_cycles_stop; 1006 cycles_t *last_poll_cycles_start; 1007 cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0; 1008 int i; 1009 int cycle_iters = 1000; 1010 int err; 1011 1012 err = alloc_cycle_mem(cycle_iters, &post_cycles_start, &post_cycles_stop, 1013 &poll_cycles_start, &poll_cycles_stop, &last_poll_cycles_start); 1014 1015 if (err) { 1016 log(LOG_ERR, "%s kmalloc failed\n", __FUNCTION__); 1017 return; 1018 } 1019 1020 cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; 1021 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; 1022 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; 1023 cb->rdma_sq_wr.sg_list->length = cb->size; 1024 1025 if (cycle_iters > iters) 1026 cycle_iters = iters; 1027 microtime(&start_tv); 1028 while (scnt < iters || ccnt < iters) { 1029 1030 while (scnt < iters && scnt - ccnt < cb->txdepth) { 1031 struct ib_send_wr *bad_wr; 1032 1033 if (scnt < cycle_iters) 1034 post_cycles_start[scnt] = get_cycles(); 1035 if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { 1036 log(LOG_ERR, "Couldn't post send: scnt=%d\n", 1037 scnt); 1038 return; 1039 } 1040 if (scnt < cycle_iters) 1041 post_cycles_stop[scnt] = get_cycles(); 1042 ++scnt; 1043 } 1044 1045 if (ccnt < iters) { 1046 int ne; 1047 struct ib_wc wc; 1048 1049 if (ccnt < cycle_iters) 1050 poll_cycles_start[ccnt] = get_cycles(); 1051 do { 1052 if (ccnt < cycle_iters) 1053 last_poll_cycles_start[ccnt] = get_cycles(); 1054 ne = ib_poll_cq(cb->cq, 1, &wc); 1055 } while (ne == 0); 1056 if (ccnt < cycle_iters) 1057 poll_cycles_stop[ccnt] = get_cycles(); 1058 ccnt += 1; 1059 1060 if (ne < 0) { 1061 log(LOG_ERR, "poll CQ failed %d\n", ne); 1062 return; 1063 } 1064 if (wc.status != IB_WC_SUCCESS) { 1065 log(LOG_ERR, "Completion wth error at %s:\n", 1066 cb->server ? "server" : "client"); 1067 log(LOG_ERR, "Failed status %d: wr_id %d\n", 1068 wc.status, (int) wc.wr_id); 1069 return; 1070 } 1071 } 1072 } 1073 microtime(&stop_tv); 1074 1075 if (stop_tv.tv_usec < start_tv.tv_usec) { 1076 stop_tv.tv_usec += 1000000; 1077 stop_tv.tv_sec -= 1; 1078 } 1079 1080 for (i=0; i < cycle_iters; i++) { 1081 sum_post += post_cycles_stop[i] - post_cycles_start[i]; 1082 sum_poll += poll_cycles_stop[i] - poll_cycles_start[i]; 1083 sum_last_poll += poll_cycles_stop[i] - last_poll_cycles_start[i]; 1084 } 1085 1086 log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d cycle_iters %d sum_post %llu sum_poll %llu sum_last_poll %llu\n", 1087 stop_tv.tv_sec - start_tv.tv_sec, 1088 stop_tv.tv_usec - start_tv.tv_usec, 1089 scnt, cb->size, cycle_iters, 1090 (unsigned long long)sum_post, (unsigned long long)sum_poll, 1091 (unsigned long long)sum_last_poll); 1092 1093 free_cycle_mem(post_cycles_start, post_cycles_stop, poll_cycles_start, 1094 poll_cycles_stop, last_poll_cycles_start); 1095 } 1096 1097 static void krping_rlat_test_server(struct krping_cb *cb) 1098 { 1099 struct ib_send_wr *bad_wr; 1100 struct ib_wc wc; 1101 int ret; 1102 1103 /* Spin waiting for client's Start STAG/TO/Len */ 1104 while (cb->state < RDMA_READ_ADV) { 1105 krping_cq_event_handler(cb->cq, cb); 1106 } 1107 1108 /* Send STAG/TO/Len to client */ 1109 if (cb->dma_mr) 1110 krping_format_send(cb, cb->start_addr, cb->dma_mr); 1111 else 1112 krping_format_send(cb, cb->start_addr, cb->start_mr); 1113 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 1114 if (ret) { 1115 log(LOG_ERR, "post send error %d\n", ret); 1116 return; 1117 } 1118 1119 /* Spin waiting for send completion */ 1120 while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); 1121 if (ret < 0) { 1122 log(LOG_ERR, "poll error %d\n", ret); 1123 return; 1124 } 1125 if (wc.status) { 1126 log(LOG_ERR, "send completiong error %d\n", wc.status); 1127 return; 1128 } 1129 1130 krping_wait(cb, ERROR); 1131 } 1132 1133 static void krping_wlat_test_server(struct krping_cb *cb) 1134 { 1135 struct ib_send_wr *bad_wr; 1136 struct ib_wc wc; 1137 int ret; 1138 1139 /* Spin waiting for client's Start STAG/TO/Len */ 1140 while (cb->state < RDMA_READ_ADV) { 1141 krping_cq_event_handler(cb->cq, cb); 1142 } 1143 1144 /* Send STAG/TO/Len to client */ 1145 if (cb->dma_mr) 1146 krping_format_send(cb, cb->start_addr, cb->dma_mr); 1147 else 1148 krping_format_send(cb, cb->start_addr, cb->start_mr); 1149 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 1150 if (ret) { 1151 log(LOG_ERR, "post send error %d\n", ret); 1152 return; 1153 } 1154 1155 /* Spin waiting for send completion */ 1156 while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); 1157 if (ret < 0) { 1158 log(LOG_ERR, "poll error %d\n", ret); 1159 return; 1160 } 1161 if (wc.status) { 1162 log(LOG_ERR, "send completiong error %d\n", wc.status); 1163 return; 1164 } 1165 1166 wlat_test(cb); 1167 1168 } 1169 1170 static void krping_bw_test_server(struct krping_cb *cb) 1171 { 1172 struct ib_send_wr *bad_wr; 1173 struct ib_wc wc; 1174 int ret; 1175 1176 /* Spin waiting for client's Start STAG/TO/Len */ 1177 while (cb->state < RDMA_READ_ADV) { 1178 krping_cq_event_handler(cb->cq, cb); 1179 } 1180 1181 /* Send STAG/TO/Len to client */ 1182 if (cb->dma_mr) 1183 krping_format_send(cb, cb->start_addr, cb->dma_mr); 1184 else 1185 krping_format_send(cb, cb->start_addr, cb->start_mr); 1186 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 1187 if (ret) { 1188 log(LOG_ERR, "post send error %d\n", ret); 1189 return; 1190 } 1191 1192 /* Spin waiting for send completion */ 1193 while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); 1194 if (ret < 0) { 1195 log(LOG_ERR, "poll error %d\n", ret); 1196 return; 1197 } 1198 if (wc.status) { 1199 log(LOG_ERR, "send completiong error %d\n", wc.status); 1200 return; 1201 } 1202 1203 if (cb->duplex) 1204 bw_test(cb); 1205 krping_wait(cb, ERROR); 1206 } 1207 1208 static int krping_bind_server(struct krping_cb *cb) 1209 { 1210 struct sockaddr_in sin; 1211 int ret; 1212 1213 memset(&sin, 0, sizeof(sin)); 1214 sin.sin_len = sizeof sin; 1215 sin.sin_family = AF_INET; 1216 sin.sin_addr.s_addr = cb->addr.s_addr; 1217 sin.sin_port = cb->port; 1218 1219 ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin); 1220 if (ret) { 1221 log(LOG_ERR, "rdma_bind_addr error %d\n", ret); 1222 return ret; 1223 } 1224 DEBUG_LOG(PFX "rdma_bind_addr successful\n"); 1225 1226 DEBUG_LOG(PFX "rdma_listen\n"); 1227 ret = rdma_listen(cb->cm_id, 3); 1228 if (ret) { 1229 log(LOG_ERR, "rdma_listen failed: %d\n", ret); 1230 return ret; 1231 } 1232 1233 krping_wait(cb, CONNECT_REQUEST); 1234 if (cb->state != CONNECT_REQUEST) { 1235 log(LOG_ERR, "wait for CONNECT_REQUEST state %d\n", 1236 cb->state); 1237 return -1; 1238 } 1239 1240 return 0; 1241 } 1242 1243 static void krping_run_server(struct krping_cb *cb) 1244 { 1245 struct ib_recv_wr *bad_wr; 1246 int ret; 1247 1248 ret = krping_bind_server(cb); 1249 if (ret) 1250 return; 1251 1252 ret = krping_setup_qp(cb, cb->child_cm_id); 1253 if (ret) { 1254 log(LOG_ERR, "setup_qp failed: %d\n", ret); 1255 return; 1256 } 1257 1258 ret = krping_setup_buffers(cb); 1259 if (ret) { 1260 log(LOG_ERR, "krping_setup_buffers failed: %d\n", ret); 1261 goto err1; 1262 } 1263 1264 ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); 1265 if (ret) { 1266 log(LOG_ERR, "ib_post_recv failed: %d\n", ret); 1267 goto err2; 1268 } 1269 1270 ret = krping_accept(cb); 1271 if (ret) { 1272 log(LOG_ERR, "connect error %d\n", ret); 1273 goto err2; 1274 } 1275 1276 if (cb->wlat) 1277 krping_wlat_test_server(cb); 1278 else if (cb->rlat) 1279 krping_rlat_test_server(cb); 1280 else if (cb->bw) 1281 krping_bw_test_server(cb); 1282 else 1283 krping_test_server(cb); 1284 1285 rdma_disconnect(cb->child_cm_id); 1286 rdma_destroy_id(cb->child_cm_id); 1287 err2: 1288 krping_free_buffers(cb); 1289 err1: 1290 krping_free_qp(cb); 1291 } 1292 1293 static void krping_test_client(struct krping_cb *cb) 1294 { 1295 int ping, start, cc, i, ret; 1296 struct ib_send_wr *bad_wr; 1297 unsigned char c; 1298 1299 start = 65; 1300 for (ping = 0; !cb->count || ping < cb->count; ping++) { 1301 cb->state = RDMA_READ_ADV; 1302 1303 /* Put some ascii text in the buffer. */ 1304 cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping); 1305 for (i = cc, c = start; i < cb->size; i++) { 1306 cb->start_buf[i] = c; 1307 c++; 1308 if (c > 122) 1309 c = 65; 1310 } 1311 start++; 1312 if (start > 122) 1313 start = 65; 1314 cb->start_buf[cb->size - 1] = 0; 1315 1316 if (cb->dma_mr) 1317 krping_format_send(cb, cb->start_addr, cb->dma_mr); 1318 else 1319 krping_format_send(cb, cb->start_addr, cb->start_mr); 1320 1321 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 1322 if (ret) { 1323 log(LOG_ERR, "post send error %d\n", ret); 1324 break; 1325 } 1326 1327 /* Wait for server to ACK */ 1328 krping_wait(cb, RDMA_WRITE_ADV); 1329 if (cb->state != RDMA_WRITE_ADV) { 1330 log(LOG_ERR, 1331 "wait for RDMA_WRITE_ADV state %d\n", 1332 cb->state); 1333 break; 1334 } 1335 1336 if (cb->dma_mr) 1337 krping_format_send(cb, cb->rdma_addr, cb->dma_mr); 1338 else 1339 krping_format_send(cb, cb->rdma_addr, cb->rdma_mr); 1340 1341 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 1342 if (ret) { 1343 log(LOG_ERR, "post send error %d\n", ret); 1344 break; 1345 } 1346 1347 /* Wait for the server to say the RDMA Write is complete. */ 1348 krping_wait(cb, RDMA_WRITE_COMPLETE); 1349 if (cb->state != RDMA_WRITE_COMPLETE) { 1350 log(LOG_ERR, 1351 "wait for RDMA_WRITE_COMPLETE state %d\n", 1352 cb->state); 1353 break; 1354 } 1355 1356 if (cb->validate) 1357 if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) { 1358 log(LOG_ERR, "data mismatch!\n"); 1359 break; 1360 } 1361 1362 if (cb->verbose) 1363 DEBUG_LOG("ping data: %s\n", cb->rdma_buf); 1364 } 1365 } 1366 1367 static void krping_rlat_test_client(struct krping_cb *cb) 1368 { 1369 struct ib_send_wr *bad_wr; 1370 struct ib_wc wc; 1371 int ret; 1372 1373 cb->state = RDMA_READ_ADV; 1374 1375 /* Send STAG/TO/Len to client */ 1376 if (cb->dma_mr) 1377 krping_format_send(cb, cb->start_addr, cb->dma_mr); 1378 else 1379 krping_format_send(cb, cb->start_addr, cb->rdma_mr); 1380 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 1381 if (ret) { 1382 log(LOG_ERR, "post send error %d\n", ret); 1383 return; 1384 } 1385 1386 /* Spin waiting for send completion */ 1387 while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); 1388 if (ret < 0) { 1389 log(LOG_ERR, "poll error %d\n", ret); 1390 return; 1391 } 1392 if (wc.status) { 1393 log(LOG_ERR, "send completion error %d\n", wc.status); 1394 return; 1395 } 1396 1397 /* Spin waiting for server's Start STAG/TO/Len */ 1398 while (cb->state < RDMA_WRITE_ADV) { 1399 krping_cq_event_handler(cb->cq, cb); 1400 } 1401 1402 #if 0 1403 { 1404 int i; 1405 struct timeval start, stop; 1406 time_t sec; 1407 suseconds_t usec; 1408 unsigned long long elapsed; 1409 struct ib_wc wc; 1410 struct ib_send_wr *bad_wr; 1411 int ne; 1412 1413 cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; 1414 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; 1415 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; 1416 cb->rdma_sq_wr.sg_list->length = 0; 1417 cb->rdma_sq_wr.num_sge = 0; 1418 1419 microtime(&start); 1420 for (i=0; i < 100000; i++) { 1421 if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { 1422 log(LOG_ERR, "Couldn't post send\n"); 1423 return; 1424 } 1425 do { 1426 ne = ib_poll_cq(cb->cq, 1, &wc); 1427 } while (ne == 0); 1428 if (ne < 0) { 1429 log(LOG_ERR, "poll CQ failed %d\n", ne); 1430 return; 1431 } 1432 if (wc.status != IB_WC_SUCCESS) { 1433 log(LOG_ERR, "Completion wth error at %s:\n", 1434 cb->server ? "server" : "client"); 1435 log(LOG_ERR, "Failed status %d: wr_id %d\n", 1436 wc.status, (int) wc.wr_id); 1437 return; 1438 } 1439 } 1440 microtime(&stop); 1441 1442 if (stop.tv_usec < start.tv_usec) { 1443 stop.tv_usec += 1000000; 1444 stop.tv_sec -= 1; 1445 } 1446 sec = stop.tv_sec - start.tv_sec; 1447 usec = stop.tv_usec - start.tv_usec; 1448 elapsed = sec * 1000000 + usec; 1449 log(LOG_ERR, "0B-write-lat iters 100000 usec %llu\n", elapsed); 1450 } 1451 #endif 1452 1453 rlat_test(cb); 1454 } 1455 1456 static void krping_wlat_test_client(struct krping_cb *cb) 1457 { 1458 struct ib_send_wr *bad_wr; 1459 struct ib_wc wc; 1460 int ret; 1461 1462 cb->state = RDMA_READ_ADV; 1463 1464 /* Send STAG/TO/Len to client */ 1465 if (cb->dma_mr) 1466 krping_format_send(cb, cb->start_addr, cb->dma_mr); 1467 else 1468 krping_format_send(cb, cb->start_addr, cb->start_mr); 1469 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 1470 if (ret) { 1471 log(LOG_ERR, "post send error %d\n", ret); 1472 return; 1473 } 1474 1475 /* Spin waiting for send completion */ 1476 while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); 1477 if (ret < 0) { 1478 log(LOG_ERR, "poll error %d\n", ret); 1479 return; 1480 } 1481 if (wc.status) { 1482 log(LOG_ERR, "send completion error %d\n", wc.status); 1483 return; 1484 } 1485 1486 /* Spin waiting for server's Start STAG/TO/Len */ 1487 while (cb->state < RDMA_WRITE_ADV) { 1488 krping_cq_event_handler(cb->cq, cb); 1489 } 1490 1491 wlat_test(cb); 1492 } 1493 1494 static void krping_bw_test_client(struct krping_cb *cb) 1495 { 1496 struct ib_send_wr *bad_wr; 1497 struct ib_wc wc; 1498 int ret; 1499 1500 cb->state = RDMA_READ_ADV; 1501 1502 /* Send STAG/TO/Len to client */ 1503 if (cb->dma_mr) 1504 krping_format_send(cb, cb->start_addr, cb->dma_mr); 1505 else 1506 krping_format_send(cb, cb->start_addr, cb->start_mr); 1507 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 1508 if (ret) { 1509 log(LOG_ERR, "post send error %d\n", ret); 1510 return; 1511 } 1512 1513 /* Spin waiting for send completion */ 1514 while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); 1515 if (ret < 0) { 1516 log(LOG_ERR, "poll error %d\n", ret); 1517 return; 1518 } 1519 if (wc.status) { 1520 log(LOG_ERR, "send completion error %d\n", wc.status); 1521 return; 1522 } 1523 1524 /* Spin waiting for server's Start STAG/TO/Len */ 1525 while (cb->state < RDMA_WRITE_ADV) { 1526 krping_cq_event_handler(cb->cq, cb); 1527 } 1528 1529 bw_test(cb); 1530 } 1531 1532 static int krping_connect_client(struct krping_cb *cb) 1533 { 1534 struct rdma_conn_param conn_param; 1535 int ret; 1536 1537 memset(&conn_param, 0, sizeof conn_param); 1538 conn_param.responder_resources = 1; 1539 conn_param.initiator_depth = 1; 1540 conn_param.retry_count = 10; 1541 1542 ret = rdma_connect(cb->cm_id, &conn_param); 1543 if (ret) { 1544 log(LOG_ERR, "rdma_connect error %d\n", ret); 1545 return ret; 1546 } 1547 1548 krping_wait(cb, CONNECTED); 1549 if (cb->state == ERROR) { 1550 log(LOG_ERR, "wait for CONNECTED state %d\n", cb->state); 1551 return -1; 1552 } 1553 1554 DEBUG_LOG(PFX "rdma_connect successful\n"); 1555 return 0; 1556 } 1557 1558 static int krping_bind_client(struct krping_cb *cb) 1559 { 1560 struct sockaddr_in sin; 1561 int ret; 1562 1563 memset(&sin, 0, sizeof(sin)); 1564 sin.sin_len = sizeof sin; 1565 sin.sin_family = AF_INET; 1566 sin.sin_addr.s_addr = cb->addr.s_addr; 1567 sin.sin_port = cb->port; 1568 1569 ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin, 1570 2000); 1571 if (ret) { 1572 log(LOG_ERR, "rdma_resolve_addr error %d\n", ret); 1573 return ret; 1574 } 1575 1576 krping_wait(cb, ROUTE_RESOLVED); 1577 if (cb->state != ROUTE_RESOLVED) { 1578 log(LOG_ERR, 1579 "addr/route resolution did not resolve: state %d\n", 1580 cb->state); 1581 return EINTR; 1582 } 1583 1584 DEBUG_LOG(PFX "rdma_resolve_addr - rdma_resolve_route successful\n"); 1585 return 0; 1586 } 1587 1588 static void krping_run_client(struct krping_cb *cb) 1589 { 1590 struct ib_recv_wr *bad_wr; 1591 int ret; 1592 1593 ret = krping_bind_client(cb); 1594 if (ret) 1595 return; 1596 1597 ret = krping_setup_qp(cb, cb->cm_id); 1598 if (ret) { 1599 log(LOG_ERR, "setup_qp failed: %d\n", ret); 1600 return; 1601 } 1602 1603 ret = krping_setup_buffers(cb); 1604 if (ret) { 1605 log(LOG_ERR, "krping_setup_buffers failed: %d\n", ret); 1606 goto err1; 1607 } 1608 1609 ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); 1610 if (ret) { 1611 log(LOG_ERR, "ib_post_recv failed: %d\n", ret); 1612 goto err2; 1613 } 1614 1615 ret = krping_connect_client(cb); 1616 if (ret) { 1617 log(LOG_ERR, "connect error %d\n", ret); 1618 goto err2; 1619 } 1620 1621 if (cb->wlat) 1622 krping_wlat_test_client(cb); 1623 else if (cb->rlat) 1624 krping_rlat_test_client(cb); 1625 else if (cb->bw) 1626 krping_bw_test_client(cb); 1627 else 1628 krping_test_client(cb); 1629 rdma_disconnect(cb->cm_id); 1630 err2: 1631 krping_free_buffers(cb); 1632 err1: 1633 krping_free_qp(cb); 1634 } 1635 1636 int krping_doit(char *cmd) 1637 { 1638 struct krping_cb *cb; 1639 int op; 1640 int ret = 0; 1641 char *optarg; 1642 unsigned long optint; 1643 debug = 0; 1644 1645 cb = malloc(sizeof(*cb), M_DEVBUF, M_WAITOK); 1646 if (!cb) 1647 return ENOMEM; 1648 bzero(cb, sizeof *cb); 1649 1650 mtx_lock(&krping_mutex); 1651 TAILQ_INSERT_TAIL(&krping_cbs, cb, list); 1652 mtx_unlock(&krping_mutex); 1653 1654 cb->server = -1; 1655 cb->state = IDLE; 1656 cb->size = 64; 1657 cb->txdepth = RPING_SQ_DEPTH; 1658 cb->use_dmamr = 1; 1659 cb->memlimit = 0; 1660 mtx_init(&cb->lock, "krping mtx", NULL, MTX_DUPOK|MTX_DEF); 1661 1662 while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg, 1663 &optint)) != 0) { 1664 switch (op) { 1665 case 'a': 1666 cb->addr_str = optarg; 1667 DEBUG_LOG(PFX "ipaddr (%s)\n", optarg); 1668 if (!inet_aton(optarg, &cb->addr)) { 1669 log(LOG_ERR, "bad addr string %s\n", optarg); 1670 ret = EINVAL; 1671 } 1672 break; 1673 case 'D': 1674 cb->use_dmamr = 1; 1675 DEBUG_LOG(PFX "using dma mr\n"); 1676 break; 1677 case 'p': 1678 cb->port = htons(optint); 1679 DEBUG_LOG(PFX "port %d\n", (int)optint); 1680 break; 1681 case 'P': 1682 cb->poll = 1; 1683 DEBUG_LOG("server\n"); 1684 break; 1685 case 's': 1686 cb->server = 1; 1687 DEBUG_LOG(PFX "server\n"); 1688 break; 1689 case 'c': 1690 cb->server = 0; 1691 DEBUG_LOG(PFX "client\n"); 1692 break; 1693 case 'S': 1694 cb->size = optint; 1695 if ((cb->size < 1) || 1696 (cb->size > RPING_BUFSIZE)) { 1697 log(LOG_ERR, "Invalid size %d " 1698 "(valid range is 1 to %d)\n", 1699 cb->size, RPING_BUFSIZE); 1700 ret = EINVAL; 1701 } else 1702 DEBUG_LOG(PFX "size %d\n", (int)optint); 1703 break; 1704 case 'C': 1705 cb->count = optint; 1706 if (cb->count < 0) { 1707 log(LOG_ERR, "Invalid count %d\n", 1708 cb->count); 1709 ret = EINVAL; 1710 } else 1711 DEBUG_LOG(PFX "count %d\n", (int) cb->count); 1712 break; 1713 case 'v': 1714 cb->verbose++; 1715 DEBUG_LOG(PFX "verbose\n"); 1716 break; 1717 case 'V': 1718 cb->validate++; 1719 DEBUG_LOG(PFX "validate data\n"); 1720 break; 1721 case 'L': 1722 cb->rlat++; 1723 break; 1724 case 'l': 1725 cb->wlat++; 1726 break; 1727 case 'B': 1728 cb->bw++; 1729 break; 1730 case 't': 1731 cb->txdepth = optint; 1732 DEBUG_LOG(PFX "txdepth %d\n", cb->txdepth); 1733 break; 1734 case 'd': 1735 debug++; 1736 break; 1737 case 'm': 1738 cb->memlimit = optint; 1739 if (cb->memlimit < 1) { 1740 log(LOG_ERR, "Invalid memory limit %ju\n", 1741 cb->memlimit); 1742 ret = EINVAL; 1743 } else 1744 DEBUG_LOG(PFX "memory limit %d\n", (int)optint); 1745 break; 1746 default: 1747 log(LOG_ERR, "unknown opt %s\n", optarg); 1748 ret = EINVAL; 1749 break; 1750 } 1751 } 1752 if (ret) 1753 goto out; 1754 1755 if (cb->server == -1) { 1756 log(LOG_ERR, "must be either client or server\n"); 1757 ret = EINVAL; 1758 goto out; 1759 } 1760 if ((cb->bw + cb->rlat + cb->wlat) > 1) { 1761 log(LOG_ERR, "Pick only one test: bw, rlat, wlat\n"); 1762 ret = EINVAL; 1763 goto out; 1764 } 1765 1766 1767 cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP); 1768 if (IS_ERR(cb->cm_id)) { 1769 ret = PTR_ERR(cb->cm_id); 1770 log(LOG_ERR, "rdma_create_id error %d\n", ret); 1771 goto out; 1772 } 1773 DEBUG_LOG(PFX "created cm_id %p\n", cb->cm_id); 1774 if (cb->server) 1775 krping_run_server(cb); 1776 else 1777 krping_run_client(cb); 1778 DEBUG_LOG(PFX "destroy cm_id %p\n", cb->cm_id); 1779 rdma_destroy_id(cb->cm_id); 1780 out: 1781 mtx_lock(&krping_mutex); 1782 TAILQ_REMOVE(&krping_cbs, cb, list); 1783 mtx_unlock(&krping_mutex); 1784 free(cb, M_DEVBUF); 1785 return ret; 1786 } 1787 1788 void krping_init(void) 1789 { 1790 mtx_init(&krping_mutex, "krping lock", NULL, MTX_DEF); 1791 TAILQ_INIT(&krping_cbs); 1792 } 1793