1 /* 2 * Copyright (c) 2005 Ammasso, Inc. All rights reserved. 3 * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 32 */ 33 34 #include <sys/cdefs.h> 35 __FBSDID("$FreeBSD$"); 36 37 #include <sys/ctype.h> 38 39 #include <sys/param.h> 40 #include <sys/condvar.h> 41 #include <sys/systm.h> 42 #include <sys/kernel.h> 43 #include <sys/socket.h> 44 #include <sys/module.h> 45 #include <sys/endian.h> 46 #include <sys/limits.h> 47 #include <sys/proc.h> 48 #include <sys/signalvar.h> 49 50 #include <sys/lock.h> 51 #include <sys/mutex.h> 52 #include <sys/rwlock.h> 53 #include <sys/queue.h> 54 #include <sys/taskqueue.h> 55 #include <sys/syslog.h> 56 57 #include <vm/vm.h> 58 #include <vm/pmap.h> 59 60 #include <contrib/rdma/rdma_cm.h> 61 62 #include "getopt.h" 63 #include "krping.h" 64 65 #define PFX "krping: " 66 67 static int debug = 0; 68 #define DEBUG_LOG if (debug) printf 69 70 static const struct krping_option krping_opts[] = { 71 {"count", OPT_INT, 'C'}, 72 {"size", OPT_INT, 'S'}, 73 {"addr", OPT_STRING, 'a'}, 74 {"port", OPT_INT, 'p'}, 75 {"verbose", OPT_NOPARAM, 'v'}, 76 {"validate", OPT_NOPARAM, 'V'}, 77 {"server", OPT_NOPARAM, 's'}, 78 {"client", OPT_NOPARAM, 'c'}, 79 {"dmamr", OPT_NOPARAM, 'D'}, 80 {"debug", OPT_NOPARAM, 'd'}, 81 {"wlat", OPT_NOPARAM, 'l'}, 82 {"rlat", OPT_NOPARAM, 'L'}, 83 {"bw", OPT_NOPARAM, 'B'}, 84 {"tx-depth", OPT_INT, 't'}, 85 {"poll", OPT_NOPARAM, 'P'}, 86 {NULL, 0, 0} 87 }; 88 89 struct mtx krping_mutex; 90 91 /* 92 * List of running krping threads. 93 */ 94 struct krping_cb_list krping_cbs; 95 96 /* 97 * krping "ping/pong" loop: 98 * client sends source rkey/addr/len 99 * server receives source rkey/add/len 100 * server rdma reads "ping" data from source 101 * server sends "go ahead" on rdma read completion 102 * client sends sink rkey/addr/len 103 * server receives sink rkey/addr/len 104 * server rdma writes "pong" data to sink 105 * server sends "go ahead" on rdma write completion 106 * <repeat loop> 107 */ 108 109 /* 110 * Default max buffer size for IO... 111 */ 112 #define RPING_BUFSIZE 128*1024 113 #define RPING_SQ_DEPTH 32 114 115 116 /* lifted from netinet/libalias/alias_proxy.c */ 117 static int inet_aton(const char *cp, struct in_addr *addr); 118 static int 119 inet_aton(cp, addr) 120 const char *cp; 121 struct in_addr *addr; 122 { 123 u_long parts[4]; 124 in_addr_t val; 125 const char *c; 126 char *endptr; 127 int gotend, n; 128 129 c = (const char *)cp; 130 n = 0; 131 /* 132 * Run through the string, grabbing numbers until 133 * the end of the string, or some error 134 */ 135 gotend = 0; 136 while (!gotend) { 137 unsigned long l; 138 139 l = strtoul(c, &endptr, 0); 140 141 if (l == ULONG_MAX || (l == 0 && endptr == c)) 142 return (0); 143 144 val = (in_addr_t)l; 145 /* 146 * If the whole string is invalid, endptr will equal 147 * c.. this way we can make sure someone hasn't 148 * gone '.12' or something which would get past 149 * the next check. 150 */ 151 if (endptr == c) 152 return (0); 153 parts[n] = val; 154 c = endptr; 155 156 /* Check the next character past the previous number's end */ 157 switch (*c) { 158 case '.' : 159 /* Make sure we only do 3 dots .. */ 160 if (n == 3) /* Whoops. Quit. */ 161 return (0); 162 n++; 163 c++; 164 break; 165 166 case '\0': 167 gotend = 1; 168 break; 169 170 default: 171 if (isspace((unsigned char)*c)) { 172 gotend = 1; 173 break; 174 } else 175 return (0); /* Invalid character, so fail */ 176 } 177 178 } 179 180 /* 181 * Concoct the address according to 182 * the number of parts specified. 183 */ 184 185 switch (n) { 186 case 0: /* a -- 32 bits */ 187 /* 188 * Nothing is necessary here. Overflow checking was 189 * already done in strtoul(). 190 */ 191 break; 192 case 1: /* a.b -- 8.24 bits */ 193 if (val > 0xffffff || parts[0] > 0xff) 194 return (0); 195 val |= parts[0] << 24; 196 break; 197 198 case 2: /* a.b.c -- 8.8.16 bits */ 199 if (val > 0xffff || parts[0] > 0xff || parts[1] > 0xff) 200 return (0); 201 val |= (parts[0] << 24) | (parts[1] << 16); 202 break; 203 204 case 3: /* a.b.c.d -- 8.8.8.8 bits */ 205 if (val > 0xff || parts[0] > 0xff || parts[1] > 0xff || 206 parts[2] > 0xff) 207 return (0); 208 val |= (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8); 209 break; 210 } 211 212 if (addr != NULL) 213 addr->s_addr = htonl(val); 214 return (1); 215 } 216 217 218 static void krping_wait(struct krping_cb *cb, int state) 219 { 220 int rc; 221 mtx_lock(&cb->lock); 222 while (cb->state < state) { 223 rc = msleep(cb, &cb->lock, 0, "krping", 0); 224 if (rc && rc != ERESTART) { 225 cb->state = ERROR; 226 break; 227 } 228 } 229 mtx_unlock(&cb->lock); 230 } 231 232 static int krping_cma_event_handler(struct rdma_cm_id *cma_id, 233 struct rdma_cm_event *event) 234 { 235 int ret; 236 struct krping_cb *cb = cma_id->context; 237 238 DEBUG_LOG(PFX "cma_event type %d cma_id %p (%s)\n", event->event, cma_id, 239 (cma_id == cb->cm_id) ? "parent" : "child"); 240 241 mtx_lock(&cb->lock); 242 switch (event->event) { 243 case RDMA_CM_EVENT_ADDR_RESOLVED: 244 cb->state = ADDR_RESOLVED; 245 ret = rdma_resolve_route(cma_id, 2000); 246 if (ret) { 247 log(LOG_ERR, "rdma_resolve_route error %d\n", 248 ret); 249 wakeup(cb); 250 } 251 break; 252 253 case RDMA_CM_EVENT_ROUTE_RESOLVED: 254 cb->state = ROUTE_RESOLVED; 255 wakeup(cb); 256 break; 257 258 case RDMA_CM_EVENT_CONNECT_REQUEST: 259 cb->state = CONNECT_REQUEST; 260 cb->child_cm_id = cma_id; 261 DEBUG_LOG(PFX "child cma %p\n", cb->child_cm_id); 262 wakeup(cb); 263 break; 264 265 case RDMA_CM_EVENT_ESTABLISHED: 266 DEBUG_LOG(PFX "ESTABLISHED\n"); 267 if (!cb->server) { 268 cb->state = CONNECTED; 269 wakeup(cb); 270 } 271 break; 272 273 case RDMA_CM_EVENT_ADDR_ERROR: 274 case RDMA_CM_EVENT_ROUTE_ERROR: 275 case RDMA_CM_EVENT_CONNECT_ERROR: 276 case RDMA_CM_EVENT_UNREACHABLE: 277 case RDMA_CM_EVENT_REJECTED: 278 log(LOG_ERR, "cma event %d, error %d\n", event->event, 279 event->status); 280 cb->state = ERROR; 281 wakeup(cb); 282 break; 283 284 case RDMA_CM_EVENT_DISCONNECTED: 285 DEBUG_LOG(PFX "DISCONNECT EVENT...\n"); 286 cb->state = ERROR; 287 wakeup(cb); 288 break; 289 290 case RDMA_CM_EVENT_DEVICE_REMOVAL: 291 DEBUG_LOG(PFX "cma detected device removal!!!!\n"); 292 break; 293 294 default: 295 log(LOG_ERR, "oof bad type!\n"); 296 wakeup(cb); 297 break; 298 } 299 mtx_unlock(&cb->lock); 300 return 0; 301 } 302 303 static int server_recv(struct krping_cb *cb, struct ib_wc *wc) 304 { 305 if (wc->byte_len != sizeof(cb->recv_buf)) { 306 log(LOG_ERR, "Received bogus data, size %d\n", 307 wc->byte_len); 308 return -1; 309 } 310 311 cb->remote_rkey = ntohl(cb->recv_buf.rkey); 312 cb->remote_addr = ntohll(cb->recv_buf.buf); 313 cb->remote_len = ntohl(cb->recv_buf.size); 314 DEBUG_LOG(PFX "Received rkey %x addr %llx len %d from peer\n", 315 cb->remote_rkey, (unsigned long long)cb->remote_addr, 316 cb->remote_len); 317 318 if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE) 319 cb->state = RDMA_READ_ADV; 320 else 321 cb->state = RDMA_WRITE_ADV; 322 323 return 0; 324 } 325 326 static int client_recv(struct krping_cb *cb, struct ib_wc *wc) 327 { 328 if (wc->byte_len != sizeof(cb->recv_buf)) { 329 log(LOG_ERR, "Received bogus data, size %d\n", 330 wc->byte_len); 331 return -1; 332 } 333 334 if (cb->state == RDMA_READ_ADV) 335 cb->state = RDMA_WRITE_ADV; 336 else 337 cb->state = RDMA_WRITE_COMPLETE; 338 339 return 0; 340 } 341 342 static void krping_cq_event_handler(struct ib_cq *cq, void *ctx) 343 { 344 struct krping_cb *cb = ctx; 345 struct ib_wc wc; 346 struct ib_recv_wr *bad_wr; 347 int ret; 348 349 mtx_lock(&cb->lock); 350 KASSERT(cb->cq == cq, ("bad condition")); 351 if (cb->state == ERROR) { 352 log(LOG_ERR, "cq completion in ERROR state\n"); 353 mtx_unlock(&cb->lock); 354 return; 355 } 356 if (!cb->wlat && !cb->rlat && !cb->bw) 357 ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); 358 while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) { 359 if (wc.status) { 360 if (wc.status != IB_WC_WR_FLUSH_ERR) 361 log(LOG_ERR, "cq completion failed status %d\n", 362 wc.status); 363 goto error; 364 } 365 366 switch (wc.opcode) { 367 case IB_WC_SEND: 368 DEBUG_LOG(PFX "send completion\n"); 369 cb->stats.send_bytes += cb->send_sgl.length; 370 cb->stats.send_msgs++; 371 break; 372 373 case IB_WC_RDMA_WRITE: 374 DEBUG_LOG(PFX "rdma write completion\n"); 375 cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length; 376 cb->stats.write_msgs++; 377 cb->state = RDMA_WRITE_COMPLETE; 378 wakeup(cb); 379 break; 380 381 case IB_WC_RDMA_READ: 382 DEBUG_LOG(PFX "rdma read completion\n"); 383 cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length; 384 cb->stats.read_msgs++; 385 cb->state = RDMA_READ_COMPLETE; 386 wakeup(cb); 387 break; 388 389 case IB_WC_RECV: 390 DEBUG_LOG(PFX "recv completion\n"); 391 cb->stats.recv_bytes += sizeof(cb->recv_buf); 392 cb->stats.recv_msgs++; 393 if (cb->wlat || cb->rlat || cb->bw) 394 ret = server_recv(cb, &wc); 395 else 396 ret = cb->server ? server_recv(cb, &wc) : 397 client_recv(cb, &wc); 398 if (ret) { 399 log(LOG_ERR, "recv wc error: %d\n", ret); 400 goto error; 401 } 402 403 ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); 404 if (ret) { 405 log(LOG_ERR, "post recv error: %d\n", 406 ret); 407 goto error; 408 } 409 wakeup(cb); 410 break; 411 412 default: 413 log(LOG_ERR, "unknown!!!!! completion\n"); 414 goto error; 415 } 416 } 417 if (ret) { 418 log(LOG_ERR, "poll error %d\n", ret); 419 goto error; 420 } 421 mtx_unlock(&cb->lock); 422 return; 423 error: 424 cb->state = ERROR; 425 wakeup(cb); 426 mtx_unlock(&cb->lock); 427 } 428 429 static int krping_accept(struct krping_cb *cb) 430 { 431 struct rdma_conn_param conn_param; 432 int ret; 433 434 DEBUG_LOG(PFX "accepting client connection request\n"); 435 436 memset(&conn_param, 0, sizeof conn_param); 437 conn_param.responder_resources = 1; 438 conn_param.initiator_depth = 1; 439 440 ret = rdma_accept(cb->child_cm_id, &conn_param); 441 if (ret) { 442 log(LOG_ERR, "rdma_accept error: %d\n", ret); 443 return ret; 444 } 445 446 if (!cb->wlat && !cb->rlat && !cb->bw) { 447 krping_wait(cb, CONNECTED); 448 if (cb->state == ERROR) { 449 log(LOG_ERR, "wait for CONNECTED state %d\n", cb->state); 450 return -1; 451 } 452 } 453 return 0; 454 } 455 456 static void krping_setup_wr(struct krping_cb *cb) 457 { 458 /* XXX X86 only here... not mapping for dma! */ 459 cb->recv_sgl.addr = vtophys(&cb->recv_buf); 460 cb->recv_sgl.length = sizeof cb->recv_buf; 461 if (cb->use_dmamr) 462 cb->recv_sgl.lkey = cb->dma_mr->lkey; 463 else 464 cb->recv_sgl.lkey = cb->recv_mr->lkey; 465 cb->rq_wr.sg_list = &cb->recv_sgl; 466 cb->rq_wr.num_sge = 1; 467 468 cb->send_sgl.addr = vtophys(&cb->send_buf); 469 cb->send_sgl.length = sizeof cb->send_buf; 470 if (cb->use_dmamr) 471 cb->send_sgl.lkey = cb->dma_mr->lkey; 472 else 473 cb->send_sgl.lkey = cb->send_mr->lkey; 474 475 cb->sq_wr.opcode = IB_WR_SEND; 476 cb->sq_wr.send_flags = IB_SEND_SIGNALED; 477 cb->sq_wr.sg_list = &cb->send_sgl; 478 cb->sq_wr.num_sge = 1; 479 480 cb->rdma_addr = vtophys(cb->rdma_buf); 481 cb->rdma_sgl.addr = cb->rdma_addr; 482 if (cb->use_dmamr) 483 cb->rdma_sgl.lkey = cb->dma_mr->lkey; 484 else 485 cb->rdma_sgl.lkey = cb->rdma_mr->lkey; 486 cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED; 487 cb->rdma_sq_wr.sg_list = &cb->rdma_sgl; 488 cb->rdma_sq_wr.num_sge = 1; 489 490 if (!cb->server || cb->wlat || cb->rlat || cb->bw) { 491 cb->start_addr = vtophys(cb->start_buf); 492 } 493 } 494 495 static int krping_setup_buffers(struct krping_cb *cb) 496 { 497 int ret; 498 struct ib_phys_buf buf; 499 u64 iovbase; 500 501 DEBUG_LOG(PFX "krping_setup_buffers called on cb %p\n", cb); 502 503 if (cb->use_dmamr) { 504 cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE| 505 IB_ACCESS_REMOTE_READ| 506 IB_ACCESS_REMOTE_WRITE); 507 if (IS_ERR(cb->dma_mr)) { 508 log(LOG_ERR, "reg_dmamr failed\n"); 509 return PTR_ERR(cb->dma_mr); 510 } 511 } else { 512 513 buf.addr = vtophys(&cb->recv_buf); 514 buf.size = sizeof cb->recv_buf; 515 iovbase = vtophys(&cb->recv_buf); 516 cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 517 IB_ACCESS_LOCAL_WRITE, 518 &iovbase); 519 520 if (IS_ERR(cb->recv_mr)) { 521 log(LOG_ERR, "recv_buf reg_mr failed\n"); 522 return PTR_ERR(cb->recv_mr); 523 } 524 525 buf.addr = vtophys(&cb->send_buf); 526 buf.size = sizeof cb->send_buf; 527 iovbase = vtophys(&cb->send_buf); 528 cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 529 0, &iovbase); 530 531 if (IS_ERR(cb->send_mr)) { 532 log(LOG_ERR, "send_buf reg_mr failed\n"); 533 ib_dereg_mr(cb->recv_mr); 534 return PTR_ERR(cb->send_mr); 535 } 536 } 537 538 cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, -1UL, 539 PAGE_SIZE, 0); 540 541 if (!cb->rdma_buf) { 542 log(LOG_ERR, "rdma_buf malloc failed\n"); 543 ret = ENOMEM; 544 goto err1; 545 } 546 if (!cb->use_dmamr) { 547 548 buf.addr = vtophys(cb->rdma_buf); 549 buf.size = cb->size; 550 iovbase = vtophys(cb->rdma_buf); 551 cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 552 IB_ACCESS_REMOTE_READ| 553 IB_ACCESS_REMOTE_WRITE, 554 &iovbase); 555 556 if (IS_ERR(cb->rdma_mr)) { 557 log(LOG_ERR, "rdma_buf reg_mr failed\n"); 558 ret = PTR_ERR(cb->rdma_mr); 559 goto err2; 560 } 561 } 562 563 if (!cb->server || cb->wlat || cb->rlat || cb->bw) { 564 cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 565 0, -1UL, PAGE_SIZE, 0); 566 if (!cb->start_buf) { 567 log(LOG_ERR, "start_buf malloc failed\n"); 568 ret = ENOMEM; 569 goto err2; 570 } 571 if (!cb->use_dmamr) { 572 unsigned flags = IB_ACCESS_REMOTE_READ; 573 574 if (cb->wlat || cb->rlat || cb->bw) 575 flags |= IB_ACCESS_REMOTE_WRITE; 576 buf.addr = vtophys(cb->start_buf); 577 buf.size = cb->size; 578 iovbase = vtophys(cb->start_buf); 579 cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 580 flags, 581 &iovbase); 582 583 if (IS_ERR(cb->start_mr)) { 584 log(LOG_ERR, "start_buf reg_mr failed\n"); 585 ret = PTR_ERR(cb->start_mr); 586 goto err3; 587 } 588 } 589 } 590 591 krping_setup_wr(cb); 592 DEBUG_LOG(PFX "allocated & registered buffers...\n"); 593 return 0; 594 err3: 595 contigfree(cb->start_buf, cb->size, M_DEVBUF); 596 597 if (!cb->use_dmamr) 598 ib_dereg_mr(cb->rdma_mr); 599 err2: 600 contigfree(cb->rdma_buf, cb->size, M_DEVBUF); 601 err1: 602 if (cb->use_dmamr) 603 ib_dereg_mr(cb->dma_mr); 604 else { 605 ib_dereg_mr(cb->recv_mr); 606 ib_dereg_mr(cb->send_mr); 607 } 608 return ret; 609 } 610 611 static void krping_free_buffers(struct krping_cb *cb) 612 { 613 DEBUG_LOG(PFX "krping_free_buffers called on cb %p\n", cb); 614 615 #if 0 616 dma_unmap_single(cb->pd->device->dma_device, 617 pci_unmap_addr(cb, recv_mapping), 618 sizeof(cb->recv_buf), DMA_BIDIRECTIONAL); 619 dma_unmap_single(cb->pd->device->dma_device, 620 pci_unmap_addr(cb, send_mapping), 621 sizeof(cb->send_buf), DMA_BIDIRECTIONAL); 622 dma_unmap_single(cb->pd->device->dma_device, 623 pci_unmap_addr(cb, rdma_mapping), 624 cb->size, DMA_BIDIRECTIONAL); 625 #endif 626 contigfree(cb->rdma_buf, cb->size, M_DEVBUF); 627 if (!cb->server || cb->wlat || cb->rlat || cb->bw) { 628 #if 0 629 dma_unmap_single(cb->pd->device->dma_device, 630 pci_unmap_addr(cb, start_mapping), 631 cb->size, DMA_BIDIRECTIONAL); 632 #endif 633 contigfree(cb->start_buf, cb->size, M_DEVBUF); 634 } 635 if (cb->use_dmamr) 636 ib_dereg_mr(cb->dma_mr); 637 else { 638 ib_dereg_mr(cb->send_mr); 639 ib_dereg_mr(cb->recv_mr); 640 ib_dereg_mr(cb->rdma_mr); 641 if (!cb->server) 642 ib_dereg_mr(cb->start_mr); 643 } 644 } 645 646 static int krping_create_qp(struct krping_cb *cb) 647 { 648 struct ib_qp_init_attr init_attr; 649 int ret; 650 651 memset(&init_attr, 0, sizeof(init_attr)); 652 init_attr.cap.max_send_wr = cb->txdepth; 653 init_attr.cap.max_recv_wr = 2; 654 init_attr.cap.max_recv_sge = 1; 655 init_attr.cap.max_send_sge = 1; 656 init_attr.qp_type = IB_QPT_RC; 657 init_attr.send_cq = cb->cq; 658 init_attr.recv_cq = cb->cq; 659 660 if (cb->server) { 661 ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr); 662 if (!ret) 663 cb->qp = cb->child_cm_id->qp; 664 } else { 665 ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr); 666 if (!ret) 667 cb->qp = cb->cm_id->qp; 668 } 669 670 return ret; 671 } 672 673 static void krping_free_qp(struct krping_cb *cb) 674 { 675 ib_destroy_qp(cb->qp); 676 ib_destroy_cq(cb->cq); 677 ib_dealloc_pd(cb->pd); 678 } 679 680 static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id) 681 { 682 int ret; 683 cb->pd = ib_alloc_pd(cm_id->device); 684 if (IS_ERR(cb->pd)) { 685 log(LOG_ERR, "ib_alloc_pd failed\n"); 686 return PTR_ERR(cb->pd); 687 } 688 DEBUG_LOG(PFX "created pd %p\n", cb->pd); 689 690 cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL, 691 cb, cb->txdepth * 2, 0); 692 if (IS_ERR(cb->cq)) { 693 log(LOG_ERR, "ib_create_cq failed\n"); 694 ret = PTR_ERR(cb->cq); 695 goto err1; 696 } 697 DEBUG_LOG(PFX "created cq %p\n", cb->cq); 698 699 if (!cb->wlat && !cb->rlat && !cb->bw) { 700 ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); 701 if (ret) { 702 log(LOG_ERR, "ib_create_cq failed\n"); 703 goto err2; 704 } 705 } 706 707 ret = krping_create_qp(cb); 708 if (ret) { 709 log(LOG_ERR, "krping_create_qp failed: %d\n", ret); 710 goto err2; 711 } 712 DEBUG_LOG(PFX "created qp %p\n", cb->qp); 713 return 0; 714 err2: 715 ib_destroy_cq(cb->cq); 716 err1: 717 ib_dealloc_pd(cb->pd); 718 return ret; 719 } 720 721 static void krping_format_send(struct krping_cb *cb, u64 buf, 722 struct ib_mr *mr) 723 { 724 struct krping_rdma_info *info = &cb->send_buf; 725 726 info->buf = htonll(buf); 727 info->rkey = htonl(mr->rkey); 728 info->size = htonl(cb->size); 729 730 DEBUG_LOG(PFX "RDMA addr %llx rkey %x len %d\n", 731 (unsigned long long)buf, mr->rkey, cb->size); 732 } 733 734 static void krping_test_server(struct krping_cb *cb) 735 { 736 struct ib_send_wr *bad_wr; 737 int ret; 738 739 while (1) { 740 /* Wait for client's Start STAG/TO/Len */ 741 krping_wait(cb, RDMA_READ_ADV); 742 if (cb->state != RDMA_READ_ADV) { 743 DEBUG_LOG(PFX "wait for RDMA_READ_ADV state %d\n", 744 cb->state); 745 break; 746 } 747 748 DEBUG_LOG(PFX "server received sink adv\n"); 749 750 /* Issue RDMA Read. */ 751 cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ; 752 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; 753 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; 754 cb->rdma_sq_wr.sg_list->length = cb->remote_len; 755 756 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); 757 if (ret) { 758 log(LOG_ERR, "post send error %d\n", ret); 759 break; 760 } 761 DEBUG_LOG(PFX "server posted rdma read req \n"); 762 763 /* Wait for read completion */ 764 krping_wait(cb, RDMA_READ_COMPLETE); 765 if (cb->state != RDMA_READ_COMPLETE) { 766 log(LOG_ERR, 767 "wait for RDMA_READ_COMPLETE state %d\n", 768 cb->state); 769 break; 770 } 771 DEBUG_LOG(PFX "server received read complete\n"); 772 773 /* Display data in recv buf */ 774 if (cb->verbose) 775 DEBUG_LOG("server ping data: %s\n", cb->rdma_buf); 776 777 /* Tell client to continue */ 778 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 779 if (ret) { 780 log(LOG_ERR, "post send error %d\n", ret); 781 break; 782 } 783 DEBUG_LOG(PFX "server posted go ahead\n"); 784 785 /* Wait for client's RDMA STAG/TO/Len */ 786 krping_wait(cb, RDMA_WRITE_ADV); 787 if (cb->state != RDMA_WRITE_ADV) { 788 log(LOG_ERR, 789 "wait for RDMA_WRITE_ADV state %d\n", 790 cb->state); 791 break; 792 } 793 DEBUG_LOG(PFX "server received sink adv\n"); 794 795 /* RDMA Write echo data */ 796 cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; 797 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; 798 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; 799 cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1; 800 DEBUG_LOG(PFX "rdma write from lkey %x laddr %llx len %d\n", 801 cb->rdma_sq_wr.sg_list->lkey, 802 (unsigned long long)cb->rdma_sq_wr.sg_list->addr, 803 cb->rdma_sq_wr.sg_list->length); 804 805 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); 806 if (ret) { 807 log(LOG_ERR, "post send error %d\n", ret); 808 break; 809 } 810 811 /* Wait for completion */ 812 krping_wait(cb, RDMA_WRITE_COMPLETE); 813 if (cb->state != RDMA_WRITE_COMPLETE) { 814 log(LOG_ERR, 815 "wait for RDMA_WRITE_COMPLETE state %d\n", 816 cb->state); 817 break; 818 } 819 DEBUG_LOG(PFX "server rdma write complete \n"); 820 821 cb->state = CONNECTED; 822 823 /* Tell client to begin again */ 824 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 825 if (ret) { 826 log(LOG_ERR, "post send error %d\n", ret); 827 break; 828 } 829 DEBUG_LOG(PFX "server posted go ahead\n"); 830 } 831 } 832 833 static void rlat_test(struct krping_cb *cb) 834 { 835 int scnt; 836 int iters = cb->count; 837 struct timeval start_tv, stop_tv; 838 int ret; 839 struct ib_wc wc; 840 struct ib_send_wr *bad_wr; 841 int ne; 842 843 scnt = 0; 844 cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ; 845 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; 846 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; 847 cb->rdma_sq_wr.sg_list->length = cb->size; 848 849 microtime(&start_tv); 850 if (!cb->poll) { 851 cb->state = RDMA_READ_ADV; 852 ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); 853 } 854 while (scnt < iters) { 855 856 cb->state = RDMA_READ_ADV; 857 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); 858 if (ret) { 859 log(LOG_ERR, 860 "Couldn't post send: ret=%d scnt %d\n", 861 ret, scnt); 862 return; 863 } 864 865 do { 866 if (!cb->poll) { 867 krping_wait(cb, RDMA_READ_COMPLETE); 868 if (cb->state == RDMA_READ_COMPLETE) { 869 ne = 1; 870 ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); 871 } else { 872 ne = -1; 873 } 874 } else 875 ne = ib_poll_cq(cb->cq, 1, &wc); 876 if (cb->state == ERROR) { 877 log(LOG_ERR, 878 "state == ERROR...bailing scnt %d\n", scnt); 879 return; 880 } 881 } while (ne == 0); 882 883 if (ne < 0) { 884 log(LOG_ERR, "poll CQ failed %d\n", ne); 885 return; 886 } 887 if (cb->poll && wc.status != IB_WC_SUCCESS) { 888 log(LOG_ERR, "Completion wth error at %s:\n", 889 cb->server ? "server" : "client"); 890 log(LOG_ERR, "Failed status %d: wr_id %d\n", 891 wc.status, (int) wc.wr_id); 892 return; 893 } 894 ++scnt; 895 } 896 microtime(&stop_tv); 897 898 if (stop_tv.tv_usec < start_tv.tv_usec) { 899 stop_tv.tv_usec += 1000000; 900 stop_tv.tv_sec -= 1; 901 } 902 903 log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d\n", 904 stop_tv.tv_sec - start_tv.tv_sec, 905 stop_tv.tv_usec - start_tv.tv_usec, 906 scnt, cb->size); 907 } 908 909 static int alloc_cycle_mem(int cycle_iters, 910 cycles_t **post_cycles_start, 911 cycles_t **post_cycles_stop, 912 cycles_t **poll_cycles_start, 913 cycles_t **poll_cycles_stop, 914 cycles_t **last_poll_cycles_start) 915 { 916 *post_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK); 917 if (!*post_cycles_start) { 918 goto fail1; 919 } 920 *post_cycles_stop = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK); 921 if (!*post_cycles_stop) { 922 goto fail2; 923 } 924 *poll_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK); 925 if (!*poll_cycles_start) { 926 goto fail3; 927 } 928 *poll_cycles_stop = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK); 929 if (!*poll_cycles_stop) { 930 goto fail4; 931 } 932 *last_poll_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK); 933 if (!*last_poll_cycles_start) { 934 goto fail5; 935 } 936 return 0; 937 fail5: 938 free(*poll_cycles_stop, M_DEVBUF); 939 fail4: 940 free(*poll_cycles_start, M_DEVBUF); 941 fail3: 942 free(*post_cycles_stop, M_DEVBUF); 943 fail2: 944 free(*post_cycles_start, M_DEVBUF); 945 fail1: 946 log(LOG_ERR, "%s malloc failed\n", __FUNCTION__); 947 return ENOMEM; 948 } 949 950 static void free_cycle_mem(cycles_t *post_cycles_start, 951 cycles_t *post_cycles_stop, 952 cycles_t *poll_cycles_start, 953 cycles_t *poll_cycles_stop, 954 cycles_t *last_poll_cycles_start) 955 { 956 free(last_poll_cycles_start, M_DEVBUF); 957 free(poll_cycles_stop, M_DEVBUF); 958 free(poll_cycles_start, M_DEVBUF); 959 free(post_cycles_stop, M_DEVBUF); 960 free(post_cycles_start, M_DEVBUF); 961 } 962 963 static void wlat_test(struct krping_cb *cb) 964 { 965 int ccnt, scnt, rcnt; 966 int iters=cb->count; 967 volatile char *poll_buf = (char *) cb->start_buf; 968 char *buf = (char *)cb->rdma_buf; 969 ccnt = 0; 970 scnt = 0; 971 rcnt = 0; 972 struct timeval start_tv, stop_tv; 973 cycles_t *post_cycles_start, *post_cycles_stop; 974 cycles_t *poll_cycles_start, *poll_cycles_stop; 975 cycles_t *last_poll_cycles_start; 976 cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0; 977 int i; 978 int cycle_iters = 1000; 979 int err; 980 981 err = alloc_cycle_mem(cycle_iters, &post_cycles_start, &post_cycles_stop, 982 &poll_cycles_start, &poll_cycles_stop, &last_poll_cycles_start); 983 984 if (err) { 985 log(LOG_ERR, "%s malloc failed\n", __FUNCTION__); 986 return; 987 } 988 989 cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; 990 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; 991 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; 992 cb->rdma_sq_wr.sg_list->length = cb->size; 993 994 if (cycle_iters > iters) 995 cycle_iters = iters; 996 microtime(&start_tv); 997 while (scnt < iters || ccnt < iters || rcnt < iters) { 998 999 /* Wait till buffer changes. */ 1000 if (rcnt < iters && !(scnt < 1 && !cb->server)) { 1001 ++rcnt; 1002 while (*poll_buf != (char)rcnt) { 1003 if (cb->state == ERROR) { 1004 log(LOG_ERR, "state = ERROR, bailing\n"); 1005 return; 1006 } 1007 } 1008 } 1009 1010 if (scnt < iters) { 1011 struct ib_send_wr *bad_wr; 1012 1013 *buf = (char)scnt+1; 1014 if (scnt < cycle_iters) 1015 post_cycles_start[scnt] = get_cycles(); 1016 if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { 1017 log(LOG_ERR, "Couldn't post send: scnt=%d\n", 1018 scnt); 1019 return; 1020 } 1021 if (scnt < cycle_iters) 1022 post_cycles_stop[scnt] = get_cycles(); 1023 scnt++; 1024 } 1025 1026 if (ccnt < iters) { 1027 struct ib_wc wc; 1028 int ne; 1029 1030 if (ccnt < cycle_iters) 1031 poll_cycles_start[ccnt] = get_cycles(); 1032 do { 1033 if (ccnt < cycle_iters) 1034 last_poll_cycles_start[ccnt] = get_cycles(); 1035 ne = ib_poll_cq(cb->cq, 1, &wc); 1036 } while (ne == 0); 1037 if (ccnt < cycle_iters) 1038 poll_cycles_stop[ccnt] = get_cycles(); 1039 ++ccnt; 1040 1041 if (ne < 0) { 1042 log(LOG_ERR, "poll CQ failed %d\n", ne); 1043 return; 1044 } 1045 if (wc.status != IB_WC_SUCCESS) { 1046 log(LOG_ERR, "Completion wth error at %s:\n", 1047 cb->server ? "server" : "client"); 1048 log(LOG_ERR, "Failed status %d: wr_id %d\n", 1049 wc.status, (int) wc.wr_id); 1050 log(LOG_ERR, "scnt=%d, rcnt=%d, ccnt=%d\n", 1051 scnt, rcnt, ccnt); 1052 return; 1053 } 1054 } 1055 } 1056 microtime(&stop_tv); 1057 1058 if (stop_tv.tv_usec < start_tv.tv_usec) { 1059 stop_tv.tv_usec += 1000000; 1060 stop_tv.tv_sec -= 1; 1061 } 1062 1063 for (i=0; i < cycle_iters; i++) { 1064 sum_post += post_cycles_stop[i] - post_cycles_start[i]; 1065 sum_poll += poll_cycles_stop[i] - poll_cycles_start[i]; 1066 sum_last_poll += poll_cycles_stop[i] - last_poll_cycles_start[i]; 1067 } 1068 1069 log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d cycle_iters %d sum_post %llu sum_poll %llu sum_last_poll %llu\n", 1070 stop_tv.tv_sec - start_tv.tv_sec, 1071 stop_tv.tv_usec - start_tv.tv_usec, 1072 scnt, cb->size, cycle_iters, 1073 (unsigned long long)sum_post, (unsigned long long)sum_poll, 1074 (unsigned long long)sum_last_poll); 1075 1076 free_cycle_mem(post_cycles_start, post_cycles_stop, poll_cycles_start, 1077 poll_cycles_stop, last_poll_cycles_start); 1078 } 1079 1080 static void bw_test(struct krping_cb *cb) 1081 { 1082 int ccnt, scnt, rcnt; 1083 int iters=cb->count; 1084 ccnt = 0; 1085 scnt = 0; 1086 rcnt = 0; 1087 struct timeval start_tv, stop_tv; 1088 cycles_t *post_cycles_start, *post_cycles_stop; 1089 cycles_t *poll_cycles_start, *poll_cycles_stop; 1090 cycles_t *last_poll_cycles_start; 1091 cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0; 1092 int i; 1093 int cycle_iters = 1000; 1094 int err; 1095 1096 err = alloc_cycle_mem(cycle_iters, &post_cycles_start, &post_cycles_stop, 1097 &poll_cycles_start, &poll_cycles_stop, &last_poll_cycles_start); 1098 1099 if (err) { 1100 log(LOG_ERR, "%s kmalloc failed\n", __FUNCTION__); 1101 return; 1102 } 1103 1104 cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; 1105 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; 1106 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; 1107 cb->rdma_sq_wr.sg_list->length = cb->size; 1108 1109 if (cycle_iters > iters) 1110 cycle_iters = iters; 1111 microtime(&start_tv); 1112 while (scnt < iters || ccnt < iters) { 1113 1114 while (scnt < iters && scnt - ccnt < cb->txdepth) { 1115 struct ib_send_wr *bad_wr; 1116 1117 if (scnt < cycle_iters) 1118 post_cycles_start[scnt] = get_cycles(); 1119 if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { 1120 log(LOG_ERR, "Couldn't post send: scnt=%d\n", 1121 scnt); 1122 return; 1123 } 1124 if (scnt < cycle_iters) 1125 post_cycles_stop[scnt] = get_cycles(); 1126 ++scnt; 1127 } 1128 1129 if (ccnt < iters) { 1130 int ne; 1131 struct ib_wc wc; 1132 1133 if (ccnt < cycle_iters) 1134 poll_cycles_start[ccnt] = get_cycles(); 1135 do { 1136 if (ccnt < cycle_iters) 1137 last_poll_cycles_start[ccnt] = get_cycles(); 1138 ne = ib_poll_cq(cb->cq, 1, &wc); 1139 } while (ne == 0); 1140 if (ccnt < cycle_iters) 1141 poll_cycles_stop[ccnt] = get_cycles(); 1142 ccnt += 1; 1143 1144 if (ne < 0) { 1145 log(LOG_ERR, "poll CQ failed %d\n", ne); 1146 return; 1147 } 1148 if (wc.status != IB_WC_SUCCESS) { 1149 log(LOG_ERR, "Completion wth error at %s:\n", 1150 cb->server ? "server" : "client"); 1151 log(LOG_ERR, "Failed status %d: wr_id %d\n", 1152 wc.status, (int) wc.wr_id); 1153 return; 1154 } 1155 } 1156 } 1157 microtime(&stop_tv); 1158 1159 if (stop_tv.tv_usec < start_tv.tv_usec) { 1160 stop_tv.tv_usec += 1000000; 1161 stop_tv.tv_sec -= 1; 1162 } 1163 1164 for (i=0; i < cycle_iters; i++) { 1165 sum_post += post_cycles_stop[i] - post_cycles_start[i]; 1166 sum_poll += poll_cycles_stop[i] - poll_cycles_start[i]; 1167 sum_last_poll += poll_cycles_stop[i] - last_poll_cycles_start[i]; 1168 } 1169 1170 log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d cycle_iters %d sum_post %llu sum_poll %llu sum_last_poll %llu\n", 1171 stop_tv.tv_sec - start_tv.tv_sec, 1172 stop_tv.tv_usec - start_tv.tv_usec, 1173 scnt, cb->size, cycle_iters, 1174 (unsigned long long)sum_post, (unsigned long long)sum_poll, 1175 (unsigned long long)sum_last_poll); 1176 1177 free_cycle_mem(post_cycles_start, post_cycles_stop, poll_cycles_start, 1178 poll_cycles_stop, last_poll_cycles_start); 1179 } 1180 1181 static void krping_rlat_test_server(struct krping_cb *cb) 1182 { 1183 struct ib_send_wr *bad_wr; 1184 struct ib_wc wc; 1185 int ret; 1186 1187 /* Spin waiting for client's Start STAG/TO/Len */ 1188 while (cb->state < RDMA_READ_ADV) { 1189 krping_cq_event_handler(cb->cq, cb); 1190 } 1191 1192 /* Send STAG/TO/Len to client */ 1193 if (cb->dma_mr) 1194 krping_format_send(cb, cb->start_addr, cb->dma_mr); 1195 else 1196 krping_format_send(cb, cb->start_addr, cb->start_mr); 1197 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 1198 if (ret) { 1199 log(LOG_ERR, "post send error %d\n", ret); 1200 return; 1201 } 1202 1203 /* Spin waiting for send completion */ 1204 while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); 1205 if (ret < 0) { 1206 log(LOG_ERR, "poll error %d\n", ret); 1207 return; 1208 } 1209 if (wc.status) { 1210 log(LOG_ERR, "send completiong error %d\n", wc.status); 1211 return; 1212 } 1213 1214 krping_wait(cb, ERROR); 1215 } 1216 1217 static void krping_wlat_test_server(struct krping_cb *cb) 1218 { 1219 struct ib_send_wr *bad_wr; 1220 struct ib_wc wc; 1221 int ret; 1222 1223 /* Spin waiting for client's Start STAG/TO/Len */ 1224 while (cb->state < RDMA_READ_ADV) { 1225 krping_cq_event_handler(cb->cq, cb); 1226 } 1227 1228 /* Send STAG/TO/Len to client */ 1229 if (cb->dma_mr) 1230 krping_format_send(cb, cb->start_addr, cb->dma_mr); 1231 else 1232 krping_format_send(cb, cb->start_addr, cb->start_mr); 1233 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 1234 if (ret) { 1235 log(LOG_ERR, "post send error %d\n", ret); 1236 return; 1237 } 1238 1239 /* Spin waiting for send completion */ 1240 while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); 1241 if (ret < 0) { 1242 log(LOG_ERR, "poll error %d\n", ret); 1243 return; 1244 } 1245 if (wc.status) { 1246 log(LOG_ERR, "send completiong error %d\n", wc.status); 1247 return; 1248 } 1249 1250 wlat_test(cb); 1251 1252 } 1253 1254 static void krping_bw_test_server(struct krping_cb *cb) 1255 { 1256 struct ib_send_wr *bad_wr; 1257 struct ib_wc wc; 1258 int ret; 1259 1260 /* Spin waiting for client's Start STAG/TO/Len */ 1261 while (cb->state < RDMA_READ_ADV) { 1262 krping_cq_event_handler(cb->cq, cb); 1263 } 1264 1265 /* Send STAG/TO/Len to client */ 1266 if (cb->dma_mr) 1267 krping_format_send(cb, cb->start_addr, cb->dma_mr); 1268 else 1269 krping_format_send(cb, cb->start_addr, cb->start_mr); 1270 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 1271 if (ret) { 1272 log(LOG_ERR, "post send error %d\n", ret); 1273 return; 1274 } 1275 1276 /* Spin waiting for send completion */ 1277 while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); 1278 if (ret < 0) { 1279 log(LOG_ERR, "poll error %d\n", ret); 1280 return; 1281 } 1282 if (wc.status) { 1283 log(LOG_ERR, "send completiong error %d\n", wc.status); 1284 return; 1285 } 1286 1287 if (cb->duplex) 1288 bw_test(cb); 1289 krping_wait(cb, ERROR); 1290 } 1291 1292 static int krping_bind_server(struct krping_cb *cb) 1293 { 1294 struct sockaddr_in sin; 1295 int ret; 1296 1297 memset(&sin, 0, sizeof(sin)); 1298 sin.sin_len = sizeof sin; 1299 sin.sin_family = AF_INET; 1300 sin.sin_addr.s_addr = cb->addr.s_addr; 1301 sin.sin_port = cb->port; 1302 1303 ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin); 1304 if (ret) { 1305 log(LOG_ERR, "rdma_bind_addr error %d\n", ret); 1306 return ret; 1307 } 1308 DEBUG_LOG(PFX "rdma_bind_addr successful\n"); 1309 1310 DEBUG_LOG(PFX "rdma_listen\n"); 1311 ret = rdma_listen(cb->cm_id, 3); 1312 if (ret) { 1313 log(LOG_ERR, "rdma_listen failed: %d\n", ret); 1314 return ret; 1315 } 1316 1317 krping_wait(cb, CONNECT_REQUEST); 1318 if (cb->state != CONNECT_REQUEST) { 1319 log(LOG_ERR, "wait for CONNECT_REQUEST state %d\n", 1320 cb->state); 1321 return -1; 1322 } 1323 1324 return 0; 1325 } 1326 1327 static void krping_run_server(struct krping_cb *cb) 1328 { 1329 struct ib_recv_wr *bad_wr; 1330 int ret; 1331 1332 ret = krping_bind_server(cb); 1333 if (ret) 1334 return; 1335 1336 ret = krping_setup_qp(cb, cb->child_cm_id); 1337 if (ret) { 1338 log(LOG_ERR, "setup_qp failed: %d\n", ret); 1339 return; 1340 } 1341 1342 ret = krping_setup_buffers(cb); 1343 if (ret) { 1344 log(LOG_ERR, "krping_setup_buffers failed: %d\n", ret); 1345 goto err1; 1346 } 1347 1348 ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); 1349 if (ret) { 1350 log(LOG_ERR, "ib_post_recv failed: %d\n", ret); 1351 goto err2; 1352 } 1353 1354 ret = krping_accept(cb); 1355 if (ret) { 1356 log(LOG_ERR, "connect error %d\n", ret); 1357 goto err2; 1358 } 1359 1360 if (cb->wlat) 1361 krping_wlat_test_server(cb); 1362 else if (cb->rlat) 1363 krping_rlat_test_server(cb); 1364 else if (cb->bw) 1365 krping_bw_test_server(cb); 1366 else 1367 krping_test_server(cb); 1368 1369 rdma_disconnect(cb->child_cm_id); 1370 rdma_destroy_id(cb->child_cm_id); 1371 err2: 1372 krping_free_buffers(cb); 1373 err1: 1374 krping_free_qp(cb); 1375 } 1376 1377 static void krping_test_client(struct krping_cb *cb) 1378 { 1379 int ping, start, cc, i, ret; 1380 struct ib_send_wr *bad_wr; 1381 unsigned char c; 1382 1383 start = 65; 1384 for (ping = 0; !cb->count || ping < cb->count; ping++) { 1385 cb->state = RDMA_READ_ADV; 1386 1387 /* Put some ascii text in the buffer. */ 1388 cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping); 1389 for (i = cc, c = start; i < cb->size; i++) { 1390 cb->start_buf[i] = c; 1391 c++; 1392 if (c > 122) 1393 c = 65; 1394 } 1395 start++; 1396 if (start > 122) 1397 start = 65; 1398 cb->start_buf[cb->size - 1] = 0; 1399 1400 if (cb->dma_mr) 1401 krping_format_send(cb, cb->start_addr, cb->dma_mr); 1402 else 1403 krping_format_send(cb, cb->start_addr, cb->start_mr); 1404 1405 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 1406 if (ret) { 1407 log(LOG_ERR, "post send error %d\n", ret); 1408 break; 1409 } 1410 1411 /* Wait for server to ACK */ 1412 krping_wait(cb, RDMA_WRITE_ADV); 1413 if (cb->state != RDMA_WRITE_ADV) { 1414 log(LOG_ERR, 1415 "wait for RDMA_WRITE_ADV state %d\n", 1416 cb->state); 1417 break; 1418 } 1419 1420 if (cb->dma_mr) 1421 krping_format_send(cb, cb->rdma_addr, cb->dma_mr); 1422 else 1423 krping_format_send(cb, cb->rdma_addr, cb->rdma_mr); 1424 1425 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 1426 if (ret) { 1427 log(LOG_ERR, "post send error %d\n", ret); 1428 break; 1429 } 1430 1431 /* Wait for the server to say the RDMA Write is complete. */ 1432 krping_wait(cb, RDMA_WRITE_COMPLETE); 1433 if (cb->state != RDMA_WRITE_COMPLETE) { 1434 log(LOG_ERR, 1435 "wait for RDMA_WRITE_COMPLETE state %d\n", 1436 cb->state); 1437 break; 1438 } 1439 1440 if (cb->validate) 1441 if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) { 1442 log(LOG_ERR, "data mismatch!\n"); 1443 break; 1444 } 1445 1446 if (cb->verbose) 1447 DEBUG_LOG("ping data: %s\n", cb->rdma_buf); 1448 } 1449 } 1450 1451 static void krping_rlat_test_client(struct krping_cb *cb) 1452 { 1453 struct ib_send_wr *bad_wr; 1454 struct ib_wc wc; 1455 int ret; 1456 1457 cb->state = RDMA_READ_ADV; 1458 1459 /* Send STAG/TO/Len to client */ 1460 if (cb->dma_mr) 1461 krping_format_send(cb, cb->start_addr, cb->dma_mr); 1462 else 1463 krping_format_send(cb, cb->start_addr, cb->rdma_mr); 1464 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 1465 if (ret) { 1466 log(LOG_ERR, "post send error %d\n", ret); 1467 return; 1468 } 1469 1470 /* Spin waiting for send completion */ 1471 while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); 1472 if (ret < 0) { 1473 log(LOG_ERR, "poll error %d\n", ret); 1474 return; 1475 } 1476 if (wc.status) { 1477 log(LOG_ERR, "send completion error %d\n", wc.status); 1478 return; 1479 } 1480 1481 /* Spin waiting for server's Start STAG/TO/Len */ 1482 while (cb->state < RDMA_WRITE_ADV) { 1483 krping_cq_event_handler(cb->cq, cb); 1484 } 1485 1486 #if 0 1487 { 1488 int i; 1489 struct timeval start, stop; 1490 time_t sec; 1491 suseconds_t usec; 1492 unsigned long long elapsed; 1493 struct ib_wc wc; 1494 struct ib_send_wr *bad_wr; 1495 int ne; 1496 1497 cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; 1498 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; 1499 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; 1500 cb->rdma_sq_wr.sg_list->length = 0; 1501 cb->rdma_sq_wr.num_sge = 0; 1502 1503 microtime(&start); 1504 for (i=0; i < 100000; i++) { 1505 if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { 1506 log(LOG_ERR, "Couldn't post send\n"); 1507 return; 1508 } 1509 do { 1510 ne = ib_poll_cq(cb->cq, 1, &wc); 1511 } while (ne == 0); 1512 if (ne < 0) { 1513 log(LOG_ERR, "poll CQ failed %d\n", ne); 1514 return; 1515 } 1516 if (wc.status != IB_WC_SUCCESS) { 1517 log(LOG_ERR, "Completion wth error at %s:\n", 1518 cb->server ? "server" : "client"); 1519 log(LOG_ERR, "Failed status %d: wr_id %d\n", 1520 wc.status, (int) wc.wr_id); 1521 return; 1522 } 1523 } 1524 microtime(&stop); 1525 1526 if (stop.tv_usec < start.tv_usec) { 1527 stop.tv_usec += 1000000; 1528 stop.tv_sec -= 1; 1529 } 1530 sec = stop.tv_sec - start.tv_sec; 1531 usec = stop.tv_usec - start.tv_usec; 1532 elapsed = sec * 1000000 + usec; 1533 log(LOG_ERR, "0B-write-lat iters 100000 usec %llu\n", elapsed); 1534 } 1535 #endif 1536 1537 rlat_test(cb); 1538 } 1539 1540 static void krping_wlat_test_client(struct krping_cb *cb) 1541 { 1542 struct ib_send_wr *bad_wr; 1543 struct ib_wc wc; 1544 int ret; 1545 1546 cb->state = RDMA_READ_ADV; 1547 1548 /* Send STAG/TO/Len to client */ 1549 if (cb->dma_mr) 1550 krping_format_send(cb, cb->start_addr, cb->dma_mr); 1551 else 1552 krping_format_send(cb, cb->start_addr, cb->start_mr); 1553 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 1554 if (ret) { 1555 log(LOG_ERR, "post send error %d\n", ret); 1556 return; 1557 } 1558 1559 /* Spin waiting for send completion */ 1560 while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); 1561 if (ret < 0) { 1562 log(LOG_ERR, "poll error %d\n", ret); 1563 return; 1564 } 1565 if (wc.status) { 1566 log(LOG_ERR, "send completion error %d\n", wc.status); 1567 return; 1568 } 1569 1570 /* Spin waiting for server's Start STAG/TO/Len */ 1571 while (cb->state < RDMA_WRITE_ADV) { 1572 krping_cq_event_handler(cb->cq, cb); 1573 } 1574 1575 wlat_test(cb); 1576 } 1577 1578 static void krping_bw_test_client(struct krping_cb *cb) 1579 { 1580 struct ib_send_wr *bad_wr; 1581 struct ib_wc wc; 1582 int ret; 1583 1584 cb->state = RDMA_READ_ADV; 1585 1586 /* Send STAG/TO/Len to client */ 1587 if (cb->dma_mr) 1588 krping_format_send(cb, cb->start_addr, cb->dma_mr); 1589 else 1590 krping_format_send(cb, cb->start_addr, cb->start_mr); 1591 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); 1592 if (ret) { 1593 log(LOG_ERR, "post send error %d\n", ret); 1594 return; 1595 } 1596 1597 /* Spin waiting for send completion */ 1598 while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); 1599 if (ret < 0) { 1600 log(LOG_ERR, "poll error %d\n", ret); 1601 return; 1602 } 1603 if (wc.status) { 1604 log(LOG_ERR, "send completion error %d\n", wc.status); 1605 return; 1606 } 1607 1608 /* Spin waiting for server's Start STAG/TO/Len */ 1609 while (cb->state < RDMA_WRITE_ADV) { 1610 krping_cq_event_handler(cb->cq, cb); 1611 } 1612 1613 bw_test(cb); 1614 } 1615 1616 static int krping_connect_client(struct krping_cb *cb) 1617 { 1618 struct rdma_conn_param conn_param; 1619 int ret; 1620 1621 memset(&conn_param, 0, sizeof conn_param); 1622 conn_param.responder_resources = 1; 1623 conn_param.initiator_depth = 1; 1624 conn_param.retry_count = 10; 1625 1626 ret = rdma_connect(cb->cm_id, &conn_param); 1627 if (ret) { 1628 log(LOG_ERR, "rdma_connect error %d\n", ret); 1629 return ret; 1630 } 1631 1632 krping_wait(cb, CONNECTED); 1633 if (cb->state == ERROR) { 1634 log(LOG_ERR, "wait for CONNECTED state %d\n", cb->state); 1635 return -1; 1636 } 1637 1638 DEBUG_LOG(PFX "rdma_connect successful\n"); 1639 return 0; 1640 } 1641 1642 static int krping_bind_client(struct krping_cb *cb) 1643 { 1644 struct sockaddr_in sin; 1645 int ret; 1646 1647 memset(&sin, 0, sizeof(sin)); 1648 sin.sin_len = sizeof sin; 1649 sin.sin_family = AF_INET; 1650 sin.sin_addr.s_addr = cb->addr.s_addr; 1651 sin.sin_port = cb->port; 1652 1653 ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin, 1654 2000); 1655 if (ret) { 1656 log(LOG_ERR, "rdma_resolve_addr error %d\n", ret); 1657 return ret; 1658 } 1659 1660 krping_wait(cb, ROUTE_RESOLVED); 1661 if (cb->state != ROUTE_RESOLVED) { 1662 log(LOG_ERR, 1663 "addr/route resolution did not resolve: state %d\n", 1664 cb->state); 1665 return EINTR; 1666 } 1667 1668 DEBUG_LOG(PFX "rdma_resolve_addr - rdma_resolve_route successful\n"); 1669 return 0; 1670 } 1671 1672 static void krping_run_client(struct krping_cb *cb) 1673 { 1674 struct ib_recv_wr *bad_wr; 1675 int ret; 1676 1677 ret = krping_bind_client(cb); 1678 if (ret) 1679 return; 1680 1681 ret = krping_setup_qp(cb, cb->cm_id); 1682 if (ret) { 1683 log(LOG_ERR, "setup_qp failed: %d\n", ret); 1684 return; 1685 } 1686 1687 ret = krping_setup_buffers(cb); 1688 if (ret) { 1689 log(LOG_ERR, "krping_setup_buffers failed: %d\n", ret); 1690 goto err1; 1691 } 1692 1693 ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); 1694 if (ret) { 1695 log(LOG_ERR, "ib_post_recv failed: %d\n", ret); 1696 goto err2; 1697 } 1698 1699 ret = krping_connect_client(cb); 1700 if (ret) { 1701 log(LOG_ERR, "connect error %d\n", ret); 1702 goto err2; 1703 } 1704 1705 if (cb->wlat) 1706 krping_wlat_test_client(cb); 1707 else if (cb->rlat) 1708 krping_rlat_test_client(cb); 1709 else if (cb->bw) 1710 krping_bw_test_client(cb); 1711 else 1712 krping_test_client(cb); 1713 rdma_disconnect(cb->cm_id); 1714 err2: 1715 krping_free_buffers(cb); 1716 err1: 1717 krping_free_qp(cb); 1718 } 1719 1720 int krping_doit(char *cmd) 1721 { 1722 struct krping_cb *cb; 1723 int op; 1724 int ret = 0; 1725 char *optarg; 1726 unsigned long optint; 1727 debug = 0; 1728 1729 cb = malloc(sizeof(*cb), M_DEVBUF, M_WAITOK); 1730 if (!cb) 1731 return ENOMEM; 1732 bzero(cb, sizeof *cb); 1733 1734 mtx_lock(&krping_mutex); 1735 TAILQ_INSERT_TAIL(&krping_cbs, cb, list); 1736 mtx_unlock(&krping_mutex); 1737 1738 cb->server = -1; 1739 cb->state = IDLE; 1740 cb->size = 64; 1741 cb->txdepth = RPING_SQ_DEPTH; 1742 mtx_init(&cb->lock, "krping mtx", NULL, MTX_DUPOK|MTX_DEF); 1743 1744 while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg, 1745 &optint)) != 0) { 1746 switch (op) { 1747 case 'a': 1748 cb->addr_str = optarg; 1749 DEBUG_LOG(PFX "ipaddr (%s)\n", optarg); 1750 if (!inet_aton(optarg, &cb->addr)) { 1751 log(LOG_ERR, "bad addr string %s\n", optarg); 1752 ret = EINVAL; 1753 } 1754 break; 1755 case 'D': 1756 cb->use_dmamr = 1; 1757 DEBUG_LOG(PFX "using dma mr\n"); 1758 break; 1759 case 'p': 1760 cb->port = htons(optint); 1761 DEBUG_LOG(PFX "port %d\n", (int)optint); 1762 break; 1763 case 'P': 1764 cb->poll = 1; 1765 DEBUG_LOG("server\n"); 1766 break; 1767 case 's': 1768 cb->server = 1; 1769 DEBUG_LOG(PFX "server\n"); 1770 break; 1771 case 'c': 1772 cb->server = 0; 1773 DEBUG_LOG(PFX "client\n"); 1774 break; 1775 case 'S': 1776 cb->size = optint; 1777 if ((cb->size < 1) || 1778 (cb->size > RPING_BUFSIZE)) { 1779 log(LOG_ERR, "Invalid size %d " 1780 "(valid range is 1 to %d)\n", 1781 cb->size, RPING_BUFSIZE); 1782 ret = EINVAL; 1783 } else 1784 DEBUG_LOG(PFX "size %d\n", (int)optint); 1785 break; 1786 case 'C': 1787 cb->count = optint; 1788 if (cb->count < 0) { 1789 log(LOG_ERR, "Invalid count %d\n", 1790 cb->count); 1791 ret = EINVAL; 1792 } else 1793 DEBUG_LOG(PFX "count %d\n", (int) cb->count); 1794 break; 1795 case 'v': 1796 cb->verbose++; 1797 DEBUG_LOG(PFX "verbose\n"); 1798 break; 1799 case 'V': 1800 cb->validate++; 1801 DEBUG_LOG(PFX "validate data\n"); 1802 break; 1803 case 'L': 1804 cb->rlat++; 1805 break; 1806 case 'l': 1807 cb->wlat++; 1808 break; 1809 case 'B': 1810 cb->bw++; 1811 break; 1812 case 't': 1813 cb->txdepth = optint; 1814 DEBUG_LOG(PFX "txdepth %d\n", cb->txdepth); 1815 break; 1816 case 'd': 1817 debug++; 1818 break; 1819 default: 1820 log(LOG_ERR, "unknown opt %s\n", optarg); 1821 ret = EINVAL; 1822 break; 1823 } 1824 } 1825 if (ret) 1826 goto out; 1827 1828 if (cb->server == -1) { 1829 log(LOG_ERR, "must be either client or server\n"); 1830 ret = EINVAL; 1831 goto out; 1832 } 1833 if ((cb->bw + cb->rlat + cb->wlat) > 1) { 1834 log(LOG_ERR, "Pick only one test: bw, rlat, wlat\n"); 1835 ret = EINVAL; 1836 goto out; 1837 } 1838 1839 1840 cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP); 1841 if (IS_ERR(cb->cm_id)) { 1842 ret = PTR_ERR(cb->cm_id); 1843 log(LOG_ERR, "rdma_create_id error %d\n", ret); 1844 goto out; 1845 } 1846 DEBUG_LOG(PFX "created cm_id %p\n", cb->cm_id); 1847 if (cb->server) 1848 krping_run_server(cb); 1849 else 1850 krping_run_client(cb); 1851 DEBUG_LOG(PFX "destroy cm_id %p\n", cb->cm_id); 1852 rdma_destroy_id(cb->cm_id); 1853 out: 1854 mtx_lock(&krping_mutex); 1855 TAILQ_REMOVE(&krping_cbs, cb, list); 1856 mtx_unlock(&krping_mutex); 1857 free(cb, M_DEVBUF); 1858 return ret; 1859 } 1860 1861 void krping_init(void) 1862 { 1863 mtx_init(&krping_mutex, "krping lock", NULL, MTX_DEF); 1864 TAILQ_INIT(&krping_cbs); 1865 } 1866