xref: /freebsd/sys/contrib/rdma/krping/krping.c (revision 884a2a699669ec61e2366e3e358342dbc94be24a)
1 /*
2  * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
3  * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <sys/ctype.h>
38 
39 #include <sys/param.h>
40 #include <sys/condvar.h>
41 #include <sys/systm.h>
42 #include <sys/kernel.h>
43 #include <sys/socket.h>
44 #include <sys/module.h>
45 #include <sys/endian.h>
46 #include <sys/limits.h>
47 #include <sys/proc.h>
48 #include <sys/signalvar.h>
49 
50 #include <sys/lock.h>
51 #include <sys/mutex.h>
52 #include <sys/rwlock.h>
53 #include <sys/queue.h>
54 #include <sys/taskqueue.h>
55 #include <sys/syslog.h>
56 
57 #include <vm/vm.h>
58 #include <vm/pmap.h>
59 
60 #include <contrib/rdma/rdma_cm.h>
61 
62 #include "getopt.h"
63 #include "krping.h"
64 
#define PFX "krping: "

/* Non-zero enables the DEBUG_LOG() trace output. */
static int debug = 0;

/*
 * printf()-style trace that is emitted only while `debug` is non-zero.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement: an `else` written after `if (x) DEBUG_LOG(...);` binds to
 * the caller's `if` rather than to the hidden `if (debug)`.
 */
#define DEBUG_LOG(...) do { if (debug) printf(__VA_ARGS__); } while (0)
69 
70 static const struct krping_option krping_opts[] = {
71 	{"count", OPT_INT, 'C'},
72 	{"size", OPT_INT, 'S'},
73 	{"addr", OPT_STRING, 'a'},
74 	{"port", OPT_INT, 'p'},
75 	{"verbose", OPT_NOPARAM, 'v'},
76 	{"validate", OPT_NOPARAM, 'V'},
77 	{"server", OPT_NOPARAM, 's'},
78 	{"client", OPT_NOPARAM, 'c'},
79 	{"dmamr", OPT_NOPARAM, 'D'},
80 	{"debug", OPT_NOPARAM, 'd'},
81 	{"wlat", OPT_NOPARAM, 'l'},
82 	{"rlat", OPT_NOPARAM, 'L'},
83 	{"bw", OPT_NOPARAM, 'B'},
84 	{"tx-depth", OPT_INT, 't'},
85   	{"poll", OPT_NOPARAM, 'P'},
86 	{NULL, 0, 0}
87 };
88 
89 struct mtx krping_mutex;
90 
91 /*
92  * List of running krping threads.
93  */
94 struct krping_cb_list krping_cbs;
95 
96 /*
97  * krping "ping/pong" loop:
98  * 	client sends source rkey/addr/len
 *	server receives source rkey/addr/len
100  *	server rdma reads "ping" data from source
101  * 	server sends "go ahead" on rdma read completion
102  *	client sends sink rkey/addr/len
103  * 	server receives sink rkey/addr/len
104  * 	server rdma writes "pong" data to sink
105  * 	server sends "go ahead" on rdma write completion
106  * 	<repeat loop>
107  */
108 
/*
 * Default max buffer size for IO, in bytes, and the default send queue
 * depth.  The size expression is parenthesized so the macro behaves as a
 * single value inside larger expressions (e.g. x / RPING_BUFSIZE).
 */
#define RPING_BUFSIZE (128 * 1024)
#define RPING_SQ_DEPTH 32
114 
115 static void krping_wait(struct krping_cb *cb, int state)
116 {
117 	int rc;
118 	mtx_lock(&cb->lock);
119 	while (cb->state < state) {
120 		rc = msleep(cb, &cb->lock, 0, "krping", 0);
121 		if (rc && rc != ERESTART) {
122 			cb->state = ERROR;
123 			break;
124 		}
125 	}
126 	mtx_unlock(&cb->lock);
127 }
128 
129 static int krping_cma_event_handler(struct rdma_cm_id *cma_id,
130 				   struct rdma_cm_event *event)
131 {
132 	int ret;
133 	struct krping_cb *cb = cma_id->context;
134 
135 	DEBUG_LOG(PFX "cma_event type %d cma_id %p (%s)\n", event->event, cma_id,
136 		  (cma_id == cb->cm_id) ? "parent" : "child");
137 
138 	mtx_lock(&cb->lock);
139 	switch (event->event) {
140 	case RDMA_CM_EVENT_ADDR_RESOLVED:
141 		cb->state = ADDR_RESOLVED;
142 		ret = rdma_resolve_route(cma_id, 2000);
143 		if (ret) {
144 			log(LOG_ERR, "rdma_resolve_route error %d\n",
145 			       ret);
146 			wakeup(cb);
147 		}
148 		break;
149 
150 	case RDMA_CM_EVENT_ROUTE_RESOLVED:
151 		cb->state = ROUTE_RESOLVED;
152 		wakeup(cb);
153 		break;
154 
155 	case RDMA_CM_EVENT_CONNECT_REQUEST:
156 		cb->state = CONNECT_REQUEST;
157 		cb->child_cm_id = cma_id;
158 		DEBUG_LOG(PFX "child cma %p\n", cb->child_cm_id);
159 		wakeup(cb);
160 		break;
161 
162 	case RDMA_CM_EVENT_ESTABLISHED:
163 		DEBUG_LOG(PFX "ESTABLISHED\n");
164 		if (!cb->server) {
165 			cb->state = CONNECTED;
166 			wakeup(cb);
167 		}
168 		break;
169 
170 	case RDMA_CM_EVENT_ADDR_ERROR:
171 	case RDMA_CM_EVENT_ROUTE_ERROR:
172 	case RDMA_CM_EVENT_CONNECT_ERROR:
173 	case RDMA_CM_EVENT_UNREACHABLE:
174 	case RDMA_CM_EVENT_REJECTED:
175 		log(LOG_ERR, "cma event %d, error %d\n", event->event,
176 		       event->status);
177 		cb->state = ERROR;
178 		wakeup(cb);
179 		break;
180 
181 	case RDMA_CM_EVENT_DISCONNECTED:
182 		DEBUG_LOG(PFX "DISCONNECT EVENT...\n");
183 		cb->state = ERROR;
184 		wakeup(cb);
185 		break;
186 
187 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
188 		DEBUG_LOG(PFX "cma detected device removal!!!!\n");
189 		break;
190 
191 	default:
192 		log(LOG_ERR, "oof bad type!\n");
193 		wakeup(cb);
194 		break;
195 	}
196 	mtx_unlock(&cb->lock);
197 	return 0;
198 }
199 
200 static int server_recv(struct krping_cb *cb, struct ib_wc *wc)
201 {
202 	if (wc->byte_len != sizeof(cb->recv_buf)) {
203 		log(LOG_ERR, "Received bogus data, size %d\n",
204 		       wc->byte_len);
205 		return -1;
206 	}
207 
208 	cb->remote_rkey = ntohl(cb->recv_buf.rkey);
209 	cb->remote_addr = ntohll(cb->recv_buf.buf);
210 	cb->remote_len  = ntohl(cb->recv_buf.size);
211 	DEBUG_LOG(PFX "Received rkey %x addr %llx len %d from peer\n",
212 		  cb->remote_rkey, (unsigned long long)cb->remote_addr,
213 		  cb->remote_len);
214 
215 	if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE)
216 		cb->state = RDMA_READ_ADV;
217 	else
218 		cb->state = RDMA_WRITE_ADV;
219 
220 	return 0;
221 }
222 
223 static int client_recv(struct krping_cb *cb, struct ib_wc *wc)
224 {
225 	if (wc->byte_len != sizeof(cb->recv_buf)) {
226 		log(LOG_ERR, "Received bogus data, size %d\n",
227 		       wc->byte_len);
228 		return -1;
229 	}
230 
231 	if (cb->state == RDMA_READ_ADV)
232 		cb->state = RDMA_WRITE_ADV;
233 	else
234 		cb->state = RDMA_WRITE_COMPLETE;
235 
236 	return 0;
237 }
238 
239 static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
240 {
241 	struct krping_cb *cb = ctx;
242 	struct ib_wc wc;
243 	struct ib_recv_wr *bad_wr;
244 	int ret;
245 
246 	mtx_lock(&cb->lock);
247 	KASSERT(cb->cq == cq, ("bad condition"));
248 	if (cb->state == ERROR) {
249 		log(LOG_ERR,  "cq completion in ERROR state\n");
250 		mtx_unlock(&cb->lock);
251 		return;
252 	}
253 	if (!cb->wlat && !cb->rlat && !cb->bw)
254 		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
255 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
256 		if (wc.status) {
257 			if (wc.status != IB_WC_WR_FLUSH_ERR)
258 				log(LOG_ERR, "cq completion failed status %d\n",
259 					wc.status);
260 			goto error;
261 		}
262 
263 		switch (wc.opcode) {
264 		case IB_WC_SEND:
265 			DEBUG_LOG(PFX "send completion\n");
266 			cb->stats.send_bytes += cb->send_sgl.length;
267 			cb->stats.send_msgs++;
268 			break;
269 
270 		case IB_WC_RDMA_WRITE:
271 			DEBUG_LOG(PFX "rdma write completion\n");
272 			cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length;
273 			cb->stats.write_msgs++;
274 			cb->state = RDMA_WRITE_COMPLETE;
275 			wakeup(cb);
276 			break;
277 
278 		case IB_WC_RDMA_READ:
279 			DEBUG_LOG(PFX "rdma read completion\n");
280 			cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length;
281 			cb->stats.read_msgs++;
282 			cb->state = RDMA_READ_COMPLETE;
283 			wakeup(cb);
284 			break;
285 
286 		case IB_WC_RECV:
287 			DEBUG_LOG(PFX "recv completion\n");
288 			cb->stats.recv_bytes += sizeof(cb->recv_buf);
289 			cb->stats.recv_msgs++;
290 			if (cb->wlat || cb->rlat || cb->bw)
291 				ret = server_recv(cb, &wc);
292 			else
293 				ret = cb->server ? server_recv(cb, &wc) :
294 					   client_recv(cb, &wc);
295 			if (ret) {
296 				log(LOG_ERR, "recv wc error: %d\n", ret);
297 				goto error;
298 			}
299 
300 			ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
301 			if (ret) {
302 				log(LOG_ERR, "post recv error: %d\n",
303 				       ret);
304 				goto error;
305 			}
306 			wakeup(cb);
307 			break;
308 
309 		default:
310 			log(LOG_ERR, "unknown!!!!! completion\n");
311 			goto error;
312 		}
313 	}
314 	if (ret) {
315 		log(LOG_ERR, "poll error %d\n", ret);
316 		goto error;
317 	}
318 	mtx_unlock(&cb->lock);
319 	return;
320 error:
321 	cb->state = ERROR;
322 	wakeup(cb);
323 	mtx_unlock(&cb->lock);
324 }
325 
326 static int krping_accept(struct krping_cb *cb)
327 {
328 	struct rdma_conn_param conn_param;
329 	int ret;
330 
331 	DEBUG_LOG(PFX "accepting client connection request\n");
332 
333 	memset(&conn_param, 0, sizeof conn_param);
334 	conn_param.responder_resources = 1;
335 	conn_param.initiator_depth = 1;
336 
337 	ret = rdma_accept(cb->child_cm_id, &conn_param);
338 	if (ret) {
339 		log(LOG_ERR, "rdma_accept error: %d\n", ret);
340 		return ret;
341 	}
342 
343 	if (!cb->wlat && !cb->rlat && !cb->bw) {
344 		krping_wait(cb, CONNECTED);
345 		if (cb->state == ERROR) {
346 			log(LOG_ERR,  "wait for CONNECTED state %d\n", cb->state);
347 			return -1;
348 		}
349 	}
350 	return 0;
351 }
352 
353 static void krping_setup_wr(struct krping_cb *cb)
354 {
355 	/* XXX X86 only here... not mapping for dma! */
356 	cb->recv_sgl.addr = vtophys(&cb->recv_buf);
357 	cb->recv_sgl.length = sizeof cb->recv_buf;
358 	if (cb->use_dmamr)
359 		cb->recv_sgl.lkey = cb->dma_mr->lkey;
360 	else
361 		cb->recv_sgl.lkey = cb->recv_mr->lkey;
362 	cb->rq_wr.sg_list = &cb->recv_sgl;
363 	cb->rq_wr.num_sge = 1;
364 
365 	cb->send_sgl.addr = vtophys(&cb->send_buf);
366 	cb->send_sgl.length = sizeof cb->send_buf;
367 	if (cb->use_dmamr)
368 		cb->send_sgl.lkey = cb->dma_mr->lkey;
369 	else
370 		cb->send_sgl.lkey = cb->send_mr->lkey;
371 
372 	cb->sq_wr.opcode = IB_WR_SEND;
373 	cb->sq_wr.send_flags = IB_SEND_SIGNALED;
374 	cb->sq_wr.sg_list = &cb->send_sgl;
375 	cb->sq_wr.num_sge = 1;
376 
377 	cb->rdma_addr = vtophys(cb->rdma_buf);
378 	cb->rdma_sgl.addr = cb->rdma_addr;
379 	if (cb->use_dmamr)
380 		cb->rdma_sgl.lkey = cb->dma_mr->lkey;
381 	else
382 		cb->rdma_sgl.lkey = cb->rdma_mr->lkey;
383 	cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED;
384 	cb->rdma_sq_wr.sg_list = &cb->rdma_sgl;
385 	cb->rdma_sq_wr.num_sge = 1;
386 
387 	if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
388 		cb->start_addr = vtophys(cb->start_buf);
389 	}
390 }
391 
392 static int krping_setup_buffers(struct krping_cb *cb)
393 {
394 	int ret;
395 	struct ib_phys_buf buf;
396 	u64 iovbase;
397 
398 	DEBUG_LOG(PFX "krping_setup_buffers called on cb %p\n", cb);
399 
400 	if (cb->use_dmamr) {
401 		cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE|
402 					   IB_ACCESS_REMOTE_READ|
403 				           IB_ACCESS_REMOTE_WRITE);
404 		if (IS_ERR(cb->dma_mr)) {
405 			log(LOG_ERR, "reg_dmamr failed\n");
406 			return PTR_ERR(cb->dma_mr);
407 		}
408 	} else {
409 
410 		buf.addr = vtophys(&cb->recv_buf);
411 		buf.size = sizeof cb->recv_buf;
412 		iovbase = vtophys(&cb->recv_buf);
413 		cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
414 					     IB_ACCESS_LOCAL_WRITE,
415 					     &iovbase);
416 
417 		if (IS_ERR(cb->recv_mr)) {
418 			log(LOG_ERR, "recv_buf reg_mr failed\n");
419 			return PTR_ERR(cb->recv_mr);
420 		}
421 
422 		buf.addr = vtophys(&cb->send_buf);
423 		buf.size = sizeof cb->send_buf;
424 		iovbase = vtophys(&cb->send_buf);
425 		cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
426 					     0, &iovbase);
427 
428 		if (IS_ERR(cb->send_mr)) {
429 			log(LOG_ERR, "send_buf reg_mr failed\n");
430 			ib_dereg_mr(cb->recv_mr);
431 			return PTR_ERR(cb->send_mr);
432 		}
433 	}
434 
435 	cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, -1UL,
436 		PAGE_SIZE, 0);
437 
438 	if (!cb->rdma_buf) {
439 		log(LOG_ERR, "rdma_buf malloc failed\n");
440 		ret = ENOMEM;
441 		goto err1;
442 	}
443 	if (!cb->use_dmamr) {
444 
445 		buf.addr = vtophys(cb->rdma_buf);
446 		buf.size = cb->size;
447 		iovbase = vtophys(cb->rdma_buf);
448 		cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
449 					     IB_ACCESS_REMOTE_READ|
450 					     IB_ACCESS_REMOTE_WRITE,
451 					     &iovbase);
452 
453 		if (IS_ERR(cb->rdma_mr)) {
454 			log(LOG_ERR, "rdma_buf reg_mr failed\n");
455 			ret = PTR_ERR(cb->rdma_mr);
456 			goto err2;
457 		}
458 	}
459 
460 	if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
461 		cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK,
462 			0, -1UL, PAGE_SIZE, 0);
463 		if (!cb->start_buf) {
464 			log(LOG_ERR, "start_buf malloc failed\n");
465 			ret = ENOMEM;
466 			goto err2;
467 		}
468 		if (!cb->use_dmamr) {
469 			unsigned flags = IB_ACCESS_REMOTE_READ;
470 
471 			if (cb->wlat || cb->rlat || cb->bw)
472 				flags |= IB_ACCESS_REMOTE_WRITE;
473 			buf.addr = vtophys(cb->start_buf);
474 			buf.size = cb->size;
475 			iovbase = vtophys(cb->start_buf);
476 			cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
477 					     flags,
478 					     &iovbase);
479 
480 			if (IS_ERR(cb->start_mr)) {
481 				log(LOG_ERR, "start_buf reg_mr failed\n");
482 				ret = PTR_ERR(cb->start_mr);
483 				goto err3;
484 			}
485 		}
486 	}
487 
488 	krping_setup_wr(cb);
489 	DEBUG_LOG(PFX "allocated & registered buffers...\n");
490 	return 0;
491 err3:
492 	contigfree(cb->start_buf, cb->size, M_DEVBUF);
493 
494 	if (!cb->use_dmamr)
495 		ib_dereg_mr(cb->rdma_mr);
496 err2:
497 	contigfree(cb->rdma_buf, cb->size, M_DEVBUF);
498 err1:
499 	if (cb->use_dmamr)
500 		ib_dereg_mr(cb->dma_mr);
501 	else {
502 		ib_dereg_mr(cb->recv_mr);
503 		ib_dereg_mr(cb->send_mr);
504 	}
505 	return ret;
506 }
507 
508 static void krping_free_buffers(struct krping_cb *cb)
509 {
510 	DEBUG_LOG(PFX "krping_free_buffers called on cb %p\n", cb);
511 
512 #if 0
513 	dma_unmap_single(cb->pd->device->dma_device,
514 			 pci_unmap_addr(cb, recv_mapping),
515 			 sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
516 	dma_unmap_single(cb->pd->device->dma_device,
517 			 pci_unmap_addr(cb, send_mapping),
518 			 sizeof(cb->send_buf), DMA_BIDIRECTIONAL);
519 	dma_unmap_single(cb->pd->device->dma_device,
520 			 pci_unmap_addr(cb, rdma_mapping),
521 			 cb->size, DMA_BIDIRECTIONAL);
522 #endif
523 	contigfree(cb->rdma_buf, cb->size, M_DEVBUF);
524 	if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
525 #if 0
526 		dma_unmap_single(cb->pd->device->dma_device,
527 			 pci_unmap_addr(cb, start_mapping),
528 			 cb->size, DMA_BIDIRECTIONAL);
529 #endif
530 		contigfree(cb->start_buf, cb->size, M_DEVBUF);
531 	}
532 	if (cb->use_dmamr)
533 		ib_dereg_mr(cb->dma_mr);
534 	else {
535 		ib_dereg_mr(cb->send_mr);
536 		ib_dereg_mr(cb->recv_mr);
537 		ib_dereg_mr(cb->rdma_mr);
538 		if (!cb->server)
539 			ib_dereg_mr(cb->start_mr);
540 	}
541 }
542 
543 static int krping_create_qp(struct krping_cb *cb)
544 {
545 	struct ib_qp_init_attr init_attr;
546 	int ret;
547 
548 	memset(&init_attr, 0, sizeof(init_attr));
549 	init_attr.cap.max_send_wr = cb->txdepth;
550 	init_attr.cap.max_recv_wr = 2;
551 	init_attr.cap.max_recv_sge = 1;
552 	init_attr.cap.max_send_sge = 1;
553 	init_attr.qp_type = IB_QPT_RC;
554 	init_attr.send_cq = cb->cq;
555 	init_attr.recv_cq = cb->cq;
556 
557 	if (cb->server) {
558 		ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr);
559 		if (!ret)
560 			cb->qp = cb->child_cm_id->qp;
561 	} else {
562 		ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr);
563 		if (!ret)
564 			cb->qp = cb->cm_id->qp;
565 	}
566 
567 	return ret;
568 }
569 
570 static void krping_free_qp(struct krping_cb *cb)
571 {
572 	ib_destroy_qp(cb->qp);
573 	ib_destroy_cq(cb->cq);
574 	ib_dealloc_pd(cb->pd);
575 }
576 
577 static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id)
578 {
579 	int ret;
580 	cb->pd = ib_alloc_pd(cm_id->device);
581 	if (IS_ERR(cb->pd)) {
582 		log(LOG_ERR, "ib_alloc_pd failed\n");
583 		return PTR_ERR(cb->pd);
584 	}
585 	DEBUG_LOG(PFX "created pd %p\n", cb->pd);
586 
587 	cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL,
588 			      cb, cb->txdepth * 2, 0);
589 	if (IS_ERR(cb->cq)) {
590 		log(LOG_ERR, "ib_create_cq failed\n");
591 		ret = PTR_ERR(cb->cq);
592 		goto err1;
593 	}
594 	DEBUG_LOG(PFX "created cq %p\n", cb->cq);
595 
596 	if (!cb->wlat && !cb->rlat && !cb->bw) {
597 		ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
598 		if (ret) {
599 			log(LOG_ERR, "ib_create_cq failed\n");
600 			goto err2;
601 		}
602 	}
603 
604 	ret = krping_create_qp(cb);
605 	if (ret) {
606 		log(LOG_ERR, "krping_create_qp failed: %d\n", ret);
607 		goto err2;
608 	}
609 	DEBUG_LOG(PFX "created qp %p\n", cb->qp);
610 	return 0;
611 err2:
612 	ib_destroy_cq(cb->cq);
613 err1:
614 	ib_dealloc_pd(cb->pd);
615 	return ret;
616 }
617 
618 static void krping_format_send(struct krping_cb *cb, u64 buf,
619 			       struct ib_mr *mr)
620 {
621 	struct krping_rdma_info *info = &cb->send_buf;
622 
623 	info->buf = htonll(buf);
624 	info->rkey = htonl(mr->rkey);
625 	info->size = htonl(cb->size);
626 
627 	DEBUG_LOG(PFX "RDMA addr %llx rkey %x len %d\n",
628 		  (unsigned long long)buf, mr->rkey, cb->size);
629 }
630 
631 static void krping_test_server(struct krping_cb *cb)
632 {
633 	struct ib_send_wr *bad_wr;
634 	int ret;
635 
636 	while (1) {
637 		/* Wait for client's Start STAG/TO/Len */
638 		krping_wait(cb, RDMA_READ_ADV);
639 		if (cb->state != RDMA_READ_ADV) {
640 			DEBUG_LOG(PFX "wait for RDMA_READ_ADV state %d\n",
641 				cb->state);
642 			break;
643 		}
644 
645 		DEBUG_LOG(PFX "server received sink adv\n");
646 
647 		/* Issue RDMA Read. */
648 		cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
649 		cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
650 		cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
651 		cb->rdma_sq_wr.sg_list->length = cb->remote_len;
652 
653 		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
654 		if (ret) {
655 			log(LOG_ERR, "post send error %d\n", ret);
656 			break;
657 		}
658 		DEBUG_LOG(PFX "server posted rdma read req \n");
659 
660 		/* Wait for read completion */
661 		krping_wait(cb, RDMA_READ_COMPLETE);
662 		if (cb->state != RDMA_READ_COMPLETE) {
663 			log(LOG_ERR,
664 			       "wait for RDMA_READ_COMPLETE state %d\n",
665 			       cb->state);
666 			break;
667 		}
668 		DEBUG_LOG(PFX "server received read complete\n");
669 
670 		/* Display data in recv buf */
671 		if (cb->verbose)
672 			DEBUG_LOG("server ping data: %s\n", cb->rdma_buf);
673 
674 		/* Tell client to continue */
675 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
676 		if (ret) {
677 			log(LOG_ERR, "post send error %d\n", ret);
678 			break;
679 		}
680 		DEBUG_LOG(PFX "server posted go ahead\n");
681 
682 		/* Wait for client's RDMA STAG/TO/Len */
683 		krping_wait(cb, RDMA_WRITE_ADV);
684 		if (cb->state != RDMA_WRITE_ADV) {
685 			log(LOG_ERR,
686 			       "wait for RDMA_WRITE_ADV state %d\n",
687 			       cb->state);
688 			break;
689 		}
690 		DEBUG_LOG(PFX "server received sink adv\n");
691 
692 		/* RDMA Write echo data */
693 		cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
694 		cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
695 		cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
696 		cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1;
697 		DEBUG_LOG(PFX "rdma write from lkey %x laddr %llx len %d\n",
698 			  cb->rdma_sq_wr.sg_list->lkey,
699 			  (unsigned long long)cb->rdma_sq_wr.sg_list->addr,
700 			  cb->rdma_sq_wr.sg_list->length);
701 
702 		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
703 		if (ret) {
704 			log(LOG_ERR, "post send error %d\n", ret);
705 			break;
706 		}
707 
708 		/* Wait for completion */
709 		krping_wait(cb, RDMA_WRITE_COMPLETE);
710 		if (cb->state != RDMA_WRITE_COMPLETE) {
711 			log(LOG_ERR,
712 			       "wait for RDMA_WRITE_COMPLETE state %d\n",
713 			       cb->state);
714 			break;
715 		}
716 		DEBUG_LOG(PFX "server rdma write complete \n");
717 
718 		cb->state = CONNECTED;
719 
720 		/* Tell client to begin again */
721 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
722 		if (ret) {
723 			log(LOG_ERR, "post send error %d\n", ret);
724 			break;
725 		}
726 		DEBUG_LOG(PFX "server posted go ahead\n");
727 	}
728 }
729 
730 static void rlat_test(struct krping_cb *cb)
731 {
732 	int scnt;
733 	int iters = cb->count;
734 	struct timeval start_tv, stop_tv;
735 	int ret;
736 	struct ib_wc wc;
737 	struct ib_send_wr *bad_wr;
738 	int ne;
739 
740 	scnt = 0;
741 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
742 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
743 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
744 	cb->rdma_sq_wr.sg_list->length = cb->size;
745 
746 	microtime(&start_tv);
747  	if (!cb->poll) {
748  		cb->state = RDMA_READ_ADV;
749  		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
750  	}
751 	while (scnt < iters) {
752 
753  		cb->state = RDMA_READ_ADV;
754 		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
755 		if (ret) {
756 			log(LOG_ERR,
757 				"Couldn't post send: ret=%d scnt %d\n",
758 				ret, scnt);
759 			return;
760 		}
761 
762 		do {
763 			if (!cb->poll) {
764 				krping_wait(cb, RDMA_READ_COMPLETE);
765 				if (cb->state == RDMA_READ_COMPLETE) {
766 					ne = 1;
767 					ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
768 				} else {
769 					ne = -1;
770 				}
771 			} else
772 				ne = ib_poll_cq(cb->cq, 1, &wc);
773 			if (cb->state == ERROR) {
774 				log(LOG_ERR,
775 				       "state == ERROR...bailing scnt %d\n", scnt);
776 				return;
777 			}
778 		} while (ne == 0);
779 
780 		if (ne < 0) {
781 			log(LOG_ERR, "poll CQ failed %d\n", ne);
782 			return;
783 		}
784  		if (cb->poll && wc.status != IB_WC_SUCCESS) {
785 			log(LOG_ERR, "Completion wth error at %s:\n",
786 				cb->server ? "server" : "client");
787 			log(LOG_ERR, "Failed status %d: wr_id %d\n",
788 				wc.status, (int) wc.wr_id);
789 			return;
790 		}
791 		++scnt;
792 	}
793 	microtime(&stop_tv);
794 
795         if (stop_tv.tv_usec < start_tv.tv_usec) {
796                 stop_tv.tv_usec += 1000000;
797                 stop_tv.tv_sec  -= 1;
798         }
799 
800 	log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d\n",
801 		stop_tv.tv_sec - start_tv.tv_sec,
802 		stop_tv.tv_usec - start_tv.tv_usec,
803 		scnt, cb->size);
804 }
805 
806 static int alloc_cycle_mem(int cycle_iters,
807 				cycles_t **post_cycles_start,
808 				cycles_t **post_cycles_stop,
809 				cycles_t **poll_cycles_start,
810 				cycles_t **poll_cycles_stop,
811 				cycles_t **last_poll_cycles_start)
812 {
813 	*post_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
814 	if (!*post_cycles_start) {
815 		goto fail1;
816 	}
817 	*post_cycles_stop = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
818 	if (!*post_cycles_stop) {
819 		goto fail2;
820 	}
821 	*poll_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
822 	if (!*poll_cycles_start) {
823 		goto fail3;
824 	}
825 	*poll_cycles_stop = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
826 	if (!*poll_cycles_stop) {
827 		goto fail4;
828 	}
829 	*last_poll_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
830 	if (!*last_poll_cycles_start) {
831 		goto fail5;
832 	}
833 	return 0;
834 fail5:
835 	free(*poll_cycles_stop, M_DEVBUF);
836 fail4:
837 	free(*poll_cycles_start, M_DEVBUF);
838 fail3:
839 	free(*post_cycles_stop, M_DEVBUF);
840 fail2:
841 	free(*post_cycles_start, M_DEVBUF);
842 fail1:
843 	log(LOG_ERR, "%s malloc failed\n", __FUNCTION__);
844 	return ENOMEM;
845 }
846 
847 static void free_cycle_mem(cycles_t *post_cycles_start,
848 				cycles_t *post_cycles_stop,
849 				cycles_t *poll_cycles_start,
850 				cycles_t *poll_cycles_stop,
851 				cycles_t *last_poll_cycles_start)
852 {
853 	free(last_poll_cycles_start, M_DEVBUF);
854 	free(poll_cycles_stop, M_DEVBUF);
855 	free(poll_cycles_start, M_DEVBUF);
856 	free(post_cycles_stop, M_DEVBUF);
857 	free(post_cycles_start, M_DEVBUF);
858 }
859 
860 static void wlat_test(struct krping_cb *cb)
861 {
862 	int ccnt, scnt, rcnt;
863 	int iters=cb->count;
864 	volatile char *poll_buf = (char *) cb->start_buf;
865 	char *buf = (char *)cb->rdma_buf;
866 	ccnt = 0;
867 	scnt = 0;
868 	rcnt = 0;
869 	struct timeval start_tv, stop_tv;
870 	cycles_t *post_cycles_start, *post_cycles_stop;
871 	cycles_t *poll_cycles_start, *poll_cycles_stop;
872 	cycles_t *last_poll_cycles_start;
873 	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
874 	int i;
875 	int cycle_iters = 1000;
876 	int err;
877 
878 	err = alloc_cycle_mem(cycle_iters, &post_cycles_start, &post_cycles_stop,
879 				&poll_cycles_start, &poll_cycles_stop, &last_poll_cycles_start);
880 
881 	if (err) {
882 		log(LOG_ERR, "%s malloc failed\n", __FUNCTION__);
883 		return;
884 	}
885 
886 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
887 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
888 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
889 	cb->rdma_sq_wr.sg_list->length = cb->size;
890 
891 	if (cycle_iters > iters)
892 		cycle_iters = iters;
893 	microtime(&start_tv);
894 	while (scnt < iters || ccnt < iters || rcnt < iters) {
895 
896 		/* Wait till buffer changes. */
897 		if (rcnt < iters && !(scnt < 1 && !cb->server)) {
898 			++rcnt;
899 			while (*poll_buf != (char)rcnt) {
900 				if (cb->state == ERROR) {
901 					log(LOG_ERR, "state = ERROR, bailing\n");
902 					return;
903 				}
904 			}
905 		}
906 
907 		if (scnt < iters) {
908 			struct ib_send_wr *bad_wr;
909 
910 			*buf = (char)scnt+1;
911 			if (scnt < cycle_iters)
912 				post_cycles_start[scnt] = get_cycles();
913 			if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
914 				log(LOG_ERR,  "Couldn't post send: scnt=%d\n",
915 					scnt);
916 				return;
917 			}
918 			if (scnt < cycle_iters)
919 				post_cycles_stop[scnt] = get_cycles();
920 			scnt++;
921 		}
922 
923 		if (ccnt < iters) {
924 			struct ib_wc wc;
925 			int ne;
926 
927 			if (ccnt < cycle_iters)
928 				poll_cycles_start[ccnt] = get_cycles();
929 			do {
930 				if (ccnt < cycle_iters)
931 					last_poll_cycles_start[ccnt] = get_cycles();
932 				ne = ib_poll_cq(cb->cq, 1, &wc);
933 			} while (ne == 0);
934 			if (ccnt < cycle_iters)
935 				poll_cycles_stop[ccnt] = get_cycles();
936 			++ccnt;
937 
938 			if (ne < 0) {
939 				log(LOG_ERR, "poll CQ failed %d\n", ne);
940 				return;
941 			}
942 			if (wc.status != IB_WC_SUCCESS) {
943 				log(LOG_ERR, "Completion wth error at %s:\n",
944 					cb->server ? "server" : "client");
945 				log(LOG_ERR, "Failed status %d: wr_id %d\n",
946 					wc.status, (int) wc.wr_id);
947 				log(LOG_ERR, "scnt=%d, rcnt=%d, ccnt=%d\n",
948 					scnt, rcnt, ccnt);
949 				return;
950 			}
951 		}
952 	}
953 	microtime(&stop_tv);
954 
955         if (stop_tv.tv_usec < start_tv.tv_usec) {
956                 stop_tv.tv_usec += 1000000;
957                 stop_tv.tv_sec  -= 1;
958         }
959 
960 	for (i=0; i < cycle_iters; i++) {
961 		sum_post += post_cycles_stop[i] - post_cycles_start[i];
962 		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
963 		sum_last_poll += poll_cycles_stop[i] - last_poll_cycles_start[i];
964 	}
965 
966 	log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d cycle_iters %d sum_post %llu sum_poll %llu sum_last_poll %llu\n",
967 		stop_tv.tv_sec - start_tv.tv_sec,
968 		stop_tv.tv_usec - start_tv.tv_usec,
969 		scnt, cb->size, cycle_iters,
970 		(unsigned long long)sum_post, (unsigned long long)sum_poll,
971 		(unsigned long long)sum_last_poll);
972 
973 	free_cycle_mem(post_cycles_start, post_cycles_stop, poll_cycles_start,
974 			poll_cycles_stop, last_poll_cycles_start);
975 }
976 
977 static void bw_test(struct krping_cb *cb)
978 {
979 	int ccnt, scnt, rcnt;
980 	int iters=cb->count;
981 	ccnt = 0;
982 	scnt = 0;
983 	rcnt = 0;
984 	struct timeval start_tv, stop_tv;
985 	cycles_t *post_cycles_start, *post_cycles_stop;
986 	cycles_t *poll_cycles_start, *poll_cycles_stop;
987 	cycles_t *last_poll_cycles_start;
988 	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
989 	int i;
990 	int cycle_iters = 1000;
991 	int err;
992 
993 	err = alloc_cycle_mem(cycle_iters, &post_cycles_start, &post_cycles_stop,
994 				&poll_cycles_start, &poll_cycles_stop, &last_poll_cycles_start);
995 
996 	if (err) {
997 		log(LOG_ERR, "%s kmalloc failed\n", __FUNCTION__);
998 		return;
999 	}
1000 
1001 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1002 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1003 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1004 	cb->rdma_sq_wr.sg_list->length = cb->size;
1005 
1006 	if (cycle_iters > iters)
1007 		cycle_iters = iters;
1008 	microtime(&start_tv);
1009 	while (scnt < iters || ccnt < iters) {
1010 
1011 		while (scnt < iters && scnt - ccnt < cb->txdepth) {
1012 			struct ib_send_wr *bad_wr;
1013 
1014 			if (scnt < cycle_iters)
1015 				post_cycles_start[scnt] = get_cycles();
1016 			if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1017 				log(LOG_ERR,  "Couldn't post send: scnt=%d\n",
1018 					scnt);
1019 				return;
1020 			}
1021 			if (scnt < cycle_iters)
1022 				post_cycles_stop[scnt] = get_cycles();
1023 			++scnt;
1024 		}
1025 
1026 		if (ccnt < iters) {
1027 			int ne;
1028 			struct ib_wc wc;
1029 
1030 			if (ccnt < cycle_iters)
1031 				poll_cycles_start[ccnt] = get_cycles();
1032 			do {
1033 				if (ccnt < cycle_iters)
1034 					last_poll_cycles_start[ccnt] = get_cycles();
1035 				ne = ib_poll_cq(cb->cq, 1, &wc);
1036 			} while (ne == 0);
1037 			if (ccnt < cycle_iters)
1038 				poll_cycles_stop[ccnt] = get_cycles();
1039 			ccnt += 1;
1040 
1041 			if (ne < 0) {
1042 				log(LOG_ERR, "poll CQ failed %d\n", ne);
1043 				return;
1044 			}
1045 			if (wc.status != IB_WC_SUCCESS) {
1046 				log(LOG_ERR, "Completion wth error at %s:\n",
1047 					cb->server ? "server" : "client");
1048 				log(LOG_ERR, "Failed status %d: wr_id %d\n",
1049 					wc.status, (int) wc.wr_id);
1050 				return;
1051 			}
1052 		}
1053 	}
1054 	microtime(&stop_tv);
1055 
1056         if (stop_tv.tv_usec < start_tv.tv_usec) {
1057                 stop_tv.tv_usec += 1000000;
1058                 stop_tv.tv_sec  -= 1;
1059         }
1060 
1061 	for (i=0; i < cycle_iters; i++) {
1062 		sum_post += post_cycles_stop[i] - post_cycles_start[i];
1063 		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1064 		sum_last_poll += poll_cycles_stop[i] - last_poll_cycles_start[i];
1065 	}
1066 
1067 	log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d cycle_iters %d sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1068 		stop_tv.tv_sec - start_tv.tv_sec,
1069 		stop_tv.tv_usec - start_tv.tv_usec,
1070 		scnt, cb->size, cycle_iters,
1071 		(unsigned long long)sum_post, (unsigned long long)sum_poll,
1072 		(unsigned long long)sum_last_poll);
1073 
1074 	free_cycle_mem(post_cycles_start, post_cycles_stop, poll_cycles_start,
1075 			poll_cycles_stop, last_poll_cycles_start);
1076 }
1077 
1078 static void krping_rlat_test_server(struct krping_cb *cb)
1079 {
1080 	struct ib_send_wr *bad_wr;
1081 	struct ib_wc wc;
1082 	int ret;
1083 
1084 	/* Spin waiting for client's Start STAG/TO/Len */
1085 	while (cb->state < RDMA_READ_ADV) {
1086 		krping_cq_event_handler(cb->cq, cb);
1087 	}
1088 
1089 	/* Send STAG/TO/Len to client */
1090 	if (cb->dma_mr)
1091 		krping_format_send(cb, cb->start_addr, cb->dma_mr);
1092 	else
1093 		krping_format_send(cb, cb->start_addr, cb->start_mr);
1094 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1095 	if (ret) {
1096 		log(LOG_ERR, "post send error %d\n", ret);
1097 		return;
1098 	}
1099 
1100 	/* Spin waiting for send completion */
1101 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1102 	if (ret < 0) {
1103 		log(LOG_ERR, "poll error %d\n", ret);
1104 		return;
1105 	}
1106 	if (wc.status) {
1107 		log(LOG_ERR, "send completiong error %d\n", wc.status);
1108 		return;
1109 	}
1110 
1111 	krping_wait(cb, ERROR);
1112 }
1113 
1114 static void krping_wlat_test_server(struct krping_cb *cb)
1115 {
1116 	struct ib_send_wr *bad_wr;
1117 	struct ib_wc wc;
1118 	int ret;
1119 
1120 	/* Spin waiting for client's Start STAG/TO/Len */
1121 	while (cb->state < RDMA_READ_ADV) {
1122 		krping_cq_event_handler(cb->cq, cb);
1123 	}
1124 
1125 	/* Send STAG/TO/Len to client */
1126 	if (cb->dma_mr)
1127 		krping_format_send(cb, cb->start_addr, cb->dma_mr);
1128 	else
1129 		krping_format_send(cb, cb->start_addr, cb->start_mr);
1130 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1131 	if (ret) {
1132 		log(LOG_ERR, "post send error %d\n", ret);
1133 		return;
1134 	}
1135 
1136 	/* Spin waiting for send completion */
1137 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1138 	if (ret < 0) {
1139 		log(LOG_ERR, "poll error %d\n", ret);
1140 		return;
1141 	}
1142 	if (wc.status) {
1143 		log(LOG_ERR, "send completiong error %d\n", wc.status);
1144 		return;
1145 	}
1146 
1147 	wlat_test(cb);
1148 
1149 }
1150 
1151 static void krping_bw_test_server(struct krping_cb *cb)
1152 {
1153 	struct ib_send_wr *bad_wr;
1154 	struct ib_wc wc;
1155 	int ret;
1156 
1157 	/* Spin waiting for client's Start STAG/TO/Len */
1158 	while (cb->state < RDMA_READ_ADV) {
1159 		krping_cq_event_handler(cb->cq, cb);
1160 	}
1161 
1162 	/* Send STAG/TO/Len to client */
1163 	if (cb->dma_mr)
1164 		krping_format_send(cb, cb->start_addr, cb->dma_mr);
1165 	else
1166 		krping_format_send(cb, cb->start_addr, cb->start_mr);
1167 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1168 	if (ret) {
1169 		log(LOG_ERR, "post send error %d\n", ret);
1170 		return;
1171 	}
1172 
1173 	/* Spin waiting for send completion */
1174 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1175 	if (ret < 0) {
1176 		log(LOG_ERR, "poll error %d\n", ret);
1177 		return;
1178 	}
1179 	if (wc.status) {
1180 		log(LOG_ERR, "send completiong error %d\n", wc.status);
1181 		return;
1182 	}
1183 
1184 	if (cb->duplex)
1185 		bw_test(cb);
1186 	krping_wait(cb, ERROR);
1187 }
1188 
1189 static int krping_bind_server(struct krping_cb *cb)
1190 {
1191 	struct sockaddr_in sin;
1192 	int ret;
1193 
1194 	memset(&sin, 0, sizeof(sin));
1195 	sin.sin_len = sizeof sin;
1196 	sin.sin_family = AF_INET;
1197 	sin.sin_addr.s_addr = cb->addr.s_addr;
1198 	sin.sin_port = cb->port;
1199 
1200 	ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin);
1201 	if (ret) {
1202 		log(LOG_ERR, "rdma_bind_addr error %d\n", ret);
1203 		return ret;
1204 	}
1205 	DEBUG_LOG(PFX "rdma_bind_addr successful\n");
1206 
1207 	DEBUG_LOG(PFX "rdma_listen\n");
1208 	ret = rdma_listen(cb->cm_id, 3);
1209 	if (ret) {
1210 		log(LOG_ERR, "rdma_listen failed: %d\n", ret);
1211 		return ret;
1212 	}
1213 
1214 	krping_wait(cb, CONNECT_REQUEST);
1215 	if (cb->state != CONNECT_REQUEST) {
1216 		log(LOG_ERR,  "wait for CONNECT_REQUEST state %d\n",
1217 			cb->state);
1218 		return -1;
1219 	}
1220 
1221 	return 0;
1222 }
1223 
1224 static void krping_run_server(struct krping_cb *cb)
1225 {
1226 	struct ib_recv_wr *bad_wr;
1227 	int ret;
1228 
1229 	ret = krping_bind_server(cb);
1230 	if (ret)
1231 		return;
1232 
1233 	ret = krping_setup_qp(cb, cb->child_cm_id);
1234 	if (ret) {
1235 		log(LOG_ERR, "setup_qp failed: %d\n", ret);
1236 		return;
1237 	}
1238 
1239 	ret = krping_setup_buffers(cb);
1240 	if (ret) {
1241 		log(LOG_ERR, "krping_setup_buffers failed: %d\n", ret);
1242 		goto err1;
1243 	}
1244 
1245 	ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1246 	if (ret) {
1247 		log(LOG_ERR, "ib_post_recv failed: %d\n", ret);
1248 		goto err2;
1249 	}
1250 
1251 	ret = krping_accept(cb);
1252 	if (ret) {
1253 		log(LOG_ERR, "connect error %d\n", ret);
1254 		goto err2;
1255 	}
1256 
1257 	if (cb->wlat)
1258 		krping_wlat_test_server(cb);
1259 	else if (cb->rlat)
1260 		krping_rlat_test_server(cb);
1261 	else if (cb->bw)
1262 		krping_bw_test_server(cb);
1263 	else
1264 		krping_test_server(cb);
1265 
1266 	rdma_disconnect(cb->child_cm_id);
1267 	rdma_destroy_id(cb->child_cm_id);
1268 err2:
1269 	krping_free_buffers(cb);
1270 err1:
1271 	krping_free_qp(cb);
1272 }
1273 
1274 static void krping_test_client(struct krping_cb *cb)
1275 {
1276 	int ping, start, cc, i, ret;
1277 	struct ib_send_wr *bad_wr;
1278 	unsigned char c;
1279 
1280 	start = 65;
1281 	for (ping = 0; !cb->count || ping < cb->count; ping++) {
1282 		cb->state = RDMA_READ_ADV;
1283 
1284 		/* Put some ascii text in the buffer. */
1285 		cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping);
1286 		for (i = cc, c = start; i < cb->size; i++) {
1287 			cb->start_buf[i] = c;
1288 			c++;
1289 			if (c > 122)
1290 				c = 65;
1291 		}
1292 		start++;
1293 		if (start > 122)
1294 			start = 65;
1295 		cb->start_buf[cb->size - 1] = 0;
1296 
1297 		if (cb->dma_mr)
1298 			krping_format_send(cb, cb->start_addr, cb->dma_mr);
1299 		else
1300 			krping_format_send(cb, cb->start_addr, cb->start_mr);
1301 
1302 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1303 		if (ret) {
1304 			log(LOG_ERR, "post send error %d\n", ret);
1305 			break;
1306 		}
1307 
1308 		/* Wait for server to ACK */
1309 		krping_wait(cb, RDMA_WRITE_ADV);
1310 		if (cb->state != RDMA_WRITE_ADV) {
1311 			log(LOG_ERR,
1312 			       "wait for RDMA_WRITE_ADV state %d\n",
1313 			       cb->state);
1314 			break;
1315 		}
1316 
1317 		if (cb->dma_mr)
1318 			krping_format_send(cb, cb->rdma_addr, cb->dma_mr);
1319 		else
1320 			krping_format_send(cb, cb->rdma_addr, cb->rdma_mr);
1321 
1322 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1323 		if (ret) {
1324 			log(LOG_ERR, "post send error %d\n", ret);
1325 			break;
1326 		}
1327 
1328 		/* Wait for the server to say the RDMA Write is complete. */
1329 		krping_wait(cb, RDMA_WRITE_COMPLETE);
1330 		if (cb->state != RDMA_WRITE_COMPLETE) {
1331 			log(LOG_ERR,
1332 			       "wait for RDMA_WRITE_COMPLETE state %d\n",
1333 			       cb->state);
1334 			break;
1335 		}
1336 
1337 		if (cb->validate)
1338 			if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) {
1339 				log(LOG_ERR, "data mismatch!\n");
1340 				break;
1341 			}
1342 
1343 		if (cb->verbose)
1344 			DEBUG_LOG("ping data: %s\n", cb->rdma_buf);
1345 	}
1346 }
1347 
1348 static void krping_rlat_test_client(struct krping_cb *cb)
1349 {
1350 	struct ib_send_wr *bad_wr;
1351 	struct ib_wc wc;
1352 	int ret;
1353 
1354 	cb->state = RDMA_READ_ADV;
1355 
1356 	/* Send STAG/TO/Len to client */
1357 	if (cb->dma_mr)
1358 		krping_format_send(cb, cb->start_addr, cb->dma_mr);
1359 	else
1360 		krping_format_send(cb, cb->start_addr, cb->rdma_mr);
1361 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1362 	if (ret) {
1363 		log(LOG_ERR, "post send error %d\n", ret);
1364 		return;
1365 	}
1366 
1367 	/* Spin waiting for send completion */
1368 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1369 	if (ret < 0) {
1370 		log(LOG_ERR, "poll error %d\n", ret);
1371 		return;
1372 	}
1373 	if (wc.status) {
1374 		log(LOG_ERR, "send completion error %d\n", wc.status);
1375 		return;
1376 	}
1377 
1378 	/* Spin waiting for server's Start STAG/TO/Len */
1379 	while (cb->state < RDMA_WRITE_ADV) {
1380 		krping_cq_event_handler(cb->cq, cb);
1381 	}
1382 
1383 #if 0
1384 {
1385 	int i;
1386 	struct timeval start, stop;
1387 	time_t sec;
1388 	suseconds_t usec;
1389 	unsigned long long elapsed;
1390 	struct ib_wc wc;
1391 	struct ib_send_wr *bad_wr;
1392 	int ne;
1393 
1394 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1395 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1396 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1397 	cb->rdma_sq_wr.sg_list->length = 0;
1398 	cb->rdma_sq_wr.num_sge = 0;
1399 
1400 	microtime(&start);
1401 	for (i=0; i < 100000; i++) {
1402 		if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1403 			log(LOG_ERR,  "Couldn't post send\n");
1404 			return;
1405 		}
1406 		do {
1407 			ne = ib_poll_cq(cb->cq, 1, &wc);
1408 		} while (ne == 0);
1409 		if (ne < 0) {
1410 			log(LOG_ERR, "poll CQ failed %d\n", ne);
1411 			return;
1412 		}
1413 		if (wc.status != IB_WC_SUCCESS) {
1414 			log(LOG_ERR, "Completion wth error at %s:\n",
1415 				cb->server ? "server" : "client");
1416 			log(LOG_ERR, "Failed status %d: wr_id %d\n",
1417 				wc.status, (int) wc.wr_id);
1418 			return;
1419 		}
1420 	}
1421 	microtime(&stop);
1422 
1423 	if (stop.tv_usec < start.tv_usec) {
1424 		stop.tv_usec += 1000000;
1425 		stop.tv_sec  -= 1;
1426 	}
1427 	sec     = stop.tv_sec - start.tv_sec;
1428 	usec    = stop.tv_usec - start.tv_usec;
1429 	elapsed = sec * 1000000 + usec;
1430 	log(LOG_ERR, "0B-write-lat iters 100000 usec %llu\n", elapsed);
1431 }
1432 #endif
1433 
1434 	rlat_test(cb);
1435 }
1436 
1437 static void krping_wlat_test_client(struct krping_cb *cb)
1438 {
1439 	struct ib_send_wr *bad_wr;
1440 	struct ib_wc wc;
1441 	int ret;
1442 
1443 	cb->state = RDMA_READ_ADV;
1444 
1445 	/* Send STAG/TO/Len to client */
1446 	if (cb->dma_mr)
1447 		krping_format_send(cb, cb->start_addr, cb->dma_mr);
1448 	else
1449 		krping_format_send(cb, cb->start_addr, cb->start_mr);
1450 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1451 	if (ret) {
1452 		log(LOG_ERR, "post send error %d\n", ret);
1453 		return;
1454 	}
1455 
1456 	/* Spin waiting for send completion */
1457 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1458 	if (ret < 0) {
1459 		log(LOG_ERR, "poll error %d\n", ret);
1460 		return;
1461 	}
1462 	if (wc.status) {
1463 		log(LOG_ERR, "send completion error %d\n", wc.status);
1464 		return;
1465 	}
1466 
1467 	/* Spin waiting for server's Start STAG/TO/Len */
1468 	while (cb->state < RDMA_WRITE_ADV) {
1469 		krping_cq_event_handler(cb->cq, cb);
1470 	}
1471 
1472 	wlat_test(cb);
1473 }
1474 
1475 static void krping_bw_test_client(struct krping_cb *cb)
1476 {
1477 	struct ib_send_wr *bad_wr;
1478 	struct ib_wc wc;
1479 	int ret;
1480 
1481 	cb->state = RDMA_READ_ADV;
1482 
1483 	/* Send STAG/TO/Len to client */
1484 	if (cb->dma_mr)
1485 		krping_format_send(cb, cb->start_addr, cb->dma_mr);
1486 	else
1487 		krping_format_send(cb, cb->start_addr, cb->start_mr);
1488 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1489 	if (ret) {
1490 		log(LOG_ERR, "post send error %d\n", ret);
1491 		return;
1492 	}
1493 
1494 	/* Spin waiting for send completion */
1495 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1496 	if (ret < 0) {
1497 		log(LOG_ERR, "poll error %d\n", ret);
1498 		return;
1499 	}
1500 	if (wc.status) {
1501 		log(LOG_ERR, "send completion error %d\n", wc.status);
1502 		return;
1503 	}
1504 
1505 	/* Spin waiting for server's Start STAG/TO/Len */
1506 	while (cb->state < RDMA_WRITE_ADV) {
1507 		krping_cq_event_handler(cb->cq, cb);
1508 	}
1509 
1510 	bw_test(cb);
1511 }
1512 
1513 static int krping_connect_client(struct krping_cb *cb)
1514 {
1515 	struct rdma_conn_param conn_param;
1516 	int ret;
1517 
1518 	memset(&conn_param, 0, sizeof conn_param);
1519 	conn_param.responder_resources = 1;
1520 	conn_param.initiator_depth = 1;
1521 	conn_param.retry_count = 10;
1522 
1523 	ret = rdma_connect(cb->cm_id, &conn_param);
1524 	if (ret) {
1525 		log(LOG_ERR, "rdma_connect error %d\n", ret);
1526 		return ret;
1527 	}
1528 
1529 	krping_wait(cb, CONNECTED);
1530 	if (cb->state == ERROR) {
1531 		log(LOG_ERR,  "wait for CONNECTED state %d\n", cb->state);
1532 		return -1;
1533 	}
1534 
1535 	DEBUG_LOG(PFX "rdma_connect successful\n");
1536 	return 0;
1537 }
1538 
1539 static int krping_bind_client(struct krping_cb *cb)
1540 {
1541 	struct sockaddr_in sin;
1542 	int ret;
1543 
1544 	memset(&sin, 0, sizeof(sin));
1545 	sin.sin_len = sizeof sin;
1546 	sin.sin_family = AF_INET;
1547 	sin.sin_addr.s_addr = cb->addr.s_addr;
1548 	sin.sin_port = cb->port;
1549 
1550 	ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin,
1551 				2000);
1552 	if (ret) {
1553 		log(LOG_ERR, "rdma_resolve_addr error %d\n", ret);
1554 		return ret;
1555 	}
1556 
1557 	krping_wait(cb, ROUTE_RESOLVED);
1558 	if (cb->state != ROUTE_RESOLVED) {
1559 		log(LOG_ERR,
1560 		       "addr/route resolution did not resolve: state %d\n",
1561 		       cb->state);
1562 		return EINTR;
1563 	}
1564 
1565 	DEBUG_LOG(PFX "rdma_resolve_addr - rdma_resolve_route successful\n");
1566 	return 0;
1567 }
1568 
1569 static void krping_run_client(struct krping_cb *cb)
1570 {
1571 	struct ib_recv_wr *bad_wr;
1572 	int ret;
1573 
1574 	ret = krping_bind_client(cb);
1575 	if (ret)
1576 		return;
1577 
1578 	ret = krping_setup_qp(cb, cb->cm_id);
1579 	if (ret) {
1580 		log(LOG_ERR, "setup_qp failed: %d\n", ret);
1581 		return;
1582 	}
1583 
1584 	ret = krping_setup_buffers(cb);
1585 	if (ret) {
1586 		log(LOG_ERR, "krping_setup_buffers failed: %d\n", ret);
1587 		goto err1;
1588 	}
1589 
1590 	ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1591 	if (ret) {
1592 		log(LOG_ERR, "ib_post_recv failed: %d\n", ret);
1593 		goto err2;
1594 	}
1595 
1596 	ret = krping_connect_client(cb);
1597 	if (ret) {
1598 		log(LOG_ERR, "connect error %d\n", ret);
1599 		goto err2;
1600 	}
1601 
1602 	if (cb->wlat)
1603 		krping_wlat_test_client(cb);
1604 	else if (cb->rlat)
1605 		krping_rlat_test_client(cb);
1606 	else if (cb->bw)
1607 		krping_bw_test_client(cb);
1608 	else
1609 		krping_test_client(cb);
1610 	rdma_disconnect(cb->cm_id);
1611 err2:
1612 	krping_free_buffers(cb);
1613 err1:
1614 	krping_free_qp(cb);
1615 }
1616 
1617 int krping_doit(char *cmd)
1618 {
1619 	struct krping_cb *cb;
1620 	int op;
1621 	int ret = 0;
1622 	char *optarg;
1623 	unsigned long optint;
1624 	debug = 0;
1625 
1626 	cb = malloc(sizeof(*cb), M_DEVBUF, M_WAITOK);
1627 	if (!cb)
1628 		return ENOMEM;
1629 	bzero(cb, sizeof *cb);
1630 
1631 	mtx_lock(&krping_mutex);
1632 	TAILQ_INSERT_TAIL(&krping_cbs, cb, list);
1633 	mtx_unlock(&krping_mutex);
1634 
1635 	cb->server = -1;
1636 	cb->state = IDLE;
1637 	cb->size = 64;
1638 	cb->txdepth = RPING_SQ_DEPTH;
1639 	mtx_init(&cb->lock, "krping mtx", NULL, MTX_DUPOK|MTX_DEF);
1640 
1641 	while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg,
1642 			      &optint)) != 0) {
1643 		switch (op) {
1644 		case 'a':
1645 			cb->addr_str = optarg;
1646 			DEBUG_LOG(PFX "ipaddr (%s)\n", optarg);
1647 			if (!inet_aton(optarg, &cb->addr)) {
1648 				log(LOG_ERR, "bad addr string %s\n", optarg);
1649 				ret = EINVAL;
1650 			}
1651 			break;
1652 		case 'D':
1653 			cb->use_dmamr = 1;
1654 			DEBUG_LOG(PFX "using dma mr\n");
1655 			break;
1656 		case 'p':
1657 			cb->port = htons(optint);
1658 			DEBUG_LOG(PFX "port %d\n", (int)optint);
1659 			break;
1660 		case 'P':
1661 			cb->poll = 1;
1662 			DEBUG_LOG("server\n");
1663 			break;
1664 		case 's':
1665 			cb->server = 1;
1666 			DEBUG_LOG(PFX "server\n");
1667 			break;
1668 		case 'c':
1669 			cb->server = 0;
1670 			DEBUG_LOG(PFX "client\n");
1671 			break;
1672 		case 'S':
1673 			cb->size = optint;
1674 			if ((cb->size < 1) ||
1675 			    (cb->size > RPING_BUFSIZE)) {
1676 				log(LOG_ERR, "Invalid size %d "
1677 				       "(valid range is 1 to %d)\n",
1678 				       cb->size, RPING_BUFSIZE);
1679 				ret = EINVAL;
1680 			} else
1681 				DEBUG_LOG(PFX "size %d\n", (int)optint);
1682 			break;
1683 		case 'C':
1684 			cb->count = optint;
1685 			if (cb->count < 0) {
1686 				log(LOG_ERR, "Invalid count %d\n",
1687 					cb->count);
1688 				ret = EINVAL;
1689 			} else
1690 				DEBUG_LOG(PFX "count %d\n", (int) cb->count);
1691 			break;
1692 		case 'v':
1693 			cb->verbose++;
1694 			DEBUG_LOG(PFX "verbose\n");
1695 			break;
1696 		case 'V':
1697 			cb->validate++;
1698 			DEBUG_LOG(PFX "validate data\n");
1699 			break;
1700 		case 'L':
1701 			cb->rlat++;
1702 			break;
1703 		case 'l':
1704 			cb->wlat++;
1705 			break;
1706 		case 'B':
1707 			cb->bw++;
1708 			break;
1709 		case 't':
1710 			cb->txdepth = optint;
1711 			DEBUG_LOG(PFX "txdepth %d\n", cb->txdepth);
1712 			break;
1713 		case 'd':
1714 			debug++;
1715 			break;
1716 		default:
1717 			log(LOG_ERR, "unknown opt %s\n", optarg);
1718 			ret = EINVAL;
1719 			break;
1720 		}
1721 	}
1722 	if (ret)
1723 		goto out;
1724 
1725 	if (cb->server == -1) {
1726 		log(LOG_ERR, "must be either client or server\n");
1727 		ret = EINVAL;
1728 		goto out;
1729 	}
1730 	if ((cb->bw + cb->rlat + cb->wlat) > 1) {
1731 		log(LOG_ERR, "Pick only one test: bw, rlat, wlat\n");
1732 		ret = EINVAL;
1733 		goto out;
1734 	}
1735 
1736 
1737 	cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP);
1738 	if (IS_ERR(cb->cm_id)) {
1739 		ret = PTR_ERR(cb->cm_id);
1740 		log(LOG_ERR, "rdma_create_id error %d\n", ret);
1741 		goto out;
1742 	}
1743 	DEBUG_LOG(PFX "created cm_id %p\n", cb->cm_id);
1744 	if (cb->server)
1745 		krping_run_server(cb);
1746 	else
1747 		krping_run_client(cb);
1748 	DEBUG_LOG(PFX "destroy cm_id %p\n", cb->cm_id);
1749 	rdma_destroy_id(cb->cm_id);
1750 out:
1751 	mtx_lock(&krping_mutex);
1752 	TAILQ_REMOVE(&krping_cbs, cb, list);
1753 	mtx_unlock(&krping_mutex);
1754 	free(cb, M_DEVBUF);
1755 	return ret;
1756 }
1757 
1758 void krping_init(void)
1759 {
1760 	mtx_init(&krping_mutex, "krping lock", NULL, MTX_DEF);
1761 	TAILQ_INIT(&krping_cbs);
1762 }
1763