xref: /freebsd/sys/contrib/rdma/krping/krping.c (revision 195ebc7e9e4b129de810833791a19dfb4349d6a9)
1 /*
2  * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
3  * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <sys/ctype.h>
38 
39 #include <sys/param.h>
40 #include <sys/condvar.h>
41 #include <sys/systm.h>
42 #include <sys/kernel.h>
43 #include <sys/socket.h>
44 #include <sys/module.h>
45 #include <sys/endian.h>
46 #include <sys/limits.h>
47 #include <sys/proc.h>
48 #include <sys/signalvar.h>
49 
50 #include <sys/lock.h>
51 #include <sys/mutex.h>
52 #include <sys/rwlock.h>
53 #include <sys/queue.h>
54 #include <sys/taskqueue.h>
55 #include <sys/syslog.h>
56 
57 #include <vm/vm.h>
58 #include <vm/pmap.h>
59 
60 #include <contrib/rdma/rdma_cm.h>
61 
62 #include "getopt.h"
63 #include "krping.h"
64 
65 #define PFX "krping: "
66 
/* Non-zero enables the DEBUG_LOG() trace output below. */
static int debug = 0;

/*
 * printf-style debug trace, emitted only when `debug` is non-zero.
 *
 * The expansion is wrapped in do { } while (0) so that an invocation is
 * exactly one statement: it can be used as the unbraced body of an `if`
 * without capturing that statement's `else`.
 */
#define DEBUG_LOG(...) do {						\
	if (debug)							\
		printf(__VA_ARGS__);					\
} while (0)
69 
70 static const struct krping_option krping_opts[] = {
71 	{"count", OPT_INT, 'C'},
72 	{"size", OPT_INT, 'S'},
73 	{"addr", OPT_STRING, 'a'},
74 	{"port", OPT_INT, 'p'},
75 	{"verbose", OPT_NOPARAM, 'v'},
76 	{"validate", OPT_NOPARAM, 'V'},
77 	{"server", OPT_NOPARAM, 's'},
78 	{"client", OPT_NOPARAM, 'c'},
79 	{"dmamr", OPT_NOPARAM, 'D'},
80 	{"debug", OPT_NOPARAM, 'd'},
81 	{"wlat", OPT_NOPARAM, 'l'},
82 	{"rlat", OPT_NOPARAM, 'L'},
83 	{"bw", OPT_NOPARAM, 'B'},
84 	{"tx-depth", OPT_INT, 't'},
85   	{"poll", OPT_NOPARAM, 'P'},
86 	{NULL, 0, 0}
87 };
88 
89 struct mtx krping_mutex;
90 
91 /*
92  * List of running krping threads.
93  */
94 struct krping_cb_list krping_cbs;
95 
96 /*
97  * krping "ping/pong" loop:
98  * 	client sends source rkey/addr/len
99  *	server receives source rkey/addr/len
100  *	server rdma reads "ping" data from source
101  * 	server sends "go ahead" on rdma read completion
102  *	client sends sink rkey/addr/len
103  * 	server receives sink rkey/addr/len
104  * 	server rdma writes "pong" data to sink
105  * 	server sends "go ahead" on rdma write completion
106  * 	<repeat loop>
107  */
108 
/*
 * Default max buffer size for IO...
 *
 * The value is parenthesized so the macro expands safely inside a larger
 * expression (e.g. x / RPING_BUFSIZE).
 * RPING_SQ_DEPTH: presumably the default send-queue depth -- its users
 * are outside this part of the file.
 */
#define RPING_BUFSIZE (128 * 1024)
#define RPING_SQ_DEPTH 32
114 
115 
116 /* lifted from netinet/libalias/alias_proxy.c */
static int inet_aton(const char *cp, struct in_addr *addr);

/*
 * Parse a textual IPv4 address into network byte order.
 *
 * Accepted forms are "a" (32 bits), "a.b" (8.24 bits), "a.b.c" (8.8.16
 * bits) and "a.b.c.d" (8.8.8.8 bits).  Each number is read by strtoul()
 * with base 0, so decimal, octal (leading 0) and hex (leading 0x) are
 * all recognized.  Parsing stops successfully at the end of the string
 * or at the first whitespace character after a number.
 *
 * cp:   NUL-terminated string to parse.
 * addr: if non-NULL, receives the address (htonl-converted) on success.
 *
 * Returns 1 on success and 0 if the string is not a valid address.
 */
static int
inet_aton(const char *cp, struct in_addr *addr)
{
	unsigned long parts[4];
	unsigned long l;
	in_addr_t val;
	const char *c;
	char *endptr;
	int gotend, n;

	c = cp;
	n = 0;
	val = 0;
	/*
	 * Run through the string, grabbing numbers until
	 * the end of the string, or some error
	 */
	gotend = 0;
	while (!gotend) {
		l = strtoul(c, &endptr, 0);

		/*
		 * endptr == c means no digits were consumed; this rejects
		 * an empty string, a leading or trailing '.', and "1..2".
		 * ULONG_MAX is strtoul()'s overflow indication.
		 */
		if (endptr == c || l == ULONG_MAX)
			return (0);

		/*
		 * in_addr_t holds 32 bits.  Where unsigned long is wider,
		 * reject larger values here instead of letting the cast
		 * below truncate them into something that looks valid.
		 */
		if (l > 0xffffffffUL)
			return (0);

		val = (in_addr_t)l;
		parts[n] = val;
		c = endptr;

		/* Check the next character past the previous number's end */
		switch (*c) {
		case '.':
			/* Make sure we only do 3 dots .. */
			if (n == 3)	/* Whoops. Quit. */
				return (0);
			n++;
			c++;
			break;

		case '\0':
			gotend = 1;
			break;

		default:
			if (isspace((unsigned char)*c)) {
				gotend = 1;
				break;
			} else
				return (0);	/* Invalid character, so fail */
		}
	}

	/*
	 * Concoct the address according to
	 * the number of parts specified.
	 */
	switch (n) {
	case 0:				/* a -- 32 bits */
		/* The 32-bit range check was already done in the loop. */
		break;

	case 1:				/* a.b -- 8.24 bits */
		if (val > 0xffffff || parts[0] > 0xff)
			return (0);
		val |= parts[0] << 24;
		break;

	case 2:				/* a.b.c -- 8.8.16 bits */
		if (val > 0xffff || parts[0] > 0xff || parts[1] > 0xff)
			return (0);
		val |= (parts[0] << 24) | (parts[1] << 16);
		break;

	case 3:				/* a.b.c.d -- 8.8.8.8 bits */
		if (val > 0xff || parts[0] > 0xff || parts[1] > 0xff ||
		    parts[2] > 0xff)
			return (0);
		val |= (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8);
		break;
	}

	if (addr != NULL)
		addr->s_addr = htonl(val);
	return (1);
}
216 
217 
218 static void krping_wait(struct krping_cb *cb, int state)
219 {
220 	int rc;
221 	mtx_lock(&cb->lock);
222 	while (cb->state < state) {
223 		rc = msleep(cb, &cb->lock, 0, "krping", 0);
224 		if (rc && rc != ERESTART) {
225 			cb->state = ERROR;
226 			break;
227 		}
228 	}
229 	mtx_unlock(&cb->lock);
230 }
231 
232 static int krping_cma_event_handler(struct rdma_cm_id *cma_id,
233 				   struct rdma_cm_event *event)
234 {
235 	int ret;
236 	struct krping_cb *cb = cma_id->context;
237 
238 	DEBUG_LOG(PFX "cma_event type %d cma_id %p (%s)\n", event->event, cma_id,
239 		  (cma_id == cb->cm_id) ? "parent" : "child");
240 
241 	mtx_lock(&cb->lock);
242 	switch (event->event) {
243 	case RDMA_CM_EVENT_ADDR_RESOLVED:
244 		cb->state = ADDR_RESOLVED;
245 		ret = rdma_resolve_route(cma_id, 2000);
246 		if (ret) {
247 			log(LOG_ERR, "rdma_resolve_route error %d\n",
248 			       ret);
249 			wakeup(cb);
250 		}
251 		break;
252 
253 	case RDMA_CM_EVENT_ROUTE_RESOLVED:
254 		cb->state = ROUTE_RESOLVED;
255 		wakeup(cb);
256 		break;
257 
258 	case RDMA_CM_EVENT_CONNECT_REQUEST:
259 		cb->state = CONNECT_REQUEST;
260 		cb->child_cm_id = cma_id;
261 		DEBUG_LOG(PFX "child cma %p\n", cb->child_cm_id);
262 		wakeup(cb);
263 		break;
264 
265 	case RDMA_CM_EVENT_ESTABLISHED:
266 		DEBUG_LOG(PFX "ESTABLISHED\n");
267 		if (!cb->server) {
268 			cb->state = CONNECTED;
269 			wakeup(cb);
270 		}
271 		break;
272 
273 	case RDMA_CM_EVENT_ADDR_ERROR:
274 	case RDMA_CM_EVENT_ROUTE_ERROR:
275 	case RDMA_CM_EVENT_CONNECT_ERROR:
276 	case RDMA_CM_EVENT_UNREACHABLE:
277 	case RDMA_CM_EVENT_REJECTED:
278 		log(LOG_ERR, "cma event %d, error %d\n", event->event,
279 		       event->status);
280 		cb->state = ERROR;
281 		wakeup(cb);
282 		break;
283 
284 	case RDMA_CM_EVENT_DISCONNECTED:
285 		DEBUG_LOG(PFX "DISCONNECT EVENT...\n");
286 		cb->state = ERROR;
287 		wakeup(cb);
288 		break;
289 
290 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
291 		DEBUG_LOG(PFX "cma detected device removal!!!!\n");
292 		break;
293 
294 	default:
295 		log(LOG_ERR, "oof bad type!\n");
296 		wakeup(cb);
297 		break;
298 	}
299 	mtx_unlock(&cb->lock);
300 	return 0;
301 }
302 
303 static int server_recv(struct krping_cb *cb, struct ib_wc *wc)
304 {
305 	if (wc->byte_len != sizeof(cb->recv_buf)) {
306 		log(LOG_ERR, "Received bogus data, size %d\n",
307 		       wc->byte_len);
308 		return -1;
309 	}
310 
311 	cb->remote_rkey = ntohl(cb->recv_buf.rkey);
312 	cb->remote_addr = ntohll(cb->recv_buf.buf);
313 	cb->remote_len  = ntohl(cb->recv_buf.size);
314 	DEBUG_LOG(PFX "Received rkey %x addr %llx len %d from peer\n",
315 		  cb->remote_rkey, (unsigned long long)cb->remote_addr,
316 		  cb->remote_len);
317 
318 	if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE)
319 		cb->state = RDMA_READ_ADV;
320 	else
321 		cb->state = RDMA_WRITE_ADV;
322 
323 	return 0;
324 }
325 
326 static int client_recv(struct krping_cb *cb, struct ib_wc *wc)
327 {
328 	if (wc->byte_len != sizeof(cb->recv_buf)) {
329 		log(LOG_ERR, "Received bogus data, size %d\n",
330 		       wc->byte_len);
331 		return -1;
332 	}
333 
334 	if (cb->state == RDMA_READ_ADV)
335 		cb->state = RDMA_WRITE_ADV;
336 	else
337 		cb->state = RDMA_WRITE_COMPLETE;
338 
339 	return 0;
340 }
341 
342 static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
343 {
344 	struct krping_cb *cb = ctx;
345 	struct ib_wc wc;
346 	struct ib_recv_wr *bad_wr;
347 	int ret;
348 
349 	mtx_lock(&cb->lock);
350 	KASSERT(cb->cq == cq, ("bad condition"));
351 	if (cb->state == ERROR) {
352 		log(LOG_ERR,  "cq completion in ERROR state\n");
353 		mtx_unlock(&cb->lock);
354 		return;
355 	}
356 	if (!cb->wlat && !cb->rlat && !cb->bw)
357 		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
358 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
359 		if (wc.status) {
360 			if (wc.status != IB_WC_WR_FLUSH_ERR)
361 				log(LOG_ERR, "cq completion failed status %d\n",
362 					wc.status);
363 			goto error;
364 		}
365 
366 		switch (wc.opcode) {
367 		case IB_WC_SEND:
368 			DEBUG_LOG(PFX "send completion\n");
369 			cb->stats.send_bytes += cb->send_sgl.length;
370 			cb->stats.send_msgs++;
371 			break;
372 
373 		case IB_WC_RDMA_WRITE:
374 			DEBUG_LOG(PFX "rdma write completion\n");
375 			cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length;
376 			cb->stats.write_msgs++;
377 			cb->state = RDMA_WRITE_COMPLETE;
378 			wakeup(cb);
379 			break;
380 
381 		case IB_WC_RDMA_READ:
382 			DEBUG_LOG(PFX "rdma read completion\n");
383 			cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length;
384 			cb->stats.read_msgs++;
385 			cb->state = RDMA_READ_COMPLETE;
386 			wakeup(cb);
387 			break;
388 
389 		case IB_WC_RECV:
390 			DEBUG_LOG(PFX "recv completion\n");
391 			cb->stats.recv_bytes += sizeof(cb->recv_buf);
392 			cb->stats.recv_msgs++;
393 			if (cb->wlat || cb->rlat || cb->bw)
394 				ret = server_recv(cb, &wc);
395 			else
396 				ret = cb->server ? server_recv(cb, &wc) :
397 					   client_recv(cb, &wc);
398 			if (ret) {
399 				log(LOG_ERR, "recv wc error: %d\n", ret);
400 				goto error;
401 			}
402 
403 			ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
404 			if (ret) {
405 				log(LOG_ERR, "post recv error: %d\n",
406 				       ret);
407 				goto error;
408 			}
409 			wakeup(cb);
410 			break;
411 
412 		default:
413 			log(LOG_ERR, "unknown!!!!! completion\n");
414 			goto error;
415 		}
416 	}
417 	if (ret) {
418 		log(LOG_ERR, "poll error %d\n", ret);
419 		goto error;
420 	}
421 	mtx_unlock(&cb->lock);
422 	return;
423 error:
424 	cb->state = ERROR;
425 	wakeup(cb);
426 	mtx_unlock(&cb->lock);
427 }
428 
429 static int krping_accept(struct krping_cb *cb)
430 {
431 	struct rdma_conn_param conn_param;
432 	int ret;
433 
434 	DEBUG_LOG(PFX "accepting client connection request\n");
435 
436 	memset(&conn_param, 0, sizeof conn_param);
437 	conn_param.responder_resources = 1;
438 	conn_param.initiator_depth = 1;
439 
440 	ret = rdma_accept(cb->child_cm_id, &conn_param);
441 	if (ret) {
442 		log(LOG_ERR, "rdma_accept error: %d\n", ret);
443 		return ret;
444 	}
445 
446 	if (!cb->wlat && !cb->rlat && !cb->bw) {
447 		krping_wait(cb, CONNECTED);
448 		if (cb->state == ERROR) {
449 			log(LOG_ERR,  "wait for CONNECTED state %d\n", cb->state);
450 			return -1;
451 		}
452 	}
453 	return 0;
454 }
455 
456 static void krping_setup_wr(struct krping_cb *cb)
457 {
458 	/* XXX X86 only here... not mapping for dma! */
459 	cb->recv_sgl.addr = vtophys(&cb->recv_buf);
460 	cb->recv_sgl.length = sizeof cb->recv_buf;
461 	if (cb->use_dmamr)
462 		cb->recv_sgl.lkey = cb->dma_mr->lkey;
463 	else
464 		cb->recv_sgl.lkey = cb->recv_mr->lkey;
465 	cb->rq_wr.sg_list = &cb->recv_sgl;
466 	cb->rq_wr.num_sge = 1;
467 
468 	cb->send_sgl.addr = vtophys(&cb->send_buf);
469 	cb->send_sgl.length = sizeof cb->send_buf;
470 	if (cb->use_dmamr)
471 		cb->send_sgl.lkey = cb->dma_mr->lkey;
472 	else
473 		cb->send_sgl.lkey = cb->send_mr->lkey;
474 
475 	cb->sq_wr.opcode = IB_WR_SEND;
476 	cb->sq_wr.send_flags = IB_SEND_SIGNALED;
477 	cb->sq_wr.sg_list = &cb->send_sgl;
478 	cb->sq_wr.num_sge = 1;
479 
480 	cb->rdma_addr = vtophys(cb->rdma_buf);
481 	cb->rdma_sgl.addr = cb->rdma_addr;
482 	if (cb->use_dmamr)
483 		cb->rdma_sgl.lkey = cb->dma_mr->lkey;
484 	else
485 		cb->rdma_sgl.lkey = cb->rdma_mr->lkey;
486 	cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED;
487 	cb->rdma_sq_wr.sg_list = &cb->rdma_sgl;
488 	cb->rdma_sq_wr.num_sge = 1;
489 
490 	if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
491 		cb->start_addr = vtophys(cb->start_buf);
492 	}
493 }
494 
495 static int krping_setup_buffers(struct krping_cb *cb)
496 {
497 	int ret;
498 	struct ib_phys_buf buf;
499 	u64 iovbase;
500 
501 	DEBUG_LOG(PFX "krping_setup_buffers called on cb %p\n", cb);
502 
503 	if (cb->use_dmamr) {
504 		cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE|
505 					   IB_ACCESS_REMOTE_READ|
506 				           IB_ACCESS_REMOTE_WRITE);
507 		if (IS_ERR(cb->dma_mr)) {
508 			log(LOG_ERR, "reg_dmamr failed\n");
509 			return PTR_ERR(cb->dma_mr);
510 		}
511 	} else {
512 
513 		buf.addr = vtophys(&cb->recv_buf);
514 		buf.size = sizeof cb->recv_buf;
515 		iovbase = vtophys(&cb->recv_buf);
516 		cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
517 					     IB_ACCESS_LOCAL_WRITE,
518 					     &iovbase);
519 
520 		if (IS_ERR(cb->recv_mr)) {
521 			log(LOG_ERR, "recv_buf reg_mr failed\n");
522 			return PTR_ERR(cb->recv_mr);
523 		}
524 
525 		buf.addr = vtophys(&cb->send_buf);
526 		buf.size = sizeof cb->send_buf;
527 		iovbase = vtophys(&cb->send_buf);
528 		cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
529 					     0, &iovbase);
530 
531 		if (IS_ERR(cb->send_mr)) {
532 			log(LOG_ERR, "send_buf reg_mr failed\n");
533 			ib_dereg_mr(cb->recv_mr);
534 			return PTR_ERR(cb->send_mr);
535 		}
536 	}
537 
538 	cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, -1UL,
539 		PAGE_SIZE, 0);
540 
541 	if (!cb->rdma_buf) {
542 		log(LOG_ERR, "rdma_buf malloc failed\n");
543 		ret = ENOMEM;
544 		goto err1;
545 	}
546 	if (!cb->use_dmamr) {
547 
548 		buf.addr = vtophys(cb->rdma_buf);
549 		buf.size = cb->size;
550 		iovbase = vtophys(cb->rdma_buf);
551 		cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
552 					     IB_ACCESS_REMOTE_READ|
553 					     IB_ACCESS_REMOTE_WRITE,
554 					     &iovbase);
555 
556 		if (IS_ERR(cb->rdma_mr)) {
557 			log(LOG_ERR, "rdma_buf reg_mr failed\n");
558 			ret = PTR_ERR(cb->rdma_mr);
559 			goto err2;
560 		}
561 	}
562 
563 	if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
564 		cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK,
565 			0, -1UL, PAGE_SIZE, 0);
566 		if (!cb->start_buf) {
567 			log(LOG_ERR, "start_buf malloc failed\n");
568 			ret = ENOMEM;
569 			goto err2;
570 		}
571 		if (!cb->use_dmamr) {
572 			unsigned flags = IB_ACCESS_REMOTE_READ;
573 
574 			if (cb->wlat || cb->rlat || cb->bw)
575 				flags |= IB_ACCESS_REMOTE_WRITE;
576 			buf.addr = vtophys(cb->start_buf);
577 			buf.size = cb->size;
578 			iovbase = vtophys(cb->start_buf);
579 			cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
580 					     flags,
581 					     &iovbase);
582 
583 			if (IS_ERR(cb->start_mr)) {
584 				log(LOG_ERR, "start_buf reg_mr failed\n");
585 				ret = PTR_ERR(cb->start_mr);
586 				goto err3;
587 			}
588 		}
589 	}
590 
591 	krping_setup_wr(cb);
592 	DEBUG_LOG(PFX "allocated & registered buffers...\n");
593 	return 0;
594 err3:
595 	contigfree(cb->start_buf, cb->size, M_DEVBUF);
596 
597 	if (!cb->use_dmamr)
598 		ib_dereg_mr(cb->rdma_mr);
599 err2:
600 	contigfree(cb->rdma_buf, cb->size, M_DEVBUF);
601 err1:
602 	if (cb->use_dmamr)
603 		ib_dereg_mr(cb->dma_mr);
604 	else {
605 		ib_dereg_mr(cb->recv_mr);
606 		ib_dereg_mr(cb->send_mr);
607 	}
608 	return ret;
609 }
610 
611 static void krping_free_buffers(struct krping_cb *cb)
612 {
613 	DEBUG_LOG(PFX "krping_free_buffers called on cb %p\n", cb);
614 
615 #if 0
616 	dma_unmap_single(cb->pd->device->dma_device,
617 			 pci_unmap_addr(cb, recv_mapping),
618 			 sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
619 	dma_unmap_single(cb->pd->device->dma_device,
620 			 pci_unmap_addr(cb, send_mapping),
621 			 sizeof(cb->send_buf), DMA_BIDIRECTIONAL);
622 	dma_unmap_single(cb->pd->device->dma_device,
623 			 pci_unmap_addr(cb, rdma_mapping),
624 			 cb->size, DMA_BIDIRECTIONAL);
625 #endif
626 	contigfree(cb->rdma_buf, cb->size, M_DEVBUF);
627 	if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
628 #if 0
629 		dma_unmap_single(cb->pd->device->dma_device,
630 			 pci_unmap_addr(cb, start_mapping),
631 			 cb->size, DMA_BIDIRECTIONAL);
632 #endif
633 		contigfree(cb->start_buf, cb->size, M_DEVBUF);
634 	}
635 	if (cb->use_dmamr)
636 		ib_dereg_mr(cb->dma_mr);
637 	else {
638 		ib_dereg_mr(cb->send_mr);
639 		ib_dereg_mr(cb->recv_mr);
640 		ib_dereg_mr(cb->rdma_mr);
641 		if (!cb->server)
642 			ib_dereg_mr(cb->start_mr);
643 	}
644 }
645 
646 static int krping_create_qp(struct krping_cb *cb)
647 {
648 	struct ib_qp_init_attr init_attr;
649 	int ret;
650 
651 	memset(&init_attr, 0, sizeof(init_attr));
652 	init_attr.cap.max_send_wr = cb->txdepth;
653 	init_attr.cap.max_recv_wr = 2;
654 	init_attr.cap.max_recv_sge = 1;
655 	init_attr.cap.max_send_sge = 1;
656 	init_attr.qp_type = IB_QPT_RC;
657 	init_attr.send_cq = cb->cq;
658 	init_attr.recv_cq = cb->cq;
659 
660 	if (cb->server) {
661 		ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr);
662 		if (!ret)
663 			cb->qp = cb->child_cm_id->qp;
664 	} else {
665 		ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr);
666 		if (!ret)
667 			cb->qp = cb->cm_id->qp;
668 	}
669 
670 	return ret;
671 }
672 
673 static void krping_free_qp(struct krping_cb *cb)
674 {
675 	ib_destroy_qp(cb->qp);
676 	ib_destroy_cq(cb->cq);
677 	ib_dealloc_pd(cb->pd);
678 }
679 
680 static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id)
681 {
682 	int ret;
683 	cb->pd = ib_alloc_pd(cm_id->device);
684 	if (IS_ERR(cb->pd)) {
685 		log(LOG_ERR, "ib_alloc_pd failed\n");
686 		return PTR_ERR(cb->pd);
687 	}
688 	DEBUG_LOG(PFX "created pd %p\n", cb->pd);
689 
690 	cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL,
691 			      cb, cb->txdepth * 2, 0);
692 	if (IS_ERR(cb->cq)) {
693 		log(LOG_ERR, "ib_create_cq failed\n");
694 		ret = PTR_ERR(cb->cq);
695 		goto err1;
696 	}
697 	DEBUG_LOG(PFX "created cq %p\n", cb->cq);
698 
699 	if (!cb->wlat && !cb->rlat && !cb->bw) {
700 		ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
701 		if (ret) {
702 			log(LOG_ERR, "ib_create_cq failed\n");
703 			goto err2;
704 		}
705 	}
706 
707 	ret = krping_create_qp(cb);
708 	if (ret) {
709 		log(LOG_ERR, "krping_create_qp failed: %d\n", ret);
710 		goto err2;
711 	}
712 	DEBUG_LOG(PFX "created qp %p\n", cb->qp);
713 	return 0;
714 err2:
715 	ib_destroy_cq(cb->cq);
716 err1:
717 	ib_dealloc_pd(cb->pd);
718 	return ret;
719 }
720 
721 static void krping_format_send(struct krping_cb *cb, u64 buf,
722 			       struct ib_mr *mr)
723 {
724 	struct krping_rdma_info *info = &cb->send_buf;
725 
726 	info->buf = htonll(buf);
727 	info->rkey = htonl(mr->rkey);
728 	info->size = htonl(cb->size);
729 
730 	DEBUG_LOG(PFX "RDMA addr %llx rkey %x len %d\n",
731 		  (unsigned long long)buf, mr->rkey, cb->size);
732 }
733 
734 static void krping_test_server(struct krping_cb *cb)
735 {
736 	struct ib_send_wr *bad_wr;
737 	int ret;
738 
739 	while (1) {
740 		/* Wait for client's Start STAG/TO/Len */
741 		krping_wait(cb, RDMA_READ_ADV);
742 		if (cb->state != RDMA_READ_ADV) {
743 			DEBUG_LOG(PFX "wait for RDMA_READ_ADV state %d\n",
744 				cb->state);
745 			break;
746 		}
747 
748 		DEBUG_LOG(PFX "server received sink adv\n");
749 
750 		/* Issue RDMA Read. */
751 		cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
752 		cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
753 		cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
754 		cb->rdma_sq_wr.sg_list->length = cb->remote_len;
755 
756 		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
757 		if (ret) {
758 			log(LOG_ERR, "post send error %d\n", ret);
759 			break;
760 		}
761 		DEBUG_LOG(PFX "server posted rdma read req \n");
762 
763 		/* Wait for read completion */
764 		krping_wait(cb, RDMA_READ_COMPLETE);
765 		if (cb->state != RDMA_READ_COMPLETE) {
766 			log(LOG_ERR,
767 			       "wait for RDMA_READ_COMPLETE state %d\n",
768 			       cb->state);
769 			break;
770 		}
771 		DEBUG_LOG(PFX "server received read complete\n");
772 
773 		/* Display data in recv buf */
774 		if (cb->verbose)
775 			DEBUG_LOG("server ping data: %s\n", cb->rdma_buf);
776 
777 		/* Tell client to continue */
778 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
779 		if (ret) {
780 			log(LOG_ERR, "post send error %d\n", ret);
781 			break;
782 		}
783 		DEBUG_LOG(PFX "server posted go ahead\n");
784 
785 		/* Wait for client's RDMA STAG/TO/Len */
786 		krping_wait(cb, RDMA_WRITE_ADV);
787 		if (cb->state != RDMA_WRITE_ADV) {
788 			log(LOG_ERR,
789 			       "wait for RDMA_WRITE_ADV state %d\n",
790 			       cb->state);
791 			break;
792 		}
793 		DEBUG_LOG(PFX "server received sink adv\n");
794 
795 		/* RDMA Write echo data */
796 		cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
797 		cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
798 		cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
799 		cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1;
800 		DEBUG_LOG(PFX "rdma write from lkey %x laddr %llx len %d\n",
801 			  cb->rdma_sq_wr.sg_list->lkey,
802 			  (unsigned long long)cb->rdma_sq_wr.sg_list->addr,
803 			  cb->rdma_sq_wr.sg_list->length);
804 
805 		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
806 		if (ret) {
807 			log(LOG_ERR, "post send error %d\n", ret);
808 			break;
809 		}
810 
811 		/* Wait for completion */
812 		krping_wait(cb, RDMA_WRITE_COMPLETE);
813 		if (cb->state != RDMA_WRITE_COMPLETE) {
814 			log(LOG_ERR,
815 			       "wait for RDMA_WRITE_COMPLETE state %d\n",
816 			       cb->state);
817 			break;
818 		}
819 		DEBUG_LOG(PFX "server rdma write complete \n");
820 
821 		cb->state = CONNECTED;
822 
823 		/* Tell client to begin again */
824 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
825 		if (ret) {
826 			log(LOG_ERR, "post send error %d\n", ret);
827 			break;
828 		}
829 		DEBUG_LOG(PFX "server posted go ahead\n");
830 	}
831 }
832 
833 static void rlat_test(struct krping_cb *cb)
834 {
835 	int scnt;
836 	int iters = cb->count;
837 	struct timeval start_tv, stop_tv;
838 	int ret;
839 	struct ib_wc wc;
840 	struct ib_send_wr *bad_wr;
841 	int ne;
842 
843 	scnt = 0;
844 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
845 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
846 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
847 	cb->rdma_sq_wr.sg_list->length = cb->size;
848 
849 	microtime(&start_tv);
850  	if (!cb->poll) {
851  		cb->state = RDMA_READ_ADV;
852  		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
853  	}
854 	while (scnt < iters) {
855 
856  		cb->state = RDMA_READ_ADV;
857 		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
858 		if (ret) {
859 			log(LOG_ERR,
860 				"Couldn't post send: ret=%d scnt %d\n",
861 				ret, scnt);
862 			return;
863 		}
864 
865 		do {
866 			if (!cb->poll) {
867 				krping_wait(cb, RDMA_READ_COMPLETE);
868 				if (cb->state == RDMA_READ_COMPLETE) {
869 					ne = 1;
870 					ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
871 				} else {
872 					ne = -1;
873 				}
874 			} else
875 				ne = ib_poll_cq(cb->cq, 1, &wc);
876 			if (cb->state == ERROR) {
877 				log(LOG_ERR,
878 				       "state == ERROR...bailing scnt %d\n", scnt);
879 				return;
880 			}
881 		} while (ne == 0);
882 
883 		if (ne < 0) {
884 			log(LOG_ERR, "poll CQ failed %d\n", ne);
885 			return;
886 		}
887  		if (cb->poll && wc.status != IB_WC_SUCCESS) {
888 			log(LOG_ERR, "Completion wth error at %s:\n",
889 				cb->server ? "server" : "client");
890 			log(LOG_ERR, "Failed status %d: wr_id %d\n",
891 				wc.status, (int) wc.wr_id);
892 			return;
893 		}
894 		++scnt;
895 	}
896 	microtime(&stop_tv);
897 
898         if (stop_tv.tv_usec < start_tv.tv_usec) {
899                 stop_tv.tv_usec += 1000000;
900                 stop_tv.tv_sec  -= 1;
901         }
902 
903 	log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d\n",
904 		stop_tv.tv_sec - start_tv.tv_sec,
905 		stop_tv.tv_usec - start_tv.tv_usec,
906 		scnt, cb->size);
907 }
908 
909 static int alloc_cycle_mem(int cycle_iters,
910 				cycles_t **post_cycles_start,
911 				cycles_t **post_cycles_stop,
912 				cycles_t **poll_cycles_start,
913 				cycles_t **poll_cycles_stop,
914 				cycles_t **last_poll_cycles_start)
915 {
916 	*post_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
917 	if (!*post_cycles_start) {
918 		goto fail1;
919 	}
920 	*post_cycles_stop = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
921 	if (!*post_cycles_stop) {
922 		goto fail2;
923 	}
924 	*poll_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
925 	if (!*poll_cycles_start) {
926 		goto fail3;
927 	}
928 	*poll_cycles_stop = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
929 	if (!*poll_cycles_stop) {
930 		goto fail4;
931 	}
932 	*last_poll_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
933 	if (!*last_poll_cycles_start) {
934 		goto fail5;
935 	}
936 	return 0;
937 fail5:
938 	free(*poll_cycles_stop, M_DEVBUF);
939 fail4:
940 	free(*poll_cycles_start, M_DEVBUF);
941 fail3:
942 	free(*post_cycles_stop, M_DEVBUF);
943 fail2:
944 	free(*post_cycles_start, M_DEVBUF);
945 fail1:
946 	log(LOG_ERR, "%s malloc failed\n", __FUNCTION__);
947 	return ENOMEM;
948 }
949 
950 static void free_cycle_mem(cycles_t *post_cycles_start,
951 				cycles_t *post_cycles_stop,
952 				cycles_t *poll_cycles_start,
953 				cycles_t *poll_cycles_stop,
954 				cycles_t *last_poll_cycles_start)
955 {
956 	free(last_poll_cycles_start, M_DEVBUF);
957 	free(poll_cycles_stop, M_DEVBUF);
958 	free(poll_cycles_start, M_DEVBUF);
959 	free(post_cycles_stop, M_DEVBUF);
960 	free(post_cycles_start, M_DEVBUF);
961 }
962 
963 static void wlat_test(struct krping_cb *cb)
964 {
965 	int ccnt, scnt, rcnt;
966 	int iters=cb->count;
967 	volatile char *poll_buf = (char *) cb->start_buf;
968 	char *buf = (char *)cb->rdma_buf;
969 	ccnt = 0;
970 	scnt = 0;
971 	rcnt = 0;
972 	struct timeval start_tv, stop_tv;
973 	cycles_t *post_cycles_start, *post_cycles_stop;
974 	cycles_t *poll_cycles_start, *poll_cycles_stop;
975 	cycles_t *last_poll_cycles_start;
976 	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
977 	int i;
978 	int cycle_iters = 1000;
979 	int err;
980 
981 	err = alloc_cycle_mem(cycle_iters, &post_cycles_start, &post_cycles_stop,
982 				&poll_cycles_start, &poll_cycles_stop, &last_poll_cycles_start);
983 
984 	if (err) {
985 		log(LOG_ERR, "%s malloc failed\n", __FUNCTION__);
986 		return;
987 	}
988 
989 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
990 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
991 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
992 	cb->rdma_sq_wr.sg_list->length = cb->size;
993 
994 	if (cycle_iters > iters)
995 		cycle_iters = iters;
996 	microtime(&start_tv);
997 	while (scnt < iters || ccnt < iters || rcnt < iters) {
998 
999 		/* Wait till buffer changes. */
1000 		if (rcnt < iters && !(scnt < 1 && !cb->server)) {
1001 			++rcnt;
1002 			while (*poll_buf != (char)rcnt) {
1003 				if (cb->state == ERROR) {
1004 					log(LOG_ERR, "state = ERROR, bailing\n");
1005 					return;
1006 				}
1007 			}
1008 		}
1009 
1010 		if (scnt < iters) {
1011 			struct ib_send_wr *bad_wr;
1012 
1013 			*buf = (char)scnt+1;
1014 			if (scnt < cycle_iters)
1015 				post_cycles_start[scnt] = get_cycles();
1016 			if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1017 				log(LOG_ERR,  "Couldn't post send: scnt=%d\n",
1018 					scnt);
1019 				return;
1020 			}
1021 			if (scnt < cycle_iters)
1022 				post_cycles_stop[scnt] = get_cycles();
1023 			scnt++;
1024 		}
1025 
1026 		if (ccnt < iters) {
1027 			struct ib_wc wc;
1028 			int ne;
1029 
1030 			if (ccnt < cycle_iters)
1031 				poll_cycles_start[ccnt] = get_cycles();
1032 			do {
1033 				if (ccnt < cycle_iters)
1034 					last_poll_cycles_start[ccnt] = get_cycles();
1035 				ne = ib_poll_cq(cb->cq, 1, &wc);
1036 			} while (ne == 0);
1037 			if (ccnt < cycle_iters)
1038 				poll_cycles_stop[ccnt] = get_cycles();
1039 			++ccnt;
1040 
1041 			if (ne < 0) {
1042 				log(LOG_ERR, "poll CQ failed %d\n", ne);
1043 				return;
1044 			}
1045 			if (wc.status != IB_WC_SUCCESS) {
1046 				log(LOG_ERR, "Completion wth error at %s:\n",
1047 					cb->server ? "server" : "client");
1048 				log(LOG_ERR, "Failed status %d: wr_id %d\n",
1049 					wc.status, (int) wc.wr_id);
1050 				log(LOG_ERR, "scnt=%d, rcnt=%d, ccnt=%d\n",
1051 					scnt, rcnt, ccnt);
1052 				return;
1053 			}
1054 		}
1055 	}
1056 	microtime(&stop_tv);
1057 
1058         if (stop_tv.tv_usec < start_tv.tv_usec) {
1059                 stop_tv.tv_usec += 1000000;
1060                 stop_tv.tv_sec  -= 1;
1061         }
1062 
1063 	for (i=0; i < cycle_iters; i++) {
1064 		sum_post += post_cycles_stop[i] - post_cycles_start[i];
1065 		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1066 		sum_last_poll += poll_cycles_stop[i] - last_poll_cycles_start[i];
1067 	}
1068 
1069 	log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d cycle_iters %d sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1070 		stop_tv.tv_sec - start_tv.tv_sec,
1071 		stop_tv.tv_usec - start_tv.tv_usec,
1072 		scnt, cb->size, cycle_iters,
1073 		(unsigned long long)sum_post, (unsigned long long)sum_poll,
1074 		(unsigned long long)sum_last_poll);
1075 
1076 	free_cycle_mem(post_cycles_start, post_cycles_stop, poll_cycles_start,
1077 			poll_cycles_stop, last_poll_cycles_start);
1078 }
1079 
1080 static void bw_test(struct krping_cb *cb)
1081 {
1082 	int ccnt, scnt, rcnt;
1083 	int iters=cb->count;
1084 	ccnt = 0;
1085 	scnt = 0;
1086 	rcnt = 0;
1087 	struct timeval start_tv, stop_tv;
1088 	cycles_t *post_cycles_start, *post_cycles_stop;
1089 	cycles_t *poll_cycles_start, *poll_cycles_stop;
1090 	cycles_t *last_poll_cycles_start;
1091 	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1092 	int i;
1093 	int cycle_iters = 1000;
1094 	int err;
1095 
1096 	err = alloc_cycle_mem(cycle_iters, &post_cycles_start, &post_cycles_stop,
1097 				&poll_cycles_start, &poll_cycles_stop, &last_poll_cycles_start);
1098 
1099 	if (err) {
1100 		log(LOG_ERR, "%s kmalloc failed\n", __FUNCTION__);
1101 		return;
1102 	}
1103 
1104 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1105 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1106 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1107 	cb->rdma_sq_wr.sg_list->length = cb->size;
1108 
1109 	if (cycle_iters > iters)
1110 		cycle_iters = iters;
1111 	microtime(&start_tv);
1112 	while (scnt < iters || ccnt < iters) {
1113 
1114 		while (scnt < iters && scnt - ccnt < cb->txdepth) {
1115 			struct ib_send_wr *bad_wr;
1116 
1117 			if (scnt < cycle_iters)
1118 				post_cycles_start[scnt] = get_cycles();
1119 			if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1120 				log(LOG_ERR,  "Couldn't post send: scnt=%d\n",
1121 					scnt);
1122 				return;
1123 			}
1124 			if (scnt < cycle_iters)
1125 				post_cycles_stop[scnt] = get_cycles();
1126 			++scnt;
1127 		}
1128 
1129 		if (ccnt < iters) {
1130 			int ne;
1131 			struct ib_wc wc;
1132 
1133 			if (ccnt < cycle_iters)
1134 				poll_cycles_start[ccnt] = get_cycles();
1135 			do {
1136 				if (ccnt < cycle_iters)
1137 					last_poll_cycles_start[ccnt] = get_cycles();
1138 				ne = ib_poll_cq(cb->cq, 1, &wc);
1139 			} while (ne == 0);
1140 			if (ccnt < cycle_iters)
1141 				poll_cycles_stop[ccnt] = get_cycles();
1142 			ccnt += 1;
1143 
1144 			if (ne < 0) {
1145 				log(LOG_ERR, "poll CQ failed %d\n", ne);
1146 				return;
1147 			}
1148 			if (wc.status != IB_WC_SUCCESS) {
1149 				log(LOG_ERR, "Completion wth error at %s:\n",
1150 					cb->server ? "server" : "client");
1151 				log(LOG_ERR, "Failed status %d: wr_id %d\n",
1152 					wc.status, (int) wc.wr_id);
1153 				return;
1154 			}
1155 		}
1156 	}
1157 	microtime(&stop_tv);
1158 
1159         if (stop_tv.tv_usec < start_tv.tv_usec) {
1160                 stop_tv.tv_usec += 1000000;
1161                 stop_tv.tv_sec  -= 1;
1162         }
1163 
1164 	for (i=0; i < cycle_iters; i++) {
1165 		sum_post += post_cycles_stop[i] - post_cycles_start[i];
1166 		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1167 		sum_last_poll += poll_cycles_stop[i] - last_poll_cycles_start[i];
1168 	}
1169 
1170 	log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d cycle_iters %d sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1171 		stop_tv.tv_sec - start_tv.tv_sec,
1172 		stop_tv.tv_usec - start_tv.tv_usec,
1173 		scnt, cb->size, cycle_iters,
1174 		(unsigned long long)sum_post, (unsigned long long)sum_poll,
1175 		(unsigned long long)sum_last_poll);
1176 
1177 	free_cycle_mem(post_cycles_start, post_cycles_stop, poll_cycles_start,
1178 			poll_cycles_stop, last_poll_cycles_start);
1179 }
1180 
1181 static void krping_rlat_test_server(struct krping_cb *cb)
1182 {
1183 	struct ib_send_wr *bad_wr;
1184 	struct ib_wc wc;
1185 	int ret;
1186 
1187 	/* Spin waiting for client's Start STAG/TO/Len */
1188 	while (cb->state < RDMA_READ_ADV) {
1189 		krping_cq_event_handler(cb->cq, cb);
1190 	}
1191 
1192 	/* Send STAG/TO/Len to client */
1193 	if (cb->dma_mr)
1194 		krping_format_send(cb, cb->start_addr, cb->dma_mr);
1195 	else
1196 		krping_format_send(cb, cb->start_addr, cb->start_mr);
1197 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1198 	if (ret) {
1199 		log(LOG_ERR, "post send error %d\n", ret);
1200 		return;
1201 	}
1202 
1203 	/* Spin waiting for send completion */
1204 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1205 	if (ret < 0) {
1206 		log(LOG_ERR, "poll error %d\n", ret);
1207 		return;
1208 	}
1209 	if (wc.status) {
1210 		log(LOG_ERR, "send completiong error %d\n", wc.status);
1211 		return;
1212 	}
1213 
1214 	krping_wait(cb, ERROR);
1215 }
1216 
1217 static void krping_wlat_test_server(struct krping_cb *cb)
1218 {
1219 	struct ib_send_wr *bad_wr;
1220 	struct ib_wc wc;
1221 	int ret;
1222 
1223 	/* Spin waiting for client's Start STAG/TO/Len */
1224 	while (cb->state < RDMA_READ_ADV) {
1225 		krping_cq_event_handler(cb->cq, cb);
1226 	}
1227 
1228 	/* Send STAG/TO/Len to client */
1229 	if (cb->dma_mr)
1230 		krping_format_send(cb, cb->start_addr, cb->dma_mr);
1231 	else
1232 		krping_format_send(cb, cb->start_addr, cb->start_mr);
1233 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1234 	if (ret) {
1235 		log(LOG_ERR, "post send error %d\n", ret);
1236 		return;
1237 	}
1238 
1239 	/* Spin waiting for send completion */
1240 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1241 	if (ret < 0) {
1242 		log(LOG_ERR, "poll error %d\n", ret);
1243 		return;
1244 	}
1245 	if (wc.status) {
1246 		log(LOG_ERR, "send completiong error %d\n", wc.status);
1247 		return;
1248 	}
1249 
1250 	wlat_test(cb);
1251 
1252 }
1253 
1254 static void krping_bw_test_server(struct krping_cb *cb)
1255 {
1256 	struct ib_send_wr *bad_wr;
1257 	struct ib_wc wc;
1258 	int ret;
1259 
1260 	/* Spin waiting for client's Start STAG/TO/Len */
1261 	while (cb->state < RDMA_READ_ADV) {
1262 		krping_cq_event_handler(cb->cq, cb);
1263 	}
1264 
1265 	/* Send STAG/TO/Len to client */
1266 	if (cb->dma_mr)
1267 		krping_format_send(cb, cb->start_addr, cb->dma_mr);
1268 	else
1269 		krping_format_send(cb, cb->start_addr, cb->start_mr);
1270 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1271 	if (ret) {
1272 		log(LOG_ERR, "post send error %d\n", ret);
1273 		return;
1274 	}
1275 
1276 	/* Spin waiting for send completion */
1277 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1278 	if (ret < 0) {
1279 		log(LOG_ERR, "poll error %d\n", ret);
1280 		return;
1281 	}
1282 	if (wc.status) {
1283 		log(LOG_ERR, "send completiong error %d\n", wc.status);
1284 		return;
1285 	}
1286 
1287 	if (cb->duplex)
1288 		bw_test(cb);
1289 	krping_wait(cb, ERROR);
1290 }
1291 
1292 static int krping_bind_server(struct krping_cb *cb)
1293 {
1294 	struct sockaddr_in sin;
1295 	int ret;
1296 
1297 	memset(&sin, 0, sizeof(sin));
1298 	sin.sin_len = sizeof sin;
1299 	sin.sin_family = AF_INET;
1300 	sin.sin_addr.s_addr = cb->addr.s_addr;
1301 	sin.sin_port = cb->port;
1302 
1303 	ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin);
1304 	if (ret) {
1305 		log(LOG_ERR, "rdma_bind_addr error %d\n", ret);
1306 		return ret;
1307 	}
1308 	DEBUG_LOG(PFX "rdma_bind_addr successful\n");
1309 
1310 	DEBUG_LOG(PFX "rdma_listen\n");
1311 	ret = rdma_listen(cb->cm_id, 3);
1312 	if (ret) {
1313 		log(LOG_ERR, "rdma_listen failed: %d\n", ret);
1314 		return ret;
1315 	}
1316 
1317 	krping_wait(cb, CONNECT_REQUEST);
1318 	if (cb->state != CONNECT_REQUEST) {
1319 		log(LOG_ERR,  "wait for CONNECT_REQUEST state %d\n",
1320 			cb->state);
1321 		return -1;
1322 	}
1323 
1324 	return 0;
1325 }
1326 
1327 static void krping_run_server(struct krping_cb *cb)
1328 {
1329 	struct ib_recv_wr *bad_wr;
1330 	int ret;
1331 
1332 	ret = krping_bind_server(cb);
1333 	if (ret)
1334 		return;
1335 
1336 	ret = krping_setup_qp(cb, cb->child_cm_id);
1337 	if (ret) {
1338 		log(LOG_ERR, "setup_qp failed: %d\n", ret);
1339 		return;
1340 	}
1341 
1342 	ret = krping_setup_buffers(cb);
1343 	if (ret) {
1344 		log(LOG_ERR, "krping_setup_buffers failed: %d\n", ret);
1345 		goto err1;
1346 	}
1347 
1348 	ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1349 	if (ret) {
1350 		log(LOG_ERR, "ib_post_recv failed: %d\n", ret);
1351 		goto err2;
1352 	}
1353 
1354 	ret = krping_accept(cb);
1355 	if (ret) {
1356 		log(LOG_ERR, "connect error %d\n", ret);
1357 		goto err2;
1358 	}
1359 
1360 	if (cb->wlat)
1361 		krping_wlat_test_server(cb);
1362 	else if (cb->rlat)
1363 		krping_rlat_test_server(cb);
1364 	else if (cb->bw)
1365 		krping_bw_test_server(cb);
1366 	else
1367 		krping_test_server(cb);
1368 
1369 	rdma_disconnect(cb->child_cm_id);
1370 	rdma_destroy_id(cb->child_cm_id);
1371 err2:
1372 	krping_free_buffers(cb);
1373 err1:
1374 	krping_free_qp(cb);
1375 }
1376 
1377 static void krping_test_client(struct krping_cb *cb)
1378 {
1379 	int ping, start, cc, i, ret;
1380 	struct ib_send_wr *bad_wr;
1381 	unsigned char c;
1382 
1383 	start = 65;
1384 	for (ping = 0; !cb->count || ping < cb->count; ping++) {
1385 		cb->state = RDMA_READ_ADV;
1386 
1387 		/* Put some ascii text in the buffer. */
1388 		cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping);
1389 		for (i = cc, c = start; i < cb->size; i++) {
1390 			cb->start_buf[i] = c;
1391 			c++;
1392 			if (c > 122)
1393 				c = 65;
1394 		}
1395 		start++;
1396 		if (start > 122)
1397 			start = 65;
1398 		cb->start_buf[cb->size - 1] = 0;
1399 
1400 		if (cb->dma_mr)
1401 			krping_format_send(cb, cb->start_addr, cb->dma_mr);
1402 		else
1403 			krping_format_send(cb, cb->start_addr, cb->start_mr);
1404 
1405 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1406 		if (ret) {
1407 			log(LOG_ERR, "post send error %d\n", ret);
1408 			break;
1409 		}
1410 
1411 		/* Wait for server to ACK */
1412 		krping_wait(cb, RDMA_WRITE_ADV);
1413 		if (cb->state != RDMA_WRITE_ADV) {
1414 			log(LOG_ERR,
1415 			       "wait for RDMA_WRITE_ADV state %d\n",
1416 			       cb->state);
1417 			break;
1418 		}
1419 
1420 		if (cb->dma_mr)
1421 			krping_format_send(cb, cb->rdma_addr, cb->dma_mr);
1422 		else
1423 			krping_format_send(cb, cb->rdma_addr, cb->rdma_mr);
1424 
1425 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1426 		if (ret) {
1427 			log(LOG_ERR, "post send error %d\n", ret);
1428 			break;
1429 		}
1430 
1431 		/* Wait for the server to say the RDMA Write is complete. */
1432 		krping_wait(cb, RDMA_WRITE_COMPLETE);
1433 		if (cb->state != RDMA_WRITE_COMPLETE) {
1434 			log(LOG_ERR,
1435 			       "wait for RDMA_WRITE_COMPLETE state %d\n",
1436 			       cb->state);
1437 			break;
1438 		}
1439 
1440 		if (cb->validate)
1441 			if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) {
1442 				log(LOG_ERR, "data mismatch!\n");
1443 				break;
1444 			}
1445 
1446 		if (cb->verbose)
1447 			DEBUG_LOG("ping data: %s\n", cb->rdma_buf);
1448 	}
1449 }
1450 
1451 static void krping_rlat_test_client(struct krping_cb *cb)
1452 {
1453 	struct ib_send_wr *bad_wr;
1454 	struct ib_wc wc;
1455 	int ret;
1456 
1457 	cb->state = RDMA_READ_ADV;
1458 
1459 	/* Send STAG/TO/Len to client */
1460 	if (cb->dma_mr)
1461 		krping_format_send(cb, cb->start_addr, cb->dma_mr);
1462 	else
1463 		krping_format_send(cb, cb->start_addr, cb->rdma_mr);
1464 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1465 	if (ret) {
1466 		log(LOG_ERR, "post send error %d\n", ret);
1467 		return;
1468 	}
1469 
1470 	/* Spin waiting for send completion */
1471 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1472 	if (ret < 0) {
1473 		log(LOG_ERR, "poll error %d\n", ret);
1474 		return;
1475 	}
1476 	if (wc.status) {
1477 		log(LOG_ERR, "send completion error %d\n", wc.status);
1478 		return;
1479 	}
1480 
1481 	/* Spin waiting for server's Start STAG/TO/Len */
1482 	while (cb->state < RDMA_WRITE_ADV) {
1483 		krping_cq_event_handler(cb->cq, cb);
1484 	}
1485 
1486 #if 0
1487 {
1488 	int i;
1489 	struct timeval start, stop;
1490 	time_t sec;
1491 	suseconds_t usec;
1492 	unsigned long long elapsed;
1493 	struct ib_wc wc;
1494 	struct ib_send_wr *bad_wr;
1495 	int ne;
1496 
1497 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1498 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1499 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1500 	cb->rdma_sq_wr.sg_list->length = 0;
1501 	cb->rdma_sq_wr.num_sge = 0;
1502 
1503 	microtime(&start);
1504 	for (i=0; i < 100000; i++) {
1505 		if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1506 			log(LOG_ERR,  "Couldn't post send\n");
1507 			return;
1508 		}
1509 		do {
1510 			ne = ib_poll_cq(cb->cq, 1, &wc);
1511 		} while (ne == 0);
1512 		if (ne < 0) {
1513 			log(LOG_ERR, "poll CQ failed %d\n", ne);
1514 			return;
1515 		}
1516 		if (wc.status != IB_WC_SUCCESS) {
1517 			log(LOG_ERR, "Completion wth error at %s:\n",
1518 				cb->server ? "server" : "client");
1519 			log(LOG_ERR, "Failed status %d: wr_id %d\n",
1520 				wc.status, (int) wc.wr_id);
1521 			return;
1522 		}
1523 	}
1524 	microtime(&stop);
1525 
1526 	if (stop.tv_usec < start.tv_usec) {
1527 		stop.tv_usec += 1000000;
1528 		stop.tv_sec  -= 1;
1529 	}
1530 	sec     = stop.tv_sec - start.tv_sec;
1531 	usec    = stop.tv_usec - start.tv_usec;
1532 	elapsed = sec * 1000000 + usec;
1533 	log(LOG_ERR, "0B-write-lat iters 100000 usec %llu\n", elapsed);
1534 }
1535 #endif
1536 
1537 	rlat_test(cb);
1538 }
1539 
1540 static void krping_wlat_test_client(struct krping_cb *cb)
1541 {
1542 	struct ib_send_wr *bad_wr;
1543 	struct ib_wc wc;
1544 	int ret;
1545 
1546 	cb->state = RDMA_READ_ADV;
1547 
1548 	/* Send STAG/TO/Len to client */
1549 	if (cb->dma_mr)
1550 		krping_format_send(cb, cb->start_addr, cb->dma_mr);
1551 	else
1552 		krping_format_send(cb, cb->start_addr, cb->start_mr);
1553 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1554 	if (ret) {
1555 		log(LOG_ERR, "post send error %d\n", ret);
1556 		return;
1557 	}
1558 
1559 	/* Spin waiting for send completion */
1560 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1561 	if (ret < 0) {
1562 		log(LOG_ERR, "poll error %d\n", ret);
1563 		return;
1564 	}
1565 	if (wc.status) {
1566 		log(LOG_ERR, "send completion error %d\n", wc.status);
1567 		return;
1568 	}
1569 
1570 	/* Spin waiting for server's Start STAG/TO/Len */
1571 	while (cb->state < RDMA_WRITE_ADV) {
1572 		krping_cq_event_handler(cb->cq, cb);
1573 	}
1574 
1575 	wlat_test(cb);
1576 }
1577 
1578 static void krping_bw_test_client(struct krping_cb *cb)
1579 {
1580 	struct ib_send_wr *bad_wr;
1581 	struct ib_wc wc;
1582 	int ret;
1583 
1584 	cb->state = RDMA_READ_ADV;
1585 
1586 	/* Send STAG/TO/Len to client */
1587 	if (cb->dma_mr)
1588 		krping_format_send(cb, cb->start_addr, cb->dma_mr);
1589 	else
1590 		krping_format_send(cb, cb->start_addr, cb->start_mr);
1591 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1592 	if (ret) {
1593 		log(LOG_ERR, "post send error %d\n", ret);
1594 		return;
1595 	}
1596 
1597 	/* Spin waiting for send completion */
1598 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1599 	if (ret < 0) {
1600 		log(LOG_ERR, "poll error %d\n", ret);
1601 		return;
1602 	}
1603 	if (wc.status) {
1604 		log(LOG_ERR, "send completion error %d\n", wc.status);
1605 		return;
1606 	}
1607 
1608 	/* Spin waiting for server's Start STAG/TO/Len */
1609 	while (cb->state < RDMA_WRITE_ADV) {
1610 		krping_cq_event_handler(cb->cq, cb);
1611 	}
1612 
1613 	bw_test(cb);
1614 }
1615 
1616 static int krping_connect_client(struct krping_cb *cb)
1617 {
1618 	struct rdma_conn_param conn_param;
1619 	int ret;
1620 
1621 	memset(&conn_param, 0, sizeof conn_param);
1622 	conn_param.responder_resources = 1;
1623 	conn_param.initiator_depth = 1;
1624 	conn_param.retry_count = 10;
1625 
1626 	ret = rdma_connect(cb->cm_id, &conn_param);
1627 	if (ret) {
1628 		log(LOG_ERR, "rdma_connect error %d\n", ret);
1629 		return ret;
1630 	}
1631 
1632 	krping_wait(cb, CONNECTED);
1633 	if (cb->state == ERROR) {
1634 		log(LOG_ERR,  "wait for CONNECTED state %d\n", cb->state);
1635 		return -1;
1636 	}
1637 
1638 	DEBUG_LOG(PFX "rdma_connect successful\n");
1639 	return 0;
1640 }
1641 
1642 static int krping_bind_client(struct krping_cb *cb)
1643 {
1644 	struct sockaddr_in sin;
1645 	int ret;
1646 
1647 	memset(&sin, 0, sizeof(sin));
1648 	sin.sin_len = sizeof sin;
1649 	sin.sin_family = AF_INET;
1650 	sin.sin_addr.s_addr = cb->addr.s_addr;
1651 	sin.sin_port = cb->port;
1652 
1653 	ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin,
1654 				2000);
1655 	if (ret) {
1656 		log(LOG_ERR, "rdma_resolve_addr error %d\n", ret);
1657 		return ret;
1658 	}
1659 
1660 	krping_wait(cb, ROUTE_RESOLVED);
1661 	if (cb->state != ROUTE_RESOLVED) {
1662 		log(LOG_ERR,
1663 		       "addr/route resolution did not resolve: state %d\n",
1664 		       cb->state);
1665 		return EINTR;
1666 	}
1667 
1668 	DEBUG_LOG(PFX "rdma_resolve_addr - rdma_resolve_route successful\n");
1669 	return 0;
1670 }
1671 
1672 static void krping_run_client(struct krping_cb *cb)
1673 {
1674 	struct ib_recv_wr *bad_wr;
1675 	int ret;
1676 
1677 	ret = krping_bind_client(cb);
1678 	if (ret)
1679 		return;
1680 
1681 	ret = krping_setup_qp(cb, cb->cm_id);
1682 	if (ret) {
1683 		log(LOG_ERR, "setup_qp failed: %d\n", ret);
1684 		return;
1685 	}
1686 
1687 	ret = krping_setup_buffers(cb);
1688 	if (ret) {
1689 		log(LOG_ERR, "krping_setup_buffers failed: %d\n", ret);
1690 		goto err1;
1691 	}
1692 
1693 	ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1694 	if (ret) {
1695 		log(LOG_ERR, "ib_post_recv failed: %d\n", ret);
1696 		goto err2;
1697 	}
1698 
1699 	ret = krping_connect_client(cb);
1700 	if (ret) {
1701 		log(LOG_ERR, "connect error %d\n", ret);
1702 		goto err2;
1703 	}
1704 
1705 	if (cb->wlat)
1706 		krping_wlat_test_client(cb);
1707 	else if (cb->rlat)
1708 		krping_rlat_test_client(cb);
1709 	else if (cb->bw)
1710 		krping_bw_test_client(cb);
1711 	else
1712 		krping_test_client(cb);
1713 	rdma_disconnect(cb->cm_id);
1714 err2:
1715 	krping_free_buffers(cb);
1716 err1:
1717 	krping_free_qp(cb);
1718 }
1719 
1720 int krping_doit(char *cmd)
1721 {
1722 	struct krping_cb *cb;
1723 	int op;
1724 	int ret = 0;
1725 	char *optarg;
1726 	unsigned long optint;
1727 	debug = 0;
1728 
1729 	cb = malloc(sizeof(*cb), M_DEVBUF, M_WAITOK);
1730 	if (!cb)
1731 		return ENOMEM;
1732 	bzero(cb, sizeof *cb);
1733 
1734 	mtx_lock(&krping_mutex);
1735 	TAILQ_INSERT_TAIL(&krping_cbs, cb, list);
1736 	mtx_unlock(&krping_mutex);
1737 
1738 	cb->server = -1;
1739 	cb->state = IDLE;
1740 	cb->size = 64;
1741 	cb->txdepth = RPING_SQ_DEPTH;
1742 	mtx_init(&cb->lock, "krping mtx", NULL, MTX_DUPOK|MTX_DEF);
1743 
1744 	while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg,
1745 			      &optint)) != 0) {
1746 		switch (op) {
1747 		case 'a':
1748 			cb->addr_str = optarg;
1749 			DEBUG_LOG(PFX "ipaddr (%s)\n", optarg);
1750 			if (!inet_aton(optarg, &cb->addr)) {
1751 				log(LOG_ERR, "bad addr string %s\n", optarg);
1752 				ret = EINVAL;
1753 			}
1754 			break;
1755 		case 'D':
1756 			cb->use_dmamr = 1;
1757 			DEBUG_LOG(PFX "using dma mr\n");
1758 			break;
1759 		case 'p':
1760 			cb->port = htons(optint);
1761 			DEBUG_LOG(PFX "port %d\n", (int)optint);
1762 			break;
1763 		case 'P':
1764 			cb->poll = 1;
1765 			DEBUG_LOG("server\n");
1766 			break;
1767 		case 's':
1768 			cb->server = 1;
1769 			DEBUG_LOG(PFX "server\n");
1770 			break;
1771 		case 'c':
1772 			cb->server = 0;
1773 			DEBUG_LOG(PFX "client\n");
1774 			break;
1775 		case 'S':
1776 			cb->size = optint;
1777 			if ((cb->size < 1) ||
1778 			    (cb->size > RPING_BUFSIZE)) {
1779 				log(LOG_ERR, "Invalid size %d "
1780 				       "(valid range is 1 to %d)\n",
1781 				       cb->size, RPING_BUFSIZE);
1782 				ret = EINVAL;
1783 			} else
1784 				DEBUG_LOG(PFX "size %d\n", (int)optint);
1785 			break;
1786 		case 'C':
1787 			cb->count = optint;
1788 			if (cb->count < 0) {
1789 				log(LOG_ERR, "Invalid count %d\n",
1790 					cb->count);
1791 				ret = EINVAL;
1792 			} else
1793 				DEBUG_LOG(PFX "count %d\n", (int) cb->count);
1794 			break;
1795 		case 'v':
1796 			cb->verbose++;
1797 			DEBUG_LOG(PFX "verbose\n");
1798 			break;
1799 		case 'V':
1800 			cb->validate++;
1801 			DEBUG_LOG(PFX "validate data\n");
1802 			break;
1803 		case 'L':
1804 			cb->rlat++;
1805 			break;
1806 		case 'l':
1807 			cb->wlat++;
1808 			break;
1809 		case 'B':
1810 			cb->bw++;
1811 			break;
1812 		case 't':
1813 			cb->txdepth = optint;
1814 			DEBUG_LOG(PFX "txdepth %d\n", cb->txdepth);
1815 			break;
1816 		case 'd':
1817 			debug++;
1818 			break;
1819 		default:
1820 			log(LOG_ERR, "unknown opt %s\n", optarg);
1821 			ret = EINVAL;
1822 			break;
1823 		}
1824 	}
1825 	if (ret)
1826 		goto out;
1827 
1828 	if (cb->server == -1) {
1829 		log(LOG_ERR, "must be either client or server\n");
1830 		ret = EINVAL;
1831 		goto out;
1832 	}
1833 	if ((cb->bw + cb->rlat + cb->wlat) > 1) {
1834 		log(LOG_ERR, "Pick only one test: bw, rlat, wlat\n");
1835 		ret = EINVAL;
1836 		goto out;
1837 	}
1838 
1839 
1840 	cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP);
1841 	if (IS_ERR(cb->cm_id)) {
1842 		ret = PTR_ERR(cb->cm_id);
1843 		log(LOG_ERR, "rdma_create_id error %d\n", ret);
1844 		goto out;
1845 	}
1846 	DEBUG_LOG(PFX "created cm_id %p\n", cb->cm_id);
1847 	if (cb->server)
1848 		krping_run_server(cb);
1849 	else
1850 		krping_run_client(cb);
1851 	DEBUG_LOG(PFX "destroy cm_id %p\n", cb->cm_id);
1852 	rdma_destroy_id(cb->cm_id);
1853 out:
1854 	mtx_lock(&krping_mutex);
1855 	TAILQ_REMOVE(&krping_cbs, cb, list);
1856 	mtx_unlock(&krping_mutex);
1857 	free(cb, M_DEVBUF);
1858 	return ret;
1859 }
1860 
1861 void krping_init(void)
1862 {
1863 	mtx_init(&krping_mutex, "krping lock", NULL, MTX_DEF);
1864 	TAILQ_INIT(&krping_cbs);
1865 }
1866