xref: /freebsd/sys/contrib/rdma/krping/krping.c (revision 40427cca7a9ae77b095936fb1954417c290cfb17)
1 /*
2  * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
3  * Copyright (c) 2006-2009 Open Grid Computing, Inc. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <linux/module.h>
38 #include <linux/moduleparam.h>
39 #include <linux/slab.h>
40 #include <linux/err.h>
41 #include <linux/string.h>
42 #include <linux/list.h>
43 #include <linux/in.h>
44 #include <linux/device.h>
45 #include <linux/pci.h>
46 #include <linux/sched.h>
47 #include <linux/wait.h>
48 
49 #include <asm/atomic.h>
50 
51 #include <rdma/ib_verbs.h>
52 #include <rdma/rdma_cm.h>
53 
54 #include "krping.h"
55 #include "getopt.h"
56 
57 extern int krping_debug;
58 #define DEBUG_LOG(cb, x...) if (krping_debug) log(LOG_INFO, x)
59 #define PRINTF(cb, x...) log(LOG_INFO, x)
60 #define BIND_INFO 1
61 
62 MODULE_AUTHOR("Steve Wise");
63 MODULE_DESCRIPTION("RDMA ping client/server");
64 MODULE_LICENSE("Dual BSD/GPL");
65 MODULE_VERSION(krping, 1);
66 MODULE_DEPEND(krping, linuxkpi, 1, 1, 1);
67 
68 static __inline uint64_t
69 get_cycles(void)
70 {
71 	uint32_t low, high;
72 	__asm __volatile("rdtsc" : "=a" (low), "=d" (high));
73 	return (low | ((u_int64_t)high << 32));
74 }
75 
76 typedef uint64_t cycles_t;
77 
/*
 * Memory registration strategy, selected with the "mem_mode" option.
 */
enum mem_type {
	DMA = 1,	/* single DMA MR for all memory */
	FASTREG = 2,	/* fast-register work requests */
	MW = 3,		/* memory windows bound over an MR */
	MR = 4		/* one registered MR per buffer */
};
84 
/*
 * Long-option table consumed by the getopt helper: maps each option
 * name to its argument type and single-character key.
 */
static const struct krping_option krping_opts[] = {
	{"count", OPT_INT, 'C'},
	{"size", OPT_INT, 'S'},
	{"addr", OPT_STRING, 'a'},
	{"port", OPT_INT, 'p'},
	{"verbose", OPT_NOPARAM, 'v'},
	{"validate", OPT_NOPARAM, 'V'},
	{"server", OPT_NOPARAM, 's'},
	{"client", OPT_NOPARAM, 'c'},
	{"mem_mode", OPT_STRING, 'm'},
	{"server_inv", OPT_NOPARAM, 'I'},
 	{"wlat", OPT_NOPARAM, 'l'},
 	{"rlat", OPT_NOPARAM, 'L'},
 	{"bw", OPT_NOPARAM, 'B'},
 	{"duplex", OPT_NOPARAM, 'd'},
 	{"txdepth", OPT_INT, 'T'},
 	{"poll", OPT_NOPARAM, 'P'},
 	{"local_dma_lkey", OPT_NOPARAM, 'Z'},
 	{"read_inv", OPT_NOPARAM, 'R'},
 	{"fr", OPT_INT, 'f'},
	{NULL, 0, 0}
};
107 
108 #define htonll(x) cpu_to_be64((x))
109 #define ntohll(x) cpu_to_be64((x))
110 
111 static struct mutex krping_mutex;
112 
113 /*
114  * List of running krping threads.
115  */
116 static LIST_HEAD(krping_cbs);
117 
118 /*
119  * krping "ping/pong" loop:
120  * 	client sends source rkey/addr/len
 *	server receives source rkey/addr/len
122  *	server rdma reads "ping" data from source
123  * 	server sends "go ahead" on rdma read completion
124  *	client sends sink rkey/addr/len
125  * 	server receives sink rkey/addr/len
126  * 	server rdma writes "pong" data to sink
127  * 	server sends "go ahead" on rdma write completion
128  * 	<repeat loop>
129  */
130 
131 /*
132  * These states are used to signal events between the completion handler
133  * and the main client or server thread.
134  *
135  * Once CONNECTED, they cycle through RDMA_READ_ADV, RDMA_WRITE_ADV,
136  * and RDMA_WRITE_COMPLETE for each ping.
137  */
enum test_state {
	IDLE = 1,		/* initial state, nothing in flight */
	CONNECT_REQUEST,	/* server: incoming connect request seen */
	ADDR_RESOLVED,		/* client: address resolution done */
	ROUTE_RESOLVED,		/* client: route resolution done */
	CONNECTED,		/* CM connection established */
	RDMA_READ_ADV,		/* peer advertised a read source buffer */
	RDMA_READ_COMPLETE,	/* local RDMA read finished */
	RDMA_WRITE_ADV,		/* peer advertised a write sink buffer */
	RDMA_WRITE_COMPLETE,	/* local RDMA write finished */
	ERROR			/* fatal error; waiters are woken */
};
150 
/*
 * Buffer advertisement exchanged over the send/recv channel.  All
 * fields are in network byte order on the wire (see htonll/ntohll).
 */
struct krping_rdma_info {
	uint64_t buf;	/* buffer address on the advertising side */
	uint32_t rkey;	/* rkey granting remote access to it */
	uint32_t size;	/* buffer length in bytes */
};
156 
157 /*
158  * Default max buffer size for IO...
159  */
160 #define RPING_BUFSIZE 128*1024
161 #define RPING_SQ_DEPTH 64
162 
163 /*
164  * Control block struct.
165  */
struct krping_cb {
	void *cookie;			/* opaque handle for the owner */
	int server;			/* 0 iff client */
	struct ib_cq *cq;		/* single CQ shared by SQ and RQ */
	struct ib_pd *pd;		/* protection domain */
	struct ib_qp *qp;		/* RC queue pair */

	enum mem_type mem;		/* registration mode (see mem_type) */
	struct ib_mr *dma_mr;		/* MR for mem == DMA mode */

	/* FASTREG mode state. */
	struct ib_fast_reg_page_list *page_list;
	int page_list_len;
	struct ib_send_wr fastreg_wr;
	struct ib_send_wr invalidate_wr;
	struct ib_mr *fastreg_mr;
	int server_invalidate;		/* server sends SEND_WITH_INV */
	int read_inv;			/* use RDMA_READ_WITH_INV */
	u8 key;				/* rolling fastreg key byte */

	/* MW mode state. */
	struct ib_mw *mw;
	struct ib_mw_bind bind_attr;

	struct ib_recv_wr rq_wr;	/* recv work request record */
	struct ib_sge recv_sgl;		/* recv single SGE */
	struct krping_rdma_info recv_buf;/* malloc'd buffer */
	u64 recv_dma_addr;
	DECLARE_PCI_UNMAP_ADDR(recv_mapping)
	struct ib_mr *recv_mr;

	struct ib_send_wr sq_wr;	/* send work request record */
	struct ib_sge send_sgl;
	struct krping_rdma_info send_buf;/* single send buf */
	u64 send_dma_addr;
	DECLARE_PCI_UNMAP_ADDR(send_mapping)
	struct ib_mr *send_mr;

	struct ib_send_wr rdma_sq_wr;	/* rdma work request record */
	struct ib_sge rdma_sgl;		/* rdma single SGE */
	char *rdma_buf;			/* used as rdma sink */
	u64  rdma_dma_addr;
	DECLARE_PCI_UNMAP_ADDR(rdma_mapping)
	struct ib_mr *rdma_mr;

	uint32_t remote_rkey;		/* remote guys RKEY */
	uint64_t remote_addr;		/* remote guys TO */
	uint32_t remote_len;		/* remote guys LEN */

	char *start_buf;		/* rdma read src */
	u64  start_dma_addr;
	DECLARE_PCI_UNMAP_ADDR(start_mapping)
	struct ib_mr *start_mr;

	enum test_state state;		/* used for cond/signalling */
	wait_queue_head_t sem;		/* handlers wake the main thread here */
	struct krping_stats stats;	/* byte/message counters */

	uint16_t port;			/* dst port in NBO */
	struct in_addr addr;		/* dst addr in NBO */
	char *addr_str;			/* dst addr string */
	int verbose;			/* verbose logging */
	int count;			/* ping count */
	int size;			/* ping data size */
	int validate;			/* validate ping data */
	int wlat;			/* run wlat test */
	int rlat;			/* run rlat test */
	int bw;				/* run bw test */
	int duplex;			/* run bw full duplex test */
	int poll;			/* poll or block for rlat test */
	int txdepth;			/* SQ depth */
	int local_dma_lkey;		/* use 0 for lkey */
	int frtest;			/* fastreg test */
	int testnum;

	/* CM stuff */
	struct rdma_cm_id *cm_id;	/* connection on client side,*/
					/* listener on server side. */
	struct rdma_cm_id *child_cm_id;	/* connection on server side */
	struct list_head list;		/* linkage on krping_cbs */
};
245 
246 static int krping_cma_event_handler(struct rdma_cm_id *cma_id,
247 				   struct rdma_cm_event *event)
248 {
249 	int ret;
250 	struct krping_cb *cb = cma_id->context;
251 
252 	DEBUG_LOG(cb, "cma_event type %d cma_id %p (%s)\n", event->event,
253 	    cma_id, (cma_id == cb->cm_id) ? "parent" : "child");
254 
255 	switch (event->event) {
256 	case RDMA_CM_EVENT_ADDR_RESOLVED:
257 		cb->state = ADDR_RESOLVED;
258 		ret = rdma_resolve_route(cma_id, 2000);
259 		if (ret) {
260 			PRINTF(cb, "rdma_resolve_route error %d\n", ret);
261 			wake_up_interruptible(&cb->sem);
262 		}
263 		break;
264 
265 	case RDMA_CM_EVENT_ROUTE_RESOLVED:
266 		cb->state = ROUTE_RESOLVED;
267 		cb->child_cm_id = cma_id;
268 		wake_up_interruptible(&cb->sem);
269 		break;
270 
271 	case RDMA_CM_EVENT_CONNECT_REQUEST:
272 		if (cb->state == IDLE) {
273 			cb->state = CONNECT_REQUEST;
274 			cb->child_cm_id = cma_id;
275 		} else {
276 			PRINTF(cb, "Received connection request in wrong state"
277 			    " (%d)\n", cb->state);
278 		}
279 		DEBUG_LOG(cb, "child cma %p\n", cb->child_cm_id);
280 		wake_up_interruptible(&cb->sem);
281 		break;
282 
283 	case RDMA_CM_EVENT_ESTABLISHED:
284 		DEBUG_LOG(cb, "ESTABLISHED\n");
285 		if (!cb->server) {
286 			cb->state = CONNECTED;
287 		}
288 		wake_up_interruptible(&cb->sem);
289 		break;
290 
291 	case RDMA_CM_EVENT_ADDR_ERROR:
292 	case RDMA_CM_EVENT_ROUTE_ERROR:
293 	case RDMA_CM_EVENT_CONNECT_ERROR:
294 	case RDMA_CM_EVENT_UNREACHABLE:
295 	case RDMA_CM_EVENT_REJECTED:
296 		PRINTF(cb, "cma event %d, error %d\n", event->event,
297 		       event->status);
298 		cb->state = ERROR;
299 		wake_up_interruptible(&cb->sem);
300 		break;
301 
302 	case RDMA_CM_EVENT_DISCONNECTED:
303 		PRINTF(cb, "DISCONNECT EVENT...\n");
304 		cb->state = ERROR;
305 		wake_up_interruptible(&cb->sem);
306 		break;
307 
308 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
309 		PRINTF(cb, "cma detected device removal!!!!\n");
310 		break;
311 
312 	default:
313 		PRINTF(cb, "oof bad type!\n");
314 		wake_up_interruptible(&cb->sem);
315 		break;
316 	}
317 	return 0;
318 }
319 
320 static int server_recv(struct krping_cb *cb, struct ib_wc *wc)
321 {
322 	if (wc->byte_len != sizeof(cb->recv_buf)) {
323 		PRINTF(cb, "Received bogus data, size %d\n",
324 		       wc->byte_len);
325 		return -1;
326 	}
327 
328 	cb->remote_rkey = ntohl(cb->recv_buf.rkey);
329 	cb->remote_addr = ntohll(cb->recv_buf.buf);
330 	cb->remote_len  = ntohl(cb->recv_buf.size);
331 	DEBUG_LOG(cb, "Received rkey %x addr %llx len %d from peer\n",
332 		  cb->remote_rkey, (unsigned long long)cb->remote_addr,
333 		  cb->remote_len);
334 
335 	if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE)
336 		cb->state = RDMA_READ_ADV;
337 	else
338 		cb->state = RDMA_WRITE_ADV;
339 
340 	return 0;
341 }
342 
343 static int client_recv(struct krping_cb *cb, struct ib_wc *wc)
344 {
345 	if (wc->byte_len != sizeof(cb->recv_buf)) {
346 		PRINTF(cb, "Received bogus data, size %d\n",
347 		       wc->byte_len);
348 		return -1;
349 	}
350 
351 	if (cb->state == RDMA_READ_ADV)
352 		cb->state = RDMA_WRITE_ADV;
353 	else
354 		cb->state = RDMA_WRITE_COMPLETE;
355 
356 	return 0;
357 }
358 
/*
 * Completion handler for the shared CQ.  Drains all available work
 * completions, updating the statistics, advancing cb->state, and
 * waking the main thread.  Any failed completion other than a flush
 * drives the control block into the ERROR state.
 */
static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
{
	struct krping_cb *cb = ctx;
	struct ib_wc wc;
	struct ib_recv_wr *bad_wr;
	int ret;

	BUG_ON(cb->cq != cq);
	if (cb->state == ERROR) {
		PRINTF(cb, "cq completion in ERROR state\n");
		return;
	}
	/* The special tests poll; otherwise re-arm for the next event. */
	if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest)
		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
		if (wc.status) {
			/* Flush errors are expected during teardown. */
			if (wc.status == IB_WC_WR_FLUSH_ERR) {
				DEBUG_LOG(cb, "cq flushed\n");
				continue;
			} else {
				PRINTF(cb, "cq completion failed with "
				       "wr_id %jx status %d opcode %d vender_err %x\n",
					(uintmax_t)wc.wr_id, wc.status, wc.opcode, wc.vendor_err);
				goto error;
			}
		}

		switch (wc.opcode) {
		case IB_WC_SEND:
			DEBUG_LOG(cb, "send completion\n");
			cb->stats.send_bytes += cb->send_sgl.length;
			cb->stats.send_msgs++;
			break;

		case IB_WC_RDMA_WRITE:
			DEBUG_LOG(cb, "rdma write completion\n");
			cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length;
			cb->stats.write_msgs++;
			cb->state = RDMA_WRITE_COMPLETE;
			wake_up_interruptible(&cb->sem);
			break;

		case IB_WC_RDMA_READ:
			DEBUG_LOG(cb, "rdma read completion\n");
			cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length;
			cb->stats.read_msgs++;
			cb->state = RDMA_READ_COMPLETE;
			wake_up_interruptible(&cb->sem);
			break;

		case IB_WC_RECV:
			DEBUG_LOG(cb, "recv completion\n");
			cb->stats.recv_bytes += sizeof(cb->recv_buf);
			cb->stats.recv_msgs++;
			/* Special tests always use the server-side parser. */
			if (cb->wlat || cb->rlat || cb->bw || cb->frtest)
				ret = server_recv(cb, &wc);
			else
				ret = cb->server ? server_recv(cb, &wc) :
						   client_recv(cb, &wc);
			if (ret) {
				PRINTF(cb, "recv wc error: %d\n", ret);
				goto error;
			}

			/* Replenish the RQ for the next message. */
			ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
			if (ret) {
				PRINTF(cb, "post recv error: %d\n",
				       ret);
				goto error;
			}
			wake_up_interruptible(&cb->sem);
			break;

		default:
			PRINTF(cb,
			       "%s:%d Unexpected opcode %d, Shutting down\n",
			       __func__, __LINE__, wc.opcode);
			goto error;
		}
	}
	if (ret) {
		PRINTF(cb, "poll error %d\n", ret);
		goto error;
	}
	return;
error:
	cb->state = ERROR;
	wake_up_interruptible(&cb->sem);
}
448 
449 static int krping_accept(struct krping_cb *cb)
450 {
451 	struct rdma_conn_param conn_param;
452 	int ret;
453 
454 	DEBUG_LOG(cb, "accepting client connection request\n");
455 
456 	memset(&conn_param, 0, sizeof conn_param);
457 	conn_param.responder_resources = 1;
458 	conn_param.initiator_depth = 1;
459 
460 	ret = rdma_accept(cb->child_cm_id, &conn_param);
461 	if (ret) {
462 		PRINTF(cb, "rdma_accept error: %d\n", ret);
463 		return ret;
464 	}
465 
466 	if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) {
467 		wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
468 		if (cb->state == ERROR) {
469 			PRINTF(cb, "wait for CONNECTED state %d\n",
470 				cb->state);
471 			return -1;
472 		}
473 	}
474 	return 0;
475 }
476 
/*
 * Initialize the static parts of the recv, send, and rdma work
 * requests and their SGEs.  Called once after buffers are registered;
 * per-iteration fields (rkeys, lengths, remote addresses) are filled
 * in later by the test loops.
 */
static void krping_setup_wr(struct krping_cb *cb)
{
	cb->recv_sgl.addr = cb->recv_dma_addr;
	cb->recv_sgl.length = sizeof cb->recv_buf;
	/* lkey source depends on the registration mode. */
	if (cb->local_dma_lkey)
		cb->recv_sgl.lkey = cb->qp->device->local_dma_lkey;
	else if (cb->mem == DMA)
		cb->recv_sgl.lkey = cb->dma_mr->lkey;
	else
		cb->recv_sgl.lkey = cb->recv_mr->lkey;
	cb->rq_wr.sg_list = &cb->recv_sgl;
	cb->rq_wr.num_sge = 1;

	cb->send_sgl.addr = cb->send_dma_addr;
	cb->send_sgl.length = sizeof cb->send_buf;
	if (cb->local_dma_lkey)
		cb->send_sgl.lkey = cb->qp->device->local_dma_lkey;
	else if (cb->mem == DMA)
		cb->send_sgl.lkey = cb->dma_mr->lkey;
	else
		cb->send_sgl.lkey = cb->send_mr->lkey;

	cb->sq_wr.opcode = IB_WR_SEND;
	cb->sq_wr.send_flags = IB_SEND_SIGNALED;
	cb->sq_wr.sg_list = &cb->send_sgl;
	cb->sq_wr.num_sge = 1;

	/* Only sides that issue RDMA ops need the rdma work request. */
	if (cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) {
		cb->rdma_sgl.addr = cb->rdma_dma_addr;
		if (cb->mem == MR)
			cb->rdma_sgl.lkey = cb->rdma_mr->lkey;
		cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED;
		cb->rdma_sq_wr.sg_list = &cb->rdma_sgl;
		cb->rdma_sq_wr.num_sge = 1;
	}

	switch(cb->mem) {
	case FASTREG:

		/*
		 * A chain of 2 WRs, INVALIDATE_MR + FAST_REG_MR.
		 * both unsignaled.  The client uses them to reregister
		 * the rdma buffers with a new key each iteration.
		 */
		cb->fastreg_wr.opcode = IB_WR_FAST_REG_MR;
		cb->fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
		cb->fastreg_wr.wr.fast_reg.length = cb->size;
		cb->fastreg_wr.wr.fast_reg.page_list = cb->page_list;
		cb->fastreg_wr.wr.fast_reg.page_list_len = cb->page_list_len;

		cb->invalidate_wr.next = &cb->fastreg_wr;
		cb->invalidate_wr.opcode = IB_WR_LOCAL_INV;
		break;
	case MW:
		cb->bind_attr.wr_id = 0xabbaabba;
		cb->bind_attr.send_flags = 0; /* unsignaled */
#ifdef BIND_INFO
		cb->bind_attr.bind_info.length = cb->size;
#else
		cb->bind_attr.length = cb->size;
#endif
		break;
	default:
		break;
	}
}
543 
/*
 * Allocate and DMA-map the recv/send control buffers and the rdma
 * sink buffer (plus the start/source buffer when this side issues
 * RDMA ops), then register memory per the selected mem mode.  On any
 * failure, everything created so far is torn down at "bail".
 */
static int krping_setup_buffers(struct krping_cb *cb)
{
	int ret;
	struct ib_phys_buf buf;
	u64 iovbase;

	DEBUG_LOG(cb, "krping_setup_buffers called on cb %p\n", cb);

	cb->recv_dma_addr = ib_dma_map_single(cb->pd->device,
				   &cb->recv_buf,
				   sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
	pci_unmap_addr_set(cb, recv_mapping, cb->recv_dma_addr);
	cb->send_dma_addr = ib_dma_map_single(cb->pd->device,
					   &cb->send_buf, sizeof(cb->send_buf),
					   DMA_BIDIRECTIONAL);
	pci_unmap_addr_set(cb, send_mapping, cb->send_dma_addr);

	if (cb->mem == DMA) {
		/* One MR covers everything in DMA mode. */
		cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE|
					   IB_ACCESS_REMOTE_READ|
				           IB_ACCESS_REMOTE_WRITE);
		if (IS_ERR(cb->dma_mr)) {
			DEBUG_LOG(cb, "reg_dmamr failed\n");
			ret = PTR_ERR(cb->dma_mr);
			goto bail;
		}
	} else {
		/* Otherwise register the control buffers individually,
		 * unless the device-wide local_dma_lkey is used. */
		if (!cb->local_dma_lkey) {
			buf.addr = cb->recv_dma_addr;
			buf.size = sizeof cb->recv_buf;
			DEBUG_LOG(cb, "recv buf dma_addr %jx size %d\n",
			    (uintmax_t)buf.addr, (int)buf.size);
			iovbase = cb->recv_dma_addr;
			cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
						     IB_ACCESS_LOCAL_WRITE,
						     &iovbase);

			if (IS_ERR(cb->recv_mr)) {
				DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
				ret = PTR_ERR(cb->recv_mr);
				goto bail;
			}

			buf.addr = cb->send_dma_addr;
			buf.size = sizeof cb->send_buf;
			DEBUG_LOG(cb, "send buf dma_addr %jx size %d\n",
			    (uintmax_t)buf.addr, (int)buf.size);
			iovbase = cb->send_dma_addr;
			cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
						     0, &iovbase);

			if (IS_ERR(cb->send_mr)) {
				DEBUG_LOG(cb, "send_buf reg_mr failed\n");
				ret = PTR_ERR(cb->send_mr);
				goto bail;
			}
		}
	}

	cb->rdma_buf = kmalloc(cb->size, GFP_KERNEL);
	if (!cb->rdma_buf) {
		DEBUG_LOG(cb, "rdma_buf malloc failed\n");
		ret = -ENOMEM;
		goto bail;
	}

	cb->rdma_dma_addr = ib_dma_map_single(cb->pd->device,
			       cb->rdma_buf, cb->size,
			       DMA_BIDIRECTIONAL);
	pci_unmap_addr_set(cb, rdma_mapping, cb->rdma_dma_addr);
	if (cb->mem != DMA) {
		switch (cb->mem) {
		case FASTREG:
			/* Pages needed to cover cb->size bytes. */
			cb->page_list_len = (((cb->size - 1) & PAGE_MASK) +
				PAGE_SIZE) >> PAGE_SHIFT;
			cb->page_list = ib_alloc_fast_reg_page_list(
						cb->pd->device,
						cb->page_list_len);
			if (IS_ERR(cb->page_list)) {
				DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
				ret = PTR_ERR(cb->page_list);
				goto bail;
			}
			cb->fastreg_mr = ib_alloc_fast_reg_mr(cb->pd,
					cb->page_list->max_page_list_len);
			if (IS_ERR(cb->fastreg_mr)) {
				DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
				ret = PTR_ERR(cb->fastreg_mr);
				goto bail;
			}
			DEBUG_LOG(cb, "fastreg rkey 0x%x page_list %p"
				" page_list_len %u\n", cb->fastreg_mr->rkey,
				cb->page_list, cb->page_list_len);
			break;
		case MW:
			cb->mw = ib_alloc_mw(cb->pd,IB_MW_TYPE_1);
			if (IS_ERR(cb->mw)) {
				DEBUG_LOG(cb, "recv_buf alloc_mw failed\n");
				ret = PTR_ERR(cb->mw);
				goto bail;
			}
			DEBUG_LOG(cb, "mw rkey 0x%x\n", cb->mw->rkey);
			/* MW mode also needs the underlying MR below. */
			/*FALLTHROUGH*/
		case MR:
			buf.addr = cb->rdma_dma_addr;
			buf.size = cb->size;
			iovbase = cb->rdma_dma_addr;
			cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
						IB_ACCESS_LOCAL_WRITE|
					     IB_ACCESS_REMOTE_READ|
					     IB_ACCESS_REMOTE_WRITE,
					     &iovbase);
			if (IS_ERR(cb->rdma_mr)) {
				DEBUG_LOG(cb, "rdma_buf reg_mr failed\n");
				ret = PTR_ERR(cb->rdma_mr);
				goto bail;
			}
			DEBUG_LOG(cb, "rdma buf dma_addr %jx size %d mr rkey 0x%x\n",
				(uintmax_t)buf.addr, (int)buf.size, cb->rdma_mr->rkey);
			break;
		default:
			ret = -EINVAL;
			goto bail;
			break;
		}
	}

	/* Sides that issue RDMA ops also need the start (source) buffer. */
	if (!cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) {

		cb->start_buf = kmalloc(cb->size, GFP_KERNEL);
		if (!cb->start_buf) {
			DEBUG_LOG(cb, "start_buf malloc failed\n");
			ret = -ENOMEM;
			goto bail;
		}

		cb->start_dma_addr = ib_dma_map_single(cb->pd->device,
						   cb->start_buf, cb->size,
						   DMA_BIDIRECTIONAL);
		pci_unmap_addr_set(cb, start_mapping, cb->start_dma_addr);

		if (cb->mem == MR || cb->mem == MW) {
			unsigned flags = IB_ACCESS_REMOTE_READ;

			/* The special tests also write into start_buf. */
			if (cb->wlat || cb->rlat || cb->bw || cb->frtest) {
				flags |= IB_ACCESS_LOCAL_WRITE |
					IB_ACCESS_REMOTE_WRITE;
			}

			buf.addr = cb->start_dma_addr;
			buf.size = cb->size;
			DEBUG_LOG(cb, "start buf dma_addr %jx size %d\n",
				(uintmax_t)buf.addr, (int)buf.size);
			iovbase = cb->start_dma_addr;
			cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
					     flags,
					     &iovbase);

			if (IS_ERR(cb->start_mr)) {
				DEBUG_LOG(cb, "start_buf reg_mr failed\n");
				ret = PTR_ERR(cb->start_mr);
				goto bail;
			}
		}
	}

	krping_setup_wr(cb);
	DEBUG_LOG(cb, "allocated & registered buffers...\n");
	return 0;
bail:
	/* Release everything created so far; IS_ERR guards cover the
	 * member that failed mid-creation. */
	if (cb->fastreg_mr && !IS_ERR(cb->fastreg_mr))
		ib_dereg_mr(cb->fastreg_mr);
	if (cb->mw && !IS_ERR(cb->mw))
		ib_dealloc_mw(cb->mw);
	if (cb->rdma_mr && !IS_ERR(cb->rdma_mr))
		ib_dereg_mr(cb->rdma_mr);
	if (cb->page_list && !IS_ERR(cb->page_list))
		ib_free_fast_reg_page_list(cb->page_list);
	if (cb->dma_mr && !IS_ERR(cb->dma_mr))
		ib_dereg_mr(cb->dma_mr);
	if (cb->recv_mr && !IS_ERR(cb->recv_mr))
		ib_dereg_mr(cb->recv_mr);
	if (cb->send_mr && !IS_ERR(cb->send_mr))
		ib_dereg_mr(cb->send_mr);
	if (cb->rdma_buf)
		kfree(cb->rdma_buf);
	if (cb->start_buf)
		kfree(cb->start_buf);
	return ret;
}
734 
/*
 * Undo krping_setup_buffers(): deregister all MRs/MWs, unmap the DMA
 * mappings, and free the kernel buffers.
 */
static void krping_free_buffers(struct krping_cb *cb)
{
	DEBUG_LOG(cb, "krping_free_buffers called on cb %p\n", cb);

	if (cb->dma_mr)
		ib_dereg_mr(cb->dma_mr);
	if (cb->send_mr)
		ib_dereg_mr(cb->send_mr);
	if (cb->recv_mr)
		ib_dereg_mr(cb->recv_mr);
	if (cb->rdma_mr)
		ib_dereg_mr(cb->rdma_mr);
	if (cb->start_mr)
		ib_dereg_mr(cb->start_mr);
	if (cb->fastreg_mr)
		ib_dereg_mr(cb->fastreg_mr);
	if (cb->mw)
		ib_dealloc_mw(cb->mw);

	/* Unmap only after every registration over the buffers is gone. */
	dma_unmap_single(cb->pd->device->dma_device,
			 pci_unmap_addr(cb, recv_mapping),
			 sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
	dma_unmap_single(cb->pd->device->dma_device,
			 pci_unmap_addr(cb, send_mapping),
			 sizeof(cb->send_buf), DMA_BIDIRECTIONAL);
	dma_unmap_single(cb->pd->device->dma_device,
			 pci_unmap_addr(cb, rdma_mapping),
			 cb->size, DMA_BIDIRECTIONAL);
	kfree(cb->rdma_buf);
	/* start_buf exists only when this side issues RDMA ops. */
	if (cb->start_buf) {
		dma_unmap_single(cb->pd->device->dma_device,
			 pci_unmap_addr(cb, start_mapping),
			 cb->size, DMA_BIDIRECTIONAL);
		kfree(cb->start_buf);
	}
}
771 
772 static int krping_create_qp(struct krping_cb *cb)
773 {
774 	struct ib_qp_init_attr init_attr;
775 	int ret;
776 
777 	memset(&init_attr, 0, sizeof(init_attr));
778 	init_attr.cap.max_send_wr = cb->txdepth;
779 	init_attr.cap.max_recv_wr = 2;
780 	init_attr.cap.max_recv_sge = 1;
781 	init_attr.cap.max_send_sge = 1;
782 	init_attr.qp_type = IB_QPT_RC;
783 	init_attr.send_cq = cb->cq;
784 	init_attr.recv_cq = cb->cq;
785 	init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
786 
787 	if (cb->server) {
788 		ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr);
789 		if (!ret)
790 			cb->qp = cb->child_cm_id->qp;
791 	} else {
792 		ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr);
793 		if (!ret)
794 			cb->qp = cb->cm_id->qp;
795 	}
796 
797 	return ret;
798 }
799 
/*
 * Tear down the QP, CQ, and PD created by krping_setup_qp(), in
 * reverse order of creation.
 */
static void krping_free_qp(struct krping_cb *cb)
{
	ib_destroy_qp(cb->qp);
	ib_destroy_cq(cb->cq);
	ib_dealloc_pd(cb->pd);
}
806 
807 static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id)
808 {
809 	int ret;
810 	cb->pd = ib_alloc_pd(cm_id->device);
811 	if (IS_ERR(cb->pd)) {
812 		PRINTF(cb, "ib_alloc_pd failed\n");
813 		return PTR_ERR(cb->pd);
814 	}
815 	DEBUG_LOG(cb, "created pd %p\n", cb->pd);
816 
817 	strlcpy(cb->stats.name, cb->pd->device->name, sizeof(cb->stats.name));
818 
819 	cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL,
820 			      cb, cb->txdepth * 2, 0);
821 	if (IS_ERR(cb->cq)) {
822 		PRINTF(cb, "ib_create_cq failed\n");
823 		ret = PTR_ERR(cb->cq);
824 		goto err1;
825 	}
826 	DEBUG_LOG(cb, "created cq %p\n", cb->cq);
827 
828 	if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) {
829 		ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
830 		if (ret) {
831 			PRINTF(cb, "ib_create_cq failed\n");
832 			goto err2;
833 		}
834 	}
835 
836 	ret = krping_create_qp(cb);
837 	if (ret) {
838 		PRINTF(cb, "krping_create_qp failed: %d\n", ret);
839 		goto err2;
840 	}
841 	DEBUG_LOG(cb, "created qp %p\n", cb->qp);
842 	return 0;
843 err2:
844 	ib_destroy_cq(cb->cq);
845 err1:
846 	ib_dealloc_pd(cb->pd);
847 	return ret;
848 }
849 
850 /*
851  * return the (possibly rebound) rkey for the rdma buffer.
852  * FASTREG mode: invalidate and rebind via fastreg wr.
853  * MW mode: rebind the MW.
854  * other modes: just return the mr rkey.
855  */
static u32 krping_rdma_rkey(struct krping_cb *cb, u64 buf, int post_inv)
{
	u32 rkey = 0xffffffff;
	u64 p;
	struct ib_send_wr *bad_wr;
	int i;
	int ret;

	switch (cb->mem) {
	case FASTREG:
		/* Invalidate the previous registration's key first. */
		cb->invalidate_wr.ex.invalidate_rkey = cb->fastreg_mr->rkey;

		/*
		 * Update the fastreg key.
		 */
		ib_update_fast_reg_key(cb->fastreg_mr, ++cb->key);
		cb->fastreg_wr.wr.fast_reg.rkey = cb->fastreg_mr->rkey;

		/*
		 * Update the fastreg WR with new buf info.
		 */
		if (buf == (u64)cb->start_dma_addr)
			cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_READ;
		else
			cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
		cb->fastreg_wr.wr.fast_reg.iova_start = buf;
		/* Fill the page list with the buffer's page frames. */
		p = (u64)(buf & PAGE_MASK);
		for (i=0; i < cb->fastreg_wr.wr.fast_reg.page_list_len;
		     i++, p += PAGE_SIZE) {
			cb->page_list->page_list[i] = p;
			DEBUG_LOG(cb, "page_list[%d] 0x%jx\n", i, (uintmax_t)p);
		}

		DEBUG_LOG(cb, "post_inv = %d, fastreg new rkey 0x%x shift %u len %u"
			" iova_start %jx page_list_len %u\n",
			post_inv,
			cb->fastreg_wr.wr.fast_reg.rkey,
			cb->fastreg_wr.wr.fast_reg.page_shift,
			(unsigned)cb->fastreg_wr.wr.fast_reg.length,
			(uintmax_t)cb->fastreg_wr.wr.fast_reg.iova_start,
			cb->fastreg_wr.wr.fast_reg.page_list_len);

		/*
		 * Post the invalidate+fastreg chain, or just the fastreg
		 * WR when the peer will invalidate remotely.
		 */
		if (post_inv)
			ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr);
		else
			ret = ib_post_send(cb->qp, &cb->fastreg_wr, &bad_wr);
		if (ret) {
			PRINTF(cb, "post send error %d\n", ret);
			cb->state = ERROR;
		}
		rkey = cb->fastreg_mr->rkey;
		break;
	case MW:
		/*
		 * Update the MW with new buf info.
		 */
		if (buf == (u64)cb->start_dma_addr) {
#ifdef BIND_INFO
			cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_READ;
			cb->bind_attr.bind_info.mr = cb->start_mr;
#else
			cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_READ;
			cb->bind_attr.mr = cb->start_mr;
#endif
		} else {
#ifdef BIND_INFO
			cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_WRITE;
			cb->bind_attr.bind_info.mr = cb->rdma_mr;
#else
			cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_WRITE;
			cb->bind_attr.mr = cb->rdma_mr;
#endif
		}
#ifdef BIND_INFO
		cb->bind_attr.bind_info.addr = buf;
#else
		cb->bind_attr.addr = buf;
#endif
		DEBUG_LOG(cb, "binding mw rkey 0x%x to buf %jx mr rkey 0x%x\n",
#ifdef BIND_INFO
			cb->mw->rkey, (uintmax_t)buf, cb->bind_attr.bind_info.mr->rkey);
#else
			cb->mw->rkey, buf, cb->bind_attr.mr->rkey);
#endif
		ret = ib_bind_mw(cb->qp, cb->mw, &cb->bind_attr);
		if (ret) {
			PRINTF(cb, "bind mw error %d\n", ret);
			cb->state = ERROR;
		} else
			rkey = cb->mw->rkey;
		break;
	case MR:
		/* Static registrations: pick the MR matching the buffer. */
		if (buf == (u64)cb->start_dma_addr)
			rkey = cb->start_mr->rkey;
		else
			rkey = cb->rdma_mr->rkey;
		break;
	case DMA:
		rkey = cb->dma_mr->rkey;
		break;
	default:
		PRINTF(cb, "%s:%d case ERROR\n", __func__, __LINE__);
		cb->state = ERROR;
		break;
	}
	return rkey;
}
963 
964 static void krping_format_send(struct krping_cb *cb, u64 buf)
965 {
966 	struct krping_rdma_info *info = &cb->send_buf;
967 	u32 rkey;
968 
969 	/*
970 	 * Client side will do fastreg or mw bind before
971 	 * advertising the rdma buffer.  Server side
972 	 * sends have no data.
973 	 */
974 	if (!cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) {
975 		rkey = krping_rdma_rkey(cb, buf, !cb->server_invalidate);
976 		info->buf = htonll(buf);
977 		info->rkey = htonl(rkey);
978 		info->size = htonl(cb->size);
979 		DEBUG_LOG(cb, "RDMA addr %llx rkey %x len %d\n",
980 			  (unsigned long long)buf, rkey, cb->size);
981 	}
982 }
983 
984 static void krping_test_server(struct krping_cb *cb)
985 {
986 	struct ib_send_wr *bad_wr, inv;
987 	int ret;
988 
989 	while (1) {
990 		/* Wait for client's Start STAG/TO/Len */
991 		wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_ADV);
992 		if (cb->state != RDMA_READ_ADV) {
993 			PRINTF(cb, "wait for RDMA_READ_ADV state %d\n",
994 				cb->state);
995 			break;
996 		}
997 
998 		DEBUG_LOG(cb, "server received sink adv\n");
999 
1000 		cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1001 		cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1002 		cb->rdma_sq_wr.sg_list->length = cb->remote_len;
1003 		cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 1);
1004 
1005 		/* Issue RDMA Read. */
1006 		if (cb->read_inv)
1007 			cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
1008 		else {
1009 
1010 			cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
1011 			if (cb->mem == FASTREG) {
1012 				/*
1013 				 * Immediately follow the read with a
1014 				 * fenced LOCAL_INV.
1015 				 */
1016 				cb->rdma_sq_wr.next = &inv;
1017 				memset(&inv, 0, sizeof inv);
1018 				inv.opcode = IB_WR_LOCAL_INV;
1019 				inv.ex.invalidate_rkey = cb->fastreg_mr->rkey;
1020 				inv.send_flags = IB_SEND_FENCE;
1021 			}
1022 		}
1023 
1024 		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
1025 		if (ret) {
1026 			PRINTF(cb, "post send error %d\n", ret);
1027 			break;
1028 		}
1029 		cb->rdma_sq_wr.next = NULL;
1030 
1031 		DEBUG_LOG(cb, "server posted rdma read req \n");
1032 
1033 		/* Wait for read completion */
1034 		wait_event_interruptible(cb->sem,
1035 					 cb->state >= RDMA_READ_COMPLETE);
1036 		if (cb->state != RDMA_READ_COMPLETE) {
1037 			PRINTF(cb,
1038 			       "wait for RDMA_READ_COMPLETE state %d\n",
1039 			       cb->state);
1040 			break;
1041 		}
1042 		DEBUG_LOG(cb, "server received read complete\n");
1043 
1044 		/* Display data in recv buf */
1045 		if (cb->verbose) {
1046 			if (strlen(cb->rdma_buf) > 128) {
1047 				char msgbuf[128];
1048 
1049 				strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf));
1050 				PRINTF(cb, "server ping data stripped: %s\n",
1051 				       msgbuf);
1052 			} else
1053 				PRINTF(cb, "server ping data: %s\n",
1054 				       cb->rdma_buf);
1055 		}
1056 
1057 		/* Tell client to continue */
1058 		if (cb->server && cb->server_invalidate) {
1059 			cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
1060 			cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
1061 			DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey);
1062 		}
1063 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1064 		if (ret) {
1065 			PRINTF(cb, "post send error %d\n", ret);
1066 			break;
1067 		}
1068 		DEBUG_LOG(cb, "server posted go ahead\n");
1069 
1070 		/* Wait for client's RDMA STAG/TO/Len */
1071 		wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
1072 		if (cb->state != RDMA_WRITE_ADV) {
1073 			PRINTF(cb,
1074 			       "wait for RDMA_WRITE_ADV state %d\n",
1075 			       cb->state);
1076 			break;
1077 		}
1078 		DEBUG_LOG(cb, "server received sink adv\n");
1079 
1080 		/* RDMA Write echo data */
1081 		cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1082 		cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1083 		cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1084 		cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1;
1085 		if (cb->local_dma_lkey)
1086 			cb->rdma_sgl.lkey = cb->qp->device->local_dma_lkey;
1087 		else
1088 			cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 0);
1089 
1090 		DEBUG_LOG(cb, "rdma write from lkey %x laddr %llx len %d\n",
1091 			  cb->rdma_sq_wr.sg_list->lkey,
1092 			  (unsigned long long)cb->rdma_sq_wr.sg_list->addr,
1093 			  cb->rdma_sq_wr.sg_list->length);
1094 
1095 		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
1096 		if (ret) {
1097 			PRINTF(cb, "post send error %d\n", ret);
1098 			break;
1099 		}
1100 
1101 		/* Wait for completion */
1102 		ret = wait_event_interruptible(cb->sem, cb->state >=
1103 							 RDMA_WRITE_COMPLETE);
1104 		if (cb->state != RDMA_WRITE_COMPLETE) {
1105 			PRINTF(cb,
1106 			       "wait for RDMA_WRITE_COMPLETE state %d\n",
1107 			       cb->state);
1108 			break;
1109 		}
1110 		DEBUG_LOG(cb, "server rdma write complete \n");
1111 
1112 		cb->state = CONNECTED;
1113 
1114 		/* Tell client to begin again */
1115 		if (cb->server && cb->server_invalidate) {
1116 			cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
1117 			cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
1118 			DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey);
1119 		}
1120 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1121 		if (ret) {
1122 			PRINTF(cb, "post send error %d\n", ret);
1123 			break;
1124 		}
1125 		DEBUG_LOG(cb, "server posted go ahead\n");
1126 	}
1127 }
1128 
/*
 * Read latency test: issue cb->count RDMA READ work requests one at a
 * time against the peer's advertised buffer and time the whole run with
 * microtime().  Completions are reaped either by polling the CQ
 * directly (cb->poll != 0) or by sleeping on cb->sem until the CQ event
 * handler advances cb->state past RDMA_READ_ADV.
 */
static void rlat_test(struct krping_cb *cb)
{
	int scnt;
	int iters = cb->count;
	struct timeval start_tv, stop_tv;
	int ret;
	struct ib_wc wc;
	struct ib_send_wr *bad_wr;
	int ne;

	scnt = 0;
	/* Retarget the pre-built RDMA WR at the peer's advertised region. */
	cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
	cb->rdma_sq_wr.sg_list->length = cb->size;

	microtime(&start_tv);
	if (!cb->poll) {
		/* Event mode: arm the CQ before the first post. */
		cb->state = RDMA_READ_ADV;
		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
	}
	while (scnt < iters) {

		cb->state = RDMA_READ_ADV;
		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
		if (ret) {
			PRINTF(cb,
				"Couldn't post send: ret=%d scnt %d\n",
				ret, scnt);
			return;
		}

		do {
			if (!cb->poll) {
				/*
				 * Event mode: sleep until the CQ event
				 * handler moves us off RDMA_READ_ADV,
				 * then re-arm the CQ for the next WR.
				 */
				wait_event_interruptible(cb->sem,
					cb->state != RDMA_READ_ADV);
				if (cb->state == RDMA_READ_COMPLETE) {
					ne = 1;
					ib_req_notify_cq(cb->cq,
						IB_CQ_NEXT_COMP);
				} else {
					ne = -1;
				}
			} else
				ne = ib_poll_cq(cb->cq, 1, &wc);
			if (cb->state == ERROR) {
				PRINTF(cb,
					"state == ERROR...bailing scnt %d\n",
					scnt);
				return;
			}
		} while (ne == 0);

		if (ne < 0) {
			PRINTF(cb, "poll CQ failed %d\n", ne);
			return;
		}
		/* wc is only populated in polling mode. */
		if (cb->poll && wc.status != IB_WC_SUCCESS) {
			PRINTF(cb, "Completion wth error at %s:\n",
				cb->server ? "server" : "client");
			PRINTF(cb, "Failed status %d: wr_id %d\n",
				wc.status, (int) wc.wr_id);
			return;
		}
		++scnt;
	}
	microtime(&stop_tv);

	/* Borrow from tv_sec if the microsecond delta would underflow. */
        if (stop_tv.tv_usec < start_tv.tv_usec) {
                stop_tv.tv_usec += 1000000;
                stop_tv.tv_sec  -= 1;
        }

	PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d\n",
		(unsigned long)(stop_tv.tv_sec - start_tv.tv_sec),
		(unsigned long)(stop_tv.tv_usec - start_tv.tv_usec),
		scnt, cb->size);
}
1207 
1208 static void wlat_test(struct krping_cb *cb)
1209 {
1210 	int ccnt, scnt, rcnt;
1211 	int iters=cb->count;
1212 	volatile char *poll_buf = (char *) cb->start_buf;
1213 	char *buf = (char *)cb->rdma_buf;
1214 	struct timeval start_tv, stop_tv;
1215 	cycles_t *post_cycles_start, *post_cycles_stop;
1216 	cycles_t *poll_cycles_start, *poll_cycles_stop;
1217 	cycles_t *last_poll_cycles_start;
1218 	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1219 	int i;
1220 	int cycle_iters = 1000;
1221 
1222 	ccnt = 0;
1223 	scnt = 0;
1224 	rcnt = 0;
1225 
1226 	post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1227 	if (!post_cycles_start) {
1228 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1229 		return;
1230 	}
1231 	post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1232 	if (!post_cycles_stop) {
1233 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1234 		return;
1235 	}
1236 	poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1237 	if (!poll_cycles_start) {
1238 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1239 		return;
1240 	}
1241 	poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1242 	if (!poll_cycles_stop) {
1243 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1244 		return;
1245 	}
1246 	last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t),
1247 		GFP_KERNEL);
1248 	if (!last_poll_cycles_start) {
1249 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1250 		return;
1251 	}
1252 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1253 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1254 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1255 	cb->rdma_sq_wr.sg_list->length = cb->size;
1256 
1257 	if (cycle_iters > iters)
1258 		cycle_iters = iters;
1259 	microtime(&start_tv);
1260 	while (scnt < iters || ccnt < iters || rcnt < iters) {
1261 
1262 		/* Wait till buffer changes. */
1263 		if (rcnt < iters && !(scnt < 1 && !cb->server)) {
1264 			++rcnt;
1265 			while (*poll_buf != (char)rcnt) {
1266 				if (cb->state == ERROR) {
1267 					PRINTF(cb,
1268 						"state = ERROR, bailing\n");
1269 					return;
1270 				}
1271 			}
1272 		}
1273 
1274 		if (scnt < iters) {
1275 			struct ib_send_wr *bad_wr;
1276 
1277 			*buf = (char)scnt+1;
1278 			if (scnt < cycle_iters)
1279 				post_cycles_start[scnt] = get_cycles();
1280 			if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1281 				PRINTF(cb,
1282 					"Couldn't post send: scnt=%d\n",
1283 					scnt);
1284 				return;
1285 			}
1286 			if (scnt < cycle_iters)
1287 				post_cycles_stop[scnt] = get_cycles();
1288 			scnt++;
1289 		}
1290 
1291 		if (ccnt < iters) {
1292 			struct ib_wc wc;
1293 			int ne;
1294 
1295 			if (ccnt < cycle_iters)
1296 				poll_cycles_start[ccnt] = get_cycles();
1297 			do {
1298 				if (ccnt < cycle_iters)
1299 					last_poll_cycles_start[ccnt] =
1300 						get_cycles();
1301 				ne = ib_poll_cq(cb->cq, 1, &wc);
1302 			} while (ne == 0);
1303 			if (ccnt < cycle_iters)
1304 				poll_cycles_stop[ccnt] = get_cycles();
1305 			++ccnt;
1306 
1307 			if (ne < 0) {
1308 				PRINTF(cb, "poll CQ failed %d\n", ne);
1309 				return;
1310 			}
1311 			if (wc.status != IB_WC_SUCCESS) {
1312 				PRINTF(cb,
1313 					"Completion wth error at %s:\n",
1314 					cb->server ? "server" : "client");
1315 				PRINTF(cb,
1316 					"Failed status %d: wr_id %d\n",
1317 					wc.status, (int) wc.wr_id);
1318 				PRINTF(cb,
1319 					"scnt=%d, rcnt=%d, ccnt=%d\n",
1320 					scnt, rcnt, ccnt);
1321 				return;
1322 			}
1323 		}
1324 	}
1325 	microtime(&stop_tv);
1326 
1327         if (stop_tv.tv_usec < start_tv.tv_usec) {
1328                 stop_tv.tv_usec += 1000000;
1329                 stop_tv.tv_sec  -= 1;
1330         }
1331 
1332 	for (i=0; i < cycle_iters; i++) {
1333 		sum_post += post_cycles_stop[i] - post_cycles_start[i];
1334 		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1335 		sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
1336 	}
1337 	PRINTF(cb,
1338 		"delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
1339 		" sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1340 		(unsigned long)(stop_tv.tv_sec - start_tv.tv_sec),
1341 		(unsigned long)(stop_tv.tv_usec - start_tv.tv_usec),
1342 		scnt, cb->size, cycle_iters,
1343 		(unsigned long long)sum_post, (unsigned long long)sum_poll,
1344 		(unsigned long long)sum_last_poll);
1345 	kfree(post_cycles_start);
1346 	kfree(post_cycles_stop);
1347 	kfree(poll_cycles_start);
1348 	kfree(poll_cycles_stop);
1349 	kfree(last_poll_cycles_start);
1350 }
1351 
1352 static void bw_test(struct krping_cb *cb)
1353 {
1354 	int ccnt, scnt, rcnt;
1355 	int iters=cb->count;
1356 	struct timeval start_tv, stop_tv;
1357 	cycles_t *post_cycles_start, *post_cycles_stop;
1358 	cycles_t *poll_cycles_start, *poll_cycles_stop;
1359 	cycles_t *last_poll_cycles_start;
1360 	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1361 	int i;
1362 	int cycle_iters = 1000;
1363 
1364 	ccnt = 0;
1365 	scnt = 0;
1366 	rcnt = 0;
1367 
1368 	post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1369 	if (!post_cycles_start) {
1370 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1371 		return;
1372 	}
1373 	post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1374 	if (!post_cycles_stop) {
1375 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1376 		return;
1377 	}
1378 	poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1379 	if (!poll_cycles_start) {
1380 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1381 		return;
1382 	}
1383 	poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1384 	if (!poll_cycles_stop) {
1385 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1386 		return;
1387 	}
1388 	last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t),
1389 		GFP_KERNEL);
1390 	if (!last_poll_cycles_start) {
1391 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1392 		return;
1393 	}
1394 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1395 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1396 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1397 	cb->rdma_sq_wr.sg_list->length = cb->size;
1398 
1399 	if (cycle_iters > iters)
1400 		cycle_iters = iters;
1401 	microtime(&start_tv);
1402 	while (scnt < iters || ccnt < iters) {
1403 
1404 		while (scnt < iters && scnt - ccnt < cb->txdepth) {
1405 			struct ib_send_wr *bad_wr;
1406 
1407 			if (scnt < cycle_iters)
1408 				post_cycles_start[scnt] = get_cycles();
1409 			if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1410 				PRINTF(cb,
1411 					"Couldn't post send: scnt=%d\n",
1412 					scnt);
1413 				return;
1414 			}
1415 			if (scnt < cycle_iters)
1416 				post_cycles_stop[scnt] = get_cycles();
1417 			++scnt;
1418 		}
1419 
1420 		if (ccnt < iters) {
1421 			int ne;
1422 			struct ib_wc wc;
1423 
1424 			if (ccnt < cycle_iters)
1425 				poll_cycles_start[ccnt] = get_cycles();
1426 			do {
1427 				if (ccnt < cycle_iters)
1428 					last_poll_cycles_start[ccnt] =
1429 						get_cycles();
1430 				ne = ib_poll_cq(cb->cq, 1, &wc);
1431 			} while (ne == 0);
1432 			if (ccnt < cycle_iters)
1433 				poll_cycles_stop[ccnt] = get_cycles();
1434 			ccnt += 1;
1435 
1436 			if (ne < 0) {
1437 				PRINTF(cb, "poll CQ failed %d\n", ne);
1438 				return;
1439 			}
1440 			if (wc.status != IB_WC_SUCCESS) {
1441 				PRINTF(cb,
1442 					"Completion wth error at %s:\n",
1443 					cb->server ? "server" : "client");
1444 				PRINTF(cb,
1445 					"Failed status %d: wr_id %d\n",
1446 					wc.status, (int) wc.wr_id);
1447 				return;
1448 			}
1449 		}
1450 	}
1451 	microtime(&stop_tv);
1452 
1453         if (stop_tv.tv_usec < start_tv.tv_usec) {
1454                 stop_tv.tv_usec += 1000000;
1455                 stop_tv.tv_sec  -= 1;
1456         }
1457 
1458 	for (i=0; i < cycle_iters; i++) {
1459 		sum_post += post_cycles_stop[i] - post_cycles_start[i];
1460 		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1461 		sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
1462 	}
1463 	PRINTF(cb,
1464 		"delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
1465 		" sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1466 		(unsigned long)(stop_tv.tv_sec - start_tv.tv_sec),
1467 		(unsigned long)(stop_tv.tv_usec - start_tv.tv_usec),
1468 		scnt, cb->size, cycle_iters,
1469 		(unsigned long long)sum_post, (unsigned long long)sum_poll,
1470 		(unsigned long long)sum_last_poll);
1471 	kfree(post_cycles_start);
1472 	kfree(post_cycles_stop);
1473 	kfree(poll_cycles_start);
1474 	kfree(poll_cycles_stop);
1475 	kfree(last_poll_cycles_start);
1476 }
1477 
1478 static void krping_rlat_test_server(struct krping_cb *cb)
1479 {
1480 	struct ib_send_wr *bad_wr;
1481 	struct ib_wc wc;
1482 	int ret;
1483 
1484 	/* Spin waiting for client's Start STAG/TO/Len */
1485 	while (cb->state < RDMA_READ_ADV) {
1486 		krping_cq_event_handler(cb->cq, cb);
1487 	}
1488 
1489 	/* Send STAG/TO/Len to client */
1490 	krping_format_send(cb, cb->start_dma_addr);
1491 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1492 	if (ret) {
1493 		PRINTF(cb, "post send error %d\n", ret);
1494 		return;
1495 	}
1496 
1497 	/* Spin waiting for send completion */
1498 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1499 	if (ret < 0) {
1500 		PRINTF(cb, "poll error %d\n", ret);
1501 		return;
1502 	}
1503 	if (wc.status) {
1504 		PRINTF(cb, "send completiong error %d\n", wc.status);
1505 		return;
1506 	}
1507 	wait_event_interruptible(cb->sem, cb->state == ERROR);
1508 }
1509 
1510 static void krping_wlat_test_server(struct krping_cb *cb)
1511 {
1512 	struct ib_send_wr *bad_wr;
1513 	struct ib_wc wc;
1514 	int ret;
1515 
1516 	/* Spin waiting for client's Start STAG/TO/Len */
1517 	while (cb->state < RDMA_READ_ADV) {
1518 		krping_cq_event_handler(cb->cq, cb);
1519 	}
1520 
1521 	/* Send STAG/TO/Len to client */
1522 	krping_format_send(cb, cb->start_dma_addr);
1523 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1524 	if (ret) {
1525 		PRINTF(cb, "post send error %d\n", ret);
1526 		return;
1527 	}
1528 
1529 	/* Spin waiting for send completion */
1530 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1531 	if (ret < 0) {
1532 		PRINTF(cb, "poll error %d\n", ret);
1533 		return;
1534 	}
1535 	if (wc.status) {
1536 		PRINTF(cb, "send completiong error %d\n", wc.status);
1537 		return;
1538 	}
1539 
1540 	wlat_test(cb);
1541 	wait_event_interruptible(cb->sem, cb->state == ERROR);
1542 }
1543 
1544 static void krping_bw_test_server(struct krping_cb *cb)
1545 {
1546 	struct ib_send_wr *bad_wr;
1547 	struct ib_wc wc;
1548 	int ret;
1549 
1550 	/* Spin waiting for client's Start STAG/TO/Len */
1551 	while (cb->state < RDMA_READ_ADV) {
1552 		krping_cq_event_handler(cb->cq, cb);
1553 	}
1554 
1555 	/* Send STAG/TO/Len to client */
1556 	krping_format_send(cb, cb->start_dma_addr);
1557 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1558 	if (ret) {
1559 		PRINTF(cb, "post send error %d\n", ret);
1560 		return;
1561 	}
1562 
1563 	/* Spin waiting for send completion */
1564 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1565 	if (ret < 0) {
1566 		PRINTF(cb, "poll error %d\n", ret);
1567 		return;
1568 	}
1569 	if (wc.status) {
1570 		PRINTF(cb, "send completiong error %d\n", wc.status);
1571 		return;
1572 	}
1573 
1574 	if (cb->duplex)
1575 		bw_test(cb);
1576 	wait_event_interruptible(cb->sem, cb->state == ERROR);
1577 }
1578 
1579 static int fastreg_supported(struct krping_cb *cb, int server)
1580 {
1581 	struct ib_device *dev = server?cb->child_cm_id->device:
1582 					cb->cm_id->device;
1583 	struct ib_device_attr attr;
1584 	int ret;
1585 
1586 	ret = ib_query_device(dev, &attr);
1587 	if (ret) {
1588 		PRINTF(cb, "ib_query_device failed ret %d\n", ret);
1589 		return 0;
1590 	}
1591 	if (!(attr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) {
1592 		PRINTF(cb, "Fastreg not supported - device_cap_flags 0x%llx\n",
1593 		    (unsigned long long)attr.device_cap_flags);
1594 		return 0;
1595 	}
1596 	DEBUG_LOG(cb, "Fastreg supported - device_cap_flags 0x%jx\n",
1597 		(uintmax_t)attr.device_cap_flags);
1598 	return 1;
1599 }
1600 
1601 static int krping_bind_server(struct krping_cb *cb)
1602 {
1603 	struct sockaddr_in sin;
1604 	int ret;
1605 
1606 	memset(&sin, 0, sizeof(sin));
1607 	sin.sin_len = sizeof sin;
1608 	sin.sin_family = AF_INET;
1609 	sin.sin_addr.s_addr = cb->addr.s_addr;
1610 	sin.sin_port = cb->port;
1611 
1612 	ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin);
1613 	if (ret) {
1614 		PRINTF(cb, "rdma_bind_addr error %d\n", ret);
1615 		return ret;
1616 	}
1617 	DEBUG_LOG(cb, "rdma_bind_addr successful\n");
1618 
1619 	DEBUG_LOG(cb, "rdma_listen\n");
1620 	ret = rdma_listen(cb->cm_id, 3);
1621 	if (ret) {
1622 		PRINTF(cb, "rdma_listen failed: %d\n", ret);
1623 		return ret;
1624 	}
1625 
1626 	wait_event_interruptible(cb->sem, cb->state >= CONNECT_REQUEST);
1627 	if (cb->state != CONNECT_REQUEST) {
1628 		PRINTF(cb, "wait for CONNECT_REQUEST state %d\n",
1629 			cb->state);
1630 		return -1;
1631 	}
1632 
1633 	if (cb->mem == FASTREG && !fastreg_supported(cb, 1))
1634 		return -EINVAL;
1635 
1636 	return 0;
1637 }
1638 
1639 /*
1640  * sq-depth worth of fastreg + 0B read-inv pairs, reposting them as the reads
1641  * complete.
1642  * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy.
1643  */
1644 static void krping_fr_test5(struct krping_cb *cb)
1645 {
1646 	struct ib_fast_reg_page_list **pl;
1647 	struct ib_send_wr *fr, *read, *bad;
1648 	struct ib_wc wc;
1649 	struct ib_sge *sgl;
1650 	u8 key = 0;
1651 	struct ib_mr **mr;
1652 	u8 **buf;
1653 	dma_addr_t *dma_addr;
1654 	int i;
1655 	int ret;
1656 	int plen = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
1657 	time_t start;
1658 	int count = 0;
1659 	int scnt;
1660 	int depth = cb->txdepth >> 1;
1661 
1662 	if (!depth) {
1663 		PRINTF(cb, "txdepth must be > 1 for this test!\n");
1664 		return;
1665 	}
1666 
1667 	pl = kzalloc(sizeof *pl * depth, GFP_KERNEL);
1668 	DEBUG_LOG(cb, "%s pl %p size %zu\n", __func__, pl, sizeof *pl * depth);
1669 	mr = kzalloc(sizeof *mr * depth, GFP_KERNEL);
1670 	DEBUG_LOG(cb, "%s mr %p size %zu\n", __func__, mr, sizeof *mr * depth);
1671 	fr = kzalloc(sizeof *fr * depth, GFP_KERNEL);
1672 	DEBUG_LOG(cb, "%s fr %p size %zu\n", __func__, fr, sizeof *fr * depth);
1673 	sgl = kzalloc(sizeof *sgl * depth, GFP_KERNEL);
1674 	DEBUG_LOG(cb, "%s sgl %p size %zu\n", __func__, sgl, sizeof *sgl * depth);
1675 	read = kzalloc(sizeof *read * depth, GFP_KERNEL);
1676 	DEBUG_LOG(cb, "%s read %p size %zu\n", __func__, read, sizeof *read * depth);
1677 	buf = kzalloc(sizeof *buf * depth, GFP_KERNEL);
1678 	DEBUG_LOG(cb, "%s buf %p size %zu\n", __func__, buf, sizeof *buf * depth);
1679 	dma_addr = kzalloc(sizeof *dma_addr * depth, GFP_KERNEL);
1680 	DEBUG_LOG(cb, "%s dma_addr %p size %zu\n", __func__, dma_addr, sizeof *dma_addr * depth);
1681 	if (!pl || !mr || !fr || !read || !sgl || !buf || !dma_addr) {
1682 		PRINTF(cb, "kzalloc failed\n");
1683 		goto err1;
1684 	}
1685 
1686 	for (scnt = 0; scnt < depth; scnt++) {
1687 		pl[scnt] = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
1688 		if (IS_ERR(pl[scnt])) {
1689 			PRINTF(cb, "alloc_fr_page_list failed %ld\n",
1690 			       PTR_ERR(pl[scnt]));
1691 			goto err2;
1692 		}
1693 		DEBUG_LOG(cb, "%s pl[%u] %p\n", __func__, scnt, pl[scnt]);
1694 
1695 		mr[scnt] = ib_alloc_fast_reg_mr(cb->pd, plen);
1696 		if (IS_ERR(mr[scnt])) {
1697 			PRINTF(cb, "alloc_fr failed %ld\n",
1698 			       PTR_ERR(mr[scnt]));
1699 			goto err2;
1700 		}
1701 		DEBUG_LOG(cb, "%s mr[%u] %p\n", __func__, scnt, mr[scnt]);
1702 		ib_update_fast_reg_key(mr[scnt], ++key);
1703 
1704 		buf[scnt] = kmalloc(cb->size, GFP_KERNEL);
1705 		if (!buf[scnt]) {
1706 			PRINTF(cb, "kmalloc failed\n");
1707 			ret = -ENOMEM;
1708 			goto err2;
1709 		}
1710 		DEBUG_LOG(cb, "%s buf[%u] %p\n", __func__, scnt, buf[scnt]);
1711 		dma_addr[scnt] = ib_dma_map_single(cb->pd->device,
1712 						   buf[scnt], cb->size,
1713 						   DMA_BIDIRECTIONAL);
1714 		if (dma_mapping_error(cb->pd->device->dma_device,
1715 		    dma_addr[scnt])) {
1716 			PRINTF(cb, "dma_map failed\n");
1717 			ret = -ENOMEM;
1718 			goto err2;
1719 		}
1720 		DEBUG_LOG(cb, "%s dma_addr[%u] %p\n", __func__, scnt, (void *)dma_addr[scnt]);
1721 		for (i=0; i<plen; i++) {
1722 			pl[scnt]->page_list[i] = ((unsigned long)dma_addr[scnt] & PAGE_MASK) + (i * PAGE_SIZE);
1723 			DEBUG_LOG(cb, "%s pl[%u]->page_list[%u] 0x%jx\n",
1724 				  __func__, scnt, i,  (uintmax_t)pl[scnt]->page_list[i]);
1725 		}
1726 
1727 		sgl[scnt].lkey = mr[scnt]->rkey;
1728 		sgl[scnt].length = cb->size;
1729 		sgl[scnt].addr = (u64)buf[scnt];
1730 		DEBUG_LOG(cb, "%s sgl[%u].lkey 0x%x length %u addr 0x%jx\n",
1731 			  __func__, scnt,  sgl[scnt].lkey, sgl[scnt].length,
1732 			  (uintmax_t)sgl[scnt].addr);
1733 
1734 		fr[scnt].opcode = IB_WR_FAST_REG_MR;
1735 		fr[scnt].wr_id = scnt;
1736 		fr[scnt].send_flags = 0;
1737 		fr[scnt].wr.fast_reg.page_shift = PAGE_SHIFT;
1738 		fr[scnt].wr.fast_reg.length = cb->size;
1739 		fr[scnt].wr.fast_reg.page_list = pl[scnt];
1740 		fr[scnt].wr.fast_reg.page_list_len = plen;
1741 		fr[scnt].wr.fast_reg.iova_start = (u64)buf[scnt];
1742 		fr[scnt].wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
1743 		fr[scnt].wr.fast_reg.rkey = mr[scnt]->rkey;
1744 		fr[scnt].next = &read[scnt];
1745 		read[scnt].opcode = IB_WR_RDMA_READ_WITH_INV;
1746 		read[scnt].wr_id = scnt;
1747 		read[scnt].send_flags = IB_SEND_SIGNALED;
1748 		read[scnt].wr.rdma.rkey = cb->remote_rkey;
1749 		read[scnt].wr.rdma.remote_addr = cb->remote_addr;
1750 		read[scnt].num_sge = 1;
1751 		read[scnt].sg_list = &sgl[scnt];
1752 		ret = ib_post_send(cb->qp, &fr[scnt], &bad);
1753 		if (ret) {
1754 			PRINTF(cb, "ib_post_send failed %d\n", ret);
1755 			goto err2;
1756 		}
1757 	}
1758 
1759 	start = time_uptime;
1760 	DEBUG_LOG(cb, "%s starting IO.\n", __func__);
1761 	while (!cb->count || cb->server || count < cb->count) {
1762 		if ((time_uptime - start) >= 9) {
1763 			DEBUG_LOG(cb, "%s pausing 1 tick! count %u\n", __func__,
1764 				  count);
1765 			wait_event_interruptible_timeout(cb->sem,
1766 							 cb->state == ERROR,
1767 							 1);
1768 			if (cb->state == ERROR)
1769 				break;
1770 			start = time_uptime;
1771 		}
1772 		do {
1773 			ret = ib_poll_cq(cb->cq, 1, &wc);
1774 			if (ret < 0) {
1775 				PRINTF(cb, "ib_poll_cq failed %d\n",
1776 				       ret);
1777 				goto err2;
1778 			}
1779 			if (ret == 1) {
1780 				if (wc.status) {
1781 					PRINTF(cb,
1782 					       "completion error %u wr_id %ju "
1783 					       "opcode %d\n", wc.status,
1784 					       (uintmax_t)wc.wr_id, wc.opcode);
1785 					goto err2;
1786 				}
1787 				count++;
1788 				if (count == cb->count)
1789 					break;
1790 				ib_update_fast_reg_key(mr[wc.wr_id], ++key);
1791 				fr[wc.wr_id].wr.fast_reg.rkey =
1792 					mr[wc.wr_id]->rkey;
1793 				sgl[wc.wr_id].lkey = mr[wc.wr_id]->rkey;
1794 				ret = ib_post_send(cb->qp, &fr[wc.wr_id], &bad);
1795 				if (ret) {
1796 					PRINTF(cb,
1797 					       "ib_post_send failed %d\n", ret);
1798 					goto err2;
1799 				}
1800 			} else if (krping_sigpending()) {
1801 				PRINTF(cb, "signal!\n");
1802 				goto err2;
1803 			}
1804 		} while (ret == 1);
1805 	}
1806 	DEBUG_LOG(cb, "%s done!\n", __func__);
1807 err2:
1808 	DEBUG_LOG(cb, "sleeping 1 second\n");
1809 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
1810 	DEBUG_LOG(cb, "draining the cq...\n");
1811 	do {
1812 		ret = ib_poll_cq(cb->cq, 1, &wc);
1813 		if (ret < 0) {
1814 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
1815 			break;
1816 		}
1817 		if (ret == 1) {
1818 			if (wc.status) {
1819 				PRINTF(cb, "completion error %u "
1820 				       "opcode %u\n", wc.status, wc.opcode);
1821 			}
1822 		}
1823 	} while (ret == 1);
1824 
1825 	DEBUG_LOG(cb, "destroying fr mrs!\n");
1826 	for (scnt = 0; scnt < depth; scnt++) {
1827 		if (mr[scnt]) {
1828 			ib_dereg_mr(mr[scnt]);
1829 			DEBUG_LOG(cb, "%s dereg mr %p\n", __func__, mr[scnt]);
1830 		}
1831 	}
1832 	DEBUG_LOG(cb, "unmapping/freeing bufs!\n");
1833 	for (scnt = 0; scnt < depth; scnt++) {
1834 		if (buf[scnt]) {
1835 			dma_unmap_single(cb->pd->device->dma_device,
1836 					 dma_addr[scnt], cb->size,
1837 					 DMA_BIDIRECTIONAL);
1838 			kfree(buf[scnt]);
1839 			DEBUG_LOG(cb, "%s unmap/free buf %p dma_addr %p\n", __func__, buf[scnt], (void *)dma_addr[scnt]);
1840 		}
1841 	}
1842 	DEBUG_LOG(cb, "destroying fr page lists!\n");
1843 	for (scnt = 0; scnt < depth; scnt++) {
1844 		if (pl[scnt]) {
1845 			DEBUG_LOG(cb, "%s free pl %p\n", __func__, pl[scnt]);
1846 			ib_free_fast_reg_page_list(pl[scnt]);
1847 		}
1848 	}
1849 err1:
1850 	if (pl)
1851 		kfree(pl);
1852 	if (mr)
1853 		kfree(mr);
1854 	if (fr)
1855 		kfree(fr);
1856 	if (read)
1857 		kfree(read);
1858 	if (sgl)
1859 		kfree(sgl);
1860 	if (buf)
1861 		kfree(buf);
1862 	if (dma_addr)
1863 		kfree(dma_addr);
1864 }
/*
 * Server side of the fastreg test: the client drives all the work, so
 * the server simply sleeps until cb->state becomes ERROR (presumably
 * set by the event handlers on disconnect/failure — confirm against
 * the CM/CQ handlers).
 */
static void krping_fr_test_server(struct krping_cb *cb)
{
	DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__);
	wait_event_interruptible(cb->sem, cb->state == ERROR);
}
1870 
1871 static void krping_fr_test5_server(struct krping_cb *cb)
1872 {
1873 	struct ib_send_wr *bad_wr;
1874 	struct ib_wc wc;
1875 	int ret;
1876 
1877 	/* Spin waiting for client's Start STAG/TO/Len */
1878 	while (cb->state < RDMA_READ_ADV) {
1879 		krping_cq_event_handler(cb->cq, cb);
1880 	}
1881 	DEBUG_LOG(cb, "%s client STAG %x TO 0x%jx\n", __func__,
1882 		  cb->remote_rkey, (uintmax_t)cb->remote_addr);
1883 
1884 	/* Send STAG/TO/Len to client */
1885 	krping_format_send(cb, cb->start_dma_addr);
1886 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1887 	if (ret) {
1888 		PRINTF(cb, "post send error %d\n", ret);
1889 		return;
1890 	}
1891 
1892 	/* Spin waiting for send completion */
1893 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1894 	if (ret < 0) {
1895 		PRINTF(cb, "poll error %d\n", ret);
1896 		return;
1897 	}
1898 	if (wc.status) {
1899 		PRINTF(cb, "send completiong error %d\n", wc.status);
1900 		return;
1901 	}
1902 
1903 	if (cb->duplex)
1904 		krping_fr_test5(cb);
1905 	DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__);
1906 	wait_event_interruptible(cb->sem, cb->state == ERROR);
1907 }
1908 
1909 static void krping_fr_test5_client(struct krping_cb *cb)
1910 {
1911 	struct ib_send_wr *bad;
1912 	struct ib_wc wc;
1913 	int ret;
1914 
1915 	cb->state = RDMA_READ_ADV;
1916 
1917 	/* Send STAG/TO/Len to server */
1918 	krping_format_send(cb, cb->start_dma_addr);
1919 	if (cb->state == ERROR) {
1920 		PRINTF(cb, "krping_format_send failed\n");
1921 		return;
1922 	}
1923 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad);
1924 	if (ret) {
1925 		PRINTF(cb, "post send error %d\n", ret);
1926 		return;
1927 	}
1928 
1929 	/* Spin waiting for send completion */
1930 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1931 	if (ret < 0) {
1932 		PRINTF(cb, "poll error %d\n", ret);
1933 		return;
1934 	}
1935 	if (wc.status) {
1936 		PRINTF(cb, "send completion error %d\n", wc.status);
1937 		return;
1938 	}
1939 
1940 	/* Spin waiting for server's Start STAG/TO/Len */
1941 	while (cb->state < RDMA_WRITE_ADV) {
1942 		krping_cq_event_handler(cb->cq, cb);
1943 	}
1944 	DEBUG_LOG(cb, "%s server STAG %x TO 0x%jx\n", __func__, cb->remote_rkey,
1945 	    (uintmax_t)cb->remote_addr);
1946 
1947 	return krping_fr_test5(cb);
1948 }
1949 
1950 /*
1951  * sq-depth worth of write + fastreg + inv, reposting them as the invs
1952  * complete.
1953  * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy.
1954  * If a count is given, then the last IO will have a bogus lkey in the
1955  * write work request.  This reproduces a fw bug where the connection
1956  * will get stuck if a fastreg is processed while the ulptx is failing
1957  * the bad write.
1958  */
1959 static void krping_fr_test6(struct krping_cb *cb)
1960 {
1961 	struct ib_fast_reg_page_list **pl;
1962 	struct ib_send_wr *fr, *write, *inv, *bad;
1963 	struct ib_wc wc;
1964 	struct ib_sge *sgl;
1965 	u8 key = 0;
1966 	struct ib_mr **mr;
1967 	u8 **buf;
1968 	dma_addr_t *dma_addr;
1969 	int i;
1970 	int ret;
1971 	int plen = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
1972 	unsigned long start;
1973 	int count = 0;
1974 	int scnt;
1975 	int depth = cb->txdepth  / 3;
1976 
1977 	if (!depth) {
1978 		PRINTF(cb, "txdepth must be > 3 for this test!\n");
1979 		return;
1980 	}
1981 
1982 	pl = kzalloc(sizeof *pl * depth, GFP_KERNEL);
1983 	DEBUG_LOG(cb, "%s pl %p size %zu\n", __func__, pl, sizeof *pl * depth);
1984 
1985 	mr = kzalloc(sizeof *mr * depth, GFP_KERNEL);
1986 	DEBUG_LOG(cb, "%s mr %p size %zu\n", __func__, mr, sizeof *mr * depth);
1987 
1988 	fr = kzalloc(sizeof *fr * depth, GFP_KERNEL);
1989 	DEBUG_LOG(cb, "%s fr %p size %zu\n", __func__, fr, sizeof *fr * depth);
1990 
1991 	sgl = kzalloc(sizeof *sgl * depth, GFP_KERNEL);
1992 	DEBUG_LOG(cb, "%s sgl %p size %zu\n", __func__, sgl, sizeof *sgl * depth);
1993 
1994 	write = kzalloc(sizeof *write * depth, GFP_KERNEL);
1995 	DEBUG_LOG(cb, "%s read %p size %zu\n", __func__, write, sizeof *write * depth);
1996 
1997 	inv = kzalloc(sizeof *inv * depth, GFP_KERNEL);
1998 	DEBUG_LOG(cb, "%s inv %p size %zu\n", __func__, inv, sizeof *inv * depth);
1999 
2000 	buf = kzalloc(sizeof *buf * depth, GFP_KERNEL);
2001 	DEBUG_LOG(cb, "%s buf %p size %zu\n", __func__, buf, sizeof *buf * depth);
2002 
2003 	dma_addr = kzalloc(sizeof *dma_addr * depth, GFP_KERNEL);
2004 	DEBUG_LOG(cb, "%s dma_addr %p size %zu\n", __func__, dma_addr, sizeof *dma_addr * depth);
2005 
2006 	if (!pl || !mr || !fr || !write || !sgl || !buf || !dma_addr) {
2007 		PRINTF(cb, "kzalloc failed\n");
2008 		goto err1;
2009 	}
2010 
2011 	for (scnt = 0; scnt < depth; scnt++) {
2012 		pl[scnt] = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
2013 		if (IS_ERR(pl[scnt])) {
2014 			PRINTF(cb, "alloc_fr_page_list failed %ld\n",
2015 			       PTR_ERR(pl[scnt]));
2016 			goto err2;
2017 		}
2018 		DEBUG_LOG(cb, "%s pl[%u] %p\n", __func__, scnt, pl[scnt]);
2019 
2020 		mr[scnt] = ib_alloc_fast_reg_mr(cb->pd, plen);
2021 		if (IS_ERR(mr[scnt])) {
2022 			PRINTF(cb, "alloc_fr failed %ld\n",
2023 			       PTR_ERR(mr[scnt]));
2024 			goto err2;
2025 		}
2026 		DEBUG_LOG(cb, "%s mr[%u] %p\n", __func__, scnt, mr[scnt]);
2027 		ib_update_fast_reg_key(mr[scnt], ++key);
2028 
2029 		buf[scnt] = kmalloc(cb->size, GFP_KERNEL);
2030 		if (!buf[scnt]) {
2031 			PRINTF(cb, "kmalloc failed\n");
2032 			ret = -ENOMEM;
2033 			goto err2;
2034 		}
2035 		DEBUG_LOG(cb, "%s buf[%u] %p\n", __func__, scnt, buf[scnt]);
2036 		dma_addr[scnt] = ib_dma_map_single(cb->pd->device,
2037 						   buf[scnt], cb->size,
2038 						   DMA_BIDIRECTIONAL);
2039 		if (dma_mapping_error(cb->pd->device->dma_device,
2040 		    dma_addr[scnt])) {
2041 			PRINTF(cb, "dma_map failed\n");
2042 			ret = -ENOMEM;
2043 			goto err2;
2044 		}
2045 		DEBUG_LOG(cb, "%s dma_addr[%u] %p\n", __func__, scnt, (void *)dma_addr[scnt]);
2046 		for (i=0; i<plen; i++) {
2047 			pl[scnt]->page_list[i] = ((unsigned long)dma_addr[scnt] & PAGE_MASK) + (i * PAGE_SIZE);
2048 			DEBUG_LOG(cb, "%s pl[%u]->page_list[%u] 0x%jx\n",
2049 				  __func__, scnt, i,  (uintmax_t)pl[scnt]->page_list[i]);
2050 		}
2051 
2052 		write[scnt].opcode = IB_WR_RDMA_WRITE;
2053 		write[scnt].wr_id = scnt;
2054 		write[scnt].wr.rdma.rkey = cb->remote_rkey;
2055 		write[scnt].wr.rdma.remote_addr = cb->remote_addr;
2056 		write[scnt].num_sge = 1;
2057 		write[scnt].sg_list = &cb->rdma_sgl;
2058 		write[scnt].sg_list->length = cb->size;
2059 		write[scnt].next = &fr[scnt];
2060 
2061 		fr[scnt].opcode = IB_WR_FAST_REG_MR;
2062 		fr[scnt].wr_id = scnt;
2063 		fr[scnt].wr.fast_reg.page_shift = PAGE_SHIFT;
2064 		fr[scnt].wr.fast_reg.length = cb->size;
2065 		fr[scnt].wr.fast_reg.page_list = pl[scnt];
2066 		fr[scnt].wr.fast_reg.page_list_len = plen;
2067 		fr[scnt].wr.fast_reg.iova_start = (u64)buf[scnt];
2068 		fr[scnt].wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
2069 		fr[scnt].wr.fast_reg.rkey = mr[scnt]->rkey;
2070 		fr[scnt].next = &inv[scnt];
2071 
2072 		inv[scnt].opcode = IB_WR_LOCAL_INV;
2073 		inv[scnt].send_flags = IB_SEND_SIGNALED;
2074 		inv[scnt].ex.invalidate_rkey = mr[scnt]->rkey;
2075 
2076 		ret = ib_post_send(cb->qp, &write[scnt], &bad);
2077 		if (ret) {
2078 			PRINTF(cb, "ib_post_send failed %d\n", ret);
2079 			goto err2;
2080 		}
2081 	}
2082 
2083 	start = time_uptime;
2084 	DEBUG_LOG(cb, "%s starting IO.\n", __func__);
2085 	while (!cb->count || cb->server || count < cb->count) {
2086 		if ((time_uptime - start) >= 9) {
2087 			DEBUG_LOG(cb, "%s pausing 1 tick! count %u\n", __func__,
2088 				  count);
2089 			wait_event_interruptible_timeout(cb->sem,
2090 							 cb->state == ERROR,
2091 							 1);
2092 			if (cb->state == ERROR)
2093 				break;
2094 			start = time_uptime;
2095 		}
2096 		do {
2097 			ret = ib_poll_cq(cb->cq, 1, &wc);
2098 			if (ret < 0) {
2099 				PRINTF(cb, "ib_poll_cq failed %d\n",
2100 				       ret);
2101 				goto err2;
2102 			}
2103 			if (ret == 1) {
2104 				if (wc.status) {
2105 					PRINTF(cb,
2106 					       "completion error %u wr_id %ju "
2107 					       "opcode %d\n", wc.status,
2108 					       (uintmax_t)wc.wr_id, wc.opcode);
2109 					goto err2;
2110 				}
2111 				count++;
2112 				if (count == (cb->count -1))
2113 					cb->rdma_sgl.lkey = 0x00dead;
2114 				if (count == cb->count)
2115 					break;
2116 				ib_update_fast_reg_key(mr[wc.wr_id], ++key);
2117 				fr[wc.wr_id].wr.fast_reg.rkey =
2118 					mr[wc.wr_id]->rkey;
2119 				inv[wc.wr_id].ex.invalidate_rkey =
2120 					mr[wc.wr_id]->rkey;
2121 				ret = ib_post_send(cb->qp, &write[wc.wr_id], &bad);
2122 				if (ret) {
2123 					PRINTF(cb,
2124 					       "ib_post_send failed %d\n", ret);
2125 					goto err2;
2126 				}
2127 			} else if (krping_sigpending()){
2128 				PRINTF(cb, "signal!\n");
2129 				goto err2;
2130 			}
2131 		} while (ret == 1);
2132 	}
2133 	DEBUG_LOG(cb, "%s done!\n", __func__);
2134 err2:
2135 	DEBUG_LOG(cb, "sleeping 1 second\n");
2136 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
2137 	DEBUG_LOG(cb, "draining the cq...\n");
2138 	do {
2139 		ret = ib_poll_cq(cb->cq, 1, &wc);
2140 		if (ret < 0) {
2141 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
2142 			break;
2143 		}
2144 		if (ret == 1) {
2145 			if (wc.status) {
2146 				PRINTF(cb, "completion error %u "
2147 				       "opcode %u\n", wc.status, wc.opcode);
2148 			}
2149 		}
2150 	} while (ret == 1);
2151 
2152 	DEBUG_LOG(cb, "destroying fr mrs!\n");
2153 	for (scnt = 0; scnt < depth; scnt++) {
2154 		if (mr[scnt]) {
2155 			ib_dereg_mr(mr[scnt]);
2156 			DEBUG_LOG(cb, "%s dereg mr %p\n", __func__, mr[scnt]);
2157 		}
2158 	}
2159 	DEBUG_LOG(cb, "unmapping/freeing bufs!\n");
2160 	for (scnt = 0; scnt < depth; scnt++) {
2161 		if (buf[scnt]) {
2162 			dma_unmap_single(cb->pd->device->dma_device,
2163 					 dma_addr[scnt], cb->size,
2164 					 DMA_BIDIRECTIONAL);
2165 			kfree(buf[scnt]);
2166 			DEBUG_LOG(cb, "%s unmap/free buf %p dma_addr %p\n", __func__, buf[scnt], (void *)dma_addr[scnt]);
2167 		}
2168 	}
2169 	DEBUG_LOG(cb, "destroying fr page lists!\n");
2170 	for (scnt = 0; scnt < depth; scnt++) {
2171 		if (pl[scnt]) {
2172 			DEBUG_LOG(cb, "%s free pl %p\n", __func__, pl[scnt]);
2173 			ib_free_fast_reg_page_list(pl[scnt]);
2174 		}
2175 	}
2176 err1:
2177 	if (pl)
2178 		kfree(pl);
2179 	if (mr)
2180 		kfree(mr);
2181 	if (fr)
2182 		kfree(fr);
2183 	if (write)
2184 		kfree(write);
2185 	if (inv)
2186 		kfree(inv);
2187 	if (sgl)
2188 		kfree(sgl);
2189 	if (buf)
2190 		kfree(buf);
2191 	if (dma_addr)
2192 		kfree(dma_addr);
2193 }
2194 
2195 static void krping_fr_test6_server(struct krping_cb *cb)
2196 {
2197 	struct ib_send_wr *bad_wr;
2198 	struct ib_wc wc;
2199 	int ret;
2200 
2201 	/* Spin waiting for client's Start STAG/TO/Len */
2202 	while (cb->state < RDMA_READ_ADV) {
2203 		krping_cq_event_handler(cb->cq, cb);
2204 	}
2205 	DEBUG_LOG(cb, "%s client STAG %x TO 0x%jx\n", __func__,
2206 		  cb->remote_rkey, (uintmax_t)cb->remote_addr);
2207 
2208 	/* Send STAG/TO/Len to client */
2209 	krping_format_send(cb, cb->start_dma_addr);
2210 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
2211 	if (ret) {
2212 		PRINTF(cb, "post send error %d\n", ret);
2213 		return;
2214 	}
2215 
2216 	/* Spin waiting for send completion */
2217 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
2218 	if (ret < 0) {
2219 		PRINTF(cb, "poll error %d\n", ret);
2220 		return;
2221 	}
2222 	if (wc.status) {
2223 		PRINTF(cb, "send completiong error %d\n", wc.status);
2224 		return;
2225 	}
2226 
2227 	if (cb->duplex)
2228 		krping_fr_test6(cb);
2229 	DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__);
2230 	wait_event_interruptible(cb->sem, cb->state == ERROR);
2231 }
2232 
2233 static void krping_fr_test6_client(struct krping_cb *cb)
2234 {
2235 	struct ib_send_wr *bad;
2236 	struct ib_wc wc;
2237 	int ret;
2238 
2239 	cb->state = RDMA_READ_ADV;
2240 
2241 	/* Send STAG/TO/Len to server */
2242 	krping_format_send(cb, cb->start_dma_addr);
2243 	if (cb->state == ERROR) {
2244 		PRINTF(cb, "krping_format_send failed\n");
2245 		return;
2246 	}
2247 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad);
2248 	if (ret) {
2249 		PRINTF(cb, "post send error %d\n", ret);
2250 		return;
2251 	}
2252 
2253 	/* Spin waiting for send completion */
2254 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
2255 	if (ret < 0) {
2256 		PRINTF(cb, "poll error %d\n", ret);
2257 		return;
2258 	}
2259 	if (wc.status) {
2260 		PRINTF(cb, "send completion error %d\n", wc.status);
2261 		return;
2262 	}
2263 
2264 	/* Spin waiting for server's Start STAG/TO/Len */
2265 	while (cb->state < RDMA_WRITE_ADV) {
2266 		krping_cq_event_handler(cb->cq, cb);
2267 	}
2268 	DEBUG_LOG(cb, "%s server STAG %x TO 0x%jx\n", __func__, cb->remote_rkey,
2269 	    (uintmax_t)cb->remote_addr);
2270 
2271 	return krping_fr_test6(cb);
2272 }
2273 
/*
 * Top-level server flow: bind, create the QP and buffers, pre-post a
 * receive, accept the connection, dispatch to the selected test, then
 * disconnect and tear everything down via the goto-cleanup chain.
 */
static void krping_run_server(struct krping_cb *cb)
{
	struct ib_recv_wr *bad_wr;
	int ret;

	ret = krping_bind_server(cb);
	if (ret)
		return;

	/* QP is created on the child cm_id produced by the connect request. */
	ret = krping_setup_qp(cb, cb->child_cm_id);
	if (ret) {
		PRINTF(cb, "setup_qp failed: %d\n", ret);
		goto err0;
	}

	ret = krping_setup_buffers(cb);
	if (ret) {
		PRINTF(cb, "krping_setup_buffers failed: %d\n", ret);
		goto err1;
	}

	/* Post a receive before accepting so the client's first send lands. */
	ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
	if (ret) {
		PRINTF(cb, "ib_post_recv failed: %d\n", ret);
		goto err2;
	}

	ret = krping_accept(cb);
	if (ret) {
		PRINTF(cb, "connect error %d\n", ret);
		goto err2;
	}

	/* Dispatch on the test mode selected by the module options. */
	if (cb->wlat)
		krping_wlat_test_server(cb);
	else if (cb->rlat)
		krping_rlat_test_server(cb);
	else if (cb->bw)
		krping_bw_test_server(cb);
	else if (cb->frtest) {
		switch (cb->testnum) {
		case 1:
		case 2:
		case 3:
		case 4:
			/* Tests 1-4 share one server-side handler. */
			krping_fr_test_server(cb);
			break;
		case 5:
			krping_fr_test5_server(cb);
			break;
		case 6:
			krping_fr_test6_server(cb);
			break;
		default:
			PRINTF(cb, "unknown fr test %d\n", cb->testnum);
			goto err2;
			break;	/* NOTREACHED */
		}
	} else
		krping_test_server(cb);
	rdma_disconnect(cb->child_cm_id);
err2:
	krping_free_buffers(cb);
err1:
	krping_free_qp(cb);
err0:
	rdma_destroy_id(cb->child_cm_id);
}
2342 
/*
 * Default client ping loop: fill the start buffer with a recognizable
 * ascii pattern, advertise it to the server, wait for the server to
 * RDMA-read it and RDMA-write it back, then optionally validate and
 * print the echoed data.  cb->count == 0 means ping forever.
 */
static void krping_test_client(struct krping_cb *cb)
{
	int ping, start, cc, i, ret;
	struct ib_send_wr *bad_wr;
	unsigned char c;

	start = 65;	/* 'A' — first fill character for this iteration */
	for (ping = 0; !cb->count || ping < cb->count; ping++) {
		cb->state = RDMA_READ_ADV;

		/* Put some ascii text in the buffer. */
		cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping);
		for (i = cc, c = start; i < cb->size; i++) {
			cb->start_buf[i] = c;
			c++;
			if (c > 122)	/* wrap past 'z' back to 'A' */
				c = 65;
		}
		/* Rotate the starting character each ping. */
		start++;
		if (start > 122)
			start = 65;
		/* NUL-terminate so the buffer can be printed as a string. */
		cb->start_buf[cb->size - 1] = 0;

		/* Advertise the source buffer's STAG/TO/Len to the server. */
		krping_format_send(cb, cb->start_dma_addr);
		if (cb->state == ERROR) {
			PRINTF(cb, "krping_format_send failed\n");
			break;
		}
		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
		if (ret) {
			PRINTF(cb, "post send error %d\n", ret);
			break;
		}

		/* Wait for server to ACK */
		wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
		if (cb->state != RDMA_WRITE_ADV) {
			PRINTF(cb,
			       "wait for RDMA_WRITE_ADV state %d\n",
			       cb->state);
			break;
		}

		/* Advertise the sink buffer for the server's RDMA write. */
		krping_format_send(cb, cb->rdma_dma_addr);
		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
		if (ret) {
			PRINTF(cb, "post send error %d\n", ret);
			break;
		}

		/* Wait for the server to say the RDMA Write is complete. */
		wait_event_interruptible(cb->sem,
					 cb->state >= RDMA_WRITE_COMPLETE);
		if (cb->state != RDMA_WRITE_COMPLETE) {
			PRINTF(cb,
			       "wait for RDMA_WRITE_COMPLETE state %d\n",
			       cb->state);
			break;
		}

		/* Compare what we sent to what came back, if requested. */
		if (cb->validate)
			if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) {
				PRINTF(cb, "data mismatch!\n");
				break;
			}

		if (cb->verbose) {
			/* Truncate long payloads to keep the log readable. */
			if (strlen(cb->rdma_buf) > 128) {
				char msgbuf[128];

				strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf));
				PRINTF(cb, "ping data stripped: %s\n",
				       msgbuf);
			} else
				PRINTF(cb, "ping data: %s\n", cb->rdma_buf);
		}
#ifdef SLOW_KRPING
		wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
#endif
	}
}
2424 
2425 static void krping_rlat_test_client(struct krping_cb *cb)
2426 {
2427 	struct ib_send_wr *bad_wr;
2428 	struct ib_wc wc;
2429 	int ret;
2430 
2431 	cb->state = RDMA_READ_ADV;
2432 
2433 	/* Send STAG/TO/Len to client */
2434 	krping_format_send(cb, cb->start_dma_addr);
2435 	if (cb->state == ERROR) {
2436 		PRINTF(cb, "krping_format_send failed\n");
2437 		return;
2438 	}
2439 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
2440 	if (ret) {
2441 		PRINTF(cb, "post send error %d\n", ret);
2442 		return;
2443 	}
2444 
2445 	/* Spin waiting for send completion */
2446 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
2447 	if (ret < 0) {
2448 		PRINTF(cb, "poll error %d\n", ret);
2449 		return;
2450 	}
2451 	if (wc.status) {
2452 		PRINTF(cb, "send completion error %d\n", wc.status);
2453 		return;
2454 	}
2455 
2456 	/* Spin waiting for server's Start STAG/TO/Len */
2457 	while (cb->state < RDMA_WRITE_ADV) {
2458 		krping_cq_event_handler(cb->cq, cb);
2459 	}
2460 
2461 #if 0
2462 {
2463 	int i;
2464 	struct timeval start, stop;
2465 	time_t sec;
2466 	suseconds_t usec;
2467 	unsigned long long elapsed;
2468 	struct ib_wc wc;
2469 	struct ib_send_wr *bad_wr;
2470 	int ne;
2471 
2472 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
2473 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
2474 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
2475 	cb->rdma_sq_wr.sg_list->length = 0;
2476 	cb->rdma_sq_wr.num_sge = 0;
2477 
2478 	microtime(&start);
2479 	for (i=0; i < 100000; i++) {
2480 		if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
2481 			PRINTF(cb, "Couldn't post send\n");
2482 			return;
2483 		}
2484 		do {
2485 			ne = ib_poll_cq(cb->cq, 1, &wc);
2486 		} while (ne == 0);
2487 		if (ne < 0) {
2488 			PRINTF(cb, "poll CQ failed %d\n", ne);
2489 			return;
2490 		}
2491 		if (wc.status != IB_WC_SUCCESS) {
2492 			PRINTF(cb, "Completion wth error at %s:\n",
2493 				cb->server ? "server" : "client");
2494 			PRINTF(cb, "Failed status %d: wr_id %d\n",
2495 				wc.status, (int) wc.wr_id);
2496 			return;
2497 		}
2498 	}
2499 	microtime(&stop);
2500 
2501 	if (stop.tv_usec < start.tv_usec) {
2502 		stop.tv_usec += 1000000;
2503 		stop.tv_sec  -= 1;
2504 	}
2505 	sec     = stop.tv_sec - start.tv_sec;
2506 	usec    = stop.tv_usec - start.tv_usec;
2507 	elapsed = sec * 1000000 + usec;
2508 	PRINTF(cb, "0B-write-lat iters 100000 usec %llu\n", elapsed);
2509 }
2510 #endif
2511 
2512 	rlat_test(cb);
2513 }
2514 
2515 static void krping_wlat_test_client(struct krping_cb *cb)
2516 {
2517 	struct ib_send_wr *bad_wr;
2518 	struct ib_wc wc;
2519 	int ret;
2520 
2521 	cb->state = RDMA_READ_ADV;
2522 
2523 	/* Send STAG/TO/Len to client */
2524 	krping_format_send(cb, cb->start_dma_addr);
2525 	if (cb->state == ERROR) {
2526 		PRINTF(cb, "krping_format_send failed\n");
2527 		return;
2528 	}
2529 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
2530 	if (ret) {
2531 		PRINTF(cb, "post send error %d\n", ret);
2532 		return;
2533 	}
2534 
2535 	/* Spin waiting for send completion */
2536 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
2537 	if (ret < 0) {
2538 		PRINTF(cb, "poll error %d\n", ret);
2539 		return;
2540 	}
2541 	if (wc.status) {
2542 		PRINTF(cb, "send completion error %d\n", wc.status);
2543 		return;
2544 	}
2545 
2546 	/* Spin waiting for server's Start STAG/TO/Len */
2547 	while (cb->state < RDMA_WRITE_ADV) {
2548 		krping_cq_event_handler(cb->cq, cb);
2549 	}
2550 
2551 	wlat_test(cb);
2552 }
2553 
2554 static void krping_bw_test_client(struct krping_cb *cb)
2555 {
2556 	struct ib_send_wr *bad_wr;
2557 	struct ib_wc wc;
2558 	int ret;
2559 
2560 	cb->state = RDMA_READ_ADV;
2561 
2562 	/* Send STAG/TO/Len to client */
2563 	krping_format_send(cb, cb->start_dma_addr);
2564 	if (cb->state == ERROR) {
2565 		PRINTF(cb, "krping_format_send failed\n");
2566 		return;
2567 	}
2568 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
2569 	if (ret) {
2570 		PRINTF(cb, "post send error %d\n", ret);
2571 		return;
2572 	}
2573 
2574 	/* Spin waiting for send completion */
2575 	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
2576 	if (ret < 0) {
2577 		PRINTF(cb, "poll error %d\n", ret);
2578 		return;
2579 	}
2580 	if (wc.status) {
2581 		PRINTF(cb, "send completion error %d\n", wc.status);
2582 		return;
2583 	}
2584 
2585 	/* Spin waiting for server's Start STAG/TO/Len */
2586 	while (cb->state < RDMA_WRITE_ADV) {
2587 		krping_cq_event_handler(cb->cq, cb);
2588 	}
2589 
2590 	bw_test(cb);
2591 }
2592 
2593 
2594 /*
2595  * fastreg 2 valid different mrs and verify the completions.
2596  */
static void krping_fr_test1(struct krping_cb *cb)
{
	struct ib_fast_reg_page_list *pl;
	struct ib_send_wr fr, *bad;
	struct ib_wc wc;
	struct ib_mr *mr1, *mr2;
	int i;
	int ret;
	int size = cb->size;
	/* Number of pages needed to cover 'size' bytes, rounded up. */
	int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
	int count = 0;

	pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
	if (IS_ERR(pl)) {
		PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
		return;
	}

	/* Two distinct MRs so each fastreg WR targets a different rkey. */
	mr1 = ib_alloc_fast_reg_mr(cb->pd, plen);
	if (IS_ERR(mr1)) {
		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl));
		goto err1;
	}
	mr2 = ib_alloc_fast_reg_mr(cb->pd, plen);
	if (IS_ERR(mr2)) {
		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl));
		goto err2;
	}


	/* Dummy page list: page-aligned offsets, not real DMA addresses. */
	for (i=0; i<plen; i++)
		pl->page_list[i] = i * PAGE_SIZE;

	/* Build one signaled fastreg WR, posted twice with different rkeys. */
	memset(&fr, 0, sizeof fr);
	fr.opcode = IB_WR_FAST_REG_MR;
	fr.wr_id = 1;
	fr.wr.fast_reg.page_shift = PAGE_SHIFT;
	fr.wr.fast_reg.length = size;
	fr.wr.fast_reg.page_list = pl;
	fr.wr.fast_reg.page_list_len = plen;
	fr.wr.fast_reg.iova_start = 0;
	fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
	fr.send_flags = IB_SEND_SIGNALED;
	fr.wr.fast_reg.rkey = mr1->rkey;
	DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
	ret = ib_post_send(cb->qp, &fr, &bad);
	if (ret) {
		PRINTF(cb, "ib_post_send failed %d\n", ret);
		goto err3;
	}
	/* Second fastreg against the other MR's rkey. */
	fr.wr.fast_reg.rkey = mr2->rkey;
	DEBUG_LOG(cb, "%s fr2: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
	ret = ib_post_send(cb->qp, &fr, &bad);
	if (ret) {
		PRINTF(cb, "ib_post_send failed %d\n", ret);
		goto err3;
	}

	/* Poll until both fastreg completions (one per post) are reaped. */
	DEBUG_LOG(cb, "sleeping 1 second\n");
	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
	do {
		ret = ib_poll_cq(cb->cq, 1, &wc);
		if (ret < 0) {
			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
			goto err3;
		}
		if (ret == 1) {
			DEBUG_LOG(cb, "completion status %u wr %s\n",
				  wc.status, wc.wr_id == 1 ? "fr" : "inv");
			count++;
		} else if (krping_sigpending()) {
			PRINTF(cb, "signal!\n");
			goto err3;
		}

		wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
	} while (count != 2);
err3:
	DEBUG_LOG(cb, "sleeping 1 second\n");
	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
	DEBUG_LOG(cb, "draining the cq...\n");
	do {
		ret = ib_poll_cq(cb->cq, 1, &wc);
		if (ret < 0) {
			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
			break;
		}
		if (ret == 1) {
			PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode);
		}
	} while (ret == 1);
	DEBUG_LOG(cb, "destroying fr mr2!\n");

	ib_dereg_mr(mr2);
err2:
	DEBUG_LOG(cb, "destroying fr mr1!\n");
	ib_dereg_mr(mr1);
err1:
	DEBUG_LOG(cb, "destroying fr page list!\n");
	ib_free_fast_reg_page_list(pl);
	DEBUG_LOG(cb, "%s done!\n", __func__);
}
2699 
2700 /*
2701  * fastreg the same mr twice, 2nd one should produce error cqe.
2702  */
static void krping_fr_test2(struct krping_cb *cb)
{
	struct ib_fast_reg_page_list *pl;
	struct ib_send_wr fr, *bad;
	struct ib_wc wc;
	struct ib_mr *mr1;
	int i;
	int ret;
	int size = cb->size;
	/* Number of pages needed to cover 'size' bytes, rounded up. */
	int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
	int count = 0;

	pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
	if (IS_ERR(pl)) {
		PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
		return;
	}

	mr1 = ib_alloc_fast_reg_mr(cb->pd, plen);
	if (IS_ERR(mr1)) {
		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl));
		goto err1;
	}

	/* Dummy page list: page-aligned offsets, not real DMA addresses. */
	for (i=0; i<plen; i++)
		pl->page_list[i] = i * PAGE_SIZE;

	memset(&fr, 0, sizeof fr);
	fr.opcode = IB_WR_FAST_REG_MR;
	fr.wr_id = 1;
	fr.wr.fast_reg.page_shift = PAGE_SHIFT;
	fr.wr.fast_reg.length = size;
	fr.wr.fast_reg.page_list = pl;
	fr.wr.fast_reg.page_list_len = plen;
	fr.wr.fast_reg.iova_start = 0;
	fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
	fr.send_flags = IB_SEND_SIGNALED;
	fr.wr.fast_reg.rkey = mr1->rkey;
	DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
	ret = ib_post_send(cb->qp, &fr, &bad);
	if (ret) {
		PRINTF(cb, "ib_post_send failed %d\n", ret);
		goto err3;
	}
	/*
	 * Post the same WR again with the SAME rkey (unlike test 1);
	 * the second fastreg is expected to complete with an error CQE.
	 */
	DEBUG_LOG(cb, "%s fr2: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
	ret = ib_post_send(cb->qp, &fr, &bad);
	if (ret) {
		PRINTF(cb, "ib_post_send failed %d\n", ret);
		goto err3;
	}

	/* Poll until both completions (one per post) are reaped. */
	DEBUG_LOG(cb, "sleeping 1 second\n");
	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
	do {
		ret = ib_poll_cq(cb->cq, 1, &wc);
		if (ret < 0) {
			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
			goto err3;
		}
		if (ret == 1) {
			DEBUG_LOG(cb, "completion status %u wr %s\n",
				  wc.status, wc.wr_id == 1 ? "fr" : "inv");
			count++;
		} else if (krping_sigpending()) {
			PRINTF(cb, "signal!\n");
			goto err3;
		}
		wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
	} while (count != 2);
err3:
	DEBUG_LOG(cb, "sleeping 1 second\n");
	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
	DEBUG_LOG(cb, "draining the cq...\n");
	do {
		ret = ib_poll_cq(cb->cq, 1, &wc);
		if (ret < 0) {
			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
			break;
		}
		if (ret == 1) {
			PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode);
		}
	} while (ret == 1);
	DEBUG_LOG(cb, "destroying fr mr1!\n");
	ib_dereg_mr(mr1);
err1:
	DEBUG_LOG(cb, "destroying fr page list!\n");
	ib_free_fast_reg_page_list(pl);
	DEBUG_LOG(cb, "%s done!\n", __func__);
}
2793 
2794 /*
2795  * fastreg pipelined in a loop as fast as we can until the user interrupts.
2796  * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy.
2797  */
static void krping_fr_test3(struct krping_cb *cb)
{
	struct ib_fast_reg_page_list *pl;
	struct ib_send_wr fr, inv, *bad;
	struct ib_wc wc;
	u8 key = 0;	/* rolled into the MR rkey before each fastreg */
	struct ib_mr *mr;
	int i;
	int ret;
	int size = cb->size;
	/* Number of pages needed to cover 'size' bytes, rounded up. */
	int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
	unsigned long start;
	int count = 0;	/* total completions reaped */
	int scnt = 0;	/* WRs currently outstanding (2 per post: fr + inv) */


	pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
	if (IS_ERR(pl)) {
		PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
		return;
	}

	mr = ib_alloc_fast_reg_mr(cb->pd, plen);
	if (IS_ERR(mr)) {
		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl));
		goto err1;
	}

	/* Dummy page list: page-aligned offsets, not real DMA addresses. */
	for (i=0; i<plen; i++)
		pl->page_list[i] = i * PAGE_SIZE;

	/* Chain a fastreg WR to a local-invalidate WR; both signaled. */
	memset(&fr, 0, sizeof fr);
	fr.opcode = IB_WR_FAST_REG_MR;
	fr.wr.fast_reg.page_shift = PAGE_SHIFT;
	fr.wr.fast_reg.length = size;
	fr.wr.fast_reg.page_list = pl;
	fr.wr.fast_reg.page_list_len = plen;
	fr.wr.fast_reg.iova_start = 0;
	fr.send_flags = IB_SEND_SIGNALED;
	fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
	fr.next = &inv;
	memset(&inv, 0, sizeof inv);
	inv.opcode = IB_WR_LOCAL_INV;
	inv.send_flags = IB_SEND_SIGNALED;

	DEBUG_LOG(cb, "fr_test: stag index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth);
	start = time_uptime;
	while (1) {
		/* Every 9 seconds, sleep a second so we don't hog the CPU. */
		if ((time_uptime - start) >= 9) {
			DEBUG_LOG(cb, "fr_test: pausing 1 second! count %u latest size %u plen %u\n", count, size, plen);
			wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
			if (cb->state == ERROR)
				break;
			start = time_uptime;
		}
		/* Keep up to half the SQ depth worth of WRs in flight. */
		while (scnt < (cb->txdepth>>1)) {
			ib_update_fast_reg_key(mr, ++key);
			fr.wr.fast_reg.rkey = mr->rkey;
			inv.ex.invalidate_rkey = mr->rkey;
			/* Randomize the registration size each iteration. */
			size = arc4random() % cb->size;
			if (size == 0)
				size = cb->size;
			plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
			fr.wr.fast_reg.length = size;
			fr.wr.fast_reg.page_list_len = plen;
			ret = ib_post_send(cb->qp, &fr, &bad);
			if (ret) {
				PRINTF(cb, "ib_post_send failed %d\n", ret);
				goto err2;
			}
			scnt+=2;	/* fr + inv both generate completions */
		}

		/* Reap whatever completions are available. */
		do {
			ret = ib_poll_cq(cb->cq, 1, &wc);
			if (ret < 0) {
				PRINTF(cb, "ib_poll_cq failed %d\n", ret);
				goto err2;
			}
			if (ret == 1) {
				if (wc.status) {
					PRINTF(cb, "completion error %u\n", wc.status);
					goto err2;
				}
				count++;
				scnt--;
			}
			else if (krping_sigpending()) {
				PRINTF(cb, "signal!\n");
				goto err2;
			}
		} while (ret == 1);
	}
err2:
	DEBUG_LOG(cb, "sleeping 1 second\n");
	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
	DEBUG_LOG(cb, "draining the cq...\n");
	do {
		ret = ib_poll_cq(cb->cq, 1, &wc);
		if (ret < 0) {
			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
			break;
		}
		if (ret == 1) {
			if (wc.status) {
				PRINTF(cb, "completion error %u opcode %u\n", wc.status, wc.opcode);
			}
		}
	} while (ret == 1);
	DEBUG_LOG(cb, "fr_test: done!\n");
	ib_dereg_mr(mr);
err1:
	DEBUG_LOG(cb, "destroying fr page list!\n");
	ib_free_fast_reg_page_list(pl);
	DEBUG_LOG(cb, "%s done!\n", __func__);
}
2914 
2915 /*
2916  * fastreg 1 and invalidate 1 mr and verify completion.
2917  */
static void krping_fr_test4(struct krping_cb *cb)
{
	struct ib_fast_reg_page_list *pl;
	struct ib_send_wr fr, inv, *bad;
	struct ib_wc wc;
	struct ib_mr *mr1;
	int i;
	int ret;
	int size = cb->size;
	/* Number of pages needed to cover 'size' bytes, rounded up. */
	int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
	int count = 0;

	pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
	if (IS_ERR(pl)) {
		PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
		return;
	}

	mr1 = ib_alloc_fast_reg_mr(cb->pd, plen);
	if (IS_ERR(mr1)) {
		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl));
		goto err1;
	}

	/* Dummy page list: page-aligned offsets, not real DMA addresses. */
	for (i=0; i<plen; i++)
		pl->page_list[i] = i * PAGE_SIZE;

	/*
	 * Chain a signaled fastreg WR to an (unsignaled) local-invalidate
	 * of the same rkey; post once and expect a single completion.
	 */
	memset(&fr, 0, sizeof fr);
	fr.opcode = IB_WR_FAST_REG_MR;
	fr.wr_id = 1;
	fr.wr.fast_reg.page_shift = PAGE_SHIFT;
	fr.wr.fast_reg.length = size;
	fr.wr.fast_reg.page_list = pl;
	fr.wr.fast_reg.page_list_len = plen;
	fr.wr.fast_reg.iova_start = 0;
	fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
	fr.send_flags = IB_SEND_SIGNALED;
	fr.wr.fast_reg.rkey = mr1->rkey;
	fr.next = &inv;
	memset(&inv, 0, sizeof inv);
	inv.opcode = IB_WR_LOCAL_INV;
	inv.ex.invalidate_rkey = mr1->rkey;

	DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
	ret = ib_post_send(cb->qp, &fr, &bad);
	if (ret) {
		PRINTF(cb, "ib_post_send failed %d\n", ret);
		goto err3;
	}
	/* Poll until the single signaled completion is reaped. */
	DEBUG_LOG(cb, "sleeping 1 second\n");
	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
	do {
		ret = ib_poll_cq(cb->cq, 1, &wc);
		if (ret < 0) {
			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
			goto err3;
		}
		if (ret == 1) {
			DEBUG_LOG(cb, "completion status %u wr %s\n",
				  wc.status, wc.wr_id == 1 ? "fr" : "inv");
			count++;
		} else if (krping_sigpending()) {
			PRINTF(cb, "signal!\n");
			goto err3;
		}
		wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
	} while (count != 1);
err3:
	DEBUG_LOG(cb, "sleeping 1 second\n");
	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
	DEBUG_LOG(cb, "draining the cq...\n");
	do {
		ret = ib_poll_cq(cb->cq, 1, &wc);
		if (ret < 0) {
			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
			break;
		}
		if (ret == 1) {
			PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode);
		}
	} while (ret == 1);
	DEBUG_LOG(cb, "destroying fr mr1!\n");
	ib_dereg_mr(mr1);
err1:
	DEBUG_LOG(cb, "destroying fr page list!\n");
	ib_free_fast_reg_page_list(pl);
	DEBUG_LOG(cb, "%s done!\n", __func__);
}
3006 
3007 static void krping_fr_test(struct krping_cb *cb)
3008 {
3009 	switch (cb->testnum) {
3010 	case 1:
3011 		krping_fr_test1(cb);
3012 		break;
3013 	case 2:
3014 		krping_fr_test2(cb);
3015 		break;
3016 	case 3:
3017 		krping_fr_test3(cb);
3018 		break;
3019 	case 4:
3020 		krping_fr_test4(cb);
3021 		break;
3022 	case 5:
3023 		krping_fr_test5_client(cb);
3024 		break;
3025 	case 6:
3026 		krping_fr_test6_client(cb);
3027 		break;
3028 	default:
3029 		PRINTF(cb, "Unkown frtest num %u\n", cb->testnum);
3030 		break;
3031 	}
3032 }
3033 
3034 static int krping_connect_client(struct krping_cb *cb)
3035 {
3036 	struct rdma_conn_param conn_param;
3037 	int ret;
3038 
3039 	memset(&conn_param, 0, sizeof conn_param);
3040 	conn_param.responder_resources = 1;
3041 	conn_param.initiator_depth = 1;
3042 	conn_param.retry_count = 10;
3043 
3044 	ret = rdma_connect(cb->cm_id, &conn_param);
3045 	if (ret) {
3046 		PRINTF(cb, "rdma_connect error %d\n", ret);
3047 		return ret;
3048 	}
3049 
3050 	wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
3051 	if (cb->state == ERROR) {
3052 		PRINTF(cb, "wait for CONNECTED state %d\n", cb->state);
3053 		return -1;
3054 	}
3055 
3056 	DEBUG_LOG(cb, "rdma_connect successful\n");
3057 	return 0;
3058 }
3059 
3060 static int krping_bind_client(struct krping_cb *cb)
3061 {
3062 	struct sockaddr_in sin;
3063 	int ret;
3064 
3065 	memset(&sin, 0, sizeof(sin));
3066 	sin.sin_len = sizeof sin;
3067 	sin.sin_family = AF_INET;
3068 	sin.sin_addr.s_addr = cb->addr.s_addr;
3069 	sin.sin_port = cb->port;
3070 
3071 	ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin,
3072 				2000);
3073 	if (ret) {
3074 		PRINTF(cb, "rdma_resolve_addr error %d\n", ret);
3075 		return ret;
3076 	}
3077 
3078 	wait_event_interruptible(cb->sem, cb->state >= ROUTE_RESOLVED);
3079 	if (cb->state != ROUTE_RESOLVED) {
3080 		PRINTF(cb,
3081 		       "addr/route resolution did not resolve: state %d\n",
3082 		       cb->state);
3083 		return -EINTR;
3084 	}
3085 
3086 	if (cb->mem == FASTREG && !fastreg_supported(cb, 0))
3087 		return -EINVAL;
3088 
3089 	DEBUG_LOG(cb, "rdma_resolve_addr - rdma_resolve_route successful\n");
3090 	return 0;
3091 }
3092 
/*
 * Top-level client flow: resolve the server, create the QP and buffers,
 * pre-post a receive, connect, dispatch to the selected test, then
 * disconnect and tear everything down via the goto-cleanup chain.
 */
static void krping_run_client(struct krping_cb *cb)
{
	struct ib_recv_wr *bad_wr;
	int ret;

	ret = krping_bind_client(cb);
	if (ret)
		return;

	ret = krping_setup_qp(cb, cb->cm_id);
	if (ret) {
		PRINTF(cb, "setup_qp failed: %d\n", ret);
		return;
	}

	ret = krping_setup_buffers(cb);
	if (ret) {
		PRINTF(cb, "krping_setup_buffers failed: %d\n", ret);
		goto err1;
	}

	/* Post a receive before connecting so the server's first send lands. */
	ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
	if (ret) {
		PRINTF(cb, "ib_post_recv failed: %d\n", ret);
		goto err2;
	}

	ret = krping_connect_client(cb);
	if (ret) {
		PRINTF(cb, "connect error %d\n", ret);
		goto err2;
	}

	/* Dispatch on the test mode selected by the module options. */
	if (cb->wlat)
		krping_wlat_test_client(cb);
	else if (cb->rlat)
		krping_rlat_test_client(cb);
	else if (cb->bw)
		krping_bw_test_client(cb);
	else if (cb->frtest)
		krping_fr_test(cb);
	else
		krping_test_client(cb);
	rdma_disconnect(cb->cm_id);
err2:
	krping_free_buffers(cb);
err1:
	krping_free_qp(cb);
}
3142 
3143 int krping_doit(char *cmd, void *cookie)
3144 {
3145 	struct krping_cb *cb;
3146 	int op;
3147 	int ret = 0;
3148 	char *optarg;
3149 	unsigned long optint;
3150 
3151 	cb = kzalloc(sizeof(*cb), GFP_KERNEL);
3152 	if (!cb)
3153 		return -ENOMEM;
3154 
3155 	mutex_lock(&krping_mutex);
3156 	list_add_tail(&cb->list, &krping_cbs);
3157 	mutex_unlock(&krping_mutex);
3158 
3159 	cb->cookie = cookie;
3160 	cb->server = -1;
3161 	cb->state = IDLE;
3162 	cb->size = 64;
3163 	cb->txdepth = RPING_SQ_DEPTH;
3164 	cb->mem = DMA;
3165 	init_waitqueue_head(&cb->sem);
3166 
3167 	while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg,
3168 			      &optint)) != 0) {
3169 		switch (op) {
3170 		case 'a':
3171 			cb->addr_str = optarg;
3172 			DEBUG_LOG(cb, "ipaddr (%s)\n", optarg);
3173 			if (!inet_aton(optarg, &cb->addr)) {
3174 				PRINTF(cb, "bad addr string %s\n",
3175 				    optarg);
3176 				ret = EINVAL;
3177 			}
3178 			break;
3179 		case 'p':
3180 			cb->port = htons(optint);
3181 			DEBUG_LOG(cb, "port %d\n", (int)optint);
3182 			break;
3183 		case 'P':
3184 			cb->poll = 1;
3185 			DEBUG_LOG(cb, "server\n");
3186 			break;
3187 		case 's':
3188 			cb->server = 1;
3189 			DEBUG_LOG(cb, "server\n");
3190 			break;
3191 		case 'c':
3192 			cb->server = 0;
3193 			DEBUG_LOG(cb, "client\n");
3194 			break;
3195 		case 'S':
3196 			cb->size = optint;
3197 			if ((cb->size < 1) ||
3198 			    (cb->size > RPING_BUFSIZE)) {
3199 				PRINTF(cb, "Invalid size %d "
3200 				       "(valid range is 1 to %d)\n",
3201 				       cb->size, RPING_BUFSIZE);
3202 				ret = EINVAL;
3203 			} else
3204 				DEBUG_LOG(cb, "size %d\n", (int)optint);
3205 			break;
3206 		case 'C':
3207 			cb->count = optint;
3208 			if (cb->count < 0) {
3209 				PRINTF(cb, "Invalid count %d\n",
3210 					cb->count);
3211 				ret = EINVAL;
3212 			} else
3213 				DEBUG_LOG(cb, "count %d\n", (int) cb->count);
3214 			break;
3215 		case 'v':
3216 			cb->verbose++;
3217 			DEBUG_LOG(cb, "verbose\n");
3218 			break;
3219 		case 'V':
3220 			cb->validate++;
3221 			DEBUG_LOG(cb, "validate data\n");
3222 			break;
3223 		case 'l':
3224 			cb->wlat++;
3225 			break;
3226 		case 'L':
3227 			cb->rlat++;
3228 			break;
3229 		case 'B':
3230 			cb->bw++;
3231 			break;
3232 		case 'd':
3233 			cb->duplex++;
3234 			break;
3235 		case 'm':
3236 			if (!strncmp(optarg, "dma", 3))
3237 				cb->mem = DMA;
3238 			else if (!strncmp(optarg, "fastreg", 7))
3239 				cb->mem = FASTREG;
3240 			else if (!strncmp(optarg, "mw", 2))
3241 				cb->mem = MW;
3242 			else if (!strncmp(optarg, "mr", 2))
3243 				cb->mem = MR;
3244 			else {
3245 				PRINTF(cb, "unknown mem mode %s.  "
3246 					"Must be dma, fastreg, mw, or mr\n",
3247 					optarg);
3248 				ret = -EINVAL;
3249 				break;
3250 			}
3251 			break;
3252 		case 'I':
3253 			cb->server_invalidate = 1;
3254 			break;
3255 		case 'T':
3256 			cb->txdepth = optint;
3257 			DEBUG_LOG(cb, "txdepth %d\n", (int) cb->txdepth);
3258 			break;
3259 		case 'Z':
3260 			cb->local_dma_lkey = 1;
3261 			DEBUG_LOG(cb, "using local dma lkey\n");
3262 			break;
3263 		case 'R':
3264 			cb->read_inv = 1;
3265 			DEBUG_LOG(cb, "using read-with-inv\n");
3266 			break;
3267 		case 'f':
3268 			cb->frtest = 1;
3269 			cb->testnum = optint;
3270 			DEBUG_LOG(cb, "fast-reg test!\n");
3271 			break;
3272 		default:
3273 			PRINTF(cb, "unknown opt %s\n", optarg);
3274 			ret = -EINVAL;
3275 			break;
3276 		}
3277 	}
3278 	if (ret)
3279 		goto out;
3280 
3281 	if (cb->server == -1) {
3282 		PRINTF(cb, "must be either client or server\n");
3283 		ret = -EINVAL;
3284 		goto out;
3285 	}
3286 
3287 	if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) {
3288 		PRINTF(cb, "Pick only one test: fr, bw, rlat, wlat\n");
3289 		ret = -EINVAL;
3290 		goto out;
3291 	}
3292 	if (cb->server_invalidate && cb->mem != FASTREG) {
3293 		PRINTF(cb, "server_invalidate only valid with fastreg mem_mode\n");
3294 		ret = -EINVAL;
3295 		goto out;
3296 	}
3297 
3298 	if (cb->read_inv && cb->mem != FASTREG) {
3299 		PRINTF(cb, "read_inv only valid with fastreg mem_mode\n");
3300 		ret = -EINVAL;
3301 		goto out;
3302 	}
3303 
3304 	if (cb->mem != MR && (cb->wlat || cb->rlat || cb->bw || cb->frtest)) {
3305 		PRINTF(cb, "wlat, rlat, and bw tests only support mem_mode MR\n");
3306 		ret = -EINVAL;
3307 		goto out;
3308 	}
3309 
3310 	cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP, IB_QPT_RC);
3311 	if (IS_ERR(cb->cm_id)) {
3312 		ret = PTR_ERR(cb->cm_id);
3313 		PRINTF(cb, "rdma_create_id error %d\n", ret);
3314 		goto out;
3315 	}
3316 	DEBUG_LOG(cb, "created cm_id %p\n", cb->cm_id);
3317 
3318 	if (cb->server)
3319 		krping_run_server(cb);
3320 	else
3321 		krping_run_client(cb);
3322 
3323 	DEBUG_LOG(cb, "destroy cm_id %p\n", cb->cm_id);
3324 	rdma_destroy_id(cb->cm_id);
3325 out:
3326 	mutex_lock(&krping_mutex);
3327 	list_del(&cb->list);
3328 	mutex_unlock(&krping_mutex);
3329 	kfree(cb);
3330 	return ret;
3331 }
3332 
3333 void
3334 krping_walk_cb_list(void (*f)(struct krping_stats *, void *), void *arg)
3335 {
3336 	struct krping_cb *cb;
3337 
3338 	mutex_lock(&krping_mutex);
3339 	list_for_each_entry(cb, &krping_cbs, list)
3340 	    (*f)(cb->pd ? &cb->stats : NULL, arg);
3341 	mutex_unlock(&krping_mutex);
3342 }
3343 
3344 void krping_init(void)
3345 {
3346 
3347 	mutex_init(&krping_mutex);
3348 }
3349