xref: /freebsd/sys/contrib/rdma/krping/krping.c (revision d4ae33f0721c1b170fe37d97e395228ffcfb3f80)
1 /*
2  * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
3  * Copyright (c) 2006-2009 Open Grid Computing, Inc. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <linux/module.h>
38 #include <linux/moduleparam.h>
39 #include <linux/init.h>
40 #include <linux/slab.h>
41 #include <linux/err.h>
42 #include <linux/string.h>
43 #include <linux/inet.h>
44 #include <linux/list.h>
45 #include <linux/in.h>
46 #include <linux/device.h>
47 #include <linux/pci.h>
48 #include <linux/sched.h>
49 #include <asm/system.h>
50 
51 #include <asm/atomic.h>
52 
53 #include <rdma/ib_verbs.h>
54 #include <rdma/rdma_cm.h>
55 
56 #include "krping.h"
57 #include "getopt.h"
58 
59 extern int krping_debug;
60 #define DEBUG_LOG(cb, x...) do { if (krping_debug) krping_printf((cb)->cookie, x); } while (0)
61 #define PRINTF(cb, x...) krping_printf((cb)->cookie, x)
62 
63 MODULE_AUTHOR("Steve Wise");
64 MODULE_DESCRIPTION("RDMA ping client/server");
65 MODULE_LICENSE("Dual BSD/GPL");
66 
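/*
 * Read the x86 timestamp counter via rdtsc.  The wlat/bw tests below
 * use raw cycle counts to measure per-WR post and poll overhead.
 */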
67 static __inline uint64_t
68 get_cycles(void)
69 {
70 	uint32_t low, high;
71 	__asm __volatile("rdtsc" : "=a" (low), "=d" (high));
72 	return (low | ((u_int64_t)high << 32));
73 }
74 
75 typedef uint64_t cycles_t;
76 
77 enum mem_type {
78 	DMA = 1,
79 	FASTREG = 2,
80 	MW = 3,
81 	MR = 4
82 };
83 
84 static const struct krping_option krping_opts[] = {
85 	{"count", OPT_INT, 'C'},
86 	{"size", OPT_INT, 'S'},
87 	{"addr", OPT_STRING, 'a'},
88 	{"port", OPT_INT, 'p'},
89 	{"verbose", OPT_NOPARAM, 'v'},
90 	{"validate", OPT_NOPARAM, 'V'},
91 	{"server", OPT_NOPARAM, 's'},
92 	{"client", OPT_NOPARAM, 'c'},
93 	{"mem_mode", OPT_STRING, 'm'},
94 	{"server_inv", OPT_NOPARAM, 'I'},
95 	{"wlat", OPT_NOPARAM, 'l'},
96 	{"rlat", OPT_NOPARAM, 'L'},
97 	{"bw", OPT_NOPARAM, 'B'},
98 	{"duplex", OPT_NOPARAM, 'd'},
99 	{"txdepth", OPT_INT, 'T'},
100 	{"poll", OPT_NOPARAM, 'P'},
101 	{"local_dma_lkey", OPT_NOPARAM, 'Z'},
102 	{"read_inv", OPT_NOPARAM, 'R'},
103 	{"fr", OPT_NOPARAM, 'f'},
104 	{NULL, 0, 0}
105 };
106 
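/*
 * The 64-bit byte swap is its own inverse, so the same cpu_to_be64
 * conversion works for both host-to-wire and wire-to-host.
 */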
107 #define htonll(x) cpu_to_be64((x))
108 #define ntohll(x) cpu_to_be64((x))
109 
110 static struct mutex krping_mutex;
111 
112 /*
113  * List of running krping threads.
114  */
115 static LIST_HEAD(krping_cbs);
116 
117 /*
118  * krping "ping/pong" loop:
119  * 	client sends source rkey/addr/len
120  *	server receives source rkey/addr/len
121  *	server rdma reads "ping" data from source
122  * 	server sends "go ahead" on rdma read completion
123  *	client sends sink rkey/addr/len
124  * 	server receives sink rkey/addr/len
125  * 	server rdma writes "pong" data to sink
126  * 	server sends "go ahead" on rdma write completion
127  * 	<repeat loop>
128  */
129 
130 /*
131  * These states are used to signal events between the completion handler
132  * and the main client or server thread.
133  *
134  * Once CONNECTED, they cycle through RDMA_READ_ADV, RDMA_WRITE_ADV,
135  * and RDMA_WRITE_COMPLETE for each ping.
136  */
137 enum test_state {
138 	IDLE = 1,
139 	CONNECT_REQUEST,
140 	ADDR_RESOLVED,
141 	ROUTE_RESOLVED,
142 	CONNECTED,
143 	RDMA_READ_ADV,
144 	RDMA_READ_COMPLETE,
145 	RDMA_WRITE_ADV,
146 	RDMA_WRITE_COMPLETE,
147 	ERROR
148 };
149 
150 struct krping_rdma_info {
151 	uint64_t buf;
152 	uint32_t rkey;
153 	uint32_t size;
154 };
155 
156 /*
157  * Default max buffer size for IO...
158  */
159 #define RPING_BUFSIZE 128*1024
160 #define RPING_SQ_DEPTH 64
161 
162 /*
163  * Control block struct.
164  */
165 struct krping_cb {
166 	void *cookie;
167 	int server;			/* 0 iff client */
168 	struct ib_cq *cq;
169 	struct ib_pd *pd;
170 	struct ib_qp *qp;
171 
172 	enum mem_type mem;
173 	struct ib_mr *dma_mr;
174 
175 	struct ib_fast_reg_page_list *page_list;
176 	int page_list_len;
177 	struct ib_send_wr fastreg_wr;
178 	struct ib_send_wr invalidate_wr;
179 	struct ib_mr *fastreg_mr;
180 	int server_invalidate;
181 	int read_inv;
182 	u8 key;
183 
184 	struct ib_mw *mw;
185 	struct ib_mw_bind bind_attr;
186 
187 	struct ib_recv_wr rq_wr;	/* recv work request record */
188 	struct ib_sge recv_sgl;		/* recv single SGE */
189 	struct krping_rdma_info recv_buf;/* malloc'd buffer */
190 	u64 recv_dma_addr;
191 	DECLARE_PCI_UNMAP_ADDR(recv_mapping)
192 	struct ib_mr *recv_mr;
193 
194 	struct ib_send_wr sq_wr;	/* send work request record */
195 	struct ib_sge send_sgl;
196 	struct krping_rdma_info send_buf;/* single send buf */
197 	u64 send_dma_addr;
198 	DECLARE_PCI_UNMAP_ADDR(send_mapping)
199 	struct ib_mr *send_mr;
200 
201 	struct ib_send_wr rdma_sq_wr;	/* rdma work request record */
202 	struct ib_sge rdma_sgl;		/* rdma single SGE */
203 	char *rdma_buf;			/* used as rdma sink */
204 	u64  rdma_dma_addr;
205 	DECLARE_PCI_UNMAP_ADDR(rdma_mapping)
206 	struct ib_mr *rdma_mr;
207 
208 	uint32_t remote_rkey;		/* remote peer's RKEY */
209 	uint64_t remote_addr;		/* remote peer's TO */
210 	uint32_t remote_len;		/* remote peer's LEN */
211 
212 	char *start_buf;		/* rdma read src */
213 	u64  start_dma_addr;
214 	DECLARE_PCI_UNMAP_ADDR(start_mapping)
215 	struct ib_mr *start_mr;
216 
217 	enum test_state state;		/* used for cond/signalling */
218 	wait_queue_head_t sem;
219 	struct krping_stats stats;
220 
221 	uint16_t port;			/* dst port in NBO */
222 	struct in_addr addr;		/* dst addr in NBO */
223 	char *addr_str;			/* dst addr string */
224 	int verbose;			/* verbose logging */
225 	int count;			/* ping count */
226 	int size;			/* ping data size */
227 	int validate;			/* validate ping data */
228 	int wlat;			/* run wlat test */
229 	int rlat;			/* run rlat test */
230 	int bw;				/* run bw test */
231 	int duplex;			/* run bw full duplex test */
232 	int poll;			/* poll or block for rlat test */
233 	int txdepth;			/* SQ depth */
234 	int local_dma_lkey;		/* use 0 for lkey */
235 	int frtest;			/* fastreg test */
236 
237 	/* CM stuff */
238 	struct rdma_cm_id *cm_id;	/* connection on client side,*/
239 					/* listener on server side. */
240 	struct rdma_cm_id *child_cm_id;	/* connection on server side */
241 	struct list_head list;
242 };
243 
244 static int krping_cma_event_handler(struct rdma_cm_id *cma_id,
245 				   struct rdma_cm_event *event)
246 {
247 	int ret;
248 	struct krping_cb *cb = cma_id->context;
249 
250 	DEBUG_LOG(cb, "cma_event type %d cma_id %p (%s)\n", event->event,
251 	    cma_id, (cma_id == cb->cm_id) ? "parent" : "child");
252 
253 	switch (event->event) {
254 	case RDMA_CM_EVENT_ADDR_RESOLVED:
255 		cb->state = ADDR_RESOLVED;
256 		ret = rdma_resolve_route(cma_id, 2000);
257 		if (ret) {
258 			PRINTF(cb, "rdma_resolve_route error %d\n", ret);
259 			wake_up_interruptible(&cb->sem);
260 		}
261 		break;
262 
263 	case RDMA_CM_EVENT_ROUTE_RESOLVED:
264 		cb->state = ROUTE_RESOLVED;
265 		wake_up_interruptible(&cb->sem);
266 		break;
267 
268 	case RDMA_CM_EVENT_CONNECT_REQUEST:
269 		cb->state = CONNECT_REQUEST;
270 		cb->child_cm_id = cma_id;
271 		DEBUG_LOG(cb, "child cma %p\n", cb->child_cm_id);
272 		wake_up_interruptible(&cb->sem);
273 		break;
274 
275 	case RDMA_CM_EVENT_ESTABLISHED:
276 		DEBUG_LOG(cb, "ESTABLISHED\n");
277 		if (!cb->server) {
278 			cb->state = CONNECTED;
279 		}
280 		wake_up_interruptible(&cb->sem);
281 		break;
282 
283 	case RDMA_CM_EVENT_ADDR_ERROR:
284 	case RDMA_CM_EVENT_ROUTE_ERROR:
285 	case RDMA_CM_EVENT_CONNECT_ERROR:
286 	case RDMA_CM_EVENT_UNREACHABLE:
287 	case RDMA_CM_EVENT_REJECTED:
288 		PRINTF(cb, "cma event %d, error %d\n", event->event,
289 		       event->status);
290 		cb->state = ERROR;
291 		wake_up_interruptible(&cb->sem);
292 		break;
293 
294 	case RDMA_CM_EVENT_DISCONNECTED:
295 		PRINTF(cb, "DISCONNECT EVENT...\n");
296 		cb->state = ERROR;
297 		wake_up_interruptible(&cb->sem);
298 		break;
299 
300 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
301 		PRINTF(cb, "cma detected device removal!!!!\n");
302 		break;
303 
304 	default:
305 		PRINTF(cb, "oof bad type!\n");
306 		wake_up_interruptible(&cb->sem);
307 		break;
308 	}
309 	return 0;
310 }
311 
312 static int server_recv(struct krping_cb *cb, struct ib_wc *wc)
313 {
314 	if (wc->byte_len != sizeof(cb->recv_buf)) {
315 		PRINTF(cb, "Received bogus data, size %d\n",
316 		       wc->byte_len);
317 		return -1;
318 	}
319 
320 	cb->remote_rkey = ntohl(cb->recv_buf.rkey);
321 	cb->remote_addr = ntohll(cb->recv_buf.buf);
322 	cb->remote_len  = ntohl(cb->recv_buf.size);
323 	DEBUG_LOG(cb, "Received rkey %x addr %llx len %d from peer\n",
324 		  cb->remote_rkey, (unsigned long long)cb->remote_addr,
325 		  cb->remote_len);
326 
327 	if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE)
328 		cb->state = RDMA_READ_ADV;
329 	else
330 		cb->state = RDMA_WRITE_ADV;
331 
332 	return 0;
333 }
334 
335 static int client_recv(struct krping_cb *cb, struct ib_wc *wc)
336 {
337 	if (wc->byte_len != sizeof(cb->recv_buf)) {
338 		PRINTF(cb, "Received bogus data, size %d\n",
339 		       wc->byte_len);
340 		return -1;
341 	}
342 
343 	if (cb->state == RDMA_READ_ADV)
344 		cb->state = RDMA_WRITE_ADV;
345 	else
346 		cb->state = RDMA_WRITE_COMPLETE;
347 
348 	return 0;
349 }
350 
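/*
 * Completion handler shared by client and server.  Drains the CQ,
 * accounts bytes/messages in cb->stats, advances the state machine and
 * wakes anyone sleeping on cb->sem; each receive also re-posts the
 * recv WR.  The latency/bw tests poll the CQ themselves, so
 * notification is only re-armed in ping mode.
 */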
351 static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
352 {
353 	struct krping_cb *cb = ctx;
354 	struct ib_wc wc;
355 	struct ib_recv_wr *bad_wr;
356 	int ret;
357 
358 	BUG_ON(cb->cq != cq);
359 	if (cb->state == ERROR) {
360 		PRINTF(cb, "cq completion in ERROR state\n");
361 		return;
362 	}
363 	if (cb->frtest) {
364 		PRINTF(cb, "cq completion event in frtest!\n");
365 		return;
366 	}
367 	if (!cb->wlat && !cb->rlat && !cb->bw)
368 		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
369 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
370 		if (wc.status) {
371 			if (wc.status == IB_WC_WR_FLUSH_ERR) {
372 				DEBUG_LOG(cb, "cq flushed\n");
373 				continue;
374 			} else {
375 				PRINTF(cb, "cq completion failed with "
376 				       "wr_id %Lx status %d opcode %d vendor_err %x\n",
377 					wc.wr_id, wc.status, wc.opcode, wc.vendor_err);
378 				goto error;
379 			}
380 		}
381 
382 		switch (wc.opcode) {
383 		case IB_WC_SEND:
384 			DEBUG_LOG(cb, "send completion\n");
385 			cb->stats.send_bytes += cb->send_sgl.length;
386 			cb->stats.send_msgs++;
387 			break;
388 
389 		case IB_WC_RDMA_WRITE:
390 			DEBUG_LOG(cb, "rdma write completion\n");
391 			cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length;
392 			cb->stats.write_msgs++;
393 			cb->state = RDMA_WRITE_COMPLETE;
394 			wake_up_interruptible(&cb->sem);
395 			break;
396 
397 		case IB_WC_RDMA_READ:
398 			DEBUG_LOG(cb, "rdma read completion\n");
399 			cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length;
400 			cb->stats.read_msgs++;
401 			cb->state = RDMA_READ_COMPLETE;
402 			wake_up_interruptible(&cb->sem);
403 			break;
404 
405 		case IB_WC_RECV:
406 			DEBUG_LOG(cb, "recv completion\n");
407 			cb->stats.recv_bytes += sizeof(cb->recv_buf);
408 			cb->stats.recv_msgs++;
409 			if (cb->wlat || cb->rlat || cb->bw)
410 				ret = server_recv(cb, &wc);
411 			else
412 				ret = cb->server ? server_recv(cb, &wc) :
413 						   client_recv(cb, &wc);
414 			if (ret) {
415 				PRINTF(cb, "recv wc error: %d\n", ret);
416 				goto error;
417 			}
418 
419 			ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
420 			if (ret) {
421 				PRINTF(cb, "post recv error: %d\n",
422 				       ret);
423 				goto error;
424 			}
425 			wake_up_interruptible(&cb->sem);
426 			break;
427 
428 		default:
429 			PRINTF(cb,
430 			       "%s:%d Unexpected opcode %d, Shutting down\n",
431 			       __func__, __LINE__, wc.opcode);
432 			goto error;
433 		}
434 	}
435 	if (ret) {
436 		PRINTF(cb, "poll error %d\n", ret);
437 		goto error;
438 	}
439 	return;
440 error:
441 	cb->state = ERROR;
442 	wake_up_interruptible(&cb->sem);
443 }
444 
445 static int krping_accept(struct krping_cb *cb)
446 {
447 	struct rdma_conn_param conn_param;
448 	int ret;
449 
450 	DEBUG_LOG(cb, "accepting client connection request\n");
451 
452 	memset(&conn_param, 0, sizeof conn_param);
453 	conn_param.responder_resources = 1;
454 	conn_param.initiator_depth = 1;
455 
456 	ret = rdma_accept(cb->child_cm_id, &conn_param);
457 	if (ret) {
458 		PRINTF(cb, "rdma_accept error: %d\n", ret);
459 		return ret;
460 	}
461 
462 	if (!cb->wlat && !cb->rlat && !cb->bw) {
463 		wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
464 		if (cb->state == ERROR) {
465 			PRINTF(cb, "wait for CONNECTED state %d\n",
466 				cb->state);
467 			return -1;
468 		}
469 	}
470 	return 0;
471 }
472 
473 static void krping_setup_wr(struct krping_cb *cb)
474 {
475 	cb->recv_sgl.addr = cb->recv_dma_addr;
476 	cb->recv_sgl.length = sizeof cb->recv_buf;
477 	if (cb->local_dma_lkey)
478 		cb->recv_sgl.lkey = cb->qp->device->local_dma_lkey;
479 	else if (cb->mem == DMA)
480 		cb->recv_sgl.lkey = cb->dma_mr->lkey;
481 	else
482 		cb->recv_sgl.lkey = cb->recv_mr->lkey;
483 	cb->rq_wr.sg_list = &cb->recv_sgl;
484 	cb->rq_wr.num_sge = 1;
485 
486 	cb->send_sgl.addr = cb->send_dma_addr;
487 	cb->send_sgl.length = sizeof cb->send_buf;
488 	if (cb->local_dma_lkey)
489 		cb->send_sgl.lkey = cb->qp->device->local_dma_lkey;
490 	else if (cb->mem == DMA)
491 		cb->send_sgl.lkey = cb->dma_mr->lkey;
492 	else
493 		cb->send_sgl.lkey = cb->send_mr->lkey;
494 
495 	cb->sq_wr.opcode = IB_WR_SEND;
496 	cb->sq_wr.send_flags = IB_SEND_SIGNALED;
497 	cb->sq_wr.sg_list = &cb->send_sgl;
498 	cb->sq_wr.num_sge = 1;
499 
500 	if (cb->server || cb->wlat || cb->rlat || cb->bw) {
501 		cb->rdma_sgl.addr = cb->rdma_dma_addr;
502 		if (cb->mem == MR)
503 			cb->rdma_sgl.lkey = cb->rdma_mr->lkey;
504 		cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED;
505 		cb->rdma_sq_wr.sg_list = &cb->rdma_sgl;
506 		cb->rdma_sq_wr.num_sge = 1;
507 	}
508 
509 	switch(cb->mem) {
510 	case FASTREG:
511 
512 		/*
513 		 * A chain of 2 WRs, LOCAL_INV + FAST_REG_MR,
514 		 * both unsignaled.  The client uses them to reregister
515 		 * the rdma buffers with a new key each iteration.
516 		 */
517 		cb->fastreg_wr.opcode = IB_WR_FAST_REG_MR;
518 		cb->fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
519 		cb->fastreg_wr.wr.fast_reg.length = cb->size;
520 		cb->fastreg_wr.wr.fast_reg.page_list = cb->page_list;
521 		cb->fastreg_wr.wr.fast_reg.page_list_len = cb->page_list_len;
522 
523 		cb->invalidate_wr.next = &cb->fastreg_wr;
524 		cb->invalidate_wr.opcode = IB_WR_LOCAL_INV;
525 		break;
526 	case MW:
527 		cb->bind_attr.wr_id = 0xabbaabba;
528 		cb->bind_attr.send_flags = 0; /* unsignaled */
529 		cb->bind_attr.length = cb->size;
530 		break;
531 	default:
532 		break;
533 	}
534 }
535 
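/*
 * Allocate and DMA-map the send/recv advertisement buffers and the
 * rdma sink buffer (plus the start/source buffer for clients and the
 * latency/bw tests), registering memory as required by the selected
 * mem_mode (DMA, FASTREG, MW or MR).
 */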
536 static int krping_setup_buffers(struct krping_cb *cb)
537 {
538 	int ret;
539 	struct ib_phys_buf buf;
540 	u64 iovbase;
541 
542 	DEBUG_LOG(cb, "krping_setup_buffers called on cb %p\n", cb);
543 
544 	cb->recv_dma_addr = dma_map_single(cb->pd->device->dma_device,
545 				   &cb->recv_buf,
546 				   sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
547 	pci_unmap_addr_set(cb, recv_mapping, cb->recv_dma_addr);
548 	cb->send_dma_addr = dma_map_single(cb->pd->device->dma_device,
549 					   &cb->send_buf, sizeof(cb->send_buf),
550 					   DMA_BIDIRECTIONAL);
551 	pci_unmap_addr_set(cb, send_mapping, cb->send_dma_addr);
552 
553 	if (cb->mem == DMA) {
554 		cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE|
555 					   IB_ACCESS_REMOTE_READ|
556 				           IB_ACCESS_REMOTE_WRITE);
557 		if (IS_ERR(cb->dma_mr)) {
558 			DEBUG_LOG(cb, "ib_get_dma_mr failed\n");
559 			ret = PTR_ERR(cb->dma_mr);
560 			goto bail;
561 		}
562 	} else {
563 		if (!cb->local_dma_lkey) {
564 			buf.addr = cb->recv_dma_addr;
565 			buf.size = sizeof cb->recv_buf;
566 			DEBUG_LOG(cb, "recv buf dma_addr %llx size %d\n", buf.addr,
567 				(int)buf.size);
568 			iovbase = cb->recv_dma_addr;
569 			cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
570 						     IB_ACCESS_LOCAL_WRITE,
571 						     &iovbase);
572 
573 			if (IS_ERR(cb->recv_mr)) {
574 				DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
575 				ret = PTR_ERR(cb->recv_mr);
576 				goto bail;
577 			}
578 
579 			buf.addr = cb->send_dma_addr;
580 			buf.size = sizeof cb->send_buf;
581 			DEBUG_LOG(cb, "send buf dma_addr %llx size %d\n", buf.addr,
582 				(int)buf.size);
583 			iovbase = cb->send_dma_addr;
584 			cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
585 						     0, &iovbase);
586 
587 			if (IS_ERR(cb->send_mr)) {
588 				DEBUG_LOG(cb, "send_buf reg_mr failed\n");
589 				ret = PTR_ERR(cb->send_mr);
590 				goto bail;
591 			}
592 		}
593 	}
594 
595 	cb->rdma_buf = kmalloc(cb->size, GFP_KERNEL);
596 	if (!cb->rdma_buf) {
597 		DEBUG_LOG(cb, "rdma_buf malloc failed\n");
598 		ret = -ENOMEM;
599 		goto bail;
600 	}
601 
602 	cb->rdma_dma_addr = dma_map_single(cb->pd->device->dma_device,
603 			       cb->rdma_buf, cb->size,
604 			       DMA_BIDIRECTIONAL);
605 	pci_unmap_addr_set(cb, rdma_mapping, cb->rdma_dma_addr);
606 	if (cb->mem != DMA) {
607 		switch (cb->mem) {
608 		case FASTREG:
609 			cb->page_list_len = (((cb->size - 1) & PAGE_MASK) +
610 				PAGE_SIZE) >> PAGE_SHIFT;
611 			cb->page_list = ib_alloc_fast_reg_page_list(
612 						cb->pd->device,
613 						cb->page_list_len);
614 			if (IS_ERR(cb->page_list)) {
615 				DEBUG_LOG(cb, "ib_alloc_fast_reg_page_list failed\n");
616 				ret = PTR_ERR(cb->page_list);
617 				goto bail;
618 			}
619 			cb->fastreg_mr = ib_alloc_fast_reg_mr(cb->pd,
620 					cb->page_list->max_page_list_len);
621 			if (IS_ERR(cb->fastreg_mr)) {
622 				DEBUG_LOG(cb, "ib_alloc_fast_reg_mr failed\n");
623 				ret = PTR_ERR(cb->fastreg_mr);
624 				goto bail;
625 			}
626 			DEBUG_LOG(cb, "fastreg rkey 0x%x page_list %p"
627 				" page_list_len %u\n", cb->fastreg_mr->rkey,
628 				cb->page_list, cb->page_list_len);
629 			break;
630 		case MW:
631 			cb->mw = ib_alloc_mw(cb->pd);
632 			if (IS_ERR(cb->mw)) {
633 				DEBUG_LOG(cb, "ib_alloc_mw failed\n");
634 				ret = PTR_ERR(cb->mw);
635 				goto bail;
636 			}
637 			DEBUG_LOG(cb, "mw rkey 0x%x\n", cb->mw->rkey);
638 			/*FALLTHROUGH*/
639 		case MR:
640 			buf.addr = cb->rdma_dma_addr;
641 			buf.size = cb->size;
642 			iovbase = cb->rdma_dma_addr;
643 			cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
644 					     IB_ACCESS_REMOTE_READ|
645 					     IB_ACCESS_REMOTE_WRITE,
646 					     &iovbase);
647 			if (IS_ERR(cb->rdma_mr)) {
648 				DEBUG_LOG(cb, "rdma_buf reg_mr failed\n");
649 				ret = PTR_ERR(cb->rdma_mr);
650 				goto bail;
651 			}
652 			DEBUG_LOG(cb, "rdma buf dma_addr %llx size %d mr rkey 0x%x\n",
653 				buf.addr, (int)buf.size, cb->rdma_mr->rkey);
654 			break;
655 		default:
656 			ret = -EINVAL;
657 			goto bail;
658 			break;
659 		}
660 	}
661 
662 	if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
663 
664 		cb->start_buf = kmalloc(cb->size, GFP_KERNEL);
665 		if (!cb->start_buf) {
666 			DEBUG_LOG(cb, "start_buf malloc failed\n");
667 			ret = -ENOMEM;
668 			goto bail;
669 		}
670 
671 		cb->start_dma_addr = dma_map_single(cb->pd->device->dma_device,
672 						   cb->start_buf, cb->size,
673 						   DMA_BIDIRECTIONAL);
674 		pci_unmap_addr_set(cb, start_mapping, cb->start_dma_addr);
675 
676 		if (cb->mem == MR || cb->mem == MW) {
677 			unsigned flags = IB_ACCESS_REMOTE_READ;
678 
679 			if (cb->wlat || cb->rlat || cb->bw)
680 				flags |= IB_ACCESS_REMOTE_WRITE;
681 
682 			buf.addr = cb->start_dma_addr;
683 			buf.size = cb->size;
684 			DEBUG_LOG(cb, "start buf dma_addr %llx size %d\n",
685 				buf.addr, (int)buf.size);
686 			iovbase = cb->start_dma_addr;
687 			cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
688 					     flags,
689 					     &iovbase);
690 
691 			if (IS_ERR(cb->start_mr)) {
692 				DEBUG_LOG(cb, "start_buf reg_mr failed\n");
693 				ret = PTR_ERR(cb->start_mr);
694 				goto bail;
695 			}
696 		}
697 	}
698 
699 	krping_setup_wr(cb);
700 	DEBUG_LOG(cb, "allocated & registered buffers...\n");
701 	return 0;
702 bail:
703 	if (cb->fastreg_mr && !IS_ERR(cb->fastreg_mr))
704 		ib_dereg_mr(cb->fastreg_mr);
705 	if (cb->mw && !IS_ERR(cb->mw))
706 		ib_dealloc_mw(cb->mw);
707 	if (cb->rdma_mr && !IS_ERR(cb->rdma_mr))
708 		ib_dereg_mr(cb->rdma_mr);
709 	if (cb->page_list && !IS_ERR(cb->page_list))
710 		ib_free_fast_reg_page_list(cb->page_list);
711 	if (cb->dma_mr && !IS_ERR(cb->dma_mr))
712 		ib_dereg_mr(cb->dma_mr);
713 	if (cb->recv_mr && !IS_ERR(cb->recv_mr))
714 		ib_dereg_mr(cb->recv_mr);
715 	if (cb->send_mr && !IS_ERR(cb->send_mr))
716 		ib_dereg_mr(cb->send_mr);
717 	if (cb->rdma_buf)
718 		kfree(cb->rdma_buf);
719 	if (cb->start_buf)
720 		kfree(cb->start_buf);
721 	return ret;
722 }
723 
724 static void krping_free_buffers(struct krping_cb *cb)
725 {
726 	DEBUG_LOG(cb, "krping_free_buffers called on cb %p\n", cb);
727 
728 	if (cb->dma_mr)
729 		ib_dereg_mr(cb->dma_mr);
730 	if (cb->send_mr)
731 		ib_dereg_mr(cb->send_mr);
732 	if (cb->recv_mr)
733 		ib_dereg_mr(cb->recv_mr);
734 	if (cb->rdma_mr)
735 		ib_dereg_mr(cb->rdma_mr);
736 	if (cb->start_mr)
737 		ib_dereg_mr(cb->start_mr);
738 	if (cb->fastreg_mr)
739 		ib_dereg_mr(cb->fastreg_mr);
740 	if (cb->mw)
741 		ib_dealloc_mw(cb->mw);
742 
743 	dma_unmap_single(cb->pd->device->dma_device,
744 			 pci_unmap_addr(cb, recv_mapping),
745 			 sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
746 	dma_unmap_single(cb->pd->device->dma_device,
747 			 pci_unmap_addr(cb, send_mapping),
748 			 sizeof(cb->send_buf), DMA_BIDIRECTIONAL);
749 	dma_unmap_single(cb->pd->device->dma_device,
750 			 pci_unmap_addr(cb, rdma_mapping),
751 			 cb->size, DMA_BIDIRECTIONAL);
752 	kfree(cb->rdma_buf);
753 	if (cb->start_buf) {
754 		dma_unmap_single(cb->pd->device->dma_device,
755 			 pci_unmap_addr(cb, start_mapping),
756 			 cb->size, DMA_BIDIRECTIONAL);
757 		kfree(cb->start_buf);
758 	}
759 }
760 
761 static int krping_create_qp(struct krping_cb *cb)
762 {
763 	struct ib_qp_init_attr init_attr;
764 	int ret;
765 
766 	memset(&init_attr, 0, sizeof(init_attr));
767 	init_attr.cap.max_send_wr = cb->txdepth;
768 	init_attr.cap.max_recv_wr = 2;
769 	init_attr.cap.max_recv_sge = 1;
770 	init_attr.cap.max_send_sge = 1;
771 	init_attr.qp_type = IB_QPT_RC;
772 	init_attr.send_cq = cb->cq;
773 	init_attr.recv_cq = cb->cq;
774 	init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
775 
776 	if (cb->server) {
777 		ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr);
778 		if (!ret)
779 			cb->qp = cb->child_cm_id->qp;
780 	} else {
781 		ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr);
782 		if (!ret)
783 			cb->qp = cb->cm_id->qp;
784 	}
785 
786 	return ret;
787 }
788 
789 static void krping_free_qp(struct krping_cb *cb)
790 {
791 	ib_destroy_qp(cb->qp);
792 	ib_destroy_cq(cb->cq);
793 	ib_dealloc_pd(cb->pd);
794 }
795 
796 static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id)
797 {
798 	int ret;
799 	cb->pd = ib_alloc_pd(cm_id->device);
800 	if (IS_ERR(cb->pd)) {
801 		PRINTF(cb, "ib_alloc_pd failed\n");
802 		return PTR_ERR(cb->pd);
803 	}
804 	DEBUG_LOG(cb, "created pd %p\n", cb->pd);
805 
806 	strlcpy(cb->stats.name, cb->pd->device->name, sizeof(cb->stats.name));
807 
808 	cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL,
809 			      cb, cb->txdepth * 2, 0);
810 	if (IS_ERR(cb->cq)) {
811 		PRINTF(cb, "ib_create_cq failed\n");
812 		ret = PTR_ERR(cb->cq);
813 		goto err1;
814 	}
815 	DEBUG_LOG(cb, "created cq %p\n", cb->cq);
816 
817 	if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) {
818 		ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
819 		if (ret) {
820 			PRINTF(cb, "ib_req_notify_cq failed\n");
821 			goto err2;
822 		}
823 	}
824 
825 	ret = krping_create_qp(cb);
826 	if (ret) {
827 		PRINTF(cb, "krping_create_qp failed: %d\n", ret);
828 		goto err2;
829 	}
830 	DEBUG_LOG(cb, "created qp %p\n", cb->qp);
831 	return 0;
832 err2:
833 	ib_destroy_cq(cb->cq);
834 err1:
835 	ib_dealloc_pd(cb->pd);
836 	return ret;
837 }
838 
839 /*
840  * return the (possibly rebound) rkey for the rdma buffer.
841  * FASTREG mode: invalidate and rebind via fastreg wr.
842  * MW mode: rebind the MW.
843  * other modes: just return the mr rkey.
844  */
845 static u32 krping_rdma_rkey(struct krping_cb *cb, u64 buf, int post_inv)
846 {
847 	u32 rkey = 0xffffffff;
848 	u64 p;
849 	struct ib_send_wr *bad_wr;
850 	int i;
851 	int ret;
852 
853 	switch (cb->mem) {
854 	case FASTREG:
855 		cb->invalidate_wr.ex.invalidate_rkey = cb->fastreg_mr->rkey;
856 
857 		/*
858 		 * Update the fastreg key.
859 		 */
860 		ib_update_fast_reg_key(cb->fastreg_mr, ++cb->key);
861 		cb->fastreg_wr.wr.fast_reg.rkey = cb->fastreg_mr->rkey;
862 
863 		/*
864 		 * Update the fastreg WR with new buf info.
865 		 */
866 		if (buf == (u64)cb->start_dma_addr)
867 			cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_READ;
868 		else
869 			cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
870 		cb->fastreg_wr.wr.fast_reg.iova_start = buf;
871 		p = (u64)(buf & PAGE_MASK);
872 		for (i=0; i < cb->fastreg_wr.wr.fast_reg.page_list_len;
873 		     i++, p += PAGE_SIZE) {
874 			cb->page_list->page_list[i] = p;
875 			DEBUG_LOG(cb, "page_list[%d] 0x%llx\n", i, p);
876 		}
877 
878 		DEBUG_LOG(cb, "post_inv = %d, fastreg new rkey 0x%x shift %u len %u"
879 			" iova_start %llx page_list_len %u\n",
880 			post_inv,
881 			cb->fastreg_wr.wr.fast_reg.rkey,
882 			cb->fastreg_wr.wr.fast_reg.page_shift,
883 			cb->fastreg_wr.wr.fast_reg.length,
884 			cb->fastreg_wr.wr.fast_reg.iova_start,
885 			cb->fastreg_wr.wr.fast_reg.page_list_len);
886 
887 		if (post_inv)
888 			ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr);
889 		else
890 			ret = ib_post_send(cb->qp, &cb->fastreg_wr, &bad_wr);
891 		if (ret) {
892 			PRINTF(cb, "post send error %d\n", ret);
893 			cb->state = ERROR;
894 		}
895 		rkey = cb->fastreg_mr->rkey;
896 		break;
897 	case MW:
898 		/*
899 		 * Update the MW with new buf info.
900 		 */
901 		if (buf == (u64)cb->start_dma_addr) {
902 			cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_READ;
903 			cb->bind_attr.mr = cb->start_mr;
904 		} else {
905 			cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_WRITE;
906 			cb->bind_attr.mr = cb->rdma_mr;
907 		}
908 		cb->bind_attr.addr = buf;
909 		DEBUG_LOG(cb, "binding mw rkey 0x%x to buf %llx mr rkey 0x%x\n",
910 			cb->mw->rkey, buf, cb->bind_attr.mr->rkey);
911 		ret = ib_bind_mw(cb->qp, cb->mw, &cb->bind_attr);
912 		if (ret) {
913 			PRINTF(cb, "bind mw error %d\n", ret);
914 			cb->state = ERROR;
915 		} else
916 			rkey = cb->mw->rkey;
917 		break;
918 	case MR:
919 		if (buf == (u64)cb->start_dma_addr)
920 			rkey = cb->start_mr->rkey;
921 		else
922 			rkey = cb->rdma_mr->rkey;
923 		break;
924 	case DMA:
925 		rkey = cb->dma_mr->rkey;
926 		break;
927 	default:
928 		PRINTF(cb, "%s:%d case ERROR\n", __func__, __LINE__);
929 		cb->state = ERROR;
930 		break;
931 	}
932 	return rkey;
933 }
934 
935 static void krping_format_send(struct krping_cb *cb, u64 buf)
936 {
937 	struct krping_rdma_info *info = &cb->send_buf;
938 	u32 rkey;
939 
940 	/*
941 	 * Client side will do fastreg or mw bind before
942 	 * advertising the rdma buffer.  Server side
943 	 * sends have no data.
944 	 */
945 	if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
946 		rkey = krping_rdma_rkey(cb, buf, !cb->server_invalidate);
947 		info->buf = htonll(buf);
948 		info->rkey = htonl(rkey);
949 		info->size = htonl(cb->size);
950 		DEBUG_LOG(cb, "RDMA addr %llx rkey %x len %d\n",
951 			  (unsigned long long)buf, rkey, cb->size);
952 	}
953 }
954 
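/*
 * Default ping/pong server loop: wait for the client's source
 * advertisement, RDMA READ the ping data, send "go ahead", wait for
 * the sink advertisement, RDMA WRITE the data back and send another
 * "go ahead".  Uses SEND_WITH_INV when the client requested
 * server-side invalidation.
 */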
955 static void krping_test_server(struct krping_cb *cb)
956 {
957 	struct ib_send_wr *bad_wr, inv;
958 	int ret;
959 
960 	while (1) {
961 		/* Wait for client's Start STAG/TO/Len */
962 		wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_ADV);
963 		if (cb->state != RDMA_READ_ADV) {
964 			PRINTF(cb, "wait for RDMA_READ_ADV state %d\n",
965 				cb->state);
966 			break;
967 		}
968 
969 		DEBUG_LOG(cb, "server received sink adv\n");
970 
971 		cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
972 		cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
973 		cb->rdma_sq_wr.sg_list->length = cb->remote_len;
974 		cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 1);
975 		cb->rdma_sq_wr.next = NULL;
976 
977 		/* Issue RDMA Read. */
978 		if (cb->read_inv)
979 			cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
980 		else {
981 
982 			cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
983 			if (cb->mem == FASTREG) {
984 				/*
985 				 * Immediately follow the read with a
986 				 * fenced LOCAL_INV.
987 				 */
988 				cb->rdma_sq_wr.next = &inv;
989 				memset(&inv, 0, sizeof inv);
990 				inv.opcode = IB_WR_LOCAL_INV;
991 				inv.ex.invalidate_rkey = cb->fastreg_mr->rkey;
992 				inv.send_flags = IB_SEND_FENCE;
993 			}
994 		}
995 
996 		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
997 		if (ret) {
998 			PRINTF(cb, "post send error %d\n", ret);
999 			break;
1000 		}
1001 		cb->rdma_sq_wr.next = NULL;
1002 
1003 		DEBUG_LOG(cb, "server posted rdma read req \n");
1004 
1005 		/* Wait for read completion */
1006 		wait_event_interruptible(cb->sem,
1007 					 cb->state >= RDMA_READ_COMPLETE);
1008 		if (cb->state != RDMA_READ_COMPLETE) {
1009 			PRINTF(cb,
1010 			       "wait for RDMA_READ_COMPLETE state %d\n",
1011 			       cb->state);
1012 			break;
1013 		}
1014 		DEBUG_LOG(cb, "server received read complete\n");
1015 
1016 		/* Display data in recv buf */
1017 		if (cb->verbose)
1018 			PRINTF(cb, "server ping data: %s\n",
1019 				cb->rdma_buf);
1020 
1021 		/* Tell client to continue */
1022 		if (cb->server && cb->server_invalidate) {
1023 			cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
1024 			cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
1025 			DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey);
1026 		}
1027 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1028 		if (ret) {
1029 			PRINTF(cb, "post send error %d\n", ret);
1030 			break;
1031 		}
1032 		DEBUG_LOG(cb, "server posted go ahead\n");
1033 
1034 		/* Wait for client's RDMA STAG/TO/Len */
1035 		wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
1036 		if (cb->state != RDMA_WRITE_ADV) {
1037 			PRINTF(cb,
1038 			       "wait for RDMA_WRITE_ADV state %d\n",
1039 			       cb->state);
1040 			break;
1041 		}
1042 		DEBUG_LOG(cb, "server received sink adv\n");
1043 
1044 		/* RDMA Write echo data */
1045 		cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1046 		cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1047 		cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1048 		cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1;
1049 		if (cb->local_dma_lkey)
1050 			cb->rdma_sgl.lkey = cb->qp->device->local_dma_lkey;
1051 		else
1052 			cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 0);
1053 
1054 		DEBUG_LOG(cb, "rdma write from lkey %x laddr %llx len %d\n",
1055 			  cb->rdma_sq_wr.sg_list->lkey,
1056 			  (unsigned long long)cb->rdma_sq_wr.sg_list->addr,
1057 			  cb->rdma_sq_wr.sg_list->length);
1058 
1059 		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
1060 		if (ret) {
1061 			PRINTF(cb, "post send error %d\n", ret);
1062 			break;
1063 		}
1064 
1065 		/* Wait for completion */
1066 		ret = wait_event_interruptible(cb->sem, cb->state >=
1067 							 RDMA_WRITE_COMPLETE);
1068 		if (cb->state != RDMA_WRITE_COMPLETE) {
1069 			PRINTF(cb,
1070 			       "wait for RDMA_WRITE_COMPLETE state %d\n",
1071 			       cb->state);
1072 			break;
1073 		}
1074 		DEBUG_LOG(cb, "server rdma write complete \n");
1075 
1076 		cb->state = CONNECTED;
1077 
1078 		/* Tell client to begin again */
1079 		if (cb->server && cb->server_invalidate) {
1080 			cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
1081 			cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
1082 			DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey);
1083 		}
1084 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1085 		if (ret) {
1086 			PRINTF(cb, "post send error %d\n", ret);
1087 			break;
1088 		}
1089 		DEBUG_LOG(cb, "server posted go ahead\n");
1090 	}
1091 }
1092 
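/*
 * Read-latency test: post cb->count RDMA READs one at a time and time
 * the whole run with microtime().  With the poll option the CQ is
 * polled directly; otherwise the thread sleeps on cb->sem and the
 * completion handler re-arms CQ notification.
 */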
1093 static void rlat_test(struct krping_cb *cb)
1094 {
1095 	int scnt;
1096 	int iters = cb->count;
1097 	struct timeval start_tv, stop_tv;
1098 	int ret;
1099 	struct ib_wc wc;
1100 	struct ib_send_wr *bad_wr;
1101 	int ne;
1102 
1103 	scnt = 0;
1104 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
1105 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1106 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1107 	cb->rdma_sq_wr.sg_list->length = cb->size;
1108 
1109 	microtime(&start_tv);
1110 	if (!cb->poll) {
1111 		cb->state = RDMA_READ_ADV;
1112 		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
1113 	}
1114 	while (scnt < iters) {
1115 
1116 		cb->state = RDMA_READ_ADV;
1117 		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
1118 		if (ret) {
1119 			PRINTF(cb,
1120 				"Couldn't post send: ret=%d scnt %d\n",
1121 				ret, scnt);
1122 			return;
1123 		}
1124 
1125 		do {
1126 			if (!cb->poll) {
1127 				wait_event_interruptible(cb->sem,
1128 					cb->state != RDMA_READ_ADV);
1129 				if (cb->state == RDMA_READ_COMPLETE) {
1130 					ne = 1;
1131 					ib_req_notify_cq(cb->cq,
1132 						IB_CQ_NEXT_COMP);
1133 				} else {
1134 					ne = -1;
1135 				}
1136 			} else
1137 				ne = ib_poll_cq(cb->cq, 1, &wc);
1138 			if (cb->state == ERROR) {
1139 				PRINTF(cb,
1140 					"state == ERROR...bailing scnt %d\n",
1141 					scnt);
1142 				return;
1143 			}
1144 		} while (ne == 0);
1145 
1146 		if (ne < 0) {
1147 			PRINTF(cb, "poll CQ failed %d\n", ne);
1148 			return;
1149 		}
1150 		if (cb->poll && wc.status != IB_WC_SUCCESS) {
1151 			PRINTF(cb, "Completion with error at %s:\n",
1152 				cb->server ? "server" : "client");
1153 			PRINTF(cb, "Failed status %d: wr_id %d\n",
1154 				wc.status, (int) wc.wr_id);
1155 			return;
1156 		}
1157 		++scnt;
1158 	}
1159 	microtime(&stop_tv);
1160 
1161 	if (stop_tv.tv_usec < start_tv.tv_usec) {
1162 		stop_tv.tv_usec += 1000000;
1163 		stop_tv.tv_sec  -= 1;
1164 	}
1165 
1166 	PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d\n",
1167 		stop_tv.tv_sec - start_tv.tv_sec,
1168 		stop_tv.tv_usec - start_tv.tv_usec,
1169 		scnt, cb->size);
1170 }
1171 
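/*
 * Write-latency test: each side RDMA WRITEs its buffer (whose first
 * byte carries a sequence number) to the peer and spins until the
 * peer's write shows up in its own start_buf.  Cycle counters from
 * get_cycles() record post and poll overhead for the first
 * cycle_iters iterations.
 */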
1172 static void wlat_test(struct krping_cb *cb)
1173 {
1174 	int ccnt, scnt, rcnt;
1175 	int iters=cb->count;
1176 	volatile char *poll_buf = (char *) cb->start_buf;
1177 	char *buf = (char *)cb->rdma_buf;
1178 	struct timeval start_tv, stop_tv;
1179 	cycles_t *post_cycles_start, *post_cycles_stop;
1180 	cycles_t *poll_cycles_start, *poll_cycles_stop;
1181 	cycles_t *last_poll_cycles_start;
1182 	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1183 	int i;
1184 	int cycle_iters = 1000;
1185 
1186 	ccnt = 0;
1187 	scnt = 0;
1188 	rcnt = 0;
1189 
1190 	post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1191 	if (!post_cycles_start) {
1192 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1193 		return;
1194 	}
1195 	post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1196 	if (!post_cycles_stop) {
1197 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1198 		return;
1199 	}
1200 	poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1201 	if (!poll_cycles_start) {
1202 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1203 		return;
1204 	}
1205 	poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1206 	if (!poll_cycles_stop) {
1207 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1208 		return;
1209 	}
1210 	last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t),
1211 		GFP_KERNEL);
1212 	if (!last_poll_cycles_start) {
1213 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1214 		return;
1215 	}
1216 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1217 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1218 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1219 	cb->rdma_sq_wr.sg_list->length = cb->size;
1220 
1221 	if (cycle_iters > iters)
1222 		cycle_iters = iters;
1223 	microtime(&start_tv);
1224 	while (scnt < iters || ccnt < iters || rcnt < iters) {
1225 
1226 		/* Wait till buffer changes. */
1227 		if (rcnt < iters && !(scnt < 1 && !cb->server)) {
1228 			++rcnt;
1229 			while (*poll_buf != (char)rcnt) {
1230 				if (cb->state == ERROR) {
1231 					PRINTF(cb,
1232 						"state = ERROR, bailing\n");
1233 					return;
1234 				}
1235 			}
1236 		}
1237 
1238 		if (scnt < iters) {
1239 			struct ib_send_wr *bad_wr;
1240 
1241 			*buf = (char)scnt+1;
1242 			if (scnt < cycle_iters)
1243 				post_cycles_start[scnt] = get_cycles();
1244 			if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1245 				PRINTF(cb,
1246 					"Couldn't post send: scnt=%d\n",
1247 					scnt);
1248 				return;
1249 			}
1250 			if (scnt < cycle_iters)
1251 				post_cycles_stop[scnt] = get_cycles();
1252 			scnt++;
1253 		}
1254 
1255 		if (ccnt < iters) {
1256 			struct ib_wc wc;
1257 			int ne;
1258 
1259 			if (ccnt < cycle_iters)
1260 				poll_cycles_start[ccnt] = get_cycles();
1261 			do {
1262 				if (ccnt < cycle_iters)
1263 					last_poll_cycles_start[ccnt] =
1264 						get_cycles();
1265 				ne = ib_poll_cq(cb->cq, 1, &wc);
1266 			} while (ne == 0);
1267 			if (ccnt < cycle_iters)
1268 				poll_cycles_stop[ccnt] = get_cycles();
1269 			++ccnt;
1270 
1271 			if (ne < 0) {
1272 				PRINTF(cb, "poll CQ failed %d\n", ne);
1273 				return;
1274 			}
1275 			if (wc.status != IB_WC_SUCCESS) {
1276 				PRINTF(cb,
1277 					"Completion with error at %s:\n",
1278 					cb->server ? "server" : "client");
1279 				PRINTF(cb,
1280 					"Failed status %d: wr_id %d\n",
1281 					wc.status, (int) wc.wr_id);
1282 				PRINTF(cb,
1283 					"scnt=%d, rcnt=%d, ccnt=%d\n",
1284 					scnt, rcnt, ccnt);
1285 				return;
1286 			}
1287 		}
1288 	}
1289 	microtime(&stop_tv);
1290 
1291 	if (stop_tv.tv_usec < start_tv.tv_usec) {
1292 		stop_tv.tv_usec += 1000000;
1293 		stop_tv.tv_sec  -= 1;
1294 	}
1295 
1296 	for (i=0; i < cycle_iters; i++) {
1297 		sum_post += post_cycles_stop[i] - post_cycles_start[i];
1298 		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1299 		sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
1300 	}
1301 	PRINTF(cb,
1302 		"delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
1303 		" sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1304 		stop_tv.tv_sec - start_tv.tv_sec,
1305 		stop_tv.tv_usec - start_tv.tv_usec,
1306 		scnt, cb->size, cycle_iters,
1307 		(unsigned long long)sum_post, (unsigned long long)sum_poll,
1308 		(unsigned long long)sum_last_poll);
1309 	kfree(post_cycles_start);
1310 	kfree(post_cycles_stop);
1311 	kfree(poll_cycles_start);
1312 	kfree(poll_cycles_stop);
1313 	kfree(last_poll_cycles_start);
1314 }
1315 
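/*
 * Bandwidth test: stream RDMA WRITEs of cb->size bytes, keeping up to
 * txdepth posts outstanding, and time the run; the same cycle counters
 * as wlat_test record post/poll overhead.
 */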
1316 static void bw_test(struct krping_cb *cb)
1317 {
1318 	int ccnt, scnt, rcnt;
1319 	int iters=cb->count;
1320 	struct timeval start_tv, stop_tv;
1321 	cycles_t *post_cycles_start, *post_cycles_stop;
1322 	cycles_t *poll_cycles_start, *poll_cycles_stop;
1323 	cycles_t *last_poll_cycles_start;
1324 	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1325 	int i;
1326 	int cycle_iters = 1000;
1327 
1328 	ccnt = 0;
1329 	scnt = 0;
1330 	rcnt = 0;
1331 
1332 	post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1333 	if (!post_cycles_start) {
1334 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1335 		return;
1336 	}
1337 	post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1338 	if (!post_cycles_stop) {
1339 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1340 		return;
1341 	}
1342 	poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1343 	if (!poll_cycles_start) {
1344 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1345 		return;
1346 	}
1347 	poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1348 	if (!poll_cycles_stop) {
1349 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1350 		return;
1351 	}
1352 	last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t),
1353 		GFP_KERNEL);
1354 	if (!last_poll_cycles_start) {
1355 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1356 		return;
1357 	}
1358 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1359 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1360 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1361 	cb->rdma_sq_wr.sg_list->length = cb->size;
1362 
1363 	if (cycle_iters > iters)
1364 		cycle_iters = iters;
1365 	microtime(&start_tv);
1366 	while (scnt < iters || ccnt < iters) {
1367 
1368 		while (scnt < iters && scnt - ccnt < cb->txdepth) {
1369 			struct ib_send_wr *bad_wr;
1370 
1371 			if (scnt < cycle_iters)
1372 				post_cycles_start[scnt] = get_cycles();
1373 			if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1374 				PRINTF(cb,
1375 					"Couldn't post send: scnt=%d\n",
1376 					scnt);
1377 				return;
1378 			}
1379 			if (scnt < cycle_iters)
1380 				post_cycles_stop[scnt] = get_cycles();
1381 			++scnt;
1382 		}
1383 
1384 		if (ccnt < iters) {
1385 			int ne;
1386 			struct ib_wc wc;
1387 
1388 			if (ccnt < cycle_iters)
1389 				poll_cycles_start[ccnt] = get_cycles();
1390 			do {
1391 				if (ccnt < cycle_iters)
1392 					last_poll_cycles_start[ccnt] =
1393 						get_cycles();
1394 				ne = ib_poll_cq(cb->cq, 1, &wc);
1395 			} while (ne == 0);
1396 			if (ccnt < cycle_iters)
1397 				poll_cycles_stop[ccnt] = get_cycles();
1398 			ccnt += 1;
1399 
1400 			if (ne < 0) {
1401 				PRINTF(cb, "poll CQ failed %d\n", ne);
1402 				return;
1403 			}
1404 			if (wc.status != IB_WC_SUCCESS) {
1405 				PRINTF(cb,
1406 					"Completion with error at %s:\n",
1407 					cb->server ? "server" : "client");
1408 				PRINTF(cb,
1409 					"Failed status %d: wr_id %d\n",
1410 					wc.status, (int) wc.wr_id);
1411 				return;
1412 			}
1413 		}
1414 	}
1415 	microtime(&stop_tv);
1416 
1417 	if (stop_tv.tv_usec < start_tv.tv_usec) {
1418 		stop_tv.tv_usec += 1000000;
1419 		stop_tv.tv_sec  -= 1;
1420 	}
1421 
1422 	for (i=0; i < cycle_iters; i++) {
1423 		sum_post += post_cycles_stop[i] - post_cycles_start[i];
1424 		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1425 		sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
1426 	}
1427 	PRINTF(cb,
1428 		"delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
1429 		" sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1430 		stop_tv.tv_sec - start_tv.tv_sec,
1431 		stop_tv.tv_usec - start_tv.tv_usec,
1432 		scnt, cb->size, cycle_iters,
1433 		(unsigned long long)sum_post, (unsigned long long)sum_poll,
1434 		(unsigned long long)sum_last_poll);
1435 	kfree(post_cycles_start);
1436 	kfree(post_cycles_stop);
1437 	kfree(poll_cycles_start);
1438 	kfree(poll_cycles_stop);
1439 	kfree(last_poll_cycles_start);
1440 }
1441 
1442 static void krping_rlat_test_server(struct krping_cb *cb)
1443 {
1444 	struct ib_send_wr *bad_wr;
1445 	struct ib_wc wc;
1446 	int ret;
1447 
1448 	/* Spin waiting for client's Start STAG/TO/Len */
1449 	while (cb->state < RDMA_READ_ADV) {
1450 		krping_cq_event_handler(cb->cq, cb);
1451 	}
1452 
1453 	/* Send STAG/TO/Len to client */
1454 	krping_format_send(cb, cb->start_dma_addr);
1455 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1456 	if (ret) {
1457 		PRINTF(cb, "post send error %d\n", ret);
1458 		return;
1459 	}
1460 
1461 	/* Spin waiting for send completion */
1462 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
1463 	if (ret < 0) {
1464 		PRINTF(cb, "poll error %d\n", ret);
1465 		return;
1466 	}
1467 	if (wc.status) {
1468 		PRINTF(cb, "send completion error %d\n", wc.status);
1469 		return;
1470 	}
1471 
1472 	wait_event_interruptible(cb->sem, cb->state == ERROR);
1473 }
1474 
1475 static void krping_wlat_test_server(struct krping_cb *cb)
1476 {
1477 	struct ib_send_wr *bad_wr;
1478 	struct ib_wc wc;
1479 	int ret;
1480 
1481 	/* Spin waiting for client's Start STAG/TO/Len */
1482 	while (cb->state < RDMA_READ_ADV) {
1483 		krping_cq_event_handler(cb->cq, cb);
1484 	}
1485 
1486 	/* Send STAG/TO/Len to client */
1487 	krping_format_send(cb, cb->start_dma_addr);
1488 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1489 	if (ret) {
1490 		PRINTF(cb, "post send error %d\n", ret);
1491 		return;
1492 	}
1493 
1494 	/* Spin waiting for send completion */
1495 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
1496 	if (ret < 0) {
1497 		PRINTF(cb, "poll error %d\n", ret);
1498 		return;
1499 	}
1500 	if (wc.status) {
1501 		PRINTF(cb, "send completion error %d\n", wc.status);
1502 		return;
1503 	}
1504 
1505 	wlat_test(cb);
1506 	wait_event_interruptible(cb->sem, cb->state == ERROR);
1507 }
1508 
1509 static void krping_bw_test_server(struct krping_cb *cb)
1510 {
1511 	struct ib_send_wr *bad_wr;
1512 	struct ib_wc wc;
1513 	int ret;
1514 
1515 	/* Spin waiting for client's Start STAG/TO/Len */
1516 	while (cb->state < RDMA_READ_ADV) {
1517 		krping_cq_event_handler(cb->cq, cb);
1518 	}
1519 
1520 	/* Send STAG/TO/Len to client */
1521 	krping_format_send(cb, cb->start_dma_addr);
1522 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1523 	if (ret) {
1524 		PRINTF(cb, "post send error %d\n", ret);
1525 		return;
1526 	}
1527 
1528 	/* Spin waiting for send completion */
1529 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
1530 	if (ret < 0) {
1531 		PRINTF(cb, "poll error %d\n", ret);
1532 		return;
1533 	}
1534 	if (wc.status) {
1535 		PRINTF(cb, "send completion error %d\n", wc.status);
1536 		return;
1537 	}
1538 
1539 	if (cb->duplex)
1540 		bw_test(cb);
1541 	wait_event_interruptible(cb->sem, cb->state == ERROR);
1542 }
1543 
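/*
 * Check that the connecting device advertises the memory management
 * extensions needed for fast registration before running a
 * FASTREG-mode server.
 */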
1544 static int fastreg_supported(struct krping_cb *cb)
1545 {
1546 	struct ib_device *dev = cb->child_cm_id->device;
1547 	struct ib_device_attr attr;
1548 	int ret;
1549 
1550 	ret = ib_query_device(dev, &attr);
1551 	if (ret) {
1552 		PRINTF(cb, "ib_query_device failed ret %d\n", ret);
1553 		return 0;
1554 	}
1555 	if (!(attr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) {
1556 		PRINTF(cb, "Fastreg not supported - device_cap_flags 0x%x\n",
1557 		    attr.device_cap_flags);
1558 		return 0;
1559 	}
1560 	DEBUG_LOG(cb, "Fastreg supported - device_cap_flags 0x%x\n",
1561 		attr.device_cap_flags);
1562 	return 1;
1563 }
1564 
1565 static int krping_bind_server(struct krping_cb *cb)
1566 {
1567 	struct sockaddr_in sin;
1568 	int ret;
1569 
1570 	memset(&sin, 0, sizeof(sin));
1571 	sin.sin_len = sizeof sin;
1572 	sin.sin_family = AF_INET;
1573 	sin.sin_addr.s_addr = cb->addr.s_addr;
1574 	sin.sin_port = cb->port;
1575 
1576 	ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin);
1577 	if (ret) {
1578 		PRINTF(cb, "rdma_bind_addr error %d\n", ret);
1579 		return ret;
1580 	}
1581 	DEBUG_LOG(cb, "rdma_bind_addr successful\n");
1582 
1583 	DEBUG_LOG(cb, "rdma_listen\n");
1584 	ret = rdma_listen(cb->cm_id, 3);
1585 	if (ret) {
1586 		PRINTF(cb, "rdma_listen failed: %d\n", ret);
1587 		return ret;
1588 	}
1589 
1590 	wait_event_interruptible(cb->sem, cb->state >= CONNECT_REQUEST);
1591 	if (cb->state != CONNECT_REQUEST) {
1592 		PRINTF(cb, "wait for CONNECT_REQUEST state %d\n",
1593 			cb->state);
1594 		return -1;
1595 	}
1596 
1597 	if (cb->mem == FASTREG && !fastreg_supported(cb))
1598 		return -EINVAL;
1599 
1600 	return 0;
1601 }
1602 
1603 static void krping_run_server(struct krping_cb *cb)
1604 {
1605 	struct ib_recv_wr *bad_wr;
1606 	int ret;
1607 
1608 	ret = krping_bind_server(cb);
1609 	if (ret)
1610 		return;
1611 
1612 	ret = krping_setup_qp(cb, cb->child_cm_id);
1613 	if (ret) {
1614 		PRINTF(cb, "setup_qp failed: %d\n", ret);
1615 		goto err0;
1616 	}
1617 
1618 	ret = krping_setup_buffers(cb);
1619 	if (ret) {
1620 		PRINTF(cb, "krping_setup_buffers failed: %d\n", ret);
1621 		goto err1;
1622 	}
1623 
1624 	ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1625 	if (ret) {
1626 		PRINTF(cb, "ib_post_recv failed: %d\n", ret);
1627 		goto err2;
1628 	}
1629 
1630 	ret = krping_accept(cb);
1631 	if (ret) {
1632 		PRINTF(cb, "connect error %d\n", ret);
1633 		goto err2;
1634 	}
1635 
1636 	if (cb->wlat)
1637 		krping_wlat_test_server(cb);
1638 	else if (cb->rlat)
1639 		krping_rlat_test_server(cb);
1640 	else if (cb->bw)
1641 		krping_bw_test_server(cb);
1642 	else
1643 		krping_test_server(cb);
1644 	rdma_disconnect(cb->child_cm_id);
1645 err2:
1646 	krping_free_buffers(cb);
1647 err1:
1648 	krping_free_qp(cb);
1649 err0:
1650 	rdma_destroy_id(cb->child_cm_id);
1651 }
1652 
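/*
 * Default ping/pong client loop: fill start_buf with an ascii pattern,
 * advertise it as the read source, wait for the server's ack,
 * advertise rdma_buf as the write sink, wait for the write to
 * complete, then optionally validate and print the echoed data.
 */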
1653 static void krping_test_client(struct krping_cb *cb)
1654 {
1655 	int ping, start, cc, i, ret;
1656 	struct ib_send_wr *bad_wr;
1657 	unsigned char c;
1658 
1659 	start = 65;
1660 	for (ping = 0; !cb->count || ping < cb->count; ping++) {
1661 		cb->state = RDMA_READ_ADV;
1662 
1663 		/* Put some ascii text in the buffer. */
1664 		cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping);
1665 		for (i = cc, c = start; i < cb->size; i++) {
1666 			cb->start_buf[i] = c;
1667 			c++;
1668 			if (c > 122)
1669 				c = 65;
1670 		}
1671 		start++;
1672 		if (start > 122)
1673 			start = 65;
1674 		cb->start_buf[cb->size - 1] = 0;
1675 
1676 		krping_format_send(cb, cb->start_dma_addr);
1677 		if (cb->state == ERROR) {
1678 			PRINTF(cb, "krping_format_send failed\n");
1679 			break;
1680 		}
1681 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1682 		if (ret) {
1683 			PRINTF(cb, "post send error %d\n", ret);
1684 			break;
1685 		}
1686 
1687 		/* Wait for server to ACK */
1688 		wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
1689 		if (cb->state != RDMA_WRITE_ADV) {
1690 			PRINTF(cb,
1691 			       "wait for RDMA_WRITE_ADV state %d\n",
1692 			       cb->state);
1693 			break;
1694 		}
1695 
1696 		krping_format_send(cb, cb->rdma_dma_addr);
1697 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1698 		if (ret) {
1699 			PRINTF(cb, "post send error %d\n", ret);
1700 			break;
1701 		}
1702 
1703 		/* Wait for the server to say the RDMA Write is complete. */
1704 		wait_event_interruptible(cb->sem,
1705 					 cb->state >= RDMA_WRITE_COMPLETE);
1706 		if (cb->state != RDMA_WRITE_COMPLETE) {
1707 			PRINTF(cb,
1708 			       "wait for RDMA_WRITE_COMPLETE state %d\n",
1709 			       cb->state);
1710 			break;
1711 		}
1712 
1713 		if (cb->validate)
1714 			if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) {
1715 				PRINTF(cb, "data mismatch!\n");
1716 				break;
1717 			}
1718 
1719 		if (cb->verbose)
1720 			PRINTF(cb, "ping data: %s\n", cb->rdma_buf);
1721 #ifdef SLOW_KRPING
1722 		wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
1723 #endif
1724 	}
1725 }
1726 
1727 static void krping_rlat_test_client(struct krping_cb *cb)
1728 {
1729 	struct ib_send_wr *bad_wr;
1730 	struct ib_wc wc;
1731 	int ret;
1732 
1733 	cb->state = RDMA_READ_ADV;
1734 
1735 	/* Send STAG/TO/Len to client */
1736 	krping_format_send(cb, cb->start_dma_addr);
1737 	if (cb->state == ERROR) {
1738 		PRINTF(cb, "krping_format_send failed\n");
1739 		return;
1740 	}
1741 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1742 	if (ret) {
1743 		PRINTF(cb, "post send error %d\n", ret);
1744 		return;
1745 	}
1746 
1747 	/* Spin waiting for send completion */
1748 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
1749 	if (ret < 0) {
1750 		PRINTF(cb, "poll error %d\n", ret);
1751 		return;
1752 	}
1753 	if (wc.status) {
1754 		PRINTF(cb, "send completion error %d\n", wc.status);
1755 		return;
1756 	}
1757 
1758 	/* Spin waiting for server's Start STAG/TO/Len */
1759 	while (cb->state < RDMA_WRITE_ADV) {
1760 		krping_cq_event_handler(cb->cq, cb);
1761 	}
1762 
1763 #if 0
1764 {
1765 	int i;
1766 	struct timeval start, stop;
1767 	time_t sec;
1768 	suseconds_t usec;
1769 	unsigned long long elapsed;
1770 	struct ib_wc wc;
1771 	struct ib_send_wr *bad_wr;
1772 	int ne;
1773 
1774 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1775 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1776 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1777 	cb->rdma_sq_wr.sg_list->length = 0;
1778 	cb->rdma_sq_wr.num_sge = 0;
1779 
1780 	microtime(&start);
1781 	for (i=0; i < 100000; i++) {
1782 		if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1783 			PRINTF(cb, "Couldn't post send\n");
1784 			return;
1785 		}
1786 		do {
1787 			ne = ib_poll_cq(cb->cq, 1, &wc);
1788 		} while (ne == 0);
1789 		if (ne < 0) {
1790 			PRINTF(cb, "poll CQ failed %d\n", ne);
1791 			return;
1792 		}
1793 		if (wc.status != IB_WC_SUCCESS) {
1794 			PRINTF(cb, "Completion with error at %s:\n",
1795 				cb->server ? "server" : "client");
1796 			PRINTF(cb, "Failed status %d: wr_id %d\n",
1797 				wc.status, (int) wc.wr_id);
1798 			return;
1799 		}
1800 	}
1801 	microtime(&stop);
1802 
1803 	if (stop.tv_usec < start.tv_usec) {
1804 		stop.tv_usec += 1000000;
1805 		stop.tv_sec  -= 1;
1806 	}
1807 	sec     = stop.tv_sec - start.tv_sec;
1808 	usec    = stop.tv_usec - start.tv_usec;
1809 	elapsed = sec * 1000000 + usec;
1810 	PRINTF(cb, "0B-write-lat iters 100000 usec %llu\n", elapsed);
1811 }
1812 #endif
1813 
1814 	rlat_test(cb);
1815 }
1816 
1817 static void krping_wlat_test_client(struct krping_cb *cb)
1818 {
1819 	struct ib_send_wr *bad_wr;
1820 	struct ib_wc wc;
1821 	int ret;
1822 
1823 	cb->state = RDMA_READ_ADV;
1824 
1825 	/* Send STAG/TO/Len to client */
1826 	krping_format_send(cb, cb->start_dma_addr);
1827 	if (cb->state == ERROR) {
1828 		PRINTF(cb, "krping_format_send failed\n");
1829 		return;
1830 	}
1831 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1832 	if (ret) {
1833 		PRINTF(cb, "post send error %d\n", ret);
1834 		return;
1835 	}
1836 
1837 	/* Spin waiting for send completion */
1838 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
1839 	if (ret < 0) {
1840 		PRINTF(cb, "poll error %d\n", ret);
1841 		return;
1842 	}
1843 	if (wc.status) {
1844 		PRINTF(cb, "send completion error %d\n", wc.status);
1845 		return;
1846 	}
1847 
1848 	/* Spin waiting for server's Start STAG/TO/Len */
1849 	while (cb->state < RDMA_WRITE_ADV) {
1850 		krping_cq_event_handler(cb->cq, cb);
1851 	}
1852 
1853 	wlat_test(cb);
1854 }
1855 
1856 static void krping_bw_test_client(struct krping_cb *cb)
1857 {
1858 	struct ib_send_wr *bad_wr;
1859 	struct ib_wc wc;
1860 	int ret;
1861 
1862 	cb->state = RDMA_READ_ADV;
1863 
1864 	/* Send STAG/TO/Len to client */
1865 	krping_format_send(cb, cb->start_dma_addr);
1866 	if (cb->state == ERROR) {
1867 		PRINTF(cb, "krping_format_send failed\n");
1868 		return;
1869 	}
1870 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1871 	if (ret) {
1872 		PRINTF(cb, "post send error %d\n", ret);
1873 		return;
1874 	}
1875 
1876 	/* Spin waiting for send completion */
1877 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
1878 	if (ret < 0) {
1879 		PRINTF(cb, "poll error %d\n", ret);
1880 		return;
1881 	}
1882 	if (wc.status) {
1883 		PRINTF(cb, "send completion error %d\n", wc.status);
1884 		return;
1885 	}
1886 
1887 	/* Spin waiting for server's Start STAG/TO/Len */
1888 	while (cb->state < RDMA_WRITE_ADV) {
1889 		krping_cq_event_handler(cb->cq, cb);
1890 	}
1891 
1892 	bw_test(cb);
1893 }
1894 
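/*
 * Fast-register stress test: repeatedly post FAST_REG_MR + signaled
 * LOCAL_INV pairs with pseudo-random lengths, keeping txdepth/2 work
 * requests outstanding, and count completions until the thread is
 * signalled or an error occurs.
 */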
1895 static void krping_fr_test(struct krping_cb *cb)
1896 {
1897 	struct ib_fast_reg_page_list *pl;
1898 	struct ib_send_wr fr, inv, *bad;
1899 	struct ib_wc wc;
1900 	u8 key = 0;
1901 	struct ib_mr *mr;
1902 	int i;
1903 	int ret;
1904 	int size = cb->size;
1905 	int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
1906 	time_t start;
1907 	int count = 0;
1908 	int scnt = 0;
1909 
1910 	pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
1911 	if (IS_ERR(pl)) {
1912 		PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
1913 		return;
1914 	}
1915 
1916 	mr = ib_alloc_fast_reg_mr(cb->pd, plen);
1917 	if (IS_ERR(mr)) {
1918 		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(mr));
1919 		goto err1;
1920 	}
1921 
1922 	for (i=0; i<plen; i++)
1923 		pl->page_list[i] = 0xcafebabe | i;
1924 
1925 	memset(&fr, 0, sizeof fr);
1926 	fr.opcode = IB_WR_FAST_REG_MR;
1927 	fr.wr.fast_reg.page_shift = PAGE_SHIFT;
1928 	fr.wr.fast_reg.length = size;
1929 	fr.wr.fast_reg.page_list = pl;
1930 	fr.wr.fast_reg.page_list_len = plen;
1931 	fr.wr.fast_reg.iova_start = 0;
1932 	fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
1933 	fr.next = &inv;
1934 	memset(&inv, 0, sizeof inv);
1935 	inv.opcode = IB_WR_LOCAL_INV;
1936 	inv.send_flags = IB_SEND_SIGNALED;
1937 
1938 	DEBUG_LOG(cb, "fr_test: stag index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth);
1939 	start = time_uptime;
1940 	while (1) {
1941 		if ((time_uptime - start) >= 9) {
1942 			DEBUG_LOG(cb, "fr_test: pausing 1 second! count %u latest size %u plen %u\n", count, size, plen);
1943 			wait_event_interruptible(cb->sem, cb->state == ERROR);
1944 			if (cb->state == ERROR)
1945 				break;
1946 			start = time_uptime;
1947 		}
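		/*
		 * Keep up to half the send-queue depth of fast-reg/local-inv
		 * WR pairs outstanding; each pass bumps the rkey and picks a
		 * random length within cb->size.
		 */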
1948 		while (scnt < (cb->txdepth>>1)) {
1949 			ib_update_fast_reg_key(mr, ++key);
1950 			fr.wr.fast_reg.rkey = mr->rkey;
1951 			inv.ex.invalidate_rkey = mr->rkey;
1952 			size = arc4random() % cb->size;
1953 			if (size == 0)
1954 				size = cb->size;
1955 			plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
1956 			fr.wr.fast_reg.length = size;
1957 			fr.wr.fast_reg.page_list_len = plen;
1958 			ret = ib_post_send(cb->qp, &fr, &bad);
1959 			if (ret) {
1960 				PRINTF(cb, "ib_post_send failed %d\n", ret);
1961 				goto err2;
1962 			}
1963 			scnt++;
1964 		}
1965 
1966 		do {
1967 			ret = ib_poll_cq(cb->cq, 1, &wc);
1968 			if (ret < 0) {
1969 				PRINTF(cb, "ib_poll_cq failed %d\n", ret);
1970 				goto err2;
1971 			}
1972 			if (ret == 1) {
1973 				if (wc.status) {
1974 					PRINTF(cb, "completion error %u\n", wc.status);
1975 					goto err2;
1976 				}
1977 				count++;
1978 				scnt--;
1979 			}
1980 			else if (krping_sigpending()) {
1981 				PRINTF(cb, "signal!\n");
1982 				goto err2;
1983 			}
1984 		} while (ret == 1);
1985 	}
1986 err2:
1987 #if 0
1988 	DEBUG_LOG(cb, "sleeping 1 second\n");
1989 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
1990 #endif
1991 	DEBUG_LOG(cb, "draining the cq...\n");
1992 	do {
1993 		ret = ib_poll_cq(cb->cq, 1, &wc);
1994 		if (ret < 0) {
1995 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
1996 			break;
1997 		}
1998 		if (ret == 1) {
1999 			if (wc.status) {
2000 				PRINTF(cb, "completion error %u opcode %u\n", wc.status, wc.opcode);
2001 			}
2002 		}
2003 	} while (ret == 1);
2004 	DEBUG_LOG(cb, "fr_test: done!\n");
2005 	ib_dereg_mr(mr);
2006 err1:
2007 	ib_free_fast_reg_page_list(pl);
2008 }
2009 
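/*
 * Initiate the RDMA connection from the client side and block until the CM
 * event handler moves the state to CONNECTED (or ERROR).
 */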
2010 static int krping_connect_client(struct krping_cb *cb)
2011 {
2012 	struct rdma_conn_param conn_param;
2013 	int ret;
2014 
2015 	memset(&conn_param, 0, sizeof conn_param);
2016 	conn_param.responder_resources = 1;
2017 	conn_param.initiator_depth = 1;
2018 	conn_param.retry_count = 10;
2019 
2020 	ret = rdma_connect(cb->cm_id, &conn_param);
2021 	if (ret) {
2022 		PRINTF(cb, "rdma_connect error %d\n", ret);
2023 		return ret;
2024 	}
2025 
2026 	wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
2027 	if (cb->state == ERROR) {
2028 		PRINTF(cb, "wait for CONNECTED failed, state %d\n", cb->state);
2029 		return -1;
2030 	}
2031 
2032 	DEBUG_LOG(cb, "rdma_connect successful\n");
2033 	return 0;
2034 }
2035 
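/*
 * Resolve the server address and route for the client's cm_id, then verify
 * that the device supports fast registration if mem_mode fastreg was
 * requested.
 */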
2036 static int krping_bind_client(struct krping_cb *cb)
2037 {
2038 	struct sockaddr_in sin;
2039 	int ret;
2040 
2041 	memset(&sin, 0, sizeof(sin));
2042 	sin.sin_len = sizeof sin;
2043 	sin.sin_family = AF_INET;
2044 	sin.sin_addr.s_addr = cb->addr.s_addr;
2045 	sin.sin_port = cb->port;
2046 
2047 	ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin,
2048 				2000);
2049 	if (ret) {
2050 		PRINTF(cb, "rdma_resolve_addr error %d\n", ret);
2051 		return ret;
2052 	}
2053 
2054 	wait_event_interruptible(cb->sem, cb->state >= ROUTE_RESOLVED);
2055 	if (cb->state != ROUTE_RESOLVED) {
2056 		PRINTF(cb,
2057 		       "addr/route resolution did not resolve: state %d\n",
2058 		       cb->state);
2059 		return -EINTR;
2060 	}
2061 
2062 	if (cb->mem == FASTREG && !fastreg_supported(cb))
2063 		return -EINVAL;
2064 
2065 	DEBUG_LOG(cb, "rdma_resolve_addr - rdma_resolve_route successful\n");
2066 	return 0;
2067 }
2068 
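/*
 * Top-level client path: bind/resolve, create the QP and buffers, post the
 * initial receive, connect, run whichever test was selected on the command
 * line, then disconnect and tear everything down.
 */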
2069 static void krping_run_client(struct krping_cb *cb)
2070 {
2071 	struct ib_recv_wr *bad_wr;
2072 	int ret;
2073 
2074 	ret = krping_bind_client(cb);
2075 	if (ret)
2076 		return;
2077 
2078 	ret = krping_setup_qp(cb, cb->cm_id);
2079 	if (ret) {
2080 		PRINTF(cb, "setup_qp failed: %d\n", ret);
2081 		return;
2082 	}
2083 
2084 	ret = krping_setup_buffers(cb);
2085 	if (ret) {
2086 		PRINTF(cb, "krping_setup_buffers failed: %d\n", ret);
2087 		goto err1;
2088 	}
2089 
2090 	ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
2091 	if (ret) {
2092 		PRINTF(cb, "ib_post_recv failed: %d\n", ret);
2093 		goto err2;
2094 	}
2095 
2096 	ret = krping_connect_client(cb);
2097 	if (ret) {
2098 		PRINTF(cb, "connect error %d\n", ret);
2099 		goto err2;
2100 	}
2101 
2102 	if (cb->wlat)
2103 		krping_wlat_test_client(cb);
2104 	else if (cb->rlat)
2105 		krping_rlat_test_client(cb);
2106 	else if (cb->bw)
2107 		krping_bw_test_client(cb);
2108 	else if (cb->frtest)
2109 		krping_fr_test(cb);
2110 	else
2111 		krping_test_client(cb);
2112 	rdma_disconnect(cb->cm_id);
2113 err2:
2114 	krping_free_buffers(cb);
2115 err1:
2116 	krping_free_qp(cb);
2117 }
2118 
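/*
 * Parse the option string handed in by the control interface, validate the
 * option combination, create the CM id, and run either the server or the
 * client side.  As an illustration only (the exact delivery mechanism lives
 * in the surrounding module, not in this function), a client invocation
 * might look like:
 *
 *	krping_doit("client,addr=192.168.1.100,port=9999,count=100,verbose",
 *	    cookie);
 *
 * where the comma-separated keywords map to the option cases handled below.
 */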
2119 int krping_doit(char *cmd, void *cookie)
2120 {
2121 	struct krping_cb *cb;
2122 	int op;
2123 	int ret = 0;
2124 	char *optarg;
2125 	unsigned long optint;
2126 
2127 	cb = kzalloc(sizeof(*cb), GFP_KERNEL);
2128 	if (!cb)
2129 		return -ENOMEM;
2130 
2131 	mutex_lock(&krping_mutex);
2132 	list_add_tail(&cb->list, &krping_cbs);
2133 	mutex_unlock(&krping_mutex);
2134 
2135 	cb->cookie = cookie;
2136 	cb->server = -1;
2137 	cb->state = IDLE;
2138 	cb->size = 64;
2139 	cb->txdepth = RPING_SQ_DEPTH;
2140 	cb->mem = DMA;
2141 	init_waitqueue_head(&cb->sem);
2142 
2143 	while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg,
2144 			      &optint)) != 0) {
2145 		switch (op) {
2146 		case 'a':
2147 			cb->addr_str = optarg;
2148 			DEBUG_LOG(cb, "ipaddr (%s)\n", optarg);
2149 			if (!inet_aton(optarg, &cb->addr)) {
2150 				PRINTF(cb, "bad addr string %s\n",
2151 				    optarg);
2152 				ret = -EINVAL;
2153 			}
2154 			break;
2155 		case 'p':
2156 			cb->port = htons(optint);
2157 			DEBUG_LOG(cb, "port %d\n", (int)optint);
2158 			break;
2159 		case 'P':
2160 			cb->poll = 1;
2161 			DEBUG_LOG(cb, "server\n");
2162 			break;
2163 		case 's':
2164 			cb->server = 1;
2165 			DEBUG_LOG(cb, "server\n");
2166 			break;
2167 		case 'c':
2168 			cb->server = 0;
2169 			DEBUG_LOG(cb, "client\n");
2170 			break;
2171 		case 'S':
2172 			cb->size = optint;
2173 			if ((cb->size < 1) ||
2174 			    (cb->size > RPING_BUFSIZE)) {
2175 				PRINTF(cb, "Invalid size %d "
2176 				       "(valid range is 1 to %d)\n",
2177 				       cb->size, RPING_BUFSIZE);
2178 				ret = -EINVAL;
2179 			} else
2180 				DEBUG_LOG(cb, "size %d\n", (int)optint);
2181 			break;
2182 		case 'C':
2183 			cb->count = optint;
2184 			if (cb->count < 0) {
2185 				PRINTF(cb, "Invalid count %d\n",
2186 					cb->count);
2187 				ret = -EINVAL;
2188 			} else
2189 				DEBUG_LOG(cb, "count %d\n", (int) cb->count);
2190 			break;
2191 		case 'v':
2192 			cb->verbose++;
2193 			DEBUG_LOG(cb, "verbose\n");
2194 			break;
2195 		case 'V':
2196 			cb->validate++;
2197 			DEBUG_LOG(cb, "validate data\n");
2198 			break;
2199 		case 'l':
2200 			cb->wlat++;
2201 			break;
2202 		case 'L':
2203 			cb->rlat++;
2204 			break;
2205 		case 'B':
2206 			cb->bw++;
2207 			break;
2208 		case 'd':
2209 			cb->duplex++;
2210 			break;
2211 		case 'm':
2212 			if (!strncmp(optarg, "dma", 3))
2213 				cb->mem = DMA;
2214 			else if (!strncmp(optarg, "fastreg", 7))
2215 				cb->mem = FASTREG;
2216 			else if (!strncmp(optarg, "mw", 2))
2217 				cb->mem = MW;
2218 			else if (!strncmp(optarg, "mr", 2))
2219 				cb->mem = MR;
2220 			else {
2221 				PRINTF(cb, "unknown mem mode %s.  "
2222 					"Must be dma, fastreg, mw, or mr\n",
2223 					optarg);
2224 				ret = -EINVAL;
2225 				break;
2226 			}
2227 			break;
2228 		case 'I':
2229 			cb->server_invalidate = 1;
2230 			break;
2231 		case 'T':
2232 			cb->txdepth = optint;
2233 			DEBUG_LOG(cb, "txdepth %d\n", (int) cb->txdepth);
2234 			break;
2235 		case 'Z':
2236 			cb->local_dma_lkey = 1;
2237 			DEBUG_LOG(cb, "using local dma lkey\n");
2238 			break;
2239 		case 'R':
2240 			cb->read_inv = 1;
2241 			DEBUG_LOG(cb, "using read-with-inv\n");
2242 			break;
2243 		case 'f':
2244 			cb->frtest = 1;
2245 			DEBUG_LOG(cb, "fast-reg test!\n");
2246 			break;
2247 		default:
2248 			PRINTF(cb, "unknown opt %s\n", optarg);
2249 			ret = -EINVAL;
2250 			break;
2251 		}
2252 	}
2253 	if (ret)
2254 		goto out;
2255 
2256 	if (cb->server == -1) {
2257 		PRINTF(cb, "must be either client or server\n");
2258 		ret = -EINVAL;
2259 		goto out;
2260 	}
2261 
2262 	if (cb->server && cb->frtest) {
2263 		PRINTF(cb, "must be client to run frtest\n");
2264 		ret = -EINVAL;
2265 		goto out;
2266 	}
2267 
2268 	if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) {
2269 		PRINTF(cb, "Pick only one test: fr, bw, rlat, wlat\n");
2270 		ret = -EINVAL;
2271 		goto out;
2272 	}
2273 
2274 	if (cb->server_invalidate && cb->mem != FASTREG) {
2275 		PRINTF(cb, "server_invalidate only valid with fastreg mem_mode\n");
2276 		ret = -EINVAL;
2277 		goto out;
2278 	}
2279 
2280 	if (cb->read_inv && cb->mem != FASTREG) {
2281 		PRINTF(cb, "read_inv only valid with fastreg mem_mode\n");
2282 		ret = -EINVAL;
2283 		goto out;
2284 	}
2285 
2286 	if (cb->mem != MR && (cb->wlat || cb->rlat || cb->bw)) {
2287 		PRINTF(cb, "wlat, rlat, and bw tests only support mem_mode MR\n");
2288 		ret = -EINVAL;
2289 		goto out;
2290 	}
2291 
2292 	cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP);
2293 	if (IS_ERR(cb->cm_id)) {
2294 		ret = PTR_ERR(cb->cm_id);
2295 		PRINTF(cb, "rdma_create_id error %d\n", ret);
2296 		goto out;
2297 	}
2298 	DEBUG_LOG(cb, "created cm_id %p\n", cb->cm_id);
2299 
2300 	if (cb->server)
2301 		krping_run_server(cb);
2302 	else
2303 		krping_run_client(cb);
2304 
2305 	DEBUG_LOG(cb, "destroy cm_id %p\n", cb->cm_id);
2306 	rdma_destroy_id(cb->cm_id);
2307 out:
2308 	mutex_lock(&krping_mutex);
2309 	list_del(&cb->list);
2310 	mutex_unlock(&krping_mutex);
2311 	kfree(cb);
2312 	return ret;
2313 }
2314 
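/*
 * Invoke "f" on the statistics of every active krping instance (NULL stats
 * for instances that have not yet set up a protection domain), holding
 * krping_mutex across the walk.
 */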
2315 void
2316 krping_walk_cb_list(void (*f)(struct krping_stats *, void *), void *arg)
2317 {
2318 	struct krping_cb *cb;
2319 
2320 	mutex_lock(&krping_mutex);
2321 	list_for_each_entry(cb, &krping_cbs, list)
2322 	    (*f)(cb->pd ? &cb->stats : NULL, arg);
2323 	mutex_unlock(&krping_mutex);
2324 }
2325 
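/*
 * One-time module initialization: set up the mutex that protects the global
 * list of krping instances.
 */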
2326 void krping_init(void)
2327 {
2328 
2329 	mutex_init(&krping_mutex);
2330 }
2331