xref: /illumos-gate/usr/src/uts/common/fs/sockfs/sodirect.c (revision 35a5a3587fd94b666239c157d3722745250ccbd7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/cmn_err.h>
30 #include <sys/uio.h>
31 #include <sys/stropts.h>
32 #include <sys/strsun.h>
33 #include <sys/systm.h>
34 #include <sys/socketvar.h>
35 #include <fs/sockfs/sodirect.h>
36 
37 /*
38  * In support of on-board asynchronous DMA hardware (e.g. Intel I/OAT)
39  * we use a consolidation private KAPI to allow the protocol to start
40  * an asynchronous copyout to a user-land receive-side buffer (uioa)
41  * when a blocking socket read (e.g. read, recv, ...) is pending.
42  *
43  * In some broad strokes, this is what happens. When recv is called,
44  * we first determine whether it would be beneficial to use uioa, and
45  * if so set up the required state (all done by sod_rcv_init()).
46  * The protocol can only initiate asynchronous copyout if the receive
47  * queue is empty, so the first thing we do is drain any previously
48  * queued data (using sod_uioa_so_init()). Once the copyouts (if any)
49  * have been scheduled we wait for the receive to be satisfied. During
50  * that time any new mblks that are enqueued will be scheduled to be
51  * copied out asynchronously (sod_uioa_mblk_init()). When the receive
52  * has been satisfied we wait for all scheduled copyout operations to
53  * complete before we return to the user (sod_rcv_done())
54  */
55 
/*
 * kmem cache from which every sodirect_t is allocated: it is created by
 * sod_init(), allocated from in sod_sock_init() and returned to in
 * sod_sock_fini().
 */
56 static struct kmem_cache *sock_sod_cache;
57 
58 /*
59  * This function is called at the beginning of recvmsg().
60  *
61  * If I/OAT is enabled on this sonode, initialize the uioa state machine
62  * with state UIOA_ALLOC.
63  */
64 uio_t *
65 sod_rcv_init(struct sonode *so, int flags, struct uio **uiopp)
66 {
67 	struct uio *suiop;
68 	struct uio *uiop;
69 	sodirect_t *sodp = so->so_direct;
70 
71 	if (sodp == NULL)
72 		return (NULL);
73 
74 	suiop = NULL;
75 	uiop = *uiopp;
76 
77 	mutex_enter(&so->so_lock);
78 	if (uiop->uio_resid >= uioasync.mincnt &&
79 	    sodp != NULL && sodp->sod_enabled &&
80 	    uioasync.enabled && !(flags & MSG_PEEK) &&
81 	    !so->so_proto_props.sopp_loopback &&
82 	    !(so->so_state & SS_CANTRCVMORE)) {
83 		/*
84 		 * Big enough I/O for uioa min setup and an sodirect socket
85 		 * and sodirect enabled and uioa enabled and I/O will be done
86 		 * and not EOF so initialize the sodirect_t uioa_t with "uiop".
87 		 */
88 		if (!uioainit(uiop, &sodp->sod_uioa)) {
89 			/*
90 			 * Successful uioainit() so the uio_t part of the
91 			 * uioa_t will be used for all uio_t work to follow,
92 			 * we return the original "uiop" in "suiop".
93 			 */
94 			suiop = uiop;
95 			*uiopp = (uio_t *)&sodp->sod_uioa;
96 			/*
97 			 * Before returning to the caller the passed in uio_t
98 			 * "uiop" will be updated via a call to uioafini()
99 			 * below.
100 			 *
101 			 * Note, the uioa.uioa_state isn't set to UIOA_ENABLED
102 			 * here as first we have to uioamove() any currently
103 			 * queued M_DATA mblk_t(s) so it will be done later.
104 			 */
105 		}
106 	}
107 	mutex_exit(&so->so_lock);
108 
109 	return (suiop);
110 }
111 
112 /*
113  * This function is called at the end of recvmsg(), it finializes all the I/OAT
114  * operations, and reset the uioa state to UIOA_ALLOC.
115  */
116 int
117 sod_rcv_done(struct sonode *so, struct uio *suiop, struct uio *uiop)
118 {
119 	int error = 0;
120 	sodirect_t *sodp = so->so_direct;
121 	mblk_t *mp;
122 
123 	if (sodp == NULL) {
124 		return (0);
125 	}
126 
127 	ASSERT(MUTEX_HELD(&so->so_lock));
128 	/* Finish any sodirect and uioa processing */
129 	if (suiop != NULL) {
130 		/* Finish any uioa_t processing */
131 
132 		ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
133 		error = uioafini(suiop, (uioa_t *)uiop);
134 		if ((mp = sodp->sod_uioafh) != NULL) {
135 			sodp->sod_uioafh = NULL;
136 			sodp->sod_uioaft = NULL;
137 			freemsg(mp);
138 		}
139 	}
140 	ASSERT(sodp->sod_uioafh == NULL);
141 
142 	return (error);
143 }
144 
145 /*
146  * Schedule a uioamove() on a mblk. This is done as mblks are enqueued
147  * by the protocol on the socket's rcv queue.
148  *
149  * Caller must be holding so_lock.
150  */
151 void
152 sod_uioa_mblk_init(struct sodirect_s *sodp, mblk_t *mp, size_t msg_size)
153 {
154 	uioa_t *uioap = &sodp->sod_uioa;
155 	mblk_t *mp1 = mp;
156 	mblk_t *lmp = NULL;
157 
158 	ASSERT(DB_TYPE(mp) == M_DATA);
159 	ASSERT(msg_size == msgdsize(mp));
160 
161 	if (uioap->uioa_state & UIOA_ENABLED) {
162 		/* Uioa is enabled */
163 
164 		if (msg_size > uioap->uio_resid) {
165 			/*
166 			 * There isn't enough uio space for the mblk_t chain
167 			 * so disable uioa such that this and any additional
168 			 * mblk_t data is handled by the socket and schedule
169 			 * the socket for wakeup to finish this uioa.
170 			 */
171 			uioap->uioa_state &= UIOA_CLR;
172 			uioap->uioa_state |= UIOA_FINI;
173 			return;
174 		}
175 		do {
176 			uint32_t	len = MBLKL(mp1);
177 
178 			if (!uioamove(mp1->b_rptr, len, UIO_READ, uioap)) {
179 				/* Scheduled, mark dblk_t as such */
180 				DB_FLAGS(mp1) |= DBLK_UIOA;
181 			} else {
182 				/* Error, turn off async processing */
183 				uioap->uioa_state &= UIOA_CLR;
184 				uioap->uioa_state |= UIOA_FINI;
185 				break;
186 			}
187 			lmp = mp1;
188 		} while ((mp1 = mp1->b_cont) != NULL);
189 
190 		if (mp1 != NULL || uioap->uio_resid == 0) {
191 			/* Break the mblk chain if neccessary. */
192 			if (mp1 != NULL && lmp != NULL) {
193 				mp->b_next = mp1;
194 				lmp->b_cont = NULL;
195 			}
196 		}
197 	}
198 }
199 
200 /*
201  * This function is called on a mblk that thas been successfully uioamoved().
202  */
203 void
204 sod_uioa_mblk_done(sodirect_t *sodp, mblk_t *bp)
205 {
206 	if (bp != NULL && (bp->b_datap->db_flags & DBLK_UIOA)) {
207 		/*
208 		 * A uioa flaged mblk_t chain, already uio processed,
209 		 * add it to the sodirect uioa pending free list.
210 		 *
211 		 * Note, a b_cont chain headed by a DBLK_UIOA enable
212 		 * mblk_t must have all mblk_t(s) DBLK_UIOA enabled.
213 		 */
214 		mblk_t	*bpt = sodp->sod_uioaft;
215 
216 		ASSERT(sodp != NULL);
217 
218 		/*
219 		 * Add first mblk_t of "bp" chain to current sodirect uioa
220 		 * free list tail mblk_t, if any, else empty list so new head.
221 		 */
222 		if (bpt == NULL)
223 			sodp->sod_uioafh = bp;
224 		else
225 			bpt->b_cont = bp;
226 
227 		/*
228 		 * Walk mblk_t "bp" chain to find tail and adjust rptr of
229 		 * each to reflect that uioamove() has consumed all data.
230 		 */
231 		bpt = bp;
232 		for (;;) {
233 			ASSERT(bpt->b_datap->db_flags & DBLK_UIOA);
234 
235 			bpt->b_rptr = bpt->b_wptr;
236 			if (bpt->b_cont == NULL)
237 				break;
238 			bpt = bpt->b_cont;
239 		}
240 		/* New sodirect uioa free list tail */
241 		sodp->sod_uioaft = bpt;
242 
243 		/* Only dequeue once with data returned per uioa_t */
244 		if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) {
245 			sodp->sod_uioa.uioa_state &= UIOA_CLR;
246 			sodp->sod_uioa.uioa_state |= UIOA_FINI;
247 		}
248 	}
249 }
250 
251 /*
252  * When transit from UIOA_INIT state to UIOA_ENABLE state in recvmsg(), call
253  * this function on a non-STREAMS socket to schedule uioamove() on the data
254  * that has already queued in this socket.
255  */
256 void
257 sod_uioa_so_init(struct sonode *so, struct sodirect_s *sodp, struct uio *uiop)
258 {
259 	uioa_t	*uioap = (uioa_t *)uiop;
260 	mblk_t	*lbp;
261 	mblk_t	*wbp;
262 	mblk_t	*bp;
263 	int	len;
264 	int	error;
265 	boolean_t in_rcv_q = B_TRUE;
266 
267 	ASSERT(MUTEX_HELD(&so->so_lock));
268 	ASSERT(&sodp->sod_uioa == uioap);
269 
270 	/*
271 	 * Walk first b_cont chain in sod_q
272 	 * and schedule any M_DATA mblk_t's for uio asynchronous move.
273 	 */
274 	bp = so->so_rcv_q_head;
275 
276 again:
277 	/* Walk the chain */
278 	lbp = NULL;
279 	wbp = bp;
280 
281 	do {
282 		if (bp == NULL)
283 			break;
284 
285 		if (wbp->b_datap->db_type != M_DATA) {
286 			/* Not M_DATA, no more uioa */
287 			goto nouioa;
288 		}
289 		if ((len = wbp->b_wptr - wbp->b_rptr) > 0) {
290 			/* Have a M_DATA mblk_t with data */
291 			if (len > uioap->uio_resid || (so->so_oobmark > 0 &&
292 			    len + uioap->uioa_mbytes >= so->so_oobmark)) {
293 				/* Not enough uio sapce, or beyond oobmark */
294 				goto nouioa;
295 			}
296 			ASSERT(!(wbp->b_datap->db_flags & DBLK_UIOA));
297 			error = uioamove(wbp->b_rptr, len,
298 			    UIO_READ, uioap);
299 			if (!error) {
300 				/* Scheduled, mark dblk_t as such */
301 				wbp->b_datap->db_flags |= DBLK_UIOA;
302 			} else {
303 				/* Break the mblk chain */
304 				goto nouioa;
305 			}
306 		}
307 		/* Save last wbp processed */
308 		lbp = wbp;
309 	} while ((wbp = wbp->b_cont) != NULL);
310 
311 	if (in_rcv_q && (bp == NULL || bp->b_next == NULL)) {
312 		/*
313 		 * We get here only once to process the sonode dump area
314 		 * if so_rcv_q_head is NULL or all the mblks have been
315 		 * successfully uioamoved()ed.
316 		 */
317 		in_rcv_q = B_FALSE;
318 
319 		/* move to dump area */
320 		bp = so->so_rcv_head;
321 		goto again;
322 	}
323 
324 	return;
325 
326 nouioa:
327 	/* No more uioa */
328 	uioap->uioa_state &= UIOA_CLR;
329 	uioap->uioa_state |= UIOA_FINI;
330 
331 	/*
332 	 * If we processed 1 or more mblk_t(s) then we need to split the
333 	 * current mblk_t chain in 2 so that all the uioamove()ed mblk_t(s)
334 	 * are in the current chain and the rest are in the following new
335 	 * chain.
336 	 */
337 	if (lbp != NULL) {
338 		/* New end of current chain */
339 		lbp->b_cont = NULL;
340 
341 		/* Insert new chain wbp after bp */
342 		if ((wbp->b_next = bp->b_next) == NULL) {
343 			if (in_rcv_q)
344 				so->so_rcv_q_last_head = wbp;
345 			else
346 				so->so_rcv_last_head = wbp;
347 		}
348 		bp->b_next = wbp;
349 		bp->b_next->b_prev = bp->b_prev;
350 		bp->b_prev = lbp;
351 	}
352 }
353 
354 /*
355  * Initialize sodirect data structures on a socket.
356  */
357 void
358 sod_sock_init(struct sonode *so)
359 {
360 	sodirect_t	*sodp;
361 
362 	ASSERT(so->so_direct == NULL);
363 
364 	so->so_state |= SS_SODIRECT;
365 
366 	sodp = kmem_cache_alloc(sock_sod_cache, KM_SLEEP);
367 	sodp->sod_enabled = B_TRUE;
368 	sodp->sod_uioafh = NULL;
369 	sodp->sod_uioaft = NULL;
370 	/*
371 	 * Remainder of the sod_uioa members are left uninitialized
372 	 * but will be initialized later by uioainit() before uioa
373 	 * is enabled.
374 	 */
375 	sodp->sod_uioa.uioa_state = UIOA_ALLOC;
376 	so->so_direct = sodp;
377 }
378 
379 void
380 sod_sock_fini(struct sonode *so)
381 {
382 	sodirect_t *sodp = so->so_direct;
383 
384 	ASSERT(sodp->sod_uioafh == NULL);
385 
386 	so->so_direct = NULL;
387 	kmem_cache_free(sock_sod_cache, sodp);
388 }
389 
390 /*
391  * Init the sodirect kmem cache while sockfs is loading.
392  */
393 int
394 sod_init()
395 {
396 	/* Allocate sodirect_t kmem_cache */
397 	sock_sod_cache = kmem_cache_create("sock_sod_cache",
398 	    sizeof (sodirect_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
399 
400 	return (0);
401 }
402 
403 ssize_t
404 sod_uioa_mblk(struct sonode *so, mblk_t *mp)
405 {
406 	sodirect_t *sodp = so->so_direct;
407 
408 	ASSERT(sodp != NULL);
409 	ASSERT(MUTEX_HELD(&so->so_lock));
410 
411 	ASSERT(sodp->sod_enabled);
412 	ASSERT(sodp->sod_uioa.uioa_state != (UIOA_ALLOC|UIOA_INIT));
413 
414 	ASSERT(sodp->sod_uioa.uioa_state & (UIOA_ENABLED|UIOA_FINI));
415 
416 	if (mp == NULL && so->so_rcv_q_head != NULL) {
417 		mp = so->so_rcv_q_head;
418 		ASSERT(mp->b_prev != NULL);
419 		mp->b_prev = NULL;
420 		so->so_rcv_q_head = mp->b_next;
421 		if (so->so_rcv_q_head == NULL) {
422 			so->so_rcv_q_last_head = NULL;
423 		}
424 		mp->b_next = NULL;
425 	}
426 
427 	sod_uioa_mblk_done(sodp, mp);
428 
429 	if (so->so_rcv_q_head == NULL && so->so_rcv_head != NULL &&
430 	    DB_TYPE(so->so_rcv_head) == M_DATA &&
431 	    (DB_FLAGS(so->so_rcv_head) & DBLK_UIOA)) {
432 		/* more arrived */
433 		ASSERT(so->so_rcv_q_head == NULL);
434 		mp = so->so_rcv_head;
435 		so->so_rcv_head = mp->b_next;
436 		if (so->so_rcv_head == NULL)
437 			so->so_rcv_last_head = NULL;
438 		mp->b_prev = mp->b_next = NULL;
439 		sod_uioa_mblk_done(sodp, mp);
440 	}
441 
442 #ifdef DEBUG
443 	if (so->so_rcv_q_head != NULL) {
444 		mblk_t *m = so->so_rcv_q_head;
445 		while (m != NULL) {
446 			if (DB_FLAGS(m) & DBLK_UIOA) {
447 				cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
448 				    " in so_rcv_q_head.\n", (void *)m);
449 			}
450 			m = m->b_next;
451 		}
452 	}
453 	if (so->so_rcv_head != NULL) {
454 		mblk_t *m = so->so_rcv_head;
455 		while (m != NULL) {
456 			if (DB_FLAGS(m) & DBLK_UIOA) {
457 				cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
458 				    " in so_rcv_head.\n", (void *)m);
459 			}
460 			m = m->b_next;
461 		}
462 	}
463 #endif
464 	return (sodp->sod_uioa.uioa_mbytes);
465 }
466