1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/cmn_err.h>
29 #include <sys/uio.h>
30 #include <sys/stropts.h>
31 #include <sys/strsun.h>
32 #include <sys/systm.h>
33 #include <sys/socketvar.h>
34 #include <fs/sockfs/sodirect.h>
35
36 /*
37 * In support of on-board asynchronous DMA hardware (e.g. Intel I/OAT)
38 * we use a consolidation private KAPI to allow the protocol to start
39 * an asynchronous copyout to a user-land receive-side buffer (uioa)
40 * when a blocking socket read (e.g. read, recv, ...) is pending.
41 *
42 * In some broad strokes, this is what happens. When recv is called,
43 * we first determine whether it would be beneficial to use uioa, and
44 * if so set up the required state (all done by sod_rcv_init()).
45 * The protocol can only initiate asynchronous copyout if the receive
46 * queue is empty, so the first thing we do is drain any previously
47 * queued data (using sod_uioa_so_init()). Once the copyouts (if any)
48 * have been scheduled we wait for the receive to be satisfied. During
49 * that time any new mblks that are enqueued will be scheduled to be
50 * copied out asynchronously (sod_uioa_mblk_init()). When the receive
51 * has been satisfied we wait for all scheduled copyout operations to
52 * complete before we return to the user (sod_rcv_done()).
53 */
54
/*
 * kmem cache backing every socket's sodirect_t: created by sod_init(),
 * allocated from in sod_sock_init() and freed to in sod_sock_fini().
 */
55 static struct kmem_cache *sock_sod_cache;
56
57 /*
58 * This function is called at the beginning of recvmsg().
59 *
60 * If I/OAT is enabled on this sonode, initialize the uioa state machine
61 * with state UIOA_ALLOC.
62 */
63 uio_t *
sod_rcv_init(struct sonode * so,int flags,struct uio ** uiopp)64 sod_rcv_init(struct sonode *so, int flags, struct uio **uiopp)
65 {
66 struct uio *suiop;
67 struct uio *uiop;
68 sodirect_t *sodp = so->so_direct;
69
70 if (sodp == NULL)
71 return (NULL);
72
73 suiop = NULL;
74 uiop = *uiopp;
75
76 mutex_enter(&so->so_lock);
77 if (uiop->uio_resid >= uioasync.mincnt &&
78 sodp != NULL && sodp->sod_enabled &&
79 uioasync.enabled && !(flags & MSG_PEEK) &&
80 !so->so_proto_props.sopp_loopback && so->so_filter_active == 0 &&
81 !(so->so_state & SS_CANTRCVMORE)) {
82 /*
83 * Big enough I/O for uioa min setup and an sodirect socket
84 * and sodirect enabled and uioa enabled and I/O will be done
85 * and not EOF so initialize the sodirect_t uioa_t with "uiop".
86 */
87 if (!uioainit(uiop, &sodp->sod_uioa)) {
88 /*
89 * Successful uioainit() so the uio_t part of the
90 * uioa_t will be used for all uio_t work to follow,
91 * we return the original "uiop" in "suiop".
92 */
93 suiop = uiop;
94 *uiopp = (uio_t *)&sodp->sod_uioa;
95 /*
96 * Before returning to the caller the passed in uio_t
97 * "uiop" will be updated via a call to uioafini()
98 * below.
99 *
100 * Note, the uioa.uioa_state isn't set to UIOA_ENABLED
101 * here as first we have to uioamove() any currently
102 * queued M_DATA mblk_t(s) so it will be done later.
103 */
104 }
105 }
106 mutex_exit(&so->so_lock);
107
108 return (suiop);
109 }
110
111 /*
112 * This function is called at the end of recvmsg(), it finializes all the I/OAT
113 * operations, and reset the uioa state to UIOA_ALLOC.
114 */
115 int
sod_rcv_done(struct sonode * so,struct uio * suiop,struct uio * uiop)116 sod_rcv_done(struct sonode *so, struct uio *suiop, struct uio *uiop)
117 {
118 int error = 0;
119 sodirect_t *sodp = so->so_direct;
120 mblk_t *mp;
121
122 if (sodp == NULL) {
123 return (0);
124 }
125
126 ASSERT(MUTEX_HELD(&so->so_lock));
127 /* Finish any sodirect and uioa processing */
128 if (suiop != NULL) {
129 /* Finish any uioa_t processing */
130
131 ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
132 error = uioafini(suiop, (uioa_t *)uiop);
133 if ((mp = sodp->sod_uioafh) != NULL) {
134 sodp->sod_uioafh = NULL;
135 sodp->sod_uioaft = NULL;
136 freemsg(mp);
137 }
138 }
139 ASSERT(sodp->sod_uioafh == NULL);
140
141 return (error);
142 }
143
144 /*
145 * Schedule a uioamove() on a mblk. This is done as mblks are enqueued
146 * by the protocol on the socket's rcv queue.
147 *
148 * Caller must be holding so_lock.
149 */
150 void
sod_uioa_mblk_init(struct sodirect_s * sodp,mblk_t * mp,size_t msg_size)151 sod_uioa_mblk_init(struct sodirect_s *sodp, mblk_t *mp, size_t msg_size)
152 {
153 uioa_t *uioap = &sodp->sod_uioa;
154 mblk_t *mp1 = mp;
155 mblk_t *lmp = NULL;
156
157 ASSERT(DB_TYPE(mp) == M_DATA);
158 ASSERT(msg_size == msgdsize(mp));
159
160 if (uioap->uioa_state & UIOA_ENABLED) {
161 /* Uioa is enabled */
162
163 if (msg_size > uioap->uio_resid) {
164 /*
165 * There isn't enough uio space for the mblk_t chain
166 * so disable uioa such that this and any additional
167 * mblk_t data is handled by the socket and schedule
168 * the socket for wakeup to finish this uioa.
169 */
170 uioap->uioa_state &= UIOA_CLR;
171 uioap->uioa_state |= UIOA_FINI;
172 return;
173 }
174 do {
175 uint32_t len = MBLKL(mp1);
176
177 if (!uioamove(mp1->b_rptr, len, UIO_READ, uioap)) {
178 /* Scheduled, mark dblk_t as such */
179 DB_FLAGS(mp1) |= DBLK_UIOA;
180 } else {
181 /* Error, turn off async processing */
182 uioap->uioa_state &= UIOA_CLR;
183 uioap->uioa_state |= UIOA_FINI;
184 break;
185 }
186 lmp = mp1;
187 } while ((mp1 = mp1->b_cont) != NULL);
188
189 if (mp1 != NULL || uioap->uio_resid == 0) {
190 /* Break the mblk chain if neccessary. */
191 if (mp1 != NULL && lmp != NULL) {
192 mp->b_next = mp1;
193 lmp->b_cont = NULL;
194 }
195 }
196 }
197 }
198
199 /*
200 * This function is called on a mblk that thas been successfully uioamoved().
201 */
202 void
sod_uioa_mblk_done(sodirect_t * sodp,mblk_t * bp)203 sod_uioa_mblk_done(sodirect_t *sodp, mblk_t *bp)
204 {
205 if (bp != NULL && (bp->b_datap->db_flags & DBLK_UIOA)) {
206 /*
207 * A uioa flaged mblk_t chain, already uio processed,
208 * add it to the sodirect uioa pending free list.
209 *
210 * Note, a b_cont chain headed by a DBLK_UIOA enable
211 * mblk_t must have all mblk_t(s) DBLK_UIOA enabled.
212 */
213 mblk_t *bpt = sodp->sod_uioaft;
214
215 ASSERT(sodp != NULL);
216
217 /*
218 * Add first mblk_t of "bp" chain to current sodirect uioa
219 * free list tail mblk_t, if any, else empty list so new head.
220 */
221 if (bpt == NULL)
222 sodp->sod_uioafh = bp;
223 else
224 bpt->b_cont = bp;
225
226 /*
227 * Walk mblk_t "bp" chain to find tail and adjust rptr of
228 * each to reflect that uioamove() has consumed all data.
229 */
230 bpt = bp;
231 for (;;) {
232 ASSERT(bpt->b_datap->db_flags & DBLK_UIOA);
233
234 bpt->b_rptr = bpt->b_wptr;
235 if (bpt->b_cont == NULL)
236 break;
237 bpt = bpt->b_cont;
238 }
239 /* New sodirect uioa free list tail */
240 sodp->sod_uioaft = bpt;
241
242 /* Only dequeue once with data returned per uioa_t */
243 if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) {
244 sodp->sod_uioa.uioa_state &= UIOA_CLR;
245 sodp->sod_uioa.uioa_state |= UIOA_FINI;
246 }
247 }
248 }
249
250 /*
251 * When transit from UIOA_INIT state to UIOA_ENABLE state in recvmsg(), call
252 * this function on a non-STREAMS socket to schedule uioamove() on the data
253 * that has already queued in this socket.
254 */
255 void
sod_uioa_so_init(struct sonode * so,struct sodirect_s * sodp,struct uio * uiop)256 sod_uioa_so_init(struct sonode *so, struct sodirect_s *sodp, struct uio *uiop)
257 {
258 uioa_t *uioap = (uioa_t *)uiop;
259 mblk_t *lbp;
260 mblk_t *wbp;
261 mblk_t *bp;
262 int len;
263 int error;
264 boolean_t in_rcv_q = B_TRUE;
265
266 ASSERT(MUTEX_HELD(&so->so_lock));
267 ASSERT(&sodp->sod_uioa == uioap);
268
269 /*
270 * Walk first b_cont chain in sod_q
271 * and schedule any M_DATA mblk_t's for uio asynchronous move.
272 */
273 bp = so->so_rcv_q_head;
274
275 again:
276 /* Walk the chain */
277 lbp = NULL;
278 wbp = bp;
279
280 do {
281 if (bp == NULL)
282 break;
283
284 if (wbp->b_datap->db_type != M_DATA) {
285 /* Not M_DATA, no more uioa */
286 goto nouioa;
287 }
288 if ((len = wbp->b_wptr - wbp->b_rptr) > 0) {
289 /* Have a M_DATA mblk_t with data */
290 if (len > uioap->uio_resid || (so->so_oobmark > 0 &&
291 len + uioap->uioa_mbytes >= so->so_oobmark)) {
292 /* Not enough uio sapce, or beyond oobmark */
293 goto nouioa;
294 }
295 ASSERT(!(wbp->b_datap->db_flags & DBLK_UIOA));
296 error = uioamove(wbp->b_rptr, len,
297 UIO_READ, uioap);
298 if (!error) {
299 /* Scheduled, mark dblk_t as such */
300 wbp->b_datap->db_flags |= DBLK_UIOA;
301 } else {
302 /* Break the mblk chain */
303 goto nouioa;
304 }
305 }
306 /* Save last wbp processed */
307 lbp = wbp;
308 } while ((wbp = wbp->b_cont) != NULL);
309
310 if (in_rcv_q && (bp == NULL || bp->b_next == NULL)) {
311 /*
312 * We get here only once to process the sonode dump area
313 * if so_rcv_q_head is NULL or all the mblks have been
314 * successfully uioamoved()ed.
315 */
316 in_rcv_q = B_FALSE;
317
318 /* move to dump area */
319 bp = so->so_rcv_head;
320 goto again;
321 }
322
323 return;
324
325 nouioa:
326 /* No more uioa */
327 uioap->uioa_state &= UIOA_CLR;
328 uioap->uioa_state |= UIOA_FINI;
329
330 /*
331 * If we processed 1 or more mblk_t(s) then we need to split the
332 * current mblk_t chain in 2 so that all the uioamove()ed mblk_t(s)
333 * are in the current chain and the rest are in the following new
334 * chain.
335 */
336 if (lbp != NULL) {
337 /* New end of current chain */
338 lbp->b_cont = NULL;
339
340 /* Insert new chain wbp after bp */
341 if ((wbp->b_next = bp->b_next) == NULL) {
342 if (in_rcv_q)
343 so->so_rcv_q_last_head = wbp;
344 else
345 so->so_rcv_last_head = wbp;
346 }
347 bp->b_next = wbp;
348 bp->b_next->b_prev = bp->b_prev;
349 bp->b_prev = lbp;
350 }
351 }
352
353 /*
354 * Initialize sodirect data structures on a socket.
355 */
356 void
sod_sock_init(struct sonode * so)357 sod_sock_init(struct sonode *so)
358 {
359 sodirect_t *sodp;
360
361 ASSERT(so->so_direct == NULL);
362
363 so->so_state |= SS_SODIRECT;
364
365 sodp = kmem_cache_alloc(sock_sod_cache, KM_SLEEP);
366 sodp->sod_enabled = B_TRUE;
367 sodp->sod_uioafh = NULL;
368 sodp->sod_uioaft = NULL;
369 /*
370 * Remainder of the sod_uioa members are left uninitialized
371 * but will be initialized later by uioainit() before uioa
372 * is enabled.
373 */
374 sodp->sod_uioa.uioa_state = UIOA_ALLOC;
375 so->so_direct = sodp;
376 }
377
378 void
sod_sock_fini(struct sonode * so)379 sod_sock_fini(struct sonode *so)
380 {
381 sodirect_t *sodp = so->so_direct;
382
383 ASSERT(sodp->sod_uioafh == NULL);
384
385 so->so_direct = NULL;
386 kmem_cache_free(sock_sod_cache, sodp);
387 }
388
389 /*
390 * Init the sodirect kmem cache while sockfs is loading.
391 */
392 int
sod_init()393 sod_init()
394 {
395 /* Allocate sodirect_t kmem_cache */
396 sock_sod_cache = kmem_cache_create("sock_sod_cache",
397 sizeof (sodirect_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
398
399 return (0);
400 }
401
402 ssize_t
sod_uioa_mblk(struct sonode * so,mblk_t * mp)403 sod_uioa_mblk(struct sonode *so, mblk_t *mp)
404 {
405 sodirect_t *sodp = so->so_direct;
406
407 ASSERT(sodp != NULL);
408 ASSERT(MUTEX_HELD(&so->so_lock));
409
410 ASSERT(sodp->sod_enabled);
411 ASSERT(sodp->sod_uioa.uioa_state != (UIOA_ALLOC|UIOA_INIT));
412
413 ASSERT(sodp->sod_uioa.uioa_state & (UIOA_ENABLED|UIOA_FINI));
414
415 if (mp == NULL && so->so_rcv_q_head != NULL) {
416 mp = so->so_rcv_q_head;
417 ASSERT(mp->b_prev != NULL);
418 mp->b_prev = NULL;
419 so->so_rcv_q_head = mp->b_next;
420 if (so->so_rcv_q_head == NULL) {
421 so->so_rcv_q_last_head = NULL;
422 }
423 mp->b_next = NULL;
424 }
425
426 sod_uioa_mblk_done(sodp, mp);
427
428 if (so->so_rcv_q_head == NULL && so->so_rcv_head != NULL &&
429 DB_TYPE(so->so_rcv_head) == M_DATA &&
430 (DB_FLAGS(so->so_rcv_head) & DBLK_UIOA)) {
431 /* more arrived */
432 ASSERT(so->so_rcv_q_head == NULL);
433 mp = so->so_rcv_head;
434 so->so_rcv_head = mp->b_next;
435 if (so->so_rcv_head == NULL)
436 so->so_rcv_last_head = NULL;
437 mp->b_prev = mp->b_next = NULL;
438 sod_uioa_mblk_done(sodp, mp);
439 }
440
441 #ifdef DEBUG
442 if (so->so_rcv_q_head != NULL) {
443 mblk_t *m = so->so_rcv_q_head;
444 while (m != NULL) {
445 if (DB_FLAGS(m) & DBLK_UIOA) {
446 cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
447 " in so_rcv_q_head.\n", (void *)m);
448 }
449 m = m->b_next;
450 }
451 }
452 if (so->so_rcv_head != NULL) {
453 mblk_t *m = so->so_rcv_head;
454 while (m != NULL) {
455 if (DB_FLAGS(m) & DBLK_UIOA) {
456 cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
457 " in so_rcv_head.\n", (void *)m);
458 }
459 m = m->b_next;
460 }
461 }
462 #endif
463 return (sodp->sod_uioa.uioa_mbytes);
464 }
465