xref: /illumos-gate/usr/src/uts/common/inet/ip/ipsecesp.c (revision a1cdd5a67f3bf3e60db3f3a77baef63640ad91a4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
25  * Copyright (c) 2017 Joyent, Inc.
26  */
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/stropts.h>
31 #include <sys/errno.h>
32 #include <sys/strlog.h>
33 #include <sys/tihdr.h>
34 #include <sys/socket.h>
35 #include <sys/ddi.h>
36 #include <sys/sunddi.h>
37 #include <sys/kmem.h>
38 #include <sys/zone.h>
39 #include <sys/sysmacros.h>
40 #include <sys/cmn_err.h>
41 #include <sys/vtrace.h>
42 #include <sys/debug.h>
43 #include <sys/atomic.h>
44 #include <sys/strsun.h>
45 #include <sys/random.h>
46 #include <netinet/in.h>
47 #include <net/if.h>
48 #include <netinet/ip6.h>
49 #include <net/pfkeyv2.h>
50 #include <net/pfpolicy.h>
51 
52 #include <inet/common.h>
53 #include <inet/mi.h>
54 #include <inet/nd.h>
55 #include <inet/ip.h>
56 #include <inet/ip_impl.h>
57 #include <inet/ip6.h>
58 #include <inet/ip_if.h>
59 #include <inet/ip_ndp.h>
60 #include <inet/sadb.h>
61 #include <inet/ipsec_info.h>
62 #include <inet/ipsec_impl.h>
63 #include <inet/ipsecesp.h>
64 #include <inet/ipdrop.h>
65 #include <inet/tcp.h>
66 #include <sys/kstat.h>
67 #include <sys/policy.h>
68 #include <sys/strsun.h>
69 #include <sys/strsubr.h>
70 #include <inet/udp_impl.h>
71 #include <sys/taskq.h>
72 #include <sys/note.h>
73 
74 #include <sys/tsol/tnet.h>
75 
76 /*
77  * Table of ND variables supported by ipsecesp. These are loaded into
78  * ipsecesp_g_nd in ipsecesp_init_nd.
79  * All of these are alterable, within the min/max values given, at run time.
80  */
81 static	ipsecespparam_t	lcl_param_arr[] = {
82 	/* min	max			value	name */
83 	{ 0,	3,			0,	"ipsecesp_debug"},
84 	{ 125,	32000, SADB_AGE_INTERVAL_DEFAULT, "ipsecesp_age_interval"},
85 	{ 1,	10,			1,	"ipsecesp_reap_delay"},
86 	{ 1,	SADB_MAX_REPLAY,	64,	"ipsecesp_replay_size"},
87 	{ 1,	300,			15,	"ipsecesp_acquire_timeout"},
88 	{ 1,	1800,			90,	"ipsecesp_larval_timeout"},
89 	/* Default lifetime values for ACQUIRE messages. */
90 	{ 0,	0xffffffffU,	0,	"ipsecesp_default_soft_bytes"},
91 	{ 0,	0xffffffffU,	0,	"ipsecesp_default_hard_bytes"},
92 	{ 0,	0xffffffffU,	24000,	"ipsecesp_default_soft_addtime"},
93 	{ 0,	0xffffffffU,	28800,	"ipsecesp_default_hard_addtime"},
94 	{ 0,	0xffffffffU,	0,	"ipsecesp_default_soft_usetime"},
95 	{ 0,	0xffffffffU,	0,	"ipsecesp_default_hard_usetime"},
96 	{ 0,	1,		0,	"ipsecesp_log_unknown_spi"},
97 	{ 0,	2,		1,	"ipsecesp_padding_check"},
98 	{ 0,	600,		20,	"ipsecesp_nat_keepalive_interval"},
99 };
100 /* For ipsecesp_nat_keepalive_interval, see ipsecesp.h. */
101 
/*
 * Debug print helpers.  The "a" argument is a complete, parenthesized
 * printf() argument list, e.g. esp1dbg(espstack, ("x = %d\n", x)).
 *
 * esp0dbg() always prints.  espNdbg() prints only when the stack's
 * ipsecesp_debug tunable is at least N.  The conditional forms are wrapped
 * in do { } while (0) so that each use is exactly one statement; a bare
 * "if" would silently capture the "else" of an enclosing unbraced if/else.
 */
#define	esp0dbg(a)	printf a
/* NOTE:  != 0 instead of > 0 so lint doesn't complain. */
#define	esp1dbg(espstack, a)	do {				\
	if ((espstack)->ipsecesp_debug != 0)			\
		printf a;					\
	/* CONSTCOND */						\
} while (0)
#define	esp2dbg(espstack, a)	do {				\
	if ((espstack)->ipsecesp_debug > 1)			\
		printf a;					\
	/* CONSTCOND */						\
} while (0)
#define	esp3dbg(espstack, a)	do {				\
	if ((espstack)->ipsecesp_debug > 2)			\
		printf a;					\
	/* CONSTCOND */						\
} while (0)
107 
108 static int ipsecesp_open(queue_t *, dev_t *, int, int, cred_t *);
109 static int ipsecesp_close(queue_t *, int, cred_t *);
110 static int ipsecesp_rput(queue_t *, mblk_t *);
111 static int ipsecesp_wput(queue_t *, mblk_t *);
112 static void	*ipsecesp_stack_init(netstackid_t stackid, netstack_t *ns);
113 static void	ipsecesp_stack_fini(netstackid_t stackid, void *arg);
114 
115 static void esp_prepare_udp(netstack_t *, mblk_t *, ipha_t *);
116 static void esp_outbound_finish(mblk_t *, ip_xmit_attr_t *);
117 static void esp_inbound_restart(mblk_t *, ip_recv_attr_t *);
118 
119 static boolean_t esp_register_out(uint32_t, uint32_t, uint_t,
120     ipsecesp_stack_t *, cred_t *);
121 static boolean_t esp_strip_header(mblk_t *, boolean_t, uint32_t,
122     kstat_named_t **, ipsecesp_stack_t *);
123 static mblk_t *esp_submit_req_inbound(mblk_t *, ip_recv_attr_t *,
124     ipsa_t *, uint_t);
125 static mblk_t *esp_submit_req_outbound(mblk_t *, ip_xmit_attr_t *,
126     ipsa_t *, uchar_t *, uint_t);
127 
128 /* Setable in /etc/system */
129 uint32_t esp_hash_size = IPSEC_DEFAULT_HASH_SIZE;
130 
131 static struct module_info info = {
132 	5137, "ipsecesp", 0, INFPSZ, 65536, 1024
133 };
134 
135 static struct qinit rinit = {
136 	ipsecesp_rput, NULL, ipsecesp_open, ipsecesp_close, NULL, &info,
137 	NULL
138 };
139 
140 static struct qinit winit = {
141 	ipsecesp_wput, NULL, ipsecesp_open, ipsecesp_close, NULL, &info,
142 	NULL
143 };
144 
145 struct streamtab ipsecespinfo = {
146 	&rinit, &winit, NULL, NULL
147 };
148 
149 static taskq_t *esp_taskq;
150 
151 /*
152  * OTOH, this one is set at open/close, and I'm D_MTQPAIR for now.
153  *
154  * Question:	Do I need this, given that all instance's esps->esps_wq point
155  *		to IP?
156  *
157  * Answer:	Yes, because I need to know which queue is BOUND to
158  *		IPPROTO_ESP
159  */
160 
161 static int	esp_kstat_update(kstat_t *, int);
162 
163 static boolean_t
164 esp_kstat_init(ipsecesp_stack_t *espstack, netstackid_t stackid)
165 {
166 	espstack->esp_ksp = kstat_create_netstack("ipsecesp", 0, "esp_stat",
167 	    "net", KSTAT_TYPE_NAMED,
168 	    sizeof (esp_kstats_t) / sizeof (kstat_named_t), 0, stackid);
169 
170 	if (espstack->esp_ksp == NULL || espstack->esp_ksp->ks_data == NULL)
171 		return (B_FALSE);
172 
173 	espstack->esp_kstats = espstack->esp_ksp->ks_data;
174 
175 	espstack->esp_ksp->ks_update = esp_kstat_update;
176 	espstack->esp_ksp->ks_private = (void *)(uintptr_t)stackid;
177 
178 #define	K64 KSTAT_DATA_UINT64
179 #define	KI(x) kstat_named_init(&(espstack->esp_kstats->esp_stat_##x), #x, K64)
180 
181 	KI(num_aalgs);
182 	KI(num_ealgs);
183 	KI(good_auth);
184 	KI(bad_auth);
185 	KI(bad_padding);
186 	KI(replay_failures);
187 	KI(replay_early_failures);
188 	KI(keysock_in);
189 	KI(out_requests);
190 	KI(acquire_requests);
191 	KI(bytes_expired);
192 	KI(out_discards);
193 	KI(crypto_sync);
194 	KI(crypto_async);
195 	KI(crypto_failures);
196 	KI(bad_decrypt);
197 	KI(sa_port_renumbers);
198 
199 #undef KI
200 #undef K64
201 
202 	kstat_install(espstack->esp_ksp);
203 
204 	return (B_TRUE);
205 }
206 
207 static int
208 esp_kstat_update(kstat_t *kp, int rw)
209 {
210 	esp_kstats_t *ekp;
211 	netstackid_t	stackid = (zoneid_t)(uintptr_t)kp->ks_private;
212 	netstack_t	*ns;
213 	ipsec_stack_t	*ipss;
214 
215 	if ((kp == NULL) || (kp->ks_data == NULL))
216 		return (EIO);
217 
218 	if (rw == KSTAT_WRITE)
219 		return (EACCES);
220 
221 	ns = netstack_find_by_stackid(stackid);
222 	if (ns == NULL)
223 		return (-1);
224 	ipss = ns->netstack_ipsec;
225 	if (ipss == NULL) {
226 		netstack_rele(ns);
227 		return (-1);
228 	}
229 	ekp = (esp_kstats_t *)kp->ks_data;
230 
231 	rw_enter(&ipss->ipsec_alg_lock, RW_READER);
232 	ekp->esp_stat_num_aalgs.value.ui64 =
233 	    ipss->ipsec_nalgs[IPSEC_ALG_AUTH];
234 	ekp->esp_stat_num_ealgs.value.ui64 =
235 	    ipss->ipsec_nalgs[IPSEC_ALG_ENCR];
236 	rw_exit(&ipss->ipsec_alg_lock);
237 
238 	netstack_rele(ns);
239 	return (0);
240 }
241 
242 #ifdef DEBUG
243 /*
244  * Debug routine, useful to see pre-encryption data.
245  */
246 static char *
247 dump_msg(mblk_t *mp)
248 {
249 	char tmp_str[3], tmp_line[256];
250 
251 	while (mp != NULL) {
252 		unsigned char *ptr;
253 
254 		printf("mblk address 0x%p, length %ld, db_ref %d "
255 		    "type %d, base 0x%p, lim 0x%p\n",
256 		    (void *) mp, (long)(mp->b_wptr - mp->b_rptr),
257 		    mp->b_datap->db_ref, mp->b_datap->db_type,
258 		    (void *)mp->b_datap->db_base, (void *)mp->b_datap->db_lim);
259 		ptr = mp->b_rptr;
260 
261 		tmp_line[0] = '\0';
262 		while (ptr < mp->b_wptr) {
263 			uint_t diff;
264 
265 			diff = (ptr - mp->b_rptr);
266 			if (!(diff & 0x1f)) {
267 				if (strlen(tmp_line) > 0) {
268 					printf("bytes: %s\n", tmp_line);
269 					tmp_line[0] = '\0';
270 				}
271 			}
272 			if (!(diff & 0x3))
273 				(void) strcat(tmp_line, " ");
274 			(void) sprintf(tmp_str, "%02x", *ptr);
275 			(void) strcat(tmp_line, tmp_str);
276 			ptr++;
277 		}
278 		if (strlen(tmp_line) > 0)
279 			printf("bytes: %s\n", tmp_line);
280 
281 		mp = mp->b_cont;
282 	}
283 
284 	return ("\n");
285 }
286 
287 #else /* DEBUG */
288 static char *
289 dump_msg(mblk_t *mp)
290 {
291 	printf("Find value of mp %p.\n", mp);
292 	return ("\n");
293 }
294 #endif /* DEBUG */
295 
296 /*
297  * Don't have to lock age_interval, as only one thread will access it at
298  * a time, because I control the one function that does with timeout().
299  */
300 static void
301 esp_ager(void *arg)
302 {
303 	ipsecesp_stack_t *espstack = (ipsecesp_stack_t *)arg;
304 	netstack_t	*ns = espstack->ipsecesp_netstack;
305 	hrtime_t begin = gethrtime();
306 
307 	sadb_ager(&espstack->esp_sadb.s_v4, espstack->esp_pfkey_q,
308 	    espstack->ipsecesp_reap_delay, ns);
309 	sadb_ager(&espstack->esp_sadb.s_v6, espstack->esp_pfkey_q,
310 	    espstack->ipsecesp_reap_delay, ns);
311 
312 	espstack->esp_event = sadb_retimeout(begin, espstack->esp_pfkey_q,
313 	    esp_ager, espstack,
314 	    &espstack->ipsecesp_age_interval, espstack->ipsecesp_age_int_max,
315 	    info.mi_idnum);
316 }
317 
318 /*
319  * Get an ESP NDD parameter.
320  */
321 /* ARGSUSED */
322 static int
323 ipsecesp_param_get(
324     queue_t	*q,
325     mblk_t	*mp,
326     caddr_t	cp,
327     cred_t *cr)
328 {
329 	ipsecespparam_t	*ipsecesppa = (ipsecespparam_t *)cp;
330 	uint_t value;
331 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
332 
333 	mutex_enter(&espstack->ipsecesp_param_lock);
334 	value = ipsecesppa->ipsecesp_param_value;
335 	mutex_exit(&espstack->ipsecesp_param_lock);
336 
337 	(void) mi_mpprintf(mp, "%u", value);
338 	return (0);
339 }
340 
341 /*
342  * This routine sets an NDD variable in a ipsecespparam_t structure.
343  */
344 /* ARGSUSED */
345 static int
346 ipsecesp_param_set(
347     queue_t	*q,
348     mblk_t	*mp,
349     char	*value,
350     caddr_t	cp,
351     cred_t *cr)
352 {
353 	ulong_t	new_value;
354 	ipsecespparam_t	*ipsecesppa = (ipsecespparam_t *)cp;
355 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
356 
357 	/*
358 	 * Fail the request if the new value does not lie within the
359 	 * required bounds.
360 	 */
361 	if (ddi_strtoul(value, NULL, 10, &new_value) != 0 ||
362 	    new_value < ipsecesppa->ipsecesp_param_min ||
363 	    new_value > ipsecesppa->ipsecesp_param_max) {
364 		return (EINVAL);
365 	}
366 
367 	/* Set the new value */
368 	mutex_enter(&espstack->ipsecesp_param_lock);
369 	ipsecesppa->ipsecesp_param_value = new_value;
370 	mutex_exit(&espstack->ipsecesp_param_lock);
371 	return (0);
372 }
373 
374 /*
375  * Using lifetime NDD variables, fill in an extended combination's
376  * lifetime information.
377  */
378 void
379 ipsecesp_fill_defs(sadb_x_ecomb_t *ecomb, netstack_t *ns)
380 {
381 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
382 
383 	ecomb->sadb_x_ecomb_soft_bytes = espstack->ipsecesp_default_soft_bytes;
384 	ecomb->sadb_x_ecomb_hard_bytes = espstack->ipsecesp_default_hard_bytes;
385 	ecomb->sadb_x_ecomb_soft_addtime =
386 	    espstack->ipsecesp_default_soft_addtime;
387 	ecomb->sadb_x_ecomb_hard_addtime =
388 	    espstack->ipsecesp_default_hard_addtime;
389 	ecomb->sadb_x_ecomb_soft_usetime =
390 	    espstack->ipsecesp_default_soft_usetime;
391 	ecomb->sadb_x_ecomb_hard_usetime =
392 	    espstack->ipsecesp_default_hard_usetime;
393 }
394 
395 /*
396  * Initialize things for ESP at module load time.
397  */
398 boolean_t
399 ipsecesp_ddi_init(void)
400 {
401 	esp_taskq = taskq_create("esp_taskq", 1, minclsyspri,
402 	    IPSEC_TASKQ_MIN, IPSEC_TASKQ_MAX, 0);
403 
404 	/*
405 	 * We want to be informed each time a stack is created or
406 	 * destroyed in the kernel, so we can maintain the
407 	 * set of ipsecesp_stack_t's.
408 	 */
409 	netstack_register(NS_IPSECESP, ipsecesp_stack_init, NULL,
410 	    ipsecesp_stack_fini);
411 
412 	return (B_TRUE);
413 }
414 
415 /*
416  * Walk through the param array specified registering each element with the
417  * named dispatch handler.
418  */
419 static boolean_t
420 ipsecesp_param_register(IDP *ndp, ipsecespparam_t *espp, int cnt)
421 {
422 	for (; cnt-- > 0; espp++) {
423 		if (espp->ipsecesp_param_name != NULL &&
424 		    espp->ipsecesp_param_name[0]) {
425 			if (!nd_load(ndp,
426 			    espp->ipsecesp_param_name,
427 			    ipsecesp_param_get, ipsecesp_param_set,
428 			    (caddr_t)espp)) {
429 				nd_free(ndp);
430 				return (B_FALSE);
431 			}
432 		}
433 	}
434 	return (B_TRUE);
435 }
436 
437 /*
438  * Initialize things for ESP for each stack instance
439  */
440 static void *
441 ipsecesp_stack_init(netstackid_t stackid, netstack_t *ns)
442 {
443 	ipsecesp_stack_t	*espstack;
444 	ipsecespparam_t		*espp;
445 
446 	espstack = (ipsecesp_stack_t *)kmem_zalloc(sizeof (*espstack),
447 	    KM_SLEEP);
448 	espstack->ipsecesp_netstack = ns;
449 
450 	espp = (ipsecespparam_t *)kmem_alloc(sizeof (lcl_param_arr), KM_SLEEP);
451 	espstack->ipsecesp_params = espp;
452 	bcopy(lcl_param_arr, espp, sizeof (lcl_param_arr));
453 
454 	(void) ipsecesp_param_register(&espstack->ipsecesp_g_nd, espp,
455 	    A_CNT(lcl_param_arr));
456 
457 	(void) esp_kstat_init(espstack, stackid);
458 
459 	espstack->esp_sadb.s_acquire_timeout =
460 	    &espstack->ipsecesp_acquire_timeout;
461 	sadbp_init("ESP", &espstack->esp_sadb, SADB_SATYPE_ESP, esp_hash_size,
462 	    espstack->ipsecesp_netstack);
463 
464 	mutex_init(&espstack->ipsecesp_param_lock, NULL, MUTEX_DEFAULT, 0);
465 
466 	ip_drop_register(&espstack->esp_dropper, "IPsec ESP");
467 	return (espstack);
468 }
469 
470 /*
471  * Destroy things for ESP at module unload time.
472  */
473 void
474 ipsecesp_ddi_destroy(void)
475 {
476 	netstack_unregister(NS_IPSECESP);
477 	taskq_destroy(esp_taskq);
478 }
479 
480 /*
481  * Destroy things for ESP for one stack instance
482  */
483 static void
484 ipsecesp_stack_fini(netstackid_t stackid, void *arg)
485 {
486 	ipsecesp_stack_t *espstack = (ipsecesp_stack_t *)arg;
487 
488 	if (espstack->esp_pfkey_q != NULL) {
489 		(void) quntimeout(espstack->esp_pfkey_q, espstack->esp_event);
490 	}
491 	espstack->esp_sadb.s_acquire_timeout = NULL;
492 	sadbp_destroy(&espstack->esp_sadb, espstack->ipsecesp_netstack);
493 	ip_drop_unregister(&espstack->esp_dropper);
494 	mutex_destroy(&espstack->ipsecesp_param_lock);
495 	nd_free(&espstack->ipsecesp_g_nd);
496 
497 	kmem_free(espstack->ipsecesp_params, sizeof (lcl_param_arr));
498 	espstack->ipsecesp_params = NULL;
499 	kstat_delete_netstack(espstack->esp_ksp, stackid);
500 	espstack->esp_ksp = NULL;
501 	espstack->esp_kstats = NULL;
502 	kmem_free(espstack, sizeof (*espstack));
503 }
504 
505 /*
506  * ESP module open routine, which is here for keysock plumbing.
507  * Keysock is pushed over {AH,ESP} which is an artifact from the Bad Old
508  * Days of export control, and fears that ESP would not be allowed
509  * to be shipped at all by default.  Eventually, keysock should
510  * either access AH and ESP via modstubs or krtld dependencies, or
511  * perhaps be folded in with AH and ESP into a single IPsec/netsec
512  * module ("netsec" if PF_KEY provides more than AH/ESP keying tables).
513  */
514 /* ARGSUSED */
515 static int
516 ipsecesp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
517 {
518 	netstack_t		*ns;
519 	ipsecesp_stack_t	*espstack;
520 
521 	if (secpolicy_ip_config(credp, B_FALSE) != 0)
522 		return (EPERM);
523 
524 	if (q->q_ptr != NULL)
525 		return (0);  /* Re-open of an already open instance. */
526 
527 	if (sflag != MODOPEN)
528 		return (EINVAL);
529 
530 	ns = netstack_find_by_cred(credp);
531 	ASSERT(ns != NULL);
532 	espstack = ns->netstack_ipsecesp;
533 	ASSERT(espstack != NULL);
534 
535 	q->q_ptr = espstack;
536 	WR(q)->q_ptr = q->q_ptr;
537 
538 	qprocson(q);
539 	return (0);
540 }
541 
542 /*
543  * ESP module close routine.
544  */
545 /* ARGSUSED */
546 static int
547 ipsecesp_close(queue_t *q, int flags __unused, cred_t *credp __unused)
548 {
549 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
550 
551 	/*
552 	 * Clean up q_ptr, if needed.
553 	 */
554 	qprocsoff(q);
555 
556 	/* Keysock queue check is safe, because of OCEXCL perimeter. */
557 
558 	if (q == espstack->esp_pfkey_q) {
559 		esp1dbg(espstack,
560 		    ("ipsecesp_close:  Ummm... keysock is closing ESP.\n"));
561 		espstack->esp_pfkey_q = NULL;
562 		/* Detach qtimeouts. */
563 		(void) quntimeout(q, espstack->esp_event);
564 	}
565 
566 	netstack_rele(espstack->ipsecesp_netstack);
567 	return (0);
568 }
569 
570 /*
571  * Add a number of bytes to what the SA has protected so far.  Return
572  * B_TRUE if the SA can still protect that many bytes.
573  *
574  * Caller must REFRELE the passed-in assoc.  This function must REFRELE
575  * any obtained peer SA.
576  */
577 static boolean_t
578 esp_age_bytes(ipsa_t *assoc, uint64_t bytes, boolean_t inbound)
579 {
580 	ipsa_t *inassoc, *outassoc;
581 	isaf_t *bucket;
582 	boolean_t inrc, outrc, isv6;
583 	sadb_t *sp;
584 	int outhash;
585 	netstack_t		*ns = assoc->ipsa_netstack;
586 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
587 
588 	/* No peer?  No problem! */
589 	if (!assoc->ipsa_haspeer) {
590 		return (sadb_age_bytes(espstack->esp_pfkey_q, assoc, bytes,
591 		    B_TRUE));
592 	}
593 
594 	/*
595 	 * Otherwise, we want to grab both the original assoc and its peer.
596 	 * There might be a race for this, but if it's a real race, two
597 	 * expire messages may occur.  We limit this by only sending the
598 	 * expire message on one of the peers, we'll pick the inbound
599 	 * arbitrarily.
600 	 *
601 	 * If we need tight synchronization on the peer SA, then we need to
602 	 * reconsider.
603 	 */
604 
605 	/* Use address length to select IPv6/IPv4 */
606 	isv6 = (assoc->ipsa_addrfam == AF_INET6);
607 	sp = isv6 ? &espstack->esp_sadb.s_v6 : &espstack->esp_sadb.s_v4;
608 
609 	if (inbound) {
610 		inassoc = assoc;
611 		if (isv6) {
612 			outhash = OUTBOUND_HASH_V6(sp, *((in6_addr_t *)
613 			    &inassoc->ipsa_dstaddr));
614 		} else {
615 			outhash = OUTBOUND_HASH_V4(sp, *((ipaddr_t *)
616 			    &inassoc->ipsa_dstaddr));
617 		}
618 		bucket = &sp->sdb_of[outhash];
619 		mutex_enter(&bucket->isaf_lock);
620 		outassoc = ipsec_getassocbyspi(bucket, inassoc->ipsa_spi,
621 		    inassoc->ipsa_srcaddr, inassoc->ipsa_dstaddr,
622 		    inassoc->ipsa_addrfam);
623 		mutex_exit(&bucket->isaf_lock);
624 		if (outassoc == NULL) {
625 			/* Q: Do we wish to set haspeer == B_FALSE? */
626 			esp0dbg(("esp_age_bytes: "
627 			    "can't find peer for inbound.\n"));
628 			return (sadb_age_bytes(espstack->esp_pfkey_q, inassoc,
629 			    bytes, B_TRUE));
630 		}
631 	} else {
632 		outassoc = assoc;
633 		bucket = INBOUND_BUCKET(sp, outassoc->ipsa_spi);
634 		mutex_enter(&bucket->isaf_lock);
635 		inassoc = ipsec_getassocbyspi(bucket, outassoc->ipsa_spi,
636 		    outassoc->ipsa_srcaddr, outassoc->ipsa_dstaddr,
637 		    outassoc->ipsa_addrfam);
638 		mutex_exit(&bucket->isaf_lock);
639 		if (inassoc == NULL) {
640 			/* Q: Do we wish to set haspeer == B_FALSE? */
641 			esp0dbg(("esp_age_bytes: "
642 			    "can't find peer for outbound.\n"));
643 			return (sadb_age_bytes(espstack->esp_pfkey_q, outassoc,
644 			    bytes, B_TRUE));
645 		}
646 	}
647 
648 	inrc = sadb_age_bytes(espstack->esp_pfkey_q, inassoc, bytes, B_TRUE);
649 	outrc = sadb_age_bytes(espstack->esp_pfkey_q, outassoc, bytes, B_FALSE);
650 
651 	/*
652 	 * REFRELE any peer SA.
653 	 *
654 	 * Because of the multi-line macro nature of IPSA_REFRELE, keep
655 	 * them in { }.
656 	 */
657 	if (inbound) {
658 		IPSA_REFRELE(outassoc);
659 	} else {
660 		IPSA_REFRELE(inassoc);
661 	}
662 
663 	return (inrc && outrc);
664 }
665 
666 /*
667  * Do incoming NAT-T manipulations for packet.
668  * Returns NULL if the mblk chain is consumed.
669  */
670 static mblk_t *
671 esp_fix_natt_checksums(mblk_t *data_mp, ipsa_t *assoc)
672 {
673 	ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
674 	tcpha_t *tcpha;
675 	udpha_t *udpha;
676 	/* Initialize to our inbound cksum adjustment... */
677 	uint32_t sum = assoc->ipsa_inbound_cksum;
678 
679 	switch (ipha->ipha_protocol) {
680 	case IPPROTO_TCP:
681 		tcpha = (tcpha_t *)(data_mp->b_rptr +
682 		    IPH_HDR_LENGTH(ipha));
683 
684 #define	DOWN_SUM(x) (x) = ((x) & 0xFFFF) +	 ((x) >> 16)
685 		sum += ~ntohs(tcpha->tha_sum) & 0xFFFF;
686 		DOWN_SUM(sum);
687 		DOWN_SUM(sum);
688 		tcpha->tha_sum = ~htons(sum);
689 		break;
690 	case IPPROTO_UDP:
691 		udpha = (udpha_t *)(data_mp->b_rptr + IPH_HDR_LENGTH(ipha));
692 
693 		if (udpha->uha_checksum != 0) {
694 			/* Adujst if the inbound one was not zero. */
695 			sum += ~ntohs(udpha->uha_checksum) & 0xFFFF;
696 			DOWN_SUM(sum);
697 			DOWN_SUM(sum);
698 			udpha->uha_checksum = ~htons(sum);
699 			if (udpha->uha_checksum == 0)
700 				udpha->uha_checksum = 0xFFFF;
701 		}
702 #undef DOWN_SUM
703 		break;
704 	case IPPROTO_IP:
705 		/*
706 		 * This case is only an issue for self-encapsulated
707 		 * packets.  So for now, fall through.
708 		 */
709 		break;
710 	}
711 	return (data_mp);
712 }
713 
714 
715 /*
716  * Strip ESP header, check padding, and fix IP header.
717  * Returns B_TRUE on success, B_FALSE if an error occured.
718  */
719 static boolean_t
720 esp_strip_header(mblk_t *data_mp, boolean_t isv4, uint32_t ivlen,
721     kstat_named_t **counter, ipsecesp_stack_t *espstack)
722 {
723 	ipha_t *ipha;
724 	ip6_t *ip6h;
725 	uint_t divpoint;
726 	mblk_t *scratch;
727 	uint8_t nexthdr, padlen;
728 	uint8_t lastpad;
729 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
730 	uint8_t *lastbyte;
731 
732 	/*
733 	 * Strip ESP data and fix IP header.
734 	 *
735 	 * XXX In case the beginning of esp_inbound() changes to not do a
736 	 * pullup, this part of the code can remain unchanged.
737 	 */
738 	if (isv4) {
739 		ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (ipha_t));
740 		ipha = (ipha_t *)data_mp->b_rptr;
741 		ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (esph_t) +
742 		    IPH_HDR_LENGTH(ipha));
743 		divpoint = IPH_HDR_LENGTH(ipha);
744 	} else {
745 		ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (ip6_t));
746 		ip6h = (ip6_t *)data_mp->b_rptr;
747 		divpoint = ip_hdr_length_v6(data_mp, ip6h);
748 	}
749 
750 	scratch = data_mp;
751 	while (scratch->b_cont != NULL)
752 		scratch = scratch->b_cont;
753 
754 	ASSERT((scratch->b_wptr - scratch->b_rptr) >= 3);
755 
756 	/*
757 	 * "Next header" and padding length are the last two bytes in the
758 	 * ESP-protected datagram, thus the explicit - 1 and - 2.
759 	 * lastpad is the last byte of the padding, which can be used for
760 	 * a quick check to see if the padding is correct.
761 	 */
762 	lastbyte = scratch->b_wptr - 1;
763 	nexthdr = *lastbyte--;
764 	padlen = *lastbyte--;
765 
766 	if (isv4) {
767 		/* Fix part of the IP header. */
768 		ipha->ipha_protocol = nexthdr;
769 		/*
770 		 * Reality check the padlen.  The explicit - 2 is for the
771 		 * padding length and the next-header bytes.
772 		 */
773 		if (padlen >= ntohs(ipha->ipha_length) - sizeof (ipha_t) - 2 -
774 		    sizeof (esph_t) - ivlen) {
775 			ESP_BUMP_STAT(espstack, bad_decrypt);
776 			ipsec_rl_strlog(espstack->ipsecesp_netstack,
777 			    info.mi_idnum, 0, 0,
778 			    SL_ERROR | SL_WARN,
779 			    "Corrupt ESP packet (padlen too big).\n");
780 			esp1dbg(espstack, ("padlen (%d) is greater than:\n",
781 			    padlen));
782 			esp1dbg(espstack, ("pkt len(%d) - ip hdr - esp "
783 			    "hdr - ivlen(%d) = %d.\n",
784 			    ntohs(ipha->ipha_length), ivlen,
785 			    (int)(ntohs(ipha->ipha_length) - sizeof (ipha_t) -
786 			    2 - sizeof (esph_t) - ivlen)));
787 			*counter = DROPPER(ipss, ipds_esp_bad_padlen);
788 			return (B_FALSE);
789 		}
790 
791 		/*
792 		 * Fix the rest of the header.  The explicit - 2 is for the
793 		 * padding length and the next-header bytes.
794 		 */
795 		ipha->ipha_length = htons(ntohs(ipha->ipha_length) - padlen -
796 		    2 - sizeof (esph_t) - ivlen);
797 		ipha->ipha_hdr_checksum = 0;
798 		ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
799 	} else {
800 		if (ip6h->ip6_nxt == IPPROTO_ESP) {
801 			ip6h->ip6_nxt = nexthdr;
802 		} else {
803 			ip_pkt_t ipp;
804 
805 			bzero(&ipp, sizeof (ipp));
806 			(void) ip_find_hdr_v6(data_mp, ip6h, B_FALSE, &ipp,
807 			    NULL);
808 			if (ipp.ipp_dstopts != NULL) {
809 				ipp.ipp_dstopts->ip6d_nxt = nexthdr;
810 			} else if (ipp.ipp_rthdr != NULL) {
811 				ipp.ipp_rthdr->ip6r_nxt = nexthdr;
812 			} else if (ipp.ipp_hopopts != NULL) {
813 				ipp.ipp_hopopts->ip6h_nxt = nexthdr;
814 			} else {
815 				/* Panic a DEBUG kernel. */
816 				ASSERT(ipp.ipp_hopopts != NULL);
817 				/* Otherwise, pretend it's IP + ESP. */
818 				cmn_err(CE_WARN, "ESP IPv6 headers wrong.\n");
819 				ip6h->ip6_nxt = nexthdr;
820 			}
821 		}
822 
823 		if (padlen >= ntohs(ip6h->ip6_plen) - 2 - sizeof (esph_t) -
824 		    ivlen) {
825 			ESP_BUMP_STAT(espstack, bad_decrypt);
826 			ipsec_rl_strlog(espstack->ipsecesp_netstack,
827 			    info.mi_idnum, 0, 0,
828 			    SL_ERROR | SL_WARN,
829 			    "Corrupt ESP packet (v6 padlen too big).\n");
830 			esp1dbg(espstack, ("padlen (%d) is greater than:\n",
831 			    padlen));
832 			esp1dbg(espstack,
833 			    ("pkt len(%u) - ip hdr - esp hdr - ivlen(%d) = "
834 			    "%u.\n", (unsigned)(ntohs(ip6h->ip6_plen)
835 			    + sizeof (ip6_t)), ivlen,
836 			    (unsigned)(ntohs(ip6h->ip6_plen) - 2 -
837 			    sizeof (esph_t) - ivlen)));
838 			*counter = DROPPER(ipss, ipds_esp_bad_padlen);
839 			return (B_FALSE);
840 		}
841 
842 
843 		/*
844 		 * Fix the rest of the header.  The explicit - 2 is for the
845 		 * padding length and the next-header bytes.  IPv6 is nice,
846 		 * because there's no hdr checksum!
847 		 */
848 		ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - padlen -
849 		    2 - sizeof (esph_t) - ivlen);
850 	}
851 
852 	if (espstack->ipsecesp_padding_check > 0 && padlen > 0) {
853 		/*
854 		 * Weak padding check: compare last-byte to length, they
855 		 * should be equal.
856 		 */
857 		lastpad = *lastbyte--;
858 
859 		if (padlen != lastpad) {
860 			ipsec_rl_strlog(espstack->ipsecesp_netstack,
861 			    info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
862 			    "Corrupt ESP packet (lastpad != padlen).\n");
863 			esp1dbg(espstack,
864 			    ("lastpad (%d) not equal to padlen (%d):\n",
865 			    lastpad, padlen));
866 			ESP_BUMP_STAT(espstack, bad_padding);
867 			*counter = DROPPER(ipss, ipds_esp_bad_padding);
868 			return (B_FALSE);
869 		}
870 
871 		/*
872 		 * Strong padding check: Check all pad bytes to see that
873 		 * they're ascending.  Go backwards using a descending counter
874 		 * to verify.  padlen == 1 is checked by previous block, so
875 		 * only bother if we've more than 1 byte of padding.
876 		 * Consequently, start the check one byte before the location
877 		 * of "lastpad".
878 		 */
879 		if (espstack->ipsecesp_padding_check > 1) {
880 			/*
881 			 * This assert may have to become an if and a pullup
882 			 * if we start accepting multi-dblk mblks. For now,
883 			 * though, any packet here will have been pulled up in
884 			 * esp_inbound.
885 			 */
886 			ASSERT(MBLKL(scratch) >= lastpad + 3);
887 
888 			/*
889 			 * Use "--lastpad" because we already checked the very
890 			 * last pad byte previously.
891 			 */
892 			while (--lastpad != 0) {
893 				if (lastpad != *lastbyte) {
894 					ipsec_rl_strlog(
895 					    espstack->ipsecesp_netstack,
896 					    info.mi_idnum, 0, 0,
897 					    SL_ERROR | SL_WARN, "Corrupt ESP "
898 					    "packet (bad padding).\n");
899 					esp1dbg(espstack,
900 					    ("padding not in correct"
901 					    " format:\n"));
902 					ESP_BUMP_STAT(espstack, bad_padding);
903 					*counter = DROPPER(ipss,
904 					    ipds_esp_bad_padding);
905 					return (B_FALSE);
906 				}
907 				lastbyte--;
908 			}
909 		}
910 	}
911 
912 	/* Trim off the padding. */
913 	ASSERT(data_mp->b_cont == NULL);
914 	data_mp->b_wptr -= (padlen + 2);
915 
916 	/*
917 	 * Remove the ESP header.
918 	 *
919 	 * The above assertions about data_mp's size will make this work.
920 	 *
921 	 * XXX  Question:  If I send up and get back a contiguous mblk,
922 	 * would it be quicker to bcopy over, or keep doing the dupb stuff?
923 	 * I go with copying for now.
924 	 */
925 
926 	if (IS_P2ALIGNED(data_mp->b_rptr, sizeof (uint32_t)) &&
927 	    IS_P2ALIGNED(ivlen, sizeof (uint32_t))) {
928 		uint8_t *start = data_mp->b_rptr;
929 		uint32_t *src, *dst;
930 
931 		src = (uint32_t *)(start + divpoint);
932 		dst = (uint32_t *)(start + divpoint + sizeof (esph_t) + ivlen);
933 
934 		ASSERT(IS_P2ALIGNED(dst, sizeof (uint32_t)) &&
935 		    IS_P2ALIGNED(src, sizeof (uint32_t)));
936 
937 		do {
938 			src--;
939 			dst--;
940 			*dst = *src;
941 		} while (src != (uint32_t *)start);
942 
943 		data_mp->b_rptr = (uchar_t *)dst;
944 	} else {
945 		uint8_t *start = data_mp->b_rptr;
946 		uint8_t *src, *dst;
947 
948 		src = start + divpoint;
949 		dst = src + sizeof (esph_t) + ivlen;
950 
951 		do {
952 			src--;
953 			dst--;
954 			*dst = *src;
955 		} while (src != start);
956 
957 		data_mp->b_rptr = dst;
958 	}
959 
960 	esp2dbg(espstack, ("data_mp after inbound ESP adjustment:\n"));
961 	esp2dbg(espstack, (dump_msg(data_mp)));
962 
963 	return (B_TRUE);
964 }
965 
966 /*
967  * Updating use times can be tricky business if the ipsa_haspeer flag is
968  * set.  This function is called once in an SA's lifetime.
969  *
970  * Caller has to REFRELE "assoc" which is passed in.  This function has
971  * to REFRELE any peer SA that is obtained.
972  */
973 static void
974 esp_set_usetime(ipsa_t *assoc, boolean_t inbound)
975 {
976 	ipsa_t *inassoc, *outassoc;
977 	isaf_t *bucket;
978 	sadb_t *sp;
979 	int outhash;
980 	boolean_t isv6;
981 	netstack_t		*ns = assoc->ipsa_netstack;
982 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
983 
984 	/* No peer?  No problem! */
985 	if (!assoc->ipsa_haspeer) {
986 		sadb_set_usetime(assoc);
987 		return;
988 	}
989 
990 	/*
991 	 * Otherwise, we want to grab both the original assoc and its peer.
992 	 * There might be a race for this, but if it's a real race, the times
993 	 * will be out-of-synch by at most a second, and since our time
994 	 * granularity is a second, this won't be a problem.
995 	 *
996 	 * If we need tight synchronization on the peer SA, then we need to
997 	 * reconsider.
998 	 */
999 
1000 	/* Use address length to select IPv6/IPv4 */
1001 	isv6 = (assoc->ipsa_addrfam == AF_INET6);
1002 	sp = isv6 ? &espstack->esp_sadb.s_v6 : &espstack->esp_sadb.s_v4;
1003 
1004 	if (inbound) {
1005 		inassoc = assoc;
1006 		if (isv6) {
1007 			outhash = OUTBOUND_HASH_V6(sp, *((in6_addr_t *)
1008 			    &inassoc->ipsa_dstaddr));
1009 		} else {
1010 			outhash = OUTBOUND_HASH_V4(sp, *((ipaddr_t *)
1011 			    &inassoc->ipsa_dstaddr));
1012 		}
1013 		bucket = &sp->sdb_of[outhash];
1014 		mutex_enter(&bucket->isaf_lock);
1015 		outassoc = ipsec_getassocbyspi(bucket, inassoc->ipsa_spi,
1016 		    inassoc->ipsa_srcaddr, inassoc->ipsa_dstaddr,
1017 		    inassoc->ipsa_addrfam);
1018 		mutex_exit(&bucket->isaf_lock);
1019 		if (outassoc == NULL) {
1020 			/* Q: Do we wish to set haspeer == B_FALSE? */
1021 			esp0dbg(("esp_set_usetime: "
1022 			    "can't find peer for inbound.\n"));
1023 			sadb_set_usetime(inassoc);
1024 			return;
1025 		}
1026 	} else {
1027 		outassoc = assoc;
1028 		bucket = INBOUND_BUCKET(sp, outassoc->ipsa_spi);
1029 		mutex_enter(&bucket->isaf_lock);
1030 		inassoc = ipsec_getassocbyspi(bucket, outassoc->ipsa_spi,
1031 		    outassoc->ipsa_srcaddr, outassoc->ipsa_dstaddr,
1032 		    outassoc->ipsa_addrfam);
1033 		mutex_exit(&bucket->isaf_lock);
1034 		if (inassoc == NULL) {
1035 			/* Q: Do we wish to set haspeer == B_FALSE? */
1036 			esp0dbg(("esp_set_usetime: "
1037 			    "can't find peer for outbound.\n"));
1038 			sadb_set_usetime(outassoc);
1039 			return;
1040 		}
1041 	}
1042 
1043 	/* Update usetime on both. */
1044 	sadb_set_usetime(inassoc);
1045 	sadb_set_usetime(outassoc);
1046 
1047 	/*
1048 	 * REFRELE any peer SA.
1049 	 *
1050 	 * Because of the multi-line macro nature of IPSA_REFRELE, keep
1051 	 * them in { }.
1052 	 */
1053 	if (inbound) {
1054 		IPSA_REFRELE(outassoc);
1055 	} else {
1056 		IPSA_REFRELE(inassoc);
1057 	}
1058 }
1059 
1060 /*
1061  * Handle ESP inbound data for IPv4 and IPv6.
1062  * On success returns B_TRUE, on failure returns B_FALSE and frees the
1063  * mblk chain data_mp.
1064  */
1065 mblk_t *
1066 esp_inbound(mblk_t *data_mp, void *arg, ip_recv_attr_t *ira)
1067 {
1068 	esph_t *esph = (esph_t *)arg;
1069 	ipsa_t *ipsa = ira->ira_ipsec_esp_sa;
1070 	netstack_t	*ns = ira->ira_ill->ill_ipst->ips_netstack;
1071 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1072 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1073 
1074 	/*
1075 	 * We may wish to check replay in-range-only here as an optimization.
1076 	 * Include the reality check of ipsa->ipsa_replay >
1077 	 * ipsa->ipsa_replay_wsize for times when it's the first N packets,
1078 	 * where N == ipsa->ipsa_replay_wsize.
1079 	 *
1080 	 * Another check that may come here later is the "collision" check.
1081 	 * If legitimate packets flow quickly enough, this won't be a problem,
1082 	 * but collisions may cause authentication algorithm crunching to
1083 	 * take place when it doesn't need to.
1084 	 */
1085 	if (!sadb_replay_peek(ipsa, esph->esph_replay)) {
1086 		ESP_BUMP_STAT(espstack, replay_early_failures);
1087 		IP_ESP_BUMP_STAT(ipss, in_discards);
1088 		ip_drop_packet(data_mp, B_TRUE, ira->ira_ill,
1089 		    DROPPER(ipss, ipds_esp_early_replay),
1090 		    &espstack->esp_dropper);
1091 		BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1092 		return (NULL);
1093 	}
1094 
1095 	/*
1096 	 * Adjust the IP header's payload length to reflect the removal
1097 	 * of the ICV.
1098 	 */
1099 	if (!(ira->ira_flags & IRAF_IS_IPV4)) {
1100 		ip6_t *ip6h = (ip6_t *)data_mp->b_rptr;
1101 		ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) -
1102 		    ipsa->ipsa_mac_len);
1103 	} else {
1104 		ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
1105 		ipha->ipha_length = htons(ntohs(ipha->ipha_length) -
1106 		    ipsa->ipsa_mac_len);
1107 	}
1108 
1109 	/* submit the request to the crypto framework */
1110 	return (esp_submit_req_inbound(data_mp, ira, ipsa,
1111 	    (uint8_t *)esph - data_mp->b_rptr));
1112 }
1113 
1114 /* XXX refactor me */
1115 /*
1116  * Handle the SADB_GETSPI message.  Create a larval SA.
1117  */
1118 static void
1119 esp_getspi(mblk_t *mp, keysock_in_t *ksi, ipsecesp_stack_t *espstack)
1120 {
1121 	ipsa_t *newbie, *target;
1122 	isaf_t *outbound, *inbound;
1123 	int rc, diagnostic;
1124 	sadb_sa_t *assoc;
1125 	keysock_out_t *kso;
1126 	uint32_t newspi;
1127 
1128 	/*
1129 	 * Randomly generate a proposed SPI value
1130 	 */
1131 	if (cl_inet_getspi != NULL) {
1132 		cl_inet_getspi(espstack->ipsecesp_netstack->netstack_stackid,
1133 		    IPPROTO_ESP, (uint8_t *)&newspi, sizeof (uint32_t), NULL);
1134 	} else {
1135 		(void) random_get_pseudo_bytes((uint8_t *)&newspi,
1136 		    sizeof (uint32_t));
1137 	}
1138 	newbie = sadb_getspi(ksi, newspi, &diagnostic,
1139 	    espstack->ipsecesp_netstack, IPPROTO_ESP);
1140 
1141 	if (newbie == NULL) {
1142 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, ENOMEM, diagnostic,
1143 		    ksi->ks_in_serial);
1144 		return;
1145 	} else if (newbie == (ipsa_t *)-1) {
1146 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, EINVAL, diagnostic,
1147 		    ksi->ks_in_serial);
1148 		return;
1149 	}
1150 
1151 	/*
1152 	 * XXX - We may randomly collide.  We really should recover from this.
1153 	 *	 Unfortunately, that could require spending way-too-much-time
1154 	 *	 in here.  For now, let the user retry.
1155 	 */
1156 
1157 	if (newbie->ipsa_addrfam == AF_INET6) {
1158 		outbound = OUTBOUND_BUCKET_V6(&espstack->esp_sadb.s_v6,
1159 		    *(uint32_t *)(newbie->ipsa_dstaddr));
1160 		inbound = INBOUND_BUCKET(&espstack->esp_sadb.s_v6,
1161 		    newbie->ipsa_spi);
1162 	} else {
1163 		ASSERT(newbie->ipsa_addrfam == AF_INET);
1164 		outbound = OUTBOUND_BUCKET_V4(&espstack->esp_sadb.s_v4,
1165 		    *(uint32_t *)(newbie->ipsa_dstaddr));
1166 		inbound = INBOUND_BUCKET(&espstack->esp_sadb.s_v4,
1167 		    newbie->ipsa_spi);
1168 	}
1169 
1170 	mutex_enter(&outbound->isaf_lock);
1171 	mutex_enter(&inbound->isaf_lock);
1172 
1173 	/*
1174 	 * Check for collisions (i.e. did sadb_getspi() return with something
1175 	 * that already exists?).
1176 	 *
1177 	 * Try outbound first.  Even though SADB_GETSPI is traditionally
1178 	 * for inbound SAs, you never know what a user might do.
1179 	 */
1180 	target = ipsec_getassocbyspi(outbound, newbie->ipsa_spi,
1181 	    newbie->ipsa_srcaddr, newbie->ipsa_dstaddr, newbie->ipsa_addrfam);
1182 	if (target == NULL) {
1183 		target = ipsec_getassocbyspi(inbound, newbie->ipsa_spi,
1184 		    newbie->ipsa_srcaddr, newbie->ipsa_dstaddr,
1185 		    newbie->ipsa_addrfam);
1186 	}
1187 
1188 	/*
1189 	 * I don't have collisions elsewhere!
1190 	 * (Nor will I because I'm still holding inbound/outbound locks.)
1191 	 */
1192 
1193 	if (target != NULL) {
1194 		rc = EEXIST;
1195 		IPSA_REFRELE(target);
1196 	} else {
1197 		/*
1198 		 * sadb_insertassoc() also checks for collisions, so
1199 		 * if there's a colliding entry, rc will be set
1200 		 * to EEXIST.
1201 		 */
1202 		rc = sadb_insertassoc(newbie, inbound);
1203 		newbie->ipsa_hardexpiretime = gethrestime_sec();
1204 		newbie->ipsa_hardexpiretime +=
1205 		    espstack->ipsecesp_larval_timeout;
1206 	}
1207 
1208 	/*
1209 	 * Can exit outbound mutex.  Hold inbound until we're done
1210 	 * with newbie.
1211 	 */
1212 	mutex_exit(&outbound->isaf_lock);
1213 
1214 	if (rc != 0) {
1215 		mutex_exit(&inbound->isaf_lock);
1216 		IPSA_REFRELE(newbie);
1217 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, rc,
1218 		    SADB_X_DIAGNOSTIC_NONE, ksi->ks_in_serial);
1219 		return;
1220 	}
1221 
1222 
1223 	/* Can write here because I'm still holding the bucket lock. */
1224 	newbie->ipsa_type = SADB_SATYPE_ESP;
1225 
1226 	/*
1227 	 * Construct successful return message. We have one thing going
1228 	 * for us in PF_KEY v2.  That's the fact that
1229 	 *	sizeof (sadb_spirange_t) == sizeof (sadb_sa_t)
1230 	 */
1231 	assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SPIRANGE];
1232 	assoc->sadb_sa_exttype = SADB_EXT_SA;
1233 	assoc->sadb_sa_spi = newbie->ipsa_spi;
1234 	*((uint64_t *)(&assoc->sadb_sa_replay)) = 0;
1235 	mutex_exit(&inbound->isaf_lock);
1236 
1237 	/* Convert KEYSOCK_IN to KEYSOCK_OUT. */
1238 	kso = (keysock_out_t *)ksi;
1239 	kso->ks_out_len = sizeof (*kso);
1240 	kso->ks_out_serial = ksi->ks_in_serial;
1241 	kso->ks_out_type = KEYSOCK_OUT;
1242 
1243 	/*
1244 	 * Can safely putnext() to esp_pfkey_q, because this is a turnaround
1245 	 * from the esp_pfkey_q.
1246 	 */
1247 	putnext(espstack->esp_pfkey_q, mp);
1248 }
1249 
1250 /*
1251  * Insert the ESP header into a packet.  Duplicate an mblk, and insert a newly
1252  * allocated mblk with the ESP header in between the two.
1253  */
1254 static boolean_t
1255 esp_insert_esp(mblk_t *mp, mblk_t *esp_mp, uint_t divpoint,
1256     ipsecesp_stack_t *espstack)
1257 {
1258 	mblk_t *split_mp = mp;
1259 	uint_t wheretodiv = divpoint;
1260 
1261 	while ((split_mp->b_wptr - split_mp->b_rptr) < wheretodiv) {
1262 		wheretodiv -= (split_mp->b_wptr - split_mp->b_rptr);
1263 		split_mp = split_mp->b_cont;
1264 		ASSERT(split_mp != NULL);
1265 	}
1266 
1267 	if (split_mp->b_wptr - split_mp->b_rptr != wheretodiv) {
1268 		mblk_t *scratch;
1269 
1270 		/* "scratch" is the 2nd half, split_mp is the first. */
1271 		scratch = dupb(split_mp);
1272 		if (scratch == NULL) {
1273 			esp1dbg(espstack,
1274 			    ("esp_insert_esp: can't allocate scratch.\n"));
1275 			return (B_FALSE);
1276 		}
1277 		/* NOTE:  dupb() doesn't set b_cont appropriately. */
1278 		scratch->b_cont = split_mp->b_cont;
1279 		scratch->b_rptr += wheretodiv;
1280 		split_mp->b_wptr = split_mp->b_rptr + wheretodiv;
1281 		split_mp->b_cont = scratch;
1282 	}
1283 	/*
1284 	 * At this point, split_mp is exactly "wheretodiv" bytes long, and
1285 	 * holds the end of the pre-ESP part of the datagram.
1286 	 */
1287 	esp_mp->b_cont = split_mp->b_cont;
1288 	split_mp->b_cont = esp_mp;
1289 
1290 	return (B_TRUE);
1291 }
1292 
1293 /*
1294  * Section 7 of RFC 3947 says:
1295  *
1296  * 7.  Recovering from the Expiring NAT Mappings
1297  *
1298  *    There are cases where NAT box decides to remove mappings that are still
1299  *    alive (for example, when the keepalive interval is too long, or when the
1300  *    NAT box is rebooted).  To recover from this, ends that are NOT behind
1301  *    NAT SHOULD use the last valid UDP encapsulated IKE or IPsec packet from
1302  *    the other end to determine which IP and port addresses should be used.
1303  *    The host behind dynamic NAT MUST NOT do this, as otherwise it opens a
1304  *    DoS attack possibility because the IP address or port of the other host
1305  *    will not change (it is not behind NAT).
1306  *
1307  *    Keepalives cannot be used for these purposes, as they are not
1308  *    authenticated, but any IKE authenticated IKE packet or ESP packet can be
1309  *    used to detect whether the IP address or the port has changed.
1310  *
1311  * The following function will check an SA and its explicitly-set pair to see
1312  * if the NAT-T remote port matches the received packet (which must have
1313  * passed ESP authentication, see esp_in_done() for the caller context).  If
1314  * there is a mismatch, the SAs are updated.  It is not important if we race
1315  * with a transmitting thread, as if there is a transmitting thread, it will
1316  * merely emit a packet that will most-likely be dropped.
1317  *
1318  * "ports" are ordered src,dst, and assoc is an inbound SA, where src should
1319  * match ipsa_remote_nat_port and dst should match ipsa_local_nat_port.
1320  */
1321 #ifdef _LITTLE_ENDIAN
1322 #define	FIRST_16(x) ((x) & 0xFFFF)
1323 #define	NEXT_16(x) (((x) >> 16) & 0xFFFF)
1324 #else
1325 #define	FIRST_16(x) (((x) >> 16) & 0xFFFF)
1326 #define	NEXT_16(x) ((x) & 0xFFFF)
1327 #endif
1328 static void
1329 esp_port_freshness(uint32_t ports, ipsa_t *assoc)
1330 {
1331 	uint16_t remote = FIRST_16(ports);
1332 	uint16_t local = NEXT_16(ports);
1333 	ipsa_t *outbound_peer;
1334 	isaf_t *bucket;
1335 	ipsecesp_stack_t *espstack = assoc->ipsa_netstack->netstack_ipsecesp;
1336 
1337 	/* We found a conn_t, therefore local != 0. */
1338 	ASSERT(local != 0);
1339 	/* Assume an IPv4 SA. */
1340 	ASSERT(assoc->ipsa_addrfam == AF_INET);
1341 
1342 	/*
1343 	 * On-the-wire rport == 0 means something's very wrong.
1344 	 * An unpaired SA is also useless to us.
1345 	 * If we are behind the NAT, don't bother.
1346 	 * A zero local NAT port defaults to 4500, so check that too.
1347 	 * And, of course, if the ports already match, we don't need to
1348 	 * bother.
1349 	 */
1350 	if (remote == 0 || assoc->ipsa_otherspi == 0 ||
1351 	    (assoc->ipsa_flags & IPSA_F_BEHIND_NAT) ||
1352 	    (assoc->ipsa_remote_nat_port == 0 &&
1353 	    remote == htons(IPPORT_IKE_NATT)) ||
1354 	    remote == assoc->ipsa_remote_nat_port)
1355 		return;
1356 
1357 	/* Try and snag the peer.   NOTE:  Assume IPv4 for now. */
1358 	bucket = OUTBOUND_BUCKET_V4(&(espstack->esp_sadb.s_v4),
1359 	    assoc->ipsa_srcaddr[0]);
1360 	mutex_enter(&bucket->isaf_lock);
1361 	outbound_peer = ipsec_getassocbyspi(bucket, assoc->ipsa_otherspi,
1362 	    assoc->ipsa_dstaddr, assoc->ipsa_srcaddr, AF_INET);
1363 	mutex_exit(&bucket->isaf_lock);
1364 
1365 	/* We probably lost a race to a deleting or expiring thread. */
1366 	if (outbound_peer == NULL)
1367 		return;
1368 
1369 	/*
1370 	 * Hold the mutexes for both SAs so we don't race another inbound
1371 	 * thread.  A lock-entry order shouldn't matter, since all other
1372 	 * per-ipsa locks are individually held-then-released.
1373 	 *
1374 	 * Luckily, this has nothing to do with the remote-NAT address,
1375 	 * so we don't have to re-scribble the cached-checksum differential.
1376 	 */
1377 	mutex_enter(&outbound_peer->ipsa_lock);
1378 	mutex_enter(&assoc->ipsa_lock);
1379 	outbound_peer->ipsa_remote_nat_port = assoc->ipsa_remote_nat_port =
1380 	    remote;
1381 	mutex_exit(&assoc->ipsa_lock);
1382 	mutex_exit(&outbound_peer->ipsa_lock);
1383 	IPSA_REFRELE(outbound_peer);
1384 	ESP_BUMP_STAT(espstack, sa_port_renumbers);
1385 }
1386 /*
1387  * Finish processing of an inbound ESP packet after processing by the
1388  * crypto framework.
1389  * - Remove the ESP header.
1390  * - Send packet back to IP.
1391  * If authentication was performed on the packet, this function is called
1392  * only if the authentication succeeded.
1393  * On success returns B_TRUE, on failure returns B_FALSE and frees the
1394  * mblk chain data_mp.
1395  */
1396 static mblk_t *
1397 esp_in_done(mblk_t *data_mp, ip_recv_attr_t *ira, ipsec_crypto_t *ic)
1398 {
1399 	ipsa_t *assoc;
1400 	uint_t espstart;
1401 	uint32_t ivlen = 0;
1402 	uint_t processed_len;
1403 	esph_t *esph;
1404 	kstat_named_t *counter;
1405 	boolean_t is_natt;
1406 	netstack_t	*ns = ira->ira_ill->ill_ipst->ips_netstack;
1407 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1408 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1409 
1410 	assoc = ira->ira_ipsec_esp_sa;
1411 	ASSERT(assoc != NULL);
1412 
1413 	is_natt = ((assoc->ipsa_flags & IPSA_F_NATT) != 0);
1414 
1415 	/* get the pointer to the ESP header */
1416 	if (assoc->ipsa_encr_alg == SADB_EALG_NULL) {
1417 		/* authentication-only ESP */
1418 		espstart = ic->ic_crypto_data.cd_offset;
1419 		processed_len = ic->ic_crypto_data.cd_length;
1420 	} else {
1421 		/* encryption present */
1422 		ivlen = assoc->ipsa_iv_len;
1423 		if (assoc->ipsa_auth_alg == SADB_AALG_NONE) {
1424 			/* encryption-only ESP */
1425 			espstart = ic->ic_crypto_data.cd_offset -
1426 			    sizeof (esph_t) - assoc->ipsa_iv_len;
1427 			processed_len = ic->ic_crypto_data.cd_length +
1428 			    ivlen;
1429 		} else {
1430 			/* encryption with authentication */
1431 			espstart = ic->ic_crypto_dual_data.dd_offset1;
1432 			processed_len = ic->ic_crypto_dual_data.dd_len2 +
1433 			    ivlen;
1434 		}
1435 	}
1436 
1437 	esph = (esph_t *)(data_mp->b_rptr + espstart);
1438 
1439 	if (assoc->ipsa_auth_alg != IPSA_AALG_NONE ||
1440 	    (assoc->ipsa_flags & IPSA_F_COMBINED)) {
1441 		/*
1442 		 * Authentication passed if we reach this point.
1443 		 * Packets with authentication will have the ICV
1444 		 * after the crypto data. Adjust b_wptr before
1445 		 * making padlen checks.
1446 		 */
1447 		ESP_BUMP_STAT(espstack, good_auth);
1448 		data_mp->b_wptr -= assoc->ipsa_mac_len;
1449 
1450 		/*
1451 		 * Check replay window here!
1452 		 * For right now, assume keysock will set the replay window
1453 		 * size to zero for SAs that have an unspecified sender.
1454 		 * This may change...
1455 		 */
1456 
1457 		if (!sadb_replay_check(assoc, esph->esph_replay)) {
1458 			/*
1459 			 * Log the event. As of now we print out an event.
1460 			 * Do not print the replay failure number, or else
1461 			 * syslog cannot collate the error messages.  Printing
1462 			 * the replay number that failed opens a denial-of-
1463 			 * service attack.
1464 			 */
1465 			ipsec_assocfailure(info.mi_idnum, 0, 0,
1466 			    SL_ERROR | SL_WARN,
1467 			    "Replay failed for ESP spi 0x%x, dst %s.\n",
1468 			    assoc->ipsa_spi, assoc->ipsa_dstaddr,
1469 			    assoc->ipsa_addrfam, espstack->ipsecesp_netstack);
1470 			ESP_BUMP_STAT(espstack, replay_failures);
1471 			counter = DROPPER(ipss, ipds_esp_replay);
1472 			goto drop_and_bail;
1473 		}
1474 
1475 		if (is_natt) {
1476 			ASSERT(ira->ira_flags & IRAF_ESP_UDP_PORTS);
1477 			ASSERT(ira->ira_esp_udp_ports != 0);
1478 			esp_port_freshness(ira->ira_esp_udp_ports, assoc);
1479 		}
1480 	}
1481 
1482 	esp_set_usetime(assoc, B_TRUE);
1483 
1484 	if (!esp_age_bytes(assoc, processed_len, B_TRUE)) {
1485 		/* The ipsa has hit hard expiration, LOG and AUDIT. */
1486 		ipsec_assocfailure(info.mi_idnum, 0, 0,
1487 		    SL_ERROR | SL_WARN,
1488 		    "ESP association 0x%x, dst %s had bytes expire.\n",
1489 		    assoc->ipsa_spi, assoc->ipsa_dstaddr, assoc->ipsa_addrfam,
1490 		    espstack->ipsecesp_netstack);
1491 		ESP_BUMP_STAT(espstack, bytes_expired);
1492 		counter = DROPPER(ipss, ipds_esp_bytes_expire);
1493 		goto drop_and_bail;
1494 	}
1495 
1496 	/*
1497 	 * Remove ESP header and padding from packet.  I hope the compiler
1498 	 * spews "branch, predict taken" code for this.
1499 	 */
1500 
1501 	if (esp_strip_header(data_mp, (ira->ira_flags & IRAF_IS_IPV4),
1502 	    ivlen, &counter, espstack)) {
1503 
1504 		if (is_system_labeled() && assoc->ipsa_tsl != NULL) {
1505 			if (!ip_recv_attr_replace_label(ira, assoc->ipsa_tsl)) {
1506 				ip_drop_packet(data_mp, B_TRUE, ira->ira_ill,
1507 				    DROPPER(ipss, ipds_ah_nomem),
1508 				    &espstack->esp_dropper);
1509 				BUMP_MIB(ira->ira_ill->ill_ip_mib,
1510 				    ipIfStatsInDiscards);
1511 				return (NULL);
1512 			}
1513 		}
1514 		if (is_natt)
1515 			return (esp_fix_natt_checksums(data_mp, assoc));
1516 
1517 		if (assoc->ipsa_state == IPSA_STATE_IDLE) {
1518 			/*
1519 			 * Cluster buffering case.  Tell caller that we're
1520 			 * handling the packet.
1521 			 */
1522 			sadb_buf_pkt(assoc, data_mp, ira);
1523 			return (NULL);
1524 		}
1525 
1526 		return (data_mp);
1527 	}
1528 
1529 	esp1dbg(espstack, ("esp_in_done: esp_strip_header() failed\n"));
1530 drop_and_bail:
1531 	IP_ESP_BUMP_STAT(ipss, in_discards);
1532 	ip_drop_packet(data_mp, B_TRUE, ira->ira_ill, counter,
1533 	    &espstack->esp_dropper);
1534 	BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1535 	return (NULL);
1536 }
1537 
1538 /*
1539  * Called upon failing the inbound ICV check. The message passed as
1540  * argument is freed.
1541  */
1542 static void
1543 esp_log_bad_auth(mblk_t *mp, ip_recv_attr_t *ira)
1544 {
1545 	ipsa_t		*assoc = ira->ira_ipsec_esp_sa;
1546 	netstack_t	*ns = ira->ira_ill->ill_ipst->ips_netstack;
1547 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1548 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1549 
1550 	/*
1551 	 * Log the event. Don't print to the console, block
1552 	 * potential denial-of-service attack.
1553 	 */
1554 	ESP_BUMP_STAT(espstack, bad_auth);
1555 
1556 	ipsec_assocfailure(info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
1557 	    "ESP Authentication failed for spi 0x%x, dst %s.\n",
1558 	    assoc->ipsa_spi, assoc->ipsa_dstaddr, assoc->ipsa_addrfam,
1559 	    espstack->ipsecesp_netstack);
1560 
1561 	IP_ESP_BUMP_STAT(ipss, in_discards);
1562 	ip_drop_packet(mp, B_TRUE, ira->ira_ill,
1563 	    DROPPER(ipss, ipds_esp_bad_auth),
1564 	    &espstack->esp_dropper);
1565 }
1566 
1567 
1568 /*
1569  * Invoked for outbound packets after ESP processing. If the packet
1570  * also requires AH, performs the AH SA selection and AH processing.
1571  *
1572  * Returns data_mp (possibly with AH added) unless data_mp was consumed
1573  * due to an error, or queued due to async. crypto or an ACQUIRE trigger.
1574  */
1575 static mblk_t *
1576 esp_do_outbound_ah(mblk_t *data_mp, ip_xmit_attr_t *ixa)
1577 {
1578 	ipsec_action_t *ap;
1579 
1580 	ap = ixa->ixa_ipsec_action;
1581 	if (ap == NULL) {
1582 		ipsec_policy_t *pp = ixa->ixa_ipsec_policy;
1583 		ap = pp->ipsp_act;
1584 	}
1585 
1586 	if (!ap->ipa_want_ah)
1587 		return (data_mp);
1588 
1589 	/*
1590 	 * Normally the AH SA would have already been put in place
1591 	 * but it could have been flushed so we need to look for it.
1592 	 */
1593 	if (ixa->ixa_ipsec_ah_sa == NULL) {
1594 		if (!ipsec_outbound_sa(data_mp, ixa, IPPROTO_AH)) {
1595 			sadb_acquire(data_mp, ixa, B_TRUE, B_FALSE);
1596 			return (NULL);
1597 		}
1598 	}
1599 	ASSERT(ixa->ixa_ipsec_ah_sa != NULL);
1600 
1601 	data_mp = ixa->ixa_ipsec_ah_sa->ipsa_output_func(data_mp, ixa);
1602 	return (data_mp);
1603 }
1604 
1605 
1606 /*
1607  * Kernel crypto framework callback invoked after completion of async
1608  * crypto requests for outbound packets.
1609  */
1610 static void
1611 esp_kcf_callback_outbound(void *arg, int status)
1612 {
1613 	mblk_t		*mp = (mblk_t *)arg;
1614 	mblk_t		*async_mp;
1615 	netstack_t	*ns;
1616 	ipsec_stack_t	*ipss;
1617 	ipsecesp_stack_t *espstack;
1618 	mblk_t		*data_mp;
1619 	ip_xmit_attr_t	ixas;
1620 	ipsec_crypto_t	*ic;
1621 	ill_t		*ill;
1622 
1623 	/*
1624 	 * First remove the ipsec_crypto_t mblk
1625 	 * Note that we need to ipsec_free_crypto_data(mp) once done with ic.
1626 	 */
1627 	async_mp = ipsec_remove_crypto_data(mp, &ic);
1628 	ASSERT(async_mp != NULL);
1629 
1630 	/*
1631 	 * Extract the ip_xmit_attr_t from the first mblk.
1632 	 * Verifies that the netstack and ill is still around; could
1633 	 * have vanished while kEf was doing its work.
1634 	 * On succesful return we have a nce_t and the ill/ipst can't
1635 	 * disappear until we do the nce_refrele in ixa_cleanup.
1636 	 */
1637 	data_mp = async_mp->b_cont;
1638 	async_mp->b_cont = NULL;
1639 	if (!ip_xmit_attr_from_mblk(async_mp, &ixas)) {
1640 		/* Disappeared on us - no ill/ipst for MIB */
1641 		/* We have nowhere to do stats since ixa_ipst could be NULL */
1642 		if (ixas.ixa_nce != NULL) {
1643 			ill = ixas.ixa_nce->nce_ill;
1644 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1645 			ip_drop_output("ipIfStatsOutDiscards", data_mp, ill);
1646 		}
1647 		freemsg(data_mp);
1648 		goto done;
1649 	}
1650 	ns = ixas.ixa_ipst->ips_netstack;
1651 	espstack = ns->netstack_ipsecesp;
1652 	ipss = ns->netstack_ipsec;
1653 	ill = ixas.ixa_nce->nce_ill;
1654 
1655 	if (status == CRYPTO_SUCCESS) {
1656 		/*
1657 		 * If a ICV was computed, it was stored by the
1658 		 * crypto framework at the end of the packet.
1659 		 */
1660 		ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
1661 
1662 		esp_set_usetime(ixas.ixa_ipsec_esp_sa, B_FALSE);
1663 		/* NAT-T packet. */
1664 		if (IPH_HDR_VERSION(ipha) == IP_VERSION &&
1665 		    ipha->ipha_protocol == IPPROTO_UDP)
1666 			esp_prepare_udp(ns, data_mp, ipha);
1667 
1668 		/* do AH processing if needed */
1669 		data_mp = esp_do_outbound_ah(data_mp, &ixas);
1670 		if (data_mp == NULL)
1671 			goto done;
1672 
1673 		(void) ip_output_post_ipsec(data_mp, &ixas);
1674 	} else {
1675 		/* Outbound shouldn't see invalid MAC */
1676 		ASSERT(status != CRYPTO_INVALID_MAC);
1677 
1678 		esp1dbg(espstack,
1679 		    ("esp_kcf_callback_outbound: crypto failed with 0x%x\n",
1680 		    status));
1681 		ESP_BUMP_STAT(espstack, crypto_failures);
1682 		ESP_BUMP_STAT(espstack, out_discards);
1683 		ip_drop_packet(data_mp, B_FALSE, ill,
1684 		    DROPPER(ipss, ipds_esp_crypto_failed),
1685 		    &espstack->esp_dropper);
1686 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1687 	}
1688 done:
1689 	ixa_cleanup(&ixas);
1690 	(void) ipsec_free_crypto_data(mp);
1691 }
1692 
1693 /*
1694  * Kernel crypto framework callback invoked after completion of async
1695  * crypto requests for inbound packets.
1696  */
1697 static void
1698 esp_kcf_callback_inbound(void *arg, int status)
1699 {
1700 	mblk_t		*mp = (mblk_t *)arg;
1701 	mblk_t		*async_mp;
1702 	netstack_t	*ns;
1703 	ipsecesp_stack_t *espstack;
1704 	ipsec_stack_t	*ipss;
1705 	mblk_t		*data_mp;
1706 	ip_recv_attr_t	iras;
1707 	ipsec_crypto_t	*ic;
1708 
1709 	/*
1710 	 * First remove the ipsec_crypto_t mblk
1711 	 * Note that we need to ipsec_free_crypto_data(mp) once done with ic.
1712 	 */
1713 	async_mp = ipsec_remove_crypto_data(mp, &ic);
1714 	ASSERT(async_mp != NULL);
1715 
1716 	/*
1717 	 * Extract the ip_recv_attr_t from the first mblk.
1718 	 * Verifies that the netstack and ill is still around; could
1719 	 * have vanished while kEf was doing its work.
1720 	 */
1721 	data_mp = async_mp->b_cont;
1722 	async_mp->b_cont = NULL;
1723 	if (!ip_recv_attr_from_mblk(async_mp, &iras)) {
1724 		/* The ill or ip_stack_t disappeared on us */
1725 		ip_drop_input("ip_recv_attr_from_mblk", data_mp, NULL);
1726 		freemsg(data_mp);
1727 		goto done;
1728 	}
1729 
1730 	ns = iras.ira_ill->ill_ipst->ips_netstack;
1731 	espstack = ns->netstack_ipsecesp;
1732 	ipss = ns->netstack_ipsec;
1733 
1734 	if (status == CRYPTO_SUCCESS) {
1735 		data_mp = esp_in_done(data_mp, &iras, ic);
1736 		if (data_mp == NULL)
1737 			goto done;
1738 
1739 		/* finish IPsec processing */
1740 		ip_input_post_ipsec(data_mp, &iras);
1741 	} else if (status == CRYPTO_INVALID_MAC) {
1742 		esp_log_bad_auth(data_mp, &iras);
1743 	} else {
1744 		esp1dbg(espstack,
1745 		    ("esp_kcf_callback: crypto failed with 0x%x\n",
1746 		    status));
1747 		ESP_BUMP_STAT(espstack, crypto_failures);
1748 		IP_ESP_BUMP_STAT(ipss, in_discards);
1749 		ip_drop_packet(data_mp, B_TRUE, iras.ira_ill,
1750 		    DROPPER(ipss, ipds_esp_crypto_failed),
1751 		    &espstack->esp_dropper);
1752 		BUMP_MIB(iras.ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1753 	}
1754 done:
1755 	ira_cleanup(&iras, B_TRUE);
1756 	(void) ipsec_free_crypto_data(mp);
1757 }
1758 
1759 /*
1760  * Invoked on crypto framework failure during inbound and outbound processing.
1761  */
1762 static void
1763 esp_crypto_failed(mblk_t *data_mp, boolean_t is_inbound, int kef_rc,
1764     ill_t *ill, ipsecesp_stack_t *espstack)
1765 {
1766 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
1767 
1768 	esp1dbg(espstack, ("crypto failed for %s ESP with 0x%x\n",
1769 	    is_inbound ? "inbound" : "outbound", kef_rc));
1770 	ip_drop_packet(data_mp, is_inbound, ill,
1771 	    DROPPER(ipss, ipds_esp_crypto_failed),
1772 	    &espstack->esp_dropper);
1773 	ESP_BUMP_STAT(espstack, crypto_failures);
1774 	if (is_inbound)
1775 		IP_ESP_BUMP_STAT(ipss, in_discards);
1776 	else
1777 		ESP_BUMP_STAT(espstack, out_discards);
1778 }
1779 
/*
 * Helper macros that fill in kernel crypto framework request structures.
 *
 * Each macro expands to exactly one statement (do { ... } while (0)), so it
 * must be followed by a semicolon and is safe as the body of an unbraced
 * if/else.  Arguments are parenthesized in the expansion, but several are
 * evaluated more than once: do not pass expressions with side effects.
 */

/*
 * Initialize a call request for an asynchronous operation.  _cr MUST point
 * to a modifiable crypto_call_req_t.  _mp is stored as the callback
 * argument and _callback as the completion callback.
 */
#define	ESP_INIT_CALLREQ(_cr, _mp, _callback) do {			\
	(_cr)->cr_flag = CRYPTO_SKIP_REQID|CRYPTO_ALWAYS_QUEUE;		\
	(_cr)->cr_callback_arg = (_mp);					\
	(_cr)->cr_callback_func = (_callback);				\
} while (0)

/*
 * Describe the ICV buffer: a raw buffer of icvlen bytes starting at icvbuf,
 * with offset 0.
 */
#define	ESP_INIT_CRYPTO_MAC(mac, icvlen, icvbuf) do {			\
	(mac)->cd_format = CRYPTO_DATA_RAW;				\
	(mac)->cd_offset = 0;						\
	(mac)->cd_length = (icvlen);					\
	(mac)->cd_raw.iov_base = (char *)(icvbuf);			\
	(mac)->cd_raw.iov_len = (icvlen);				\
} while (0)

/*
 * Describe len bytes of message data starting off bytes into mp.  If the
 * first mblk holds at least off + len bytes it is described as a raw
 * buffer covering that mblk; otherwise the whole mblk chain is passed.
 */
#define	ESP_INIT_CRYPTO_DATA(data, mp, off, len) do {			\
	if (MBLKL(mp) >= (len) + (off)) {				\
		(data)->cd_format = CRYPTO_DATA_RAW;			\
		(data)->cd_raw.iov_base = (char *)(mp)->b_rptr;		\
		(data)->cd_raw.iov_len = MBLKL(mp);			\
		(data)->cd_offset = (off);				\
	} else {							\
		(data)->cd_format = CRYPTO_DATA_MBLK;			\
		(data)->cd_mp = (mp);					\
		(data)->cd_offset = (off);				\
	}								\
	(data)->cd_length = (len);					\
} while (0)

/*
 * Describe two regions of the mblk chain mp for a dual operation:
 * off1/len1 for the first region and off2/len2 for the second.
 */
#define	ESP_INIT_CRYPTO_DUAL_DATA(data, mp, off1, len1, off2, len2) do { \
	(data)->dd_format = CRYPTO_DATA_MBLK;				\
	(data)->dd_mp = (mp);						\
	(data)->dd_len1 = (len1);					\
	(data)->dd_offset1 = (off1);					\
	(data)->dd_len2 = (len2);					\
	(data)->dd_offset2 = (off2);					\
} while (0)
1819 
1820 /*
1821  * Returns data_mp if successfully completed the request. Returns
1822  * NULL if it failed (and increments InDiscards) or if it is pending.
1823  */
1824 static mblk_t *
1825 esp_submit_req_inbound(mblk_t *esp_mp, ip_recv_attr_t *ira,
1826     ipsa_t *assoc, uint_t esph_offset)
1827 {
1828 	uint_t auth_offset, msg_len, auth_len;
1829 	crypto_call_req_t call_req, *callrp;
1830 	mblk_t *mp;
1831 	esph_t *esph_ptr;
1832 	int kef_rc;
1833 	uint_t icv_len = assoc->ipsa_mac_len;
1834 	crypto_ctx_template_t auth_ctx_tmpl;
1835 	boolean_t do_auth, do_encr, force;
1836 	uint_t encr_offset, encr_len;
1837 	uint_t iv_len = assoc->ipsa_iv_len;
1838 	crypto_ctx_template_t encr_ctx_tmpl;
1839 	ipsec_crypto_t	*ic, icstack;
1840 	uchar_t *iv_ptr;
1841 	netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack;
1842 	ipsec_stack_t *ipss = ns->netstack_ipsec;
1843 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1844 
1845 	do_auth = assoc->ipsa_auth_alg != SADB_AALG_NONE;
1846 	do_encr = assoc->ipsa_encr_alg != SADB_EALG_NULL;
1847 	force = (assoc->ipsa_flags & IPSA_F_ASYNC);
1848 
1849 #ifdef IPSEC_LATENCY_TEST
1850 	kef_rc = CRYPTO_SUCCESS;
1851 #else
1852 	kef_rc = CRYPTO_FAILED;
1853 #endif
1854 
1855 	/*
1856 	 * An inbound packet is of the form:
1857 	 * [IP,options,ESP,IV,data,ICV,pad]
1858 	 */
1859 	esph_ptr = (esph_t *)(esp_mp->b_rptr + esph_offset);
1860 	iv_ptr = (uchar_t *)(esph_ptr + 1);
1861 	/* Packet length starting at IP header ending after ESP ICV. */
1862 	msg_len = MBLKL(esp_mp);
1863 
1864 	encr_offset = esph_offset + sizeof (esph_t) + iv_len;
1865 	encr_len = msg_len - encr_offset;
1866 
1867 	/*
1868 	 * Counter mode algs need a nonce. This is setup in sadb_common_add().
1869 	 * If for some reason we are using a SA which does not have a nonce
1870 	 * then we must fail here.
1871 	 */
1872 	if ((assoc->ipsa_flags & IPSA_F_COUNTERMODE) &&
1873 	    (assoc->ipsa_nonce == NULL)) {
1874 		ip_drop_packet(esp_mp, B_TRUE, ira->ira_ill,
1875 		    DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper);
1876 		return (NULL);
1877 	}
1878 
1879 	if (force) {
1880 		/* We are doing asynch; allocate mblks to hold state */
1881 		if ((mp = ip_recv_attr_to_mblk(ira)) == NULL ||
1882 		    (mp = ipsec_add_crypto_data(mp, &ic)) == NULL) {
1883 			BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1884 			ip_drop_input("ipIfStatsInDiscards", esp_mp,
1885 			    ira->ira_ill);
1886 			return (NULL);
1887 		}
1888 		linkb(mp, esp_mp);
1889 		callrp = &call_req;
1890 		ESP_INIT_CALLREQ(callrp, mp, esp_kcf_callback_inbound);
1891 	} else {
1892 		/*
1893 		 * If we know we are going to do sync then ipsec_crypto_t
1894 		 * should be on the stack.
1895 		 */
1896 		ic = &icstack;
1897 		bzero(ic, sizeof (*ic));
1898 		callrp = NULL;
1899 	}
1900 
1901 	if (do_auth) {
1902 		/* authentication context template */
1903 		IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH,
1904 		    auth_ctx_tmpl);
1905 
1906 		/* ICV to be verified */
1907 		ESP_INIT_CRYPTO_MAC(&ic->ic_crypto_mac,
1908 		    icv_len, esp_mp->b_wptr - icv_len);
1909 
1910 		/* authentication starts at the ESP header */
1911 		auth_offset = esph_offset;
1912 		auth_len = msg_len - auth_offset - icv_len;
1913 		if (!do_encr) {
1914 			/* authentication only */
1915 			/* initialize input data argument */
1916 			ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data,
1917 			    esp_mp, auth_offset, auth_len);
1918 
1919 			/* call the crypto framework */
1920 			kef_rc = crypto_mac_verify(&assoc->ipsa_amech,
1921 			    &ic->ic_crypto_data,
1922 			    &assoc->ipsa_kcfauthkey, auth_ctx_tmpl,
1923 			    &ic->ic_crypto_mac, callrp);
1924 		}
1925 	}
1926 
1927 	if (do_encr) {
1928 		/* encryption template */
1929 		IPSEC_CTX_TMPL(assoc, ipsa_encrtmpl, IPSEC_ALG_ENCR,
1930 		    encr_ctx_tmpl);
1931 
1932 		/* Call the nonce update function. Also passes in IV */
1933 		(assoc->ipsa_noncefunc)(assoc, (uchar_t *)esph_ptr, encr_len,
1934 		    iv_ptr, &ic->ic_cmm, &ic->ic_crypto_data);
1935 
1936 		if (!do_auth) {
1937 			/* decryption only */
1938 			/* initialize input data argument */
1939 			ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data,
1940 			    esp_mp, encr_offset, encr_len);
1941 
1942 			/* call the crypto framework */
1943 			kef_rc = crypto_decrypt((crypto_mechanism_t *)
1944 			    &ic->ic_cmm, &ic->ic_crypto_data,
1945 			    &assoc->ipsa_kcfencrkey, encr_ctx_tmpl,
1946 			    NULL, callrp);
1947 		}
1948 	}
1949 
1950 	if (do_auth && do_encr) {
1951 		/* dual operation */
1952 		/* initialize input data argument */
1953 		ESP_INIT_CRYPTO_DUAL_DATA(&ic->ic_crypto_dual_data,
1954 		    esp_mp, auth_offset, auth_len,
1955 		    encr_offset, encr_len - icv_len);
1956 
1957 		/* specify IV */
1958 		ic->ic_crypto_dual_data.dd_miscdata = (char *)iv_ptr;
1959 
1960 		/* call the framework */
1961 		kef_rc = crypto_mac_verify_decrypt(&assoc->ipsa_amech,
1962 		    &assoc->ipsa_emech, &ic->ic_crypto_dual_data,
1963 		    &assoc->ipsa_kcfauthkey, &assoc->ipsa_kcfencrkey,
1964 		    auth_ctx_tmpl, encr_ctx_tmpl, &ic->ic_crypto_mac,
1965 		    NULL, callrp);
1966 	}
1967 
1968 	switch (kef_rc) {
1969 	case CRYPTO_SUCCESS:
1970 		ESP_BUMP_STAT(espstack, crypto_sync);
1971 		esp_mp = esp_in_done(esp_mp, ira, ic);
1972 		if (force) {
1973 			/* Free mp after we are done with ic */
1974 			mp = ipsec_free_crypto_data(mp);
1975 			(void) ip_recv_attr_free_mblk(mp);
1976 		}
1977 		return (esp_mp);
1978 	case CRYPTO_QUEUED:
1979 		/* esp_kcf_callback_inbound() will be invoked on completion */
1980 		ESP_BUMP_STAT(espstack, crypto_async);
1981 		return (NULL);
1982 	case CRYPTO_INVALID_MAC:
1983 		if (force) {
1984 			mp = ipsec_free_crypto_data(mp);
1985 			esp_mp = ip_recv_attr_free_mblk(mp);
1986 		}
1987 		ESP_BUMP_STAT(espstack, crypto_sync);
1988 		BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1989 		esp_log_bad_auth(esp_mp, ira);
1990 		/* esp_mp was passed to ip_drop_packet */
1991 		return (NULL);
1992 	}
1993 
1994 	if (force) {
1995 		mp = ipsec_free_crypto_data(mp);
1996 		esp_mp = ip_recv_attr_free_mblk(mp);
1997 	}
1998 	BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1999 	esp_crypto_failed(esp_mp, B_TRUE, kef_rc, ira->ira_ill, espstack);
2000 	/* esp_mp was passed to ip_drop_packet */
2001 	return (NULL);
2002 }
2003 
2004 /*
2005  * Compute the IP and UDP checksums -- common code for both keepalives and
2006  * actual ESP-in-UDP packets.  Be flexible with multiple mblks because ESP
2007  * uses mblk-insertion to insert the UDP header.
2008  * TODO - If there is an easy way to prep a packet for HW checksums, make
2009  * it happen here.
2010  * Note that this is used before both before calling ip_output_simple and
2011  * in the esp datapath. The former could use IXAF_SET_ULP_CKSUM but not the
2012  * latter.
2013  */
2014 static void
2015 esp_prepare_udp(netstack_t *ns, mblk_t *mp, ipha_t *ipha)
2016 {
2017 	int offset;
2018 	uint32_t cksum;
2019 	uint16_t *arr;
2020 	mblk_t *udpmp = mp;
2021 	uint_t hlen = IPH_HDR_LENGTH(ipha);
2022 
2023 	ASSERT(MBLKL(mp) >= sizeof (ipha_t));
2024 
2025 	ipha->ipha_hdr_checksum = 0;
2026 	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2027 
2028 	if (ns->netstack_udp->us_do_checksum) {
2029 		ASSERT(MBLKL(udpmp) >= sizeof (udpha_t));
2030 		/* arr points to the IP header. */
2031 		arr = (uint16_t *)ipha;
2032 		IP_STAT(ns->netstack_ip, ip_out_sw_cksum);
2033 		IP_STAT_UPDATE(ns->netstack_ip, ip_out_sw_cksum_bytes,
2034 		    ntohs(htons(ipha->ipha_length) - hlen));
2035 		/* arr[6-9] are the IP addresses. */
2036 		cksum = IP_UDP_CSUM_COMP + arr[6] + arr[7] + arr[8] + arr[9] +
2037 		    ntohs(htons(ipha->ipha_length) - hlen);
2038 		cksum = IP_CSUM(mp, hlen, cksum);
2039 		offset = hlen + UDP_CHECKSUM_OFFSET;
2040 		while (offset >= MBLKL(udpmp)) {
2041 			offset -= MBLKL(udpmp);
2042 			udpmp = udpmp->b_cont;
2043 		}
2044 		/* arr points to the UDP header's checksum field. */
2045 		arr = (uint16_t *)(udpmp->b_rptr + offset);
2046 		*arr = cksum;
2047 	}
2048 }
2049 
2050 /*
2051  * taskq handler so we can send the NAT-T keepalive on a separate thread.
2052  */
2053 static void
2054 actually_send_keepalive(void *arg)
2055 {
2056 	mblk_t *mp = (mblk_t *)arg;
2057 	ip_xmit_attr_t ixas;
2058 	netstack_t	*ns;
2059 	netstackid_t	stackid;
2060 
2061 	stackid = (netstackid_t)(uintptr_t)mp->b_prev;
2062 	mp->b_prev = NULL;
2063 	ns = netstack_find_by_stackid(stackid);
2064 	if (ns == NULL) {
2065 		/* Disappeared */
2066 		ip_drop_output("ipIfStatsOutDiscards", mp, NULL);
2067 		freemsg(mp);
2068 		return;
2069 	}
2070 
2071 	bzero(&ixas, sizeof (ixas));
2072 	ixas.ixa_zoneid = ALL_ZONES;
2073 	ixas.ixa_cred = kcred;
2074 	ixas.ixa_cpid = NOPID;
2075 	ixas.ixa_tsl = NULL;
2076 	ixas.ixa_ipst = ns->netstack_ip;
2077 	/* No ULP checksum; done by esp_prepare_udp */
2078 	ixas.ixa_flags = (IXAF_IS_IPV4 | IXAF_NO_IPSEC | IXAF_VERIFY_SOURCE);
2079 
2080 	(void) ip_output_simple(mp, &ixas);
2081 	ixa_cleanup(&ixas);
2082 	netstack_rele(ns);
2083 }
2084 
2085 /*
2086  * Send a one-byte UDP NAT-T keepalive.
2087  */
2088 void
2089 ipsecesp_send_keepalive(ipsa_t *assoc)
2090 {
2091 	mblk_t		*mp;
2092 	ipha_t		*ipha;
2093 	udpha_t		*udpha;
2094 	netstack_t	*ns = assoc->ipsa_netstack;
2095 
2096 	ASSERT(MUTEX_NOT_HELD(&assoc->ipsa_lock));
2097 
2098 	mp = allocb(sizeof (ipha_t) + sizeof (udpha_t) + 1, BPRI_HI);
2099 	if (mp == NULL)
2100 		return;
2101 	ipha = (ipha_t *)mp->b_rptr;
2102 	ipha->ipha_version_and_hdr_length = IP_SIMPLE_HDR_VERSION;
2103 	ipha->ipha_type_of_service = 0;
2104 	ipha->ipha_length = htons(sizeof (ipha_t) + sizeof (udpha_t) + 1);
2105 	/* Use the low-16 of the SPI so we have some clue where it came from. */
2106 	ipha->ipha_ident = *(((uint16_t *)(&assoc->ipsa_spi)) + 1);
2107 	ipha->ipha_fragment_offset_and_flags = 0;  /* Too small to fragment! */
2108 	ipha->ipha_ttl = 0xFF;
2109 	ipha->ipha_protocol = IPPROTO_UDP;
2110 	ipha->ipha_hdr_checksum = 0;
2111 	ipha->ipha_src = assoc->ipsa_srcaddr[0];
2112 	ipha->ipha_dst = assoc->ipsa_dstaddr[0];
2113 	udpha = (udpha_t *)(ipha + 1);
2114 	udpha->uha_src_port = (assoc->ipsa_local_nat_port != 0) ?
2115 	    assoc->ipsa_local_nat_port : htons(IPPORT_IKE_NATT);
2116 	udpha->uha_dst_port = (assoc->ipsa_remote_nat_port != 0) ?
2117 	    assoc->ipsa_remote_nat_port : htons(IPPORT_IKE_NATT);
2118 	udpha->uha_length = htons(sizeof (udpha_t) + 1);
2119 	udpha->uha_checksum = 0;
2120 	mp->b_wptr = (uint8_t *)(udpha + 1);
2121 	*(mp->b_wptr++) = 0xFF;
2122 
2123 	esp_prepare_udp(ns, mp, ipha);
2124 
2125 	/*
2126 	 * We're holding an isaf_t bucket lock, so pawn off the actual
2127 	 * packet transmission to another thread.  Just in case syncq
2128 	 * processing causes a same-bucket packet to be processed.
2129 	 */
2130 	mp->b_prev = (mblk_t *)(uintptr_t)ns->netstack_stackid;
2131 
2132 	if (taskq_dispatch(esp_taskq, actually_send_keepalive, mp,
2133 	    TQ_NOSLEEP) == TASKQID_INVALID) {
2134 		/* Assume no memory if taskq_dispatch() fails. */
2135 		mp->b_prev = NULL;
2136 		ip_drop_packet(mp, B_FALSE, NULL,
2137 		    DROPPER(ns->netstack_ipsec, ipds_esp_nomem),
2138 		    &ns->netstack_ipsecesp->esp_dropper);
2139 	}
2140 }
2141 
2142 /*
2143  * Returns mp if successfully completed the request. Returns
2144  * NULL if it failed (and increments InDiscards) or if it is pending.
2145  */
2146 static mblk_t *
2147 esp_submit_req_outbound(mblk_t *data_mp, ip_xmit_attr_t *ixa, ipsa_t *assoc,
2148     uchar_t *icv_buf, uint_t payload_len)
2149 {
2150 	uint_t auth_len;
2151 	crypto_call_req_t call_req, *callrp;
2152 	mblk_t *esp_mp;
2153 	esph_t *esph_ptr;
2154 	mblk_t *mp;
2155 	int kef_rc = CRYPTO_FAILED;
2156 	uint_t icv_len = assoc->ipsa_mac_len;
2157 	crypto_ctx_template_t auth_ctx_tmpl;
2158 	boolean_t do_auth, do_encr, force;
2159 	uint_t iv_len = assoc->ipsa_iv_len;
2160 	crypto_ctx_template_t encr_ctx_tmpl;
2161 	boolean_t is_natt = ((assoc->ipsa_flags & IPSA_F_NATT) != 0);
2162 	size_t esph_offset = (is_natt ? UDPH_SIZE : 0);
2163 	netstack_t	*ns = ixa->ixa_ipst->ips_netstack;
2164 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
2165 	ipsec_crypto_t	*ic, icstack;
2166 	uchar_t		*iv_ptr;
2167 	crypto_data_t	*cd_ptr = NULL;
2168 	ill_t		*ill = ixa->ixa_nce->nce_ill;
2169 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
2170 
2171 	esp3dbg(espstack, ("esp_submit_req_outbound:%s",
2172 	    is_natt ? "natt" : "not natt"));
2173 
2174 	do_encr = assoc->ipsa_encr_alg != SADB_EALG_NULL;
2175 	do_auth = assoc->ipsa_auth_alg != SADB_AALG_NONE;
2176 	force = (assoc->ipsa_flags & IPSA_F_ASYNC);
2177 
2178 #ifdef IPSEC_LATENCY_TEST
2179 	kef_rc = CRYPTO_SUCCESS;
2180 #else
2181 	kef_rc = CRYPTO_FAILED;
2182 #endif
2183 
2184 	/*
2185 	 * Outbound IPsec packets are of the form:
2186 	 * [IP,options] -> [ESP,IV] -> [data] -> [pad,ICV]
2187 	 * unless it's NATT, then it's
2188 	 * [IP,options] -> [udp][ESP,IV] -> [data] -> [pad,ICV]
2189 	 * Get a pointer to the mblk containing the ESP header.
2190 	 */
2191 	ASSERT(data_mp->b_cont != NULL);
2192 	esp_mp = data_mp->b_cont;
2193 	esph_ptr = (esph_t *)(esp_mp->b_rptr + esph_offset);
2194 	iv_ptr = (uchar_t *)(esph_ptr + 1);
2195 
2196 	/*
2197 	 * Combined mode algs need a nonce. This is setup in sadb_common_add().
2198 	 * If for some reason we are using a SA which does not have a nonce
2199 	 * then we must fail here.
2200 	 */
2201 	if ((assoc->ipsa_flags & IPSA_F_COUNTERMODE) &&
2202 	    (assoc->ipsa_nonce == NULL)) {
2203 		ip_drop_packet(data_mp, B_FALSE, NULL,
2204 		    DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper);
2205 		return (NULL);
2206 	}
2207 
2208 	if (force) {
2209 		/* We are doing asynch; allocate mblks to hold state */
2210 		if ((mp = ip_xmit_attr_to_mblk(ixa)) == NULL ||
2211 		    (mp = ipsec_add_crypto_data(mp, &ic)) == NULL) {
2212 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2213 			ip_drop_output("ipIfStatsOutDiscards", data_mp, ill);
2214 			freemsg(data_mp);
2215 			return (NULL);
2216 		}
2217 
2218 		linkb(mp, data_mp);
2219 		callrp = &call_req;
2220 		ESP_INIT_CALLREQ(callrp, mp, esp_kcf_callback_outbound);
2221 	} else {
2222 		/*
2223 		 * If we know we are going to do sync then ipsec_crypto_t
2224 		 * should be on the stack.
2225 		 */
2226 		ic = &icstack;
2227 		bzero(ic, sizeof (*ic));
2228 		callrp = NULL;
2229 	}
2230 
2231 
2232 	if (do_auth) {
2233 		/* authentication context template */
2234 		IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH,
2235 		    auth_ctx_tmpl);
2236 
2237 		/* where to store the computed mac */
2238 		ESP_INIT_CRYPTO_MAC(&ic->ic_crypto_mac,
2239 		    icv_len, icv_buf);
2240 
2241 		/* authentication starts at the ESP header */
2242 		auth_len = payload_len + iv_len + sizeof (esph_t);
2243 		if (!do_encr) {
2244 			/* authentication only */
2245 			/* initialize input data argument */
2246 			ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data,
2247 			    esp_mp, esph_offset, auth_len);
2248 
2249 			/* call the crypto framework */
2250 			kef_rc = crypto_mac(&assoc->ipsa_amech,
2251 			    &ic->ic_crypto_data,
2252 			    &assoc->ipsa_kcfauthkey, auth_ctx_tmpl,
2253 			    &ic->ic_crypto_mac, callrp);
2254 		}
2255 	}
2256 
2257 	if (do_encr) {
2258 		/* encryption context template */
2259 		IPSEC_CTX_TMPL(assoc, ipsa_encrtmpl, IPSEC_ALG_ENCR,
2260 		    encr_ctx_tmpl);
2261 		/* Call the nonce update function. */
2262 		(assoc->ipsa_noncefunc)(assoc, (uchar_t *)esph_ptr, payload_len,
2263 		    iv_ptr, &ic->ic_cmm, &ic->ic_crypto_data);
2264 
2265 		if (!do_auth) {
2266 			/* encryption only, skip mblk that contains ESP hdr */
2267 			/* initialize input data argument */
2268 			ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data,
2269 			    esp_mp->b_cont, 0, payload_len);
2270 
2271 			/*
2272 			 * For combined mode ciphers, the ciphertext is the same
2273 			 * size as the clear text, the ICV should follow the
2274 			 * ciphertext. To convince the kcf to allow in-line
2275 			 * encryption, with an ICV, use ipsec_out_crypto_mac
2276 			 * to point to the same buffer as the data. The calling
2277 			 * function need to ensure the buffer is large enough to
2278 			 * include the ICV.
2279 			 *
2280 			 * The IV is already written to the packet buffer, the
2281 			 * nonce setup function copied it to the params struct
2282 			 * for the cipher to use.
2283 			 */
2284 			if (assoc->ipsa_flags & IPSA_F_COMBINED) {
2285 				bcopy(&ic->ic_crypto_data,
2286 				    &ic->ic_crypto_mac,
2287 				    sizeof (crypto_data_t));
2288 				ic->ic_crypto_mac.cd_length =
2289 				    payload_len + icv_len;
2290 				cd_ptr = &ic->ic_crypto_mac;
2291 			}
2292 
2293 			/* call the crypto framework */
2294 			kef_rc = crypto_encrypt((crypto_mechanism_t *)
2295 			    &ic->ic_cmm, &ic->ic_crypto_data,
2296 			    &assoc->ipsa_kcfencrkey, encr_ctx_tmpl,
2297 			    cd_ptr, callrp);
2298 
2299 		}
2300 	}
2301 
2302 	if (do_auth && do_encr) {
2303 		/*
2304 		 * Encryption and authentication:
2305 		 * Pass the pointer to the mblk chain starting at the ESP
2306 		 * header to the framework. Skip the ESP header mblk
2307 		 * for encryption, which is reflected by an encryption
2308 		 * offset equal to the length of that mblk. Start
2309 		 * the authentication at the ESP header, i.e. use an
2310 		 * authentication offset of zero.
2311 		 */
2312 		ESP_INIT_CRYPTO_DUAL_DATA(&ic->ic_crypto_dual_data,
2313 		    esp_mp, MBLKL(esp_mp), payload_len, esph_offset, auth_len);
2314 
2315 		/* specify IV */
2316 		ic->ic_crypto_dual_data.dd_miscdata = (char *)iv_ptr;
2317 
2318 		/* call the framework */
2319 		kef_rc = crypto_encrypt_mac(&assoc->ipsa_emech,
2320 		    &assoc->ipsa_amech, NULL,
2321 		    &assoc->ipsa_kcfencrkey, &assoc->ipsa_kcfauthkey,
2322 		    encr_ctx_tmpl, auth_ctx_tmpl,
2323 		    &ic->ic_crypto_dual_data,
2324 		    &ic->ic_crypto_mac, callrp);
2325 	}
2326 
2327 	switch (kef_rc) {
2328 	case CRYPTO_SUCCESS:
2329 		ESP_BUMP_STAT(espstack, crypto_sync);
2330 		esp_set_usetime(assoc, B_FALSE);
2331 		if (force) {
2332 			mp = ipsec_free_crypto_data(mp);
2333 			data_mp = ip_xmit_attr_free_mblk(mp);
2334 		}
2335 		if (is_natt)
2336 			esp_prepare_udp(ns, data_mp, (ipha_t *)data_mp->b_rptr);
2337 		return (data_mp);
2338 	case CRYPTO_QUEUED:
2339 		/* esp_kcf_callback_outbound() will be invoked on completion */
2340 		ESP_BUMP_STAT(espstack, crypto_async);
2341 		return (NULL);
2342 	}
2343 
2344 	if (force) {
2345 		mp = ipsec_free_crypto_data(mp);
2346 		data_mp = ip_xmit_attr_free_mblk(mp);
2347 	}
2348 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2349 	esp_crypto_failed(data_mp, B_FALSE, kef_rc, NULL, espstack);
2350 	/* data_mp was passed to ip_drop_packet */
2351 	return (NULL);
2352 }
2353 
2354 /*
2355  * Handle outbound IPsec processing for IPv4 and IPv6
2356  *
2357  * Returns data_mp if successfully completed the request. Returns
2358  * NULL if it failed (and increments InDiscards) or if it is pending.
2359  */
2360 static mblk_t *
2361 esp_outbound(mblk_t *data_mp, ip_xmit_attr_t *ixa)
2362 {
2363 	mblk_t *espmp, *tailmp;
2364 	ipha_t *ipha;
2365 	ip6_t *ip6h;
2366 	esph_t *esph_ptr, *iv_ptr;
2367 	uint_t af;
2368 	uint8_t *nhp;
2369 	uintptr_t divpoint, datalen, adj, padlen, i, alloclen;
2370 	uintptr_t esplen = sizeof (esph_t);
2371 	uint8_t protocol;
2372 	ipsa_t *assoc;
2373 	uint_t iv_len, block_size, mac_len = 0;
2374 	uchar_t *icv_buf;
2375 	udpha_t *udpha;
2376 	boolean_t is_natt = B_FALSE;
2377 	netstack_t	*ns = ixa->ixa_ipst->ips_netstack;
2378 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
2379 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
2380 	ill_t		*ill = ixa->ixa_nce->nce_ill;
2381 	boolean_t	need_refrele = B_FALSE;
2382 
2383 	ESP_BUMP_STAT(espstack, out_requests);
2384 
2385 	/*
2386 	 * <sigh> We have to copy the message here, because TCP (for example)
2387 	 * keeps a dupb() of the message lying around for retransmission.
2388 	 * Since ESP changes the whole of the datagram, we have to create our
2389 	 * own copy lest we clobber TCP's data.  Since we have to copy anyway,
2390 	 * we might as well make use of msgpullup() and get the mblk into one
2391 	 * contiguous piece!
2392 	 */
2393 	tailmp = msgpullup(data_mp, -1);
2394 	if (tailmp == NULL) {
2395 		esp0dbg(("esp_outbound: msgpullup() failed, "
2396 		    "dropping packet.\n"));
2397 		ip_drop_packet(data_mp, B_FALSE, ill,
2398 		    DROPPER(ipss, ipds_esp_nomem),
2399 		    &espstack->esp_dropper);
2400 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2401 		return (NULL);
2402 	}
2403 	freemsg(data_mp);
2404 	data_mp = tailmp;
2405 
2406 	assoc = ixa->ixa_ipsec_esp_sa;
2407 	ASSERT(assoc != NULL);
2408 
2409 	/*
2410 	 * Get the outer IP header in shape to escape this system..
2411 	 */
2412 	if (is_system_labeled() && (assoc->ipsa_otsl != NULL)) {
2413 		/*
2414 		 * Need to update packet with any CIPSO option and update
2415 		 * ixa_tsl to capture the new label.
2416 		 * We allocate a separate ixa for that purpose.
2417 		 */
2418 		ixa = ip_xmit_attr_duplicate(ixa);
2419 		if (ixa == NULL) {
2420 			ip_drop_packet(data_mp, B_FALSE, ill,
2421 			    DROPPER(ipss, ipds_esp_nomem),
2422 			    &espstack->esp_dropper);
2423 			return (NULL);
2424 		}
2425 		need_refrele = B_TRUE;
2426 
2427 		label_hold(assoc->ipsa_otsl);
2428 		ip_xmit_attr_replace_tsl(ixa, assoc->ipsa_otsl);
2429 
2430 		data_mp = sadb_whack_label(data_mp, assoc, ixa,
2431 		    DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper);
2432 		if (data_mp == NULL) {
2433 			/* Packet dropped by sadb_whack_label */
2434 			ixa_refrele(ixa);
2435 			return (NULL);
2436 		}
2437 	}
2438 
2439 	/*
2440 	 * Reality check....
2441 	 */
2442 	ipha = (ipha_t *)data_mp->b_rptr;  /* So we can call esp_acquire(). */
2443 
2444 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
2445 		ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
2446 
2447 		af = AF_INET;
2448 		divpoint = IPH_HDR_LENGTH(ipha);
2449 		datalen = ntohs(ipha->ipha_length) - divpoint;
2450 		nhp = (uint8_t *)&ipha->ipha_protocol;
2451 	} else {
2452 		ip_pkt_t ipp;
2453 
2454 		ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
2455 
2456 		af = AF_INET6;
2457 		ip6h = (ip6_t *)ipha;
2458 		bzero(&ipp, sizeof (ipp));
2459 		divpoint = ip_find_hdr_v6(data_mp, ip6h, B_FALSE, &ipp, NULL);
2460 		if (ipp.ipp_dstopts != NULL &&
2461 		    ipp.ipp_dstopts->ip6d_nxt != IPPROTO_ROUTING) {
2462 			/*
2463 			 * Destination options are tricky.  If we get in here,
2464 			 * then we have a terminal header following the
2465 			 * destination options.  We need to adjust backwards
2466 			 * so we insert ESP BEFORE the destination options
2467 			 * bag.  (So that the dstopts get encrypted!)
2468 			 *
2469 			 * Since this is for outbound packets only, we know
2470 			 * that non-terminal destination options only precede
2471 			 * routing headers.
2472 			 */
2473 			divpoint -= ipp.ipp_dstoptslen;
2474 		}
2475 		datalen = ntohs(ip6h->ip6_plen) + sizeof (ip6_t) - divpoint;
2476 
2477 		if (ipp.ipp_rthdr != NULL) {
2478 			nhp = &ipp.ipp_rthdr->ip6r_nxt;
2479 		} else if (ipp.ipp_hopopts != NULL) {
2480 			nhp = &ipp.ipp_hopopts->ip6h_nxt;
2481 		} else {
2482 			ASSERT(divpoint == sizeof (ip6_t));
2483 			/* It's probably IP + ESP. */
2484 			nhp = &ip6h->ip6_nxt;
2485 		}
2486 	}
2487 
2488 	mac_len = assoc->ipsa_mac_len;
2489 
2490 	if (assoc->ipsa_flags & IPSA_F_NATT) {
2491 		/* wedge in UDP header */
2492 		is_natt = B_TRUE;
2493 		esplen += UDPH_SIZE;
2494 	}
2495 
2496 	/*
2497 	 * Set up ESP header and encryption padding for ENCR PI request.
2498 	 */
2499 
2500 	/* Determine the padding length.  Pad to 4-bytes for no-encryption. */
2501 	if (assoc->ipsa_encr_alg != SADB_EALG_NULL) {
2502 		iv_len = assoc->ipsa_iv_len;
2503 		block_size = assoc->ipsa_datalen;
2504 
2505 		/*
2506 		 * Pad the data to the length of the cipher block size.
2507 		 * Include the two additional bytes (hence the - 2) for the
2508 		 * padding length and the next header.  Take this into account
2509 		 * when calculating the actual length of the padding.
2510 		 */
2511 		ASSERT(ISP2(iv_len));
2512 		padlen = ((unsigned)(block_size - datalen - 2)) &
2513 		    (block_size - 1);
2514 	} else {
2515 		iv_len = 0;
2516 		padlen = ((unsigned)(sizeof (uint32_t) - datalen - 2)) &
2517 		    (sizeof (uint32_t) - 1);
2518 	}
2519 
2520 	/* Allocate ESP header and IV. */
2521 	esplen += iv_len;
2522 
2523 	/*
2524 	 * Update association byte-count lifetimes.  Don't forget to take
2525 	 * into account the padding length and next-header (hence the + 2).
2526 	 *
2527 	 * Use the amount of data fed into the "encryption algorithm".  This
2528 	 * is the IV, the data length, the padding length, and the final two
2529 	 * bytes (padlen, and next-header).
2530 	 *
2531 	 */
2532 
2533 	if (!esp_age_bytes(assoc, datalen + padlen + iv_len + 2, B_FALSE)) {
2534 		ip_drop_packet(data_mp, B_FALSE, ill,
2535 		    DROPPER(ipss, ipds_esp_bytes_expire),
2536 		    &espstack->esp_dropper);
2537 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2538 		if (need_refrele)
2539 			ixa_refrele(ixa);
2540 		return (NULL);
2541 	}
2542 
2543 	espmp = allocb(esplen, BPRI_HI);
2544 	if (espmp == NULL) {
2545 		ESP_BUMP_STAT(espstack, out_discards);
2546 		esp1dbg(espstack, ("esp_outbound: can't allocate espmp.\n"));
2547 		ip_drop_packet(data_mp, B_FALSE, ill,
2548 		    DROPPER(ipss, ipds_esp_nomem),
2549 		    &espstack->esp_dropper);
2550 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2551 		if (need_refrele)
2552 			ixa_refrele(ixa);
2553 		return (NULL);
2554 	}
2555 	espmp->b_wptr += esplen;
2556 	esph_ptr = (esph_t *)espmp->b_rptr;
2557 
2558 	if (is_natt) {
2559 		esp3dbg(espstack, ("esp_outbound: NATT"));
2560 
2561 		udpha = (udpha_t *)espmp->b_rptr;
2562 		udpha->uha_src_port = (assoc->ipsa_local_nat_port != 0) ?
2563 		    assoc->ipsa_local_nat_port : htons(IPPORT_IKE_NATT);
2564 		udpha->uha_dst_port = (assoc->ipsa_remote_nat_port != 0) ?
2565 		    assoc->ipsa_remote_nat_port : htons(IPPORT_IKE_NATT);
2566 		/*
2567 		 * Set the checksum to 0, so that the esp_prepare_udp() call
2568 		 * can do the right thing.
2569 		 */
2570 		udpha->uha_checksum = 0;
2571 		esph_ptr = (esph_t *)(udpha + 1);
2572 	}
2573 
2574 	esph_ptr->esph_spi = assoc->ipsa_spi;
2575 
2576 	esph_ptr->esph_replay = htonl(atomic_inc_32_nv(&assoc->ipsa_replay));
2577 	if (esph_ptr->esph_replay == 0 && assoc->ipsa_replay_wsize != 0) {
2578 		/*
2579 		 * XXX We have replay counter wrapping.
2580 		 * We probably want to nuke this SA (and its peer).
2581 		 */
2582 		ipsec_assocfailure(info.mi_idnum, 0, 0,
2583 		    SL_ERROR | SL_CONSOLE | SL_WARN,
2584 		    "Outbound ESP SA (0x%x, %s) has wrapped sequence.\n",
2585 		    esph_ptr->esph_spi, assoc->ipsa_dstaddr, af,
2586 		    espstack->ipsecesp_netstack);
2587 
2588 		ESP_BUMP_STAT(espstack, out_discards);
2589 		sadb_replay_delete(assoc);
2590 		ip_drop_packet(data_mp, B_FALSE, ill,
2591 		    DROPPER(ipss, ipds_esp_replay),
2592 		    &espstack->esp_dropper);
2593 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2594 		if (need_refrele)
2595 			ixa_refrele(ixa);
2596 		return (NULL);
2597 	}
2598 
2599 	iv_ptr = (esph_ptr + 1);
2600 	/*
2601 	 * iv_ptr points to the mblk which will contain the IV once we have
2602 	 * written it there. This mblk will be part of a mblk chain that
2603 	 * will make up the packet.
2604 	 *
2605 	 * For counter mode algorithms, the IV is a 64 bit quantity, it
2606 	 * must NEVER repeat in the lifetime of the SA, otherwise an
2607 	 * attacker who had recorded enough packets might be able to
2608 	 * determine some clear text.
2609 	 *
2610 	 * To ensure this does not happen, the IV is stored in the SA and
2611 	 * incremented for each packet, the IV is then copied into the
2612 	 * "packet" for transmission to the receiving system. The IV will
2613 	 * also be copied into the nonce, when the packet is encrypted.
2614 	 *
2615 	 * CBC mode algorithms use a random IV for each packet. We do not
2616 	 * require the highest quality random bits, but for best security
2617 	 * with CBC mode ciphers, the value must be unlikely to repeat and
2618 	 * must not be known in advance to an adversary capable of influencing
2619 	 * the clear text.
2620 	 */
2621 	if (!update_iv((uint8_t *)iv_ptr, espstack->esp_pfkey_q, assoc,
2622 	    espstack)) {
2623 		ip_drop_packet(data_mp, B_FALSE, ill,
2624 		    DROPPER(ipss, ipds_esp_iv_wrap), &espstack->esp_dropper);
2625 		if (need_refrele)
2626 			ixa_refrele(ixa);
2627 		return (NULL);
2628 	}
2629 
2630 	/* Fix the IP header. */
2631 	alloclen = padlen + 2 + mac_len;
2632 	adj = alloclen + (espmp->b_wptr - espmp->b_rptr);
2633 
2634 	protocol = *nhp;
2635 
2636 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
2637 		ipha->ipha_length = htons(ntohs(ipha->ipha_length) + adj);
2638 		if (is_natt) {
2639 			*nhp = IPPROTO_UDP;
2640 			udpha->uha_length = htons(ntohs(ipha->ipha_length) -
2641 			    IPH_HDR_LENGTH(ipha));
2642 		} else {
2643 			*nhp = IPPROTO_ESP;
2644 		}
2645 		ipha->ipha_hdr_checksum = 0;
2646 		ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
2647 	} else {
2648 		ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) + adj);
2649 		*nhp = IPPROTO_ESP;
2650 	}
2651 
2652 	/* I've got the two ESP mblks, now insert them. */
2653 
2654 	esp2dbg(espstack, ("data_mp before outbound ESP adjustment:\n"));
2655 	esp2dbg(espstack, (dump_msg(data_mp)));
2656 
2657 	if (!esp_insert_esp(data_mp, espmp, divpoint, espstack)) {
2658 		ESP_BUMP_STAT(espstack, out_discards);
2659 		/* NOTE:  esp_insert_esp() only fails if there's no memory. */
2660 		ip_drop_packet(data_mp, B_FALSE, ill,
2661 		    DROPPER(ipss, ipds_esp_nomem),
2662 		    &espstack->esp_dropper);
2663 		freeb(espmp);
2664 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2665 		if (need_refrele)
2666 			ixa_refrele(ixa);
2667 		return (NULL);
2668 	}
2669 
2670 	/* Append padding (and leave room for ICV). */
2671 	for (tailmp = data_mp; tailmp->b_cont != NULL; tailmp = tailmp->b_cont)
2672 		;
2673 	if (tailmp->b_wptr + alloclen > tailmp->b_datap->db_lim) {
2674 		tailmp->b_cont = allocb(alloclen, BPRI_HI);
2675 		if (tailmp->b_cont == NULL) {
2676 			ESP_BUMP_STAT(espstack, out_discards);
2677 			esp0dbg(("esp_outbound:  Can't allocate tailmp.\n"));
2678 			ip_drop_packet(data_mp, B_FALSE, ill,
2679 			    DROPPER(ipss, ipds_esp_nomem),
2680 			    &espstack->esp_dropper);
2681 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2682 			if (need_refrele)
2683 				ixa_refrele(ixa);
2684 			return (NULL);
2685 		}
2686 		tailmp = tailmp->b_cont;
2687 	}
2688 
2689 	/*
2690 	 * If there's padding, N bytes of padding must be of the form 0x1,
2691 	 * 0x2, 0x3... 0xN.
2692 	 */
2693 	for (i = 0; i < padlen; ) {
2694 		i++;
2695 		*tailmp->b_wptr++ = i;
2696 	}
2697 	*tailmp->b_wptr++ = i;
2698 	*tailmp->b_wptr++ = protocol;
2699 
2700 	esp2dbg(espstack, ("data_Mp before encryption:\n"));
2701 	esp2dbg(espstack, (dump_msg(data_mp)));
2702 
2703 	/*
2704 	 * Okay.  I've set up the pre-encryption ESP.  Let's do it!
2705 	 */
2706 
2707 	if (mac_len > 0) {
2708 		ASSERT(tailmp->b_wptr + mac_len <= tailmp->b_datap->db_lim);
2709 		icv_buf = tailmp->b_wptr;
2710 		tailmp->b_wptr += mac_len;
2711 	} else {
2712 		icv_buf = NULL;
2713 	}
2714 
2715 	data_mp = esp_submit_req_outbound(data_mp, ixa, assoc, icv_buf,
2716 	    datalen + padlen + 2);
2717 	if (need_refrele)
2718 		ixa_refrele(ixa);
2719 	return (data_mp);
2720 }
2721 
2722 /*
2723  * IP calls this to validate the ICMP errors that
2724  * we got from the network.
2725  */
2726 mblk_t *
2727 ipsecesp_icmp_error(mblk_t *data_mp, ip_recv_attr_t *ira)
2728 {
2729 	netstack_t	*ns = ira->ira_ill->ill_ipst->ips_netstack;
2730 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
2731 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
2732 
2733 	/*
2734 	 * Unless we get an entire packet back, this function is useless.
2735 	 * Why?
2736 	 *
2737 	 * 1.)	Partial packets are useless, because the "next header"
2738 	 *	is at the end of the decrypted ESP packet.  Without the
2739 	 *	whole packet, this is useless.
2740 	 *
2741 	 * 2.)	If we every use a stateful cipher, such as a stream or a
2742 	 *	one-time pad, we can't do anything.
2743 	 *
2744 	 * Since the chances of us getting an entire packet back are very
2745 	 * very small, we discard here.
2746 	 */
2747 	IP_ESP_BUMP_STAT(ipss, in_discards);
2748 	ip_drop_packet(data_mp, B_TRUE, ira->ira_ill,
2749 	    DROPPER(ipss, ipds_esp_icmp),
2750 	    &espstack->esp_dropper);
2751 	return (NULL);
2752 }
2753 
2754 /*
2755  * Construct an SADB_REGISTER message with the current algorithms.
2756  * This function gets called when 'ipsecalgs -s' is run or when
2757  * in.iked (or other KMD) starts.
2758  */
2759 static boolean_t
2760 esp_register_out(uint32_t sequence, uint32_t pid, uint_t serial,
2761     ipsecesp_stack_t *espstack, cred_t *cr)
2762 {
2763 	mblk_t *pfkey_msg_mp, *keysock_out_mp;
2764 	sadb_msg_t *samsg;
2765 	sadb_supported_t *sasupp_auth = NULL;
2766 	sadb_supported_t *sasupp_encr = NULL;
2767 	sadb_alg_t *saalg;
2768 	uint_t allocsize = sizeof (*samsg);
2769 	uint_t i, numalgs_snap;
2770 	int current_aalgs;
2771 	ipsec_alginfo_t **authalgs;
2772 	uint_t num_aalgs;
2773 	int current_ealgs;
2774 	ipsec_alginfo_t **encralgs;
2775 	uint_t num_ealgs;
2776 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
2777 	sadb_sens_t *sens;
2778 	size_t sens_len = 0;
2779 	sadb_ext_t *nextext;
2780 	ts_label_t *sens_tsl = NULL;
2781 
2782 	/* Allocate the KEYSOCK_OUT. */
2783 	keysock_out_mp = sadb_keysock_out(serial);
2784 	if (keysock_out_mp == NULL) {
2785 		esp0dbg(("esp_register_out: couldn't allocate mblk.\n"));
2786 		return (B_FALSE);
2787 	}
2788 
2789 	if (is_system_labeled() && (cr != NULL)) {
2790 		sens_tsl = crgetlabel(cr);
2791 		if (sens_tsl != NULL) {
2792 			sens_len = sadb_sens_len_from_label(sens_tsl);
2793 			allocsize += sens_len;
2794 		}
2795 	}
2796 
2797 	/*
2798 	 * Allocate the PF_KEY message that follows KEYSOCK_OUT.
2799 	 */
2800 
2801 	rw_enter(&ipss->ipsec_alg_lock, RW_READER);
2802 	/*
2803 	 * Fill SADB_REGISTER message's algorithm descriptors.  Hold
2804 	 * down the lock while filling it.
2805 	 *
2806 	 * Return only valid algorithms, so the number of algorithms
2807 	 * to send up may be less than the number of algorithm entries
2808 	 * in the table.
2809 	 */
2810 	authalgs = ipss->ipsec_alglists[IPSEC_ALG_AUTH];
2811 	for (num_aalgs = 0, i = 0; i < IPSEC_MAX_ALGS; i++)
2812 		if (authalgs[i] != NULL && ALG_VALID(authalgs[i]))
2813 			num_aalgs++;
2814 
2815 	if (num_aalgs != 0) {
2816 		allocsize += (num_aalgs * sizeof (*saalg));
2817 		allocsize += sizeof (*sasupp_auth);
2818 	}
2819 	encralgs = ipss->ipsec_alglists[IPSEC_ALG_ENCR];
2820 	for (num_ealgs = 0, i = 0; i < IPSEC_MAX_ALGS; i++)
2821 		if (encralgs[i] != NULL && ALG_VALID(encralgs[i]))
2822 			num_ealgs++;
2823 
2824 	if (num_ealgs != 0) {
2825 		allocsize += (num_ealgs * sizeof (*saalg));
2826 		allocsize += sizeof (*sasupp_encr);
2827 	}
2828 	keysock_out_mp->b_cont = allocb(allocsize, BPRI_HI);
2829 	if (keysock_out_mp->b_cont == NULL) {
2830 		rw_exit(&ipss->ipsec_alg_lock);
2831 		freemsg(keysock_out_mp);
2832 		return (B_FALSE);
2833 	}
2834 	pfkey_msg_mp = keysock_out_mp->b_cont;
2835 	pfkey_msg_mp->b_wptr += allocsize;
2836 
2837 	nextext = (sadb_ext_t *)(pfkey_msg_mp->b_rptr + sizeof (*samsg));
2838 
2839 	if (num_aalgs != 0) {
2840 		sasupp_auth = (sadb_supported_t *)nextext;
2841 		saalg = (sadb_alg_t *)(sasupp_auth + 1);
2842 
2843 		ASSERT(((ulong_t)saalg & 0x7) == 0);
2844 
2845 		numalgs_snap = 0;
2846 		for (i = 0;
2847 		    ((i < IPSEC_MAX_ALGS) && (numalgs_snap < num_aalgs));
2848 		    i++) {
2849 			if (authalgs[i] == NULL || !ALG_VALID(authalgs[i]))
2850 				continue;
2851 
2852 			saalg->sadb_alg_id = authalgs[i]->alg_id;
2853 			saalg->sadb_alg_ivlen = 0;
2854 			saalg->sadb_alg_minbits	= authalgs[i]->alg_ef_minbits;
2855 			saalg->sadb_alg_maxbits	= authalgs[i]->alg_ef_maxbits;
2856 			saalg->sadb_x_alg_increment =
2857 			    authalgs[i]->alg_increment;
2858 			saalg->sadb_x_alg_saltbits = SADB_8TO1(
2859 			    authalgs[i]->alg_saltlen);
2860 			numalgs_snap++;
2861 			saalg++;
2862 		}
2863 		ASSERT(numalgs_snap == num_aalgs);
2864 #ifdef DEBUG
2865 		/*
2866 		 * Reality check to make sure I snagged all of the
2867 		 * algorithms.
2868 		 */
2869 		for (; i < IPSEC_MAX_ALGS; i++) {
2870 			if (authalgs[i] != NULL && ALG_VALID(authalgs[i])) {
2871 				cmn_err(CE_PANIC, "esp_register_out()! "
2872 				    "Missed aalg #%d.\n", i);
2873 			}
2874 		}
2875 #endif /* DEBUG */
2876 		nextext = (sadb_ext_t *)saalg;
2877 	}
2878 
2879 	if (num_ealgs != 0) {
2880 		sasupp_encr = (sadb_supported_t *)nextext;
2881 		saalg = (sadb_alg_t *)(sasupp_encr + 1);
2882 
2883 		numalgs_snap = 0;
2884 		for (i = 0;
2885 		    ((i < IPSEC_MAX_ALGS) && (numalgs_snap < num_ealgs)); i++) {
2886 			if (encralgs[i] == NULL || !ALG_VALID(encralgs[i]))
2887 				continue;
2888 			saalg->sadb_alg_id = encralgs[i]->alg_id;
2889 			saalg->sadb_alg_ivlen = encralgs[i]->alg_ivlen;
2890 			saalg->sadb_alg_minbits	= encralgs[i]->alg_ef_minbits;
2891 			saalg->sadb_alg_maxbits	= encralgs[i]->alg_ef_maxbits;
2892 			/*
2893 			 * We could advertise the ICV length, except there
2894 			 * is not a value in sadb_x_algb to do this.
2895 			 * saalg->sadb_alg_maclen = encralgs[i]->alg_maclen;
2896 			 */
2897 			saalg->sadb_x_alg_increment =
2898 			    encralgs[i]->alg_increment;
2899 			saalg->sadb_x_alg_saltbits =
2900 			    SADB_8TO1(encralgs[i]->alg_saltlen);
2901 
2902 			numalgs_snap++;
2903 			saalg++;
2904 		}
2905 		ASSERT(numalgs_snap == num_ealgs);
2906 #ifdef DEBUG
2907 		/*
2908 		 * Reality check to make sure I snagged all of the
2909 		 * algorithms.
2910 		 */
2911 		for (; i < IPSEC_MAX_ALGS; i++) {
2912 			if (encralgs[i] != NULL && ALG_VALID(encralgs[i])) {
2913 				cmn_err(CE_PANIC, "esp_register_out()! "
2914 				    "Missed ealg #%d.\n", i);
2915 			}
2916 		}
2917 #endif /* DEBUG */
2918 		nextext = (sadb_ext_t *)saalg;
2919 	}
2920 
2921 	current_aalgs = num_aalgs;
2922 	current_ealgs = num_ealgs;
2923 
2924 	rw_exit(&ipss->ipsec_alg_lock);
2925 
2926 	if (sens_tsl != NULL) {
2927 		sens = (sadb_sens_t *)nextext;
2928 		sadb_sens_from_label(sens, SADB_EXT_SENSITIVITY,
2929 		    sens_tsl, sens_len);
2930 
2931 		nextext = (sadb_ext_t *)(((uint8_t *)sens) + sens_len);
2932 	}
2933 
2934 	/* Now fill the rest of the SADB_REGISTER message. */
2935 
2936 	samsg = (sadb_msg_t *)pfkey_msg_mp->b_rptr;
2937 	samsg->sadb_msg_version = PF_KEY_V2;
2938 	samsg->sadb_msg_type = SADB_REGISTER;
2939 	samsg->sadb_msg_errno = 0;
2940 	samsg->sadb_msg_satype = SADB_SATYPE_ESP;
2941 	samsg->sadb_msg_len = SADB_8TO64(allocsize);
2942 	samsg->sadb_msg_reserved = 0;
2943 	/*
2944 	 * Assume caller has sufficient sequence/pid number info.  If it's one
2945 	 * from me over a new alg., I could give two hoots about sequence.
2946 	 */
2947 	samsg->sadb_msg_seq = sequence;
2948 	samsg->sadb_msg_pid = pid;
2949 
2950 	if (sasupp_auth != NULL) {
2951 		sasupp_auth->sadb_supported_len = SADB_8TO64(
2952 		    sizeof (*sasupp_auth) + sizeof (*saalg) * current_aalgs);
2953 		sasupp_auth->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH;
2954 		sasupp_auth->sadb_supported_reserved = 0;
2955 	}
2956 
2957 	if (sasupp_encr != NULL) {
2958 		sasupp_encr->sadb_supported_len = SADB_8TO64(
2959 		    sizeof (*sasupp_encr) + sizeof (*saalg) * current_ealgs);
2960 		sasupp_encr->sadb_supported_exttype =
2961 		    SADB_EXT_SUPPORTED_ENCRYPT;
2962 		sasupp_encr->sadb_supported_reserved = 0;
2963 	}
2964 
2965 	if (espstack->esp_pfkey_q != NULL)
2966 		putnext(espstack->esp_pfkey_q, keysock_out_mp);
2967 	else {
2968 		freemsg(keysock_out_mp);
2969 		return (B_FALSE);
2970 	}
2971 
2972 	return (B_TRUE);
2973 }
2974 
2975 /*
2976  * Invoked when the algorithm table changes. Causes SADB_REGISTER
2977  * messages continaining the current list of algorithms to be
2978  * sent up to the ESP listeners.
2979  */
2980 void
2981 ipsecesp_algs_changed(netstack_t *ns)
2982 {
2983 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
2984 
2985 	/*
2986 	 * Time to send a PF_KEY SADB_REGISTER message to ESP listeners
2987 	 * everywhere.  (The function itself checks for NULL esp_pfkey_q.)
2988 	 */
2989 	(void) esp_register_out(0, 0, 0, espstack, NULL);
2990 }
2991 
2992 /*
2993  * Stub function that taskq_dispatch() invokes to take the mblk (in arg)
2994  * and send() it into ESP and IP again.
2995  */
2996 static void
2997 inbound_task(void *arg)
2998 {
2999 	mblk_t		*mp = (mblk_t *)arg;
3000 	mblk_t		*async_mp;
3001 	ip_recv_attr_t	iras;
3002 
3003 	async_mp = mp;
3004 	mp = async_mp->b_cont;
3005 	async_mp->b_cont = NULL;
3006 	if (!ip_recv_attr_from_mblk(async_mp, &iras)) {
3007 		/* The ill or ip_stack_t disappeared on us */
3008 		ip_drop_input("ip_recv_attr_from_mblk", mp, NULL);
3009 		freemsg(mp);
3010 		goto done;
3011 	}
3012 
3013 	esp_inbound_restart(mp, &iras);
3014 done:
3015 	ira_cleanup(&iras, B_TRUE);
3016 }
3017 
3018 /*
3019  * Restart ESP after the SA has been added.
3020  */
3021 static void
3022 esp_inbound_restart(mblk_t *mp, ip_recv_attr_t *ira)
3023 {
3024 	esph_t		*esph;
3025 	netstack_t	*ns = ira->ira_ill->ill_ipst->ips_netstack;
3026 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
3027 
3028 	esp2dbg(espstack, ("in ESP inbound_task"));
3029 	ASSERT(espstack != NULL);
3030 
3031 	mp = ipsec_inbound_esp_sa(mp, ira, &esph);
3032 	if (mp == NULL)
3033 		return;
3034 
3035 	ASSERT(esph != NULL);
3036 	ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
3037 	ASSERT(ira->ira_ipsec_esp_sa != NULL);
3038 
3039 	mp = ira->ira_ipsec_esp_sa->ipsa_input_func(mp, esph, ira);
3040 	if (mp == NULL) {
3041 		/*
3042 		 * Either it failed or is pending. In the former case
3043 		 * ipIfStatsInDiscards was increased.
3044 		 */
3045 		return;
3046 	}
3047 
3048 	ip_input_post_ipsec(mp, ira);
3049 }
3050 
3051 /*
3052  * Now that weak-key passed, actually ADD the security association, and
3053  * send back a reply ADD message.
3054  */
3055 static int
3056 esp_add_sa_finish(mblk_t *mp, sadb_msg_t *samsg, keysock_in_t *ksi,
3057     int *diagnostic, ipsecesp_stack_t *espstack)
3058 {
3059 	isaf_t *primary = NULL, *secondary;
3060 	boolean_t clone = B_FALSE, is_inbound = B_FALSE;
3061 	ipsa_t *larval = NULL;
3062 	ipsacq_t *acqrec;
3063 	iacqf_t *acq_bucket;
3064 	mblk_t *acq_msgs = NULL;
3065 	int rc;
3066 	mblk_t *lpkt;
3067 	int error;
3068 	ipsa_query_t sq;
3069 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
3070 
3071 	/*
3072 	 * Locate the appropriate table(s).
3073 	 */
3074 	sq.spp = &espstack->esp_sadb;	/* XXX */
3075 	error = sadb_form_query(ksi, IPSA_Q_SA|IPSA_Q_DST,
3076 	    IPSA_Q_SA|IPSA_Q_DST|IPSA_Q_INBOUND|IPSA_Q_OUTBOUND,
3077 	    &sq, diagnostic);
3078 	if (error)
3079 		return (error);
3080 
3081 	/*
3082 	 * Use the direction flags provided by the KMD to determine
3083 	 * if the inbound or outbound table should be the primary
3084 	 * for this SA. If these flags were absent then make this
3085 	 * decision based on the addresses.
3086 	 */
3087 	if (sq.assoc->sadb_sa_flags & IPSA_F_INBOUND) {
3088 		primary = sq.inbound;
3089 		secondary = sq.outbound;
3090 		is_inbound = B_TRUE;
3091 		if (sq.assoc->sadb_sa_flags & IPSA_F_OUTBOUND)
3092 			clone = B_TRUE;
3093 	} else if (sq.assoc->sadb_sa_flags & IPSA_F_OUTBOUND) {
3094 		primary = sq.outbound;
3095 		secondary = sq.inbound;
3096 	}
3097 
3098 	if (primary == NULL) {
3099 		/*
3100 		 * The KMD did not set a direction flag, determine which
3101 		 * table to insert the SA into based on addresses.
3102 		 */
3103 		switch (ksi->ks_in_dsttype) {
3104 		case KS_IN_ADDR_MBCAST:
3105 			clone = B_TRUE;	/* All mcast SAs can be bidirectional */
3106 			sq.assoc->sadb_sa_flags |= IPSA_F_OUTBOUND;
3107 			/* FALLTHRU */
3108 		/*
3109 		 * If the source address is either one of mine, or unspecified
3110 		 * (which is best summed up by saying "not 'not mine'"),
3111 		 * then the association is potentially bi-directional,
3112 		 * in that it can be used for inbound traffic and outbound
3113 		 * traffic.  The best example of such an SA is a multicast
3114 		 * SA (which allows me to receive the outbound traffic).
3115 		 */
3116 		case KS_IN_ADDR_ME:
3117 			sq.assoc->sadb_sa_flags |= IPSA_F_INBOUND;
3118 			primary = sq.inbound;
3119 			secondary = sq.outbound;
3120 			if (ksi->ks_in_srctype != KS_IN_ADDR_NOTME)
3121 				clone = B_TRUE;
3122 			is_inbound = B_TRUE;
3123 			break;
3124 		/*
3125 		 * If the source address literally not mine (either
3126 		 * unspecified or not mine), then this SA may have an
3127 		 * address that WILL be mine after some configuration.
3128 		 * We pay the price for this by making it a bi-directional
3129 		 * SA.
3130 		 */
3131 		case KS_IN_ADDR_NOTME:
3132 			sq.assoc->sadb_sa_flags |= IPSA_F_OUTBOUND;
3133 			primary = sq.outbound;
3134 			secondary = sq.inbound;
3135 			if (ksi->ks_in_srctype != KS_IN_ADDR_ME) {
3136 				sq.assoc->sadb_sa_flags |= IPSA_F_INBOUND;
3137 				clone = B_TRUE;
3138 			}
3139 			break;
3140 		default:
3141 			*diagnostic = SADB_X_DIAGNOSTIC_BAD_DST;
3142 			return (EINVAL);
3143 		}
3144 	}
3145 
3146 	/*
3147 	 * Find a ACQUIRE list entry if possible.  If we've added an SA that
3148 	 * suits the needs of an ACQUIRE list entry, we can eliminate the
3149 	 * ACQUIRE list entry and transmit the enqueued packets.  Use the
3150 	 * high-bit of the sequence number to queue it.  Key off destination
3151 	 * addr, and change acqrec's state.
3152 	 */
3153 
3154 	if (samsg->sadb_msg_seq & IACQF_LOWEST_SEQ) {
3155 		acq_bucket = &(sq.sp->sdb_acq[sq.outhash]);
3156 		mutex_enter(&acq_bucket->iacqf_lock);
3157 		for (acqrec = acq_bucket->iacqf_ipsacq; acqrec != NULL;
3158 		    acqrec = acqrec->ipsacq_next) {
3159 			mutex_enter(&acqrec->ipsacq_lock);
3160 			/*
3161 			 * Q:  I only check sequence.  Should I check dst?
3162 			 * A: Yes, check dest because those are the packets
3163 			 *    that are queued up.
3164 			 */
3165 			if (acqrec->ipsacq_seq == samsg->sadb_msg_seq &&
3166 			    IPSA_ARE_ADDR_EQUAL(sq.dstaddr,
3167 			    acqrec->ipsacq_dstaddr, acqrec->ipsacq_addrfam))
3168 				break;
3169 			mutex_exit(&acqrec->ipsacq_lock);
3170 		}
3171 		if (acqrec != NULL) {
3172 			/*
3173 			 * AHA!  I found an ACQUIRE record for this SA.
3174 			 * Grab the msg list, and free the acquire record.
3175 			 * I already am holding the lock for this record,
3176 			 * so all I have to do is free it.
3177 			 */
3178 			acq_msgs = acqrec->ipsacq_mp;
3179 			acqrec->ipsacq_mp = NULL;
3180 			mutex_exit(&acqrec->ipsacq_lock);
3181 			sadb_destroy_acquire(acqrec,
3182 			    espstack->ipsecesp_netstack);
3183 		}
3184 		mutex_exit(&acq_bucket->iacqf_lock);
3185 	}
3186 
3187 	/*
3188 	 * Find PF_KEY message, and see if I'm an update.  If so, find entry
3189 	 * in larval list (if there).
3190 	 */
3191 	if (samsg->sadb_msg_type == SADB_UPDATE) {
3192 		mutex_enter(&sq.inbound->isaf_lock);
3193 		larval = ipsec_getassocbyspi(sq.inbound, sq.assoc->sadb_sa_spi,
3194 		    ALL_ZEROES_PTR, sq.dstaddr, sq.dst->sin_family);
3195 		mutex_exit(&sq.inbound->isaf_lock);
3196 
3197 		if ((larval == NULL) ||
3198 		    (larval->ipsa_state != IPSA_STATE_LARVAL)) {
3199 			*diagnostic = SADB_X_DIAGNOSTIC_SA_NOTFOUND;
3200 			if (larval != NULL) {
3201 				IPSA_REFRELE(larval);
3202 			}
3203 			esp0dbg(("Larval update, but larval disappeared.\n"));
3204 			return (ESRCH);
3205 		} /* Else sadb_common_add unlinks it for me! */
3206 	}
3207 
3208 	if (larval != NULL) {
3209 		/*
3210 		 * Hold again, because sadb_common_add() consumes a reference,
3211 		 * and we don't want to clear_lpkt() without a reference.
3212 		 */
3213 		IPSA_REFHOLD(larval);
3214 	}
3215 
3216 	rc = sadb_common_add(espstack->esp_pfkey_q,
3217 	    mp, samsg, ksi, primary, secondary, larval, clone, is_inbound,
3218 	    diagnostic, espstack->ipsecesp_netstack, &espstack->esp_sadb);
3219 
3220 	if (larval != NULL) {
3221 		if (rc == 0) {
3222 			lpkt = sadb_clear_lpkt(larval);
3223 			if (lpkt != NULL) {
3224 				rc = taskq_dispatch(esp_taskq, inbound_task,
3225 				    lpkt, TQ_NOSLEEP) == TASKQID_INVALID;
3226 			}
3227 		}
3228 		IPSA_REFRELE(larval);
3229 	}
3230 
3231 	/*
3232 	 * How much more stack will I create with all of these
3233 	 * esp_outbound() calls?
3234 	 */
3235 
3236 	/* Handle the packets queued waiting for the SA */
3237 	while (acq_msgs != NULL) {
3238 		mblk_t		*asyncmp;
3239 		mblk_t		*data_mp;
3240 		ip_xmit_attr_t	ixas;
3241 		ill_t		*ill;
3242 
3243 		asyncmp = acq_msgs;
3244 		acq_msgs = acq_msgs->b_next;
3245 		asyncmp->b_next = NULL;
3246 
3247 		/*
3248 		 * Extract the ip_xmit_attr_t from the first mblk.
3249 		 * Verifies that the netstack and ill is still around; could
3250 		 * have vanished while iked was doing its work.
3251 		 * On succesful return we have a nce_t and the ill/ipst can't
3252 		 * disappear until we do the nce_refrele in ixa_cleanup.
3253 		 */
3254 		data_mp = asyncmp->b_cont;
3255 		asyncmp->b_cont = NULL;
3256 		if (!ip_xmit_attr_from_mblk(asyncmp, &ixas)) {
3257 			ESP_BUMP_STAT(espstack, out_discards);
3258 			ip_drop_packet(data_mp, B_FALSE, NULL,
3259 			    DROPPER(ipss, ipds_sadb_acquire_timeout),
3260 			    &espstack->esp_dropper);
3261 		} else if (rc != 0) {
3262 			ill = ixas.ixa_nce->nce_ill;
3263 			ESP_BUMP_STAT(espstack, out_discards);
3264 			ip_drop_packet(data_mp, B_FALSE, ill,
3265 			    DROPPER(ipss, ipds_sadb_acquire_timeout),
3266 			    &espstack->esp_dropper);
3267 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3268 		} else {
3269 			esp_outbound_finish(data_mp, &ixas);
3270 		}
3271 		ixa_cleanup(&ixas);
3272 	}
3273 
3274 	return (rc);
3275 }
3276 
3277 /*
3278  * Process one of the queued messages (from ipsacq_mp) once the SA
3279  * has been added.
3280  */
3281 static void
3282 esp_outbound_finish(mblk_t *data_mp, ip_xmit_attr_t *ixa)
3283 {
3284 	netstack_t	*ns = ixa->ixa_ipst->ips_netstack;
3285 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
3286 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
3287 	ill_t		*ill = ixa->ixa_nce->nce_ill;
3288 
3289 	if (!ipsec_outbound_sa(data_mp, ixa, IPPROTO_ESP)) {
3290 		ESP_BUMP_STAT(espstack, out_discards);
3291 		ip_drop_packet(data_mp, B_FALSE, ill,
3292 		    DROPPER(ipss, ipds_sadb_acquire_timeout),
3293 		    &espstack->esp_dropper);
3294 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3295 		return;
3296 	}
3297 
3298 	data_mp = esp_outbound(data_mp, ixa);
3299 	if (data_mp == NULL)
3300 		return;
3301 
3302 	/* do AH processing if needed */
3303 	data_mp = esp_do_outbound_ah(data_mp, ixa);
3304 	if (data_mp == NULL)
3305 		return;
3306 
3307 	(void) ip_output_post_ipsec(data_mp, ixa);
3308 }
3309 
3310 /*
3311  * Add new ESP security association.  This may become a generic AH/ESP
3312  * routine eventually.
3313  */
3314 static int
3315 esp_add_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic, netstack_t *ns)
3316 {
3317 	sadb_sa_t *assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SA];
3318 	sadb_address_t *srcext =
3319 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_SRC];
3320 	sadb_address_t *dstext =
3321 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3322 	sadb_address_t *isrcext =
3323 	    (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_INNER_SRC];
3324 	sadb_address_t *idstext =
3325 	    (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_INNER_DST];
3326 	sadb_address_t *nttext_loc =
3327 	    (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_NATT_LOC];
3328 	sadb_address_t *nttext_rem =
3329 	    (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_NATT_REM];
3330 	sadb_key_t *akey = (sadb_key_t *)ksi->ks_in_extv[SADB_EXT_KEY_AUTH];
3331 	sadb_key_t *ekey = (sadb_key_t *)ksi->ks_in_extv[SADB_EXT_KEY_ENCRYPT];
3332 	struct sockaddr_in *src, *dst;
3333 	struct sockaddr_in *natt_loc, *natt_rem;
3334 	struct sockaddr_in6 *natt_loc6, *natt_rem6;
3335 	sadb_lifetime_t *soft =
3336 	    (sadb_lifetime_t *)ksi->ks_in_extv[SADB_EXT_LIFETIME_SOFT];
3337 	sadb_lifetime_t *hard =
3338 	    (sadb_lifetime_t *)ksi->ks_in_extv[SADB_EXT_LIFETIME_HARD];
3339 	sadb_lifetime_t *idle =
3340 	    (sadb_lifetime_t *)ksi->ks_in_extv[SADB_X_EXT_LIFETIME_IDLE];
3341 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
3342 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
3343 
3344 
3345 
3346 	/* I need certain extensions present for an ADD message. */
3347 	if (srcext == NULL) {
3348 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_SRC;
3349 		return (EINVAL);
3350 	}
3351 	if (dstext == NULL) {
3352 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_DST;
3353 		return (EINVAL);
3354 	}
3355 	if (isrcext == NULL && idstext != NULL) {
3356 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_INNER_SRC;
3357 		return (EINVAL);
3358 	}
3359 	if (isrcext != NULL && idstext == NULL) {
3360 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_INNER_DST;
3361 		return (EINVAL);
3362 	}
3363 	if (assoc == NULL) {
3364 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_SA;
3365 		return (EINVAL);
3366 	}
3367 	if (ekey == NULL && assoc->sadb_sa_encrypt != SADB_EALG_NULL) {
3368 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_EKEY;
3369 		return (EINVAL);
3370 	}
3371 
3372 	src = (struct sockaddr_in *)(srcext + 1);
3373 	dst = (struct sockaddr_in *)(dstext + 1);
3374 	natt_loc = (struct sockaddr_in *)(nttext_loc + 1);
3375 	natt_loc6 = (struct sockaddr_in6 *)(nttext_loc + 1);
3376 	natt_rem = (struct sockaddr_in *)(nttext_rem + 1);
3377 	natt_rem6 = (struct sockaddr_in6 *)(nttext_rem + 1);
3378 
3379 	/* Sundry ADD-specific reality checks. */
3380 	/* XXX STATS :  Logging/stats here? */
3381 
3382 	if ((assoc->sadb_sa_state != SADB_SASTATE_MATURE) &&
3383 	    (assoc->sadb_sa_state != SADB_X_SASTATE_ACTIVE_ELSEWHERE)) {
3384 		*diagnostic = SADB_X_DIAGNOSTIC_BAD_SASTATE;
3385 		return (EINVAL);
3386 	}
3387 	if (assoc->sadb_sa_encrypt == SADB_EALG_NONE) {
3388 		*diagnostic = SADB_X_DIAGNOSTIC_BAD_EALG;
3389 		return (EINVAL);
3390 	}
3391 
3392 #ifndef IPSEC_LATENCY_TEST
3393 	if (assoc->sadb_sa_encrypt == SADB_EALG_NULL &&
3394 	    assoc->sadb_sa_auth == SADB_AALG_NONE) {
3395 		*diagnostic = SADB_X_DIAGNOSTIC_BAD_AALG;
3396 		return (EINVAL);
3397 	}
3398 #endif
3399 
3400 	if (assoc->sadb_sa_flags & ~espstack->esp_sadb.s_addflags) {
3401 		*diagnostic = SADB_X_DIAGNOSTIC_BAD_SAFLAGS;
3402 		return (EINVAL);
3403 	}
3404 
3405 	if ((*diagnostic = sadb_hardsoftchk(hard, soft, idle)) != 0) {
3406 		return (EINVAL);
3407 	}
3408 	ASSERT(src->sin_family == dst->sin_family);
3409 
3410 	if (assoc->sadb_sa_flags & SADB_X_SAFLAGS_NATT_LOC) {
3411 		if (nttext_loc == NULL) {
3412 			*diagnostic = SADB_X_DIAGNOSTIC_MISSING_NATT_LOC;
3413 			return (EINVAL);
3414 		}
3415 
3416 		if (natt_loc->sin_family == AF_INET6 &&
3417 		    !IN6_IS_ADDR_V4MAPPED(&natt_loc6->sin6_addr)) {
3418 			*diagnostic = SADB_X_DIAGNOSTIC_MALFORMED_NATT_LOC;
3419 			return (EINVAL);
3420 		}
3421 	}
3422 
3423 	if (assoc->sadb_sa_flags & SADB_X_SAFLAGS_NATT_REM) {
3424 		if (nttext_rem == NULL) {
3425 			*diagnostic = SADB_X_DIAGNOSTIC_MISSING_NATT_REM;
3426 			return (EINVAL);
3427 		}
3428 		if (natt_rem->sin_family == AF_INET6 &&
3429 		    !IN6_IS_ADDR_V4MAPPED(&natt_rem6->sin6_addr)) {
3430 			*diagnostic = SADB_X_DIAGNOSTIC_MALFORMED_NATT_REM;
3431 			return (EINVAL);
3432 		}
3433 	}
3434 
3435 
3436 	/* Stuff I don't support, for now.  XXX Diagnostic? */
3437 	if (ksi->ks_in_extv[SADB_EXT_LIFETIME_CURRENT] != NULL)
3438 		return (EOPNOTSUPP);
3439 
3440 	if ((*diagnostic = sadb_labelchk(ksi)) != 0)
3441 		return (EINVAL);
3442 
3443 	/*
3444 	 * XXX Policy :  I'm not checking identities at this time,
3445 	 * but if I did, I'd do them here, before I sent
3446 	 * the weak key check up to the algorithm.
3447 	 */
3448 
3449 	rw_enter(&ipss->ipsec_alg_lock, RW_READER);
3450 
3451 	/*
3452 	 * First locate the authentication algorithm.
3453 	 */
3454 #ifdef IPSEC_LATENCY_TEST
3455 	if (akey != NULL && assoc->sadb_sa_auth != SADB_AALG_NONE) {
3456 #else
3457 	if (akey != NULL) {
3458 #endif
3459 		ipsec_alginfo_t *aalg;
3460 
3461 		aalg = ipss->ipsec_alglists[IPSEC_ALG_AUTH]
3462 		    [assoc->sadb_sa_auth];
3463 		if (aalg == NULL || !ALG_VALID(aalg)) {
3464 			rw_exit(&ipss->ipsec_alg_lock);
3465 			esp1dbg(espstack, ("Couldn't find auth alg #%d.\n",
3466 			    assoc->sadb_sa_auth));
3467 			*diagnostic = SADB_X_DIAGNOSTIC_BAD_AALG;
3468 			return (EINVAL);
3469 		}
3470 
3471 		/*
3472 		 * Sanity check key sizes.
3473 		 * Note: It's not possible to use SADB_AALG_NONE because
3474 		 * this auth_alg is not defined with ALG_FLAG_VALID. If this
3475 		 * ever changes, the same check for SADB_AALG_NONE and
3476 		 * a auth_key != NULL should be made here ( see below).
3477 		 */
3478 		if (!ipsec_valid_key_size(akey->sadb_key_bits, aalg)) {
3479 			rw_exit(&ipss->ipsec_alg_lock);
3480 			*diagnostic = SADB_X_DIAGNOSTIC_BAD_AKEYBITS;
3481 			return (EINVAL);
3482 		}
3483 		ASSERT(aalg->alg_mech_type != CRYPTO_MECHANISM_INVALID);
3484 
3485 		/* check key and fix parity if needed */
3486 		if (ipsec_check_key(aalg->alg_mech_type, akey, B_TRUE,
3487 		    diagnostic) != 0) {
3488 			rw_exit(&ipss->ipsec_alg_lock);
3489 			return (EINVAL);
3490 		}
3491 	}
3492 
3493 	/*
3494 	 * Then locate the encryption algorithm.
3495 	 */
3496 	if (ekey != NULL) {
3497 		uint_t keybits;
3498 		ipsec_alginfo_t *ealg;
3499 
3500 		ealg = ipss->ipsec_alglists[IPSEC_ALG_ENCR]
3501 		    [assoc->sadb_sa_encrypt];
3502 		if (ealg == NULL || !ALG_VALID(ealg)) {
3503 			rw_exit(&ipss->ipsec_alg_lock);
3504 			esp1dbg(espstack, ("Couldn't find encr alg #%d.\n",
3505 			    assoc->sadb_sa_encrypt));
3506 			*diagnostic = SADB_X_DIAGNOSTIC_BAD_EALG;
3507 			return (EINVAL);
3508 		}
3509 
3510 		/*
3511 		 * Sanity check key sizes. If the encryption algorithm is
3512 		 * SADB_EALG_NULL but the encryption key is NOT
3513 		 * NULL then complain.
3514 		 *
3515 		 * The keying material includes salt bits if required by
3516 		 * algorithm and optionally the Initial IV, check the
3517 		 * length of whats left.
3518 		 */
3519 		keybits = ekey->sadb_key_bits;
3520 		keybits -= ekey->sadb_key_reserved;
3521 		keybits -= SADB_8TO1(ealg->alg_saltlen);
3522 		if ((assoc->sadb_sa_encrypt == SADB_EALG_NULL) ||
3523 		    (!ipsec_valid_key_size(keybits, ealg))) {
3524 			rw_exit(&ipss->ipsec_alg_lock);
3525 			*diagnostic = SADB_X_DIAGNOSTIC_BAD_EKEYBITS;
3526 			return (EINVAL);
3527 		}
3528 		ASSERT(ealg->alg_mech_type != CRYPTO_MECHANISM_INVALID);
3529 
3530 		/* check key */
3531 		if (ipsec_check_key(ealg->alg_mech_type, ekey, B_FALSE,
3532 		    diagnostic) != 0) {
3533 			rw_exit(&ipss->ipsec_alg_lock);
3534 			return (EINVAL);
3535 		}
3536 	}
3537 	rw_exit(&ipss->ipsec_alg_lock);
3538 
3539 	return (esp_add_sa_finish(mp, (sadb_msg_t *)mp->b_cont->b_rptr, ksi,
3540 	    diagnostic, espstack));
3541 }
3542 
3543 /*
3544  * Update a security association.  Updates come in two varieties.  The first
3545  * is an update of lifetimes on a non-larval SA.  The second is an update of
3546  * a larval SA, which ends up looking a lot more like an add.
3547  */
3548 static int
3549 esp_update_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic,
3550     ipsecesp_stack_t *espstack, uint8_t sadb_msg_type)
3551 {
3552 	sadb_sa_t *assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SA];
3553 	mblk_t    *buf_pkt;
3554 	int rcode;
3555 
3556 	sadb_address_t *dstext =
3557 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3558 
3559 	if (dstext == NULL) {
3560 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_DST;
3561 		return (EINVAL);
3562 	}
3563 
3564 	rcode = sadb_update_sa(mp, ksi, &buf_pkt, &espstack->esp_sadb,
3565 	    diagnostic, espstack->esp_pfkey_q, esp_add_sa,
3566 	    espstack->ipsecesp_netstack, sadb_msg_type);
3567 
3568 	if ((assoc->sadb_sa_state != SADB_X_SASTATE_ACTIVE) ||
3569 	    (rcode != 0)) {
3570 		return (rcode);
3571 	}
3572 
3573 	HANDLE_BUF_PKT(esp_taskq, espstack->ipsecesp_netstack->netstack_ipsec,
3574 	    espstack->esp_dropper, buf_pkt);
3575 
3576 	return (rcode);
3577 }
3578 
3579 /* XXX refactor me */
3580 /*
3581  * Delete a security association.  This is REALLY likely to be code common to
3582  * both AH and ESP.  Find the association, then unlink it.
3583  */
3584 static int
3585 esp_del_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic,
3586     ipsecesp_stack_t *espstack, uint8_t sadb_msg_type)
3587 {
3588 	sadb_sa_t *assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SA];
3589 	sadb_address_t *dstext =
3590 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3591 	sadb_address_t *srcext =
3592 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_SRC];
3593 	struct sockaddr_in *sin;
3594 
3595 	if (assoc == NULL) {
3596 		if (dstext != NULL) {
3597 			sin = (struct sockaddr_in *)(dstext + 1);
3598 		} else if (srcext != NULL) {
3599 			sin = (struct sockaddr_in *)(srcext + 1);
3600 		} else {
3601 			*diagnostic = SADB_X_DIAGNOSTIC_MISSING_SA;
3602 			return (EINVAL);
3603 		}
3604 		return (sadb_purge_sa(mp, ksi,
3605 		    (sin->sin_family == AF_INET6) ? &espstack->esp_sadb.s_v6 :
3606 		    &espstack->esp_sadb.s_v4, diagnostic,
3607 		    espstack->esp_pfkey_q));
3608 	}
3609 
3610 	return (sadb_delget_sa(mp, ksi, &espstack->esp_sadb, diagnostic,
3611 	    espstack->esp_pfkey_q, sadb_msg_type));
3612 }
3613 
3614 /* XXX refactor me */
3615 /*
3616  * Convert the entire contents of all of ESP's SA tables into PF_KEY SADB_DUMP
3617  * messages.
3618  */
3619 static void
3620 esp_dump(mblk_t *mp, keysock_in_t *ksi, ipsecesp_stack_t *espstack)
3621 {
3622 	int error;
3623 	sadb_msg_t *samsg;
3624 
3625 	/*
3626 	 * Dump each fanout, bailing if error is non-zero.
3627 	 */
3628 
3629 	error = sadb_dump(espstack->esp_pfkey_q, mp, ksi,
3630 	    &espstack->esp_sadb.s_v4);
3631 	if (error != 0)
3632 		goto bail;
3633 
3634 	error = sadb_dump(espstack->esp_pfkey_q, mp, ksi,
3635 	    &espstack->esp_sadb.s_v6);
3636 bail:
3637 	ASSERT(mp->b_cont != NULL);
3638 	samsg = (sadb_msg_t *)mp->b_cont->b_rptr;
3639 	samsg->sadb_msg_errno = (uint8_t)error;
3640 	sadb_pfkey_echo(espstack->esp_pfkey_q, mp,
3641 	    (sadb_msg_t *)mp->b_cont->b_rptr, ksi, NULL);
3642 }
3643 
3644 /*
3645  * First-cut reality check for an inbound PF_KEY message.
3646  */
3647 static boolean_t
3648 esp_pfkey_reality_failures(mblk_t *mp, keysock_in_t *ksi,
3649     ipsecesp_stack_t *espstack)
3650 {
3651 	int diagnostic;
3652 
3653 	if (ksi->ks_in_extv[SADB_EXT_PROPOSAL] != NULL) {
3654 		diagnostic = SADB_X_DIAGNOSTIC_PROP_PRESENT;
3655 		goto badmsg;
3656 	}
3657 	if (ksi->ks_in_extv[SADB_EXT_SUPPORTED_AUTH] != NULL ||
3658 	    ksi->ks_in_extv[SADB_EXT_SUPPORTED_ENCRYPT] != NULL) {
3659 		diagnostic = SADB_X_DIAGNOSTIC_SUPP_PRESENT;
3660 		goto badmsg;
3661 	}
3662 	return (B_FALSE);	/* False ==> no failures */
3663 
3664 badmsg:
3665 	sadb_pfkey_error(espstack->esp_pfkey_q, mp, EINVAL, diagnostic,
3666 	    ksi->ks_in_serial);
3667 	return (B_TRUE);	/* True ==> failures */
3668 }
3669 
3670 /*
3671  * ESP parsing of PF_KEY messages.  Keysock did most of the really silly
3672  * error cases.  What I receive is a fully-formed, syntactically legal
3673  * PF_KEY message.  I then need to check semantics...
3674  *
3675  * This code may become common to AH and ESP.  Stay tuned.
3676  *
3677  * I also make the assumption that db_ref's are cool.  If this assumption
3678  * is wrong, this means that someone other than keysock or me has been
3679  * mucking with PF_KEY messages.
3680  */
3681 static void
3682 esp_parse_pfkey(mblk_t *mp, ipsecesp_stack_t *espstack)
3683 {
3684 	mblk_t *msg = mp->b_cont;
3685 	sadb_msg_t *samsg;
3686 	keysock_in_t *ksi;
3687 	int error;
3688 	int diagnostic = SADB_X_DIAGNOSTIC_NONE;
3689 
3690 	ASSERT(msg != NULL);
3691 
3692 	samsg = (sadb_msg_t *)msg->b_rptr;
3693 	ksi = (keysock_in_t *)mp->b_rptr;
3694 
3695 	/*
3696 	 * If applicable, convert unspecified AF_INET6 to unspecified
3697 	 * AF_INET.  And do other address reality checks.
3698 	 */
3699 	if (!sadb_addrfix(ksi, espstack->esp_pfkey_q, mp,
3700 	    espstack->ipsecesp_netstack) ||
3701 	    esp_pfkey_reality_failures(mp, ksi, espstack)) {
3702 		return;
3703 	}
3704 
3705 	switch (samsg->sadb_msg_type) {
3706 	case SADB_ADD:
3707 		error = esp_add_sa(mp, ksi, &diagnostic,
3708 		    espstack->ipsecesp_netstack);
3709 		if (error != 0) {
3710 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3711 			    diagnostic, ksi->ks_in_serial);
3712 		}
3713 		/* else esp_add_sa() took care of things. */
3714 		break;
3715 	case SADB_DELETE:
3716 	case SADB_X_DELPAIR:
3717 	case SADB_X_DELPAIR_STATE:
3718 		error = esp_del_sa(mp, ksi, &diagnostic, espstack,
3719 		    samsg->sadb_msg_type);
3720 		if (error != 0) {
3721 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3722 			    diagnostic, ksi->ks_in_serial);
3723 		}
3724 		/* Else esp_del_sa() took care of things. */
3725 		break;
3726 	case SADB_GET:
3727 		error = sadb_delget_sa(mp, ksi, &espstack->esp_sadb,
3728 		    &diagnostic, espstack->esp_pfkey_q, samsg->sadb_msg_type);
3729 		if (error != 0) {
3730 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3731 			    diagnostic, ksi->ks_in_serial);
3732 		}
3733 		/* Else sadb_get_sa() took care of things. */
3734 		break;
3735 	case SADB_FLUSH:
3736 		sadbp_flush(&espstack->esp_sadb, espstack->ipsecesp_netstack);
3737 		sadb_pfkey_echo(espstack->esp_pfkey_q, mp, samsg, ksi, NULL);
3738 		break;
3739 	case SADB_REGISTER:
3740 		/*
3741 		 * Hmmm, let's do it!  Check for extensions (there should
3742 		 * be none), extract the fields, call esp_register_out(),
3743 		 * then either free or report an error.
3744 		 *
3745 		 * Keysock takes care of the PF_KEY bookkeeping for this.
3746 		 */
3747 		if (esp_register_out(samsg->sadb_msg_seq, samsg->sadb_msg_pid,
3748 		    ksi->ks_in_serial, espstack, msg_getcred(mp, NULL))) {
3749 			freemsg(mp);
3750 		} else {
3751 			/*
3752 			 * Only way this path hits is if there is a memory
3753 			 * failure.  It will not return B_FALSE because of
3754 			 * lack of esp_pfkey_q if I am in wput().
3755 			 */
3756 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, ENOMEM,
3757 			    diagnostic, ksi->ks_in_serial);
3758 		}
3759 		break;
3760 	case SADB_UPDATE:
3761 	case SADB_X_UPDATEPAIR:
3762 		/*
3763 		 * Find a larval, if not there, find a full one and get
3764 		 * strict.
3765 		 */
3766 		error = esp_update_sa(mp, ksi, &diagnostic, espstack,
3767 		    samsg->sadb_msg_type);
3768 		if (error != 0) {
3769 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3770 			    diagnostic, ksi->ks_in_serial);
3771 		}
3772 		/* else esp_update_sa() took care of things. */
3773 		break;
3774 	case SADB_GETSPI:
3775 		/*
3776 		 * Reserve a new larval entry.
3777 		 */
3778 		esp_getspi(mp, ksi, espstack);
3779 		break;
3780 	case SADB_ACQUIRE:
3781 		/*
3782 		 * Find larval and/or ACQUIRE record and kill it (them), I'm
3783 		 * most likely an error.  Inbound ACQUIRE messages should only
3784 		 * have the base header.
3785 		 */
3786 		sadb_in_acquire(samsg, &espstack->esp_sadb,
3787 		    espstack->esp_pfkey_q, espstack->ipsecesp_netstack);
3788 		freemsg(mp);
3789 		break;
3790 	case SADB_DUMP:
3791 		/*
3792 		 * Dump all entries.
3793 		 */
3794 		esp_dump(mp, ksi, espstack);
3795 		/* esp_dump will take care of the return message, etc. */
3796 		break;
3797 	case SADB_EXPIRE:
3798 		/* Should never reach me. */
3799 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, EOPNOTSUPP,
3800 		    diagnostic, ksi->ks_in_serial);
3801 		break;
3802 	default:
3803 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, EINVAL,
3804 		    SADB_X_DIAGNOSTIC_UNKNOWN_MSG, ksi->ks_in_serial);
3805 		break;
3806 	}
3807 }
3808 
3809 /*
3810  * Handle case where PF_KEY says it can't find a keysock for one of my
3811  * ACQUIRE messages.
3812  */
3813 static void
3814 esp_keysock_no_socket(mblk_t *mp, ipsecesp_stack_t *espstack)
3815 {
3816 	sadb_msg_t *samsg;
3817 	keysock_out_err_t *kse = (keysock_out_err_t *)mp->b_rptr;
3818 
3819 	if (mp->b_cont == NULL) {
3820 		freemsg(mp);
3821 		return;
3822 	}
3823 	samsg = (sadb_msg_t *)mp->b_cont->b_rptr;
3824 
3825 	/*
3826 	 * If keysock can't find any registered, delete the acquire record
3827 	 * immediately, and handle errors.
3828 	 */
3829 	if (samsg->sadb_msg_type == SADB_ACQUIRE) {
3830 		samsg->sadb_msg_errno = kse->ks_err_errno;
3831 		samsg->sadb_msg_len = SADB_8TO64(sizeof (*samsg));
3832 		/*
3833 		 * Use the write-side of the esp_pfkey_q
3834 		 */
3835 		sadb_in_acquire(samsg, &espstack->esp_sadb,
3836 		    WR(espstack->esp_pfkey_q), espstack->ipsecesp_netstack);
3837 	}
3838 
3839 	freemsg(mp);
3840 }
3841 
3842 /*
3843  * ESP module read put routine.
3844  */
3845 static int
3846 ipsecesp_rput(queue_t *q, mblk_t *mp)
3847 {
3848 	putnext(q, mp);
3849 	return (0);
3850 }
3851 
3852 /*
3853  * ESP module write put routine.
3854  */
3855 static int
3856 ipsecesp_wput(queue_t *q, mblk_t *mp)
3857 {
3858 	ipsec_info_t *ii;
3859 	struct iocblk *iocp;
3860 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
3861 
3862 	esp3dbg(espstack, ("In esp_wput().\n"));
3863 
3864 	/* NOTE: Each case must take care of freeing or passing mp. */
3865 	switch (mp->b_datap->db_type) {
3866 	case M_CTL:
3867 		if ((mp->b_wptr - mp->b_rptr) < sizeof (ipsec_info_t)) {
3868 			/* Not big enough message. */
3869 			freemsg(mp);
3870 			break;
3871 		}
3872 		ii = (ipsec_info_t *)mp->b_rptr;
3873 
3874 		switch (ii->ipsec_info_type) {
3875 		case KEYSOCK_OUT_ERR:
3876 			esp1dbg(espstack, ("Got KEYSOCK_OUT_ERR message.\n"));
3877 			esp_keysock_no_socket(mp, espstack);
3878 			break;
3879 		case KEYSOCK_IN:
3880 			ESP_BUMP_STAT(espstack, keysock_in);
3881 			esp3dbg(espstack, ("Got KEYSOCK_IN message.\n"));
3882 
3883 			/* Parse the message. */
3884 			esp_parse_pfkey(mp, espstack);
3885 			break;
3886 		case KEYSOCK_HELLO:
3887 			sadb_keysock_hello(&espstack->esp_pfkey_q, q, mp,
3888 			    esp_ager, (void *)espstack, &espstack->esp_event,
3889 			    SADB_SATYPE_ESP);
3890 			break;
3891 		default:
3892 			esp2dbg(espstack, ("Got M_CTL from above of 0x%x.\n",
3893 			    ii->ipsec_info_type));
3894 			freemsg(mp);
3895 			break;
3896 		}
3897 		break;
3898 	case M_IOCTL:
3899 		iocp = (struct iocblk *)mp->b_rptr;
3900 		switch (iocp->ioc_cmd) {
3901 		case ND_SET:
3902 		case ND_GET:
3903 			if (nd_getset(q, espstack->ipsecesp_g_nd, mp)) {
3904 				qreply(q, mp);
3905 				return (0);
3906 			} else {
3907 				iocp->ioc_error = ENOENT;
3908 			}
3909 			/* FALLTHRU */
3910 		default:
3911 			/* We really don't support any other ioctls, do we? */
3912 
3913 			/* Return EINVAL */
3914 			if (iocp->ioc_error != ENOENT)
3915 				iocp->ioc_error = EINVAL;
3916 			iocp->ioc_count = 0;
3917 			mp->b_datap->db_type = M_IOCACK;
3918 			qreply(q, mp);
3919 			return (0);
3920 		}
3921 	default:
3922 		esp3dbg(espstack,
3923 		    ("Got default message, type %d, passing to IP.\n",
3924 		    mp->b_datap->db_type));
3925 		putnext(q, mp);
3926 	}
3927 	return (0);
3928 }
3929 
3930 /*
3931  * Wrapper to allow IP to trigger an ESP association failure message
3932  * during inbound SA selection.
3933  */
3934 void
3935 ipsecesp_in_assocfailure(mblk_t *mp, char level, ushort_t sl, char *fmt,
3936     uint32_t spi, void *addr, int af, ip_recv_attr_t *ira)
3937 {
3938 	netstack_t	*ns = ira->ira_ill->ill_ipst->ips_netstack;
3939 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
3940 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
3941 
3942 	if (espstack->ipsecesp_log_unknown_spi) {
3943 		ipsec_assocfailure(info.mi_idnum, 0, level, sl, fmt, spi,
3944 		    addr, af, espstack->ipsecesp_netstack);
3945 	}
3946 
3947 	ip_drop_packet(mp, B_TRUE, ira->ira_ill,
3948 	    DROPPER(ipss, ipds_esp_no_sa),
3949 	    &espstack->esp_dropper);
3950 }
3951 
3952 /*
3953  * Initialize the ESP input and output processing functions.
3954  */
3955 void
3956 ipsecesp_init_funcs(ipsa_t *sa)
3957 {
3958 	if (sa->ipsa_output_func == NULL)
3959 		sa->ipsa_output_func = esp_outbound;
3960 	if (sa->ipsa_input_func == NULL)
3961 		sa->ipsa_input_func = esp_inbound;
3962 }
3963