xref: /illumos-gate/usr/src/uts/common/inet/ip/ipsecesp.c (revision 6f459ff5b49a8482416f3eab8866c784121ecae3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
25  * Copyright (c) 2017 Joyent, Inc.
26  */
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/stropts.h>
31 #include <sys/errno.h>
32 #include <sys/strlog.h>
33 #include <sys/tihdr.h>
34 #include <sys/socket.h>
35 #include <sys/ddi.h>
36 #include <sys/sunddi.h>
37 #include <sys/kmem.h>
38 #include <sys/zone.h>
39 #include <sys/sysmacros.h>
40 #include <sys/cmn_err.h>
41 #include <sys/vtrace.h>
42 #include <sys/debug.h>
43 #include <sys/atomic.h>
44 #include <sys/strsun.h>
45 #include <sys/random.h>
46 #include <netinet/in.h>
47 #include <net/if.h>
48 #include <netinet/ip6.h>
49 #include <net/pfkeyv2.h>
50 #include <net/pfpolicy.h>
51 
52 #include <inet/common.h>
53 #include <inet/mi.h>
54 #include <inet/nd.h>
55 #include <inet/ip.h>
56 #include <inet/ip_impl.h>
57 #include <inet/ip6.h>
58 #include <inet/ip_if.h>
59 #include <inet/ip_ndp.h>
60 #include <inet/sadb.h>
61 #include <inet/ipsec_info.h>
62 #include <inet/ipsec_impl.h>
63 #include <inet/ipsecesp.h>
64 #include <inet/ipdrop.h>
65 #include <inet/tcp.h>
66 #include <sys/kstat.h>
67 #include <sys/policy.h>
68 #include <sys/strsun.h>
69 #include <sys/strsubr.h>
70 #include <inet/udp_impl.h>
71 #include <sys/taskq.h>
72 #include <sys/note.h>
73 
74 #include <sys/tsol/tnet.h>
75 
76 /*
77  * Table of ND variables supported by ipsecesp. These are loaded into
78  * ipsecesp_g_nd in ipsecesp_init_nd.
79  * All of these are alterable, within the min/max values given, at run time.
80  */
81 static	ipsecespparam_t	lcl_param_arr[] = {
82 	/* min	max			value	name */
83 	{ 0,	3,			0,	"ipsecesp_debug"},
84 	{ 125,	32000, SADB_AGE_INTERVAL_DEFAULT, "ipsecesp_age_interval"},
85 	{ 1,	10,			1,	"ipsecesp_reap_delay"},
86 	{ 1,	SADB_MAX_REPLAY,	64,	"ipsecesp_replay_size"},
87 	{ 1,	300,			15,	"ipsecesp_acquire_timeout"},
88 	{ 1,	1800,			90,	"ipsecesp_larval_timeout"},
89 	/* Default lifetime values for ACQUIRE messages. */
90 	{ 0,	0xffffffffU,	0,	"ipsecesp_default_soft_bytes"},
91 	{ 0,	0xffffffffU,	0,	"ipsecesp_default_hard_bytes"},
92 	{ 0,	0xffffffffU,	24000,	"ipsecesp_default_soft_addtime"},
93 	{ 0,	0xffffffffU,	28800,	"ipsecesp_default_hard_addtime"},
94 	{ 0,	0xffffffffU,	0,	"ipsecesp_default_soft_usetime"},
95 	{ 0,	0xffffffffU,	0,	"ipsecesp_default_hard_usetime"},
96 	{ 0,	1,		0,	"ipsecesp_log_unknown_spi"},
97 	{ 0,	2,		1,	"ipsecesp_padding_check"},
98 	{ 0,	600,		20,	"ipsecesp_nat_keepalive_interval"},
99 };
100 /* For ipsecesp_nat_keepalive_interval, see ipsecesp.h. */
101 
/*
 * Debug output helpers.  esp0dbg() always prints; esp1dbg(), esp2dbg() and
 * esp3dbg() print only when the stack's ipsecesp_debug tunable is at least
 * 1, 2 or 3 respectively.  In every case "a" is a complete, parenthesized
 * printf() argument list, e.g.:
 *
 *	esp1dbg(espstack, ("bad length %d\n", len));
 *
 * The conditional forms are written as "if (quiet) { } else printf a" so
 * that the macro already owns its else clause.  That keeps a caller's
 *
 *	if (cond)
 *		esp1dbg(...);
 *	else
 *		...
 *
 * from silently attaching the caller's "else" to the macro's hidden "if".
 * The espstack argument is parenthesized so any pointer expression works.
 */
#define	esp0dbg(a)	printf a
/* NOTE:  == 0 instead of < 1 so lint doesn't complain. */
#define	esp1dbg(espstack, a)	\
	if ((espstack)->ipsecesp_debug == 0) { } else printf a
#define	esp2dbg(espstack, a)	\
	if ((espstack)->ipsecesp_debug <= 1) { } else printf a
#define	esp3dbg(espstack, a)	\
	if ((espstack)->ipsecesp_debug <= 2) { } else printf a
107 
108 static int ipsecesp_open(queue_t *, dev_t *, int, int, cred_t *);
109 static int ipsecesp_close(queue_t *);
110 static void ipsecesp_wput(queue_t *, mblk_t *);
111 static void	*ipsecesp_stack_init(netstackid_t stackid, netstack_t *ns);
112 static void	ipsecesp_stack_fini(netstackid_t stackid, void *arg);
113 
114 static void esp_prepare_udp(netstack_t *, mblk_t *, ipha_t *);
115 static void esp_outbound_finish(mblk_t *, ip_xmit_attr_t *);
116 static void esp_inbound_restart(mblk_t *, ip_recv_attr_t *);
117 
118 static boolean_t esp_register_out(uint32_t, uint32_t, uint_t,
119     ipsecesp_stack_t *, cred_t *);
120 static boolean_t esp_strip_header(mblk_t *, boolean_t, uint32_t,
121     kstat_named_t **, ipsecesp_stack_t *);
122 static mblk_t *esp_submit_req_inbound(mblk_t *, ip_recv_attr_t *,
123     ipsa_t *, uint_t);
124 static mblk_t *esp_submit_req_outbound(mblk_t *, ip_xmit_attr_t *,
125     ipsa_t *, uchar_t *, uint_t);
126 
127 /* Setable in /etc/system */
128 uint32_t esp_hash_size = IPSEC_DEFAULT_HASH_SIZE;
129 
130 static struct module_info info = {
131 	5137, "ipsecesp", 0, INFPSZ, 65536, 1024
132 };
133 
134 static struct qinit rinit = {
135 	(pfi_t)putnext, NULL, ipsecesp_open, ipsecesp_close, NULL, &info,
136 	NULL
137 };
138 
139 static struct qinit winit = {
140 	(pfi_t)ipsecesp_wput, NULL, ipsecesp_open, ipsecesp_close, NULL, &info,
141 	NULL
142 };
143 
144 struct streamtab ipsecespinfo = {
145 	&rinit, &winit, NULL, NULL
146 };
147 
148 static taskq_t *esp_taskq;
149 
150 /*
151  * OTOH, this one is set at open/close, and I'm D_MTQPAIR for now.
152  *
153  * Question:	Do I need this, given that all instance's esps->esps_wq point
154  *		to IP?
155  *
156  * Answer:	Yes, because I need to know which queue is BOUND to
157  *		IPPROTO_ESP
158  */
159 
160 static int	esp_kstat_update(kstat_t *, int);
161 
162 static boolean_t
163 esp_kstat_init(ipsecesp_stack_t *espstack, netstackid_t stackid)
164 {
165 	espstack->esp_ksp = kstat_create_netstack("ipsecesp", 0, "esp_stat",
166 	    "net", KSTAT_TYPE_NAMED,
167 	    sizeof (esp_kstats_t) / sizeof (kstat_named_t), 0, stackid);
168 
169 	if (espstack->esp_ksp == NULL || espstack->esp_ksp->ks_data == NULL)
170 		return (B_FALSE);
171 
172 	espstack->esp_kstats = espstack->esp_ksp->ks_data;
173 
174 	espstack->esp_ksp->ks_update = esp_kstat_update;
175 	espstack->esp_ksp->ks_private = (void *)(uintptr_t)stackid;
176 
177 #define	K64 KSTAT_DATA_UINT64
178 #define	KI(x) kstat_named_init(&(espstack->esp_kstats->esp_stat_##x), #x, K64)
179 
180 	KI(num_aalgs);
181 	KI(num_ealgs);
182 	KI(good_auth);
183 	KI(bad_auth);
184 	KI(bad_padding);
185 	KI(replay_failures);
186 	KI(replay_early_failures);
187 	KI(keysock_in);
188 	KI(out_requests);
189 	KI(acquire_requests);
190 	KI(bytes_expired);
191 	KI(out_discards);
192 	KI(crypto_sync);
193 	KI(crypto_async);
194 	KI(crypto_failures);
195 	KI(bad_decrypt);
196 	KI(sa_port_renumbers);
197 
198 #undef KI
199 #undef K64
200 
201 	kstat_install(espstack->esp_ksp);
202 
203 	return (B_TRUE);
204 }
205 
206 static int
207 esp_kstat_update(kstat_t *kp, int rw)
208 {
209 	esp_kstats_t *ekp;
210 	netstackid_t	stackid = (zoneid_t)(uintptr_t)kp->ks_private;
211 	netstack_t	*ns;
212 	ipsec_stack_t	*ipss;
213 
214 	if ((kp == NULL) || (kp->ks_data == NULL))
215 		return (EIO);
216 
217 	if (rw == KSTAT_WRITE)
218 		return (EACCES);
219 
220 	ns = netstack_find_by_stackid(stackid);
221 	if (ns == NULL)
222 		return (-1);
223 	ipss = ns->netstack_ipsec;
224 	if (ipss == NULL) {
225 		netstack_rele(ns);
226 		return (-1);
227 	}
228 	ekp = (esp_kstats_t *)kp->ks_data;
229 
230 	rw_enter(&ipss->ipsec_alg_lock, RW_READER);
231 	ekp->esp_stat_num_aalgs.value.ui64 =
232 	    ipss->ipsec_nalgs[IPSEC_ALG_AUTH];
233 	ekp->esp_stat_num_ealgs.value.ui64 =
234 	    ipss->ipsec_nalgs[IPSEC_ALG_ENCR];
235 	rw_exit(&ipss->ipsec_alg_lock);
236 
237 	netstack_rele(ns);
238 	return (0);
239 }
240 
241 #ifdef DEBUG
242 /*
243  * Debug routine, useful to see pre-encryption data.
244  */
245 static char *
246 dump_msg(mblk_t *mp)
247 {
248 	char tmp_str[3], tmp_line[256];
249 
250 	while (mp != NULL) {
251 		unsigned char *ptr;
252 
253 		printf("mblk address 0x%p, length %ld, db_ref %d "
254 		    "type %d, base 0x%p, lim 0x%p\n",
255 		    (void *) mp, (long)(mp->b_wptr - mp->b_rptr),
256 		    mp->b_datap->db_ref, mp->b_datap->db_type,
257 		    (void *)mp->b_datap->db_base, (void *)mp->b_datap->db_lim);
258 		ptr = mp->b_rptr;
259 
260 		tmp_line[0] = '\0';
261 		while (ptr < mp->b_wptr) {
262 			uint_t diff;
263 
264 			diff = (ptr - mp->b_rptr);
265 			if (!(diff & 0x1f)) {
266 				if (strlen(tmp_line) > 0) {
267 					printf("bytes: %s\n", tmp_line);
268 					tmp_line[0] = '\0';
269 				}
270 			}
271 			if (!(diff & 0x3))
272 				(void) strcat(tmp_line, " ");
273 			(void) sprintf(tmp_str, "%02x", *ptr);
274 			(void) strcat(tmp_line, tmp_str);
275 			ptr++;
276 		}
277 		if (strlen(tmp_line) > 0)
278 			printf("bytes: %s\n", tmp_line);
279 
280 		mp = mp->b_cont;
281 	}
282 
283 	return ("\n");
284 }
285 
286 #else /* DEBUG */
287 static char *
288 dump_msg(mblk_t *mp)
289 {
290 	printf("Find value of mp %p.\n", mp);
291 	return ("\n");
292 }
293 #endif /* DEBUG */
294 
295 /*
296  * Don't have to lock age_interval, as only one thread will access it at
297  * a time, because I control the one function that does with timeout().
298  */
299 static void
300 esp_ager(void *arg)
301 {
302 	ipsecesp_stack_t *espstack = (ipsecesp_stack_t *)arg;
303 	netstack_t	*ns = espstack->ipsecesp_netstack;
304 	hrtime_t begin = gethrtime();
305 
306 	sadb_ager(&espstack->esp_sadb.s_v4, espstack->esp_pfkey_q,
307 	    espstack->ipsecesp_reap_delay, ns);
308 	sadb_ager(&espstack->esp_sadb.s_v6, espstack->esp_pfkey_q,
309 	    espstack->ipsecesp_reap_delay, ns);
310 
311 	espstack->esp_event = sadb_retimeout(begin, espstack->esp_pfkey_q,
312 	    esp_ager, espstack,
313 	    &espstack->ipsecesp_age_interval, espstack->ipsecesp_age_int_max,
314 	    info.mi_idnum);
315 }
316 
317 /*
318  * Get an ESP NDD parameter.
319  */
320 /* ARGSUSED */
321 static int
322 ipsecesp_param_get(
323     queue_t	*q,
324     mblk_t	*mp,
325     caddr_t	cp,
326     cred_t *cr)
327 {
328 	ipsecespparam_t	*ipsecesppa = (ipsecespparam_t *)cp;
329 	uint_t value;
330 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
331 
332 	mutex_enter(&espstack->ipsecesp_param_lock);
333 	value = ipsecesppa->ipsecesp_param_value;
334 	mutex_exit(&espstack->ipsecesp_param_lock);
335 
336 	(void) mi_mpprintf(mp, "%u", value);
337 	return (0);
338 }
339 
340 /*
341  * This routine sets an NDD variable in a ipsecespparam_t structure.
342  */
343 /* ARGSUSED */
344 static int
345 ipsecesp_param_set(
346     queue_t	*q,
347     mblk_t	*mp,
348     char	*value,
349     caddr_t	cp,
350     cred_t *cr)
351 {
352 	ulong_t	new_value;
353 	ipsecespparam_t	*ipsecesppa = (ipsecespparam_t *)cp;
354 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
355 
356 	/*
357 	 * Fail the request if the new value does not lie within the
358 	 * required bounds.
359 	 */
360 	if (ddi_strtoul(value, NULL, 10, &new_value) != 0 ||
361 	    new_value < ipsecesppa->ipsecesp_param_min ||
362 	    new_value > ipsecesppa->ipsecesp_param_max) {
363 		return (EINVAL);
364 	}
365 
366 	/* Set the new value */
367 	mutex_enter(&espstack->ipsecesp_param_lock);
368 	ipsecesppa->ipsecesp_param_value = new_value;
369 	mutex_exit(&espstack->ipsecesp_param_lock);
370 	return (0);
371 }
372 
373 /*
374  * Using lifetime NDD variables, fill in an extended combination's
375  * lifetime information.
376  */
377 void
378 ipsecesp_fill_defs(sadb_x_ecomb_t *ecomb, netstack_t *ns)
379 {
380 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
381 
382 	ecomb->sadb_x_ecomb_soft_bytes = espstack->ipsecesp_default_soft_bytes;
383 	ecomb->sadb_x_ecomb_hard_bytes = espstack->ipsecesp_default_hard_bytes;
384 	ecomb->sadb_x_ecomb_soft_addtime =
385 	    espstack->ipsecesp_default_soft_addtime;
386 	ecomb->sadb_x_ecomb_hard_addtime =
387 	    espstack->ipsecesp_default_hard_addtime;
388 	ecomb->sadb_x_ecomb_soft_usetime =
389 	    espstack->ipsecesp_default_soft_usetime;
390 	ecomb->sadb_x_ecomb_hard_usetime =
391 	    espstack->ipsecesp_default_hard_usetime;
392 }
393 
394 /*
395  * Initialize things for ESP at module load time.
396  */
397 boolean_t
398 ipsecesp_ddi_init(void)
399 {
400 	esp_taskq = taskq_create("esp_taskq", 1, minclsyspri,
401 	    IPSEC_TASKQ_MIN, IPSEC_TASKQ_MAX, 0);
402 
403 	/*
404 	 * We want to be informed each time a stack is created or
405 	 * destroyed in the kernel, so we can maintain the
406 	 * set of ipsecesp_stack_t's.
407 	 */
408 	netstack_register(NS_IPSECESP, ipsecesp_stack_init, NULL,
409 	    ipsecesp_stack_fini);
410 
411 	return (B_TRUE);
412 }
413 
414 /*
415  * Walk through the param array specified registering each element with the
416  * named dispatch handler.
417  */
418 static boolean_t
419 ipsecesp_param_register(IDP *ndp, ipsecespparam_t *espp, int cnt)
420 {
421 	for (; cnt-- > 0; espp++) {
422 		if (espp->ipsecesp_param_name != NULL &&
423 		    espp->ipsecesp_param_name[0]) {
424 			if (!nd_load(ndp,
425 			    espp->ipsecesp_param_name,
426 			    ipsecesp_param_get, ipsecesp_param_set,
427 			    (caddr_t)espp)) {
428 				nd_free(ndp);
429 				return (B_FALSE);
430 			}
431 		}
432 	}
433 	return (B_TRUE);
434 }
435 
436 /*
437  * Initialize things for ESP for each stack instance
438  */
439 static void *
440 ipsecesp_stack_init(netstackid_t stackid, netstack_t *ns)
441 {
442 	ipsecesp_stack_t	*espstack;
443 	ipsecespparam_t		*espp;
444 
445 	espstack = (ipsecesp_stack_t *)kmem_zalloc(sizeof (*espstack),
446 	    KM_SLEEP);
447 	espstack->ipsecesp_netstack = ns;
448 
449 	espp = (ipsecespparam_t *)kmem_alloc(sizeof (lcl_param_arr), KM_SLEEP);
450 	espstack->ipsecesp_params = espp;
451 	bcopy(lcl_param_arr, espp, sizeof (lcl_param_arr));
452 
453 	(void) ipsecesp_param_register(&espstack->ipsecesp_g_nd, espp,
454 	    A_CNT(lcl_param_arr));
455 
456 	(void) esp_kstat_init(espstack, stackid);
457 
458 	espstack->esp_sadb.s_acquire_timeout =
459 	    &espstack->ipsecesp_acquire_timeout;
460 	sadbp_init("ESP", &espstack->esp_sadb, SADB_SATYPE_ESP, esp_hash_size,
461 	    espstack->ipsecesp_netstack);
462 
463 	mutex_init(&espstack->ipsecesp_param_lock, NULL, MUTEX_DEFAULT, 0);
464 
465 	ip_drop_register(&espstack->esp_dropper, "IPsec ESP");
466 	return (espstack);
467 }
468 
469 /*
470  * Destroy things for ESP at module unload time.
471  */
472 void
473 ipsecesp_ddi_destroy(void)
474 {
475 	netstack_unregister(NS_IPSECESP);
476 	taskq_destroy(esp_taskq);
477 }
478 
479 /*
480  * Destroy things for ESP for one stack instance
481  */
482 static void
483 ipsecesp_stack_fini(netstackid_t stackid, void *arg)
484 {
485 	ipsecesp_stack_t *espstack = (ipsecesp_stack_t *)arg;
486 
487 	if (espstack->esp_pfkey_q != NULL) {
488 		(void) quntimeout(espstack->esp_pfkey_q, espstack->esp_event);
489 	}
490 	espstack->esp_sadb.s_acquire_timeout = NULL;
491 	sadbp_destroy(&espstack->esp_sadb, espstack->ipsecesp_netstack);
492 	ip_drop_unregister(&espstack->esp_dropper);
493 	mutex_destroy(&espstack->ipsecesp_param_lock);
494 	nd_free(&espstack->ipsecesp_g_nd);
495 
496 	kmem_free(espstack->ipsecesp_params, sizeof (lcl_param_arr));
497 	espstack->ipsecesp_params = NULL;
498 	kstat_delete_netstack(espstack->esp_ksp, stackid);
499 	espstack->esp_ksp = NULL;
500 	espstack->esp_kstats = NULL;
501 	kmem_free(espstack, sizeof (*espstack));
502 }
503 
504 /*
505  * ESP module open routine, which is here for keysock plumbing.
506  * Keysock is pushed over {AH,ESP} which is an artifact from the Bad Old
507  * Days of export control, and fears that ESP would not be allowed
508  * to be shipped at all by default.  Eventually, keysock should
509  * either access AH and ESP via modstubs or krtld dependencies, or
510  * perhaps be folded in with AH and ESP into a single IPsec/netsec
511  * module ("netsec" if PF_KEY provides more than AH/ESP keying tables).
512  */
513 /* ARGSUSED */
514 static int
515 ipsecesp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
516 {
517 	netstack_t		*ns;
518 	ipsecesp_stack_t	*espstack;
519 
520 	if (secpolicy_ip_config(credp, B_FALSE) != 0)
521 		return (EPERM);
522 
523 	if (q->q_ptr != NULL)
524 		return (0);  /* Re-open of an already open instance. */
525 
526 	if (sflag != MODOPEN)
527 		return (EINVAL);
528 
529 	ns = netstack_find_by_cred(credp);
530 	ASSERT(ns != NULL);
531 	espstack = ns->netstack_ipsecesp;
532 	ASSERT(espstack != NULL);
533 
534 	q->q_ptr = espstack;
535 	WR(q)->q_ptr = q->q_ptr;
536 
537 	qprocson(q);
538 	return (0);
539 }
540 
541 /*
542  * ESP module close routine.
543  */
544 static int
545 ipsecesp_close(queue_t *q)
546 {
547 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
548 
549 	/*
550 	 * Clean up q_ptr, if needed.
551 	 */
552 	qprocsoff(q);
553 
554 	/* Keysock queue check is safe, because of OCEXCL perimeter. */
555 
556 	if (q == espstack->esp_pfkey_q) {
557 		esp1dbg(espstack,
558 		    ("ipsecesp_close:  Ummm... keysock is closing ESP.\n"));
559 		espstack->esp_pfkey_q = NULL;
560 		/* Detach qtimeouts. */
561 		(void) quntimeout(q, espstack->esp_event);
562 	}
563 
564 	netstack_rele(espstack->ipsecesp_netstack);
565 	return (0);
566 }
567 
568 /*
569  * Add a number of bytes to what the SA has protected so far.  Return
570  * B_TRUE if the SA can still protect that many bytes.
571  *
572  * Caller must REFRELE the passed-in assoc.  This function must REFRELE
573  * any obtained peer SA.
574  */
575 static boolean_t
576 esp_age_bytes(ipsa_t *assoc, uint64_t bytes, boolean_t inbound)
577 {
578 	ipsa_t *inassoc, *outassoc;
579 	isaf_t *bucket;
580 	boolean_t inrc, outrc, isv6;
581 	sadb_t *sp;
582 	int outhash;
583 	netstack_t		*ns = assoc->ipsa_netstack;
584 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
585 
586 	/* No peer?  No problem! */
587 	if (!assoc->ipsa_haspeer) {
588 		return (sadb_age_bytes(espstack->esp_pfkey_q, assoc, bytes,
589 		    B_TRUE));
590 	}
591 
592 	/*
593 	 * Otherwise, we want to grab both the original assoc and its peer.
594 	 * There might be a race for this, but if it's a real race, two
595 	 * expire messages may occur.  We limit this by only sending the
596 	 * expire message on one of the peers, we'll pick the inbound
597 	 * arbitrarily.
598 	 *
599 	 * If we need tight synchronization on the peer SA, then we need to
600 	 * reconsider.
601 	 */
602 
603 	/* Use address length to select IPv6/IPv4 */
604 	isv6 = (assoc->ipsa_addrfam == AF_INET6);
605 	sp = isv6 ? &espstack->esp_sadb.s_v6 : &espstack->esp_sadb.s_v4;
606 
607 	if (inbound) {
608 		inassoc = assoc;
609 		if (isv6) {
610 			outhash = OUTBOUND_HASH_V6(sp, *((in6_addr_t *)
611 			    &inassoc->ipsa_dstaddr));
612 		} else {
613 			outhash = OUTBOUND_HASH_V4(sp, *((ipaddr_t *)
614 			    &inassoc->ipsa_dstaddr));
615 		}
616 		bucket = &sp->sdb_of[outhash];
617 		mutex_enter(&bucket->isaf_lock);
618 		outassoc = ipsec_getassocbyspi(bucket, inassoc->ipsa_spi,
619 		    inassoc->ipsa_srcaddr, inassoc->ipsa_dstaddr,
620 		    inassoc->ipsa_addrfam);
621 		mutex_exit(&bucket->isaf_lock);
622 		if (outassoc == NULL) {
623 			/* Q: Do we wish to set haspeer == B_FALSE? */
624 			esp0dbg(("esp_age_bytes: "
625 			    "can't find peer for inbound.\n"));
626 			return (sadb_age_bytes(espstack->esp_pfkey_q, inassoc,
627 			    bytes, B_TRUE));
628 		}
629 	} else {
630 		outassoc = assoc;
631 		bucket = INBOUND_BUCKET(sp, outassoc->ipsa_spi);
632 		mutex_enter(&bucket->isaf_lock);
633 		inassoc = ipsec_getassocbyspi(bucket, outassoc->ipsa_spi,
634 		    outassoc->ipsa_srcaddr, outassoc->ipsa_dstaddr,
635 		    outassoc->ipsa_addrfam);
636 		mutex_exit(&bucket->isaf_lock);
637 		if (inassoc == NULL) {
638 			/* Q: Do we wish to set haspeer == B_FALSE? */
639 			esp0dbg(("esp_age_bytes: "
640 			    "can't find peer for outbound.\n"));
641 			return (sadb_age_bytes(espstack->esp_pfkey_q, outassoc,
642 			    bytes, B_TRUE));
643 		}
644 	}
645 
646 	inrc = sadb_age_bytes(espstack->esp_pfkey_q, inassoc, bytes, B_TRUE);
647 	outrc = sadb_age_bytes(espstack->esp_pfkey_q, outassoc, bytes, B_FALSE);
648 
649 	/*
650 	 * REFRELE any peer SA.
651 	 *
652 	 * Because of the multi-line macro nature of IPSA_REFRELE, keep
653 	 * them in { }.
654 	 */
655 	if (inbound) {
656 		IPSA_REFRELE(outassoc);
657 	} else {
658 		IPSA_REFRELE(inassoc);
659 	}
660 
661 	return (inrc && outrc);
662 }
663 
664 /*
665  * Do incoming NAT-T manipulations for packet.
666  * Returns NULL if the mblk chain is consumed.
667  */
668 static mblk_t *
669 esp_fix_natt_checksums(mblk_t *data_mp, ipsa_t *assoc)
670 {
671 	ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
672 	tcpha_t *tcpha;
673 	udpha_t *udpha;
674 	/* Initialize to our inbound cksum adjustment... */
675 	uint32_t sum = assoc->ipsa_inbound_cksum;
676 
677 	switch (ipha->ipha_protocol) {
678 	case IPPROTO_TCP:
679 		tcpha = (tcpha_t *)(data_mp->b_rptr +
680 		    IPH_HDR_LENGTH(ipha));
681 
682 #define	DOWN_SUM(x) (x) = ((x) & 0xFFFF) +	 ((x) >> 16)
683 		sum += ~ntohs(tcpha->tha_sum) & 0xFFFF;
684 		DOWN_SUM(sum);
685 		DOWN_SUM(sum);
686 		tcpha->tha_sum = ~htons(sum);
687 		break;
688 	case IPPROTO_UDP:
689 		udpha = (udpha_t *)(data_mp->b_rptr + IPH_HDR_LENGTH(ipha));
690 
691 		if (udpha->uha_checksum != 0) {
692 			/* Adujst if the inbound one was not zero. */
693 			sum += ~ntohs(udpha->uha_checksum) & 0xFFFF;
694 			DOWN_SUM(sum);
695 			DOWN_SUM(sum);
696 			udpha->uha_checksum = ~htons(sum);
697 			if (udpha->uha_checksum == 0)
698 				udpha->uha_checksum = 0xFFFF;
699 		}
700 #undef DOWN_SUM
701 		break;
702 	case IPPROTO_IP:
703 		/*
704 		 * This case is only an issue for self-encapsulated
705 		 * packets.  So for now, fall through.
706 		 */
707 		break;
708 	}
709 	return (data_mp);
710 }
711 
712 
713 /*
714  * Strip ESP header, check padding, and fix IP header.
715  * Returns B_TRUE on success, B_FALSE if an error occured.
716  */
717 static boolean_t
718 esp_strip_header(mblk_t *data_mp, boolean_t isv4, uint32_t ivlen,
719     kstat_named_t **counter, ipsecesp_stack_t *espstack)
720 {
721 	ipha_t *ipha;
722 	ip6_t *ip6h;
723 	uint_t divpoint;
724 	mblk_t *scratch;
725 	uint8_t nexthdr, padlen;
726 	uint8_t lastpad;
727 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
728 	uint8_t *lastbyte;
729 
730 	/*
731 	 * Strip ESP data and fix IP header.
732 	 *
733 	 * XXX In case the beginning of esp_inbound() changes to not do a
734 	 * pullup, this part of the code can remain unchanged.
735 	 */
736 	if (isv4) {
737 		ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (ipha_t));
738 		ipha = (ipha_t *)data_mp->b_rptr;
739 		ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (esph_t) +
740 		    IPH_HDR_LENGTH(ipha));
741 		divpoint = IPH_HDR_LENGTH(ipha);
742 	} else {
743 		ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (ip6_t));
744 		ip6h = (ip6_t *)data_mp->b_rptr;
745 		divpoint = ip_hdr_length_v6(data_mp, ip6h);
746 	}
747 
748 	scratch = data_mp;
749 	while (scratch->b_cont != NULL)
750 		scratch = scratch->b_cont;
751 
752 	ASSERT((scratch->b_wptr - scratch->b_rptr) >= 3);
753 
754 	/*
755 	 * "Next header" and padding length are the last two bytes in the
756 	 * ESP-protected datagram, thus the explicit - 1 and - 2.
757 	 * lastpad is the last byte of the padding, which can be used for
758 	 * a quick check to see if the padding is correct.
759 	 */
760 	lastbyte = scratch->b_wptr - 1;
761 	nexthdr = *lastbyte--;
762 	padlen = *lastbyte--;
763 
764 	if (isv4) {
765 		/* Fix part of the IP header. */
766 		ipha->ipha_protocol = nexthdr;
767 		/*
768 		 * Reality check the padlen.  The explicit - 2 is for the
769 		 * padding length and the next-header bytes.
770 		 */
771 		if (padlen >= ntohs(ipha->ipha_length) - sizeof (ipha_t) - 2 -
772 		    sizeof (esph_t) - ivlen) {
773 			ESP_BUMP_STAT(espstack, bad_decrypt);
774 			ipsec_rl_strlog(espstack->ipsecesp_netstack,
775 			    info.mi_idnum, 0, 0,
776 			    SL_ERROR | SL_WARN,
777 			    "Corrupt ESP packet (padlen too big).\n");
778 			esp1dbg(espstack, ("padlen (%d) is greater than:\n",
779 			    padlen));
780 			esp1dbg(espstack, ("pkt len(%d) - ip hdr - esp "
781 			    "hdr - ivlen(%d) = %d.\n",
782 			    ntohs(ipha->ipha_length), ivlen,
783 			    (int)(ntohs(ipha->ipha_length) - sizeof (ipha_t) -
784 			    2 - sizeof (esph_t) - ivlen)));
785 			*counter = DROPPER(ipss, ipds_esp_bad_padlen);
786 			return (B_FALSE);
787 		}
788 
789 		/*
790 		 * Fix the rest of the header.  The explicit - 2 is for the
791 		 * padding length and the next-header bytes.
792 		 */
793 		ipha->ipha_length = htons(ntohs(ipha->ipha_length) - padlen -
794 		    2 - sizeof (esph_t) - ivlen);
795 		ipha->ipha_hdr_checksum = 0;
796 		ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
797 	} else {
798 		if (ip6h->ip6_nxt == IPPROTO_ESP) {
799 			ip6h->ip6_nxt = nexthdr;
800 		} else {
801 			ip_pkt_t ipp;
802 
803 			bzero(&ipp, sizeof (ipp));
804 			(void) ip_find_hdr_v6(data_mp, ip6h, B_FALSE, &ipp,
805 			    NULL);
806 			if (ipp.ipp_dstopts != NULL) {
807 				ipp.ipp_dstopts->ip6d_nxt = nexthdr;
808 			} else if (ipp.ipp_rthdr != NULL) {
809 				ipp.ipp_rthdr->ip6r_nxt = nexthdr;
810 			} else if (ipp.ipp_hopopts != NULL) {
811 				ipp.ipp_hopopts->ip6h_nxt = nexthdr;
812 			} else {
813 				/* Panic a DEBUG kernel. */
814 				ASSERT(ipp.ipp_hopopts != NULL);
815 				/* Otherwise, pretend it's IP + ESP. */
816 				cmn_err(CE_WARN, "ESP IPv6 headers wrong.\n");
817 				ip6h->ip6_nxt = nexthdr;
818 			}
819 		}
820 
821 		if (padlen >= ntohs(ip6h->ip6_plen) - 2 - sizeof (esph_t) -
822 		    ivlen) {
823 			ESP_BUMP_STAT(espstack, bad_decrypt);
824 			ipsec_rl_strlog(espstack->ipsecesp_netstack,
825 			    info.mi_idnum, 0, 0,
826 			    SL_ERROR | SL_WARN,
827 			    "Corrupt ESP packet (v6 padlen too big).\n");
828 			esp1dbg(espstack, ("padlen (%d) is greater than:\n",
829 			    padlen));
830 			esp1dbg(espstack,
831 			    ("pkt len(%u) - ip hdr - esp hdr - ivlen(%d) = "
832 			    "%u.\n", (unsigned)(ntohs(ip6h->ip6_plen)
833 			    + sizeof (ip6_t)), ivlen,
834 			    (unsigned)(ntohs(ip6h->ip6_plen) - 2 -
835 			    sizeof (esph_t) - ivlen)));
836 			*counter = DROPPER(ipss, ipds_esp_bad_padlen);
837 			return (B_FALSE);
838 		}
839 
840 
841 		/*
842 		 * Fix the rest of the header.  The explicit - 2 is for the
843 		 * padding length and the next-header bytes.  IPv6 is nice,
844 		 * because there's no hdr checksum!
845 		 */
846 		ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - padlen -
847 		    2 - sizeof (esph_t) - ivlen);
848 	}
849 
850 	if (espstack->ipsecesp_padding_check > 0 && padlen > 0) {
851 		/*
852 		 * Weak padding check: compare last-byte to length, they
853 		 * should be equal.
854 		 */
855 		lastpad = *lastbyte--;
856 
857 		if (padlen != lastpad) {
858 			ipsec_rl_strlog(espstack->ipsecesp_netstack,
859 			    info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
860 			    "Corrupt ESP packet (lastpad != padlen).\n");
861 			esp1dbg(espstack,
862 			    ("lastpad (%d) not equal to padlen (%d):\n",
863 			    lastpad, padlen));
864 			ESP_BUMP_STAT(espstack, bad_padding);
865 			*counter = DROPPER(ipss, ipds_esp_bad_padding);
866 			return (B_FALSE);
867 		}
868 
869 		/*
870 		 * Strong padding check: Check all pad bytes to see that
871 		 * they're ascending.  Go backwards using a descending counter
872 		 * to verify.  padlen == 1 is checked by previous block, so
873 		 * only bother if we've more than 1 byte of padding.
874 		 * Consequently, start the check one byte before the location
875 		 * of "lastpad".
876 		 */
877 		if (espstack->ipsecesp_padding_check > 1) {
878 			/*
879 			 * This assert may have to become an if and a pullup
880 			 * if we start accepting multi-dblk mblks. For now,
881 			 * though, any packet here will have been pulled up in
882 			 * esp_inbound.
883 			 */
884 			ASSERT(MBLKL(scratch) >= lastpad + 3);
885 
886 			/*
887 			 * Use "--lastpad" because we already checked the very
888 			 * last pad byte previously.
889 			 */
890 			while (--lastpad != 0) {
891 				if (lastpad != *lastbyte) {
892 					ipsec_rl_strlog(
893 					    espstack->ipsecesp_netstack,
894 					    info.mi_idnum, 0, 0,
895 					    SL_ERROR | SL_WARN, "Corrupt ESP "
896 					    "packet (bad padding).\n");
897 					esp1dbg(espstack,
898 					    ("padding not in correct"
899 					    " format:\n"));
900 					ESP_BUMP_STAT(espstack, bad_padding);
901 					*counter = DROPPER(ipss,
902 					    ipds_esp_bad_padding);
903 					return (B_FALSE);
904 				}
905 				lastbyte--;
906 			}
907 		}
908 	}
909 
910 	/* Trim off the padding. */
911 	ASSERT(data_mp->b_cont == NULL);
912 	data_mp->b_wptr -= (padlen + 2);
913 
914 	/*
915 	 * Remove the ESP header.
916 	 *
917 	 * The above assertions about data_mp's size will make this work.
918 	 *
919 	 * XXX  Question:  If I send up and get back a contiguous mblk,
920 	 * would it be quicker to bcopy over, or keep doing the dupb stuff?
921 	 * I go with copying for now.
922 	 */
923 
924 	if (IS_P2ALIGNED(data_mp->b_rptr, sizeof (uint32_t)) &&
925 	    IS_P2ALIGNED(ivlen, sizeof (uint32_t))) {
926 		uint8_t *start = data_mp->b_rptr;
927 		uint32_t *src, *dst;
928 
929 		src = (uint32_t *)(start + divpoint);
930 		dst = (uint32_t *)(start + divpoint + sizeof (esph_t) + ivlen);
931 
932 		ASSERT(IS_P2ALIGNED(dst, sizeof (uint32_t)) &&
933 		    IS_P2ALIGNED(src, sizeof (uint32_t)));
934 
935 		do {
936 			src--;
937 			dst--;
938 			*dst = *src;
939 		} while (src != (uint32_t *)start);
940 
941 		data_mp->b_rptr = (uchar_t *)dst;
942 	} else {
943 		uint8_t *start = data_mp->b_rptr;
944 		uint8_t *src, *dst;
945 
946 		src = start + divpoint;
947 		dst = src + sizeof (esph_t) + ivlen;
948 
949 		do {
950 			src--;
951 			dst--;
952 			*dst = *src;
953 		} while (src != start);
954 
955 		data_mp->b_rptr = dst;
956 	}
957 
958 	esp2dbg(espstack, ("data_mp after inbound ESP adjustment:\n"));
959 	esp2dbg(espstack, (dump_msg(data_mp)));
960 
961 	return (B_TRUE);
962 }
963 
964 /*
965  * Updating use times can be tricky business if the ipsa_haspeer flag is
966  * set.  This function is called once in an SA's lifetime.
967  *
968  * Caller has to REFRELE "assoc" which is passed in.  This function has
969  * to REFRELE any peer SA that is obtained.
970  */
971 static void
972 esp_set_usetime(ipsa_t *assoc, boolean_t inbound)
973 {
974 	ipsa_t *inassoc, *outassoc;
975 	isaf_t *bucket;
976 	sadb_t *sp;
977 	int outhash;
978 	boolean_t isv6;
979 	netstack_t		*ns = assoc->ipsa_netstack;
980 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
981 
982 	/* No peer?  No problem! */
983 	if (!assoc->ipsa_haspeer) {
984 		sadb_set_usetime(assoc);
985 		return;
986 	}
987 
988 	/*
989 	 * Otherwise, we want to grab both the original assoc and its peer.
990 	 * There might be a race for this, but if it's a real race, the times
991 	 * will be out-of-synch by at most a second, and since our time
992 	 * granularity is a second, this won't be a problem.
993 	 *
994 	 * If we need tight synchronization on the peer SA, then we need to
995 	 * reconsider.
996 	 */
997 
998 	/* Use address length to select IPv6/IPv4 */
999 	isv6 = (assoc->ipsa_addrfam == AF_INET6);
1000 	sp = isv6 ? &espstack->esp_sadb.s_v6 : &espstack->esp_sadb.s_v4;
1001 
1002 	if (inbound) {
1003 		inassoc = assoc;
1004 		if (isv6) {
1005 			outhash = OUTBOUND_HASH_V6(sp, *((in6_addr_t *)
1006 			    &inassoc->ipsa_dstaddr));
1007 		} else {
1008 			outhash = OUTBOUND_HASH_V4(sp, *((ipaddr_t *)
1009 			    &inassoc->ipsa_dstaddr));
1010 		}
1011 		bucket = &sp->sdb_of[outhash];
1012 		mutex_enter(&bucket->isaf_lock);
1013 		outassoc = ipsec_getassocbyspi(bucket, inassoc->ipsa_spi,
1014 		    inassoc->ipsa_srcaddr, inassoc->ipsa_dstaddr,
1015 		    inassoc->ipsa_addrfam);
1016 		mutex_exit(&bucket->isaf_lock);
1017 		if (outassoc == NULL) {
1018 			/* Q: Do we wish to set haspeer == B_FALSE? */
1019 			esp0dbg(("esp_set_usetime: "
1020 			    "can't find peer for inbound.\n"));
1021 			sadb_set_usetime(inassoc);
1022 			return;
1023 		}
1024 	} else {
1025 		outassoc = assoc;
1026 		bucket = INBOUND_BUCKET(sp, outassoc->ipsa_spi);
1027 		mutex_enter(&bucket->isaf_lock);
1028 		inassoc = ipsec_getassocbyspi(bucket, outassoc->ipsa_spi,
1029 		    outassoc->ipsa_srcaddr, outassoc->ipsa_dstaddr,
1030 		    outassoc->ipsa_addrfam);
1031 		mutex_exit(&bucket->isaf_lock);
1032 		if (inassoc == NULL) {
1033 			/* Q: Do we wish to set haspeer == B_FALSE? */
1034 			esp0dbg(("esp_set_usetime: "
1035 			    "can't find peer for outbound.\n"));
1036 			sadb_set_usetime(outassoc);
1037 			return;
1038 		}
1039 	}
1040 
1041 	/* Update usetime on both. */
1042 	sadb_set_usetime(inassoc);
1043 	sadb_set_usetime(outassoc);
1044 
1045 	/*
1046 	 * REFRELE any peer SA.
1047 	 *
1048 	 * Because of the multi-line macro nature of IPSA_REFRELE, keep
1049 	 * them in { }.
1050 	 */
1051 	if (inbound) {
1052 		IPSA_REFRELE(outassoc);
1053 	} else {
1054 		IPSA_REFRELE(inassoc);
1055 	}
1056 }
1057 
1058 /*
1059  * Handle ESP inbound data for IPv4 and IPv6.
1060  * On success returns B_TRUE, on failure returns B_FALSE and frees the
1061  * mblk chain data_mp.
1062  */
1063 mblk_t *
1064 esp_inbound(mblk_t *data_mp, void *arg, ip_recv_attr_t *ira)
1065 {
1066 	esph_t *esph = (esph_t *)arg;
1067 	ipsa_t *ipsa = ira->ira_ipsec_esp_sa;
1068 	netstack_t	*ns = ira->ira_ill->ill_ipst->ips_netstack;
1069 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1070 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1071 
1072 	/*
1073 	 * We may wish to check replay in-range-only here as an optimization.
1074 	 * Include the reality check of ipsa->ipsa_replay >
1075 	 * ipsa->ipsa_replay_wsize for times when it's the first N packets,
1076 	 * where N == ipsa->ipsa_replay_wsize.
1077 	 *
1078 	 * Another check that may come here later is the "collision" check.
1079 	 * If legitimate packets flow quickly enough, this won't be a problem,
1080 	 * but collisions may cause authentication algorithm crunching to
1081 	 * take place when it doesn't need to.
1082 	 */
1083 	if (!sadb_replay_peek(ipsa, esph->esph_replay)) {
1084 		ESP_BUMP_STAT(espstack, replay_early_failures);
1085 		IP_ESP_BUMP_STAT(ipss, in_discards);
1086 		ip_drop_packet(data_mp, B_TRUE, ira->ira_ill,
1087 		    DROPPER(ipss, ipds_esp_early_replay),
1088 		    &espstack->esp_dropper);
1089 		BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1090 		return (NULL);
1091 	}
1092 
1093 	/*
1094 	 * Adjust the IP header's payload length to reflect the removal
1095 	 * of the ICV.
1096 	 */
1097 	if (!(ira->ira_flags & IRAF_IS_IPV4)) {
1098 		ip6_t *ip6h = (ip6_t *)data_mp->b_rptr;
1099 		ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) -
1100 		    ipsa->ipsa_mac_len);
1101 	} else {
1102 		ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
1103 		ipha->ipha_length = htons(ntohs(ipha->ipha_length) -
1104 		    ipsa->ipsa_mac_len);
1105 	}
1106 
1107 	/* submit the request to the crypto framework */
1108 	return (esp_submit_req_inbound(data_mp, ira, ipsa,
1109 	    (uint8_t *)esph - data_mp->b_rptr));
1110 }
1111 
1112 /* XXX refactor me */
1113 /*
1114  * Handle the SADB_GETSPI message.  Create a larval SA.
1115  */
1116 static void
1117 esp_getspi(mblk_t *mp, keysock_in_t *ksi, ipsecesp_stack_t *espstack)
1118 {
1119 	ipsa_t *newbie, *target;
1120 	isaf_t *outbound, *inbound;
1121 	int rc, diagnostic;
1122 	sadb_sa_t *assoc;
1123 	keysock_out_t *kso;
1124 	uint32_t newspi;
1125 
1126 	/*
1127 	 * Randomly generate a proposed SPI value
1128 	 */
1129 	if (cl_inet_getspi != NULL) {
1130 		cl_inet_getspi(espstack->ipsecesp_netstack->netstack_stackid,
1131 		    IPPROTO_ESP, (uint8_t *)&newspi, sizeof (uint32_t), NULL);
1132 	} else {
1133 		(void) random_get_pseudo_bytes((uint8_t *)&newspi,
1134 		    sizeof (uint32_t));
1135 	}
1136 	newbie = sadb_getspi(ksi, newspi, &diagnostic,
1137 	    espstack->ipsecesp_netstack, IPPROTO_ESP);
1138 
1139 	if (newbie == NULL) {
1140 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, ENOMEM, diagnostic,
1141 		    ksi->ks_in_serial);
1142 		return;
1143 	} else if (newbie == (ipsa_t *)-1) {
1144 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, EINVAL, diagnostic,
1145 		    ksi->ks_in_serial);
1146 		return;
1147 	}
1148 
1149 	/*
1150 	 * XXX - We may randomly collide.  We really should recover from this.
1151 	 *	 Unfortunately, that could require spending way-too-much-time
1152 	 *	 in here.  For now, let the user retry.
1153 	 */
1154 
1155 	if (newbie->ipsa_addrfam == AF_INET6) {
1156 		outbound = OUTBOUND_BUCKET_V6(&espstack->esp_sadb.s_v6,
1157 		    *(uint32_t *)(newbie->ipsa_dstaddr));
1158 		inbound = INBOUND_BUCKET(&espstack->esp_sadb.s_v6,
1159 		    newbie->ipsa_spi);
1160 	} else {
1161 		ASSERT(newbie->ipsa_addrfam == AF_INET);
1162 		outbound = OUTBOUND_BUCKET_V4(&espstack->esp_sadb.s_v4,
1163 		    *(uint32_t *)(newbie->ipsa_dstaddr));
1164 		inbound = INBOUND_BUCKET(&espstack->esp_sadb.s_v4,
1165 		    newbie->ipsa_spi);
1166 	}
1167 
1168 	mutex_enter(&outbound->isaf_lock);
1169 	mutex_enter(&inbound->isaf_lock);
1170 
1171 	/*
1172 	 * Check for collisions (i.e. did sadb_getspi() return with something
1173 	 * that already exists?).
1174 	 *
1175 	 * Try outbound first.  Even though SADB_GETSPI is traditionally
1176 	 * for inbound SAs, you never know what a user might do.
1177 	 */
1178 	target = ipsec_getassocbyspi(outbound, newbie->ipsa_spi,
1179 	    newbie->ipsa_srcaddr, newbie->ipsa_dstaddr, newbie->ipsa_addrfam);
1180 	if (target == NULL) {
1181 		target = ipsec_getassocbyspi(inbound, newbie->ipsa_spi,
1182 		    newbie->ipsa_srcaddr, newbie->ipsa_dstaddr,
1183 		    newbie->ipsa_addrfam);
1184 	}
1185 
1186 	/*
1187 	 * I don't have collisions elsewhere!
1188 	 * (Nor will I because I'm still holding inbound/outbound locks.)
1189 	 */
1190 
1191 	if (target != NULL) {
1192 		rc = EEXIST;
1193 		IPSA_REFRELE(target);
1194 	} else {
1195 		/*
1196 		 * sadb_insertassoc() also checks for collisions, so
1197 		 * if there's a colliding entry, rc will be set
1198 		 * to EEXIST.
1199 		 */
1200 		rc = sadb_insertassoc(newbie, inbound);
1201 		newbie->ipsa_hardexpiretime = gethrestime_sec();
1202 		newbie->ipsa_hardexpiretime +=
1203 		    espstack->ipsecesp_larval_timeout;
1204 	}
1205 
1206 	/*
1207 	 * Can exit outbound mutex.  Hold inbound until we're done
1208 	 * with newbie.
1209 	 */
1210 	mutex_exit(&outbound->isaf_lock);
1211 
1212 	if (rc != 0) {
1213 		mutex_exit(&inbound->isaf_lock);
1214 		IPSA_REFRELE(newbie);
1215 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, rc,
1216 		    SADB_X_DIAGNOSTIC_NONE, ksi->ks_in_serial);
1217 		return;
1218 	}
1219 
1220 
1221 	/* Can write here because I'm still holding the bucket lock. */
1222 	newbie->ipsa_type = SADB_SATYPE_ESP;
1223 
1224 	/*
1225 	 * Construct successful return message. We have one thing going
1226 	 * for us in PF_KEY v2.  That's the fact that
1227 	 *	sizeof (sadb_spirange_t) == sizeof (sadb_sa_t)
1228 	 */
1229 	assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SPIRANGE];
1230 	assoc->sadb_sa_exttype = SADB_EXT_SA;
1231 	assoc->sadb_sa_spi = newbie->ipsa_spi;
1232 	*((uint64_t *)(&assoc->sadb_sa_replay)) = 0;
1233 	mutex_exit(&inbound->isaf_lock);
1234 
1235 	/* Convert KEYSOCK_IN to KEYSOCK_OUT. */
1236 	kso = (keysock_out_t *)ksi;
1237 	kso->ks_out_len = sizeof (*kso);
1238 	kso->ks_out_serial = ksi->ks_in_serial;
1239 	kso->ks_out_type = KEYSOCK_OUT;
1240 
1241 	/*
1242 	 * Can safely putnext() to esp_pfkey_q, because this is a turnaround
1243 	 * from the esp_pfkey_q.
1244 	 */
1245 	putnext(espstack->esp_pfkey_q, mp);
1246 }
1247 
1248 /*
1249  * Insert the ESP header into a packet.  Duplicate an mblk, and insert a newly
1250  * allocated mblk with the ESP header in between the two.
1251  */
1252 static boolean_t
1253 esp_insert_esp(mblk_t *mp, mblk_t *esp_mp, uint_t divpoint,
1254     ipsecesp_stack_t *espstack)
1255 {
1256 	mblk_t *split_mp = mp;
1257 	uint_t wheretodiv = divpoint;
1258 
1259 	while ((split_mp->b_wptr - split_mp->b_rptr) < wheretodiv) {
1260 		wheretodiv -= (split_mp->b_wptr - split_mp->b_rptr);
1261 		split_mp = split_mp->b_cont;
1262 		ASSERT(split_mp != NULL);
1263 	}
1264 
1265 	if (split_mp->b_wptr - split_mp->b_rptr != wheretodiv) {
1266 		mblk_t *scratch;
1267 
1268 		/* "scratch" is the 2nd half, split_mp is the first. */
1269 		scratch = dupb(split_mp);
1270 		if (scratch == NULL) {
1271 			esp1dbg(espstack,
1272 			    ("esp_insert_esp: can't allocate scratch.\n"));
1273 			return (B_FALSE);
1274 		}
1275 		/* NOTE:  dupb() doesn't set b_cont appropriately. */
1276 		scratch->b_cont = split_mp->b_cont;
1277 		scratch->b_rptr += wheretodiv;
1278 		split_mp->b_wptr = split_mp->b_rptr + wheretodiv;
1279 		split_mp->b_cont = scratch;
1280 	}
1281 	/*
1282 	 * At this point, split_mp is exactly "wheretodiv" bytes long, and
1283 	 * holds the end of the pre-ESP part of the datagram.
1284 	 */
1285 	esp_mp->b_cont = split_mp->b_cont;
1286 	split_mp->b_cont = esp_mp;
1287 
1288 	return (B_TRUE);
1289 }
1290 
1291 /*
1292  * Section 7 of RFC 3947 says:
1293  *
1294  * 7.  Recovering from the Expiring NAT Mappings
1295  *
1296  *    There are cases where NAT box decides to remove mappings that are still
1297  *    alive (for example, when the keepalive interval is too long, or when the
1298  *    NAT box is rebooted).  To recover from this, ends that are NOT behind
1299  *    NAT SHOULD use the last valid UDP encapsulated IKE or IPsec packet from
1300  *    the other end to determine which IP and port addresses should be used.
1301  *    The host behind dynamic NAT MUST NOT do this, as otherwise it opens a
1302  *    DoS attack possibility because the IP address or port of the other host
1303  *    will not change (it is not behind NAT).
1304  *
1305  *    Keepalives cannot be used for these purposes, as they are not
1306  *    authenticated, but any IKE authenticated IKE packet or ESP packet can be
1307  *    used to detect whether the IP address or the port has changed.
1308  *
1309  * The following function will check an SA and its explicitly-set pair to see
1310  * if the NAT-T remote port matches the received packet (which must have
1311  * passed ESP authentication, see esp_in_done() for the caller context).  If
1312  * there is a mismatch, the SAs are updated.  It is not important if we race
1313  * with a transmitting thread, as if there is a transmitting thread, it will
1314  * merely emit a packet that will most-likely be dropped.
1315  *
1316  * "ports" are ordered src,dst, and assoc is an inbound SA, where src should
1317  * match ipsa_remote_nat_port and dst should match ipsa_local_nat_port.
1318  */
1319 #ifdef _LITTLE_ENDIAN
1320 #define	FIRST_16(x) ((x) & 0xFFFF)
1321 #define	NEXT_16(x) (((x) >> 16) & 0xFFFF)
1322 #else
1323 #define	FIRST_16(x) (((x) >> 16) & 0xFFFF)
1324 #define	NEXT_16(x) ((x) & 0xFFFF)
1325 #endif
1326 static void
1327 esp_port_freshness(uint32_t ports, ipsa_t *assoc)
1328 {
1329 	uint16_t remote = FIRST_16(ports);
1330 	uint16_t local = NEXT_16(ports);
1331 	ipsa_t *outbound_peer;
1332 	isaf_t *bucket;
1333 	ipsecesp_stack_t *espstack = assoc->ipsa_netstack->netstack_ipsecesp;
1334 
1335 	/* We found a conn_t, therefore local != 0. */
1336 	ASSERT(local != 0);
1337 	/* Assume an IPv4 SA. */
1338 	ASSERT(assoc->ipsa_addrfam == AF_INET);
1339 
1340 	/*
1341 	 * On-the-wire rport == 0 means something's very wrong.
1342 	 * An unpaired SA is also useless to us.
1343 	 * If we are behind the NAT, don't bother.
1344 	 * A zero local NAT port defaults to 4500, so check that too.
1345 	 * And, of course, if the ports already match, we don't need to
1346 	 * bother.
1347 	 */
1348 	if (remote == 0 || assoc->ipsa_otherspi == 0 ||
1349 	    (assoc->ipsa_flags & IPSA_F_BEHIND_NAT) ||
1350 	    (assoc->ipsa_remote_nat_port == 0 &&
1351 	    remote == htons(IPPORT_IKE_NATT)) ||
1352 	    remote == assoc->ipsa_remote_nat_port)
1353 		return;
1354 
1355 	/* Try and snag the peer.   NOTE:  Assume IPv4 for now. */
1356 	bucket = OUTBOUND_BUCKET_V4(&(espstack->esp_sadb.s_v4),
1357 	    assoc->ipsa_srcaddr[0]);
1358 	mutex_enter(&bucket->isaf_lock);
1359 	outbound_peer = ipsec_getassocbyspi(bucket, assoc->ipsa_otherspi,
1360 	    assoc->ipsa_dstaddr, assoc->ipsa_srcaddr, AF_INET);
1361 	mutex_exit(&bucket->isaf_lock);
1362 
1363 	/* We probably lost a race to a deleting or expiring thread. */
1364 	if (outbound_peer == NULL)
1365 		return;
1366 
1367 	/*
1368 	 * Hold the mutexes for both SAs so we don't race another inbound
1369 	 * thread.  A lock-entry order shouldn't matter, since all other
1370 	 * per-ipsa locks are individually held-then-released.
1371 	 *
1372 	 * Luckily, this has nothing to do with the remote-NAT address,
1373 	 * so we don't have to re-scribble the cached-checksum differential.
1374 	 */
1375 	mutex_enter(&outbound_peer->ipsa_lock);
1376 	mutex_enter(&assoc->ipsa_lock);
1377 	outbound_peer->ipsa_remote_nat_port = assoc->ipsa_remote_nat_port =
1378 	    remote;
1379 	mutex_exit(&assoc->ipsa_lock);
1380 	mutex_exit(&outbound_peer->ipsa_lock);
1381 	IPSA_REFRELE(outbound_peer);
1382 	ESP_BUMP_STAT(espstack, sa_port_renumbers);
1383 }
1384 /*
1385  * Finish processing of an inbound ESP packet after processing by the
1386  * crypto framework.
1387  * - Remove the ESP header.
1388  * - Send packet back to IP.
1389  * If authentication was performed on the packet, this function is called
1390  * only if the authentication succeeded.
1391  * On success returns B_TRUE, on failure returns B_FALSE and frees the
1392  * mblk chain data_mp.
1393  */
1394 static mblk_t *
1395 esp_in_done(mblk_t *data_mp, ip_recv_attr_t *ira, ipsec_crypto_t *ic)
1396 {
1397 	ipsa_t *assoc;
1398 	uint_t espstart;
1399 	uint32_t ivlen = 0;
1400 	uint_t processed_len;
1401 	esph_t *esph;
1402 	kstat_named_t *counter;
1403 	boolean_t is_natt;
1404 	netstack_t	*ns = ira->ira_ill->ill_ipst->ips_netstack;
1405 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1406 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1407 
1408 	assoc = ira->ira_ipsec_esp_sa;
1409 	ASSERT(assoc != NULL);
1410 
1411 	is_natt = ((assoc->ipsa_flags & IPSA_F_NATT) != 0);
1412 
1413 	/* get the pointer to the ESP header */
1414 	if (assoc->ipsa_encr_alg == SADB_EALG_NULL) {
1415 		/* authentication-only ESP */
1416 		espstart = ic->ic_crypto_data.cd_offset;
1417 		processed_len = ic->ic_crypto_data.cd_length;
1418 	} else {
1419 		/* encryption present */
1420 		ivlen = assoc->ipsa_iv_len;
1421 		if (assoc->ipsa_auth_alg == SADB_AALG_NONE) {
1422 			/* encryption-only ESP */
1423 			espstart = ic->ic_crypto_data.cd_offset -
1424 			    sizeof (esph_t) - assoc->ipsa_iv_len;
1425 			processed_len = ic->ic_crypto_data.cd_length +
1426 			    ivlen;
1427 		} else {
1428 			/* encryption with authentication */
1429 			espstart = ic->ic_crypto_dual_data.dd_offset1;
1430 			processed_len = ic->ic_crypto_dual_data.dd_len2 +
1431 			    ivlen;
1432 		}
1433 	}
1434 
1435 	esph = (esph_t *)(data_mp->b_rptr + espstart);
1436 
1437 	if (assoc->ipsa_auth_alg != IPSA_AALG_NONE ||
1438 	    (assoc->ipsa_flags & IPSA_F_COMBINED)) {
1439 		/*
1440 		 * Authentication passed if we reach this point.
1441 		 * Packets with authentication will have the ICV
1442 		 * after the crypto data. Adjust b_wptr before
1443 		 * making padlen checks.
1444 		 */
1445 		ESP_BUMP_STAT(espstack, good_auth);
1446 		data_mp->b_wptr -= assoc->ipsa_mac_len;
1447 
1448 		/*
1449 		 * Check replay window here!
1450 		 * For right now, assume keysock will set the replay window
1451 		 * size to zero for SAs that have an unspecified sender.
1452 		 * This may change...
1453 		 */
1454 
1455 		if (!sadb_replay_check(assoc, esph->esph_replay)) {
1456 			/*
1457 			 * Log the event. As of now we print out an event.
1458 			 * Do not print the replay failure number, or else
1459 			 * syslog cannot collate the error messages.  Printing
1460 			 * the replay number that failed opens a denial-of-
1461 			 * service attack.
1462 			 */
1463 			ipsec_assocfailure(info.mi_idnum, 0, 0,
1464 			    SL_ERROR | SL_WARN,
1465 			    "Replay failed for ESP spi 0x%x, dst %s.\n",
1466 			    assoc->ipsa_spi, assoc->ipsa_dstaddr,
1467 			    assoc->ipsa_addrfam, espstack->ipsecesp_netstack);
1468 			ESP_BUMP_STAT(espstack, replay_failures);
1469 			counter = DROPPER(ipss, ipds_esp_replay);
1470 			goto drop_and_bail;
1471 		}
1472 
1473 		if (is_natt) {
1474 			ASSERT(ira->ira_flags & IRAF_ESP_UDP_PORTS);
1475 			ASSERT(ira->ira_esp_udp_ports != 0);
1476 			esp_port_freshness(ira->ira_esp_udp_ports, assoc);
1477 		}
1478 	}
1479 
1480 	esp_set_usetime(assoc, B_TRUE);
1481 
1482 	if (!esp_age_bytes(assoc, processed_len, B_TRUE)) {
1483 		/* The ipsa has hit hard expiration, LOG and AUDIT. */
1484 		ipsec_assocfailure(info.mi_idnum, 0, 0,
1485 		    SL_ERROR | SL_WARN,
1486 		    "ESP association 0x%x, dst %s had bytes expire.\n",
1487 		    assoc->ipsa_spi, assoc->ipsa_dstaddr, assoc->ipsa_addrfam,
1488 		    espstack->ipsecesp_netstack);
1489 		ESP_BUMP_STAT(espstack, bytes_expired);
1490 		counter = DROPPER(ipss, ipds_esp_bytes_expire);
1491 		goto drop_and_bail;
1492 	}
1493 
1494 	/*
1495 	 * Remove ESP header and padding from packet.  I hope the compiler
1496 	 * spews "branch, predict taken" code for this.
1497 	 */
1498 
1499 	if (esp_strip_header(data_mp, (ira->ira_flags & IRAF_IS_IPV4),
1500 	    ivlen, &counter, espstack)) {
1501 
1502 		if (is_system_labeled() && assoc->ipsa_tsl != NULL) {
1503 			if (!ip_recv_attr_replace_label(ira, assoc->ipsa_tsl)) {
1504 				ip_drop_packet(data_mp, B_TRUE, ira->ira_ill,
1505 				    DROPPER(ipss, ipds_ah_nomem),
1506 				    &espstack->esp_dropper);
1507 				BUMP_MIB(ira->ira_ill->ill_ip_mib,
1508 				    ipIfStatsInDiscards);
1509 				return (NULL);
1510 			}
1511 		}
1512 		if (is_natt)
1513 			return (esp_fix_natt_checksums(data_mp, assoc));
1514 
1515 		if (assoc->ipsa_state == IPSA_STATE_IDLE) {
1516 			/*
1517 			 * Cluster buffering case.  Tell caller that we're
1518 			 * handling the packet.
1519 			 */
1520 			sadb_buf_pkt(assoc, data_mp, ira);
1521 			return (NULL);
1522 		}
1523 
1524 		return (data_mp);
1525 	}
1526 
1527 	esp1dbg(espstack, ("esp_in_done: esp_strip_header() failed\n"));
1528 drop_and_bail:
1529 	IP_ESP_BUMP_STAT(ipss, in_discards);
1530 	ip_drop_packet(data_mp, B_TRUE, ira->ira_ill, counter,
1531 	    &espstack->esp_dropper);
1532 	BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1533 	return (NULL);
1534 }
1535 
1536 /*
1537  * Called upon failing the inbound ICV check. The message passed as
1538  * argument is freed.
1539  */
1540 static void
1541 esp_log_bad_auth(mblk_t *mp, ip_recv_attr_t *ira)
1542 {
1543 	ipsa_t		*assoc = ira->ira_ipsec_esp_sa;
1544 	netstack_t	*ns = ira->ira_ill->ill_ipst->ips_netstack;
1545 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1546 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1547 
1548 	/*
1549 	 * Log the event. Don't print to the console, block
1550 	 * potential denial-of-service attack.
1551 	 */
1552 	ESP_BUMP_STAT(espstack, bad_auth);
1553 
1554 	ipsec_assocfailure(info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
1555 	    "ESP Authentication failed for spi 0x%x, dst %s.\n",
1556 	    assoc->ipsa_spi, assoc->ipsa_dstaddr, assoc->ipsa_addrfam,
1557 	    espstack->ipsecesp_netstack);
1558 
1559 	IP_ESP_BUMP_STAT(ipss, in_discards);
1560 	ip_drop_packet(mp, B_TRUE, ira->ira_ill,
1561 	    DROPPER(ipss, ipds_esp_bad_auth),
1562 	    &espstack->esp_dropper);
1563 }
1564 
1565 
1566 /*
1567  * Invoked for outbound packets after ESP processing. If the packet
1568  * also requires AH, performs the AH SA selection and AH processing.
1569  *
1570  * Returns data_mp (possibly with AH added) unless data_mp was consumed
1571  * due to an error, or queued due to async. crypto or an ACQUIRE trigger.
1572  */
1573 static mblk_t *
1574 esp_do_outbound_ah(mblk_t *data_mp, ip_xmit_attr_t *ixa)
1575 {
1576 	ipsec_action_t *ap;
1577 
1578 	ap = ixa->ixa_ipsec_action;
1579 	if (ap == NULL) {
1580 		ipsec_policy_t *pp = ixa->ixa_ipsec_policy;
1581 		ap = pp->ipsp_act;
1582 	}
1583 
1584 	if (!ap->ipa_want_ah)
1585 		return (data_mp);
1586 
1587 	/*
1588 	 * Normally the AH SA would have already been put in place
1589 	 * but it could have been flushed so we need to look for it.
1590 	 */
1591 	if (ixa->ixa_ipsec_ah_sa == NULL) {
1592 		if (!ipsec_outbound_sa(data_mp, ixa, IPPROTO_AH)) {
1593 			sadb_acquire(data_mp, ixa, B_TRUE, B_FALSE);
1594 			return (NULL);
1595 		}
1596 	}
1597 	ASSERT(ixa->ixa_ipsec_ah_sa != NULL);
1598 
1599 	data_mp = ixa->ixa_ipsec_ah_sa->ipsa_output_func(data_mp, ixa);
1600 	return (data_mp);
1601 }
1602 
1603 
1604 /*
1605  * Kernel crypto framework callback invoked after completion of async
1606  * crypto requests for outbound packets.
1607  */
1608 static void
1609 esp_kcf_callback_outbound(void *arg, int status)
1610 {
1611 	mblk_t		*mp = (mblk_t *)arg;
1612 	mblk_t		*async_mp;
1613 	netstack_t	*ns;
1614 	ipsec_stack_t	*ipss;
1615 	ipsecesp_stack_t *espstack;
1616 	mblk_t		*data_mp;
1617 	ip_xmit_attr_t	ixas;
1618 	ipsec_crypto_t	*ic;
1619 	ill_t		*ill;
1620 
1621 	/*
1622 	 * First remove the ipsec_crypto_t mblk
1623 	 * Note that we need to ipsec_free_crypto_data(mp) once done with ic.
1624 	 */
1625 	async_mp = ipsec_remove_crypto_data(mp, &ic);
1626 	ASSERT(async_mp != NULL);
1627 
1628 	/*
1629 	 * Extract the ip_xmit_attr_t from the first mblk.
1630 	 * Verifies that the netstack and ill is still around; could
1631 	 * have vanished while kEf was doing its work.
1632 	 * On succesful return we have a nce_t and the ill/ipst can't
1633 	 * disappear until we do the nce_refrele in ixa_cleanup.
1634 	 */
1635 	data_mp = async_mp->b_cont;
1636 	async_mp->b_cont = NULL;
1637 	if (!ip_xmit_attr_from_mblk(async_mp, &ixas)) {
1638 		/* Disappeared on us - no ill/ipst for MIB */
1639 		/* We have nowhere to do stats since ixa_ipst could be NULL */
1640 		if (ixas.ixa_nce != NULL) {
1641 			ill = ixas.ixa_nce->nce_ill;
1642 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1643 			ip_drop_output("ipIfStatsOutDiscards", data_mp, ill);
1644 		}
1645 		freemsg(data_mp);
1646 		goto done;
1647 	}
1648 	ns = ixas.ixa_ipst->ips_netstack;
1649 	espstack = ns->netstack_ipsecesp;
1650 	ipss = ns->netstack_ipsec;
1651 	ill = ixas.ixa_nce->nce_ill;
1652 
1653 	if (status == CRYPTO_SUCCESS) {
1654 		/*
1655 		 * If a ICV was computed, it was stored by the
1656 		 * crypto framework at the end of the packet.
1657 		 */
1658 		ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
1659 
1660 		esp_set_usetime(ixas.ixa_ipsec_esp_sa, B_FALSE);
1661 		/* NAT-T packet. */
1662 		if (IPH_HDR_VERSION(ipha) == IP_VERSION &&
1663 		    ipha->ipha_protocol == IPPROTO_UDP)
1664 			esp_prepare_udp(ns, data_mp, ipha);
1665 
1666 		/* do AH processing if needed */
1667 		data_mp = esp_do_outbound_ah(data_mp, &ixas);
1668 		if (data_mp == NULL)
1669 			goto done;
1670 
1671 		(void) ip_output_post_ipsec(data_mp, &ixas);
1672 	} else {
1673 		/* Outbound shouldn't see invalid MAC */
1674 		ASSERT(status != CRYPTO_INVALID_MAC);
1675 
1676 		esp1dbg(espstack,
1677 		    ("esp_kcf_callback_outbound: crypto failed with 0x%x\n",
1678 		    status));
1679 		ESP_BUMP_STAT(espstack, crypto_failures);
1680 		ESP_BUMP_STAT(espstack, out_discards);
1681 		ip_drop_packet(data_mp, B_FALSE, ill,
1682 		    DROPPER(ipss, ipds_esp_crypto_failed),
1683 		    &espstack->esp_dropper);
1684 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1685 	}
1686 done:
1687 	ixa_cleanup(&ixas);
1688 	(void) ipsec_free_crypto_data(mp);
1689 }
1690 
1691 /*
1692  * Kernel crypto framework callback invoked after completion of async
1693  * crypto requests for inbound packets.
1694  */
1695 static void
1696 esp_kcf_callback_inbound(void *arg, int status)
1697 {
1698 	mblk_t		*mp = (mblk_t *)arg;
1699 	mblk_t		*async_mp;
1700 	netstack_t	*ns;
1701 	ipsecesp_stack_t *espstack;
1702 	ipsec_stack_t	*ipss;
1703 	mblk_t		*data_mp;
1704 	ip_recv_attr_t	iras;
1705 	ipsec_crypto_t	*ic;
1706 
1707 	/*
1708 	 * First remove the ipsec_crypto_t mblk
1709 	 * Note that we need to ipsec_free_crypto_data(mp) once done with ic.
1710 	 */
1711 	async_mp = ipsec_remove_crypto_data(mp, &ic);
1712 	ASSERT(async_mp != NULL);
1713 
1714 	/*
1715 	 * Extract the ip_recv_attr_t from the first mblk.
1716 	 * Verifies that the netstack and ill is still around; could
1717 	 * have vanished while kEf was doing its work.
1718 	 */
1719 	data_mp = async_mp->b_cont;
1720 	async_mp->b_cont = NULL;
1721 	if (!ip_recv_attr_from_mblk(async_mp, &iras)) {
1722 		/* The ill or ip_stack_t disappeared on us */
1723 		ip_drop_input("ip_recv_attr_from_mblk", data_mp, NULL);
1724 		freemsg(data_mp);
1725 		goto done;
1726 	}
1727 
1728 	ns = iras.ira_ill->ill_ipst->ips_netstack;
1729 	espstack = ns->netstack_ipsecesp;
1730 	ipss = ns->netstack_ipsec;
1731 
1732 	if (status == CRYPTO_SUCCESS) {
1733 		data_mp = esp_in_done(data_mp, &iras, ic);
1734 		if (data_mp == NULL)
1735 			goto done;
1736 
1737 		/* finish IPsec processing */
1738 		ip_input_post_ipsec(data_mp, &iras);
1739 	} else if (status == CRYPTO_INVALID_MAC) {
1740 		esp_log_bad_auth(data_mp, &iras);
1741 	} else {
1742 		esp1dbg(espstack,
1743 		    ("esp_kcf_callback: crypto failed with 0x%x\n",
1744 		    status));
1745 		ESP_BUMP_STAT(espstack, crypto_failures);
1746 		IP_ESP_BUMP_STAT(ipss, in_discards);
1747 		ip_drop_packet(data_mp, B_TRUE, iras.ira_ill,
1748 		    DROPPER(ipss, ipds_esp_crypto_failed),
1749 		    &espstack->esp_dropper);
1750 		BUMP_MIB(iras.ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1751 	}
1752 done:
1753 	ira_cleanup(&iras, B_TRUE);
1754 	(void) ipsec_free_crypto_data(mp);
1755 }
1756 
1757 /*
1758  * Invoked on crypto framework failure during inbound and outbound processing.
1759  */
1760 static void
1761 esp_crypto_failed(mblk_t *data_mp, boolean_t is_inbound, int kef_rc,
1762     ill_t *ill, ipsecesp_stack_t *espstack)
1763 {
1764 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
1765 
1766 	esp1dbg(espstack, ("crypto failed for %s ESP with 0x%x\n",
1767 	    is_inbound ? "inbound" : "outbound", kef_rc));
1768 	ip_drop_packet(data_mp, is_inbound, ill,
1769 	    DROPPER(ipss, ipds_esp_crypto_failed),
1770 	    &espstack->esp_dropper);
1771 	ESP_BUMP_STAT(espstack, crypto_failures);
1772 	if (is_inbound)
1773 		IP_ESP_BUMP_STAT(ipss, in_discards);
1774 	else
1775 		ESP_BUMP_STAT(espstack, out_discards);
1776 }
1777 
/*
 * Initialize a crypto call request for an asynchronous submission.
 *
 * _cr		MUST point to a modifiable crypto_call_req_t.
 * _mp		Stored as the argument handed to the callback.
 * _callback	Stored as the completion callback.
 *
 * The body is wrapped in do { } while (0) so that the macro really is
 * statement-equivalent: all three assignments stay together even when the
 * macro is the sole body of an unbraced if/else.  Invoke it with a trailing
 * semicolon.
 */
#define	ESP_INIT_CALLREQ(_cr, _mp, _callback) do {			\
	(_cr)->cr_flag = CRYPTO_SKIP_REQID | CRYPTO_ALWAYS_QUEUE;	\
	(_cr)->cr_callback_arg = (_mp);					\
	(_cr)->cr_callback_func = (_callback);				\
} while (0)
1786 
/*
 * Describe an ICV buffer as a raw crypto data item.
 *
 * mac		Pointer to the crypto data item to fill in.
 * icvlen	Length of the ICV in bytes.
 * icvbuf	Address of the ICV.  The argument is parenthesized before the
 *		cast so that an expression such as "ptr - len" is converted
 *		as a whole.
 */
#define	ESP_INIT_CRYPTO_MAC(mac, icvlen, icvbuf) {			\
	(mac)->cd_format = CRYPTO_DATA_RAW;				\
	(mac)->cd_offset = 0;						\
	(mac)->cd_length = (icvlen);					\
	(mac)->cd_raw.iov_base = (char *)(icvbuf);			\
	(mac)->cd_raw.iov_len = (icvlen);				\
}

/*
 * Describe "len" bytes at offset "off" of the message "mp" as a crypto data
 * item.  If the first mblk holds at least off + len bytes the item is set
 * up in raw format over that mblk; otherwise it is set up in mblk format
 * over the chain.
 */
#define	ESP_INIT_CRYPTO_DATA(data, mp, off, len) {			\
	if (MBLKL(mp) >= (len) + (off)) {				\
		(data)->cd_format = CRYPTO_DATA_RAW;			\
		(data)->cd_raw.iov_base = (char *)(mp)->b_rptr;		\
		(data)->cd_raw.iov_len = MBLKL(mp);			\
		(data)->cd_offset = (off);				\
	} else {							\
		(data)->cd_format = CRYPTO_DATA_MBLK;			\
		(data)->cd_mp = (mp);					\
		(data)->cd_offset = (off);				\
	}								\
	(data)->cd_length = (len);					\
}

/*
 * Describe two regions of the message "mp" (offset/length pairs 1 and 2)
 * as a dual crypto data item in mblk format.
 */
#define	ESP_INIT_CRYPTO_DUAL_DATA(data, mp, off1, len1, off2, len2) {	\
	(data)->dd_format = CRYPTO_DATA_MBLK;				\
	(data)->dd_mp = (mp);						\
	(data)->dd_len1 = (len1);					\
	(data)->dd_offset1 = (off1);					\
	(data)->dd_len2 = (len2);					\
	(data)->dd_offset2 = (off2);					\
}
1817 
1818 /*
1819  * Returns data_mp if successfully completed the request. Returns
1820  * NULL if it failed (and increments InDiscards) or if it is pending.
1821  */
1822 static mblk_t *
1823 esp_submit_req_inbound(mblk_t *esp_mp, ip_recv_attr_t *ira,
1824     ipsa_t *assoc, uint_t esph_offset)
1825 {
1826 	uint_t auth_offset, msg_len, auth_len;
1827 	crypto_call_req_t call_req, *callrp;
1828 	mblk_t *mp;
1829 	esph_t *esph_ptr;
1830 	int kef_rc;
1831 	uint_t icv_len = assoc->ipsa_mac_len;
1832 	crypto_ctx_template_t auth_ctx_tmpl;
1833 	boolean_t do_auth, do_encr, force;
1834 	uint_t encr_offset, encr_len;
1835 	uint_t iv_len = assoc->ipsa_iv_len;
1836 	crypto_ctx_template_t encr_ctx_tmpl;
1837 	ipsec_crypto_t	*ic, icstack;
1838 	uchar_t *iv_ptr;
1839 	netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack;
1840 	ipsec_stack_t *ipss = ns->netstack_ipsec;
1841 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1842 
1843 	do_auth = assoc->ipsa_auth_alg != SADB_AALG_NONE;
1844 	do_encr = assoc->ipsa_encr_alg != SADB_EALG_NULL;
1845 	force = (assoc->ipsa_flags & IPSA_F_ASYNC);
1846 
1847 #ifdef IPSEC_LATENCY_TEST
1848 	kef_rc = CRYPTO_SUCCESS;
1849 #else
1850 	kef_rc = CRYPTO_FAILED;
1851 #endif
1852 
1853 	/*
1854 	 * An inbound packet is of the form:
1855 	 * [IP,options,ESP,IV,data,ICV,pad]
1856 	 */
1857 	esph_ptr = (esph_t *)(esp_mp->b_rptr + esph_offset);
1858 	iv_ptr = (uchar_t *)(esph_ptr + 1);
1859 	/* Packet length starting at IP header ending after ESP ICV. */
1860 	msg_len = MBLKL(esp_mp);
1861 
1862 	encr_offset = esph_offset + sizeof (esph_t) + iv_len;
1863 	encr_len = msg_len - encr_offset;
1864 
1865 	/*
1866 	 * Counter mode algs need a nonce. This is setup in sadb_common_add().
1867 	 * If for some reason we are using a SA which does not have a nonce
1868 	 * then we must fail here.
1869 	 */
1870 	if ((assoc->ipsa_flags & IPSA_F_COUNTERMODE) &&
1871 	    (assoc->ipsa_nonce == NULL)) {
1872 		ip_drop_packet(esp_mp, B_TRUE, ira->ira_ill,
1873 		    DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper);
1874 		return (NULL);
1875 	}
1876 
1877 	if (force) {
1878 		/* We are doing asynch; allocate mblks to hold state */
1879 		if ((mp = ip_recv_attr_to_mblk(ira)) == NULL ||
1880 		    (mp = ipsec_add_crypto_data(mp, &ic)) == NULL) {
1881 			BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1882 			ip_drop_input("ipIfStatsInDiscards", esp_mp,
1883 			    ira->ira_ill);
1884 			return (NULL);
1885 		}
1886 		linkb(mp, esp_mp);
1887 		callrp = &call_req;
1888 		ESP_INIT_CALLREQ(callrp, mp, esp_kcf_callback_inbound);
1889 	} else {
1890 		/*
1891 		 * If we know we are going to do sync then ipsec_crypto_t
1892 		 * should be on the stack.
1893 		 */
1894 		ic = &icstack;
1895 		bzero(ic, sizeof (*ic));
1896 		callrp = NULL;
1897 	}
1898 
1899 	if (do_auth) {
1900 		/* authentication context template */
1901 		IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH,
1902 		    auth_ctx_tmpl);
1903 
1904 		/* ICV to be verified */
1905 		ESP_INIT_CRYPTO_MAC(&ic->ic_crypto_mac,
1906 		    icv_len, esp_mp->b_wptr - icv_len);
1907 
1908 		/* authentication starts at the ESP header */
1909 		auth_offset = esph_offset;
1910 		auth_len = msg_len - auth_offset - icv_len;
1911 		if (!do_encr) {
1912 			/* authentication only */
1913 			/* initialize input data argument */
1914 			ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data,
1915 			    esp_mp, auth_offset, auth_len);
1916 
1917 			/* call the crypto framework */
1918 			kef_rc = crypto_mac_verify(&assoc->ipsa_amech,
1919 			    &ic->ic_crypto_data,
1920 			    &assoc->ipsa_kcfauthkey, auth_ctx_tmpl,
1921 			    &ic->ic_crypto_mac, callrp);
1922 		}
1923 	}
1924 
1925 	if (do_encr) {
1926 		/* encryption template */
1927 		IPSEC_CTX_TMPL(assoc, ipsa_encrtmpl, IPSEC_ALG_ENCR,
1928 		    encr_ctx_tmpl);
1929 
1930 		/* Call the nonce update function. Also passes in IV */
1931 		(assoc->ipsa_noncefunc)(assoc, (uchar_t *)esph_ptr, encr_len,
1932 		    iv_ptr, &ic->ic_cmm, &ic->ic_crypto_data);
1933 
1934 		if (!do_auth) {
1935 			/* decryption only */
1936 			/* initialize input data argument */
1937 			ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data,
1938 			    esp_mp, encr_offset, encr_len);
1939 
1940 			/* call the crypto framework */
1941 			kef_rc = crypto_decrypt((crypto_mechanism_t *)
1942 			    &ic->ic_cmm, &ic->ic_crypto_data,
1943 			    &assoc->ipsa_kcfencrkey, encr_ctx_tmpl,
1944 			    NULL, callrp);
1945 		}
1946 	}
1947 
1948 	if (do_auth && do_encr) {
1949 		/* dual operation */
1950 		/* initialize input data argument */
1951 		ESP_INIT_CRYPTO_DUAL_DATA(&ic->ic_crypto_dual_data,
1952 		    esp_mp, auth_offset, auth_len,
1953 		    encr_offset, encr_len - icv_len);
1954 
1955 		/* specify IV */
1956 		ic->ic_crypto_dual_data.dd_miscdata = (char *)iv_ptr;
1957 
1958 		/* call the framework */
1959 		kef_rc = crypto_mac_verify_decrypt(&assoc->ipsa_amech,
1960 		    &assoc->ipsa_emech, &ic->ic_crypto_dual_data,
1961 		    &assoc->ipsa_kcfauthkey, &assoc->ipsa_kcfencrkey,
1962 		    auth_ctx_tmpl, encr_ctx_tmpl, &ic->ic_crypto_mac,
1963 		    NULL, callrp);
1964 	}
1965 
1966 	switch (kef_rc) {
1967 	case CRYPTO_SUCCESS:
1968 		ESP_BUMP_STAT(espstack, crypto_sync);
1969 		esp_mp = esp_in_done(esp_mp, ira, ic);
1970 		if (force) {
1971 			/* Free mp after we are done with ic */
1972 			mp = ipsec_free_crypto_data(mp);
1973 			(void) ip_recv_attr_free_mblk(mp);
1974 		}
1975 		return (esp_mp);
1976 	case CRYPTO_QUEUED:
1977 		/* esp_kcf_callback_inbound() will be invoked on completion */
1978 		ESP_BUMP_STAT(espstack, crypto_async);
1979 		return (NULL);
1980 	case CRYPTO_INVALID_MAC:
1981 		if (force) {
1982 			mp = ipsec_free_crypto_data(mp);
1983 			esp_mp = ip_recv_attr_free_mblk(mp);
1984 		}
1985 		ESP_BUMP_STAT(espstack, crypto_sync);
1986 		BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1987 		esp_log_bad_auth(esp_mp, ira);
1988 		/* esp_mp was passed to ip_drop_packet */
1989 		return (NULL);
1990 	}
1991 
1992 	if (force) {
1993 		mp = ipsec_free_crypto_data(mp);
1994 		esp_mp = ip_recv_attr_free_mblk(mp);
1995 	}
1996 	BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1997 	esp_crypto_failed(esp_mp, B_TRUE, kef_rc, ira->ira_ill, espstack);
1998 	/* esp_mp was passed to ip_drop_packet */
1999 	return (NULL);
2000 }
2001 
2002 /*
2003  * Compute the IP and UDP checksums -- common code for both keepalives and
2004  * actual ESP-in-UDP packets.  Be flexible with multiple mblks because ESP
2005  * uses mblk-insertion to insert the UDP header.
2006  * TODO - If there is an easy way to prep a packet for HW checksums, make
2007  * it happen here.
2008  * Note that this is used before both before calling ip_output_simple and
2009  * in the esp datapath. The former could use IXAF_SET_ULP_CKSUM but not the
2010  * latter.
2011  */
2012 static void
2013 esp_prepare_udp(netstack_t *ns, mblk_t *mp, ipha_t *ipha)
2014 {
2015 	int offset;
2016 	uint32_t cksum;
2017 	uint16_t *arr;
2018 	mblk_t *udpmp = mp;
2019 	uint_t hlen = IPH_HDR_LENGTH(ipha);
2020 
2021 	ASSERT(MBLKL(mp) >= sizeof (ipha_t));
2022 
2023 	ipha->ipha_hdr_checksum = 0;
2024 	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2025 
2026 	if (ns->netstack_udp->us_do_checksum) {
2027 		ASSERT(MBLKL(udpmp) >= sizeof (udpha_t));
2028 		/* arr points to the IP header. */
2029 		arr = (uint16_t *)ipha;
2030 		IP_STAT(ns->netstack_ip, ip_out_sw_cksum);
2031 		IP_STAT_UPDATE(ns->netstack_ip, ip_out_sw_cksum_bytes,
2032 		    ntohs(htons(ipha->ipha_length) - hlen));
2033 		/* arr[6-9] are the IP addresses. */
2034 		cksum = IP_UDP_CSUM_COMP + arr[6] + arr[7] + arr[8] + arr[9] +
2035 		    ntohs(htons(ipha->ipha_length) - hlen);
2036 		cksum = IP_CSUM(mp, hlen, cksum);
2037 		offset = hlen + UDP_CHECKSUM_OFFSET;
2038 		while (offset >= MBLKL(udpmp)) {
2039 			offset -= MBLKL(udpmp);
2040 			udpmp = udpmp->b_cont;
2041 		}
2042 		/* arr points to the UDP header's checksum field. */
2043 		arr = (uint16_t *)(udpmp->b_rptr + offset);
2044 		*arr = cksum;
2045 	}
2046 }
2047 
2048 /*
2049  * taskq handler so we can send the NAT-T keepalive on a separate thread.
2050  */
2051 static void
2052 actually_send_keepalive(void *arg)
2053 {
2054 	mblk_t *mp = (mblk_t *)arg;
2055 	ip_xmit_attr_t ixas;
2056 	netstack_t	*ns;
2057 	netstackid_t	stackid;
2058 
2059 	stackid = (netstackid_t)(uintptr_t)mp->b_prev;
2060 	mp->b_prev = NULL;
2061 	ns = netstack_find_by_stackid(stackid);
2062 	if (ns == NULL) {
2063 		/* Disappeared */
2064 		ip_drop_output("ipIfStatsOutDiscards", mp, NULL);
2065 		freemsg(mp);
2066 		return;
2067 	}
2068 
2069 	bzero(&ixas, sizeof (ixas));
2070 	ixas.ixa_zoneid = ALL_ZONES;
2071 	ixas.ixa_cred = kcred;
2072 	ixas.ixa_cpid = NOPID;
2073 	ixas.ixa_tsl = NULL;
2074 	ixas.ixa_ipst = ns->netstack_ip;
2075 	/* No ULP checksum; done by esp_prepare_udp */
2076 	ixas.ixa_flags = (IXAF_IS_IPV4 | IXAF_NO_IPSEC | IXAF_VERIFY_SOURCE);
2077 
2078 	(void) ip_output_simple(mp, &ixas);
2079 	ixa_cleanup(&ixas);
2080 	netstack_rele(ns);
2081 }
2082 
2083 /*
2084  * Send a one-byte UDP NAT-T keepalive.
2085  */
2086 void
2087 ipsecesp_send_keepalive(ipsa_t *assoc)
2088 {
2089 	mblk_t		*mp;
2090 	ipha_t		*ipha;
2091 	udpha_t		*udpha;
2092 	netstack_t	*ns = assoc->ipsa_netstack;
2093 
2094 	ASSERT(MUTEX_NOT_HELD(&assoc->ipsa_lock));
2095 
2096 	mp = allocb(sizeof (ipha_t) + sizeof (udpha_t) + 1, BPRI_HI);
2097 	if (mp == NULL)
2098 		return;
2099 	ipha = (ipha_t *)mp->b_rptr;
2100 	ipha->ipha_version_and_hdr_length = IP_SIMPLE_HDR_VERSION;
2101 	ipha->ipha_type_of_service = 0;
2102 	ipha->ipha_length = htons(sizeof (ipha_t) + sizeof (udpha_t) + 1);
2103 	/* Use the low-16 of the SPI so we have some clue where it came from. */
2104 	ipha->ipha_ident = *(((uint16_t *)(&assoc->ipsa_spi)) + 1);
2105 	ipha->ipha_fragment_offset_and_flags = 0;  /* Too small to fragment! */
2106 	ipha->ipha_ttl = 0xFF;
2107 	ipha->ipha_protocol = IPPROTO_UDP;
2108 	ipha->ipha_hdr_checksum = 0;
2109 	ipha->ipha_src = assoc->ipsa_srcaddr[0];
2110 	ipha->ipha_dst = assoc->ipsa_dstaddr[0];
2111 	udpha = (udpha_t *)(ipha + 1);
2112 	udpha->uha_src_port = (assoc->ipsa_local_nat_port != 0) ?
2113 	    assoc->ipsa_local_nat_port : htons(IPPORT_IKE_NATT);
2114 	udpha->uha_dst_port = (assoc->ipsa_remote_nat_port != 0) ?
2115 	    assoc->ipsa_remote_nat_port : htons(IPPORT_IKE_NATT);
2116 	udpha->uha_length = htons(sizeof (udpha_t) + 1);
2117 	udpha->uha_checksum = 0;
2118 	mp->b_wptr = (uint8_t *)(udpha + 1);
2119 	*(mp->b_wptr++) = 0xFF;
2120 
2121 	esp_prepare_udp(ns, mp, ipha);
2122 
2123 	/*
2124 	 * We're holding an isaf_t bucket lock, so pawn off the actual
2125 	 * packet transmission to another thread.  Just in case syncq
2126 	 * processing causes a same-bucket packet to be processed.
2127 	 */
2128 	mp->b_prev = (mblk_t *)(uintptr_t)ns->netstack_stackid;
2129 
2130 	if (taskq_dispatch(esp_taskq, actually_send_keepalive, mp,
2131 	    TQ_NOSLEEP) == 0) {
2132 		/* Assume no memory if taskq_dispatch() fails. */
2133 		mp->b_prev = NULL;
2134 		ip_drop_packet(mp, B_FALSE, NULL,
2135 		    DROPPER(ns->netstack_ipsec, ipds_esp_nomem),
2136 		    &ns->netstack_ipsecesp->esp_dropper);
2137 	}
2138 }
2139 
2140 /*
2141  * Returns mp if successfully completed the request. Returns
2142  * NULL if it failed (and increments InDiscards) or if it is pending.
2143  */
2144 static mblk_t *
2145 esp_submit_req_outbound(mblk_t *data_mp, ip_xmit_attr_t *ixa, ipsa_t *assoc,
2146     uchar_t *icv_buf, uint_t payload_len)
2147 {
2148 	uint_t auth_len;
2149 	crypto_call_req_t call_req, *callrp;
2150 	mblk_t *esp_mp;
2151 	esph_t *esph_ptr;
2152 	mblk_t *mp;
2153 	int kef_rc = CRYPTO_FAILED;
2154 	uint_t icv_len = assoc->ipsa_mac_len;
2155 	crypto_ctx_template_t auth_ctx_tmpl;
2156 	boolean_t do_auth, do_encr, force;
2157 	uint_t iv_len = assoc->ipsa_iv_len;
2158 	crypto_ctx_template_t encr_ctx_tmpl;
2159 	boolean_t is_natt = ((assoc->ipsa_flags & IPSA_F_NATT) != 0);
2160 	size_t esph_offset = (is_natt ? UDPH_SIZE : 0);
2161 	netstack_t	*ns = ixa->ixa_ipst->ips_netstack;
2162 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
2163 	ipsec_crypto_t	*ic, icstack;
2164 	uchar_t		*iv_ptr;
2165 	crypto_data_t	*cd_ptr = NULL;
2166 	ill_t		*ill = ixa->ixa_nce->nce_ill;
2167 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
2168 
2169 	esp3dbg(espstack, ("esp_submit_req_outbound:%s",
2170 	    is_natt ? "natt" : "not natt"));
2171 
2172 	do_encr = assoc->ipsa_encr_alg != SADB_EALG_NULL;
2173 	do_auth = assoc->ipsa_auth_alg != SADB_AALG_NONE;
2174 	force = (assoc->ipsa_flags & IPSA_F_ASYNC);
2175 
2176 #ifdef IPSEC_LATENCY_TEST
2177 	kef_rc = CRYPTO_SUCCESS;
2178 #else
2179 	kef_rc = CRYPTO_FAILED;
2180 #endif
2181 
2182 	/*
2183 	 * Outbound IPsec packets are of the form:
2184 	 * [IP,options] -> [ESP,IV] -> [data] -> [pad,ICV]
2185 	 * unless it's NATT, then it's
2186 	 * [IP,options] -> [udp][ESP,IV] -> [data] -> [pad,ICV]
2187 	 * Get a pointer to the mblk containing the ESP header.
2188 	 */
2189 	ASSERT(data_mp->b_cont != NULL);
2190 	esp_mp = data_mp->b_cont;
2191 	esph_ptr = (esph_t *)(esp_mp->b_rptr + esph_offset);
2192 	iv_ptr = (uchar_t *)(esph_ptr + 1);
2193 
2194 	/*
2195 	 * Combined mode algs need a nonce. This is setup in sadb_common_add().
2196 	 * If for some reason we are using a SA which does not have a nonce
2197 	 * then we must fail here.
2198 	 */
2199 	if ((assoc->ipsa_flags & IPSA_F_COUNTERMODE) &&
2200 	    (assoc->ipsa_nonce == NULL)) {
2201 		ip_drop_packet(data_mp, B_FALSE, NULL,
2202 		    DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper);
2203 		return (NULL);
2204 	}
2205 
2206 	if (force) {
2207 		/* We are doing asynch; allocate mblks to hold state */
2208 		if ((mp = ip_xmit_attr_to_mblk(ixa)) == NULL ||
2209 		    (mp = ipsec_add_crypto_data(mp, &ic)) == NULL) {
2210 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2211 			ip_drop_output("ipIfStatsOutDiscards", data_mp, ill);
2212 			freemsg(data_mp);
2213 			return (NULL);
2214 		}
2215 
2216 		linkb(mp, data_mp);
2217 		callrp = &call_req;
2218 		ESP_INIT_CALLREQ(callrp, mp, esp_kcf_callback_outbound);
2219 	} else {
2220 		/*
2221 		 * If we know we are going to do sync then ipsec_crypto_t
2222 		 * should be on the stack.
2223 		 */
2224 		ic = &icstack;
2225 		bzero(ic, sizeof (*ic));
2226 		callrp = NULL;
2227 	}
2228 
2229 
2230 	if (do_auth) {
2231 		/* authentication context template */
2232 		IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH,
2233 		    auth_ctx_tmpl);
2234 
2235 		/* where to store the computed mac */
2236 		ESP_INIT_CRYPTO_MAC(&ic->ic_crypto_mac,
2237 		    icv_len, icv_buf);
2238 
2239 		/* authentication starts at the ESP header */
2240 		auth_len = payload_len + iv_len + sizeof (esph_t);
2241 		if (!do_encr) {
2242 			/* authentication only */
2243 			/* initialize input data argument */
2244 			ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data,
2245 			    esp_mp, esph_offset, auth_len);
2246 
2247 			/* call the crypto framework */
2248 			kef_rc = crypto_mac(&assoc->ipsa_amech,
2249 			    &ic->ic_crypto_data,
2250 			    &assoc->ipsa_kcfauthkey, auth_ctx_tmpl,
2251 			    &ic->ic_crypto_mac, callrp);
2252 		}
2253 	}
2254 
2255 	if (do_encr) {
2256 		/* encryption context template */
2257 		IPSEC_CTX_TMPL(assoc, ipsa_encrtmpl, IPSEC_ALG_ENCR,
2258 		    encr_ctx_tmpl);
2259 		/* Call the nonce update function. */
2260 		(assoc->ipsa_noncefunc)(assoc, (uchar_t *)esph_ptr, payload_len,
2261 		    iv_ptr, &ic->ic_cmm, &ic->ic_crypto_data);
2262 
2263 		if (!do_auth) {
2264 			/* encryption only, skip mblk that contains ESP hdr */
2265 			/* initialize input data argument */
2266 			ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data,
2267 			    esp_mp->b_cont, 0, payload_len);
2268 
2269 			/*
2270 			 * For combined mode ciphers, the ciphertext is the same
2271 			 * size as the clear text, the ICV should follow the
2272 			 * ciphertext. To convince the kcf to allow in-line
2273 			 * encryption, with an ICV, use ipsec_out_crypto_mac
2274 			 * to point to the same buffer as the data. The calling
2275 			 * function need to ensure the buffer is large enough to
2276 			 * include the ICV.
2277 			 *
2278 			 * The IV is already written to the packet buffer, the
2279 			 * nonce setup function copied it to the params struct
2280 			 * for the cipher to use.
2281 			 */
2282 			if (assoc->ipsa_flags & IPSA_F_COMBINED) {
2283 				bcopy(&ic->ic_crypto_data,
2284 				    &ic->ic_crypto_mac,
2285 				    sizeof (crypto_data_t));
2286 				ic->ic_crypto_mac.cd_length =
2287 				    payload_len + icv_len;
2288 				cd_ptr = &ic->ic_crypto_mac;
2289 			}
2290 
2291 			/* call the crypto framework */
2292 			kef_rc = crypto_encrypt((crypto_mechanism_t *)
2293 			    &ic->ic_cmm, &ic->ic_crypto_data,
2294 			    &assoc->ipsa_kcfencrkey, encr_ctx_tmpl,
2295 			    cd_ptr, callrp);
2296 
2297 		}
2298 	}
2299 
2300 	if (do_auth && do_encr) {
2301 		/*
2302 		 * Encryption and authentication:
2303 		 * Pass the pointer to the mblk chain starting at the ESP
2304 		 * header to the framework. Skip the ESP header mblk
2305 		 * for encryption, which is reflected by an encryption
2306 		 * offset equal to the length of that mblk. Start
2307 		 * the authentication at the ESP header, i.e. use an
2308 		 * authentication offset of zero.
2309 		 */
2310 		ESP_INIT_CRYPTO_DUAL_DATA(&ic->ic_crypto_dual_data,
2311 		    esp_mp, MBLKL(esp_mp), payload_len, esph_offset, auth_len);
2312 
2313 		/* specify IV */
2314 		ic->ic_crypto_dual_data.dd_miscdata = (char *)iv_ptr;
2315 
2316 		/* call the framework */
2317 		kef_rc = crypto_encrypt_mac(&assoc->ipsa_emech,
2318 		    &assoc->ipsa_amech, NULL,
2319 		    &assoc->ipsa_kcfencrkey, &assoc->ipsa_kcfauthkey,
2320 		    encr_ctx_tmpl, auth_ctx_tmpl,
2321 		    &ic->ic_crypto_dual_data,
2322 		    &ic->ic_crypto_mac, callrp);
2323 	}
2324 
2325 	switch (kef_rc) {
2326 	case CRYPTO_SUCCESS:
2327 		ESP_BUMP_STAT(espstack, crypto_sync);
2328 		esp_set_usetime(assoc, B_FALSE);
2329 		if (force) {
2330 			mp = ipsec_free_crypto_data(mp);
2331 			data_mp = ip_xmit_attr_free_mblk(mp);
2332 		}
2333 		if (is_natt)
2334 			esp_prepare_udp(ns, data_mp, (ipha_t *)data_mp->b_rptr);
2335 		return (data_mp);
2336 	case CRYPTO_QUEUED:
2337 		/* esp_kcf_callback_outbound() will be invoked on completion */
2338 		ESP_BUMP_STAT(espstack, crypto_async);
2339 		return (NULL);
2340 	}
2341 
2342 	if (force) {
2343 		mp = ipsec_free_crypto_data(mp);
2344 		data_mp = ip_xmit_attr_free_mblk(mp);
2345 	}
2346 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2347 	esp_crypto_failed(data_mp, B_FALSE, kef_rc, NULL, espstack);
2348 	/* data_mp was passed to ip_drop_packet */
2349 	return (NULL);
2350 }
2351 
2352 /*
2353  * Handle outbound IPsec processing for IPv4 and IPv6
2354  *
2355  * Returns data_mp if successfully completed the request. Returns
2356  * NULL if it failed (and increments InDiscards) or if it is pending.
2357  */
2358 static mblk_t *
2359 esp_outbound(mblk_t *data_mp, ip_xmit_attr_t *ixa)
2360 {
2361 	mblk_t *espmp, *tailmp;
2362 	ipha_t *ipha;
2363 	ip6_t *ip6h;
2364 	esph_t *esph_ptr, *iv_ptr;
2365 	uint_t af;
2366 	uint8_t *nhp;
2367 	uintptr_t divpoint, datalen, adj, padlen, i, alloclen;
2368 	uintptr_t esplen = sizeof (esph_t);
2369 	uint8_t protocol;
2370 	ipsa_t *assoc;
2371 	uint_t iv_len, block_size, mac_len = 0;
2372 	uchar_t *icv_buf;
2373 	udpha_t *udpha;
2374 	boolean_t is_natt = B_FALSE;
2375 	netstack_t	*ns = ixa->ixa_ipst->ips_netstack;
2376 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
2377 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
2378 	ill_t		*ill = ixa->ixa_nce->nce_ill;
2379 	boolean_t	need_refrele = B_FALSE;
2380 
2381 	ESP_BUMP_STAT(espstack, out_requests);
2382 
2383 	/*
2384 	 * <sigh> We have to copy the message here, because TCP (for example)
2385 	 * keeps a dupb() of the message lying around for retransmission.
2386 	 * Since ESP changes the whole of the datagram, we have to create our
2387 	 * own copy lest we clobber TCP's data.  Since we have to copy anyway,
2388 	 * we might as well make use of msgpullup() and get the mblk into one
2389 	 * contiguous piece!
2390 	 */
2391 	tailmp = msgpullup(data_mp, -1);
2392 	if (tailmp == NULL) {
2393 		esp0dbg(("esp_outbound: msgpullup() failed, "
2394 		    "dropping packet.\n"));
2395 		ip_drop_packet(data_mp, B_FALSE, ill,
2396 		    DROPPER(ipss, ipds_esp_nomem),
2397 		    &espstack->esp_dropper);
2398 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2399 		return (NULL);
2400 	}
2401 	freemsg(data_mp);
2402 	data_mp = tailmp;
2403 
2404 	assoc = ixa->ixa_ipsec_esp_sa;
2405 	ASSERT(assoc != NULL);
2406 
2407 	/*
2408 	 * Get the outer IP header in shape to escape this system..
2409 	 */
2410 	if (is_system_labeled() && (assoc->ipsa_otsl != NULL)) {
2411 		/*
2412 		 * Need to update packet with any CIPSO option and update
2413 		 * ixa_tsl to capture the new label.
2414 		 * We allocate a separate ixa for that purpose.
2415 		 */
2416 		ixa = ip_xmit_attr_duplicate(ixa);
2417 		if (ixa == NULL) {
2418 			ip_drop_packet(data_mp, B_FALSE, ill,
2419 			    DROPPER(ipss, ipds_esp_nomem),
2420 			    &espstack->esp_dropper);
2421 			return (NULL);
2422 		}
2423 		need_refrele = B_TRUE;
2424 
2425 		label_hold(assoc->ipsa_otsl);
2426 		ip_xmit_attr_replace_tsl(ixa, assoc->ipsa_otsl);
2427 
2428 		data_mp = sadb_whack_label(data_mp, assoc, ixa,
2429 		    DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper);
2430 		if (data_mp == NULL) {
2431 			/* Packet dropped by sadb_whack_label */
2432 			ixa_refrele(ixa);
2433 			return (NULL);
2434 		}
2435 	}
2436 
2437 	/*
2438 	 * Reality check....
2439 	 */
2440 	ipha = (ipha_t *)data_mp->b_rptr;  /* So we can call esp_acquire(). */
2441 
2442 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
2443 		ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
2444 
2445 		af = AF_INET;
2446 		divpoint = IPH_HDR_LENGTH(ipha);
2447 		datalen = ntohs(ipha->ipha_length) - divpoint;
2448 		nhp = (uint8_t *)&ipha->ipha_protocol;
2449 	} else {
2450 		ip_pkt_t ipp;
2451 
2452 		ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
2453 
2454 		af = AF_INET6;
2455 		ip6h = (ip6_t *)ipha;
2456 		bzero(&ipp, sizeof (ipp));
2457 		divpoint = ip_find_hdr_v6(data_mp, ip6h, B_FALSE, &ipp, NULL);
2458 		if (ipp.ipp_dstopts != NULL &&
2459 		    ipp.ipp_dstopts->ip6d_nxt != IPPROTO_ROUTING) {
2460 			/*
2461 			 * Destination options are tricky.  If we get in here,
2462 			 * then we have a terminal header following the
2463 			 * destination options.  We need to adjust backwards
2464 			 * so we insert ESP BEFORE the destination options
2465 			 * bag.  (So that the dstopts get encrypted!)
2466 			 *
2467 			 * Since this is for outbound packets only, we know
2468 			 * that non-terminal destination options only precede
2469 			 * routing headers.
2470 			 */
2471 			divpoint -= ipp.ipp_dstoptslen;
2472 		}
2473 		datalen = ntohs(ip6h->ip6_plen) + sizeof (ip6_t) - divpoint;
2474 
2475 		if (ipp.ipp_rthdr != NULL) {
2476 			nhp = &ipp.ipp_rthdr->ip6r_nxt;
2477 		} else if (ipp.ipp_hopopts != NULL) {
2478 			nhp = &ipp.ipp_hopopts->ip6h_nxt;
2479 		} else {
2480 			ASSERT(divpoint == sizeof (ip6_t));
2481 			/* It's probably IP + ESP. */
2482 			nhp = &ip6h->ip6_nxt;
2483 		}
2484 	}
2485 
2486 	mac_len = assoc->ipsa_mac_len;
2487 
2488 	if (assoc->ipsa_flags & IPSA_F_NATT) {
2489 		/* wedge in UDP header */
2490 		is_natt = B_TRUE;
2491 		esplen += UDPH_SIZE;
2492 	}
2493 
2494 	/*
2495 	 * Set up ESP header and encryption padding for ENCR PI request.
2496 	 */
2497 
2498 	/* Determine the padding length.  Pad to 4-bytes for no-encryption. */
2499 	if (assoc->ipsa_encr_alg != SADB_EALG_NULL) {
2500 		iv_len = assoc->ipsa_iv_len;
2501 		block_size = assoc->ipsa_datalen;
2502 
2503 		/*
2504 		 * Pad the data to the length of the cipher block size.
2505 		 * Include the two additional bytes (hence the - 2) for the
2506 		 * padding length and the next header.  Take this into account
2507 		 * when calculating the actual length of the padding.
2508 		 */
2509 		ASSERT(ISP2(iv_len));
2510 		padlen = ((unsigned)(block_size - datalen - 2)) &
2511 		    (block_size - 1);
2512 	} else {
2513 		iv_len = 0;
2514 		padlen = ((unsigned)(sizeof (uint32_t) - datalen - 2)) &
2515 		    (sizeof (uint32_t) - 1);
2516 	}
2517 
2518 	/* Allocate ESP header and IV. */
2519 	esplen += iv_len;
2520 
2521 	/*
2522 	 * Update association byte-count lifetimes.  Don't forget to take
2523 	 * into account the padding length and next-header (hence the + 2).
2524 	 *
2525 	 * Use the amount of data fed into the "encryption algorithm".  This
2526 	 * is the IV, the data length, the padding length, and the final two
2527 	 * bytes (padlen, and next-header).
2528 	 *
2529 	 */
2530 
2531 	if (!esp_age_bytes(assoc, datalen + padlen + iv_len + 2, B_FALSE)) {
2532 		ip_drop_packet(data_mp, B_FALSE, ill,
2533 		    DROPPER(ipss, ipds_esp_bytes_expire),
2534 		    &espstack->esp_dropper);
2535 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2536 		if (need_refrele)
2537 			ixa_refrele(ixa);
2538 		return (NULL);
2539 	}
2540 
2541 	espmp = allocb(esplen, BPRI_HI);
2542 	if (espmp == NULL) {
2543 		ESP_BUMP_STAT(espstack, out_discards);
2544 		esp1dbg(espstack, ("esp_outbound: can't allocate espmp.\n"));
2545 		ip_drop_packet(data_mp, B_FALSE, ill,
2546 		    DROPPER(ipss, ipds_esp_nomem),
2547 		    &espstack->esp_dropper);
2548 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2549 		if (need_refrele)
2550 			ixa_refrele(ixa);
2551 		return (NULL);
2552 	}
2553 	espmp->b_wptr += esplen;
2554 	esph_ptr = (esph_t *)espmp->b_rptr;
2555 
2556 	if (is_natt) {
2557 		esp3dbg(espstack, ("esp_outbound: NATT"));
2558 
2559 		udpha = (udpha_t *)espmp->b_rptr;
2560 		udpha->uha_src_port = (assoc->ipsa_local_nat_port != 0) ?
2561 		    assoc->ipsa_local_nat_port : htons(IPPORT_IKE_NATT);
2562 		udpha->uha_dst_port = (assoc->ipsa_remote_nat_port != 0) ?
2563 		    assoc->ipsa_remote_nat_port : htons(IPPORT_IKE_NATT);
2564 		/*
2565 		 * Set the checksum to 0, so that the esp_prepare_udp() call
2566 		 * can do the right thing.
2567 		 */
2568 		udpha->uha_checksum = 0;
2569 		esph_ptr = (esph_t *)(udpha + 1);
2570 	}
2571 
2572 	esph_ptr->esph_spi = assoc->ipsa_spi;
2573 
2574 	esph_ptr->esph_replay = htonl(atomic_inc_32_nv(&assoc->ipsa_replay));
2575 	if (esph_ptr->esph_replay == 0 && assoc->ipsa_replay_wsize != 0) {
2576 		/*
2577 		 * XXX We have replay counter wrapping.
2578 		 * We probably want to nuke this SA (and its peer).
2579 		 */
2580 		ipsec_assocfailure(info.mi_idnum, 0, 0,
2581 		    SL_ERROR | SL_CONSOLE | SL_WARN,
2582 		    "Outbound ESP SA (0x%x, %s) has wrapped sequence.\n",
2583 		    esph_ptr->esph_spi, assoc->ipsa_dstaddr, af,
2584 		    espstack->ipsecesp_netstack);
2585 
2586 		ESP_BUMP_STAT(espstack, out_discards);
2587 		sadb_replay_delete(assoc);
2588 		ip_drop_packet(data_mp, B_FALSE, ill,
2589 		    DROPPER(ipss, ipds_esp_replay),
2590 		    &espstack->esp_dropper);
2591 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2592 		if (need_refrele)
2593 			ixa_refrele(ixa);
2594 		return (NULL);
2595 	}
2596 
2597 	iv_ptr = (esph_ptr + 1);
2598 	/*
2599 	 * iv_ptr points to the mblk which will contain the IV once we have
2600 	 * written it there. This mblk will be part of a mblk chain that
2601 	 * will make up the packet.
2602 	 *
2603 	 * For counter mode algorithms, the IV is a 64 bit quantity, it
2604 	 * must NEVER repeat in the lifetime of the SA, otherwise an
2605 	 * attacker who had recorded enough packets might be able to
2606 	 * determine some clear text.
2607 	 *
2608 	 * To ensure this does not happen, the IV is stored in the SA and
2609 	 * incremented for each packet, the IV is then copied into the
2610 	 * "packet" for transmission to the receiving system. The IV will
2611 	 * also be copied into the nonce, when the packet is encrypted.
2612 	 *
2613 	 * CBC mode algorithms use a random IV for each packet. We do not
2614 	 * require the highest quality random bits, but for best security
2615 	 * with CBC mode ciphers, the value must be unlikely to repeat and
2616 	 * must not be known in advance to an adversary capable of influencing
2617 	 * the clear text.
2618 	 */
2619 	if (!update_iv((uint8_t *)iv_ptr, espstack->esp_pfkey_q, assoc,
2620 	    espstack)) {
2621 		ip_drop_packet(data_mp, B_FALSE, ill,
2622 		    DROPPER(ipss, ipds_esp_iv_wrap), &espstack->esp_dropper);
2623 		if (need_refrele)
2624 			ixa_refrele(ixa);
2625 		return (NULL);
2626 	}
2627 
2628 	/* Fix the IP header. */
2629 	alloclen = padlen + 2 + mac_len;
2630 	adj = alloclen + (espmp->b_wptr - espmp->b_rptr);
2631 
2632 	protocol = *nhp;
2633 
2634 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
2635 		ipha->ipha_length = htons(ntohs(ipha->ipha_length) + adj);
2636 		if (is_natt) {
2637 			*nhp = IPPROTO_UDP;
2638 			udpha->uha_length = htons(ntohs(ipha->ipha_length) -
2639 			    IPH_HDR_LENGTH(ipha));
2640 		} else {
2641 			*nhp = IPPROTO_ESP;
2642 		}
2643 		ipha->ipha_hdr_checksum = 0;
2644 		ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
2645 	} else {
2646 		ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) + adj);
2647 		*nhp = IPPROTO_ESP;
2648 	}
2649 
2650 	/* I've got the two ESP mblks, now insert them. */
2651 
2652 	esp2dbg(espstack, ("data_mp before outbound ESP adjustment:\n"));
2653 	esp2dbg(espstack, (dump_msg(data_mp)));
2654 
2655 	if (!esp_insert_esp(data_mp, espmp, divpoint, espstack)) {
2656 		ESP_BUMP_STAT(espstack, out_discards);
2657 		/* NOTE:  esp_insert_esp() only fails if there's no memory. */
2658 		ip_drop_packet(data_mp, B_FALSE, ill,
2659 		    DROPPER(ipss, ipds_esp_nomem),
2660 		    &espstack->esp_dropper);
2661 		freeb(espmp);
2662 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2663 		if (need_refrele)
2664 			ixa_refrele(ixa);
2665 		return (NULL);
2666 	}
2667 
2668 	/* Append padding (and leave room for ICV). */
2669 	for (tailmp = data_mp; tailmp->b_cont != NULL; tailmp = tailmp->b_cont)
2670 		;
2671 	if (tailmp->b_wptr + alloclen > tailmp->b_datap->db_lim) {
2672 		tailmp->b_cont = allocb(alloclen, BPRI_HI);
2673 		if (tailmp->b_cont == NULL) {
2674 			ESP_BUMP_STAT(espstack, out_discards);
2675 			esp0dbg(("esp_outbound:  Can't allocate tailmp.\n"));
2676 			ip_drop_packet(data_mp, B_FALSE, ill,
2677 			    DROPPER(ipss, ipds_esp_nomem),
2678 			    &espstack->esp_dropper);
2679 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2680 			if (need_refrele)
2681 				ixa_refrele(ixa);
2682 			return (NULL);
2683 		}
2684 		tailmp = tailmp->b_cont;
2685 	}
2686 
2687 	/*
2688 	 * If there's padding, N bytes of padding must be of the form 0x1,
2689 	 * 0x2, 0x3... 0xN.
2690 	 */
2691 	for (i = 0; i < padlen; ) {
2692 		i++;
2693 		*tailmp->b_wptr++ = i;
2694 	}
2695 	*tailmp->b_wptr++ = i;
2696 	*tailmp->b_wptr++ = protocol;
2697 
2698 	esp2dbg(espstack, ("data_Mp before encryption:\n"));
2699 	esp2dbg(espstack, (dump_msg(data_mp)));
2700 
2701 	/*
2702 	 * Okay.  I've set up the pre-encryption ESP.  Let's do it!
2703 	 */
2704 
2705 	if (mac_len > 0) {
2706 		ASSERT(tailmp->b_wptr + mac_len <= tailmp->b_datap->db_lim);
2707 		icv_buf = tailmp->b_wptr;
2708 		tailmp->b_wptr += mac_len;
2709 	} else {
2710 		icv_buf = NULL;
2711 	}
2712 
2713 	data_mp = esp_submit_req_outbound(data_mp, ixa, assoc, icv_buf,
2714 	    datalen + padlen + 2);
2715 	if (need_refrele)
2716 		ixa_refrele(ixa);
2717 	return (data_mp);
2718 }
2719 
2720 /*
2721  * IP calls this to validate the ICMP errors that
2722  * we got from the network.
2723  */
2724 mblk_t *
2725 ipsecesp_icmp_error(mblk_t *data_mp, ip_recv_attr_t *ira)
2726 {
2727 	netstack_t	*ns = ira->ira_ill->ill_ipst->ips_netstack;
2728 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
2729 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
2730 
2731 	/*
2732 	 * Unless we get an entire packet back, this function is useless.
2733 	 * Why?
2734 	 *
2735 	 * 1.)	Partial packets are useless, because the "next header"
2736 	 *	is at the end of the decrypted ESP packet.  Without the
2737 	 *	whole packet, this is useless.
2738 	 *
2739 	 * 2.)	If we every use a stateful cipher, such as a stream or a
2740 	 *	one-time pad, we can't do anything.
2741 	 *
2742 	 * Since the chances of us getting an entire packet back are very
2743 	 * very small, we discard here.
2744 	 */
2745 	IP_ESP_BUMP_STAT(ipss, in_discards);
2746 	ip_drop_packet(data_mp, B_TRUE, ira->ira_ill,
2747 	    DROPPER(ipss, ipds_esp_icmp),
2748 	    &espstack->esp_dropper);
2749 	return (NULL);
2750 }
2751 
2752 /*
2753  * Construct an SADB_REGISTER message with the current algorithms.
2754  * This function gets called when 'ipsecalgs -s' is run or when
2755  * in.iked (or other KMD) starts.
2756  */
2757 static boolean_t
2758 esp_register_out(uint32_t sequence, uint32_t pid, uint_t serial,
2759     ipsecesp_stack_t *espstack, cred_t *cr)
2760 {
2761 	mblk_t *pfkey_msg_mp, *keysock_out_mp;
2762 	sadb_msg_t *samsg;
2763 	sadb_supported_t *sasupp_auth = NULL;
2764 	sadb_supported_t *sasupp_encr = NULL;
2765 	sadb_alg_t *saalg;
2766 	uint_t allocsize = sizeof (*samsg);
2767 	uint_t i, numalgs_snap;
2768 	int current_aalgs;
2769 	ipsec_alginfo_t **authalgs;
2770 	uint_t num_aalgs;
2771 	int current_ealgs;
2772 	ipsec_alginfo_t **encralgs;
2773 	uint_t num_ealgs;
2774 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
2775 	sadb_sens_t *sens;
2776 	size_t sens_len = 0;
2777 	sadb_ext_t *nextext;
2778 	ts_label_t *sens_tsl = NULL;
2779 
2780 	/* Allocate the KEYSOCK_OUT. */
2781 	keysock_out_mp = sadb_keysock_out(serial);
2782 	if (keysock_out_mp == NULL) {
2783 		esp0dbg(("esp_register_out: couldn't allocate mblk.\n"));
2784 		return (B_FALSE);
2785 	}
2786 
2787 	if (is_system_labeled() && (cr != NULL)) {
2788 		sens_tsl = crgetlabel(cr);
2789 		if (sens_tsl != NULL) {
2790 			sens_len = sadb_sens_len_from_label(sens_tsl);
2791 			allocsize += sens_len;
2792 		}
2793 	}
2794 
2795 	/*
2796 	 * Allocate the PF_KEY message that follows KEYSOCK_OUT.
2797 	 */
2798 
2799 	rw_enter(&ipss->ipsec_alg_lock, RW_READER);
2800 	/*
2801 	 * Fill SADB_REGISTER message's algorithm descriptors.  Hold
2802 	 * down the lock while filling it.
2803 	 *
2804 	 * Return only valid algorithms, so the number of algorithms
2805 	 * to send up may be less than the number of algorithm entries
2806 	 * in the table.
2807 	 */
2808 	authalgs = ipss->ipsec_alglists[IPSEC_ALG_AUTH];
2809 	for (num_aalgs = 0, i = 0; i < IPSEC_MAX_ALGS; i++)
2810 		if (authalgs[i] != NULL && ALG_VALID(authalgs[i]))
2811 			num_aalgs++;
2812 
2813 	if (num_aalgs != 0) {
2814 		allocsize += (num_aalgs * sizeof (*saalg));
2815 		allocsize += sizeof (*sasupp_auth);
2816 	}
2817 	encralgs = ipss->ipsec_alglists[IPSEC_ALG_ENCR];
2818 	for (num_ealgs = 0, i = 0; i < IPSEC_MAX_ALGS; i++)
2819 		if (encralgs[i] != NULL && ALG_VALID(encralgs[i]))
2820 			num_ealgs++;
2821 
2822 	if (num_ealgs != 0) {
2823 		allocsize += (num_ealgs * sizeof (*saalg));
2824 		allocsize += sizeof (*sasupp_encr);
2825 	}
2826 	keysock_out_mp->b_cont = allocb(allocsize, BPRI_HI);
2827 	if (keysock_out_mp->b_cont == NULL) {
2828 		rw_exit(&ipss->ipsec_alg_lock);
2829 		freemsg(keysock_out_mp);
2830 		return (B_FALSE);
2831 	}
2832 	pfkey_msg_mp = keysock_out_mp->b_cont;
2833 	pfkey_msg_mp->b_wptr += allocsize;
2834 
2835 	nextext = (sadb_ext_t *)(pfkey_msg_mp->b_rptr + sizeof (*samsg));
2836 
2837 	if (num_aalgs != 0) {
2838 		sasupp_auth = (sadb_supported_t *)nextext;
2839 		saalg = (sadb_alg_t *)(sasupp_auth + 1);
2840 
2841 		ASSERT(((ulong_t)saalg & 0x7) == 0);
2842 
2843 		numalgs_snap = 0;
2844 		for (i = 0;
2845 		    ((i < IPSEC_MAX_ALGS) && (numalgs_snap < num_aalgs));
2846 		    i++) {
2847 			if (authalgs[i] == NULL || !ALG_VALID(authalgs[i]))
2848 				continue;
2849 
2850 			saalg->sadb_alg_id = authalgs[i]->alg_id;
2851 			saalg->sadb_alg_ivlen = 0;
2852 			saalg->sadb_alg_minbits	= authalgs[i]->alg_ef_minbits;
2853 			saalg->sadb_alg_maxbits	= authalgs[i]->alg_ef_maxbits;
2854 			saalg->sadb_x_alg_increment =
2855 			    authalgs[i]->alg_increment;
2856 			saalg->sadb_x_alg_saltbits = SADB_8TO1(
2857 			    authalgs[i]->alg_saltlen);
2858 			numalgs_snap++;
2859 			saalg++;
2860 		}
2861 		ASSERT(numalgs_snap == num_aalgs);
2862 #ifdef DEBUG
2863 		/*
2864 		 * Reality check to make sure I snagged all of the
2865 		 * algorithms.
2866 		 */
2867 		for (; i < IPSEC_MAX_ALGS; i++) {
2868 			if (authalgs[i] != NULL && ALG_VALID(authalgs[i])) {
2869 				cmn_err(CE_PANIC, "esp_register_out()! "
2870 				    "Missed aalg #%d.\n", i);
2871 			}
2872 		}
2873 #endif /* DEBUG */
2874 		nextext = (sadb_ext_t *)saalg;
2875 	}
2876 
2877 	if (num_ealgs != 0) {
2878 		sasupp_encr = (sadb_supported_t *)nextext;
2879 		saalg = (sadb_alg_t *)(sasupp_encr + 1);
2880 
2881 		numalgs_snap = 0;
2882 		for (i = 0;
2883 		    ((i < IPSEC_MAX_ALGS) && (numalgs_snap < num_ealgs)); i++) {
2884 			if (encralgs[i] == NULL || !ALG_VALID(encralgs[i]))
2885 				continue;
2886 			saalg->sadb_alg_id = encralgs[i]->alg_id;
2887 			saalg->sadb_alg_ivlen = encralgs[i]->alg_ivlen;
2888 			saalg->sadb_alg_minbits	= encralgs[i]->alg_ef_minbits;
2889 			saalg->sadb_alg_maxbits	= encralgs[i]->alg_ef_maxbits;
2890 			/*
2891 			 * We could advertise the ICV length, except there
2892 			 * is not a value in sadb_x_algb to do this.
2893 			 * saalg->sadb_alg_maclen = encralgs[i]->alg_maclen;
2894 			 */
2895 			saalg->sadb_x_alg_increment =
2896 			    encralgs[i]->alg_increment;
2897 			saalg->sadb_x_alg_saltbits =
2898 			    SADB_8TO1(encralgs[i]->alg_saltlen);
2899 
2900 			numalgs_snap++;
2901 			saalg++;
2902 		}
2903 		ASSERT(numalgs_snap == num_ealgs);
2904 #ifdef DEBUG
2905 		/*
2906 		 * Reality check to make sure I snagged all of the
2907 		 * algorithms.
2908 		 */
2909 		for (; i < IPSEC_MAX_ALGS; i++) {
2910 			if (encralgs[i] != NULL && ALG_VALID(encralgs[i])) {
2911 				cmn_err(CE_PANIC, "esp_register_out()! "
2912 				    "Missed ealg #%d.\n", i);
2913 			}
2914 		}
2915 #endif /* DEBUG */
2916 		nextext = (sadb_ext_t *)saalg;
2917 	}
2918 
2919 	current_aalgs = num_aalgs;
2920 	current_ealgs = num_ealgs;
2921 
2922 	rw_exit(&ipss->ipsec_alg_lock);
2923 
2924 	if (sens_tsl != NULL) {
2925 		sens = (sadb_sens_t *)nextext;
2926 		sadb_sens_from_label(sens, SADB_EXT_SENSITIVITY,
2927 		    sens_tsl, sens_len);
2928 
2929 		nextext = (sadb_ext_t *)(((uint8_t *)sens) + sens_len);
2930 	}
2931 
2932 	/* Now fill the rest of the SADB_REGISTER message. */
2933 
2934 	samsg = (sadb_msg_t *)pfkey_msg_mp->b_rptr;
2935 	samsg->sadb_msg_version = PF_KEY_V2;
2936 	samsg->sadb_msg_type = SADB_REGISTER;
2937 	samsg->sadb_msg_errno = 0;
2938 	samsg->sadb_msg_satype = SADB_SATYPE_ESP;
2939 	samsg->sadb_msg_len = SADB_8TO64(allocsize);
2940 	samsg->sadb_msg_reserved = 0;
2941 	/*
2942 	 * Assume caller has sufficient sequence/pid number info.  If it's one
2943 	 * from me over a new alg., I could give two hoots about sequence.
2944 	 */
2945 	samsg->sadb_msg_seq = sequence;
2946 	samsg->sadb_msg_pid = pid;
2947 
2948 	if (sasupp_auth != NULL) {
2949 		sasupp_auth->sadb_supported_len = SADB_8TO64(
2950 		    sizeof (*sasupp_auth) + sizeof (*saalg) * current_aalgs);
2951 		sasupp_auth->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH;
2952 		sasupp_auth->sadb_supported_reserved = 0;
2953 	}
2954 
2955 	if (sasupp_encr != NULL) {
2956 		sasupp_encr->sadb_supported_len = SADB_8TO64(
2957 		    sizeof (*sasupp_encr) + sizeof (*saalg) * current_ealgs);
2958 		sasupp_encr->sadb_supported_exttype =
2959 		    SADB_EXT_SUPPORTED_ENCRYPT;
2960 		sasupp_encr->sadb_supported_reserved = 0;
2961 	}
2962 
2963 	if (espstack->esp_pfkey_q != NULL)
2964 		putnext(espstack->esp_pfkey_q, keysock_out_mp);
2965 	else {
2966 		freemsg(keysock_out_mp);
2967 		return (B_FALSE);
2968 	}
2969 
2970 	return (B_TRUE);
2971 }
2972 
2973 /*
2974  * Invoked when the algorithm table changes. Causes SADB_REGISTER
2975  * messages continaining the current list of algorithms to be
2976  * sent up to the ESP listeners.
2977  */
2978 void
2979 ipsecesp_algs_changed(netstack_t *ns)
2980 {
2981 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
2982 
2983 	/*
2984 	 * Time to send a PF_KEY SADB_REGISTER message to ESP listeners
2985 	 * everywhere.  (The function itself checks for NULL esp_pfkey_q.)
2986 	 */
2987 	(void) esp_register_out(0, 0, 0, espstack, NULL);
2988 }
2989 
2990 /*
2991  * Stub function that taskq_dispatch() invokes to take the mblk (in arg)
2992  * and send() it into ESP and IP again.
2993  */
2994 static void
2995 inbound_task(void *arg)
2996 {
2997 	mblk_t		*mp = (mblk_t *)arg;
2998 	mblk_t		*async_mp;
2999 	ip_recv_attr_t	iras;
3000 
3001 	async_mp = mp;
3002 	mp = async_mp->b_cont;
3003 	async_mp->b_cont = NULL;
3004 	if (!ip_recv_attr_from_mblk(async_mp, &iras)) {
3005 		/* The ill or ip_stack_t disappeared on us */
3006 		ip_drop_input("ip_recv_attr_from_mblk", mp, NULL);
3007 		freemsg(mp);
3008 		goto done;
3009 	}
3010 
3011 	esp_inbound_restart(mp, &iras);
3012 done:
3013 	ira_cleanup(&iras, B_TRUE);
3014 }
3015 
3016 /*
3017  * Restart ESP after the SA has been added.
3018  */
3019 static void
3020 esp_inbound_restart(mblk_t *mp, ip_recv_attr_t *ira)
3021 {
3022 	esph_t		*esph;
3023 	netstack_t	*ns = ira->ira_ill->ill_ipst->ips_netstack;
3024 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
3025 
3026 	esp2dbg(espstack, ("in ESP inbound_task"));
3027 	ASSERT(espstack != NULL);
3028 
3029 	mp = ipsec_inbound_esp_sa(mp, ira, &esph);
3030 	if (mp == NULL)
3031 		return;
3032 
3033 	ASSERT(esph != NULL);
3034 	ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
3035 	ASSERT(ira->ira_ipsec_esp_sa != NULL);
3036 
3037 	mp = ira->ira_ipsec_esp_sa->ipsa_input_func(mp, esph, ira);
3038 	if (mp == NULL) {
3039 		/*
3040 		 * Either it failed or is pending. In the former case
3041 		 * ipIfStatsInDiscards was increased.
3042 		 */
3043 		return;
3044 	}
3045 
3046 	ip_input_post_ipsec(mp, ira);
3047 }
3048 
3049 /*
3050  * Now that weak-key passed, actually ADD the security association, and
3051  * send back a reply ADD message.
3052  */
3053 static int
3054 esp_add_sa_finish(mblk_t *mp, sadb_msg_t *samsg, keysock_in_t *ksi,
3055     int *diagnostic, ipsecesp_stack_t *espstack)
3056 {
3057 	isaf_t *primary = NULL, *secondary;
3058 	boolean_t clone = B_FALSE, is_inbound = B_FALSE;
3059 	ipsa_t *larval = NULL;
3060 	ipsacq_t *acqrec;
3061 	iacqf_t *acq_bucket;
3062 	mblk_t *acq_msgs = NULL;
3063 	int rc;
3064 	mblk_t *lpkt;
3065 	int error;
3066 	ipsa_query_t sq;
3067 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
3068 
3069 	/*
3070 	 * Locate the appropriate table(s).
3071 	 */
3072 	sq.spp = &espstack->esp_sadb;	/* XXX */
3073 	error = sadb_form_query(ksi, IPSA_Q_SA|IPSA_Q_DST,
3074 	    IPSA_Q_SA|IPSA_Q_DST|IPSA_Q_INBOUND|IPSA_Q_OUTBOUND,
3075 	    &sq, diagnostic);
3076 	if (error)
3077 		return (error);
3078 
3079 	/*
3080 	 * Use the direction flags provided by the KMD to determine
3081 	 * if the inbound or outbound table should be the primary
3082 	 * for this SA. If these flags were absent then make this
3083 	 * decision based on the addresses.
3084 	 */
3085 	if (sq.assoc->sadb_sa_flags & IPSA_F_INBOUND) {
3086 		primary = sq.inbound;
3087 		secondary = sq.outbound;
3088 		is_inbound = B_TRUE;
3089 		if (sq.assoc->sadb_sa_flags & IPSA_F_OUTBOUND)
3090 			clone = B_TRUE;
3091 	} else if (sq.assoc->sadb_sa_flags & IPSA_F_OUTBOUND) {
3092 		primary = sq.outbound;
3093 		secondary = sq.inbound;
3094 	}
3095 
3096 	if (primary == NULL) {
3097 		/*
3098 		 * The KMD did not set a direction flag, determine which
3099 		 * table to insert the SA into based on addresses.
3100 		 */
3101 		switch (ksi->ks_in_dsttype) {
3102 		case KS_IN_ADDR_MBCAST:
3103 			clone = B_TRUE;	/* All mcast SAs can be bidirectional */
3104 			sq.assoc->sadb_sa_flags |= IPSA_F_OUTBOUND;
3105 			/* FALLTHRU */
3106 		/*
3107 		 * If the source address is either one of mine, or unspecified
3108 		 * (which is best summed up by saying "not 'not mine'"),
3109 		 * then the association is potentially bi-directional,
3110 		 * in that it can be used for inbound traffic and outbound
3111 		 * traffic.  The best example of such an SA is a multicast
3112 		 * SA (which allows me to receive the outbound traffic).
3113 		 */
3114 		case KS_IN_ADDR_ME:
3115 			sq.assoc->sadb_sa_flags |= IPSA_F_INBOUND;
3116 			primary = sq.inbound;
3117 			secondary = sq.outbound;
3118 			if (ksi->ks_in_srctype != KS_IN_ADDR_NOTME)
3119 				clone = B_TRUE;
3120 			is_inbound = B_TRUE;
3121 			break;
3122 		/*
3123 		 * If the source address literally not mine (either
3124 		 * unspecified or not mine), then this SA may have an
3125 		 * address that WILL be mine after some configuration.
3126 		 * We pay the price for this by making it a bi-directional
3127 		 * SA.
3128 		 */
3129 		case KS_IN_ADDR_NOTME:
3130 			sq.assoc->sadb_sa_flags |= IPSA_F_OUTBOUND;
3131 			primary = sq.outbound;
3132 			secondary = sq.inbound;
3133 			if (ksi->ks_in_srctype != KS_IN_ADDR_ME) {
3134 				sq.assoc->sadb_sa_flags |= IPSA_F_INBOUND;
3135 				clone = B_TRUE;
3136 			}
3137 			break;
3138 		default:
3139 			*diagnostic = SADB_X_DIAGNOSTIC_BAD_DST;
3140 			return (EINVAL);
3141 		}
3142 	}
3143 
3144 	/*
3145 	 * Find a ACQUIRE list entry if possible.  If we've added an SA that
3146 	 * suits the needs of an ACQUIRE list entry, we can eliminate the
3147 	 * ACQUIRE list entry and transmit the enqueued packets.  Use the
3148 	 * high-bit of the sequence number to queue it.  Key off destination
3149 	 * addr, and change acqrec's state.
3150 	 */
3151 
3152 	if (samsg->sadb_msg_seq & IACQF_LOWEST_SEQ) {
3153 		acq_bucket = &(sq.sp->sdb_acq[sq.outhash]);
3154 		mutex_enter(&acq_bucket->iacqf_lock);
3155 		for (acqrec = acq_bucket->iacqf_ipsacq; acqrec != NULL;
3156 		    acqrec = acqrec->ipsacq_next) {
3157 			mutex_enter(&acqrec->ipsacq_lock);
3158 			/*
3159 			 * Q:  I only check sequence.  Should I check dst?
3160 			 * A: Yes, check dest because those are the packets
3161 			 *    that are queued up.
3162 			 */
3163 			if (acqrec->ipsacq_seq == samsg->sadb_msg_seq &&
3164 			    IPSA_ARE_ADDR_EQUAL(sq.dstaddr,
3165 			    acqrec->ipsacq_dstaddr, acqrec->ipsacq_addrfam))
3166 				break;
3167 			mutex_exit(&acqrec->ipsacq_lock);
3168 		}
3169 		if (acqrec != NULL) {
3170 			/*
3171 			 * AHA!  I found an ACQUIRE record for this SA.
3172 			 * Grab the msg list, and free the acquire record.
3173 			 * I already am holding the lock for this record,
3174 			 * so all I have to do is free it.
3175 			 */
3176 			acq_msgs = acqrec->ipsacq_mp;
3177 			acqrec->ipsacq_mp = NULL;
3178 			mutex_exit(&acqrec->ipsacq_lock);
3179 			sadb_destroy_acquire(acqrec,
3180 			    espstack->ipsecesp_netstack);
3181 		}
3182 		mutex_exit(&acq_bucket->iacqf_lock);
3183 	}
3184 
3185 	/*
3186 	 * Find PF_KEY message, and see if I'm an update.  If so, find entry
3187 	 * in larval list (if there).
3188 	 */
3189 	if (samsg->sadb_msg_type == SADB_UPDATE) {
3190 		mutex_enter(&sq.inbound->isaf_lock);
3191 		larval = ipsec_getassocbyspi(sq.inbound, sq.assoc->sadb_sa_spi,
3192 		    ALL_ZEROES_PTR, sq.dstaddr, sq.dst->sin_family);
3193 		mutex_exit(&sq.inbound->isaf_lock);
3194 
3195 		if ((larval == NULL) ||
3196 		    (larval->ipsa_state != IPSA_STATE_LARVAL)) {
3197 			*diagnostic = SADB_X_DIAGNOSTIC_SA_NOTFOUND;
3198 			if (larval != NULL) {
3199 				IPSA_REFRELE(larval);
3200 			}
3201 			esp0dbg(("Larval update, but larval disappeared.\n"));
3202 			return (ESRCH);
3203 		} /* Else sadb_common_add unlinks it for me! */
3204 	}
3205 
3206 	if (larval != NULL) {
3207 		/*
3208 		 * Hold again, because sadb_common_add() consumes a reference,
3209 		 * and we don't want to clear_lpkt() without a reference.
3210 		 */
3211 		IPSA_REFHOLD(larval);
3212 	}
3213 
3214 	rc = sadb_common_add(espstack->esp_pfkey_q,
3215 	    mp, samsg, ksi, primary, secondary, larval, clone, is_inbound,
3216 	    diagnostic, espstack->ipsecesp_netstack, &espstack->esp_sadb);
3217 
3218 	if (larval != NULL) {
3219 		if (rc == 0) {
3220 			lpkt = sadb_clear_lpkt(larval);
3221 			if (lpkt != NULL) {
3222 				rc = !taskq_dispatch(esp_taskq, inbound_task,
3223 				    lpkt, TQ_NOSLEEP);
3224 			}
3225 		}
3226 		IPSA_REFRELE(larval);
3227 	}
3228 
3229 	/*
3230 	 * How much more stack will I create with all of these
3231 	 * esp_outbound() calls?
3232 	 */
3233 
3234 	/* Handle the packets queued waiting for the SA */
3235 	while (acq_msgs != NULL) {
3236 		mblk_t		*asyncmp;
3237 		mblk_t		*data_mp;
3238 		ip_xmit_attr_t	ixas;
3239 		ill_t		*ill;
3240 
3241 		asyncmp = acq_msgs;
3242 		acq_msgs = acq_msgs->b_next;
3243 		asyncmp->b_next = NULL;
3244 
3245 		/*
3246 		 * Extract the ip_xmit_attr_t from the first mblk.
3247 		 * Verifies that the netstack and ill is still around; could
3248 		 * have vanished while iked was doing its work.
3249 		 * On succesful return we have a nce_t and the ill/ipst can't
3250 		 * disappear until we do the nce_refrele in ixa_cleanup.
3251 		 */
3252 		data_mp = asyncmp->b_cont;
3253 		asyncmp->b_cont = NULL;
3254 		if (!ip_xmit_attr_from_mblk(asyncmp, &ixas)) {
3255 			ESP_BUMP_STAT(espstack, out_discards);
3256 			ip_drop_packet(data_mp, B_FALSE, NULL,
3257 			    DROPPER(ipss, ipds_sadb_acquire_timeout),
3258 			    &espstack->esp_dropper);
3259 		} else if (rc != 0) {
3260 			ill = ixas.ixa_nce->nce_ill;
3261 			ESP_BUMP_STAT(espstack, out_discards);
3262 			ip_drop_packet(data_mp, B_FALSE, ill,
3263 			    DROPPER(ipss, ipds_sadb_acquire_timeout),
3264 			    &espstack->esp_dropper);
3265 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3266 		} else {
3267 			esp_outbound_finish(data_mp, &ixas);
3268 		}
3269 		ixa_cleanup(&ixas);
3270 	}
3271 
3272 	return (rc);
3273 }
3274 
3275 /*
3276  * Process one of the queued messages (from ipsacq_mp) once the SA
3277  * has been added.
3278  */
3279 static void
3280 esp_outbound_finish(mblk_t *data_mp, ip_xmit_attr_t *ixa)
3281 {
3282 	netstack_t	*ns = ixa->ixa_ipst->ips_netstack;
3283 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
3284 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
3285 	ill_t		*ill = ixa->ixa_nce->nce_ill;
3286 
3287 	if (!ipsec_outbound_sa(data_mp, ixa, IPPROTO_ESP)) {
3288 		ESP_BUMP_STAT(espstack, out_discards);
3289 		ip_drop_packet(data_mp, B_FALSE, ill,
3290 		    DROPPER(ipss, ipds_sadb_acquire_timeout),
3291 		    &espstack->esp_dropper);
3292 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3293 		return;
3294 	}
3295 
3296 	data_mp = esp_outbound(data_mp, ixa);
3297 	if (data_mp == NULL)
3298 		return;
3299 
3300 	/* do AH processing if needed */
3301 	data_mp = esp_do_outbound_ah(data_mp, ixa);
3302 	if (data_mp == NULL)
3303 		return;
3304 
3305 	(void) ip_output_post_ipsec(data_mp, ixa);
3306 }
3307 
3308 /*
3309  * Add new ESP security association.  This may become a generic AH/ESP
3310  * routine eventually.
3311  */
3312 static int
3313 esp_add_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic, netstack_t *ns)
3314 {
3315 	sadb_sa_t *assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SA];
3316 	sadb_address_t *srcext =
3317 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_SRC];
3318 	sadb_address_t *dstext =
3319 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3320 	sadb_address_t *isrcext =
3321 	    (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_INNER_SRC];
3322 	sadb_address_t *idstext =
3323 	    (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_INNER_DST];
3324 	sadb_address_t *nttext_loc =
3325 	    (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_NATT_LOC];
3326 	sadb_address_t *nttext_rem =
3327 	    (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_NATT_REM];
3328 	sadb_key_t *akey = (sadb_key_t *)ksi->ks_in_extv[SADB_EXT_KEY_AUTH];
3329 	sadb_key_t *ekey = (sadb_key_t *)ksi->ks_in_extv[SADB_EXT_KEY_ENCRYPT];
3330 	struct sockaddr_in *src, *dst;
3331 	struct sockaddr_in *natt_loc, *natt_rem;
3332 	struct sockaddr_in6 *natt_loc6, *natt_rem6;
3333 	sadb_lifetime_t *soft =
3334 	    (sadb_lifetime_t *)ksi->ks_in_extv[SADB_EXT_LIFETIME_SOFT];
3335 	sadb_lifetime_t *hard =
3336 	    (sadb_lifetime_t *)ksi->ks_in_extv[SADB_EXT_LIFETIME_HARD];
3337 	sadb_lifetime_t *idle =
3338 	    (sadb_lifetime_t *)ksi->ks_in_extv[SADB_X_EXT_LIFETIME_IDLE];
3339 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
3340 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
3341 
3342 
3343 
3344 	/* I need certain extensions present for an ADD message. */
3345 	if (srcext == NULL) {
3346 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_SRC;
3347 		return (EINVAL);
3348 	}
3349 	if (dstext == NULL) {
3350 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_DST;
3351 		return (EINVAL);
3352 	}
3353 	if (isrcext == NULL && idstext != NULL) {
3354 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_INNER_SRC;
3355 		return (EINVAL);
3356 	}
3357 	if (isrcext != NULL && idstext == NULL) {
3358 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_INNER_DST;
3359 		return (EINVAL);
3360 	}
3361 	if (assoc == NULL) {
3362 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_SA;
3363 		return (EINVAL);
3364 	}
3365 	if (ekey == NULL && assoc->sadb_sa_encrypt != SADB_EALG_NULL) {
3366 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_EKEY;
3367 		return (EINVAL);
3368 	}
3369 
3370 	src = (struct sockaddr_in *)(srcext + 1);
3371 	dst = (struct sockaddr_in *)(dstext + 1);
3372 	natt_loc = (struct sockaddr_in *)(nttext_loc + 1);
3373 	natt_loc6 = (struct sockaddr_in6 *)(nttext_loc + 1);
3374 	natt_rem = (struct sockaddr_in *)(nttext_rem + 1);
3375 	natt_rem6 = (struct sockaddr_in6 *)(nttext_rem + 1);
3376 
3377 	/* Sundry ADD-specific reality checks. */
3378 	/* XXX STATS :  Logging/stats here? */
3379 
3380 	if ((assoc->sadb_sa_state != SADB_SASTATE_MATURE) &&
3381 	    (assoc->sadb_sa_state != SADB_X_SASTATE_ACTIVE_ELSEWHERE)) {
3382 		*diagnostic = SADB_X_DIAGNOSTIC_BAD_SASTATE;
3383 		return (EINVAL);
3384 	}
3385 	if (assoc->sadb_sa_encrypt == SADB_EALG_NONE) {
3386 		*diagnostic = SADB_X_DIAGNOSTIC_BAD_EALG;
3387 		return (EINVAL);
3388 	}
3389 
3390 #ifndef IPSEC_LATENCY_TEST
3391 	if (assoc->sadb_sa_encrypt == SADB_EALG_NULL &&
3392 	    assoc->sadb_sa_auth == SADB_AALG_NONE) {
3393 		*diagnostic = SADB_X_DIAGNOSTIC_BAD_AALG;
3394 		return (EINVAL);
3395 	}
3396 #endif
3397 
3398 	if (assoc->sadb_sa_flags & ~espstack->esp_sadb.s_addflags) {
3399 		*diagnostic = SADB_X_DIAGNOSTIC_BAD_SAFLAGS;
3400 		return (EINVAL);
3401 	}
3402 
3403 	if ((*diagnostic = sadb_hardsoftchk(hard, soft, idle)) != 0) {
3404 		return (EINVAL);
3405 	}
3406 	ASSERT(src->sin_family == dst->sin_family);
3407 
3408 	if (assoc->sadb_sa_flags & SADB_X_SAFLAGS_NATT_LOC) {
3409 		if (nttext_loc == NULL) {
3410 			*diagnostic = SADB_X_DIAGNOSTIC_MISSING_NATT_LOC;
3411 			return (EINVAL);
3412 		}
3413 
3414 		if (natt_loc->sin_family == AF_INET6 &&
3415 		    !IN6_IS_ADDR_V4MAPPED(&natt_loc6->sin6_addr)) {
3416 			*diagnostic = SADB_X_DIAGNOSTIC_MALFORMED_NATT_LOC;
3417 			return (EINVAL);
3418 		}
3419 	}
3420 
3421 	if (assoc->sadb_sa_flags & SADB_X_SAFLAGS_NATT_REM) {
3422 		if (nttext_rem == NULL) {
3423 			*diagnostic = SADB_X_DIAGNOSTIC_MISSING_NATT_REM;
3424 			return (EINVAL);
3425 		}
3426 		if (natt_rem->sin_family == AF_INET6 &&
3427 		    !IN6_IS_ADDR_V4MAPPED(&natt_rem6->sin6_addr)) {
3428 			*diagnostic = SADB_X_DIAGNOSTIC_MALFORMED_NATT_REM;
3429 			return (EINVAL);
3430 		}
3431 	}
3432 
3433 
3434 	/* Stuff I don't support, for now.  XXX Diagnostic? */
3435 	if (ksi->ks_in_extv[SADB_EXT_LIFETIME_CURRENT] != NULL)
3436 		return (EOPNOTSUPP);
3437 
3438 	if ((*diagnostic = sadb_labelchk(ksi)) != 0)
3439 		return (EINVAL);
3440 
3441 	/*
3442 	 * XXX Policy :  I'm not checking identities at this time,
3443 	 * but if I did, I'd do them here, before I sent
3444 	 * the weak key check up to the algorithm.
3445 	 */
3446 
3447 	rw_enter(&ipss->ipsec_alg_lock, RW_READER);
3448 
3449 	/*
3450 	 * First locate the authentication algorithm.
3451 	 */
3452 #ifdef IPSEC_LATENCY_TEST
3453 	if (akey != NULL && assoc->sadb_sa_auth != SADB_AALG_NONE) {
3454 #else
3455 	if (akey != NULL) {
3456 #endif
3457 		ipsec_alginfo_t *aalg;
3458 
3459 		aalg = ipss->ipsec_alglists[IPSEC_ALG_AUTH]
3460 		    [assoc->sadb_sa_auth];
3461 		if (aalg == NULL || !ALG_VALID(aalg)) {
3462 			rw_exit(&ipss->ipsec_alg_lock);
3463 			esp1dbg(espstack, ("Couldn't find auth alg #%d.\n",
3464 			    assoc->sadb_sa_auth));
3465 			*diagnostic = SADB_X_DIAGNOSTIC_BAD_AALG;
3466 			return (EINVAL);
3467 		}
3468 
3469 		/*
3470 		 * Sanity check key sizes.
3471 		 * Note: It's not possible to use SADB_AALG_NONE because
3472 		 * this auth_alg is not defined with ALG_FLAG_VALID. If this
3473 		 * ever changes, the same check for SADB_AALG_NONE and
3474 		 * a auth_key != NULL should be made here ( see below).
3475 		 */
3476 		if (!ipsec_valid_key_size(akey->sadb_key_bits, aalg)) {
3477 			rw_exit(&ipss->ipsec_alg_lock);
3478 			*diagnostic = SADB_X_DIAGNOSTIC_BAD_AKEYBITS;
3479 			return (EINVAL);
3480 		}
3481 		ASSERT(aalg->alg_mech_type != CRYPTO_MECHANISM_INVALID);
3482 
3483 		/* check key and fix parity if needed */
3484 		if (ipsec_check_key(aalg->alg_mech_type, akey, B_TRUE,
3485 		    diagnostic) != 0) {
3486 			rw_exit(&ipss->ipsec_alg_lock);
3487 			return (EINVAL);
3488 		}
3489 	}
3490 
3491 	/*
3492 	 * Then locate the encryption algorithm.
3493 	 */
3494 	if (ekey != NULL) {
3495 		uint_t keybits;
3496 		ipsec_alginfo_t *ealg;
3497 
3498 		ealg = ipss->ipsec_alglists[IPSEC_ALG_ENCR]
3499 		    [assoc->sadb_sa_encrypt];
3500 		if (ealg == NULL || !ALG_VALID(ealg)) {
3501 			rw_exit(&ipss->ipsec_alg_lock);
3502 			esp1dbg(espstack, ("Couldn't find encr alg #%d.\n",
3503 			    assoc->sadb_sa_encrypt));
3504 			*diagnostic = SADB_X_DIAGNOSTIC_BAD_EALG;
3505 			return (EINVAL);
3506 		}
3507 
3508 		/*
3509 		 * Sanity check key sizes. If the encryption algorithm is
3510 		 * SADB_EALG_NULL but the encryption key is NOT
3511 		 * NULL then complain.
3512 		 *
3513 		 * The keying material includes salt bits if required by
3514 		 * algorithm and optionally the Initial IV, check the
3515 		 * length of whats left.
3516 		 */
3517 		keybits = ekey->sadb_key_bits;
3518 		keybits -= ekey->sadb_key_reserved;
3519 		keybits -= SADB_8TO1(ealg->alg_saltlen);
3520 		if ((assoc->sadb_sa_encrypt == SADB_EALG_NULL) ||
3521 		    (!ipsec_valid_key_size(keybits, ealg))) {
3522 			rw_exit(&ipss->ipsec_alg_lock);
3523 			*diagnostic = SADB_X_DIAGNOSTIC_BAD_EKEYBITS;
3524 			return (EINVAL);
3525 		}
3526 		ASSERT(ealg->alg_mech_type != CRYPTO_MECHANISM_INVALID);
3527 
3528 		/* check key */
3529 		if (ipsec_check_key(ealg->alg_mech_type, ekey, B_FALSE,
3530 		    diagnostic) != 0) {
3531 			rw_exit(&ipss->ipsec_alg_lock);
3532 			return (EINVAL);
3533 		}
3534 	}
3535 	rw_exit(&ipss->ipsec_alg_lock);
3536 
3537 	return (esp_add_sa_finish(mp, (sadb_msg_t *)mp->b_cont->b_rptr, ksi,
3538 	    diagnostic, espstack));
3539 }
3540 
3541 /*
3542  * Update a security association.  Updates come in two varieties.  The first
3543  * is an update of lifetimes on a non-larval SA.  The second is an update of
3544  * a larval SA, which ends up looking a lot more like an add.
3545  */
3546 static int
3547 esp_update_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic,
3548     ipsecesp_stack_t *espstack, uint8_t sadb_msg_type)
3549 {
3550 	sadb_sa_t *assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SA];
3551 	mblk_t    *buf_pkt;
3552 	int rcode;
3553 
3554 	sadb_address_t *dstext =
3555 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3556 
3557 	if (dstext == NULL) {
3558 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_DST;
3559 		return (EINVAL);
3560 	}
3561 
3562 	rcode = sadb_update_sa(mp, ksi, &buf_pkt, &espstack->esp_sadb,
3563 	    diagnostic, espstack->esp_pfkey_q, esp_add_sa,
3564 	    espstack->ipsecesp_netstack, sadb_msg_type);
3565 
3566 	if ((assoc->sadb_sa_state != SADB_X_SASTATE_ACTIVE) ||
3567 	    (rcode != 0)) {
3568 		return (rcode);
3569 	}
3570 
3571 	HANDLE_BUF_PKT(esp_taskq, espstack->ipsecesp_netstack->netstack_ipsec,
3572 	    espstack->esp_dropper, buf_pkt);
3573 
3574 	return (rcode);
3575 }
3576 
3577 /* XXX refactor me */
3578 /*
3579  * Delete a security association.  This is REALLY likely to be code common to
3580  * both AH and ESP.  Find the association, then unlink it.
3581  */
3582 static int
3583 esp_del_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic,
3584     ipsecesp_stack_t *espstack, uint8_t sadb_msg_type)
3585 {
3586 	sadb_sa_t *assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SA];
3587 	sadb_address_t *dstext =
3588 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3589 	sadb_address_t *srcext =
3590 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_SRC];
3591 	struct sockaddr_in *sin;
3592 
3593 	if (assoc == NULL) {
3594 		if (dstext != NULL) {
3595 			sin = (struct sockaddr_in *)(dstext + 1);
3596 		} else if (srcext != NULL) {
3597 			sin = (struct sockaddr_in *)(srcext + 1);
3598 		} else {
3599 			*diagnostic = SADB_X_DIAGNOSTIC_MISSING_SA;
3600 			return (EINVAL);
3601 		}
3602 		return (sadb_purge_sa(mp, ksi,
3603 		    (sin->sin_family == AF_INET6) ? &espstack->esp_sadb.s_v6 :
3604 		    &espstack->esp_sadb.s_v4, diagnostic,
3605 		    espstack->esp_pfkey_q));
3606 	}
3607 
3608 	return (sadb_delget_sa(mp, ksi, &espstack->esp_sadb, diagnostic,
3609 	    espstack->esp_pfkey_q, sadb_msg_type));
3610 }
3611 
3612 /* XXX refactor me */
3613 /*
3614  * Convert the entire contents of all of ESP's SA tables into PF_KEY SADB_DUMP
3615  * messages.
3616  */
3617 static void
3618 esp_dump(mblk_t *mp, keysock_in_t *ksi, ipsecesp_stack_t *espstack)
3619 {
3620 	int error;
3621 	sadb_msg_t *samsg;
3622 
3623 	/*
3624 	 * Dump each fanout, bailing if error is non-zero.
3625 	 */
3626 
3627 	error = sadb_dump(espstack->esp_pfkey_q, mp, ksi,
3628 	    &espstack->esp_sadb.s_v4);
3629 	if (error != 0)
3630 		goto bail;
3631 
3632 	error = sadb_dump(espstack->esp_pfkey_q, mp, ksi,
3633 	    &espstack->esp_sadb.s_v6);
3634 bail:
3635 	ASSERT(mp->b_cont != NULL);
3636 	samsg = (sadb_msg_t *)mp->b_cont->b_rptr;
3637 	samsg->sadb_msg_errno = (uint8_t)error;
3638 	sadb_pfkey_echo(espstack->esp_pfkey_q, mp,
3639 	    (sadb_msg_t *)mp->b_cont->b_rptr, ksi, NULL);
3640 }
3641 
3642 /*
3643  * First-cut reality check for an inbound PF_KEY message.
3644  */
3645 static boolean_t
3646 esp_pfkey_reality_failures(mblk_t *mp, keysock_in_t *ksi,
3647     ipsecesp_stack_t *espstack)
3648 {
3649 	int diagnostic;
3650 
3651 	if (ksi->ks_in_extv[SADB_EXT_PROPOSAL] != NULL) {
3652 		diagnostic = SADB_X_DIAGNOSTIC_PROP_PRESENT;
3653 		goto badmsg;
3654 	}
3655 	if (ksi->ks_in_extv[SADB_EXT_SUPPORTED_AUTH] != NULL ||
3656 	    ksi->ks_in_extv[SADB_EXT_SUPPORTED_ENCRYPT] != NULL) {
3657 		diagnostic = SADB_X_DIAGNOSTIC_SUPP_PRESENT;
3658 		goto badmsg;
3659 	}
3660 	return (B_FALSE);	/* False ==> no failures */
3661 
3662 badmsg:
3663 	sadb_pfkey_error(espstack->esp_pfkey_q, mp, EINVAL, diagnostic,
3664 	    ksi->ks_in_serial);
3665 	return (B_TRUE);	/* True ==> failures */
3666 }
3667 
3668 /*
3669  * ESP parsing of PF_KEY messages.  Keysock did most of the really silly
3670  * error cases.  What I receive is a fully-formed, syntactically legal
3671  * PF_KEY message.  I then need to check semantics...
3672  *
3673  * This code may become common to AH and ESP.  Stay tuned.
3674  *
3675  * I also make the assumption that db_ref's are cool.  If this assumption
3676  * is wrong, this means that someone other than keysock or me has been
3677  * mucking with PF_KEY messages.
3678  */
3679 static void
3680 esp_parse_pfkey(mblk_t *mp, ipsecesp_stack_t *espstack)
3681 {
3682 	mblk_t *msg = mp->b_cont;
3683 	sadb_msg_t *samsg;
3684 	keysock_in_t *ksi;
3685 	int error;
3686 	int diagnostic = SADB_X_DIAGNOSTIC_NONE;
3687 
3688 	ASSERT(msg != NULL);
3689 
3690 	samsg = (sadb_msg_t *)msg->b_rptr;
3691 	ksi = (keysock_in_t *)mp->b_rptr;
3692 
3693 	/*
3694 	 * If applicable, convert unspecified AF_INET6 to unspecified
3695 	 * AF_INET.  And do other address reality checks.
3696 	 */
3697 	if (!sadb_addrfix(ksi, espstack->esp_pfkey_q, mp,
3698 	    espstack->ipsecesp_netstack) ||
3699 	    esp_pfkey_reality_failures(mp, ksi, espstack)) {
3700 		return;
3701 	}
3702 
3703 	switch (samsg->sadb_msg_type) {
3704 	case SADB_ADD:
3705 		error = esp_add_sa(mp, ksi, &diagnostic,
3706 		    espstack->ipsecesp_netstack);
3707 		if (error != 0) {
3708 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3709 			    diagnostic, ksi->ks_in_serial);
3710 		}
3711 		/* else esp_add_sa() took care of things. */
3712 		break;
3713 	case SADB_DELETE:
3714 	case SADB_X_DELPAIR:
3715 	case SADB_X_DELPAIR_STATE:
3716 		error = esp_del_sa(mp, ksi, &diagnostic, espstack,
3717 		    samsg->sadb_msg_type);
3718 		if (error != 0) {
3719 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3720 			    diagnostic, ksi->ks_in_serial);
3721 		}
3722 		/* Else esp_del_sa() took care of things. */
3723 		break;
3724 	case SADB_GET:
3725 		error = sadb_delget_sa(mp, ksi, &espstack->esp_sadb,
3726 		    &diagnostic, espstack->esp_pfkey_q, samsg->sadb_msg_type);
3727 		if (error != 0) {
3728 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3729 			    diagnostic, ksi->ks_in_serial);
3730 		}
3731 		/* Else sadb_get_sa() took care of things. */
3732 		break;
3733 	case SADB_FLUSH:
3734 		sadbp_flush(&espstack->esp_sadb, espstack->ipsecesp_netstack);
3735 		sadb_pfkey_echo(espstack->esp_pfkey_q, mp, samsg, ksi, NULL);
3736 		break;
3737 	case SADB_REGISTER:
3738 		/*
3739 		 * Hmmm, let's do it!  Check for extensions (there should
3740 		 * be none), extract the fields, call esp_register_out(),
3741 		 * then either free or report an error.
3742 		 *
3743 		 * Keysock takes care of the PF_KEY bookkeeping for this.
3744 		 */
3745 		if (esp_register_out(samsg->sadb_msg_seq, samsg->sadb_msg_pid,
3746 		    ksi->ks_in_serial, espstack, msg_getcred(mp, NULL))) {
3747 			freemsg(mp);
3748 		} else {
3749 			/*
3750 			 * Only way this path hits is if there is a memory
3751 			 * failure.  It will not return B_FALSE because of
3752 			 * lack of esp_pfkey_q if I am in wput().
3753 			 */
3754 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, ENOMEM,
3755 			    diagnostic, ksi->ks_in_serial);
3756 		}
3757 		break;
3758 	case SADB_UPDATE:
3759 	case SADB_X_UPDATEPAIR:
3760 		/*
3761 		 * Find a larval, if not there, find a full one and get
3762 		 * strict.
3763 		 */
3764 		error = esp_update_sa(mp, ksi, &diagnostic, espstack,
3765 		    samsg->sadb_msg_type);
3766 		if (error != 0) {
3767 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3768 			    diagnostic, ksi->ks_in_serial);
3769 		}
3770 		/* else esp_update_sa() took care of things. */
3771 		break;
3772 	case SADB_GETSPI:
3773 		/*
3774 		 * Reserve a new larval entry.
3775 		 */
3776 		esp_getspi(mp, ksi, espstack);
3777 		break;
3778 	case SADB_ACQUIRE:
3779 		/*
3780 		 * Find larval and/or ACQUIRE record and kill it (them), I'm
3781 		 * most likely an error.  Inbound ACQUIRE messages should only
3782 		 * have the base header.
3783 		 */
3784 		sadb_in_acquire(samsg, &espstack->esp_sadb,
3785 		    espstack->esp_pfkey_q, espstack->ipsecesp_netstack);
3786 		freemsg(mp);
3787 		break;
3788 	case SADB_DUMP:
3789 		/*
3790 		 * Dump all entries.
3791 		 */
3792 		esp_dump(mp, ksi, espstack);
3793 		/* esp_dump will take care of the return message, etc. */
3794 		break;
3795 	case SADB_EXPIRE:
3796 		/* Should never reach me. */
3797 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, EOPNOTSUPP,
3798 		    diagnostic, ksi->ks_in_serial);
3799 		break;
3800 	default:
3801 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, EINVAL,
3802 		    SADB_X_DIAGNOSTIC_UNKNOWN_MSG, ksi->ks_in_serial);
3803 		break;
3804 	}
3805 }
3806 
3807 /*
3808  * Handle case where PF_KEY says it can't find a keysock for one of my
3809  * ACQUIRE messages.
3810  */
3811 static void
3812 esp_keysock_no_socket(mblk_t *mp, ipsecesp_stack_t *espstack)
3813 {
3814 	sadb_msg_t *samsg;
3815 	keysock_out_err_t *kse = (keysock_out_err_t *)mp->b_rptr;
3816 
3817 	if (mp->b_cont == NULL) {
3818 		freemsg(mp);
3819 		return;
3820 	}
3821 	samsg = (sadb_msg_t *)mp->b_cont->b_rptr;
3822 
3823 	/*
3824 	 * If keysock can't find any registered, delete the acquire record
3825 	 * immediately, and handle errors.
3826 	 */
3827 	if (samsg->sadb_msg_type == SADB_ACQUIRE) {
3828 		samsg->sadb_msg_errno = kse->ks_err_errno;
3829 		samsg->sadb_msg_len = SADB_8TO64(sizeof (*samsg));
3830 		/*
3831 		 * Use the write-side of the esp_pfkey_q
3832 		 */
3833 		sadb_in_acquire(samsg, &espstack->esp_sadb,
3834 		    WR(espstack->esp_pfkey_q), espstack->ipsecesp_netstack);
3835 	}
3836 
3837 	freemsg(mp);
3838 }
3839 
3840 /*
3841  * ESP module write put routine.
3842  */
3843 static void
3844 ipsecesp_wput(queue_t *q, mblk_t *mp)
3845 {
3846 	ipsec_info_t *ii;
3847 	struct iocblk *iocp;
3848 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
3849 
3850 	esp3dbg(espstack, ("In esp_wput().\n"));
3851 
3852 	/* NOTE: Each case must take care of freeing or passing mp. */
3853 	switch (mp->b_datap->db_type) {
3854 	case M_CTL:
3855 		if ((mp->b_wptr - mp->b_rptr) < sizeof (ipsec_info_t)) {
3856 			/* Not big enough message. */
3857 			freemsg(mp);
3858 			break;
3859 		}
3860 		ii = (ipsec_info_t *)mp->b_rptr;
3861 
3862 		switch (ii->ipsec_info_type) {
3863 		case KEYSOCK_OUT_ERR:
3864 			esp1dbg(espstack, ("Got KEYSOCK_OUT_ERR message.\n"));
3865 			esp_keysock_no_socket(mp, espstack);
3866 			break;
3867 		case KEYSOCK_IN:
3868 			ESP_BUMP_STAT(espstack, keysock_in);
3869 			esp3dbg(espstack, ("Got KEYSOCK_IN message.\n"));
3870 
3871 			/* Parse the message. */
3872 			esp_parse_pfkey(mp, espstack);
3873 			break;
3874 		case KEYSOCK_HELLO:
3875 			sadb_keysock_hello(&espstack->esp_pfkey_q, q, mp,
3876 			    esp_ager, (void *)espstack, &espstack->esp_event,
3877 			    SADB_SATYPE_ESP);
3878 			break;
3879 		default:
3880 			esp2dbg(espstack, ("Got M_CTL from above of 0x%x.\n",
3881 			    ii->ipsec_info_type));
3882 			freemsg(mp);
3883 			break;
3884 		}
3885 		break;
3886 	case M_IOCTL:
3887 		iocp = (struct iocblk *)mp->b_rptr;
3888 		switch (iocp->ioc_cmd) {
3889 		case ND_SET:
3890 		case ND_GET:
3891 			if (nd_getset(q, espstack->ipsecesp_g_nd, mp)) {
3892 				qreply(q, mp);
3893 				return;
3894 			} else {
3895 				iocp->ioc_error = ENOENT;
3896 			}
3897 			/* FALLTHRU */
3898 		default:
3899 			/* We really don't support any other ioctls, do we? */
3900 
3901 			/* Return EINVAL */
3902 			if (iocp->ioc_error != ENOENT)
3903 				iocp->ioc_error = EINVAL;
3904 			iocp->ioc_count = 0;
3905 			mp->b_datap->db_type = M_IOCACK;
3906 			qreply(q, mp);
3907 			return;
3908 		}
3909 	default:
3910 		esp3dbg(espstack,
3911 		    ("Got default message, type %d, passing to IP.\n",
3912 		    mp->b_datap->db_type));
3913 		putnext(q, mp);
3914 	}
3915 }
3916 
3917 /*
3918  * Wrapper to allow IP to trigger an ESP association failure message
3919  * during inbound SA selection.
3920  */
3921 void
3922 ipsecesp_in_assocfailure(mblk_t *mp, char level, ushort_t sl, char *fmt,
3923     uint32_t spi, void *addr, int af, ip_recv_attr_t *ira)
3924 {
3925 	netstack_t	*ns = ira->ira_ill->ill_ipst->ips_netstack;
3926 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
3927 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
3928 
3929 	if (espstack->ipsecesp_log_unknown_spi) {
3930 		ipsec_assocfailure(info.mi_idnum, 0, level, sl, fmt, spi,
3931 		    addr, af, espstack->ipsecesp_netstack);
3932 	}
3933 
3934 	ip_drop_packet(mp, B_TRUE, ira->ira_ill,
3935 	    DROPPER(ipss, ipds_esp_no_sa),
3936 	    &espstack->esp_dropper);
3937 }
3938 
3939 /*
3940  * Initialize the ESP input and output processing functions.
3941  */
3942 void
3943 ipsecesp_init_funcs(ipsa_t *sa)
3944 {
3945 	if (sa->ipsa_output_func == NULL)
3946 		sa->ipsa_output_func = esp_outbound;
3947 	if (sa->ipsa_input_func == NULL)
3948 		sa->ipsa_input_func = esp_inbound;
3949 }
3950