xref: /illumos-gate/usr/src/uts/common/inet/ip/ipsecesp.c (revision c5749750a3e052f1194f65a303456224c51dea63)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
25  * Copyright (c) 2017 Joyent, Inc.
26  */
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/stropts.h>
31 #include <sys/errno.h>
32 #include <sys/strlog.h>
33 #include <sys/tihdr.h>
34 #include <sys/socket.h>
35 #include <sys/ddi.h>
36 #include <sys/sunddi.h>
37 #include <sys/kmem.h>
38 #include <sys/zone.h>
39 #include <sys/sysmacros.h>
40 #include <sys/cmn_err.h>
41 #include <sys/vtrace.h>
42 #include <sys/debug.h>
43 #include <sys/atomic.h>
44 #include <sys/strsun.h>
45 #include <sys/random.h>
46 #include <netinet/in.h>
47 #include <net/if.h>
48 #include <netinet/ip6.h>
49 #include <net/pfkeyv2.h>
50 #include <net/pfpolicy.h>
51 
52 #include <inet/common.h>
53 #include <inet/mi.h>
54 #include <inet/nd.h>
55 #include <inet/ip.h>
56 #include <inet/ip_impl.h>
57 #include <inet/ip6.h>
58 #include <inet/ip_if.h>
59 #include <inet/ip_ndp.h>
60 #include <inet/sadb.h>
61 #include <inet/ipsec_info.h>
62 #include <inet/ipsec_impl.h>
63 #include <inet/ipsecesp.h>
64 #include <inet/ipdrop.h>
65 #include <inet/tcp.h>
66 #include <sys/kstat.h>
67 #include <sys/policy.h>
68 #include <sys/strsun.h>
69 #include <sys/strsubr.h>
70 #include <inet/udp_impl.h>
71 #include <sys/taskq.h>
72 #include <sys/note.h>
73 
74 #include <sys/tsol/tnet.h>
75 
76 /*
77  * Table of ND variables supported by ipsecesp. These are loaded into
78  * ipsecesp_g_nd in ipsecesp_init_nd.
79  * All of these are alterable, within the min/max values given, at run time.
 *
 * This array is only the template: ipsecesp_stack_init() copies it into a
 * per-netstack allocation and registers that copy, entry by entry, through
 * ipsecesp_param_register().  ipsecesp_param_set() rejects any new value
 * outside an entry's [min, max] range.
 *
 * NOTE(review): no ipsecesp_init_nd() is visible in this part of the file;
 * the reference above may be stale -- confirm.
80  */
81 static	ipsecespparam_t	lcl_param_arr[] = {
82 	/* min	max			value	name */
83 	{ 0,	3,			0,	"ipsecesp_debug"},
84 	{ 125,	32000, SADB_AGE_INTERVAL_DEFAULT, "ipsecesp_age_interval"},
85 	{ 1,	10,			1,	"ipsecesp_reap_delay"},
86 	{ 1,	SADB_MAX_REPLAY,	64,	"ipsecesp_replay_size"},
87 	{ 1,	300,			15,	"ipsecesp_acquire_timeout"},
88 	{ 1,	1800,			90,	"ipsecesp_larval_timeout"},
89 	/* Default lifetime values for ACQUIRE messages. */
90 	{ 0,	0xffffffffU,	0,	"ipsecesp_default_soft_bytes"},
91 	{ 0,	0xffffffffU,	0,	"ipsecesp_default_hard_bytes"},
92 	{ 0,	0xffffffffU,	24000,	"ipsecesp_default_soft_addtime"},
93 	{ 0,	0xffffffffU,	28800,	"ipsecesp_default_hard_addtime"},
94 	{ 0,	0xffffffffU,	0,	"ipsecesp_default_soft_usetime"},
95 	{ 0,	0xffffffffU,	0,	"ipsecesp_default_hard_usetime"},
	/* Behavioural switches; see esp_strip_header() for padding_check. */
96 	{ 0,	1,		0,	"ipsecesp_log_unknown_spi"},
97 	{ 0,	2,		1,	"ipsecesp_padding_check"},
98 	{ 0,	600,		20,	"ipsecesp_nat_keepalive_interval"},
99 };
100 /* For ipsecesp_nat_keepalive_interval, see ipsecesp.h. */
101 
/*
 * Debug output helpers.  esp0dbg() always prints; esp1dbg(), esp2dbg() and
 * esp3dbg() print only when the stack's ipsecesp_debug tunable is non-zero,
 * greater than 1, or greater than 2 respectively.
 *
 * The "a" argument is a complete, parenthesized printf() argument list:
 *
 *	esp1dbg(espstack, ("value is %d\n", v));
 *
 * The conditional variants are wrapped in do { } while (0) so that each
 * expands to exactly one statement.  A bare "if (...) printf a" would
 * silently capture the "else" of an enclosing unbraced if/else.
 */
#define	esp0dbg(a)	printf a
/* NOTE:  != 0 instead of > 0 so lint doesn't complain. */
#define	esp1dbg(espstack, a)	\
	do { if ((espstack)->ipsecesp_debug != 0) printf a; } while (0)
#define	esp2dbg(espstack, a)	\
	do { if ((espstack)->ipsecesp_debug > 1) printf a; } while (0)
#define	esp3dbg(espstack, a)	\
	do { if ((espstack)->ipsecesp_debug > 2) printf a; } while (0)
107 
108 static int ipsecesp_open(queue_t *, dev_t *, int, int, cred_t *);
109 static int ipsecesp_close(queue_t *, int, cred_t *);
110 static void ipsecesp_wput(queue_t *, mblk_t *);
111 static void	*ipsecesp_stack_init(netstackid_t stackid, netstack_t *ns);
112 static void	ipsecesp_stack_fini(netstackid_t stackid, void *arg);
113 
114 static void esp_prepare_udp(netstack_t *, mblk_t *, ipha_t *);
115 static void esp_outbound_finish(mblk_t *, ip_xmit_attr_t *);
116 static void esp_inbound_restart(mblk_t *, ip_recv_attr_t *);
117 
118 static boolean_t esp_register_out(uint32_t, uint32_t, uint_t,
119     ipsecesp_stack_t *, cred_t *);
120 static boolean_t esp_strip_header(mblk_t *, boolean_t, uint32_t,
121     kstat_named_t **, ipsecesp_stack_t *);
122 static mblk_t *esp_submit_req_inbound(mblk_t *, ip_recv_attr_t *,
123     ipsa_t *, uint_t);
124 static mblk_t *esp_submit_req_outbound(mblk_t *, ip_xmit_attr_t *,
125     ipsa_t *, uchar_t *, uint_t);
126 
127 /* Settable in /etc/system */
128 uint32_t esp_hash_size = IPSEC_DEFAULT_HASH_SIZE;
129 
/*
 * STREAMS plumbing for the ipsecesp module.
 *
 * "info" is shared by both queue sides.  Its id number (info.mi_idnum) is
 * also what this file hands to ipsec_rl_strlog() and sadb_retimeout().
 * NOTE(review): the remaining positional members presumably follow
 * module_info(9S) order (name, min/max packet size, hi/lo water marks) --
 * confirm against the header.
 */
130 static struct module_info info = {
131 	5137, "ipsecesp", 0, INFPSZ, 65536, 1024
132 };
133 
/*
 * Read side: the first member is putnext, so nothing is processed here.
 * Open and close are shared with the write side.
 */
134 static struct qinit rinit = {
135 	(pfi_t)putnext, NULL, ipsecesp_open, ipsecesp_close, NULL, &info,
136 	NULL
137 };
138 
/* Write side: messages are handed to ipsecesp_wput(). */
139 static struct qinit winit = {
140 	(pfi_t)ipsecesp_wput, NULL, ipsecesp_open, ipsecesp_close, NULL, &info,
141 	NULL
142 };
143 
/* Exported streamtab tying the read and write qinit structures together. */
144 struct streamtab ipsecespinfo = {
145 	&rinit, &winit, NULL, NULL
146 };
147 
148 static taskq_t *esp_taskq;
149 
150 /*
151  * OTOH, this one is set at open/close, and I'm D_MTQPAIR for now.
152  *
153  * Question:	Do I need this, given that all instance's esps->esps_wq point
154  *		to IP?
155  *
156  * Answer:	Yes, because I need to know which queue is BOUND to
157  *		IPPROTO_ESP
158  */
159 
160 static int	esp_kstat_update(kstat_t *, int);
161 
162 static boolean_t
163 esp_kstat_init(ipsecesp_stack_t *espstack, netstackid_t stackid)
164 {
165 	espstack->esp_ksp = kstat_create_netstack("ipsecesp", 0, "esp_stat",
166 	    "net", KSTAT_TYPE_NAMED,
167 	    sizeof (esp_kstats_t) / sizeof (kstat_named_t), 0, stackid);
168 
169 	if (espstack->esp_ksp == NULL || espstack->esp_ksp->ks_data == NULL)
170 		return (B_FALSE);
171 
172 	espstack->esp_kstats = espstack->esp_ksp->ks_data;
173 
174 	espstack->esp_ksp->ks_update = esp_kstat_update;
175 	espstack->esp_ksp->ks_private = (void *)(uintptr_t)stackid;
176 
177 #define	K64 KSTAT_DATA_UINT64
178 #define	KI(x) kstat_named_init(&(espstack->esp_kstats->esp_stat_##x), #x, K64)
179 
180 	KI(num_aalgs);
181 	KI(num_ealgs);
182 	KI(good_auth);
183 	KI(bad_auth);
184 	KI(bad_padding);
185 	KI(replay_failures);
186 	KI(replay_early_failures);
187 	KI(keysock_in);
188 	KI(out_requests);
189 	KI(acquire_requests);
190 	KI(bytes_expired);
191 	KI(out_discards);
192 	KI(crypto_sync);
193 	KI(crypto_async);
194 	KI(crypto_failures);
195 	KI(bad_decrypt);
196 	KI(sa_port_renumbers);
197 
198 #undef KI
199 #undef K64
200 
201 	kstat_install(espstack->esp_ksp);
202 
203 	return (B_TRUE);
204 }
205 
206 static int
207 esp_kstat_update(kstat_t *kp, int rw)
208 {
209 	esp_kstats_t *ekp;
210 	netstackid_t	stackid = (zoneid_t)(uintptr_t)kp->ks_private;
211 	netstack_t	*ns;
212 	ipsec_stack_t	*ipss;
213 
214 	if ((kp == NULL) || (kp->ks_data == NULL))
215 		return (EIO);
216 
217 	if (rw == KSTAT_WRITE)
218 		return (EACCES);
219 
220 	ns = netstack_find_by_stackid(stackid);
221 	if (ns == NULL)
222 		return (-1);
223 	ipss = ns->netstack_ipsec;
224 	if (ipss == NULL) {
225 		netstack_rele(ns);
226 		return (-1);
227 	}
228 	ekp = (esp_kstats_t *)kp->ks_data;
229 
230 	rw_enter(&ipss->ipsec_alg_lock, RW_READER);
231 	ekp->esp_stat_num_aalgs.value.ui64 =
232 	    ipss->ipsec_nalgs[IPSEC_ALG_AUTH];
233 	ekp->esp_stat_num_ealgs.value.ui64 =
234 	    ipss->ipsec_nalgs[IPSEC_ALG_ENCR];
235 	rw_exit(&ipss->ipsec_alg_lock);
236 
237 	netstack_rele(ns);
238 	return (0);
239 }
240 
241 #ifdef DEBUG
242 /*
243  * Debug routine, useful to see pre-encryption data.
244  */
245 static char *
246 dump_msg(mblk_t *mp)
247 {
248 	char tmp_str[3], tmp_line[256];
249 
250 	while (mp != NULL) {
251 		unsigned char *ptr;
252 
253 		printf("mblk address 0x%p, length %ld, db_ref %d "
254 		    "type %d, base 0x%p, lim 0x%p\n",
255 		    (void *) mp, (long)(mp->b_wptr - mp->b_rptr),
256 		    mp->b_datap->db_ref, mp->b_datap->db_type,
257 		    (void *)mp->b_datap->db_base, (void *)mp->b_datap->db_lim);
258 		ptr = mp->b_rptr;
259 
260 		tmp_line[0] = '\0';
261 		while (ptr < mp->b_wptr) {
262 			uint_t diff;
263 
264 			diff = (ptr - mp->b_rptr);
265 			if (!(diff & 0x1f)) {
266 				if (strlen(tmp_line) > 0) {
267 					printf("bytes: %s\n", tmp_line);
268 					tmp_line[0] = '\0';
269 				}
270 			}
271 			if (!(diff & 0x3))
272 				(void) strcat(tmp_line, " ");
273 			(void) sprintf(tmp_str, "%02x", *ptr);
274 			(void) strcat(tmp_line, tmp_str);
275 			ptr++;
276 		}
277 		if (strlen(tmp_line) > 0)
278 			printf("bytes: %s\n", tmp_line);
279 
280 		mp = mp->b_cont;
281 	}
282 
283 	return ("\n");
284 }
285 
286 #else /* DEBUG */
287 static char *
288 dump_msg(mblk_t *mp)
289 {
290 	printf("Find value of mp %p.\n", mp);
291 	return ("\n");
292 }
293 #endif /* DEBUG */
294 
295 /*
296  * Don't have to lock age_interval, as only one thread will access it at
297  * a time, because I control the one function that does with timeout().
298  */
299 static void
300 esp_ager(void *arg)
301 {
302 	ipsecesp_stack_t *espstack = (ipsecesp_stack_t *)arg;
303 	netstack_t	*ns = espstack->ipsecesp_netstack;
304 	hrtime_t begin = gethrtime();
305 
306 	sadb_ager(&espstack->esp_sadb.s_v4, espstack->esp_pfkey_q,
307 	    espstack->ipsecesp_reap_delay, ns);
308 	sadb_ager(&espstack->esp_sadb.s_v6, espstack->esp_pfkey_q,
309 	    espstack->ipsecesp_reap_delay, ns);
310 
311 	espstack->esp_event = sadb_retimeout(begin, espstack->esp_pfkey_q,
312 	    esp_ager, espstack,
313 	    &espstack->ipsecesp_age_interval, espstack->ipsecesp_age_int_max,
314 	    info.mi_idnum);
315 }
316 
317 /*
318  * Get an ESP NDD parameter.
319  */
320 /* ARGSUSED */
321 static int
322 ipsecesp_param_get(
323     queue_t	*q,
324     mblk_t	*mp,
325     caddr_t	cp,
326     cred_t *cr)
327 {
328 	ipsecespparam_t	*ipsecesppa = (ipsecespparam_t *)cp;
329 	uint_t value;
330 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
331 
332 	mutex_enter(&espstack->ipsecesp_param_lock);
333 	value = ipsecesppa->ipsecesp_param_value;
334 	mutex_exit(&espstack->ipsecesp_param_lock);
335 
336 	(void) mi_mpprintf(mp, "%u", value);
337 	return (0);
338 }
339 
340 /*
341  * This routine sets an NDD variable in a ipsecespparam_t structure.
342  */
343 /* ARGSUSED */
344 static int
345 ipsecesp_param_set(
346     queue_t	*q,
347     mblk_t	*mp,
348     char	*value,
349     caddr_t	cp,
350     cred_t *cr)
351 {
352 	ulong_t	new_value;
353 	ipsecespparam_t	*ipsecesppa = (ipsecespparam_t *)cp;
354 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
355 
356 	/*
357 	 * Fail the request if the new value does not lie within the
358 	 * required bounds.
359 	 */
360 	if (ddi_strtoul(value, NULL, 10, &new_value) != 0 ||
361 	    new_value < ipsecesppa->ipsecesp_param_min ||
362 	    new_value > ipsecesppa->ipsecesp_param_max) {
363 		return (EINVAL);
364 	}
365 
366 	/* Set the new value */
367 	mutex_enter(&espstack->ipsecesp_param_lock);
368 	ipsecesppa->ipsecesp_param_value = new_value;
369 	mutex_exit(&espstack->ipsecesp_param_lock);
370 	return (0);
371 }
372 
373 /*
374  * Using lifetime NDD variables, fill in an extended combination's
375  * lifetime information.
376  */
377 void
378 ipsecesp_fill_defs(sadb_x_ecomb_t *ecomb, netstack_t *ns)
379 {
380 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
381 
382 	ecomb->sadb_x_ecomb_soft_bytes = espstack->ipsecesp_default_soft_bytes;
383 	ecomb->sadb_x_ecomb_hard_bytes = espstack->ipsecesp_default_hard_bytes;
384 	ecomb->sadb_x_ecomb_soft_addtime =
385 	    espstack->ipsecesp_default_soft_addtime;
386 	ecomb->sadb_x_ecomb_hard_addtime =
387 	    espstack->ipsecesp_default_hard_addtime;
388 	ecomb->sadb_x_ecomb_soft_usetime =
389 	    espstack->ipsecesp_default_soft_usetime;
390 	ecomb->sadb_x_ecomb_hard_usetime =
391 	    espstack->ipsecesp_default_hard_usetime;
392 }
393 
394 /*
395  * Initialize things for ESP at module load time.
396  */
397 boolean_t
398 ipsecesp_ddi_init(void)
399 {
400 	esp_taskq = taskq_create("esp_taskq", 1, minclsyspri,
401 	    IPSEC_TASKQ_MIN, IPSEC_TASKQ_MAX, 0);
402 
403 	/*
404 	 * We want to be informed each time a stack is created or
405 	 * destroyed in the kernel, so we can maintain the
406 	 * set of ipsecesp_stack_t's.
407 	 */
408 	netstack_register(NS_IPSECESP, ipsecesp_stack_init, NULL,
409 	    ipsecesp_stack_fini);
410 
411 	return (B_TRUE);
412 }
413 
414 /*
415  * Walk through the param array specified registering each element with the
416  * named dispatch handler.
417  */
418 static boolean_t
419 ipsecesp_param_register(IDP *ndp, ipsecespparam_t *espp, int cnt)
420 {
421 	for (; cnt-- > 0; espp++) {
422 		if (espp->ipsecesp_param_name != NULL &&
423 		    espp->ipsecesp_param_name[0]) {
424 			if (!nd_load(ndp,
425 			    espp->ipsecesp_param_name,
426 			    ipsecesp_param_get, ipsecesp_param_set,
427 			    (caddr_t)espp)) {
428 				nd_free(ndp);
429 				return (B_FALSE);
430 			}
431 		}
432 	}
433 	return (B_TRUE);
434 }
435 
436 /*
437  * Initialize things for ESP for each stack instance
438  */
439 static void *
440 ipsecesp_stack_init(netstackid_t stackid, netstack_t *ns)
441 {
442 	ipsecesp_stack_t	*espstack;
443 	ipsecespparam_t		*espp;
444 
445 	espstack = (ipsecesp_stack_t *)kmem_zalloc(sizeof (*espstack),
446 	    KM_SLEEP);
447 	espstack->ipsecesp_netstack = ns;
448 
449 	espp = (ipsecespparam_t *)kmem_alloc(sizeof (lcl_param_arr), KM_SLEEP);
450 	espstack->ipsecesp_params = espp;
451 	bcopy(lcl_param_arr, espp, sizeof (lcl_param_arr));
452 
453 	(void) ipsecesp_param_register(&espstack->ipsecesp_g_nd, espp,
454 	    A_CNT(lcl_param_arr));
455 
456 	(void) esp_kstat_init(espstack, stackid);
457 
458 	espstack->esp_sadb.s_acquire_timeout =
459 	    &espstack->ipsecesp_acquire_timeout;
460 	sadbp_init("ESP", &espstack->esp_sadb, SADB_SATYPE_ESP, esp_hash_size,
461 	    espstack->ipsecesp_netstack);
462 
463 	mutex_init(&espstack->ipsecesp_param_lock, NULL, MUTEX_DEFAULT, 0);
464 
465 	ip_drop_register(&espstack->esp_dropper, "IPsec ESP");
466 	return (espstack);
467 }
468 
469 /*
470  * Destroy things for ESP at module unload time.
471  */
472 void
473 ipsecesp_ddi_destroy(void)
474 {
475 	netstack_unregister(NS_IPSECESP);
476 	taskq_destroy(esp_taskq);
477 }
478 
479 /*
480  * Destroy things for ESP for one stack instance
481  */
482 static void
483 ipsecesp_stack_fini(netstackid_t stackid, void *arg)
484 {
485 	ipsecesp_stack_t *espstack = (ipsecesp_stack_t *)arg;
486 
487 	if (espstack->esp_pfkey_q != NULL) {
488 		(void) quntimeout(espstack->esp_pfkey_q, espstack->esp_event);
489 	}
490 	espstack->esp_sadb.s_acquire_timeout = NULL;
491 	sadbp_destroy(&espstack->esp_sadb, espstack->ipsecesp_netstack);
492 	ip_drop_unregister(&espstack->esp_dropper);
493 	mutex_destroy(&espstack->ipsecesp_param_lock);
494 	nd_free(&espstack->ipsecesp_g_nd);
495 
496 	kmem_free(espstack->ipsecesp_params, sizeof (lcl_param_arr));
497 	espstack->ipsecesp_params = NULL;
498 	kstat_delete_netstack(espstack->esp_ksp, stackid);
499 	espstack->esp_ksp = NULL;
500 	espstack->esp_kstats = NULL;
501 	kmem_free(espstack, sizeof (*espstack));
502 }
503 
504 /*
505  * ESP module open routine, which is here for keysock plumbing.
506  * Keysock is pushed over {AH,ESP} which is an artifact from the Bad Old
507  * Days of export control, and fears that ESP would not be allowed
508  * to be shipped at all by default.  Eventually, keysock should
509  * either access AH and ESP via modstubs or krtld dependencies, or
510  * perhaps be folded in with AH and ESP into a single IPsec/netsec
511  * module ("netsec" if PF_KEY provides more than AH/ESP keying tables).
512  */
513 /* ARGSUSED */
514 static int
515 ipsecesp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
516 {
517 	netstack_t		*ns;
518 	ipsecesp_stack_t	*espstack;
519 
520 	if (secpolicy_ip_config(credp, B_FALSE) != 0)
521 		return (EPERM);
522 
523 	if (q->q_ptr != NULL)
524 		return (0);  /* Re-open of an already open instance. */
525 
526 	if (sflag != MODOPEN)
527 		return (EINVAL);
528 
529 	ns = netstack_find_by_cred(credp);
530 	ASSERT(ns != NULL);
531 	espstack = ns->netstack_ipsecesp;
532 	ASSERT(espstack != NULL);
533 
534 	q->q_ptr = espstack;
535 	WR(q)->q_ptr = q->q_ptr;
536 
537 	qprocson(q);
538 	return (0);
539 }
540 
541 /*
542  * ESP module close routine.
543  */
544 /* ARGSUSED */
545 static int
546 ipsecesp_close(queue_t *q, int flags __unused, cred_t *credp __unused)
547 {
548 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
549 
550 	/*
551 	 * Clean up q_ptr, if needed.
552 	 */
553 	qprocsoff(q);
554 
555 	/* Keysock queue check is safe, because of OCEXCL perimeter. */
556 
557 	if (q == espstack->esp_pfkey_q) {
558 		esp1dbg(espstack,
559 		    ("ipsecesp_close:  Ummm... keysock is closing ESP.\n"));
560 		espstack->esp_pfkey_q = NULL;
561 		/* Detach qtimeouts. */
562 		(void) quntimeout(q, espstack->esp_event);
563 	}
564 
565 	netstack_rele(espstack->ipsecesp_netstack);
566 	return (0);
567 }
568 
569 /*
570  * Add a number of bytes to what the SA has protected so far.  Return
571  * B_TRUE if the SA can still protect that many bytes.
572  *
573  * Caller must REFRELE the passed-in assoc.  This function must REFRELE
574  * any obtained peer SA.
575  */
576 static boolean_t
577 esp_age_bytes(ipsa_t *assoc, uint64_t bytes, boolean_t inbound)
578 {
579 	ipsa_t *inassoc, *outassoc;
580 	isaf_t *bucket;
581 	boolean_t inrc, outrc, isv6;
582 	sadb_t *sp;
583 	int outhash;
584 	netstack_t		*ns = assoc->ipsa_netstack;
585 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
586 
587 	/* No peer?  No problem! */
588 	if (!assoc->ipsa_haspeer) {
589 		return (sadb_age_bytes(espstack->esp_pfkey_q, assoc, bytes,
590 		    B_TRUE));
591 	}
592 
593 	/*
594 	 * Otherwise, we want to grab both the original assoc and its peer.
595 	 * There might be a race for this, but if it's a real race, two
596 	 * expire messages may occur.  We limit this by only sending the
597 	 * expire message on one of the peers, we'll pick the inbound
598 	 * arbitrarily.
599 	 *
600 	 * If we need tight synchronization on the peer SA, then we need to
601 	 * reconsider.
602 	 */
603 
604 	/* Use address length to select IPv6/IPv4 */
605 	isv6 = (assoc->ipsa_addrfam == AF_INET6);
606 	sp = isv6 ? &espstack->esp_sadb.s_v6 : &espstack->esp_sadb.s_v4;
607 
608 	if (inbound) {
609 		inassoc = assoc;
610 		if (isv6) {
611 			outhash = OUTBOUND_HASH_V6(sp, *((in6_addr_t *)
612 			    &inassoc->ipsa_dstaddr));
613 		} else {
614 			outhash = OUTBOUND_HASH_V4(sp, *((ipaddr_t *)
615 			    &inassoc->ipsa_dstaddr));
616 		}
617 		bucket = &sp->sdb_of[outhash];
618 		mutex_enter(&bucket->isaf_lock);
619 		outassoc = ipsec_getassocbyspi(bucket, inassoc->ipsa_spi,
620 		    inassoc->ipsa_srcaddr, inassoc->ipsa_dstaddr,
621 		    inassoc->ipsa_addrfam);
622 		mutex_exit(&bucket->isaf_lock);
623 		if (outassoc == NULL) {
624 			/* Q: Do we wish to set haspeer == B_FALSE? */
625 			esp0dbg(("esp_age_bytes: "
626 			    "can't find peer for inbound.\n"));
627 			return (sadb_age_bytes(espstack->esp_pfkey_q, inassoc,
628 			    bytes, B_TRUE));
629 		}
630 	} else {
631 		outassoc = assoc;
632 		bucket = INBOUND_BUCKET(sp, outassoc->ipsa_spi);
633 		mutex_enter(&bucket->isaf_lock);
634 		inassoc = ipsec_getassocbyspi(bucket, outassoc->ipsa_spi,
635 		    outassoc->ipsa_srcaddr, outassoc->ipsa_dstaddr,
636 		    outassoc->ipsa_addrfam);
637 		mutex_exit(&bucket->isaf_lock);
638 		if (inassoc == NULL) {
639 			/* Q: Do we wish to set haspeer == B_FALSE? */
640 			esp0dbg(("esp_age_bytes: "
641 			    "can't find peer for outbound.\n"));
642 			return (sadb_age_bytes(espstack->esp_pfkey_q, outassoc,
643 			    bytes, B_TRUE));
644 		}
645 	}
646 
647 	inrc = sadb_age_bytes(espstack->esp_pfkey_q, inassoc, bytes, B_TRUE);
648 	outrc = sadb_age_bytes(espstack->esp_pfkey_q, outassoc, bytes, B_FALSE);
649 
650 	/*
651 	 * REFRELE any peer SA.
652 	 *
653 	 * Because of the multi-line macro nature of IPSA_REFRELE, keep
654 	 * them in { }.
655 	 */
656 	if (inbound) {
657 		IPSA_REFRELE(outassoc);
658 	} else {
659 		IPSA_REFRELE(inassoc);
660 	}
661 
662 	return (inrc && outrc);
663 }
664 
665 /*
666  * Do incoming NAT-T manipulations for packet.
667  * Returns NULL if the mblk chain is consumed.
668  */
669 static mblk_t *
670 esp_fix_natt_checksums(mblk_t *data_mp, ipsa_t *assoc)
671 {
672 	ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
673 	tcpha_t *tcpha;
674 	udpha_t *udpha;
675 	/* Initialize to our inbound cksum adjustment... */
676 	uint32_t sum = assoc->ipsa_inbound_cksum;
677 
678 	switch (ipha->ipha_protocol) {
679 	case IPPROTO_TCP:
680 		tcpha = (tcpha_t *)(data_mp->b_rptr +
681 		    IPH_HDR_LENGTH(ipha));
682 
683 #define	DOWN_SUM(x) (x) = ((x) & 0xFFFF) +	 ((x) >> 16)
684 		sum += ~ntohs(tcpha->tha_sum) & 0xFFFF;
685 		DOWN_SUM(sum);
686 		DOWN_SUM(sum);
687 		tcpha->tha_sum = ~htons(sum);
688 		break;
689 	case IPPROTO_UDP:
690 		udpha = (udpha_t *)(data_mp->b_rptr + IPH_HDR_LENGTH(ipha));
691 
692 		if (udpha->uha_checksum != 0) {
693 			/* Adujst if the inbound one was not zero. */
694 			sum += ~ntohs(udpha->uha_checksum) & 0xFFFF;
695 			DOWN_SUM(sum);
696 			DOWN_SUM(sum);
697 			udpha->uha_checksum = ~htons(sum);
698 			if (udpha->uha_checksum == 0)
699 				udpha->uha_checksum = 0xFFFF;
700 		}
701 #undef DOWN_SUM
702 		break;
703 	case IPPROTO_IP:
704 		/*
705 		 * This case is only an issue for self-encapsulated
706 		 * packets.  So for now, fall through.
707 		 */
708 		break;
709 	}
710 	return (data_mp);
711 }
712 
713 
714 /*
715  * Strip ESP header, check padding, and fix IP header.
716  * Returns B_TRUE on success, B_FALSE if an error occurred.
 *
 * data_mp	the packet, starting at its IPv4/IPv6 header; ASSERTed to be
 *		a single mblk by the time the padding is trimmed.
 * isv4		B_TRUE to treat the packet as IPv4 (ipha_t), else IPv6.
 * ivlen	number of IV bytes following the ESP header; removed along
 *		with the header and used in the length arithmetic.
 * counter	on failure, set to the drop counter describing the reason
 *		(ipds_esp_bad_padlen or ipds_esp_bad_padding).
 * espstack	supplies the ipsecesp_padding_check tunable, the statistics
 *		and the netstack used for logging.
 *
 * On success the IP length (and, for IPv4, the header checksum) has been
 * rewritten, the next-header value from the ESP trailer has been installed,
 * and b_rptr/b_wptr have been adjusted so that the ESP header, IV, padding
 * and trailer are no longer part of the message.
717  */
718 static boolean_t
719 esp_strip_header(mblk_t *data_mp, boolean_t isv4, uint32_t ivlen,
720     kstat_named_t **counter, ipsecesp_stack_t *espstack)
721 {
722 	ipha_t *ipha;
723 	ip6_t *ip6h;
724 	uint_t divpoint;
725 	mblk_t *scratch;
726 	uint8_t nexthdr, padlen;
727 	uint8_t lastpad;
728 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
729 	uint8_t *lastbyte;
730 
731 	/*
732 	 * Strip ESP data and fix IP header.
733 	 *
734 	 * XXX In case the beginning of esp_inbound() changes to not do a
735 	 * pullup, this part of the code can remain unchanged.
	 *
	 * divpoint is the offset of the ESP header, i.e. the length of the
	 * IP header(s) that precede it.
736 	 */
737 	if (isv4) {
738 		ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (ipha_t));
739 		ipha = (ipha_t *)data_mp->b_rptr;
740 		ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (esph_t) +
741 		    IPH_HDR_LENGTH(ipha));
742 		divpoint = IPH_HDR_LENGTH(ipha);
743 	} else {
744 		ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (ip6_t));
745 		ip6h = (ip6_t *)data_mp->b_rptr;
746 		divpoint = ip_hdr_length_v6(data_mp, ip6h);
747 	}
748 
	/* Walk to the last mblk of the chain; the ESP trailer lives there. */
749 	scratch = data_mp;
750 	while (scratch->b_cont != NULL)
751 		scratch = scratch->b_cont;
752 
753 	ASSERT((scratch->b_wptr - scratch->b_rptr) >= 3);
754 
755 	/*
756 	 * "Next header" and padding length are the last two bytes in the
757 	 * ESP-protected datagram, thus the explicit - 1 and - 2.
758 	 * lastpad is the last byte of the padding, which can be used for
759 	 * a quick check to see if the padding is correct.
760 	 */
761 	lastbyte = scratch->b_wptr - 1;
762 	nexthdr = *lastbyte--;
763 	padlen = *lastbyte--;
764 
765 	if (isv4) {
766 		/* Fix part of the IP header. */
767 		ipha->ipha_protocol = nexthdr;
768 		/*
769 		 * Reality check the padlen.  The explicit - 2 is for the
770 		 * padding length and the next-header bytes.
		 *
		 * NOTE(review): this bound subtracts sizeof (ipha_t) rather
		 * than the actual header length (divpoint), and the
		 * arithmetic is unsigned, so it looks more permissive than
		 * intended when IP options are present or ipha_length is
		 * very small -- confirm against callers' guarantees.
771 		 */
772 		if (padlen >= ntohs(ipha->ipha_length) - sizeof (ipha_t) - 2 -
773 		    sizeof (esph_t) - ivlen) {
774 			ESP_BUMP_STAT(espstack, bad_decrypt);
775 			ipsec_rl_strlog(espstack->ipsecesp_netstack,
776 			    info.mi_idnum, 0, 0,
777 			    SL_ERROR | SL_WARN,
778 			    "Corrupt ESP packet (padlen too big).\n");
779 			esp1dbg(espstack, ("padlen (%d) is greater than:\n",
780 			    padlen));
781 			esp1dbg(espstack, ("pkt len(%d) - ip hdr - esp "
782 			    "hdr - ivlen(%d) = %d.\n",
783 			    ntohs(ipha->ipha_length), ivlen,
784 			    (int)(ntohs(ipha->ipha_length) - sizeof (ipha_t) -
785 			    2 - sizeof (esph_t) - ivlen)));
786 			*counter = DROPPER(ipss, ipds_esp_bad_padlen);
787 			return (B_FALSE);
788 		}
789 
790 		/*
791 		 * Fix the rest of the header.  The explicit - 2 is for the
792 		 * padding length and the next-header bytes.
793 		 */
794 		ipha->ipha_length = htons(ntohs(ipha->ipha_length) - padlen -
795 		    2 - sizeof (esph_t) - ivlen);
796 		ipha->ipha_hdr_checksum = 0;
797 		ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
798 	} else {
		/*
		 * Install nexthdr in whichever header pointed at ESP: the
		 * fixed IPv6 header, or otherwise the extension header
		 * found by ip_find_hdr_v6() (destination options, then
		 * routing, then hop-by-hop, in that order of preference).
		 */
799 		if (ip6h->ip6_nxt == IPPROTO_ESP) {
800 			ip6h->ip6_nxt = nexthdr;
801 		} else {
802 			ip_pkt_t ipp;
803 
804 			bzero(&ipp, sizeof (ipp));
805 			(void) ip_find_hdr_v6(data_mp, ip6h, B_FALSE, &ipp,
806 			    NULL);
807 			if (ipp.ipp_dstopts != NULL) {
808 				ipp.ipp_dstopts->ip6d_nxt = nexthdr;
809 			} else if (ipp.ipp_rthdr != NULL) {
810 				ipp.ipp_rthdr->ip6r_nxt = nexthdr;
811 			} else if (ipp.ipp_hopopts != NULL) {
812 				ipp.ipp_hopopts->ip6h_nxt = nexthdr;
813 			} else {
814 				/* Panic a DEBUG kernel. */
815 				ASSERT(ipp.ipp_hopopts != NULL);
816 				/* Otherwise, pretend it's IP + ESP. */
817 				cmn_err(CE_WARN, "ESP IPv6 headers wrong.\n");
818 				ip6h->ip6_nxt = nexthdr;
819 			}
820 		}
821 
		/* Same padlen reality check as the IPv4 case above. */
822 		if (padlen >= ntohs(ip6h->ip6_plen) - 2 - sizeof (esph_t) -
823 		    ivlen) {
824 			ESP_BUMP_STAT(espstack, bad_decrypt);
825 			ipsec_rl_strlog(espstack->ipsecesp_netstack,
826 			    info.mi_idnum, 0, 0,
827 			    SL_ERROR | SL_WARN,
828 			    "Corrupt ESP packet (v6 padlen too big).\n");
829 			esp1dbg(espstack, ("padlen (%d) is greater than:\n",
830 			    padlen));
831 			esp1dbg(espstack,
832 			    ("pkt len(%u) - ip hdr - esp hdr - ivlen(%d) = "
833 			    "%u.\n", (unsigned)(ntohs(ip6h->ip6_plen)
834 			    + sizeof (ip6_t)), ivlen,
835 			    (unsigned)(ntohs(ip6h->ip6_plen) - 2 -
836 			    sizeof (esph_t) - ivlen)));
837 			*counter = DROPPER(ipss, ipds_esp_bad_padlen);
838 			return (B_FALSE);
839 		}
840 
841 
842 		/*
843 		 * Fix the rest of the header.  The explicit - 2 is for the
844 		 * padding length and the next-header bytes.  IPv6 is nice,
845 		 * because there's no hdr checksum!
846 		 */
847 		ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - padlen -
848 		    2 - sizeof (esph_t) - ivlen);
849 	}
850 
	/*
	 * ipsecesp_padding_check selects how hard the padding is verified:
	 * 0 skips the check, 1 does the weak check only, 2 adds the strong
	 * check.
	 */
851 	if (espstack->ipsecesp_padding_check > 0 && padlen > 0) {
852 		/*
853 		 * Weak padding check: compare last-byte to length, they
854 		 * should be equal.
855 		 */
856 		lastpad = *lastbyte--;
857 
858 		if (padlen != lastpad) {
859 			ipsec_rl_strlog(espstack->ipsecesp_netstack,
860 			    info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
861 			    "Corrupt ESP packet (lastpad != padlen).\n");
862 			esp1dbg(espstack,
863 			    ("lastpad (%d) not equal to padlen (%d):\n",
864 			    lastpad, padlen));
865 			ESP_BUMP_STAT(espstack, bad_padding);
866 			*counter = DROPPER(ipss, ipds_esp_bad_padding);
867 			return (B_FALSE);
868 		}
869 
870 		/*
871 		 * Strong padding check: Check all pad bytes to see that
872 		 * they're ascending.  Go backwards using a descending counter
873 		 * to verify.  padlen == 1 is checked by previous block, so
874 		 * only bother if we've more than 1 byte of padding.
875 		 * Consequently, start the check one byte before the location
876 		 * of "lastpad".
877 		 */
878 		if (espstack->ipsecesp_padding_check > 1) {
879 			/*
880 			 * This assert may have to become an if and a pullup
881 			 * if we start accepting multi-dblk mblks. For now,
882 			 * though, any packet here will have been pulled up in
883 			 * esp_inbound.
884 			 */
885 			ASSERT(MBLKL(scratch) >= lastpad + 3);
886 
887 			/*
888 			 * Use "--lastpad" because we already checked the very
889 			 * last pad byte previously.
890 			 */
891 			while (--lastpad != 0) {
892 				if (lastpad != *lastbyte) {
893 					ipsec_rl_strlog(
894 					    espstack->ipsecesp_netstack,
895 					    info.mi_idnum, 0, 0,
896 					    SL_ERROR | SL_WARN, "Corrupt ESP "
897 					    "packet (bad padding).\n");
898 					esp1dbg(espstack,
899 					    ("padding not in correct"
900 					    " format:\n"));
901 					ESP_BUMP_STAT(espstack, bad_padding);
902 					*counter = DROPPER(ipss,
903 					    ipds_esp_bad_padding);
904 					return (B_FALSE);
905 				}
906 				lastbyte--;
907 			}
908 		}
909 	}
910 
911 	/* Trim off the padding. */
912 	ASSERT(data_mp->b_cont == NULL);
913 	data_mp->b_wptr -= (padlen + 2);
914 
915 	/*
916 	 * Remove the ESP header.
917 	 *
918 	 * The above assertions about data_mp's size will make this work.
919 	 *
920 	 * XXX  Question:  If I send up and get back a contiguous mblk,
921 	 * would it be quicker to bcopy over, or keep doing the dupb stuff?
922 	 * I go with copying for now.
	 *
	 * The IP header bytes [start, start + divpoint) are slid forward by
	 * sizeof (esph_t) + ivlen so they end up directly in front of the
	 * payload.  Source and destination overlap, so the copy runs from
	 * the last unit down to the first.  Word-sized copies are used when
	 * both b_rptr and ivlen are 4-byte aligned, byte copies otherwise.
923 	 */
924 
925 	if (IS_P2ALIGNED(data_mp->b_rptr, sizeof (uint32_t)) &&
926 	    IS_P2ALIGNED(ivlen, sizeof (uint32_t))) {
927 		uint8_t *start = data_mp->b_rptr;
928 		uint32_t *src, *dst;
929 
930 		src = (uint32_t *)(start + divpoint);
931 		dst = (uint32_t *)(start + divpoint + sizeof (esph_t) + ivlen);
932 
933 		ASSERT(IS_P2ALIGNED(dst, sizeof (uint32_t)) &&
934 		    IS_P2ALIGNED(src, sizeof (uint32_t)));
935 
936 		do {
937 			src--;
938 			dst--;
939 			*dst = *src;
940 		} while (src != (uint32_t *)start);
941 
		/* The packet now begins where the moved header begins. */
942 		data_mp->b_rptr = (uchar_t *)dst;
943 	} else {
944 		uint8_t *start = data_mp->b_rptr;
945 		uint8_t *src, *dst;
946 
947 		src = start + divpoint;
948 		dst = src + sizeof (esph_t) + ivlen;
949 
950 		do {
951 			src--;
952 			dst--;
953 			*dst = *src;
954 		} while (src != start);
955 
956 		data_mp->b_rptr = dst;
957 	}
958 
959 	esp2dbg(espstack, ("data_mp after inbound ESP adjustment:\n"));
960 	esp2dbg(espstack, (dump_msg(data_mp)));
961 
962 	return (B_TRUE);
963 }
964 
965 /*
966  * Updating use times can be tricky business if the ipsa_haspeer flag is
967  * set.  This function is called once in an SA's lifetime.
968  *
969  * Caller has to REFRELE "assoc" which is passed in.  This function has
970  * to REFRELE any peer SA that is obtained.
971  */
972 static void
973 esp_set_usetime(ipsa_t *assoc, boolean_t inbound)
974 {
975 	ipsa_t *inassoc, *outassoc;
976 	isaf_t *bucket;
977 	sadb_t *sp;
978 	int outhash;
979 	boolean_t isv6;
980 	netstack_t		*ns = assoc->ipsa_netstack;
981 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
982 
983 	/* No peer?  No problem! */
984 	if (!assoc->ipsa_haspeer) {
985 		sadb_set_usetime(assoc);
986 		return;
987 	}
988 
989 	/*
990 	 * Otherwise, we want to grab both the original assoc and its peer.
991 	 * There might be a race for this, but if it's a real race, the times
992 	 * will be out-of-synch by at most a second, and since our time
993 	 * granularity is a second, this won't be a problem.
994 	 *
995 	 * If we need tight synchronization on the peer SA, then we need to
996 	 * reconsider.
997 	 */
998 
999 	/* Use address length to select IPv6/IPv4 */
1000 	isv6 = (assoc->ipsa_addrfam == AF_INET6);
1001 	sp = isv6 ? &espstack->esp_sadb.s_v6 : &espstack->esp_sadb.s_v4;
1002 
1003 	if (inbound) {
1004 		inassoc = assoc;
1005 		if (isv6) {
1006 			outhash = OUTBOUND_HASH_V6(sp, *((in6_addr_t *)
1007 			    &inassoc->ipsa_dstaddr));
1008 		} else {
1009 			outhash = OUTBOUND_HASH_V4(sp, *((ipaddr_t *)
1010 			    &inassoc->ipsa_dstaddr));
1011 		}
1012 		bucket = &sp->sdb_of[outhash];
1013 		mutex_enter(&bucket->isaf_lock);
1014 		outassoc = ipsec_getassocbyspi(bucket, inassoc->ipsa_spi,
1015 		    inassoc->ipsa_srcaddr, inassoc->ipsa_dstaddr,
1016 		    inassoc->ipsa_addrfam);
1017 		mutex_exit(&bucket->isaf_lock);
1018 		if (outassoc == NULL) {
1019 			/* Q: Do we wish to set haspeer == B_FALSE? */
1020 			esp0dbg(("esp_set_usetime: "
1021 			    "can't find peer for inbound.\n"));
1022 			sadb_set_usetime(inassoc);
1023 			return;
1024 		}
1025 	} else {
1026 		outassoc = assoc;
1027 		bucket = INBOUND_BUCKET(sp, outassoc->ipsa_spi);
1028 		mutex_enter(&bucket->isaf_lock);
1029 		inassoc = ipsec_getassocbyspi(bucket, outassoc->ipsa_spi,
1030 		    outassoc->ipsa_srcaddr, outassoc->ipsa_dstaddr,
1031 		    outassoc->ipsa_addrfam);
1032 		mutex_exit(&bucket->isaf_lock);
1033 		if (inassoc == NULL) {
1034 			/* Q: Do we wish to set haspeer == B_FALSE? */
1035 			esp0dbg(("esp_set_usetime: "
1036 			    "can't find peer for outbound.\n"));
1037 			sadb_set_usetime(outassoc);
1038 			return;
1039 		}
1040 	}
1041 
1042 	/* Update usetime on both. */
1043 	sadb_set_usetime(inassoc);
1044 	sadb_set_usetime(outassoc);
1045 
1046 	/*
1047 	 * REFRELE any peer SA.
1048 	 *
1049 	 * Because of the multi-line macro nature of IPSA_REFRELE, keep
1050 	 * them in { }.
1051 	 */
1052 	if (inbound) {
1053 		IPSA_REFRELE(outassoc);
1054 	} else {
1055 		IPSA_REFRELE(inassoc);
1056 	}
1057 }
1058 
1059 /*
1060  * Handle ESP inbound data for IPv4 and IPv6.
1061  * On success returns B_TRUE, on failure returns B_FALSE and frees the
1062  * mblk chain data_mp.
1063  */
1064 mblk_t *
1065 esp_inbound(mblk_t *data_mp, void *arg, ip_recv_attr_t *ira)
1066 {
1067 	esph_t *esph = (esph_t *)arg;
1068 	ipsa_t *ipsa = ira->ira_ipsec_esp_sa;
1069 	netstack_t	*ns = ira->ira_ill->ill_ipst->ips_netstack;
1070 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1071 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1072 
1073 	/*
1074 	 * We may wish to check replay in-range-only here as an optimization.
1075 	 * Include the reality check of ipsa->ipsa_replay >
1076 	 * ipsa->ipsa_replay_wsize for times when it's the first N packets,
1077 	 * where N == ipsa->ipsa_replay_wsize.
1078 	 *
1079 	 * Another check that may come here later is the "collision" check.
1080 	 * If legitimate packets flow quickly enough, this won't be a problem,
1081 	 * but collisions may cause authentication algorithm crunching to
1082 	 * take place when it doesn't need to.
1083 	 */
1084 	if (!sadb_replay_peek(ipsa, esph->esph_replay)) {
1085 		ESP_BUMP_STAT(espstack, replay_early_failures);
1086 		IP_ESP_BUMP_STAT(ipss, in_discards);
1087 		ip_drop_packet(data_mp, B_TRUE, ira->ira_ill,
1088 		    DROPPER(ipss, ipds_esp_early_replay),
1089 		    &espstack->esp_dropper);
1090 		BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1091 		return (NULL);
1092 	}
1093 
1094 	/*
1095 	 * Adjust the IP header's payload length to reflect the removal
1096 	 * of the ICV.
1097 	 */
1098 	if (!(ira->ira_flags & IRAF_IS_IPV4)) {
1099 		ip6_t *ip6h = (ip6_t *)data_mp->b_rptr;
1100 		ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) -
1101 		    ipsa->ipsa_mac_len);
1102 	} else {
1103 		ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
1104 		ipha->ipha_length = htons(ntohs(ipha->ipha_length) -
1105 		    ipsa->ipsa_mac_len);
1106 	}
1107 
1108 	/* submit the request to the crypto framework */
1109 	return (esp_submit_req_inbound(data_mp, ira, ipsa,
1110 	    (uint8_t *)esph - data_mp->b_rptr));
1111 }
1112 
1113 /* XXX refactor me */
1114 /*
1115  * Handle the SADB_GETSPI message.  Create a larval SA.
1116  */
1117 static void
1118 esp_getspi(mblk_t *mp, keysock_in_t *ksi, ipsecesp_stack_t *espstack)
1119 {
1120 	ipsa_t *newbie, *target;
1121 	isaf_t *outbound, *inbound;
1122 	int rc, diagnostic;
1123 	sadb_sa_t *assoc;
1124 	keysock_out_t *kso;
1125 	uint32_t newspi;
1126 
1127 	/*
1128 	 * Randomly generate a proposed SPI value
1129 	 */
1130 	if (cl_inet_getspi != NULL) {
1131 		cl_inet_getspi(espstack->ipsecesp_netstack->netstack_stackid,
1132 		    IPPROTO_ESP, (uint8_t *)&newspi, sizeof (uint32_t), NULL);
1133 	} else {
1134 		(void) random_get_pseudo_bytes((uint8_t *)&newspi,
1135 		    sizeof (uint32_t));
1136 	}
1137 	newbie = sadb_getspi(ksi, newspi, &diagnostic,
1138 	    espstack->ipsecesp_netstack, IPPROTO_ESP);
1139 
1140 	if (newbie == NULL) {
1141 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, ENOMEM, diagnostic,
1142 		    ksi->ks_in_serial);
1143 		return;
1144 	} else if (newbie == (ipsa_t *)-1) {
1145 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, EINVAL, diagnostic,
1146 		    ksi->ks_in_serial);
1147 		return;
1148 	}
1149 
1150 	/*
1151 	 * XXX - We may randomly collide.  We really should recover from this.
1152 	 *	 Unfortunately, that could require spending way-too-much-time
1153 	 *	 in here.  For now, let the user retry.
1154 	 */
1155 
1156 	if (newbie->ipsa_addrfam == AF_INET6) {
1157 		outbound = OUTBOUND_BUCKET_V6(&espstack->esp_sadb.s_v6,
1158 		    *(uint32_t *)(newbie->ipsa_dstaddr));
1159 		inbound = INBOUND_BUCKET(&espstack->esp_sadb.s_v6,
1160 		    newbie->ipsa_spi);
1161 	} else {
1162 		ASSERT(newbie->ipsa_addrfam == AF_INET);
1163 		outbound = OUTBOUND_BUCKET_V4(&espstack->esp_sadb.s_v4,
1164 		    *(uint32_t *)(newbie->ipsa_dstaddr));
1165 		inbound = INBOUND_BUCKET(&espstack->esp_sadb.s_v4,
1166 		    newbie->ipsa_spi);
1167 	}
1168 
1169 	mutex_enter(&outbound->isaf_lock);
1170 	mutex_enter(&inbound->isaf_lock);
1171 
1172 	/*
1173 	 * Check for collisions (i.e. did sadb_getspi() return with something
1174 	 * that already exists?).
1175 	 *
1176 	 * Try outbound first.  Even though SADB_GETSPI is traditionally
1177 	 * for inbound SAs, you never know what a user might do.
1178 	 */
1179 	target = ipsec_getassocbyspi(outbound, newbie->ipsa_spi,
1180 	    newbie->ipsa_srcaddr, newbie->ipsa_dstaddr, newbie->ipsa_addrfam);
1181 	if (target == NULL) {
1182 		target = ipsec_getassocbyspi(inbound, newbie->ipsa_spi,
1183 		    newbie->ipsa_srcaddr, newbie->ipsa_dstaddr,
1184 		    newbie->ipsa_addrfam);
1185 	}
1186 
1187 	/*
1188 	 * I don't have collisions elsewhere!
1189 	 * (Nor will I because I'm still holding inbound/outbound locks.)
1190 	 */
1191 
1192 	if (target != NULL) {
1193 		rc = EEXIST;
1194 		IPSA_REFRELE(target);
1195 	} else {
1196 		/*
1197 		 * sadb_insertassoc() also checks for collisions, so
1198 		 * if there's a colliding entry, rc will be set
1199 		 * to EEXIST.
1200 		 */
1201 		rc = sadb_insertassoc(newbie, inbound);
1202 		newbie->ipsa_hardexpiretime = gethrestime_sec();
1203 		newbie->ipsa_hardexpiretime +=
1204 		    espstack->ipsecesp_larval_timeout;
1205 	}
1206 
1207 	/*
1208 	 * Can exit outbound mutex.  Hold inbound until we're done
1209 	 * with newbie.
1210 	 */
1211 	mutex_exit(&outbound->isaf_lock);
1212 
1213 	if (rc != 0) {
1214 		mutex_exit(&inbound->isaf_lock);
1215 		IPSA_REFRELE(newbie);
1216 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, rc,
1217 		    SADB_X_DIAGNOSTIC_NONE, ksi->ks_in_serial);
1218 		return;
1219 	}
1220 
1221 
1222 	/* Can write here because I'm still holding the bucket lock. */
1223 	newbie->ipsa_type = SADB_SATYPE_ESP;
1224 
1225 	/*
1226 	 * Construct successful return message. We have one thing going
1227 	 * for us in PF_KEY v2.  That's the fact that
1228 	 *	sizeof (sadb_spirange_t) == sizeof (sadb_sa_t)
1229 	 */
1230 	assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SPIRANGE];
1231 	assoc->sadb_sa_exttype = SADB_EXT_SA;
1232 	assoc->sadb_sa_spi = newbie->ipsa_spi;
1233 	*((uint64_t *)(&assoc->sadb_sa_replay)) = 0;
1234 	mutex_exit(&inbound->isaf_lock);
1235 
1236 	/* Convert KEYSOCK_IN to KEYSOCK_OUT. */
1237 	kso = (keysock_out_t *)ksi;
1238 	kso->ks_out_len = sizeof (*kso);
1239 	kso->ks_out_serial = ksi->ks_in_serial;
1240 	kso->ks_out_type = KEYSOCK_OUT;
1241 
1242 	/*
1243 	 * Can safely putnext() to esp_pfkey_q, because this is a turnaround
1244 	 * from the esp_pfkey_q.
1245 	 */
1246 	putnext(espstack->esp_pfkey_q, mp);
1247 }
1248 
1249 /*
1250  * Insert the ESP header into a packet.  Duplicate an mblk, and insert a newly
1251  * allocated mblk with the ESP header in between the two.
1252  */
1253 static boolean_t
1254 esp_insert_esp(mblk_t *mp, mblk_t *esp_mp, uint_t divpoint,
1255     ipsecesp_stack_t *espstack)
1256 {
1257 	mblk_t *split_mp = mp;
1258 	uint_t wheretodiv = divpoint;
1259 
1260 	while ((split_mp->b_wptr - split_mp->b_rptr) < wheretodiv) {
1261 		wheretodiv -= (split_mp->b_wptr - split_mp->b_rptr);
1262 		split_mp = split_mp->b_cont;
1263 		ASSERT(split_mp != NULL);
1264 	}
1265 
1266 	if (split_mp->b_wptr - split_mp->b_rptr != wheretodiv) {
1267 		mblk_t *scratch;
1268 
1269 		/* "scratch" is the 2nd half, split_mp is the first. */
1270 		scratch = dupb(split_mp);
1271 		if (scratch == NULL) {
1272 			esp1dbg(espstack,
1273 			    ("esp_insert_esp: can't allocate scratch.\n"));
1274 			return (B_FALSE);
1275 		}
1276 		/* NOTE:  dupb() doesn't set b_cont appropriately. */
1277 		scratch->b_cont = split_mp->b_cont;
1278 		scratch->b_rptr += wheretodiv;
1279 		split_mp->b_wptr = split_mp->b_rptr + wheretodiv;
1280 		split_mp->b_cont = scratch;
1281 	}
1282 	/*
1283 	 * At this point, split_mp is exactly "wheretodiv" bytes long, and
1284 	 * holds the end of the pre-ESP part of the datagram.
1285 	 */
1286 	esp_mp->b_cont = split_mp->b_cont;
1287 	split_mp->b_cont = esp_mp;
1288 
1289 	return (B_TRUE);
1290 }
1291 
1292 /*
1293  * Section 7 of RFC 3947 says:
1294  *
1295  * 7.  Recovering from the Expiring NAT Mappings
1296  *
1297  *    There are cases where NAT box decides to remove mappings that are still
1298  *    alive (for example, when the keepalive interval is too long, or when the
1299  *    NAT box is rebooted).  To recover from this, ends that are NOT behind
1300  *    NAT SHOULD use the last valid UDP encapsulated IKE or IPsec packet from
1301  *    the other end to determine which IP and port addresses should be used.
1302  *    The host behind dynamic NAT MUST NOT do this, as otherwise it opens a
1303  *    DoS attack possibility because the IP address or port of the other host
1304  *    will not change (it is not behind NAT).
1305  *
1306  *    Keepalives cannot be used for these purposes, as they are not
1307  *    authenticated, but any IKE authenticated IKE packet or ESP packet can be
1308  *    used to detect whether the IP address or the port has changed.
1309  *
1310  * The following function will check an SA and its explicitly-set pair to see
1311  * if the NAT-T remote port matches the received packet (which must have
1312  * passed ESP authentication, see esp_in_done() for the caller context).  If
1313  * there is a mismatch, the SAs are updated.  It is not important if we race
1314  * with a transmitting thread, as if there is a transmitting thread, it will
1315  * merely emit a packet that will most-likely be dropped.
1316  *
1317  * "ports" are ordered src,dst, and assoc is an inbound SA, where src should
1318  * match ipsa_remote_nat_port and dst should match ipsa_local_nat_port.
1319  */
1320 #ifdef _LITTLE_ENDIAN
1321 #define	FIRST_16(x) ((x) & 0xFFFF)
1322 #define	NEXT_16(x) (((x) >> 16) & 0xFFFF)
1323 #else
1324 #define	FIRST_16(x) (((x) >> 16) & 0xFFFF)
1325 #define	NEXT_16(x) ((x) & 0xFFFF)
1326 #endif
1327 static void
1328 esp_port_freshness(uint32_t ports, ipsa_t *assoc)
1329 {
1330 	uint16_t remote = FIRST_16(ports);
1331 	uint16_t local = NEXT_16(ports);
1332 	ipsa_t *outbound_peer;
1333 	isaf_t *bucket;
1334 	ipsecesp_stack_t *espstack = assoc->ipsa_netstack->netstack_ipsecesp;
1335 
1336 	/* We found a conn_t, therefore local != 0. */
1337 	ASSERT(local != 0);
1338 	/* Assume an IPv4 SA. */
1339 	ASSERT(assoc->ipsa_addrfam == AF_INET);
1340 
1341 	/*
1342 	 * On-the-wire rport == 0 means something's very wrong.
1343 	 * An unpaired SA is also useless to us.
1344 	 * If we are behind the NAT, don't bother.
1345 	 * A zero local NAT port defaults to 4500, so check that too.
1346 	 * And, of course, if the ports already match, we don't need to
1347 	 * bother.
1348 	 */
1349 	if (remote == 0 || assoc->ipsa_otherspi == 0 ||
1350 	    (assoc->ipsa_flags & IPSA_F_BEHIND_NAT) ||
1351 	    (assoc->ipsa_remote_nat_port == 0 &&
1352 	    remote == htons(IPPORT_IKE_NATT)) ||
1353 	    remote == assoc->ipsa_remote_nat_port)
1354 		return;
1355 
1356 	/* Try and snag the peer.   NOTE:  Assume IPv4 for now. */
1357 	bucket = OUTBOUND_BUCKET_V4(&(espstack->esp_sadb.s_v4),
1358 	    assoc->ipsa_srcaddr[0]);
1359 	mutex_enter(&bucket->isaf_lock);
1360 	outbound_peer = ipsec_getassocbyspi(bucket, assoc->ipsa_otherspi,
1361 	    assoc->ipsa_dstaddr, assoc->ipsa_srcaddr, AF_INET);
1362 	mutex_exit(&bucket->isaf_lock);
1363 
1364 	/* We probably lost a race to a deleting or expiring thread. */
1365 	if (outbound_peer == NULL)
1366 		return;
1367 
1368 	/*
1369 	 * Hold the mutexes for both SAs so we don't race another inbound
1370 	 * thread.  A lock-entry order shouldn't matter, since all other
1371 	 * per-ipsa locks are individually held-then-released.
1372 	 *
1373 	 * Luckily, this has nothing to do with the remote-NAT address,
1374 	 * so we don't have to re-scribble the cached-checksum differential.
1375 	 */
1376 	mutex_enter(&outbound_peer->ipsa_lock);
1377 	mutex_enter(&assoc->ipsa_lock);
1378 	outbound_peer->ipsa_remote_nat_port = assoc->ipsa_remote_nat_port =
1379 	    remote;
1380 	mutex_exit(&assoc->ipsa_lock);
1381 	mutex_exit(&outbound_peer->ipsa_lock);
1382 	IPSA_REFRELE(outbound_peer);
1383 	ESP_BUMP_STAT(espstack, sa_port_renumbers);
1384 }
1385 /*
1386  * Finish processing of an inbound ESP packet after processing by the
1387  * crypto framework.
1388  * - Remove the ESP header.
1389  * - Send packet back to IP.
1390  * If authentication was performed on the packet, this function is called
1391  * only if the authentication succeeded.
1392  * On success returns B_TRUE, on failure returns B_FALSE and frees the
1393  * mblk chain data_mp.
1394  */
1395 static mblk_t *
1396 esp_in_done(mblk_t *data_mp, ip_recv_attr_t *ira, ipsec_crypto_t *ic)
1397 {
1398 	ipsa_t *assoc;
1399 	uint_t espstart;
1400 	uint32_t ivlen = 0;
1401 	uint_t processed_len;
1402 	esph_t *esph;
1403 	kstat_named_t *counter;
1404 	boolean_t is_natt;
1405 	netstack_t	*ns = ira->ira_ill->ill_ipst->ips_netstack;
1406 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1407 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1408 
1409 	assoc = ira->ira_ipsec_esp_sa;
1410 	ASSERT(assoc != NULL);
1411 
1412 	is_natt = ((assoc->ipsa_flags & IPSA_F_NATT) != 0);
1413 
1414 	/* get the pointer to the ESP header */
1415 	if (assoc->ipsa_encr_alg == SADB_EALG_NULL) {
1416 		/* authentication-only ESP */
1417 		espstart = ic->ic_crypto_data.cd_offset;
1418 		processed_len = ic->ic_crypto_data.cd_length;
1419 	} else {
1420 		/* encryption present */
1421 		ivlen = assoc->ipsa_iv_len;
1422 		if (assoc->ipsa_auth_alg == SADB_AALG_NONE) {
1423 			/* encryption-only ESP */
1424 			espstart = ic->ic_crypto_data.cd_offset -
1425 			    sizeof (esph_t) - assoc->ipsa_iv_len;
1426 			processed_len = ic->ic_crypto_data.cd_length +
1427 			    ivlen;
1428 		} else {
1429 			/* encryption with authentication */
1430 			espstart = ic->ic_crypto_dual_data.dd_offset1;
1431 			processed_len = ic->ic_crypto_dual_data.dd_len2 +
1432 			    ivlen;
1433 		}
1434 	}
1435 
1436 	esph = (esph_t *)(data_mp->b_rptr + espstart);
1437 
1438 	if (assoc->ipsa_auth_alg != IPSA_AALG_NONE ||
1439 	    (assoc->ipsa_flags & IPSA_F_COMBINED)) {
1440 		/*
1441 		 * Authentication passed if we reach this point.
1442 		 * Packets with authentication will have the ICV
1443 		 * after the crypto data. Adjust b_wptr before
1444 		 * making padlen checks.
1445 		 */
1446 		ESP_BUMP_STAT(espstack, good_auth);
1447 		data_mp->b_wptr -= assoc->ipsa_mac_len;
1448 
1449 		/*
1450 		 * Check replay window here!
1451 		 * For right now, assume keysock will set the replay window
1452 		 * size to zero for SAs that have an unspecified sender.
1453 		 * This may change...
1454 		 */
1455 
1456 		if (!sadb_replay_check(assoc, esph->esph_replay)) {
1457 			/*
1458 			 * Log the event. As of now we print out an event.
1459 			 * Do not print the replay failure number, or else
1460 			 * syslog cannot collate the error messages.  Printing
1461 			 * the replay number that failed opens a denial-of-
1462 			 * service attack.
1463 			 */
1464 			ipsec_assocfailure(info.mi_idnum, 0, 0,
1465 			    SL_ERROR | SL_WARN,
1466 			    "Replay failed for ESP spi 0x%x, dst %s.\n",
1467 			    assoc->ipsa_spi, assoc->ipsa_dstaddr,
1468 			    assoc->ipsa_addrfam, espstack->ipsecesp_netstack);
1469 			ESP_BUMP_STAT(espstack, replay_failures);
1470 			counter = DROPPER(ipss, ipds_esp_replay);
1471 			goto drop_and_bail;
1472 		}
1473 
1474 		if (is_natt) {
1475 			ASSERT(ira->ira_flags & IRAF_ESP_UDP_PORTS);
1476 			ASSERT(ira->ira_esp_udp_ports != 0);
1477 			esp_port_freshness(ira->ira_esp_udp_ports, assoc);
1478 		}
1479 	}
1480 
1481 	esp_set_usetime(assoc, B_TRUE);
1482 
1483 	if (!esp_age_bytes(assoc, processed_len, B_TRUE)) {
1484 		/* The ipsa has hit hard expiration, LOG and AUDIT. */
1485 		ipsec_assocfailure(info.mi_idnum, 0, 0,
1486 		    SL_ERROR | SL_WARN,
1487 		    "ESP association 0x%x, dst %s had bytes expire.\n",
1488 		    assoc->ipsa_spi, assoc->ipsa_dstaddr, assoc->ipsa_addrfam,
1489 		    espstack->ipsecesp_netstack);
1490 		ESP_BUMP_STAT(espstack, bytes_expired);
1491 		counter = DROPPER(ipss, ipds_esp_bytes_expire);
1492 		goto drop_and_bail;
1493 	}
1494 
1495 	/*
1496 	 * Remove ESP header and padding from packet.  I hope the compiler
1497 	 * spews "branch, predict taken" code for this.
1498 	 */
1499 
1500 	if (esp_strip_header(data_mp, (ira->ira_flags & IRAF_IS_IPV4),
1501 	    ivlen, &counter, espstack)) {
1502 
1503 		if (is_system_labeled() && assoc->ipsa_tsl != NULL) {
1504 			if (!ip_recv_attr_replace_label(ira, assoc->ipsa_tsl)) {
1505 				ip_drop_packet(data_mp, B_TRUE, ira->ira_ill,
1506 				    DROPPER(ipss, ipds_ah_nomem),
1507 				    &espstack->esp_dropper);
1508 				BUMP_MIB(ira->ira_ill->ill_ip_mib,
1509 				    ipIfStatsInDiscards);
1510 				return (NULL);
1511 			}
1512 		}
1513 		if (is_natt)
1514 			return (esp_fix_natt_checksums(data_mp, assoc));
1515 
1516 		if (assoc->ipsa_state == IPSA_STATE_IDLE) {
1517 			/*
1518 			 * Cluster buffering case.  Tell caller that we're
1519 			 * handling the packet.
1520 			 */
1521 			sadb_buf_pkt(assoc, data_mp, ira);
1522 			return (NULL);
1523 		}
1524 
1525 		return (data_mp);
1526 	}
1527 
1528 	esp1dbg(espstack, ("esp_in_done: esp_strip_header() failed\n"));
1529 drop_and_bail:
1530 	IP_ESP_BUMP_STAT(ipss, in_discards);
1531 	ip_drop_packet(data_mp, B_TRUE, ira->ira_ill, counter,
1532 	    &espstack->esp_dropper);
1533 	BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1534 	return (NULL);
1535 }
1536 
1537 /*
1538  * Called upon failing the inbound ICV check. The message passed as
1539  * argument is freed.
1540  */
1541 static void
1542 esp_log_bad_auth(mblk_t *mp, ip_recv_attr_t *ira)
1543 {
1544 	ipsa_t		*assoc = ira->ira_ipsec_esp_sa;
1545 	netstack_t	*ns = ira->ira_ill->ill_ipst->ips_netstack;
1546 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1547 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1548 
1549 	/*
1550 	 * Log the event. Don't print to the console, block
1551 	 * potential denial-of-service attack.
1552 	 */
1553 	ESP_BUMP_STAT(espstack, bad_auth);
1554 
1555 	ipsec_assocfailure(info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
1556 	    "ESP Authentication failed for spi 0x%x, dst %s.\n",
1557 	    assoc->ipsa_spi, assoc->ipsa_dstaddr, assoc->ipsa_addrfam,
1558 	    espstack->ipsecesp_netstack);
1559 
1560 	IP_ESP_BUMP_STAT(ipss, in_discards);
1561 	ip_drop_packet(mp, B_TRUE, ira->ira_ill,
1562 	    DROPPER(ipss, ipds_esp_bad_auth),
1563 	    &espstack->esp_dropper);
1564 }
1565 
1566 
1567 /*
1568  * Invoked for outbound packets after ESP processing. If the packet
1569  * also requires AH, performs the AH SA selection and AH processing.
1570  *
1571  * Returns data_mp (possibly with AH added) unless data_mp was consumed
1572  * due to an error, or queued due to async. crypto or an ACQUIRE trigger.
1573  */
1574 static mblk_t *
1575 esp_do_outbound_ah(mblk_t *data_mp, ip_xmit_attr_t *ixa)
1576 {
1577 	ipsec_action_t *ap;
1578 
1579 	ap = ixa->ixa_ipsec_action;
1580 	if (ap == NULL) {
1581 		ipsec_policy_t *pp = ixa->ixa_ipsec_policy;
1582 		ap = pp->ipsp_act;
1583 	}
1584 
1585 	if (!ap->ipa_want_ah)
1586 		return (data_mp);
1587 
1588 	/*
1589 	 * Normally the AH SA would have already been put in place
1590 	 * but it could have been flushed so we need to look for it.
1591 	 */
1592 	if (ixa->ixa_ipsec_ah_sa == NULL) {
1593 		if (!ipsec_outbound_sa(data_mp, ixa, IPPROTO_AH)) {
1594 			sadb_acquire(data_mp, ixa, B_TRUE, B_FALSE);
1595 			return (NULL);
1596 		}
1597 	}
1598 	ASSERT(ixa->ixa_ipsec_ah_sa != NULL);
1599 
1600 	data_mp = ixa->ixa_ipsec_ah_sa->ipsa_output_func(data_mp, ixa);
1601 	return (data_mp);
1602 }
1603 
1604 
1605 /*
1606  * Kernel crypto framework callback invoked after completion of async
1607  * crypto requests for outbound packets.
1608  */
1609 static void
1610 esp_kcf_callback_outbound(void *arg, int status)
1611 {
1612 	mblk_t		*mp = (mblk_t *)arg;
1613 	mblk_t		*async_mp;
1614 	netstack_t	*ns;
1615 	ipsec_stack_t	*ipss;
1616 	ipsecesp_stack_t *espstack;
1617 	mblk_t		*data_mp;
1618 	ip_xmit_attr_t	ixas;
1619 	ipsec_crypto_t	*ic;
1620 	ill_t		*ill;
1621 
1622 	/*
1623 	 * First remove the ipsec_crypto_t mblk
1624 	 * Note that we need to ipsec_free_crypto_data(mp) once done with ic.
1625 	 */
1626 	async_mp = ipsec_remove_crypto_data(mp, &ic);
1627 	ASSERT(async_mp != NULL);
1628 
1629 	/*
1630 	 * Extract the ip_xmit_attr_t from the first mblk.
1631 	 * Verifies that the netstack and ill is still around; could
1632 	 * have vanished while kEf was doing its work.
1633 	 * On succesful return we have a nce_t and the ill/ipst can't
1634 	 * disappear until we do the nce_refrele in ixa_cleanup.
1635 	 */
1636 	data_mp = async_mp->b_cont;
1637 	async_mp->b_cont = NULL;
1638 	if (!ip_xmit_attr_from_mblk(async_mp, &ixas)) {
1639 		/* Disappeared on us - no ill/ipst for MIB */
1640 		/* We have nowhere to do stats since ixa_ipst could be NULL */
1641 		if (ixas.ixa_nce != NULL) {
1642 			ill = ixas.ixa_nce->nce_ill;
1643 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1644 			ip_drop_output("ipIfStatsOutDiscards", data_mp, ill);
1645 		}
1646 		freemsg(data_mp);
1647 		goto done;
1648 	}
1649 	ns = ixas.ixa_ipst->ips_netstack;
1650 	espstack = ns->netstack_ipsecesp;
1651 	ipss = ns->netstack_ipsec;
1652 	ill = ixas.ixa_nce->nce_ill;
1653 
1654 	if (status == CRYPTO_SUCCESS) {
1655 		/*
1656 		 * If a ICV was computed, it was stored by the
1657 		 * crypto framework at the end of the packet.
1658 		 */
1659 		ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
1660 
1661 		esp_set_usetime(ixas.ixa_ipsec_esp_sa, B_FALSE);
1662 		/* NAT-T packet. */
1663 		if (IPH_HDR_VERSION(ipha) == IP_VERSION &&
1664 		    ipha->ipha_protocol == IPPROTO_UDP)
1665 			esp_prepare_udp(ns, data_mp, ipha);
1666 
1667 		/* do AH processing if needed */
1668 		data_mp = esp_do_outbound_ah(data_mp, &ixas);
1669 		if (data_mp == NULL)
1670 			goto done;
1671 
1672 		(void) ip_output_post_ipsec(data_mp, &ixas);
1673 	} else {
1674 		/* Outbound shouldn't see invalid MAC */
1675 		ASSERT(status != CRYPTO_INVALID_MAC);
1676 
1677 		esp1dbg(espstack,
1678 		    ("esp_kcf_callback_outbound: crypto failed with 0x%x\n",
1679 		    status));
1680 		ESP_BUMP_STAT(espstack, crypto_failures);
1681 		ESP_BUMP_STAT(espstack, out_discards);
1682 		ip_drop_packet(data_mp, B_FALSE, ill,
1683 		    DROPPER(ipss, ipds_esp_crypto_failed),
1684 		    &espstack->esp_dropper);
1685 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1686 	}
1687 done:
1688 	ixa_cleanup(&ixas);
1689 	(void) ipsec_free_crypto_data(mp);
1690 }
1691 
1692 /*
1693  * Kernel crypto framework callback invoked after completion of async
1694  * crypto requests for inbound packets.
1695  */
1696 static void
1697 esp_kcf_callback_inbound(void *arg, int status)
1698 {
1699 	mblk_t		*mp = (mblk_t *)arg;
1700 	mblk_t		*async_mp;
1701 	netstack_t	*ns;
1702 	ipsecesp_stack_t *espstack;
1703 	ipsec_stack_t	*ipss;
1704 	mblk_t		*data_mp;
1705 	ip_recv_attr_t	iras;
1706 	ipsec_crypto_t	*ic;
1707 
1708 	/*
1709 	 * First remove the ipsec_crypto_t mblk
1710 	 * Note that we need to ipsec_free_crypto_data(mp) once done with ic.
1711 	 */
1712 	async_mp = ipsec_remove_crypto_data(mp, &ic);
1713 	ASSERT(async_mp != NULL);
1714 
1715 	/*
1716 	 * Extract the ip_recv_attr_t from the first mblk.
1717 	 * Verifies that the netstack and ill is still around; could
1718 	 * have vanished while kEf was doing its work.
1719 	 */
1720 	data_mp = async_mp->b_cont;
1721 	async_mp->b_cont = NULL;
1722 	if (!ip_recv_attr_from_mblk(async_mp, &iras)) {
1723 		/* The ill or ip_stack_t disappeared on us */
1724 		ip_drop_input("ip_recv_attr_from_mblk", data_mp, NULL);
1725 		freemsg(data_mp);
1726 		goto done;
1727 	}
1728 
1729 	ns = iras.ira_ill->ill_ipst->ips_netstack;
1730 	espstack = ns->netstack_ipsecesp;
1731 	ipss = ns->netstack_ipsec;
1732 
1733 	if (status == CRYPTO_SUCCESS) {
1734 		data_mp = esp_in_done(data_mp, &iras, ic);
1735 		if (data_mp == NULL)
1736 			goto done;
1737 
1738 		/* finish IPsec processing */
1739 		ip_input_post_ipsec(data_mp, &iras);
1740 	} else if (status == CRYPTO_INVALID_MAC) {
1741 		esp_log_bad_auth(data_mp, &iras);
1742 	} else {
1743 		esp1dbg(espstack,
1744 		    ("esp_kcf_callback: crypto failed with 0x%x\n",
1745 		    status));
1746 		ESP_BUMP_STAT(espstack, crypto_failures);
1747 		IP_ESP_BUMP_STAT(ipss, in_discards);
1748 		ip_drop_packet(data_mp, B_TRUE, iras.ira_ill,
1749 		    DROPPER(ipss, ipds_esp_crypto_failed),
1750 		    &espstack->esp_dropper);
1751 		BUMP_MIB(iras.ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1752 	}
1753 done:
1754 	ira_cleanup(&iras, B_TRUE);
1755 	(void) ipsec_free_crypto_data(mp);
1756 }
1757 
1758 /*
1759  * Invoked on crypto framework failure during inbound and outbound processing.
1760  */
1761 static void
1762 esp_crypto_failed(mblk_t *data_mp, boolean_t is_inbound, int kef_rc,
1763     ill_t *ill, ipsecesp_stack_t *espstack)
1764 {
1765 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
1766 
1767 	esp1dbg(espstack, ("crypto failed for %s ESP with 0x%x\n",
1768 	    is_inbound ? "inbound" : "outbound", kef_rc));
1769 	ip_drop_packet(data_mp, is_inbound, ill,
1770 	    DROPPER(ipss, ipds_esp_crypto_failed),
1771 	    &espstack->esp_dropper);
1772 	ESP_BUMP_STAT(espstack, crypto_failures);
1773 	if (is_inbound)
1774 		IP_ESP_BUMP_STAT(ipss, in_discards);
1775 	else
1776 		ESP_BUMP_STAT(espstack, out_discards);
1777 }
1778 
/*
 * Initializers for the structures handed to the kernel crypto framework.
 *
 * Every macro below expands to exactly one statement (do { } while (0)),
 * so each can be used anywhere a statement is allowed, including as the
 * body of an unbraced if/else.  Arguments are parenthesized in the
 * expansion, but several of them are evaluated more than once, so do not
 * pass expressions with side effects.
 */

/*
 * Set up an asynchronous call request: _cr MUST point to a modifiable
 * crypto_call_req_t, _mp is passed back to _callback as its argument.
 */
#define	ESP_INIT_CALLREQ(_cr, _mp, _callback) do {			\
	(_cr)->cr_flag = CRYPTO_SKIP_REQID|CRYPTO_ALWAYS_QUEUE;	\
	(_cr)->cr_callback_arg = (_mp);				\
	(_cr)->cr_callback_func = (_callback);				\
} while (0)

/*
 * Describe an ICV of icvlen bytes located at icvbuf as a raw crypto data
 * buffer.
 */
#define	ESP_INIT_CRYPTO_MAC(mac, icvlen, icvbuf) do {			\
	(mac)->cd_format = CRYPTO_DATA_RAW;				\
	(mac)->cd_offset = 0;						\
	(mac)->cd_length = (icvlen);					\
	(mac)->cd_raw.iov_base = (char *)(icvbuf);			\
	(mac)->cd_raw.iov_len = (icvlen);				\
} while (0)

/*
 * Describe len bytes starting off bytes into mp.  When the first mblk
 * holds the whole range it is described as a raw buffer, otherwise the
 * mblk chain itself is handed over.
 */
#define	ESP_INIT_CRYPTO_DATA(data, mp, off, len) do {			\
	if (MBLKL(mp) >= (len) + (off)) {				\
		(data)->cd_format = CRYPTO_DATA_RAW;			\
		(data)->cd_raw.iov_base = (char *)(mp)->b_rptr;		\
		(data)->cd_raw.iov_len = MBLKL(mp);			\
		(data)->cd_offset = (off);				\
	} else {							\
		(data)->cd_format = CRYPTO_DATA_MBLK;			\
		(data)->cd_mp = (mp);					\
		(data)->cd_offset = (off);				\
	}								\
	(data)->cd_length = (len);					\
} while (0)

/*
 * Describe two (offset, length) ranges over the same mblk chain for a
 * dual operation.
 */
#define	ESP_INIT_CRYPTO_DUAL_DATA(data, mp, off1, len1, off2, len2) do { \
	(data)->dd_format = CRYPTO_DATA_MBLK;				\
	(data)->dd_mp = (mp);						\
	(data)->dd_len1 = (len1);					\
	(data)->dd_offset1 = (off1);					\
	(data)->dd_len2 = (len2);					\
	(data)->dd_offset2 = (off2);					\
} while (0)
1818 
1819 /*
1820  * Returns data_mp if successfully completed the request. Returns
1821  * NULL if it failed (and increments InDiscards) or if it is pending.
1822  */
1823 static mblk_t *
1824 esp_submit_req_inbound(mblk_t *esp_mp, ip_recv_attr_t *ira,
1825     ipsa_t *assoc, uint_t esph_offset)
1826 {
1827 	uint_t auth_offset, msg_len, auth_len;
1828 	crypto_call_req_t call_req, *callrp;
1829 	mblk_t *mp;
1830 	esph_t *esph_ptr;
1831 	int kef_rc;
1832 	uint_t icv_len = assoc->ipsa_mac_len;
1833 	crypto_ctx_template_t auth_ctx_tmpl;
1834 	boolean_t do_auth, do_encr, force;
1835 	uint_t encr_offset, encr_len;
1836 	uint_t iv_len = assoc->ipsa_iv_len;
1837 	crypto_ctx_template_t encr_ctx_tmpl;
1838 	ipsec_crypto_t	*ic, icstack;
1839 	uchar_t *iv_ptr;
1840 	netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack;
1841 	ipsec_stack_t *ipss = ns->netstack_ipsec;
1842 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1843 
1844 	do_auth = assoc->ipsa_auth_alg != SADB_AALG_NONE;
1845 	do_encr = assoc->ipsa_encr_alg != SADB_EALG_NULL;
1846 	force = (assoc->ipsa_flags & IPSA_F_ASYNC);
1847 
1848 #ifdef IPSEC_LATENCY_TEST
1849 	kef_rc = CRYPTO_SUCCESS;
1850 #else
1851 	kef_rc = CRYPTO_FAILED;
1852 #endif
1853 
1854 	/*
1855 	 * An inbound packet is of the form:
1856 	 * [IP,options,ESP,IV,data,ICV,pad]
1857 	 */
1858 	esph_ptr = (esph_t *)(esp_mp->b_rptr + esph_offset);
1859 	iv_ptr = (uchar_t *)(esph_ptr + 1);
1860 	/* Packet length starting at IP header ending after ESP ICV. */
1861 	msg_len = MBLKL(esp_mp);
1862 
1863 	encr_offset = esph_offset + sizeof (esph_t) + iv_len;
1864 	encr_len = msg_len - encr_offset;
1865 
1866 	/*
1867 	 * Counter mode algs need a nonce. This is setup in sadb_common_add().
1868 	 * If for some reason we are using a SA which does not have a nonce
1869 	 * then we must fail here.
1870 	 */
1871 	if ((assoc->ipsa_flags & IPSA_F_COUNTERMODE) &&
1872 	    (assoc->ipsa_nonce == NULL)) {
1873 		ip_drop_packet(esp_mp, B_TRUE, ira->ira_ill,
1874 		    DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper);
1875 		return (NULL);
1876 	}
1877 
1878 	if (force) {
1879 		/* We are doing asynch; allocate mblks to hold state */
1880 		if ((mp = ip_recv_attr_to_mblk(ira)) == NULL ||
1881 		    (mp = ipsec_add_crypto_data(mp, &ic)) == NULL) {
1882 			BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1883 			ip_drop_input("ipIfStatsInDiscards", esp_mp,
1884 			    ira->ira_ill);
1885 			return (NULL);
1886 		}
1887 		linkb(mp, esp_mp);
1888 		callrp = &call_req;
1889 		ESP_INIT_CALLREQ(callrp, mp, esp_kcf_callback_inbound);
1890 	} else {
1891 		/*
1892 		 * If we know we are going to do sync then ipsec_crypto_t
1893 		 * should be on the stack.
1894 		 */
1895 		ic = &icstack;
1896 		bzero(ic, sizeof (*ic));
1897 		callrp = NULL;
1898 	}
1899 
1900 	if (do_auth) {
1901 		/* authentication context template */
1902 		IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH,
1903 		    auth_ctx_tmpl);
1904 
1905 		/* ICV to be verified */
1906 		ESP_INIT_CRYPTO_MAC(&ic->ic_crypto_mac,
1907 		    icv_len, esp_mp->b_wptr - icv_len);
1908 
1909 		/* authentication starts at the ESP header */
1910 		auth_offset = esph_offset;
1911 		auth_len = msg_len - auth_offset - icv_len;
1912 		if (!do_encr) {
1913 			/* authentication only */
1914 			/* initialize input data argument */
1915 			ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data,
1916 			    esp_mp, auth_offset, auth_len);
1917 
1918 			/* call the crypto framework */
1919 			kef_rc = crypto_mac_verify(&assoc->ipsa_amech,
1920 			    &ic->ic_crypto_data,
1921 			    &assoc->ipsa_kcfauthkey, auth_ctx_tmpl,
1922 			    &ic->ic_crypto_mac, callrp);
1923 		}
1924 	}
1925 
1926 	if (do_encr) {
1927 		/* encryption template */
1928 		IPSEC_CTX_TMPL(assoc, ipsa_encrtmpl, IPSEC_ALG_ENCR,
1929 		    encr_ctx_tmpl);
1930 
1931 		/* Call the nonce update function. Also passes in IV */
1932 		(assoc->ipsa_noncefunc)(assoc, (uchar_t *)esph_ptr, encr_len,
1933 		    iv_ptr, &ic->ic_cmm, &ic->ic_crypto_data);
1934 
1935 		if (!do_auth) {
1936 			/* decryption only */
1937 			/* initialize input data argument */
1938 			ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data,
1939 			    esp_mp, encr_offset, encr_len);
1940 
1941 			/* call the crypto framework */
1942 			kef_rc = crypto_decrypt((crypto_mechanism_t *)
1943 			    &ic->ic_cmm, &ic->ic_crypto_data,
1944 			    &assoc->ipsa_kcfencrkey, encr_ctx_tmpl,
1945 			    NULL, callrp);
1946 		}
1947 	}
1948 
1949 	if (do_auth && do_encr) {
1950 		/* dual operation */
1951 		/* initialize input data argument */
1952 		ESP_INIT_CRYPTO_DUAL_DATA(&ic->ic_crypto_dual_data,
1953 		    esp_mp, auth_offset, auth_len,
1954 		    encr_offset, encr_len - icv_len);
1955 
1956 		/* specify IV */
1957 		ic->ic_crypto_dual_data.dd_miscdata = (char *)iv_ptr;
1958 
1959 		/* call the framework */
1960 		kef_rc = crypto_mac_verify_decrypt(&assoc->ipsa_amech,
1961 		    &assoc->ipsa_emech, &ic->ic_crypto_dual_data,
1962 		    &assoc->ipsa_kcfauthkey, &assoc->ipsa_kcfencrkey,
1963 		    auth_ctx_tmpl, encr_ctx_tmpl, &ic->ic_crypto_mac,
1964 		    NULL, callrp);
1965 	}
1966 
1967 	switch (kef_rc) {
1968 	case CRYPTO_SUCCESS:
1969 		ESP_BUMP_STAT(espstack, crypto_sync);
1970 		esp_mp = esp_in_done(esp_mp, ira, ic);
1971 		if (force) {
1972 			/* Free mp after we are done with ic */
1973 			mp = ipsec_free_crypto_data(mp);
1974 			(void) ip_recv_attr_free_mblk(mp);
1975 		}
1976 		return (esp_mp);
1977 	case CRYPTO_QUEUED:
1978 		/* esp_kcf_callback_inbound() will be invoked on completion */
1979 		ESP_BUMP_STAT(espstack, crypto_async);
1980 		return (NULL);
1981 	case CRYPTO_INVALID_MAC:
1982 		if (force) {
1983 			mp = ipsec_free_crypto_data(mp);
1984 			esp_mp = ip_recv_attr_free_mblk(mp);
1985 		}
1986 		ESP_BUMP_STAT(espstack, crypto_sync);
1987 		BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1988 		esp_log_bad_auth(esp_mp, ira);
1989 		/* esp_mp was passed to ip_drop_packet */
1990 		return (NULL);
1991 	}
1992 
1993 	if (force) {
1994 		mp = ipsec_free_crypto_data(mp);
1995 		esp_mp = ip_recv_attr_free_mblk(mp);
1996 	}
1997 	BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1998 	esp_crypto_failed(esp_mp, B_TRUE, kef_rc, ira->ira_ill, espstack);
1999 	/* esp_mp was passed to ip_drop_packet */
2000 	return (NULL);
2001 }
2002 
2003 /*
2004  * Compute the IP and UDP checksums -- common code for both keepalives and
2005  * actual ESP-in-UDP packets.  Be flexible with multiple mblks because ESP
2006  * uses mblk-insertion to insert the UDP header.
2007  * TODO - If there is an easy way to prep a packet for HW checksums, make
2008  * it happen here.
2009  * Note that this is used before both before calling ip_output_simple and
2010  * in the esp datapath. The former could use IXAF_SET_ULP_CKSUM but not the
2011  * latter.
2012  */
2013 static void
2014 esp_prepare_udp(netstack_t *ns, mblk_t *mp, ipha_t *ipha)
2015 {
2016 	int offset;
2017 	uint32_t cksum;
2018 	uint16_t *arr;
2019 	mblk_t *udpmp = mp;
2020 	uint_t hlen = IPH_HDR_LENGTH(ipha);
2021 
2022 	ASSERT(MBLKL(mp) >= sizeof (ipha_t));
2023 
2024 	ipha->ipha_hdr_checksum = 0;
2025 	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2026 
2027 	if (ns->netstack_udp->us_do_checksum) {
2028 		ASSERT(MBLKL(udpmp) >= sizeof (udpha_t));
2029 		/* arr points to the IP header. */
2030 		arr = (uint16_t *)ipha;
2031 		IP_STAT(ns->netstack_ip, ip_out_sw_cksum);
2032 		IP_STAT_UPDATE(ns->netstack_ip, ip_out_sw_cksum_bytes,
2033 		    ntohs(htons(ipha->ipha_length) - hlen));
2034 		/* arr[6-9] are the IP addresses. */
2035 		cksum = IP_UDP_CSUM_COMP + arr[6] + arr[7] + arr[8] + arr[9] +
2036 		    ntohs(htons(ipha->ipha_length) - hlen);
2037 		cksum = IP_CSUM(mp, hlen, cksum);
2038 		offset = hlen + UDP_CHECKSUM_OFFSET;
2039 		while (offset >= MBLKL(udpmp)) {
2040 			offset -= MBLKL(udpmp);
2041 			udpmp = udpmp->b_cont;
2042 		}
2043 		/* arr points to the UDP header's checksum field. */
2044 		arr = (uint16_t *)(udpmp->b_rptr + offset);
2045 		*arr = cksum;
2046 	}
2047 }
2048 
2049 /*
2050  * taskq handler so we can send the NAT-T keepalive on a separate thread.
2051  */
2052 static void
2053 actually_send_keepalive(void *arg)
2054 {
2055 	mblk_t *mp = (mblk_t *)arg;
2056 	ip_xmit_attr_t ixas;
2057 	netstack_t	*ns;
2058 	netstackid_t	stackid;
2059 
2060 	stackid = (netstackid_t)(uintptr_t)mp->b_prev;
2061 	mp->b_prev = NULL;
2062 	ns = netstack_find_by_stackid(stackid);
2063 	if (ns == NULL) {
2064 		/* Disappeared */
2065 		ip_drop_output("ipIfStatsOutDiscards", mp, NULL);
2066 		freemsg(mp);
2067 		return;
2068 	}
2069 
2070 	bzero(&ixas, sizeof (ixas));
2071 	ixas.ixa_zoneid = ALL_ZONES;
2072 	ixas.ixa_cred = kcred;
2073 	ixas.ixa_cpid = NOPID;
2074 	ixas.ixa_tsl = NULL;
2075 	ixas.ixa_ipst = ns->netstack_ip;
2076 	/* No ULP checksum; done by esp_prepare_udp */
2077 	ixas.ixa_flags = (IXAF_IS_IPV4 | IXAF_NO_IPSEC | IXAF_VERIFY_SOURCE);
2078 
2079 	(void) ip_output_simple(mp, &ixas);
2080 	ixa_cleanup(&ixas);
2081 	netstack_rele(ns);
2082 }
2083 
2084 /*
2085  * Send a one-byte UDP NAT-T keepalive.
2086  */
2087 void
2088 ipsecesp_send_keepalive(ipsa_t *assoc)
2089 {
2090 	mblk_t		*mp;
2091 	ipha_t		*ipha;
2092 	udpha_t		*udpha;
2093 	netstack_t	*ns = assoc->ipsa_netstack;
2094 
2095 	ASSERT(MUTEX_NOT_HELD(&assoc->ipsa_lock));
2096 
2097 	mp = allocb(sizeof (ipha_t) + sizeof (udpha_t) + 1, BPRI_HI);
2098 	if (mp == NULL)
2099 		return;
2100 	ipha = (ipha_t *)mp->b_rptr;
2101 	ipha->ipha_version_and_hdr_length = IP_SIMPLE_HDR_VERSION;
2102 	ipha->ipha_type_of_service = 0;
2103 	ipha->ipha_length = htons(sizeof (ipha_t) + sizeof (udpha_t) + 1);
2104 	/* Use the low-16 of the SPI so we have some clue where it came from. */
2105 	ipha->ipha_ident = *(((uint16_t *)(&assoc->ipsa_spi)) + 1);
2106 	ipha->ipha_fragment_offset_and_flags = 0;  /* Too small to fragment! */
2107 	ipha->ipha_ttl = 0xFF;
2108 	ipha->ipha_protocol = IPPROTO_UDP;
2109 	ipha->ipha_hdr_checksum = 0;
2110 	ipha->ipha_src = assoc->ipsa_srcaddr[0];
2111 	ipha->ipha_dst = assoc->ipsa_dstaddr[0];
2112 	udpha = (udpha_t *)(ipha + 1);
2113 	udpha->uha_src_port = (assoc->ipsa_local_nat_port != 0) ?
2114 	    assoc->ipsa_local_nat_port : htons(IPPORT_IKE_NATT);
2115 	udpha->uha_dst_port = (assoc->ipsa_remote_nat_port != 0) ?
2116 	    assoc->ipsa_remote_nat_port : htons(IPPORT_IKE_NATT);
2117 	udpha->uha_length = htons(sizeof (udpha_t) + 1);
2118 	udpha->uha_checksum = 0;
2119 	mp->b_wptr = (uint8_t *)(udpha + 1);
2120 	*(mp->b_wptr++) = 0xFF;
2121 
2122 	esp_prepare_udp(ns, mp, ipha);
2123 
2124 	/*
2125 	 * We're holding an isaf_t bucket lock, so pawn off the actual
2126 	 * packet transmission to another thread.  Just in case syncq
2127 	 * processing causes a same-bucket packet to be processed.
2128 	 */
2129 	mp->b_prev = (mblk_t *)(uintptr_t)ns->netstack_stackid;
2130 
2131 	if (taskq_dispatch(esp_taskq, actually_send_keepalive, mp,
2132 	    TQ_NOSLEEP) == 0) {
2133 		/* Assume no memory if taskq_dispatch() fails. */
2134 		mp->b_prev = NULL;
2135 		ip_drop_packet(mp, B_FALSE, NULL,
2136 		    DROPPER(ns->netstack_ipsec, ipds_esp_nomem),
2137 		    &ns->netstack_ipsecesp->esp_dropper);
2138 	}
2139 }
2140 
2141 /*
2142  * Returns mp if successfully completed the request. Returns
2143  * NULL if it failed (and increments InDiscards) or if it is pending.
2144  */
2145 static mblk_t *
2146 esp_submit_req_outbound(mblk_t *data_mp, ip_xmit_attr_t *ixa, ipsa_t *assoc,
2147     uchar_t *icv_buf, uint_t payload_len)
2148 {
2149 	uint_t auth_len;
2150 	crypto_call_req_t call_req, *callrp;
2151 	mblk_t *esp_mp;
2152 	esph_t *esph_ptr;
2153 	mblk_t *mp;
2154 	int kef_rc = CRYPTO_FAILED;
2155 	uint_t icv_len = assoc->ipsa_mac_len;
2156 	crypto_ctx_template_t auth_ctx_tmpl;
2157 	boolean_t do_auth, do_encr, force;
2158 	uint_t iv_len = assoc->ipsa_iv_len;
2159 	crypto_ctx_template_t encr_ctx_tmpl;
2160 	boolean_t is_natt = ((assoc->ipsa_flags & IPSA_F_NATT) != 0);
2161 	size_t esph_offset = (is_natt ? UDPH_SIZE : 0);
2162 	netstack_t	*ns = ixa->ixa_ipst->ips_netstack;
2163 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
2164 	ipsec_crypto_t	*ic, icstack;
2165 	uchar_t		*iv_ptr;
2166 	crypto_data_t	*cd_ptr = NULL;
2167 	ill_t		*ill = ixa->ixa_nce->nce_ill;
2168 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
2169 
2170 	esp3dbg(espstack, ("esp_submit_req_outbound:%s",
2171 	    is_natt ? "natt" : "not natt"));
2172 
2173 	do_encr = assoc->ipsa_encr_alg != SADB_EALG_NULL;
2174 	do_auth = assoc->ipsa_auth_alg != SADB_AALG_NONE;
2175 	force = (assoc->ipsa_flags & IPSA_F_ASYNC);
2176 
2177 #ifdef IPSEC_LATENCY_TEST
2178 	kef_rc = CRYPTO_SUCCESS;
2179 #else
2180 	kef_rc = CRYPTO_FAILED;
2181 #endif
2182 
2183 	/*
2184 	 * Outbound IPsec packets are of the form:
2185 	 * [IP,options] -> [ESP,IV] -> [data] -> [pad,ICV]
2186 	 * unless it's NATT, then it's
2187 	 * [IP,options] -> [udp][ESP,IV] -> [data] -> [pad,ICV]
2188 	 * Get a pointer to the mblk containing the ESP header.
2189 	 */
2190 	ASSERT(data_mp->b_cont != NULL);
2191 	esp_mp = data_mp->b_cont;
2192 	esph_ptr = (esph_t *)(esp_mp->b_rptr + esph_offset);
2193 	iv_ptr = (uchar_t *)(esph_ptr + 1);
2194 
2195 	/*
2196 	 * Combined mode algs need a nonce. This is setup in sadb_common_add().
2197 	 * If for some reason we are using a SA which does not have a nonce
2198 	 * then we must fail here.
2199 	 */
2200 	if ((assoc->ipsa_flags & IPSA_F_COUNTERMODE) &&
2201 	    (assoc->ipsa_nonce == NULL)) {
2202 		ip_drop_packet(data_mp, B_FALSE, NULL,
2203 		    DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper);
2204 		return (NULL);
2205 	}
2206 
2207 	if (force) {
2208 		/* We are doing asynch; allocate mblks to hold state */
2209 		if ((mp = ip_xmit_attr_to_mblk(ixa)) == NULL ||
2210 		    (mp = ipsec_add_crypto_data(mp, &ic)) == NULL) {
2211 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2212 			ip_drop_output("ipIfStatsOutDiscards", data_mp, ill);
2213 			freemsg(data_mp);
2214 			return (NULL);
2215 		}
2216 
2217 		linkb(mp, data_mp);
2218 		callrp = &call_req;
2219 		ESP_INIT_CALLREQ(callrp, mp, esp_kcf_callback_outbound);
2220 	} else {
2221 		/*
2222 		 * If we know we are going to do sync then ipsec_crypto_t
2223 		 * should be on the stack.
2224 		 */
2225 		ic = &icstack;
2226 		bzero(ic, sizeof (*ic));
2227 		callrp = NULL;
2228 	}
2229 
2230 
2231 	if (do_auth) {
2232 		/* authentication context template */
2233 		IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH,
2234 		    auth_ctx_tmpl);
2235 
2236 		/* where to store the computed mac */
2237 		ESP_INIT_CRYPTO_MAC(&ic->ic_crypto_mac,
2238 		    icv_len, icv_buf);
2239 
2240 		/* authentication starts at the ESP header */
2241 		auth_len = payload_len + iv_len + sizeof (esph_t);
2242 		if (!do_encr) {
2243 			/* authentication only */
2244 			/* initialize input data argument */
2245 			ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data,
2246 			    esp_mp, esph_offset, auth_len);
2247 
2248 			/* call the crypto framework */
2249 			kef_rc = crypto_mac(&assoc->ipsa_amech,
2250 			    &ic->ic_crypto_data,
2251 			    &assoc->ipsa_kcfauthkey, auth_ctx_tmpl,
2252 			    &ic->ic_crypto_mac, callrp);
2253 		}
2254 	}
2255 
2256 	if (do_encr) {
2257 		/* encryption context template */
2258 		IPSEC_CTX_TMPL(assoc, ipsa_encrtmpl, IPSEC_ALG_ENCR,
2259 		    encr_ctx_tmpl);
2260 		/* Call the nonce update function. */
2261 		(assoc->ipsa_noncefunc)(assoc, (uchar_t *)esph_ptr, payload_len,
2262 		    iv_ptr, &ic->ic_cmm, &ic->ic_crypto_data);
2263 
2264 		if (!do_auth) {
2265 			/* encryption only, skip mblk that contains ESP hdr */
2266 			/* initialize input data argument */
2267 			ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data,
2268 			    esp_mp->b_cont, 0, payload_len);
2269 
2270 			/*
2271 			 * For combined mode ciphers, the ciphertext is the same
2272 			 * size as the clear text, the ICV should follow the
2273 			 * ciphertext. To convince the kcf to allow in-line
2274 			 * encryption, with an ICV, use ipsec_out_crypto_mac
2275 			 * to point to the same buffer as the data. The calling
2276 			 * function need to ensure the buffer is large enough to
2277 			 * include the ICV.
2278 			 *
2279 			 * The IV is already written to the packet buffer, the
2280 			 * nonce setup function copied it to the params struct
2281 			 * for the cipher to use.
2282 			 */
2283 			if (assoc->ipsa_flags & IPSA_F_COMBINED) {
2284 				bcopy(&ic->ic_crypto_data,
2285 				    &ic->ic_crypto_mac,
2286 				    sizeof (crypto_data_t));
2287 				ic->ic_crypto_mac.cd_length =
2288 				    payload_len + icv_len;
2289 				cd_ptr = &ic->ic_crypto_mac;
2290 			}
2291 
2292 			/* call the crypto framework */
2293 			kef_rc = crypto_encrypt((crypto_mechanism_t *)
2294 			    &ic->ic_cmm, &ic->ic_crypto_data,
2295 			    &assoc->ipsa_kcfencrkey, encr_ctx_tmpl,
2296 			    cd_ptr, callrp);
2297 
2298 		}
2299 	}
2300 
2301 	if (do_auth && do_encr) {
2302 		/*
2303 		 * Encryption and authentication:
2304 		 * Pass the pointer to the mblk chain starting at the ESP
2305 		 * header to the framework. Skip the ESP header mblk
2306 		 * for encryption, which is reflected by an encryption
2307 		 * offset equal to the length of that mblk. Start
2308 		 * the authentication at the ESP header, i.e. use an
2309 		 * authentication offset of zero.
2310 		 */
2311 		ESP_INIT_CRYPTO_DUAL_DATA(&ic->ic_crypto_dual_data,
2312 		    esp_mp, MBLKL(esp_mp), payload_len, esph_offset, auth_len);
2313 
2314 		/* specify IV */
2315 		ic->ic_crypto_dual_data.dd_miscdata = (char *)iv_ptr;
2316 
2317 		/* call the framework */
2318 		kef_rc = crypto_encrypt_mac(&assoc->ipsa_emech,
2319 		    &assoc->ipsa_amech, NULL,
2320 		    &assoc->ipsa_kcfencrkey, &assoc->ipsa_kcfauthkey,
2321 		    encr_ctx_tmpl, auth_ctx_tmpl,
2322 		    &ic->ic_crypto_dual_data,
2323 		    &ic->ic_crypto_mac, callrp);
2324 	}
2325 
2326 	switch (kef_rc) {
2327 	case CRYPTO_SUCCESS:
2328 		ESP_BUMP_STAT(espstack, crypto_sync);
2329 		esp_set_usetime(assoc, B_FALSE);
2330 		if (force) {
2331 			mp = ipsec_free_crypto_data(mp);
2332 			data_mp = ip_xmit_attr_free_mblk(mp);
2333 		}
2334 		if (is_natt)
2335 			esp_prepare_udp(ns, data_mp, (ipha_t *)data_mp->b_rptr);
2336 		return (data_mp);
2337 	case CRYPTO_QUEUED:
2338 		/* esp_kcf_callback_outbound() will be invoked on completion */
2339 		ESP_BUMP_STAT(espstack, crypto_async);
2340 		return (NULL);
2341 	}
2342 
2343 	if (force) {
2344 		mp = ipsec_free_crypto_data(mp);
2345 		data_mp = ip_xmit_attr_free_mblk(mp);
2346 	}
2347 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2348 	esp_crypto_failed(data_mp, B_FALSE, kef_rc, NULL, espstack);
2349 	/* data_mp was passed to ip_drop_packet */
2350 	return (NULL);
2351 }
2352 
2353 /*
2354  * Handle outbound IPsec processing for IPv4 and IPv6
2355  *
2356  * Returns data_mp if successfully completed the request. Returns
2357  * NULL if it failed (and increments InDiscards) or if it is pending.
2358  */
2359 static mblk_t *
2360 esp_outbound(mblk_t *data_mp, ip_xmit_attr_t *ixa)
2361 {
2362 	mblk_t *espmp, *tailmp;
2363 	ipha_t *ipha;
2364 	ip6_t *ip6h;
2365 	esph_t *esph_ptr, *iv_ptr;
2366 	uint_t af;
2367 	uint8_t *nhp;
2368 	uintptr_t divpoint, datalen, adj, padlen, i, alloclen;
2369 	uintptr_t esplen = sizeof (esph_t);
2370 	uint8_t protocol;
2371 	ipsa_t *assoc;
2372 	uint_t iv_len, block_size, mac_len = 0;
2373 	uchar_t *icv_buf;
2374 	udpha_t *udpha;
2375 	boolean_t is_natt = B_FALSE;
2376 	netstack_t	*ns = ixa->ixa_ipst->ips_netstack;
2377 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
2378 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
2379 	ill_t		*ill = ixa->ixa_nce->nce_ill;
2380 	boolean_t	need_refrele = B_FALSE;
2381 
2382 	ESP_BUMP_STAT(espstack, out_requests);
2383 
2384 	/*
2385 	 * <sigh> We have to copy the message here, because TCP (for example)
2386 	 * keeps a dupb() of the message lying around for retransmission.
2387 	 * Since ESP changes the whole of the datagram, we have to create our
2388 	 * own copy lest we clobber TCP's data.  Since we have to copy anyway,
2389 	 * we might as well make use of msgpullup() and get the mblk into one
2390 	 * contiguous piece!
2391 	 */
2392 	tailmp = msgpullup(data_mp, -1);
2393 	if (tailmp == NULL) {
2394 		esp0dbg(("esp_outbound: msgpullup() failed, "
2395 		    "dropping packet.\n"));
2396 		ip_drop_packet(data_mp, B_FALSE, ill,
2397 		    DROPPER(ipss, ipds_esp_nomem),
2398 		    &espstack->esp_dropper);
2399 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2400 		return (NULL);
2401 	}
2402 	freemsg(data_mp);
2403 	data_mp = tailmp;
2404 
2405 	assoc = ixa->ixa_ipsec_esp_sa;
2406 	ASSERT(assoc != NULL);
2407 
2408 	/*
2409 	 * Get the outer IP header in shape to escape this system..
2410 	 */
2411 	if (is_system_labeled() && (assoc->ipsa_otsl != NULL)) {
2412 		/*
2413 		 * Need to update packet with any CIPSO option and update
2414 		 * ixa_tsl to capture the new label.
2415 		 * We allocate a separate ixa for that purpose.
2416 		 */
2417 		ixa = ip_xmit_attr_duplicate(ixa);
2418 		if (ixa == NULL) {
2419 			ip_drop_packet(data_mp, B_FALSE, ill,
2420 			    DROPPER(ipss, ipds_esp_nomem),
2421 			    &espstack->esp_dropper);
2422 			return (NULL);
2423 		}
2424 		need_refrele = B_TRUE;
2425 
2426 		label_hold(assoc->ipsa_otsl);
2427 		ip_xmit_attr_replace_tsl(ixa, assoc->ipsa_otsl);
2428 
2429 		data_mp = sadb_whack_label(data_mp, assoc, ixa,
2430 		    DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper);
2431 		if (data_mp == NULL) {
2432 			/* Packet dropped by sadb_whack_label */
2433 			ixa_refrele(ixa);
2434 			return (NULL);
2435 		}
2436 	}
2437 
2438 	/*
2439 	 * Reality check....
2440 	 */
2441 	ipha = (ipha_t *)data_mp->b_rptr;  /* So we can call esp_acquire(). */
2442 
2443 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
2444 		ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
2445 
2446 		af = AF_INET;
2447 		divpoint = IPH_HDR_LENGTH(ipha);
2448 		datalen = ntohs(ipha->ipha_length) - divpoint;
2449 		nhp = (uint8_t *)&ipha->ipha_protocol;
2450 	} else {
2451 		ip_pkt_t ipp;
2452 
2453 		ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
2454 
2455 		af = AF_INET6;
2456 		ip6h = (ip6_t *)ipha;
2457 		bzero(&ipp, sizeof (ipp));
2458 		divpoint = ip_find_hdr_v6(data_mp, ip6h, B_FALSE, &ipp, NULL);
2459 		if (ipp.ipp_dstopts != NULL &&
2460 		    ipp.ipp_dstopts->ip6d_nxt != IPPROTO_ROUTING) {
2461 			/*
2462 			 * Destination options are tricky.  If we get in here,
2463 			 * then we have a terminal header following the
2464 			 * destination options.  We need to adjust backwards
2465 			 * so we insert ESP BEFORE the destination options
2466 			 * bag.  (So that the dstopts get encrypted!)
2467 			 *
2468 			 * Since this is for outbound packets only, we know
2469 			 * that non-terminal destination options only precede
2470 			 * routing headers.
2471 			 */
2472 			divpoint -= ipp.ipp_dstoptslen;
2473 		}
2474 		datalen = ntohs(ip6h->ip6_plen) + sizeof (ip6_t) - divpoint;
2475 
2476 		if (ipp.ipp_rthdr != NULL) {
2477 			nhp = &ipp.ipp_rthdr->ip6r_nxt;
2478 		} else if (ipp.ipp_hopopts != NULL) {
2479 			nhp = &ipp.ipp_hopopts->ip6h_nxt;
2480 		} else {
2481 			ASSERT(divpoint == sizeof (ip6_t));
2482 			/* It's probably IP + ESP. */
2483 			nhp = &ip6h->ip6_nxt;
2484 		}
2485 	}
2486 
2487 	mac_len = assoc->ipsa_mac_len;
2488 
2489 	if (assoc->ipsa_flags & IPSA_F_NATT) {
2490 		/* wedge in UDP header */
2491 		is_natt = B_TRUE;
2492 		esplen += UDPH_SIZE;
2493 	}
2494 
2495 	/*
2496 	 * Set up ESP header and encryption padding for ENCR PI request.
2497 	 */
2498 
2499 	/* Determine the padding length.  Pad to 4-bytes for no-encryption. */
2500 	if (assoc->ipsa_encr_alg != SADB_EALG_NULL) {
2501 		iv_len = assoc->ipsa_iv_len;
2502 		block_size = assoc->ipsa_datalen;
2503 
2504 		/*
2505 		 * Pad the data to the length of the cipher block size.
2506 		 * Include the two additional bytes (hence the - 2) for the
2507 		 * padding length and the next header.  Take this into account
2508 		 * when calculating the actual length of the padding.
2509 		 */
2510 		ASSERT(ISP2(iv_len));
2511 		padlen = ((unsigned)(block_size - datalen - 2)) &
2512 		    (block_size - 1);
2513 	} else {
2514 		iv_len = 0;
2515 		padlen = ((unsigned)(sizeof (uint32_t) - datalen - 2)) &
2516 		    (sizeof (uint32_t) - 1);
2517 	}
2518 
2519 	/* Allocate ESP header and IV. */
2520 	esplen += iv_len;
2521 
2522 	/*
2523 	 * Update association byte-count lifetimes.  Don't forget to take
2524 	 * into account the padding length and next-header (hence the + 2).
2525 	 *
2526 	 * Use the amount of data fed into the "encryption algorithm".  This
2527 	 * is the IV, the data length, the padding length, and the final two
2528 	 * bytes (padlen, and next-header).
2529 	 *
2530 	 */
2531 
2532 	if (!esp_age_bytes(assoc, datalen + padlen + iv_len + 2, B_FALSE)) {
2533 		ip_drop_packet(data_mp, B_FALSE, ill,
2534 		    DROPPER(ipss, ipds_esp_bytes_expire),
2535 		    &espstack->esp_dropper);
2536 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2537 		if (need_refrele)
2538 			ixa_refrele(ixa);
2539 		return (NULL);
2540 	}
2541 
2542 	espmp = allocb(esplen, BPRI_HI);
2543 	if (espmp == NULL) {
2544 		ESP_BUMP_STAT(espstack, out_discards);
2545 		esp1dbg(espstack, ("esp_outbound: can't allocate espmp.\n"));
2546 		ip_drop_packet(data_mp, B_FALSE, ill,
2547 		    DROPPER(ipss, ipds_esp_nomem),
2548 		    &espstack->esp_dropper);
2549 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2550 		if (need_refrele)
2551 			ixa_refrele(ixa);
2552 		return (NULL);
2553 	}
2554 	espmp->b_wptr += esplen;
2555 	esph_ptr = (esph_t *)espmp->b_rptr;
2556 
2557 	if (is_natt) {
2558 		esp3dbg(espstack, ("esp_outbound: NATT"));
2559 
2560 		udpha = (udpha_t *)espmp->b_rptr;
2561 		udpha->uha_src_port = (assoc->ipsa_local_nat_port != 0) ?
2562 		    assoc->ipsa_local_nat_port : htons(IPPORT_IKE_NATT);
2563 		udpha->uha_dst_port = (assoc->ipsa_remote_nat_port != 0) ?
2564 		    assoc->ipsa_remote_nat_port : htons(IPPORT_IKE_NATT);
2565 		/*
2566 		 * Set the checksum to 0, so that the esp_prepare_udp() call
2567 		 * can do the right thing.
2568 		 */
2569 		udpha->uha_checksum = 0;
2570 		esph_ptr = (esph_t *)(udpha + 1);
2571 	}
2572 
2573 	esph_ptr->esph_spi = assoc->ipsa_spi;
2574 
2575 	esph_ptr->esph_replay = htonl(atomic_inc_32_nv(&assoc->ipsa_replay));
2576 	if (esph_ptr->esph_replay == 0 && assoc->ipsa_replay_wsize != 0) {
2577 		/*
2578 		 * XXX We have replay counter wrapping.
2579 		 * We probably want to nuke this SA (and its peer).
2580 		 */
2581 		ipsec_assocfailure(info.mi_idnum, 0, 0,
2582 		    SL_ERROR | SL_CONSOLE | SL_WARN,
2583 		    "Outbound ESP SA (0x%x, %s) has wrapped sequence.\n",
2584 		    esph_ptr->esph_spi, assoc->ipsa_dstaddr, af,
2585 		    espstack->ipsecesp_netstack);
2586 
2587 		ESP_BUMP_STAT(espstack, out_discards);
2588 		sadb_replay_delete(assoc);
2589 		ip_drop_packet(data_mp, B_FALSE, ill,
2590 		    DROPPER(ipss, ipds_esp_replay),
2591 		    &espstack->esp_dropper);
2592 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2593 		if (need_refrele)
2594 			ixa_refrele(ixa);
2595 		return (NULL);
2596 	}
2597 
2598 	iv_ptr = (esph_ptr + 1);
2599 	/*
2600 	 * iv_ptr points to the mblk which will contain the IV once we have
2601 	 * written it there. This mblk will be part of a mblk chain that
2602 	 * will make up the packet.
2603 	 *
2604 	 * For counter mode algorithms, the IV is a 64 bit quantity, it
2605 	 * must NEVER repeat in the lifetime of the SA, otherwise an
2606 	 * attacker who had recorded enough packets might be able to
2607 	 * determine some clear text.
2608 	 *
2609 	 * To ensure this does not happen, the IV is stored in the SA and
2610 	 * incremented for each packet, the IV is then copied into the
2611 	 * "packet" for transmission to the receiving system. The IV will
2612 	 * also be copied into the nonce, when the packet is encrypted.
2613 	 *
2614 	 * CBC mode algorithms use a random IV for each packet. We do not
2615 	 * require the highest quality random bits, but for best security
2616 	 * with CBC mode ciphers, the value must be unlikely to repeat and
2617 	 * must not be known in advance to an adversary capable of influencing
2618 	 * the clear text.
2619 	 */
2620 	if (!update_iv((uint8_t *)iv_ptr, espstack->esp_pfkey_q, assoc,
2621 	    espstack)) {
2622 		ip_drop_packet(data_mp, B_FALSE, ill,
2623 		    DROPPER(ipss, ipds_esp_iv_wrap), &espstack->esp_dropper);
2624 		if (need_refrele)
2625 			ixa_refrele(ixa);
2626 		return (NULL);
2627 	}
2628 
2629 	/* Fix the IP header. */
2630 	alloclen = padlen + 2 + mac_len;
2631 	adj = alloclen + (espmp->b_wptr - espmp->b_rptr);
2632 
2633 	protocol = *nhp;
2634 
2635 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
2636 		ipha->ipha_length = htons(ntohs(ipha->ipha_length) + adj);
2637 		if (is_natt) {
2638 			*nhp = IPPROTO_UDP;
2639 			udpha->uha_length = htons(ntohs(ipha->ipha_length) -
2640 			    IPH_HDR_LENGTH(ipha));
2641 		} else {
2642 			*nhp = IPPROTO_ESP;
2643 		}
2644 		ipha->ipha_hdr_checksum = 0;
2645 		ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
2646 	} else {
2647 		ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) + adj);
2648 		*nhp = IPPROTO_ESP;
2649 	}
2650 
2651 	/* I've got the two ESP mblks, now insert them. */
2652 
2653 	esp2dbg(espstack, ("data_mp before outbound ESP adjustment:\n"));
2654 	esp2dbg(espstack, (dump_msg(data_mp)));
2655 
2656 	if (!esp_insert_esp(data_mp, espmp, divpoint, espstack)) {
2657 		ESP_BUMP_STAT(espstack, out_discards);
2658 		/* NOTE:  esp_insert_esp() only fails if there's no memory. */
2659 		ip_drop_packet(data_mp, B_FALSE, ill,
2660 		    DROPPER(ipss, ipds_esp_nomem),
2661 		    &espstack->esp_dropper);
2662 		freeb(espmp);
2663 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2664 		if (need_refrele)
2665 			ixa_refrele(ixa);
2666 		return (NULL);
2667 	}
2668 
2669 	/* Append padding (and leave room for ICV). */
2670 	for (tailmp = data_mp; tailmp->b_cont != NULL; tailmp = tailmp->b_cont)
2671 		;
2672 	if (tailmp->b_wptr + alloclen > tailmp->b_datap->db_lim) {
2673 		tailmp->b_cont = allocb(alloclen, BPRI_HI);
2674 		if (tailmp->b_cont == NULL) {
2675 			ESP_BUMP_STAT(espstack, out_discards);
2676 			esp0dbg(("esp_outbound:  Can't allocate tailmp.\n"));
2677 			ip_drop_packet(data_mp, B_FALSE, ill,
2678 			    DROPPER(ipss, ipds_esp_nomem),
2679 			    &espstack->esp_dropper);
2680 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2681 			if (need_refrele)
2682 				ixa_refrele(ixa);
2683 			return (NULL);
2684 		}
2685 		tailmp = tailmp->b_cont;
2686 	}
2687 
2688 	/*
2689 	 * If there's padding, N bytes of padding must be of the form 0x1,
2690 	 * 0x2, 0x3... 0xN.
2691 	 */
2692 	for (i = 0; i < padlen; ) {
2693 		i++;
2694 		*tailmp->b_wptr++ = i;
2695 	}
2696 	*tailmp->b_wptr++ = i;
2697 	*tailmp->b_wptr++ = protocol;
2698 
2699 	esp2dbg(espstack, ("data_Mp before encryption:\n"));
2700 	esp2dbg(espstack, (dump_msg(data_mp)));
2701 
2702 	/*
2703 	 * Okay.  I've set up the pre-encryption ESP.  Let's do it!
2704 	 */
2705 
2706 	if (mac_len > 0) {
2707 		ASSERT(tailmp->b_wptr + mac_len <= tailmp->b_datap->db_lim);
2708 		icv_buf = tailmp->b_wptr;
2709 		tailmp->b_wptr += mac_len;
2710 	} else {
2711 		icv_buf = NULL;
2712 	}
2713 
2714 	data_mp = esp_submit_req_outbound(data_mp, ixa, assoc, icv_buf,
2715 	    datalen + padlen + 2);
2716 	if (need_refrele)
2717 		ixa_refrele(ixa);
2718 	return (data_mp);
2719 }
2720 
2721 /*
2722  * IP calls this to validate the ICMP errors that
2723  * we got from the network.
2724  */
2725 mblk_t *
2726 ipsecesp_icmp_error(mblk_t *data_mp, ip_recv_attr_t *ira)
2727 {
2728 	netstack_t	*ns = ira->ira_ill->ill_ipst->ips_netstack;
2729 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
2730 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
2731 
2732 	/*
2733 	 * Unless we get an entire packet back, this function is useless.
2734 	 * Why?
2735 	 *
2736 	 * 1.)	Partial packets are useless, because the "next header"
2737 	 *	is at the end of the decrypted ESP packet.  Without the
2738 	 *	whole packet, this is useless.
2739 	 *
2740 	 * 2.)	If we every use a stateful cipher, such as a stream or a
2741 	 *	one-time pad, we can't do anything.
2742 	 *
2743 	 * Since the chances of us getting an entire packet back are very
2744 	 * very small, we discard here.
2745 	 */
2746 	IP_ESP_BUMP_STAT(ipss, in_discards);
2747 	ip_drop_packet(data_mp, B_TRUE, ira->ira_ill,
2748 	    DROPPER(ipss, ipds_esp_icmp),
2749 	    &espstack->esp_dropper);
2750 	return (NULL);
2751 }
2752 
2753 /*
2754  * Construct an SADB_REGISTER message with the current algorithms.
2755  * This function gets called when 'ipsecalgs -s' is run or when
2756  * in.iked (or other KMD) starts.
2757  */
2758 static boolean_t
2759 esp_register_out(uint32_t sequence, uint32_t pid, uint_t serial,
2760     ipsecesp_stack_t *espstack, cred_t *cr)
2761 {
2762 	mblk_t *pfkey_msg_mp, *keysock_out_mp;
2763 	sadb_msg_t *samsg;
2764 	sadb_supported_t *sasupp_auth = NULL;
2765 	sadb_supported_t *sasupp_encr = NULL;
2766 	sadb_alg_t *saalg;
2767 	uint_t allocsize = sizeof (*samsg);
2768 	uint_t i, numalgs_snap;
2769 	int current_aalgs;
2770 	ipsec_alginfo_t **authalgs;
2771 	uint_t num_aalgs;
2772 	int current_ealgs;
2773 	ipsec_alginfo_t **encralgs;
2774 	uint_t num_ealgs;
2775 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
2776 	sadb_sens_t *sens;
2777 	size_t sens_len = 0;
2778 	sadb_ext_t *nextext;
2779 	ts_label_t *sens_tsl = NULL;
2780 
2781 	/* Allocate the KEYSOCK_OUT. */
2782 	keysock_out_mp = sadb_keysock_out(serial);
2783 	if (keysock_out_mp == NULL) {
2784 		esp0dbg(("esp_register_out: couldn't allocate mblk.\n"));
2785 		return (B_FALSE);
2786 	}
2787 
2788 	if (is_system_labeled() && (cr != NULL)) {
2789 		sens_tsl = crgetlabel(cr);
2790 		if (sens_tsl != NULL) {
2791 			sens_len = sadb_sens_len_from_label(sens_tsl);
2792 			allocsize += sens_len;
2793 		}
2794 	}
2795 
2796 	/*
2797 	 * Allocate the PF_KEY message that follows KEYSOCK_OUT.
2798 	 */
2799 
2800 	rw_enter(&ipss->ipsec_alg_lock, RW_READER);
2801 	/*
2802 	 * Fill SADB_REGISTER message's algorithm descriptors.  Hold
2803 	 * down the lock while filling it.
2804 	 *
2805 	 * Return only valid algorithms, so the number of algorithms
2806 	 * to send up may be less than the number of algorithm entries
2807 	 * in the table.
2808 	 */
2809 	authalgs = ipss->ipsec_alglists[IPSEC_ALG_AUTH];
2810 	for (num_aalgs = 0, i = 0; i < IPSEC_MAX_ALGS; i++)
2811 		if (authalgs[i] != NULL && ALG_VALID(authalgs[i]))
2812 			num_aalgs++;
2813 
2814 	if (num_aalgs != 0) {
2815 		allocsize += (num_aalgs * sizeof (*saalg));
2816 		allocsize += sizeof (*sasupp_auth);
2817 	}
2818 	encralgs = ipss->ipsec_alglists[IPSEC_ALG_ENCR];
2819 	for (num_ealgs = 0, i = 0; i < IPSEC_MAX_ALGS; i++)
2820 		if (encralgs[i] != NULL && ALG_VALID(encralgs[i]))
2821 			num_ealgs++;
2822 
2823 	if (num_ealgs != 0) {
2824 		allocsize += (num_ealgs * sizeof (*saalg));
2825 		allocsize += sizeof (*sasupp_encr);
2826 	}
2827 	keysock_out_mp->b_cont = allocb(allocsize, BPRI_HI);
2828 	if (keysock_out_mp->b_cont == NULL) {
2829 		rw_exit(&ipss->ipsec_alg_lock);
2830 		freemsg(keysock_out_mp);
2831 		return (B_FALSE);
2832 	}
2833 	pfkey_msg_mp = keysock_out_mp->b_cont;
2834 	pfkey_msg_mp->b_wptr += allocsize;
2835 
2836 	nextext = (sadb_ext_t *)(pfkey_msg_mp->b_rptr + sizeof (*samsg));
2837 
2838 	if (num_aalgs != 0) {
2839 		sasupp_auth = (sadb_supported_t *)nextext;
2840 		saalg = (sadb_alg_t *)(sasupp_auth + 1);
2841 
2842 		ASSERT(((ulong_t)saalg & 0x7) == 0);
2843 
2844 		numalgs_snap = 0;
2845 		for (i = 0;
2846 		    ((i < IPSEC_MAX_ALGS) && (numalgs_snap < num_aalgs));
2847 		    i++) {
2848 			if (authalgs[i] == NULL || !ALG_VALID(authalgs[i]))
2849 				continue;
2850 
2851 			saalg->sadb_alg_id = authalgs[i]->alg_id;
2852 			saalg->sadb_alg_ivlen = 0;
2853 			saalg->sadb_alg_minbits	= authalgs[i]->alg_ef_minbits;
2854 			saalg->sadb_alg_maxbits	= authalgs[i]->alg_ef_maxbits;
2855 			saalg->sadb_x_alg_increment =
2856 			    authalgs[i]->alg_increment;
2857 			saalg->sadb_x_alg_saltbits = SADB_8TO1(
2858 			    authalgs[i]->alg_saltlen);
2859 			numalgs_snap++;
2860 			saalg++;
2861 		}
2862 		ASSERT(numalgs_snap == num_aalgs);
2863 #ifdef DEBUG
2864 		/*
2865 		 * Reality check to make sure I snagged all of the
2866 		 * algorithms.
2867 		 */
2868 		for (; i < IPSEC_MAX_ALGS; i++) {
2869 			if (authalgs[i] != NULL && ALG_VALID(authalgs[i])) {
2870 				cmn_err(CE_PANIC, "esp_register_out()! "
2871 				    "Missed aalg #%d.\n", i);
2872 			}
2873 		}
2874 #endif /* DEBUG */
2875 		nextext = (sadb_ext_t *)saalg;
2876 	}
2877 
2878 	if (num_ealgs != 0) {
2879 		sasupp_encr = (sadb_supported_t *)nextext;
2880 		saalg = (sadb_alg_t *)(sasupp_encr + 1);
2881 
2882 		numalgs_snap = 0;
2883 		for (i = 0;
2884 		    ((i < IPSEC_MAX_ALGS) && (numalgs_snap < num_ealgs)); i++) {
2885 			if (encralgs[i] == NULL || !ALG_VALID(encralgs[i]))
2886 				continue;
2887 			saalg->sadb_alg_id = encralgs[i]->alg_id;
2888 			saalg->sadb_alg_ivlen = encralgs[i]->alg_ivlen;
2889 			saalg->sadb_alg_minbits	= encralgs[i]->alg_ef_minbits;
2890 			saalg->sadb_alg_maxbits	= encralgs[i]->alg_ef_maxbits;
2891 			/*
2892 			 * We could advertise the ICV length, except there
2893 			 * is not a value in sadb_x_algb to do this.
2894 			 * saalg->sadb_alg_maclen = encralgs[i]->alg_maclen;
2895 			 */
2896 			saalg->sadb_x_alg_increment =
2897 			    encralgs[i]->alg_increment;
2898 			saalg->sadb_x_alg_saltbits =
2899 			    SADB_8TO1(encralgs[i]->alg_saltlen);
2900 
2901 			numalgs_snap++;
2902 			saalg++;
2903 		}
2904 		ASSERT(numalgs_snap == num_ealgs);
2905 #ifdef DEBUG
2906 		/*
2907 		 * Reality check to make sure I snagged all of the
2908 		 * algorithms.
2909 		 */
2910 		for (; i < IPSEC_MAX_ALGS; i++) {
2911 			if (encralgs[i] != NULL && ALG_VALID(encralgs[i])) {
2912 				cmn_err(CE_PANIC, "esp_register_out()! "
2913 				    "Missed ealg #%d.\n", i);
2914 			}
2915 		}
2916 #endif /* DEBUG */
2917 		nextext = (sadb_ext_t *)saalg;
2918 	}
2919 
2920 	current_aalgs = num_aalgs;
2921 	current_ealgs = num_ealgs;
2922 
2923 	rw_exit(&ipss->ipsec_alg_lock);
2924 
2925 	if (sens_tsl != NULL) {
2926 		sens = (sadb_sens_t *)nextext;
2927 		sadb_sens_from_label(sens, SADB_EXT_SENSITIVITY,
2928 		    sens_tsl, sens_len);
2929 
2930 		nextext = (sadb_ext_t *)(((uint8_t *)sens) + sens_len);
2931 	}
2932 
2933 	/* Now fill the rest of the SADB_REGISTER message. */
2934 
2935 	samsg = (sadb_msg_t *)pfkey_msg_mp->b_rptr;
2936 	samsg->sadb_msg_version = PF_KEY_V2;
2937 	samsg->sadb_msg_type = SADB_REGISTER;
2938 	samsg->sadb_msg_errno = 0;
2939 	samsg->sadb_msg_satype = SADB_SATYPE_ESP;
2940 	samsg->sadb_msg_len = SADB_8TO64(allocsize);
2941 	samsg->sadb_msg_reserved = 0;
2942 	/*
2943 	 * Assume caller has sufficient sequence/pid number info.  If it's one
2944 	 * from me over a new alg., I could give two hoots about sequence.
2945 	 */
2946 	samsg->sadb_msg_seq = sequence;
2947 	samsg->sadb_msg_pid = pid;
2948 
2949 	if (sasupp_auth != NULL) {
2950 		sasupp_auth->sadb_supported_len = SADB_8TO64(
2951 		    sizeof (*sasupp_auth) + sizeof (*saalg) * current_aalgs);
2952 		sasupp_auth->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH;
2953 		sasupp_auth->sadb_supported_reserved = 0;
2954 	}
2955 
2956 	if (sasupp_encr != NULL) {
2957 		sasupp_encr->sadb_supported_len = SADB_8TO64(
2958 		    sizeof (*sasupp_encr) + sizeof (*saalg) * current_ealgs);
2959 		sasupp_encr->sadb_supported_exttype =
2960 		    SADB_EXT_SUPPORTED_ENCRYPT;
2961 		sasupp_encr->sadb_supported_reserved = 0;
2962 	}
2963 
2964 	if (espstack->esp_pfkey_q != NULL)
2965 		putnext(espstack->esp_pfkey_q, keysock_out_mp);
2966 	else {
2967 		freemsg(keysock_out_mp);
2968 		return (B_FALSE);
2969 	}
2970 
2971 	return (B_TRUE);
2972 }
2973 
2974 /*
2975  * Invoked when the algorithm table changes. Causes SADB_REGISTER
2976  * messages continaining the current list of algorithms to be
2977  * sent up to the ESP listeners.
2978  */
2979 void
2980 ipsecesp_algs_changed(netstack_t *ns)
2981 {
2982 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
2983 
2984 	/*
2985 	 * Time to send a PF_KEY SADB_REGISTER message to ESP listeners
2986 	 * everywhere.  (The function itself checks for NULL esp_pfkey_q.)
2987 	 */
2988 	(void) esp_register_out(0, 0, 0, espstack, NULL);
2989 }
2990 
2991 /*
2992  * Stub function that taskq_dispatch() invokes to take the mblk (in arg)
2993  * and send() it into ESP and IP again.
2994  */
2995 static void
2996 inbound_task(void *arg)
2997 {
2998 	mblk_t		*mp = (mblk_t *)arg;
2999 	mblk_t		*async_mp;
3000 	ip_recv_attr_t	iras;
3001 
3002 	async_mp = mp;
3003 	mp = async_mp->b_cont;
3004 	async_mp->b_cont = NULL;
3005 	if (!ip_recv_attr_from_mblk(async_mp, &iras)) {
3006 		/* The ill or ip_stack_t disappeared on us */
3007 		ip_drop_input("ip_recv_attr_from_mblk", mp, NULL);
3008 		freemsg(mp);
3009 		goto done;
3010 	}
3011 
3012 	esp_inbound_restart(mp, &iras);
3013 done:
3014 	ira_cleanup(&iras, B_TRUE);
3015 }
3016 
3017 /*
3018  * Restart ESP after the SA has been added.
3019  */
3020 static void
3021 esp_inbound_restart(mblk_t *mp, ip_recv_attr_t *ira)
3022 {
3023 	esph_t		*esph;
3024 	netstack_t	*ns = ira->ira_ill->ill_ipst->ips_netstack;
3025 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
3026 
3027 	esp2dbg(espstack, ("in ESP inbound_task"));
3028 	ASSERT(espstack != NULL);
3029 
3030 	mp = ipsec_inbound_esp_sa(mp, ira, &esph);
3031 	if (mp == NULL)
3032 		return;
3033 
3034 	ASSERT(esph != NULL);
3035 	ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
3036 	ASSERT(ira->ira_ipsec_esp_sa != NULL);
3037 
3038 	mp = ira->ira_ipsec_esp_sa->ipsa_input_func(mp, esph, ira);
3039 	if (mp == NULL) {
3040 		/*
3041 		 * Either it failed or is pending. In the former case
3042 		 * ipIfStatsInDiscards was increased.
3043 		 */
3044 		return;
3045 	}
3046 
3047 	ip_input_post_ipsec(mp, ira);
3048 }
3049 
3050 /*
3051  * Now that weak-key passed, actually ADD the security association, and
3052  * send back a reply ADD message.
3053  */
3054 static int
3055 esp_add_sa_finish(mblk_t *mp, sadb_msg_t *samsg, keysock_in_t *ksi,
3056     int *diagnostic, ipsecesp_stack_t *espstack)
3057 {
3058 	isaf_t *primary = NULL, *secondary;
3059 	boolean_t clone = B_FALSE, is_inbound = B_FALSE;
3060 	ipsa_t *larval = NULL;
3061 	ipsacq_t *acqrec;
3062 	iacqf_t *acq_bucket;
3063 	mblk_t *acq_msgs = NULL;
3064 	int rc;
3065 	mblk_t *lpkt;
3066 	int error;
3067 	ipsa_query_t sq;
3068 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
3069 
3070 	/*
3071 	 * Locate the appropriate table(s).
3072 	 */
3073 	sq.spp = &espstack->esp_sadb;	/* XXX */
3074 	error = sadb_form_query(ksi, IPSA_Q_SA|IPSA_Q_DST,
3075 	    IPSA_Q_SA|IPSA_Q_DST|IPSA_Q_INBOUND|IPSA_Q_OUTBOUND,
3076 	    &sq, diagnostic);
3077 	if (error)
3078 		return (error);
3079 
3080 	/*
3081 	 * Use the direction flags provided by the KMD to determine
3082 	 * if the inbound or outbound table should be the primary
3083 	 * for this SA. If these flags were absent then make this
3084 	 * decision based on the addresses.
3085 	 */
3086 	if (sq.assoc->sadb_sa_flags & IPSA_F_INBOUND) {
3087 		primary = sq.inbound;
3088 		secondary = sq.outbound;
3089 		is_inbound = B_TRUE;
3090 		if (sq.assoc->sadb_sa_flags & IPSA_F_OUTBOUND)
3091 			clone = B_TRUE;
3092 	} else if (sq.assoc->sadb_sa_flags & IPSA_F_OUTBOUND) {
3093 		primary = sq.outbound;
3094 		secondary = sq.inbound;
3095 	}
3096 
3097 	if (primary == NULL) {
3098 		/*
3099 		 * The KMD did not set a direction flag, determine which
3100 		 * table to insert the SA into based on addresses.
3101 		 */
3102 		switch (ksi->ks_in_dsttype) {
3103 		case KS_IN_ADDR_MBCAST:
3104 			clone = B_TRUE;	/* All mcast SAs can be bidirectional */
3105 			sq.assoc->sadb_sa_flags |= IPSA_F_OUTBOUND;
3106 			/* FALLTHRU */
3107 		/*
3108 		 * If the source address is either one of mine, or unspecified
3109 		 * (which is best summed up by saying "not 'not mine'"),
3110 		 * then the association is potentially bi-directional,
3111 		 * in that it can be used for inbound traffic and outbound
3112 		 * traffic.  The best example of such an SA is a multicast
3113 		 * SA (which allows me to receive the outbound traffic).
3114 		 */
3115 		case KS_IN_ADDR_ME:
3116 			sq.assoc->sadb_sa_flags |= IPSA_F_INBOUND;
3117 			primary = sq.inbound;
3118 			secondary = sq.outbound;
3119 			if (ksi->ks_in_srctype != KS_IN_ADDR_NOTME)
3120 				clone = B_TRUE;
3121 			is_inbound = B_TRUE;
3122 			break;
3123 		/*
3124 		 * If the source address literally not mine (either
3125 		 * unspecified or not mine), then this SA may have an
3126 		 * address that WILL be mine after some configuration.
3127 		 * We pay the price for this by making it a bi-directional
3128 		 * SA.
3129 		 */
3130 		case KS_IN_ADDR_NOTME:
3131 			sq.assoc->sadb_sa_flags |= IPSA_F_OUTBOUND;
3132 			primary = sq.outbound;
3133 			secondary = sq.inbound;
3134 			if (ksi->ks_in_srctype != KS_IN_ADDR_ME) {
3135 				sq.assoc->sadb_sa_flags |= IPSA_F_INBOUND;
3136 				clone = B_TRUE;
3137 			}
3138 			break;
3139 		default:
3140 			*diagnostic = SADB_X_DIAGNOSTIC_BAD_DST;
3141 			return (EINVAL);
3142 		}
3143 	}
3144 
3145 	/*
3146 	 * Find a ACQUIRE list entry if possible.  If we've added an SA that
3147 	 * suits the needs of an ACQUIRE list entry, we can eliminate the
3148 	 * ACQUIRE list entry and transmit the enqueued packets.  Use the
3149 	 * high-bit of the sequence number to queue it.  Key off destination
3150 	 * addr, and change acqrec's state.
3151 	 */
3152 
3153 	if (samsg->sadb_msg_seq & IACQF_LOWEST_SEQ) {
3154 		acq_bucket = &(sq.sp->sdb_acq[sq.outhash]);
3155 		mutex_enter(&acq_bucket->iacqf_lock);
3156 		for (acqrec = acq_bucket->iacqf_ipsacq; acqrec != NULL;
3157 		    acqrec = acqrec->ipsacq_next) {
3158 			mutex_enter(&acqrec->ipsacq_lock);
3159 			/*
3160 			 * Q:  I only check sequence.  Should I check dst?
3161 			 * A: Yes, check dest because those are the packets
3162 			 *    that are queued up.
3163 			 */
3164 			if (acqrec->ipsacq_seq == samsg->sadb_msg_seq &&
3165 			    IPSA_ARE_ADDR_EQUAL(sq.dstaddr,
3166 			    acqrec->ipsacq_dstaddr, acqrec->ipsacq_addrfam))
3167 				break;
3168 			mutex_exit(&acqrec->ipsacq_lock);
3169 		}
3170 		if (acqrec != NULL) {
3171 			/*
3172 			 * AHA!  I found an ACQUIRE record for this SA.
3173 			 * Grab the msg list, and free the acquire record.
3174 			 * I already am holding the lock for this record,
3175 			 * so all I have to do is free it.
3176 			 */
3177 			acq_msgs = acqrec->ipsacq_mp;
3178 			acqrec->ipsacq_mp = NULL;
3179 			mutex_exit(&acqrec->ipsacq_lock);
3180 			sadb_destroy_acquire(acqrec,
3181 			    espstack->ipsecesp_netstack);
3182 		}
3183 		mutex_exit(&acq_bucket->iacqf_lock);
3184 	}
3185 
3186 	/*
3187 	 * Find PF_KEY message, and see if I'm an update.  If so, find entry
3188 	 * in larval list (if there).
3189 	 */
3190 	if (samsg->sadb_msg_type == SADB_UPDATE) {
3191 		mutex_enter(&sq.inbound->isaf_lock);
3192 		larval = ipsec_getassocbyspi(sq.inbound, sq.assoc->sadb_sa_spi,
3193 		    ALL_ZEROES_PTR, sq.dstaddr, sq.dst->sin_family);
3194 		mutex_exit(&sq.inbound->isaf_lock);
3195 
3196 		if ((larval == NULL) ||
3197 		    (larval->ipsa_state != IPSA_STATE_LARVAL)) {
3198 			*diagnostic = SADB_X_DIAGNOSTIC_SA_NOTFOUND;
3199 			if (larval != NULL) {
3200 				IPSA_REFRELE(larval);
3201 			}
3202 			esp0dbg(("Larval update, but larval disappeared.\n"));
3203 			return (ESRCH);
3204 		} /* Else sadb_common_add unlinks it for me! */
3205 	}
3206 
3207 	if (larval != NULL) {
3208 		/*
3209 		 * Hold again, because sadb_common_add() consumes a reference,
3210 		 * and we don't want to clear_lpkt() without a reference.
3211 		 */
3212 		IPSA_REFHOLD(larval);
3213 	}
3214 
3215 	rc = sadb_common_add(espstack->esp_pfkey_q,
3216 	    mp, samsg, ksi, primary, secondary, larval, clone, is_inbound,
3217 	    diagnostic, espstack->ipsecesp_netstack, &espstack->esp_sadb);
3218 
3219 	if (larval != NULL) {
3220 		if (rc == 0) {
3221 			lpkt = sadb_clear_lpkt(larval);
3222 			if (lpkt != NULL) {
3223 				rc = !taskq_dispatch(esp_taskq, inbound_task,
3224 				    lpkt, TQ_NOSLEEP);
3225 			}
3226 		}
3227 		IPSA_REFRELE(larval);
3228 	}
3229 
3230 	/*
3231 	 * How much more stack will I create with all of these
3232 	 * esp_outbound() calls?
3233 	 */
3234 
3235 	/* Handle the packets queued waiting for the SA */
3236 	while (acq_msgs != NULL) {
3237 		mblk_t		*asyncmp;
3238 		mblk_t		*data_mp;
3239 		ip_xmit_attr_t	ixas;
3240 		ill_t		*ill;
3241 
3242 		asyncmp = acq_msgs;
3243 		acq_msgs = acq_msgs->b_next;
3244 		asyncmp->b_next = NULL;
3245 
3246 		/*
3247 		 * Extract the ip_xmit_attr_t from the first mblk.
3248 		 * Verifies that the netstack and ill is still around; could
3249 		 * have vanished while iked was doing its work.
3250 		 * On succesful return we have a nce_t and the ill/ipst can't
3251 		 * disappear until we do the nce_refrele in ixa_cleanup.
3252 		 */
3253 		data_mp = asyncmp->b_cont;
3254 		asyncmp->b_cont = NULL;
3255 		if (!ip_xmit_attr_from_mblk(asyncmp, &ixas)) {
3256 			ESP_BUMP_STAT(espstack, out_discards);
3257 			ip_drop_packet(data_mp, B_FALSE, NULL,
3258 			    DROPPER(ipss, ipds_sadb_acquire_timeout),
3259 			    &espstack->esp_dropper);
3260 		} else if (rc != 0) {
3261 			ill = ixas.ixa_nce->nce_ill;
3262 			ESP_BUMP_STAT(espstack, out_discards);
3263 			ip_drop_packet(data_mp, B_FALSE, ill,
3264 			    DROPPER(ipss, ipds_sadb_acquire_timeout),
3265 			    &espstack->esp_dropper);
3266 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3267 		} else {
3268 			esp_outbound_finish(data_mp, &ixas);
3269 		}
3270 		ixa_cleanup(&ixas);
3271 	}
3272 
3273 	return (rc);
3274 }
3275 
3276 /*
3277  * Process one of the queued messages (from ipsacq_mp) once the SA
3278  * has been added.
3279  */
3280 static void
3281 esp_outbound_finish(mblk_t *data_mp, ip_xmit_attr_t *ixa)
3282 {
3283 	netstack_t	*ns = ixa->ixa_ipst->ips_netstack;
3284 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
3285 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
3286 	ill_t		*ill = ixa->ixa_nce->nce_ill;
3287 
3288 	if (!ipsec_outbound_sa(data_mp, ixa, IPPROTO_ESP)) {
3289 		ESP_BUMP_STAT(espstack, out_discards);
3290 		ip_drop_packet(data_mp, B_FALSE, ill,
3291 		    DROPPER(ipss, ipds_sadb_acquire_timeout),
3292 		    &espstack->esp_dropper);
3293 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3294 		return;
3295 	}
3296 
3297 	data_mp = esp_outbound(data_mp, ixa);
3298 	if (data_mp == NULL)
3299 		return;
3300 
3301 	/* do AH processing if needed */
3302 	data_mp = esp_do_outbound_ah(data_mp, ixa);
3303 	if (data_mp == NULL)
3304 		return;
3305 
3306 	(void) ip_output_post_ipsec(data_mp, ixa);
3307 }
3308 
3309 /*
3310  * Add new ESP security association.  This may become a generic AH/ESP
3311  * routine eventually.
3312  */
3313 static int
3314 esp_add_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic, netstack_t *ns)
3315 {
3316 	sadb_sa_t *assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SA];
3317 	sadb_address_t *srcext =
3318 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_SRC];
3319 	sadb_address_t *dstext =
3320 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3321 	sadb_address_t *isrcext =
3322 	    (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_INNER_SRC];
3323 	sadb_address_t *idstext =
3324 	    (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_INNER_DST];
3325 	sadb_address_t *nttext_loc =
3326 	    (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_NATT_LOC];
3327 	sadb_address_t *nttext_rem =
3328 	    (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_NATT_REM];
3329 	sadb_key_t *akey = (sadb_key_t *)ksi->ks_in_extv[SADB_EXT_KEY_AUTH];
3330 	sadb_key_t *ekey = (sadb_key_t *)ksi->ks_in_extv[SADB_EXT_KEY_ENCRYPT];
3331 	struct sockaddr_in *src, *dst;
3332 	struct sockaddr_in *natt_loc, *natt_rem;
3333 	struct sockaddr_in6 *natt_loc6, *natt_rem6;
3334 	sadb_lifetime_t *soft =
3335 	    (sadb_lifetime_t *)ksi->ks_in_extv[SADB_EXT_LIFETIME_SOFT];
3336 	sadb_lifetime_t *hard =
3337 	    (sadb_lifetime_t *)ksi->ks_in_extv[SADB_EXT_LIFETIME_HARD];
3338 	sadb_lifetime_t *idle =
3339 	    (sadb_lifetime_t *)ksi->ks_in_extv[SADB_X_EXT_LIFETIME_IDLE];
3340 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
3341 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
3342 
3343 
3344 
3345 	/* I need certain extensions present for an ADD message. */
3346 	if (srcext == NULL) {
3347 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_SRC;
3348 		return (EINVAL);
3349 	}
3350 	if (dstext == NULL) {
3351 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_DST;
3352 		return (EINVAL);
3353 	}
3354 	if (isrcext == NULL && idstext != NULL) {
3355 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_INNER_SRC;
3356 		return (EINVAL);
3357 	}
3358 	if (isrcext != NULL && idstext == NULL) {
3359 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_INNER_DST;
3360 		return (EINVAL);
3361 	}
3362 	if (assoc == NULL) {
3363 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_SA;
3364 		return (EINVAL);
3365 	}
3366 	if (ekey == NULL && assoc->sadb_sa_encrypt != SADB_EALG_NULL) {
3367 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_EKEY;
3368 		return (EINVAL);
3369 	}
3370 
3371 	src = (struct sockaddr_in *)(srcext + 1);
3372 	dst = (struct sockaddr_in *)(dstext + 1);
3373 	natt_loc = (struct sockaddr_in *)(nttext_loc + 1);
3374 	natt_loc6 = (struct sockaddr_in6 *)(nttext_loc + 1);
3375 	natt_rem = (struct sockaddr_in *)(nttext_rem + 1);
3376 	natt_rem6 = (struct sockaddr_in6 *)(nttext_rem + 1);
3377 
3378 	/* Sundry ADD-specific reality checks. */
3379 	/* XXX STATS :  Logging/stats here? */
3380 
3381 	if ((assoc->sadb_sa_state != SADB_SASTATE_MATURE) &&
3382 	    (assoc->sadb_sa_state != SADB_X_SASTATE_ACTIVE_ELSEWHERE)) {
3383 		*diagnostic = SADB_X_DIAGNOSTIC_BAD_SASTATE;
3384 		return (EINVAL);
3385 	}
3386 	if (assoc->sadb_sa_encrypt == SADB_EALG_NONE) {
3387 		*diagnostic = SADB_X_DIAGNOSTIC_BAD_EALG;
3388 		return (EINVAL);
3389 	}
3390 
3391 #ifndef IPSEC_LATENCY_TEST
3392 	if (assoc->sadb_sa_encrypt == SADB_EALG_NULL &&
3393 	    assoc->sadb_sa_auth == SADB_AALG_NONE) {
3394 		*diagnostic = SADB_X_DIAGNOSTIC_BAD_AALG;
3395 		return (EINVAL);
3396 	}
3397 #endif
3398 
3399 	if (assoc->sadb_sa_flags & ~espstack->esp_sadb.s_addflags) {
3400 		*diagnostic = SADB_X_DIAGNOSTIC_BAD_SAFLAGS;
3401 		return (EINVAL);
3402 	}
3403 
3404 	if ((*diagnostic = sadb_hardsoftchk(hard, soft, idle)) != 0) {
3405 		return (EINVAL);
3406 	}
3407 	ASSERT(src->sin_family == dst->sin_family);
3408 
3409 	if (assoc->sadb_sa_flags & SADB_X_SAFLAGS_NATT_LOC) {
3410 		if (nttext_loc == NULL) {
3411 			*diagnostic = SADB_X_DIAGNOSTIC_MISSING_NATT_LOC;
3412 			return (EINVAL);
3413 		}
3414 
3415 		if (natt_loc->sin_family == AF_INET6 &&
3416 		    !IN6_IS_ADDR_V4MAPPED(&natt_loc6->sin6_addr)) {
3417 			*diagnostic = SADB_X_DIAGNOSTIC_MALFORMED_NATT_LOC;
3418 			return (EINVAL);
3419 		}
3420 	}
3421 
3422 	if (assoc->sadb_sa_flags & SADB_X_SAFLAGS_NATT_REM) {
3423 		if (nttext_rem == NULL) {
3424 			*diagnostic = SADB_X_DIAGNOSTIC_MISSING_NATT_REM;
3425 			return (EINVAL);
3426 		}
3427 		if (natt_rem->sin_family == AF_INET6 &&
3428 		    !IN6_IS_ADDR_V4MAPPED(&natt_rem6->sin6_addr)) {
3429 			*diagnostic = SADB_X_DIAGNOSTIC_MALFORMED_NATT_REM;
3430 			return (EINVAL);
3431 		}
3432 	}
3433 
3434 
3435 	/* Stuff I don't support, for now.  XXX Diagnostic? */
3436 	if (ksi->ks_in_extv[SADB_EXT_LIFETIME_CURRENT] != NULL)
3437 		return (EOPNOTSUPP);
3438 
3439 	if ((*diagnostic = sadb_labelchk(ksi)) != 0)
3440 		return (EINVAL);
3441 
3442 	/*
3443 	 * XXX Policy :  I'm not checking identities at this time,
3444 	 * but if I did, I'd do them here, before I sent
3445 	 * the weak key check up to the algorithm.
3446 	 */
3447 
3448 	rw_enter(&ipss->ipsec_alg_lock, RW_READER);
3449 
3450 	/*
3451 	 * First locate the authentication algorithm.
3452 	 */
3453 #ifdef IPSEC_LATENCY_TEST
3454 	if (akey != NULL && assoc->sadb_sa_auth != SADB_AALG_NONE) {
3455 #else
3456 	if (akey != NULL) {
3457 #endif
3458 		ipsec_alginfo_t *aalg;
3459 
3460 		aalg = ipss->ipsec_alglists[IPSEC_ALG_AUTH]
3461 		    [assoc->sadb_sa_auth];
3462 		if (aalg == NULL || !ALG_VALID(aalg)) {
3463 			rw_exit(&ipss->ipsec_alg_lock);
3464 			esp1dbg(espstack, ("Couldn't find auth alg #%d.\n",
3465 			    assoc->sadb_sa_auth));
3466 			*diagnostic = SADB_X_DIAGNOSTIC_BAD_AALG;
3467 			return (EINVAL);
3468 		}
3469 
3470 		/*
3471 		 * Sanity check key sizes.
3472 		 * Note: It's not possible to use SADB_AALG_NONE because
3473 		 * this auth_alg is not defined with ALG_FLAG_VALID. If this
3474 		 * ever changes, the same check for SADB_AALG_NONE and
3475 		 * a auth_key != NULL should be made here ( see below).
3476 		 */
3477 		if (!ipsec_valid_key_size(akey->sadb_key_bits, aalg)) {
3478 			rw_exit(&ipss->ipsec_alg_lock);
3479 			*diagnostic = SADB_X_DIAGNOSTIC_BAD_AKEYBITS;
3480 			return (EINVAL);
3481 		}
3482 		ASSERT(aalg->alg_mech_type != CRYPTO_MECHANISM_INVALID);
3483 
3484 		/* check key and fix parity if needed */
3485 		if (ipsec_check_key(aalg->alg_mech_type, akey, B_TRUE,
3486 		    diagnostic) != 0) {
3487 			rw_exit(&ipss->ipsec_alg_lock);
3488 			return (EINVAL);
3489 		}
3490 	}
3491 
3492 	/*
3493 	 * Then locate the encryption algorithm.
3494 	 */
3495 	if (ekey != NULL) {
3496 		uint_t keybits;
3497 		ipsec_alginfo_t *ealg;
3498 
3499 		ealg = ipss->ipsec_alglists[IPSEC_ALG_ENCR]
3500 		    [assoc->sadb_sa_encrypt];
3501 		if (ealg == NULL || !ALG_VALID(ealg)) {
3502 			rw_exit(&ipss->ipsec_alg_lock);
3503 			esp1dbg(espstack, ("Couldn't find encr alg #%d.\n",
3504 			    assoc->sadb_sa_encrypt));
3505 			*diagnostic = SADB_X_DIAGNOSTIC_BAD_EALG;
3506 			return (EINVAL);
3507 		}
3508 
3509 		/*
3510 		 * Sanity check key sizes. If the encryption algorithm is
3511 		 * SADB_EALG_NULL but the encryption key is NOT
3512 		 * NULL then complain.
3513 		 *
3514 		 * The keying material includes salt bits if required by
3515 		 * algorithm and optionally the Initial IV, check the
3516 		 * length of whats left.
3517 		 */
3518 		keybits = ekey->sadb_key_bits;
3519 		keybits -= ekey->sadb_key_reserved;
3520 		keybits -= SADB_8TO1(ealg->alg_saltlen);
3521 		if ((assoc->sadb_sa_encrypt == SADB_EALG_NULL) ||
3522 		    (!ipsec_valid_key_size(keybits, ealg))) {
3523 			rw_exit(&ipss->ipsec_alg_lock);
3524 			*diagnostic = SADB_X_DIAGNOSTIC_BAD_EKEYBITS;
3525 			return (EINVAL);
3526 		}
3527 		ASSERT(ealg->alg_mech_type != CRYPTO_MECHANISM_INVALID);
3528 
3529 		/* check key */
3530 		if (ipsec_check_key(ealg->alg_mech_type, ekey, B_FALSE,
3531 		    diagnostic) != 0) {
3532 			rw_exit(&ipss->ipsec_alg_lock);
3533 			return (EINVAL);
3534 		}
3535 	}
3536 	rw_exit(&ipss->ipsec_alg_lock);
3537 
3538 	return (esp_add_sa_finish(mp, (sadb_msg_t *)mp->b_cont->b_rptr, ksi,
3539 	    diagnostic, espstack));
3540 }
3541 
3542 /*
3543  * Update a security association.  Updates come in two varieties.  The first
3544  * is an update of lifetimes on a non-larval SA.  The second is an update of
3545  * a larval SA, which ends up looking a lot more like an add.
3546  */
3547 static int
3548 esp_update_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic,
3549     ipsecesp_stack_t *espstack, uint8_t sadb_msg_type)
3550 {
3551 	sadb_sa_t *assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SA];
3552 	mblk_t    *buf_pkt;
3553 	int rcode;
3554 
3555 	sadb_address_t *dstext =
3556 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3557 
3558 	if (dstext == NULL) {
3559 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_DST;
3560 		return (EINVAL);
3561 	}
3562 
3563 	rcode = sadb_update_sa(mp, ksi, &buf_pkt, &espstack->esp_sadb,
3564 	    diagnostic, espstack->esp_pfkey_q, esp_add_sa,
3565 	    espstack->ipsecesp_netstack, sadb_msg_type);
3566 
3567 	if ((assoc->sadb_sa_state != SADB_X_SASTATE_ACTIVE) ||
3568 	    (rcode != 0)) {
3569 		return (rcode);
3570 	}
3571 
3572 	HANDLE_BUF_PKT(esp_taskq, espstack->ipsecesp_netstack->netstack_ipsec,
3573 	    espstack->esp_dropper, buf_pkt);
3574 
3575 	return (rcode);
3576 }
3577 
3578 /* XXX refactor me */
3579 /*
3580  * Delete a security association.  This is REALLY likely to be code common to
3581  * both AH and ESP.  Find the association, then unlink it.
3582  */
3583 static int
3584 esp_del_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic,
3585     ipsecesp_stack_t *espstack, uint8_t sadb_msg_type)
3586 {
3587 	sadb_sa_t *assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SA];
3588 	sadb_address_t *dstext =
3589 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3590 	sadb_address_t *srcext =
3591 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_SRC];
3592 	struct sockaddr_in *sin;
3593 
3594 	if (assoc == NULL) {
3595 		if (dstext != NULL) {
3596 			sin = (struct sockaddr_in *)(dstext + 1);
3597 		} else if (srcext != NULL) {
3598 			sin = (struct sockaddr_in *)(srcext + 1);
3599 		} else {
3600 			*diagnostic = SADB_X_DIAGNOSTIC_MISSING_SA;
3601 			return (EINVAL);
3602 		}
3603 		return (sadb_purge_sa(mp, ksi,
3604 		    (sin->sin_family == AF_INET6) ? &espstack->esp_sadb.s_v6 :
3605 		    &espstack->esp_sadb.s_v4, diagnostic,
3606 		    espstack->esp_pfkey_q));
3607 	}
3608 
3609 	return (sadb_delget_sa(mp, ksi, &espstack->esp_sadb, diagnostic,
3610 	    espstack->esp_pfkey_q, sadb_msg_type));
3611 }
3612 
3613 /* XXX refactor me */
3614 /*
3615  * Convert the entire contents of all of ESP's SA tables into PF_KEY SADB_DUMP
3616  * messages.
3617  */
3618 static void
3619 esp_dump(mblk_t *mp, keysock_in_t *ksi, ipsecesp_stack_t *espstack)
3620 {
3621 	int error;
3622 	sadb_msg_t *samsg;
3623 
3624 	/*
3625 	 * Dump each fanout, bailing if error is non-zero.
3626 	 */
3627 
3628 	error = sadb_dump(espstack->esp_pfkey_q, mp, ksi,
3629 	    &espstack->esp_sadb.s_v4);
3630 	if (error != 0)
3631 		goto bail;
3632 
3633 	error = sadb_dump(espstack->esp_pfkey_q, mp, ksi,
3634 	    &espstack->esp_sadb.s_v6);
3635 bail:
3636 	ASSERT(mp->b_cont != NULL);
3637 	samsg = (sadb_msg_t *)mp->b_cont->b_rptr;
3638 	samsg->sadb_msg_errno = (uint8_t)error;
3639 	sadb_pfkey_echo(espstack->esp_pfkey_q, mp,
3640 	    (sadb_msg_t *)mp->b_cont->b_rptr, ksi, NULL);
3641 }
3642 
3643 /*
3644  * First-cut reality check for an inbound PF_KEY message.
3645  */
3646 static boolean_t
3647 esp_pfkey_reality_failures(mblk_t *mp, keysock_in_t *ksi,
3648     ipsecesp_stack_t *espstack)
3649 {
3650 	int diagnostic;
3651 
3652 	if (ksi->ks_in_extv[SADB_EXT_PROPOSAL] != NULL) {
3653 		diagnostic = SADB_X_DIAGNOSTIC_PROP_PRESENT;
3654 		goto badmsg;
3655 	}
3656 	if (ksi->ks_in_extv[SADB_EXT_SUPPORTED_AUTH] != NULL ||
3657 	    ksi->ks_in_extv[SADB_EXT_SUPPORTED_ENCRYPT] != NULL) {
3658 		diagnostic = SADB_X_DIAGNOSTIC_SUPP_PRESENT;
3659 		goto badmsg;
3660 	}
3661 	return (B_FALSE);	/* False ==> no failures */
3662 
3663 badmsg:
3664 	sadb_pfkey_error(espstack->esp_pfkey_q, mp, EINVAL, diagnostic,
3665 	    ksi->ks_in_serial);
3666 	return (B_TRUE);	/* True ==> failures */
3667 }
3668 
3669 /*
3670  * ESP parsing of PF_KEY messages.  Keysock did most of the really silly
3671  * error cases.  What I receive is a fully-formed, syntactically legal
3672  * PF_KEY message.  I then need to check semantics...
3673  *
3674  * This code may become common to AH and ESP.  Stay tuned.
3675  *
3676  * I also make the assumption that db_ref's are cool.  If this assumption
3677  * is wrong, this means that someone other than keysock or me has been
3678  * mucking with PF_KEY messages.
3679  */
3680 static void
3681 esp_parse_pfkey(mblk_t *mp, ipsecesp_stack_t *espstack)
3682 {
3683 	mblk_t *msg = mp->b_cont;
3684 	sadb_msg_t *samsg;
3685 	keysock_in_t *ksi;
3686 	int error;
3687 	int diagnostic = SADB_X_DIAGNOSTIC_NONE;
3688 
3689 	ASSERT(msg != NULL);
3690 
3691 	samsg = (sadb_msg_t *)msg->b_rptr;
3692 	ksi = (keysock_in_t *)mp->b_rptr;
3693 
3694 	/*
3695 	 * If applicable, convert unspecified AF_INET6 to unspecified
3696 	 * AF_INET.  And do other address reality checks.
3697 	 */
3698 	if (!sadb_addrfix(ksi, espstack->esp_pfkey_q, mp,
3699 	    espstack->ipsecesp_netstack) ||
3700 	    esp_pfkey_reality_failures(mp, ksi, espstack)) {
3701 		return;
3702 	}
3703 
3704 	switch (samsg->sadb_msg_type) {
3705 	case SADB_ADD:
3706 		error = esp_add_sa(mp, ksi, &diagnostic,
3707 		    espstack->ipsecesp_netstack);
3708 		if (error != 0) {
3709 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3710 			    diagnostic, ksi->ks_in_serial);
3711 		}
3712 		/* else esp_add_sa() took care of things. */
3713 		break;
3714 	case SADB_DELETE:
3715 	case SADB_X_DELPAIR:
3716 	case SADB_X_DELPAIR_STATE:
3717 		error = esp_del_sa(mp, ksi, &diagnostic, espstack,
3718 		    samsg->sadb_msg_type);
3719 		if (error != 0) {
3720 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3721 			    diagnostic, ksi->ks_in_serial);
3722 		}
3723 		/* Else esp_del_sa() took care of things. */
3724 		break;
3725 	case SADB_GET:
3726 		error = sadb_delget_sa(mp, ksi, &espstack->esp_sadb,
3727 		    &diagnostic, espstack->esp_pfkey_q, samsg->sadb_msg_type);
3728 		if (error != 0) {
3729 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3730 			    diagnostic, ksi->ks_in_serial);
3731 		}
3732 		/* Else sadb_get_sa() took care of things. */
3733 		break;
3734 	case SADB_FLUSH:
3735 		sadbp_flush(&espstack->esp_sadb, espstack->ipsecesp_netstack);
3736 		sadb_pfkey_echo(espstack->esp_pfkey_q, mp, samsg, ksi, NULL);
3737 		break;
3738 	case SADB_REGISTER:
3739 		/*
3740 		 * Hmmm, let's do it!  Check for extensions (there should
3741 		 * be none), extract the fields, call esp_register_out(),
3742 		 * then either free or report an error.
3743 		 *
3744 		 * Keysock takes care of the PF_KEY bookkeeping for this.
3745 		 */
3746 		if (esp_register_out(samsg->sadb_msg_seq, samsg->sadb_msg_pid,
3747 		    ksi->ks_in_serial, espstack, msg_getcred(mp, NULL))) {
3748 			freemsg(mp);
3749 		} else {
3750 			/*
3751 			 * Only way this path hits is if there is a memory
3752 			 * failure.  It will not return B_FALSE because of
3753 			 * lack of esp_pfkey_q if I am in wput().
3754 			 */
3755 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, ENOMEM,
3756 			    diagnostic, ksi->ks_in_serial);
3757 		}
3758 		break;
3759 	case SADB_UPDATE:
3760 	case SADB_X_UPDATEPAIR:
3761 		/*
3762 		 * Find a larval, if not there, find a full one and get
3763 		 * strict.
3764 		 */
3765 		error = esp_update_sa(mp, ksi, &diagnostic, espstack,
3766 		    samsg->sadb_msg_type);
3767 		if (error != 0) {
3768 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3769 			    diagnostic, ksi->ks_in_serial);
3770 		}
3771 		/* else esp_update_sa() took care of things. */
3772 		break;
3773 	case SADB_GETSPI:
3774 		/*
3775 		 * Reserve a new larval entry.
3776 		 */
3777 		esp_getspi(mp, ksi, espstack);
3778 		break;
3779 	case SADB_ACQUIRE:
3780 		/*
3781 		 * Find larval and/or ACQUIRE record and kill it (them), I'm
3782 		 * most likely an error.  Inbound ACQUIRE messages should only
3783 		 * have the base header.
3784 		 */
3785 		sadb_in_acquire(samsg, &espstack->esp_sadb,
3786 		    espstack->esp_pfkey_q, espstack->ipsecesp_netstack);
3787 		freemsg(mp);
3788 		break;
3789 	case SADB_DUMP:
3790 		/*
3791 		 * Dump all entries.
3792 		 */
3793 		esp_dump(mp, ksi, espstack);
3794 		/* esp_dump will take care of the return message, etc. */
3795 		break;
3796 	case SADB_EXPIRE:
3797 		/* Should never reach me. */
3798 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, EOPNOTSUPP,
3799 		    diagnostic, ksi->ks_in_serial);
3800 		break;
3801 	default:
3802 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, EINVAL,
3803 		    SADB_X_DIAGNOSTIC_UNKNOWN_MSG, ksi->ks_in_serial);
3804 		break;
3805 	}
3806 }
3807 
3808 /*
3809  * Handle case where PF_KEY says it can't find a keysock for one of my
3810  * ACQUIRE messages.
3811  */
3812 static void
3813 esp_keysock_no_socket(mblk_t *mp, ipsecesp_stack_t *espstack)
3814 {
3815 	sadb_msg_t *samsg;
3816 	keysock_out_err_t *kse = (keysock_out_err_t *)mp->b_rptr;
3817 
3818 	if (mp->b_cont == NULL) {
3819 		freemsg(mp);
3820 		return;
3821 	}
3822 	samsg = (sadb_msg_t *)mp->b_cont->b_rptr;
3823 
3824 	/*
3825 	 * If keysock can't find any registered, delete the acquire record
3826 	 * immediately, and handle errors.
3827 	 */
3828 	if (samsg->sadb_msg_type == SADB_ACQUIRE) {
3829 		samsg->sadb_msg_errno = kse->ks_err_errno;
3830 		samsg->sadb_msg_len = SADB_8TO64(sizeof (*samsg));
3831 		/*
3832 		 * Use the write-side of the esp_pfkey_q
3833 		 */
3834 		sadb_in_acquire(samsg, &espstack->esp_sadb,
3835 		    WR(espstack->esp_pfkey_q), espstack->ipsecesp_netstack);
3836 	}
3837 
3838 	freemsg(mp);
3839 }
3840 
3841 /*
3842  * ESP module write put routine.
3843  */
3844 static void
3845 ipsecesp_wput(queue_t *q, mblk_t *mp)
3846 {
3847 	ipsec_info_t *ii;
3848 	struct iocblk *iocp;
3849 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
3850 
3851 	esp3dbg(espstack, ("In esp_wput().\n"));
3852 
3853 	/* NOTE: Each case must take care of freeing or passing mp. */
3854 	switch (mp->b_datap->db_type) {
3855 	case M_CTL:
3856 		if ((mp->b_wptr - mp->b_rptr) < sizeof (ipsec_info_t)) {
3857 			/* Not big enough message. */
3858 			freemsg(mp);
3859 			break;
3860 		}
3861 		ii = (ipsec_info_t *)mp->b_rptr;
3862 
3863 		switch (ii->ipsec_info_type) {
3864 		case KEYSOCK_OUT_ERR:
3865 			esp1dbg(espstack, ("Got KEYSOCK_OUT_ERR message.\n"));
3866 			esp_keysock_no_socket(mp, espstack);
3867 			break;
3868 		case KEYSOCK_IN:
3869 			ESP_BUMP_STAT(espstack, keysock_in);
3870 			esp3dbg(espstack, ("Got KEYSOCK_IN message.\n"));
3871 
3872 			/* Parse the message. */
3873 			esp_parse_pfkey(mp, espstack);
3874 			break;
3875 		case KEYSOCK_HELLO:
3876 			sadb_keysock_hello(&espstack->esp_pfkey_q, q, mp,
3877 			    esp_ager, (void *)espstack, &espstack->esp_event,
3878 			    SADB_SATYPE_ESP);
3879 			break;
3880 		default:
3881 			esp2dbg(espstack, ("Got M_CTL from above of 0x%x.\n",
3882 			    ii->ipsec_info_type));
3883 			freemsg(mp);
3884 			break;
3885 		}
3886 		break;
3887 	case M_IOCTL:
3888 		iocp = (struct iocblk *)mp->b_rptr;
3889 		switch (iocp->ioc_cmd) {
3890 		case ND_SET:
3891 		case ND_GET:
3892 			if (nd_getset(q, espstack->ipsecesp_g_nd, mp)) {
3893 				qreply(q, mp);
3894 				return;
3895 			} else {
3896 				iocp->ioc_error = ENOENT;
3897 			}
3898 			/* FALLTHRU */
3899 		default:
3900 			/* We really don't support any other ioctls, do we? */
3901 
3902 			/* Return EINVAL */
3903 			if (iocp->ioc_error != ENOENT)
3904 				iocp->ioc_error = EINVAL;
3905 			iocp->ioc_count = 0;
3906 			mp->b_datap->db_type = M_IOCACK;
3907 			qreply(q, mp);
3908 			return;
3909 		}
3910 	default:
3911 		esp3dbg(espstack,
3912 		    ("Got default message, type %d, passing to IP.\n",
3913 		    mp->b_datap->db_type));
3914 		putnext(q, mp);
3915 	}
3916 }
3917 
3918 /*
3919  * Wrapper to allow IP to trigger an ESP association failure message
3920  * during inbound SA selection.
3921  */
3922 void
3923 ipsecesp_in_assocfailure(mblk_t *mp, char level, ushort_t sl, char *fmt,
3924     uint32_t spi, void *addr, int af, ip_recv_attr_t *ira)
3925 {
3926 	netstack_t	*ns = ira->ira_ill->ill_ipst->ips_netstack;
3927 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
3928 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
3929 
3930 	if (espstack->ipsecesp_log_unknown_spi) {
3931 		ipsec_assocfailure(info.mi_idnum, 0, level, sl, fmt, spi,
3932 		    addr, af, espstack->ipsecesp_netstack);
3933 	}
3934 
3935 	ip_drop_packet(mp, B_TRUE, ira->ira_ill,
3936 	    DROPPER(ipss, ipds_esp_no_sa),
3937 	    &espstack->esp_dropper);
3938 }
3939 
3940 /*
3941  * Initialize the ESP input and output processing functions.
3942  */
3943 void
3944 ipsecesp_init_funcs(ipsa_t *sa)
3945 {
3946 	if (sa->ipsa_output_func == NULL)
3947 		sa->ipsa_output_func = esp_outbound;
3948 	if (sa->ipsa_input_func == NULL)
3949 		sa->ipsa_input_func = esp_inbound;
3950 }
3951