xref: /titanic_50/usr/src/uts/common/inet/ip/ipsecesp.c (revision 4246c8e92ef9ad6ada2b992b7af02832ff071bf7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #include <sys/stropts.h>
29 #include <sys/errno.h>
30 #include <sys/strlog.h>
31 #include <sys/tihdr.h>
32 #include <sys/socket.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/kmem.h>
36 #include <sys/zone.h>
37 #include <sys/sysmacros.h>
38 #include <sys/cmn_err.h>
39 #include <sys/vtrace.h>
40 #include <sys/debug.h>
41 #include <sys/atomic.h>
42 #include <sys/strsun.h>
43 #include <sys/random.h>
44 #include <netinet/in.h>
45 #include <net/if.h>
46 #include <netinet/ip6.h>
47 #include <net/pfkeyv2.h>
48 
49 #include <inet/common.h>
50 #include <inet/mi.h>
51 #include <inet/nd.h>
52 #include <inet/ip.h>
53 #include <inet/ip_impl.h>
54 #include <inet/ip6.h>
55 #include <inet/sadb.h>
56 #include <inet/ipsec_info.h>
57 #include <inet/ipsec_impl.h>
58 #include <inet/ipsecesp.h>
59 #include <inet/ipdrop.h>
60 #include <inet/tcp.h>
61 #include <sys/kstat.h>
62 #include <sys/policy.h>
63 #include <sys/strsun.h>
64 #include <inet/udp_impl.h>
65 #include <sys/taskq.h>
66 #include <sys/note.h>
67 
68 #include <sys/iphada.h>
69 
/*
 * Table of ND variables supported by ipsecesp.  This is the template from
 * which each stack instance's private ipsecesp_params array is copied in
 * ipsecesp_stack_init(); the copy is then registered with that instance's
 * ipsecesp_g_nd by ipsecesp_param_register().
 * All of these are alterable, within the min/max values given, at run time.
 *
 * NOTE: entry order matters.  The ipsecesp_* accessor macros that follow
 * this table select entries by array index, so inserting or reordering
 * rows requires updating those indices as well.
 */
static	ipsecespparam_t	lcl_param_arr[] = {
	/* min	max			value	name */
	{ 0,	3,			0,	"ipsecesp_debug"},
	{ 125,	32000, SADB_AGE_INTERVAL_DEFAULT, "ipsecesp_age_interval"},
	{ 1,	10,			1,	"ipsecesp_reap_delay"},
	{ 1,	SADB_MAX_REPLAY,	64,	"ipsecesp_replay_size"},
	{ 1,	300,			15,	"ipsecesp_acquire_timeout"},
	{ 1,	1800,			90,	"ipsecesp_larval_timeout"},
	/* Default lifetime values for ACQUIRE messages. */
	{ 0,	0xffffffffU,	0,	"ipsecesp_default_soft_bytes"},
	{ 0,	0xffffffffU,	0,	"ipsecesp_default_hard_bytes"},
	{ 0,	0xffffffffU,	24000,	"ipsecesp_default_soft_addtime"},
	{ 0,	0xffffffffU,	28800,	"ipsecesp_default_hard_addtime"},
	{ 0,	0xffffffffU,	0,	"ipsecesp_default_soft_usetime"},
	{ 0,	0xffffffffU,	0,	"ipsecesp_default_hard_usetime"},
	/* Index 12 and up: logging, padding-check level, NAT keepalive. */
	{ 0,	1,		0,	"ipsecesp_log_unknown_spi"},
	{ 0,	2,		1,	"ipsecesp_padding_check"},
	{ 0,	600,		20,	"ipsecesp_nat_keepalive_interval"},
};
/*
 * Shorthand accessors for individual tunables.  Each one expands to a
 * member expression on "ipsecesp_params", so it is meant to be used as
 * espstack->ipsecesp_<name>.  The numeric index must match the position of
 * the corresponding row in lcl_param_arr[].
 */
#define	ipsecesp_debug	ipsecesp_params[0].ipsecesp_param_value
#define	ipsecesp_age_interval ipsecesp_params[1].ipsecesp_param_value
/* Upper bound (not current value) of the age interval tunable. */
#define	ipsecesp_age_int_max	ipsecesp_params[1].ipsecesp_param_max
#define	ipsecesp_reap_delay	ipsecesp_params[2].ipsecesp_param_value
#define	ipsecesp_replay_size	ipsecesp_params[3].ipsecesp_param_value
#define	ipsecesp_acquire_timeout	\
	ipsecesp_params[4].ipsecesp_param_value
#define	ipsecesp_larval_timeout	\
	ipsecesp_params[5].ipsecesp_param_value
#define	ipsecesp_default_soft_bytes	\
	ipsecesp_params[6].ipsecesp_param_value
#define	ipsecesp_default_hard_bytes	\
	ipsecesp_params[7].ipsecesp_param_value
#define	ipsecesp_default_soft_addtime	\
	ipsecesp_params[8].ipsecesp_param_value
#define	ipsecesp_default_hard_addtime	\
	ipsecesp_params[9].ipsecesp_param_value
#define	ipsecesp_default_soft_usetime	\
	ipsecesp_params[10].ipsecesp_param_value
#define	ipsecesp_default_hard_usetime	\
	ipsecesp_params[11].ipsecesp_param_value
#define	ipsecesp_log_unknown_spi	\
	ipsecesp_params[12].ipsecesp_param_value
#define	ipsecesp_padding_check	\
	ipsecesp_params[13].ipsecesp_param_value
/* For ipsecesp_nat_keepalive_interval, see ipsecesp.h. */
120 
/*
 * Debug printing helpers.  The "a" argument is a complete, parenthesized
 * printf() argument list, e.g. esp1dbg(espstack, ("x = %d\n", x));
 *
 * esp0dbg() always prints; esp1dbg()/esp2dbg()/esp3dbg() print only when
 * the per-stack ipsecesp_debug tunable is at least 1, 2 or 3 respectively.
 *
 * The conditional variants are wrapped in do { } while (0) so that they
 * behave as a single statement: a bare "if" in the expansion would capture
 * a following "else" belonging to the caller.  The espstack argument is
 * parenthesized so that any pointer expression may be passed.
 */
#define	esp0dbg(a)	printf a
/* NOTE:  != 0 instead of > 0 so lint doesn't complain. */
#define	esp1dbg(espstack, a)						\
do {									\
	if ((espstack)->ipsecesp_debug != 0)				\
		printf a;						\
_NOTE(CONSTCOND)							\
} while (0)

#define	esp2dbg(espstack, a)						\
do {									\
	if ((espstack)->ipsecesp_debug > 1)				\
		printf a;						\
_NOTE(CONSTCOND)							\
} while (0)

#define	esp3dbg(espstack, a)						\
do {									\
	if ((espstack)->ipsecesp_debug > 2)				\
		printf a;						\
_NOTE(CONSTCOND)							\
} while (0)
126 
/* STREAMS entry points referenced by the qinit structures in this file. */
static int ipsecesp_open(queue_t *, dev_t *, int, int, cred_t *);
static int ipsecesp_close(queue_t *);
static void ipsecesp_rput(queue_t *, mblk_t *);
static void ipsecesp_wput(queue_t *, mblk_t *);
/* Per-netstack constructor/destructor registered in ipsecesp_ddi_init(). */
static void	*ipsecesp_stack_init(netstackid_t stackid, netstack_t *ns);
static void	ipsecesp_stack_fini(netstackid_t stackid, void *arg);
/* Installed as esp_sadb.s_acqfn in ipsecesp_stack_init(). */
static void esp_send_acquire(ipsacq_t *, mblk_t *, netstack_t *);

static void esp_prepare_udp(netstack_t *, mblk_t *, ipha_t *);
static ipsec_status_t esp_outbound_accelerated(mblk_t *, uint_t);
static ipsec_status_t esp_inbound_accelerated(mblk_t *, mblk_t *,
    boolean_t, ipsa_t *);

static boolean_t esp_register_out(uint32_t, uint32_t, uint_t,
    ipsecesp_stack_t *);
static boolean_t esp_strip_header(mblk_t *, boolean_t, uint32_t,
    kstat_named_t **, ipsecesp_stack_t *);
static ipsec_status_t esp_submit_req_inbound(mblk_t *, ipsa_t *, uint_t);
static ipsec_status_t esp_submit_req_outbound(mblk_t *, ipsa_t *, uchar_t *,
    uint_t);
/* Function pointer defined outside this file. */
extern void (*cl_inet_getspi)(netstackid_t, uint8_t, uint8_t *, size_t,
    void *);
/*
 * Settable in /etc/system.  Passed to sadbp_init() as the hash size when a
 * stack instance is created.
 */
uint32_t esp_hash_size = IPSEC_DEFAULT_HASH_SIZE;

/*
 * STREAMS module information shared by both queues.  info.mi_idnum is also
 * used elsewhere in this file as the module id for logging and for the
 * ager's retimeout call.
 */
static struct module_info info = {
	5137, "ipsecesp", 0, INFPSZ, 65536, 1024
};

/* Read side: put procedure is ipsecesp_rput(); open/close are shared. */
static struct qinit rinit = {
	(pfi_t)ipsecesp_rput, NULL, ipsecesp_open, ipsecesp_close, NULL, &info,
	NULL
};

/* Write side: put procedure is ipsecesp_wput(); open/close are shared. */
static struct qinit winit = {
	(pfi_t)ipsecesp_wput, NULL, ipsecesp_open, ipsecesp_close, NULL, &info,
	NULL
};

/* Module streamtab tying the read and write qinit structures together. */
struct streamtab ipsecespinfo = {
	&rinit, &winit, NULL, NULL
};

/* Created in ipsecesp_ddi_init() and destroyed in ipsecesp_ddi_destroy(). */
static taskq_t *esp_taskq;
172 
173 /*
174  * OTOH, this one is set at open/close, and I'm D_MTQPAIR for now.
175  *
176  * Question:	Do I need this, given that all instance's esps->esps_wq point
177  *		to IP?
178  *
179  * Answer:	Yes, because I need to know which queue is BOUND to
180  *		IPPROTO_ESP
181  */
182 
/*
 * Stats.  This may eventually become a full-blown SNMP MIB once that spec
 * stabilizes.
 *
 * Every member is a named kstat that esp_kstat_init() initializes as a
 * 64-bit unsigned counter whose name is the member name with the
 * "esp_stat_" prefix removed.  The ESP_BUMP_STAT()/ESP_DEBUMP_STAT()
 * macros take that short name.
 */

typedef struct esp_kstats_s {
	/* Refreshed from ipsec_nalgs[IPSEC_ALG_AUTH] by esp_kstat_update(). */
	kstat_named_t esp_stat_num_aalgs;
	kstat_named_t esp_stat_good_auth;
	kstat_named_t esp_stat_bad_auth;
	/* Bumped by esp_strip_header() when pad bytes fail verification. */
	kstat_named_t esp_stat_bad_padding;
	kstat_named_t esp_stat_replay_failures;
	kstat_named_t esp_stat_replay_early_failures;
	kstat_named_t esp_stat_keysock_in;
	kstat_named_t esp_stat_out_requests;
	kstat_named_t esp_stat_acquire_requests;
	kstat_named_t esp_stat_bytes_expired;
	kstat_named_t esp_stat_out_discards;
	kstat_named_t esp_stat_in_accelerated;
	kstat_named_t esp_stat_out_accelerated;
	kstat_named_t esp_stat_noaccel;
	kstat_named_t esp_stat_crypto_sync;
	kstat_named_t esp_stat_crypto_async;
	kstat_named_t esp_stat_crypto_failures;
	/* Refreshed from ipsec_nalgs[IPSEC_ALG_ENCR] by esp_kstat_update(). */
	kstat_named_t esp_stat_num_ealgs;
	/* Bumped by esp_strip_header() when the pad length is implausible. */
	kstat_named_t esp_stat_bad_decrypt;
	kstat_named_t esp_stat_sa_port_renumbers;
} esp_kstats_t;
210 
/*
 * espstack->esp_kstats is equal to espstack->esp_ksp->ks_data if
 * kstat_create_netstack for espstack->esp_ksp succeeds, but when it
 * fails, it will be NULL. Note this is done for all stack instances,
 * so it *could* fail. Hence a non-NULL check is done in
 * ESP_BUMP_STAT and ESP_DEBUMP_STAT.
 *
 * "espstack" may be any expression yielding a pointer to the stack
 * instance (it is parenthesized in the expansion, but evaluated twice, so
 * it must not have side effects).  "x" is the counter name without the
 * "esp_stat_" prefix.  The increment/decrement is a plain, non-atomic
 * read-modify-write.
 */
#define	ESP_BUMP_STAT(espstack, x)					\
do {									\
	if ((espstack)->esp_kstats != NULL)				\
		((espstack)->esp_kstats->esp_stat_ ## x).value.ui64++;	\
_NOTE(CONSTCOND)							\
} while (0)

#define	ESP_DEBUMP_STAT(espstack, x)					\
do {									\
	if ((espstack)->esp_kstats != NULL)				\
		((espstack)->esp_kstats->esp_stat_ ## x).value.ui64--;	\
_NOTE(CONSTCOND)							\
} while (0)
231 
232 static int	esp_kstat_update(kstat_t *, int);
233 
234 static boolean_t
235 esp_kstat_init(ipsecesp_stack_t *espstack, netstackid_t stackid)
236 {
237 	espstack->esp_ksp = kstat_create_netstack("ipsecesp", 0, "esp_stat",
238 	    "net", KSTAT_TYPE_NAMED,
239 	    sizeof (esp_kstats_t) / sizeof (kstat_named_t),
240 	    KSTAT_FLAG_PERSISTENT, stackid);
241 
242 	if (espstack->esp_ksp == NULL || espstack->esp_ksp->ks_data == NULL)
243 		return (B_FALSE);
244 
245 	espstack->esp_kstats = espstack->esp_ksp->ks_data;
246 
247 	espstack->esp_ksp->ks_update = esp_kstat_update;
248 	espstack->esp_ksp->ks_private = (void *)(uintptr_t)stackid;
249 
250 #define	K64 KSTAT_DATA_UINT64
251 #define	KI(x) kstat_named_init(&(espstack->esp_kstats->esp_stat_##x), #x, K64)
252 
253 	KI(num_aalgs);
254 	KI(num_ealgs);
255 	KI(good_auth);
256 	KI(bad_auth);
257 	KI(bad_padding);
258 	KI(replay_failures);
259 	KI(replay_early_failures);
260 	KI(keysock_in);
261 	KI(out_requests);
262 	KI(acquire_requests);
263 	KI(bytes_expired);
264 	KI(out_discards);
265 	KI(in_accelerated);
266 	KI(out_accelerated);
267 	KI(noaccel);
268 	KI(crypto_sync);
269 	KI(crypto_async);
270 	KI(crypto_failures);
271 	KI(bad_decrypt);
272 	KI(sa_port_renumbers);
273 
274 #undef KI
275 #undef K64
276 
277 	kstat_install(espstack->esp_ksp);
278 
279 	return (B_TRUE);
280 }
281 
282 static int
283 esp_kstat_update(kstat_t *kp, int rw)
284 {
285 	esp_kstats_t *ekp;
286 	netstackid_t	stackid = (zoneid_t)(uintptr_t)kp->ks_private;
287 	netstack_t	*ns;
288 	ipsec_stack_t	*ipss;
289 
290 	if ((kp == NULL) || (kp->ks_data == NULL))
291 		return (EIO);
292 
293 	if (rw == KSTAT_WRITE)
294 		return (EACCES);
295 
296 	ns = netstack_find_by_stackid(stackid);
297 	if (ns == NULL)
298 		return (-1);
299 	ipss = ns->netstack_ipsec;
300 	if (ipss == NULL) {
301 		netstack_rele(ns);
302 		return (-1);
303 	}
304 	ekp = (esp_kstats_t *)kp->ks_data;
305 
306 	mutex_enter(&ipss->ipsec_alg_lock);
307 	ekp->esp_stat_num_aalgs.value.ui64 =
308 	    ipss->ipsec_nalgs[IPSEC_ALG_AUTH];
309 	ekp->esp_stat_num_ealgs.value.ui64 =
310 	    ipss->ipsec_nalgs[IPSEC_ALG_ENCR];
311 	mutex_exit(&ipss->ipsec_alg_lock);
312 
313 	netstack_rele(ns);
314 	return (0);
315 }
316 
317 #ifdef DEBUG
318 /*
319  * Debug routine, useful to see pre-encryption data.
320  */
321 static char *
322 dump_msg(mblk_t *mp)
323 {
324 	char tmp_str[3], tmp_line[256];
325 
326 	while (mp != NULL) {
327 		unsigned char *ptr;
328 
329 		printf("mblk address 0x%p, length %ld, db_ref %d "
330 		    "type %d, base 0x%p, lim 0x%p\n",
331 		    (void *) mp, (long)(mp->b_wptr - mp->b_rptr),
332 		    mp->b_datap->db_ref, mp->b_datap->db_type,
333 		    (void *)mp->b_datap->db_base, (void *)mp->b_datap->db_lim);
334 		ptr = mp->b_rptr;
335 
336 		tmp_line[0] = '\0';
337 		while (ptr < mp->b_wptr) {
338 			uint_t diff;
339 
340 			diff = (ptr - mp->b_rptr);
341 			if (!(diff & 0x1f)) {
342 				if (strlen(tmp_line) > 0) {
343 					printf("bytes: %s\n", tmp_line);
344 					tmp_line[0] = '\0';
345 				}
346 			}
347 			if (!(diff & 0x3))
348 				(void) strcat(tmp_line, " ");
349 			(void) sprintf(tmp_str, "%02x", *ptr);
350 			(void) strcat(tmp_line, tmp_str);
351 			ptr++;
352 		}
353 		if (strlen(tmp_line) > 0)
354 			printf("bytes: %s\n", tmp_line);
355 
356 		mp = mp->b_cont;
357 	}
358 
359 	return ("\n");
360 }
361 
362 #else /* DEBUG */
363 static char *
364 dump_msg(mblk_t *mp)
365 {
366 	printf("Find value of mp %p.\n", mp);
367 	return ("\n");
368 }
369 #endif /* DEBUG */
370 
371 /*
372  * Don't have to lock age_interval, as only one thread will access it at
373  * a time, because I control the one function that does with timeout().
374  */
375 static void
376 esp_ager(void *arg)
377 {
378 	ipsecesp_stack_t *espstack = (ipsecesp_stack_t *)arg;
379 	netstack_t	*ns = espstack->ipsecesp_netstack;
380 	hrtime_t begin = gethrtime();
381 
382 	sadb_ager(&espstack->esp_sadb.s_v4, espstack->esp_pfkey_q,
383 	    espstack->esp_sadb.s_ip_q, espstack->ipsecesp_reap_delay, ns);
384 	sadb_ager(&espstack->esp_sadb.s_v6, espstack->esp_pfkey_q,
385 	    espstack->esp_sadb.s_ip_q, espstack->ipsecesp_reap_delay, ns);
386 
387 	espstack->esp_event = sadb_retimeout(begin, espstack->esp_pfkey_q,
388 	    esp_ager, espstack,
389 	    &espstack->ipsecesp_age_interval, espstack->ipsecesp_age_int_max,
390 	    info.mi_idnum);
391 }
392 
393 /*
394  * Get an ESP NDD parameter.
395  */
396 /* ARGSUSED */
397 static int
398 ipsecesp_param_get(q, mp, cp, cr)
399 	queue_t	*q;
400 	mblk_t	*mp;
401 	caddr_t	cp;
402 	cred_t *cr;
403 {
404 	ipsecespparam_t	*ipsecesppa = (ipsecespparam_t *)cp;
405 	uint_t value;
406 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
407 
408 	mutex_enter(&espstack->ipsecesp_param_lock);
409 	value = ipsecesppa->ipsecesp_param_value;
410 	mutex_exit(&espstack->ipsecesp_param_lock);
411 
412 	(void) mi_mpprintf(mp, "%u", value);
413 	return (0);
414 }
415 
416 /*
417  * This routine sets an NDD variable in a ipsecespparam_t structure.
418  */
419 /* ARGSUSED */
420 static int
421 ipsecesp_param_set(q, mp, value, cp, cr)
422 	queue_t	*q;
423 	mblk_t	*mp;
424 	char	*value;
425 	caddr_t	cp;
426 	cred_t *cr;
427 {
428 	ulong_t	new_value;
429 	ipsecespparam_t	*ipsecesppa = (ipsecespparam_t *)cp;
430 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
431 
432 	/*
433 	 * Fail the request if the new value does not lie within the
434 	 * required bounds.
435 	 */
436 	if (ddi_strtoul(value, NULL, 10, &new_value) != 0 ||
437 	    new_value < ipsecesppa->ipsecesp_param_min ||
438 	    new_value > ipsecesppa->ipsecesp_param_max) {
439 		return (EINVAL);
440 	}
441 
442 	/* Set the new value */
443 	mutex_enter(&espstack->ipsecesp_param_lock);
444 	ipsecesppa->ipsecesp_param_value = new_value;
445 	mutex_exit(&espstack->ipsecesp_param_lock);
446 	return (0);
447 }
448 
449 /*
450  * Using lifetime NDD variables, fill in an extended combination's
451  * lifetime information.
452  */
453 void
454 ipsecesp_fill_defs(sadb_x_ecomb_t *ecomb, netstack_t *ns)
455 {
456 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
457 
458 	ecomb->sadb_x_ecomb_soft_bytes = espstack->ipsecesp_default_soft_bytes;
459 	ecomb->sadb_x_ecomb_hard_bytes = espstack->ipsecesp_default_hard_bytes;
460 	ecomb->sadb_x_ecomb_soft_addtime =
461 	    espstack->ipsecesp_default_soft_addtime;
462 	ecomb->sadb_x_ecomb_hard_addtime =
463 	    espstack->ipsecesp_default_hard_addtime;
464 	ecomb->sadb_x_ecomb_soft_usetime =
465 	    espstack->ipsecesp_default_soft_usetime;
466 	ecomb->sadb_x_ecomb_hard_usetime =
467 	    espstack->ipsecesp_default_hard_usetime;
468 }
469 
470 /*
471  * Initialize things for ESP at module load time.
472  */
473 boolean_t
474 ipsecesp_ddi_init(void)
475 {
476 	esp_taskq = taskq_create("esp_taskq", 1, minclsyspri,
477 	    IPSEC_TASKQ_MIN, IPSEC_TASKQ_MAX, 0);
478 
479 	/*
480 	 * We want to be informed each time a stack is created or
481 	 * destroyed in the kernel, so we can maintain the
482 	 * set of ipsecesp_stack_t's.
483 	 */
484 	netstack_register(NS_IPSECESP, ipsecesp_stack_init, NULL,
485 	    ipsecesp_stack_fini);
486 
487 	return (B_TRUE);
488 }
489 
490 /*
491  * Walk through the param array specified registering each element with the
492  * named dispatch handler.
493  */
494 static boolean_t
495 ipsecesp_param_register(IDP *ndp, ipsecespparam_t *espp, int cnt)
496 {
497 	for (; cnt-- > 0; espp++) {
498 		if (espp->ipsecesp_param_name != NULL &&
499 		    espp->ipsecesp_param_name[0]) {
500 			if (!nd_load(ndp,
501 			    espp->ipsecesp_param_name,
502 			    ipsecesp_param_get, ipsecesp_param_set,
503 			    (caddr_t)espp)) {
504 				nd_free(ndp);
505 				return (B_FALSE);
506 			}
507 		}
508 	}
509 	return (B_TRUE);
510 }
511 /*
512  * Initialize things for ESP for each stack instance
513  */
514 static void *
515 ipsecesp_stack_init(netstackid_t stackid, netstack_t *ns)
516 {
517 	ipsecesp_stack_t	*espstack;
518 	ipsecespparam_t		*espp;
519 
520 	espstack = (ipsecesp_stack_t *)kmem_zalloc(sizeof (*espstack),
521 	    KM_SLEEP);
522 	espstack->ipsecesp_netstack = ns;
523 
524 	espp = (ipsecespparam_t *)kmem_alloc(sizeof (lcl_param_arr), KM_SLEEP);
525 	espstack->ipsecesp_params = espp;
526 	bcopy(lcl_param_arr, espp, sizeof (lcl_param_arr));
527 
528 	(void) ipsecesp_param_register(&espstack->ipsecesp_g_nd, espp,
529 	    A_CNT(lcl_param_arr));
530 
531 	(void) esp_kstat_init(espstack, stackid);
532 
533 	espstack->esp_sadb.s_acquire_timeout =
534 	    &espstack->ipsecesp_acquire_timeout;
535 	espstack->esp_sadb.s_acqfn = esp_send_acquire;
536 	sadbp_init("ESP", &espstack->esp_sadb, SADB_SATYPE_ESP, esp_hash_size,
537 	    espstack->ipsecesp_netstack);
538 
539 	mutex_init(&espstack->ipsecesp_param_lock, NULL, MUTEX_DEFAULT, 0);
540 
541 	ip_drop_register(&espstack->esp_dropper, "IPsec ESP");
542 	return (espstack);
543 }
544 
545 /*
546  * Destroy things for ESP at module unload time.
547  */
548 void
549 ipsecesp_ddi_destroy(void)
550 {
551 	netstack_unregister(NS_IPSECESP);
552 	taskq_destroy(esp_taskq);
553 }
554 
555 /*
556  * Destroy things for ESP for one stack instance
557  */
558 static void
559 ipsecesp_stack_fini(netstackid_t stackid, void *arg)
560 {
561 	ipsecesp_stack_t *espstack = (ipsecesp_stack_t *)arg;
562 
563 	if (espstack->esp_pfkey_q != NULL) {
564 		(void) quntimeout(espstack->esp_pfkey_q, espstack->esp_event);
565 	}
566 	espstack->esp_sadb.s_acqfn = NULL;
567 	espstack->esp_sadb.s_acquire_timeout = NULL;
568 	sadbp_destroy(&espstack->esp_sadb, espstack->ipsecesp_netstack);
569 	ip_drop_unregister(&espstack->esp_dropper);
570 	mutex_destroy(&espstack->ipsecesp_param_lock);
571 	nd_free(&espstack->ipsecesp_g_nd);
572 
573 	kmem_free(espstack->ipsecesp_params, sizeof (lcl_param_arr));
574 	espstack->ipsecesp_params = NULL;
575 	kstat_delete_netstack(espstack->esp_ksp, stackid);
576 	espstack->esp_ksp = NULL;
577 	espstack->esp_kstats = NULL;
578 	kmem_free(espstack, sizeof (*espstack));
579 }
580 
581 /*
582  * ESP module open routine.
583  */
584 /* ARGSUSED */
585 static int
586 ipsecesp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
587 {
588 	netstack_t		*ns;
589 	ipsecesp_stack_t	*espstack;
590 
591 	if (secpolicy_ip_config(credp, B_FALSE) != 0)
592 		return (EPERM);
593 
594 	if (q->q_ptr != NULL)
595 		return (0);  /* Re-open of an already open instance. */
596 
597 	if (sflag != MODOPEN)
598 		return (EINVAL);
599 
600 	ns = netstack_find_by_cred(credp);
601 	ASSERT(ns != NULL);
602 	espstack = ns->netstack_ipsecesp;
603 	ASSERT(espstack != NULL);
604 
605 	/*
606 	 * ASSUMPTIONS (because I'm MT_OCEXCL):
607 	 *
608 	 *	* I'm being pushed on top of IP for all my opens (incl. #1).
609 	 *	* Only ipsecesp_open() can write into esp_sadb.s_ip_q.
610 	 *	* Because of this, I can check lazily for esp_sadb.s_ip_q.
611 	 *
612 	 *  If these assumptions are wrong, I'm in BIG trouble...
613 	 */
614 
615 	q->q_ptr = espstack;
616 	WR(q)->q_ptr = q->q_ptr;
617 
618 	if (espstack->esp_sadb.s_ip_q == NULL) {
619 		struct T_unbind_req *tur;
620 
621 		espstack->esp_sadb.s_ip_q = WR(q);
622 		/* Allocate an unbind... */
623 		espstack->esp_ip_unbind = allocb(sizeof (struct T_unbind_req),
624 		    BPRI_HI);
625 
626 		/*
627 		 * Send down T_BIND_REQ to bind IPPROTO_ESP.
628 		 * Handle the ACK here in ESP.
629 		 */
630 		qprocson(q);
631 		if (espstack->esp_ip_unbind == NULL ||
632 		    !sadb_t_bind_req(espstack->esp_sadb.s_ip_q, IPPROTO_ESP)) {
633 			if (espstack->esp_ip_unbind != NULL) {
634 				freeb(espstack->esp_ip_unbind);
635 				espstack->esp_ip_unbind = NULL;
636 			}
637 			q->q_ptr = NULL;
638 			netstack_rele(espstack->ipsecesp_netstack);
639 			return (ENOMEM);
640 		}
641 
642 		espstack->esp_ip_unbind->b_datap->db_type = M_PROTO;
643 		tur = (struct T_unbind_req *)espstack->esp_ip_unbind->b_rptr;
644 		tur->PRIM_type = T_UNBIND_REQ;
645 	} else {
646 		qprocson(q);
647 	}
648 
649 	/*
650 	 * For now, there's not much I can do.  I'll be getting a message
651 	 * passed down to me from keysock (in my wput), and a T_BIND_ACK
652 	 * up from IP (in my rput).
653 	 */
654 
655 	return (0);
656 }
657 
658 /*
659  * ESP module close routine.
660  */
661 static int
662 ipsecesp_close(queue_t *q)
663 {
664 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
665 
666 	/*
667 	 * If esp_sadb.s_ip_q is attached to this instance, send a
668 	 * T_UNBIND_REQ to IP for the instance before doing
669 	 * a qprocsoff().
670 	 */
671 	if (WR(q) == espstack->esp_sadb.s_ip_q &&
672 	    espstack->esp_ip_unbind != NULL) {
673 		putnext(WR(q), espstack->esp_ip_unbind);
674 		espstack->esp_ip_unbind = NULL;
675 	}
676 
677 	/*
678 	 * Clean up q_ptr, if needed.
679 	 */
680 	qprocsoff(q);
681 
682 	/* Keysock queue check is safe, because of OCEXCL perimeter. */
683 
684 	if (q == espstack->esp_pfkey_q) {
685 		esp1dbg(espstack,
686 		    ("ipsecesp_close:  Ummm... keysock is closing ESP.\n"));
687 		espstack->esp_pfkey_q = NULL;
688 		/* Detach qtimeouts. */
689 		(void) quntimeout(q, espstack->esp_event);
690 	}
691 
692 	if (WR(q) == espstack->esp_sadb.s_ip_q) {
693 		/*
694 		 * If the esp_sadb.s_ip_q is attached to this instance, find
695 		 * another.  The OCEXCL outer perimeter helps us here.
696 		 */
697 		espstack->esp_sadb.s_ip_q = NULL;
698 
699 		/*
700 		 * Find a replacement queue for esp_sadb.s_ip_q.
701 		 */
702 		if (espstack->esp_pfkey_q != NULL &&
703 		    espstack->esp_pfkey_q != RD(q)) {
704 			/*
705 			 * See if we can use the pfkey_q.
706 			 */
707 			espstack->esp_sadb.s_ip_q = WR(espstack->esp_pfkey_q);
708 		}
709 
710 		if (espstack->esp_sadb.s_ip_q == NULL ||
711 		    !sadb_t_bind_req(espstack->esp_sadb.s_ip_q, IPPROTO_ESP)) {
712 			esp1dbg(espstack, ("ipsecesp: Can't reassign ip_q.\n"));
713 			espstack->esp_sadb.s_ip_q = NULL;
714 		} else {
715 			espstack->esp_ip_unbind =
716 			    allocb(sizeof (struct T_unbind_req), BPRI_HI);
717 
718 			if (espstack->esp_ip_unbind != NULL) {
719 				struct T_unbind_req *tur;
720 
721 				espstack->esp_ip_unbind->b_datap->db_type =
722 				    M_PROTO;
723 				tur = (struct T_unbind_req *)
724 				    espstack->esp_ip_unbind->b_rptr;
725 				tur->PRIM_type = T_UNBIND_REQ;
726 			}
727 			/* If it's NULL, I can't do much here. */
728 		}
729 	}
730 
731 	netstack_rele(espstack->ipsecesp_netstack);
732 	return (0);
733 }
734 
735 /*
736  * Add a number of bytes to what the SA has protected so far.  Return
737  * B_TRUE if the SA can still protect that many bytes.
738  *
739  * Caller must REFRELE the passed-in assoc.  This function must REFRELE
740  * any obtained peer SA.
741  */
742 static boolean_t
743 esp_age_bytes(ipsa_t *assoc, uint64_t bytes, boolean_t inbound)
744 {
745 	ipsa_t *inassoc, *outassoc;
746 	isaf_t *bucket;
747 	boolean_t inrc, outrc, isv6;
748 	sadb_t *sp;
749 	int outhash;
750 	netstack_t		*ns = assoc->ipsa_netstack;
751 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
752 
753 	/* No peer?  No problem! */
754 	if (!assoc->ipsa_haspeer) {
755 		return (sadb_age_bytes(espstack->esp_pfkey_q, assoc, bytes,
756 		    B_TRUE));
757 	}
758 
759 	/*
760 	 * Otherwise, we want to grab both the original assoc and its peer.
761 	 * There might be a race for this, but if it's a real race, two
762 	 * expire messages may occur.  We limit this by only sending the
763 	 * expire message on one of the peers, we'll pick the inbound
764 	 * arbitrarily.
765 	 *
766 	 * If we need tight synchronization on the peer SA, then we need to
767 	 * reconsider.
768 	 */
769 
770 	/* Use address length to select IPv6/IPv4 */
771 	isv6 = (assoc->ipsa_addrfam == AF_INET6);
772 	sp = isv6 ? &espstack->esp_sadb.s_v6 : &espstack->esp_sadb.s_v4;
773 
774 	if (inbound) {
775 		inassoc = assoc;
776 		if (isv6) {
777 			outhash = OUTBOUND_HASH_V6(sp, *((in6_addr_t *)
778 			    &inassoc->ipsa_dstaddr));
779 		} else {
780 			outhash = OUTBOUND_HASH_V4(sp, *((ipaddr_t *)
781 			    &inassoc->ipsa_dstaddr));
782 		}
783 		bucket = &sp->sdb_of[outhash];
784 		mutex_enter(&bucket->isaf_lock);
785 		outassoc = ipsec_getassocbyspi(bucket, inassoc->ipsa_spi,
786 		    inassoc->ipsa_srcaddr, inassoc->ipsa_dstaddr,
787 		    inassoc->ipsa_addrfam);
788 		mutex_exit(&bucket->isaf_lock);
789 		if (outassoc == NULL) {
790 			/* Q: Do we wish to set haspeer == B_FALSE? */
791 			esp0dbg(("esp_age_bytes: "
792 			    "can't find peer for inbound.\n"));
793 			return (sadb_age_bytes(espstack->esp_pfkey_q, inassoc,
794 			    bytes, B_TRUE));
795 		}
796 	} else {
797 		outassoc = assoc;
798 		bucket = INBOUND_BUCKET(sp, outassoc->ipsa_spi);
799 		mutex_enter(&bucket->isaf_lock);
800 		inassoc = ipsec_getassocbyspi(bucket, outassoc->ipsa_spi,
801 		    outassoc->ipsa_srcaddr, outassoc->ipsa_dstaddr,
802 		    outassoc->ipsa_addrfam);
803 		mutex_exit(&bucket->isaf_lock);
804 		if (inassoc == NULL) {
805 			/* Q: Do we wish to set haspeer == B_FALSE? */
806 			esp0dbg(("esp_age_bytes: "
807 			    "can't find peer for outbound.\n"));
808 			return (sadb_age_bytes(espstack->esp_pfkey_q, outassoc,
809 			    bytes, B_TRUE));
810 		}
811 	}
812 
813 	inrc = sadb_age_bytes(espstack->esp_pfkey_q, inassoc, bytes, B_TRUE);
814 	outrc = sadb_age_bytes(espstack->esp_pfkey_q, outassoc, bytes, B_FALSE);
815 
816 	/*
817 	 * REFRELE any peer SA.
818 	 *
819 	 * Because of the multi-line macro nature of IPSA_REFRELE, keep
820 	 * them in { }.
821 	 */
822 	if (inbound) {
823 		IPSA_REFRELE(outassoc);
824 	} else {
825 		IPSA_REFRELE(inassoc);
826 	}
827 
828 	return (inrc && outrc);
829 }
830 
831 /*
832  * Do incoming NAT-T manipulations for packet.
833  */
834 static ipsec_status_t
835 esp_fix_natt_checksums(mblk_t *data_mp, ipsa_t *assoc)
836 {
837 	ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
838 	tcpha_t *tcph;
839 	udpha_t *udpha;
840 	/* Initialize to our inbound cksum adjustment... */
841 	uint32_t sum = assoc->ipsa_inbound_cksum;
842 
843 	switch (ipha->ipha_protocol) {
844 	case IPPROTO_TCP:
845 		tcph = (tcpha_t *)(data_mp->b_rptr +
846 		    IPH_HDR_LENGTH(ipha));
847 
848 #define	DOWN_SUM(x) (x) = ((x) & 0xFFFF) +	 ((x) >> 16)
849 		sum += ~ntohs(tcph->tha_sum) & 0xFFFF;
850 		DOWN_SUM(sum);
851 		DOWN_SUM(sum);
852 		tcph->tha_sum = ~htons(sum);
853 		break;
854 	case IPPROTO_UDP:
855 		udpha = (udpha_t *)(data_mp->b_rptr + IPH_HDR_LENGTH(ipha));
856 
857 		if (udpha->uha_checksum != 0) {
858 			/* Adujst if the inbound one was not zero. */
859 			sum += ~ntohs(udpha->uha_checksum) & 0xFFFF;
860 			DOWN_SUM(sum);
861 			DOWN_SUM(sum);
862 			udpha->uha_checksum = ~htons(sum);
863 			if (udpha->uha_checksum == 0)
864 				udpha->uha_checksum = 0xFFFF;
865 		}
866 #undef DOWN_SUM
867 		break;
868 	case IPPROTO_IP:
869 		/*
870 		 * This case is only an issue for self-encapsulated
871 		 * packets.  So for now, fall through.
872 		 */
873 		break;
874 	}
875 	return (IPSEC_STATUS_SUCCESS);
876 }
877 
878 
879 /*
880  * Strip ESP header, check padding, and fix IP header.
 * Returns B_TRUE on success, B_FALSE if an error occurred.
882  */
883 static boolean_t
884 esp_strip_header(mblk_t *data_mp, boolean_t isv4, uint32_t ivlen,
885     kstat_named_t **counter, ipsecesp_stack_t *espstack)
886 {
887 	ipha_t *ipha;
888 	ip6_t *ip6h;
889 	uint_t divpoint;
890 	mblk_t *scratch;
891 	uint8_t nexthdr, padlen;
892 	uint8_t lastpad;
893 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
894 	uint8_t *lastbyte;
895 
896 	/*
897 	 * Strip ESP data and fix IP header.
898 	 *
899 	 * XXX In case the beginning of esp_inbound() changes to not do a
900 	 * pullup, this part of the code can remain unchanged.
901 	 */
902 	if (isv4) {
903 		ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (ipha_t));
904 		ipha = (ipha_t *)data_mp->b_rptr;
905 		ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (esph_t) +
906 		    IPH_HDR_LENGTH(ipha));
907 		divpoint = IPH_HDR_LENGTH(ipha);
908 	} else {
909 		ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (ip6_t));
910 		ip6h = (ip6_t *)data_mp->b_rptr;
911 		divpoint = ip_hdr_length_v6(data_mp, ip6h);
912 	}
913 
914 	scratch = data_mp;
915 	while (scratch->b_cont != NULL)
916 		scratch = scratch->b_cont;
917 
918 	ASSERT((scratch->b_wptr - scratch->b_rptr) >= 3);
919 
920 	/*
921 	 * "Next header" and padding length are the last two bytes in the
922 	 * ESP-protected datagram, thus the explicit - 1 and - 2.
923 	 * lastpad is the last byte of the padding, which can be used for
924 	 * a quick check to see if the padding is correct.
925 	 */
926 	lastbyte = scratch->b_wptr - 1;
927 	nexthdr = *lastbyte--;
928 	padlen = *lastbyte--;
929 
930 	if (isv4) {
931 		/* Fix part of the IP header. */
932 		ipha->ipha_protocol = nexthdr;
933 		/*
934 		 * Reality check the padlen.  The explicit - 2 is for the
935 		 * padding length and the next-header bytes.
936 		 */
937 		if (padlen >= ntohs(ipha->ipha_length) - sizeof (ipha_t) - 2 -
938 		    sizeof (esph_t) - ivlen) {
939 			ESP_BUMP_STAT(espstack, bad_decrypt);
940 			ipsec_rl_strlog(espstack->ipsecesp_netstack,
941 			    info.mi_idnum, 0, 0,
942 			    SL_ERROR | SL_WARN,
943 			    "Corrupt ESP packet (padlen too big).\n");
944 			esp1dbg(espstack, ("padlen (%d) is greater than:\n",
945 			    padlen));
946 			esp1dbg(espstack, ("pkt len(%d) - ip hdr - esp "
947 			    "hdr - ivlen(%d) = %d.\n",
948 			    ntohs(ipha->ipha_length), ivlen,
949 			    (int)(ntohs(ipha->ipha_length) - sizeof (ipha_t) -
950 			    2 - sizeof (esph_t) - ivlen)));
951 			*counter = DROPPER(ipss, ipds_esp_bad_padlen);
952 			return (B_FALSE);
953 		}
954 
955 		/*
956 		 * Fix the rest of the header.  The explicit - 2 is for the
957 		 * padding length and the next-header bytes.
958 		 */
959 		ipha->ipha_length = htons(ntohs(ipha->ipha_length) - padlen -
960 		    2 - sizeof (esph_t) - ivlen);
961 		ipha->ipha_hdr_checksum = 0;
962 		ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
963 	} else {
964 		if (ip6h->ip6_nxt == IPPROTO_ESP) {
965 			ip6h->ip6_nxt = nexthdr;
966 		} else {
967 			ip6_pkt_t ipp;
968 
969 			bzero(&ipp, sizeof (ipp));
970 			(void) ip_find_hdr_v6(data_mp, ip6h, &ipp, NULL);
971 			if (ipp.ipp_dstopts != NULL) {
972 				ipp.ipp_dstopts->ip6d_nxt = nexthdr;
973 			} else if (ipp.ipp_rthdr != NULL) {
974 				ipp.ipp_rthdr->ip6r_nxt = nexthdr;
975 			} else if (ipp.ipp_hopopts != NULL) {
976 				ipp.ipp_hopopts->ip6h_nxt = nexthdr;
977 			} else {
978 				/* Panic a DEBUG kernel. */
979 				ASSERT(ipp.ipp_hopopts != NULL);
980 				/* Otherwise, pretend it's IP + ESP. */
981 				cmn_err(CE_WARN, "ESP IPv6 headers wrong.\n");
982 				ip6h->ip6_nxt = nexthdr;
983 			}
984 		}
985 
986 		if (padlen >= ntohs(ip6h->ip6_plen) - 2 - sizeof (esph_t) -
987 		    ivlen) {
988 			ESP_BUMP_STAT(espstack, bad_decrypt);
989 			ipsec_rl_strlog(espstack->ipsecesp_netstack,
990 			    info.mi_idnum, 0, 0,
991 			    SL_ERROR | SL_WARN,
992 			    "Corrupt ESP packet (v6 padlen too big).\n");
993 			esp1dbg(espstack, ("padlen (%d) is greater than:\n",
994 			    padlen));
995 			esp1dbg(espstack,
996 			    ("pkt len(%u) - ip hdr - esp hdr - ivlen(%d) = "
997 			    "%u.\n", (unsigned)(ntohs(ip6h->ip6_plen)
998 			    + sizeof (ip6_t)), ivlen,
999 			    (unsigned)(ntohs(ip6h->ip6_plen) - 2 -
1000 			    sizeof (esph_t) - ivlen)));
1001 			*counter = DROPPER(ipss, ipds_esp_bad_padlen);
1002 			return (B_FALSE);
1003 		}
1004 
1005 
1006 		/*
1007 		 * Fix the rest of the header.  The explicit - 2 is for the
1008 		 * padding length and the next-header bytes.  IPv6 is nice,
1009 		 * because there's no hdr checksum!
1010 		 */
1011 		ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - padlen -
1012 		    2 - sizeof (esph_t) - ivlen);
1013 	}
1014 
1015 	if (espstack->ipsecesp_padding_check > 0 && padlen > 0) {
1016 		/*
1017 		 * Weak padding check: compare last-byte to length, they
1018 		 * should be equal.
1019 		 */
1020 		lastpad = *lastbyte--;
1021 
1022 		if (padlen != lastpad) {
1023 			ipsec_rl_strlog(espstack->ipsecesp_netstack,
1024 			    info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
1025 			    "Corrupt ESP packet (lastpad != padlen).\n");
1026 			esp1dbg(espstack,
1027 			    ("lastpad (%d) not equal to padlen (%d):\n",
1028 			    lastpad, padlen));
1029 			ESP_BUMP_STAT(espstack, bad_padding);
1030 			*counter = DROPPER(ipss, ipds_esp_bad_padding);
1031 			return (B_FALSE);
1032 		}
1033 
1034 		/*
1035 		 * Strong padding check: Check all pad bytes to see that
1036 		 * they're ascending.  Go backwards using a descending counter
1037 		 * to verify.  padlen == 1 is checked by previous block, so
1038 		 * only bother if we've more than 1 byte of padding.
1039 		 * Consequently, start the check one byte before the location
1040 		 * of "lastpad".
1041 		 */
1042 		if (espstack->ipsecesp_padding_check > 1) {
1043 			/*
1044 			 * This assert may have to become an if and a pullup
1045 			 * if we start accepting multi-dblk mblks. For now,
1046 			 * though, any packet here will have been pulled up in
1047 			 * esp_inbound.
1048 			 */
1049 			ASSERT(MBLKL(scratch) >= lastpad + 3);
1050 
1051 			/*
1052 			 * Use "--lastpad" because we already checked the very
1053 			 * last pad byte previously.
1054 			 */
1055 			while (--lastpad != 0) {
1056 				if (lastpad != *lastbyte) {
1057 					ipsec_rl_strlog(
1058 					    espstack->ipsecesp_netstack,
1059 					    info.mi_idnum, 0, 0,
1060 					    SL_ERROR | SL_WARN, "Corrupt ESP "
1061 					    "packet (bad padding).\n");
1062 					esp1dbg(espstack,
1063 					    ("padding not in correct"
1064 					    " format:\n"));
1065 					ESP_BUMP_STAT(espstack, bad_padding);
1066 					*counter = DROPPER(ipss,
1067 					    ipds_esp_bad_padding);
1068 					return (B_FALSE);
1069 				}
1070 				lastbyte--;
1071 			}
1072 		}
1073 	}
1074 
1075 	/* Trim off the padding. */
1076 	ASSERT(data_mp->b_cont == NULL);
1077 	data_mp->b_wptr -= (padlen + 2);
1078 
1079 	/*
1080 	 * Remove the ESP header.
1081 	 *
1082 	 * The above assertions about data_mp's size will make this work.
1083 	 *
1084 	 * XXX  Question:  If I send up and get back a contiguous mblk,
1085 	 * would it be quicker to bcopy over, or keep doing the dupb stuff?
1086 	 * I go with copying for now.
1087 	 */
1088 
1089 	if (IS_P2ALIGNED(data_mp->b_rptr, sizeof (uint32_t)) &&
1090 	    IS_P2ALIGNED(ivlen, sizeof (uint32_t))) {
1091 		uint8_t *start = data_mp->b_rptr;
1092 		uint32_t *src, *dst;
1093 
1094 		src = (uint32_t *)(start + divpoint);
1095 		dst = (uint32_t *)(start + divpoint + sizeof (esph_t) + ivlen);
1096 
1097 		ASSERT(IS_P2ALIGNED(dst, sizeof (uint32_t)) &&
1098 		    IS_P2ALIGNED(src, sizeof (uint32_t)));
1099 
1100 		do {
1101 			src--;
1102 			dst--;
1103 			*dst = *src;
1104 		} while (src != (uint32_t *)start);
1105 
1106 		data_mp->b_rptr = (uchar_t *)dst;
1107 	} else {
1108 		uint8_t *start = data_mp->b_rptr;
1109 		uint8_t *src, *dst;
1110 
1111 		src = start + divpoint;
1112 		dst = src + sizeof (esph_t) + ivlen;
1113 
1114 		do {
1115 			src--;
1116 			dst--;
1117 			*dst = *src;
1118 		} while (src != start);
1119 
1120 		data_mp->b_rptr = dst;
1121 	}
1122 
1123 	esp2dbg(espstack, ("data_mp after inbound ESP adjustment:\n"));
1124 	esp2dbg(espstack, (dump_msg(data_mp)));
1125 
1126 	return (B_TRUE);
1127 }
1128 
1129 /*
1130  * Updating use times can be tricky business if the ipsa_haspeer flag is
1131  * set.  This function is called once in an SA's lifetime.
1132  *
1133  * Caller has to REFRELE "assoc" which is passed in.  This function has
1134  * to REFRELE any peer SA that is obtained.
1135  */
1136 static void
1137 esp_set_usetime(ipsa_t *assoc, boolean_t inbound)
1138 {
1139 	ipsa_t *inassoc, *outassoc;
1140 	isaf_t *bucket;
1141 	sadb_t *sp;
1142 	int outhash;
1143 	boolean_t isv6;
1144 	netstack_t		*ns = assoc->ipsa_netstack;
1145 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
1146 
1147 	/* No peer?  No problem! */
1148 	if (!assoc->ipsa_haspeer) {
1149 		sadb_set_usetime(assoc);
1150 		return;
1151 	}
1152 
1153 	/*
1154 	 * Otherwise, we want to grab both the original assoc and its peer.
1155 	 * There might be a race for this, but if it's a real race, the times
1156 	 * will be out-of-synch by at most a second, and since our time
1157 	 * granularity is a second, this won't be a problem.
1158 	 *
1159 	 * If we need tight synchronization on the peer SA, then we need to
1160 	 * reconsider.
1161 	 */
1162 
1163 	/* Use address length to select IPv6/IPv4 */
1164 	isv6 = (assoc->ipsa_addrfam == AF_INET6);
1165 	sp = isv6 ? &espstack->esp_sadb.s_v6 : &espstack->esp_sadb.s_v4;
1166 
1167 	if (inbound) {
1168 		inassoc = assoc;
1169 		if (isv6) {
1170 			outhash = OUTBOUND_HASH_V6(sp, *((in6_addr_t *)
1171 			    &inassoc->ipsa_dstaddr));
1172 		} else {
1173 			outhash = OUTBOUND_HASH_V4(sp, *((ipaddr_t *)
1174 			    &inassoc->ipsa_dstaddr));
1175 		}
1176 		bucket = &sp->sdb_of[outhash];
1177 		mutex_enter(&bucket->isaf_lock);
1178 		outassoc = ipsec_getassocbyspi(bucket, inassoc->ipsa_spi,
1179 		    inassoc->ipsa_srcaddr, inassoc->ipsa_dstaddr,
1180 		    inassoc->ipsa_addrfam);
1181 		mutex_exit(&bucket->isaf_lock);
1182 		if (outassoc == NULL) {
1183 			/* Q: Do we wish to set haspeer == B_FALSE? */
1184 			esp0dbg(("esp_set_usetime: "
1185 			    "can't find peer for inbound.\n"));
1186 			sadb_set_usetime(inassoc);
1187 			return;
1188 		}
1189 	} else {
1190 		outassoc = assoc;
1191 		bucket = INBOUND_BUCKET(sp, outassoc->ipsa_spi);
1192 		mutex_enter(&bucket->isaf_lock);
1193 		inassoc = ipsec_getassocbyspi(bucket, outassoc->ipsa_spi,
1194 		    outassoc->ipsa_srcaddr, outassoc->ipsa_dstaddr,
1195 		    outassoc->ipsa_addrfam);
1196 		mutex_exit(&bucket->isaf_lock);
1197 		if (inassoc == NULL) {
1198 			/* Q: Do we wish to set haspeer == B_FALSE? */
1199 			esp0dbg(("esp_set_usetime: "
1200 			    "can't find peer for outbound.\n"));
1201 			sadb_set_usetime(outassoc);
1202 			return;
1203 		}
1204 	}
1205 
1206 	/* Update usetime on both. */
1207 	sadb_set_usetime(inassoc);
1208 	sadb_set_usetime(outassoc);
1209 
1210 	/*
1211 	 * REFRELE any peer SA.
1212 	 *
1213 	 * Because of the multi-line macro nature of IPSA_REFRELE, keep
1214 	 * them in { }.
1215 	 */
1216 	if (inbound) {
1217 		IPSA_REFRELE(outassoc);
1218 	} else {
1219 		IPSA_REFRELE(inassoc);
1220 	}
1221 }
1222 
1223 /*
1224  * Handle ESP inbound data for IPv4 and IPv6.
1225  * On success returns B_TRUE, on failure returns B_FALSE and frees the
1226  * mblk chain ipsec_in_mp.
1227  */
1228 ipsec_status_t
1229 esp_inbound(mblk_t *ipsec_in_mp, void *arg)
1230 {
1231 	mblk_t *data_mp = ipsec_in_mp->b_cont;
1232 	ipsec_in_t *ii = (ipsec_in_t *)ipsec_in_mp->b_rptr;
1233 	esph_t *esph = (esph_t *)arg;
1234 	ipsa_t *ipsa = ii->ipsec_in_esp_sa;
1235 	netstack_t	*ns = ii->ipsec_in_ns;
1236 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1237 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1238 
1239 	/*
1240 	 * We may wish to check replay in-range-only here as an optimization.
1241 	 * Include the reality check of ipsa->ipsa_replay >
1242 	 * ipsa->ipsa_replay_wsize for times when it's the first N packets,
1243 	 * where N == ipsa->ipsa_replay_wsize.
1244 	 *
1245 	 * Another check that may come here later is the "collision" check.
1246 	 * If legitimate packets flow quickly enough, this won't be a problem,
1247 	 * but collisions may cause authentication algorithm crunching to
1248 	 * take place when it doesn't need to.
1249 	 */
1250 	if (!sadb_replay_peek(ipsa, esph->esph_replay)) {
1251 		ESP_BUMP_STAT(espstack, replay_early_failures);
1252 		IP_ESP_BUMP_STAT(ipss, in_discards);
1253 		/*
1254 		 * TODO: Extract inbound interface from the IPSEC_IN
1255 		 * message's ii->ipsec_in_rill_index.
1256 		 */
1257 		ip_drop_packet(ipsec_in_mp, B_TRUE, NULL, NULL,
1258 		    DROPPER(ipss, ipds_esp_early_replay),
1259 		    &espstack->esp_dropper);
1260 		return (IPSEC_STATUS_FAILED);
1261 	}
1262 
1263 	/*
1264 	 * Has this packet already been processed by a hardware
1265 	 * IPsec accelerator?
1266 	 */
1267 	if (ii->ipsec_in_accelerated) {
1268 		ipsec_status_t rv;
1269 		esp3dbg(espstack,
1270 		    ("esp_inbound: pkt processed by ill=%d isv6=%d\n",
1271 		    ii->ipsec_in_ill_index, !ii->ipsec_in_v4));
1272 		rv = esp_inbound_accelerated(ipsec_in_mp,
1273 		    data_mp, ii->ipsec_in_v4, ipsa);
1274 		return (rv);
1275 	}
1276 	ESP_BUMP_STAT(espstack, noaccel);
1277 
1278 	/*
1279 	 * Adjust the IP header's payload length to reflect the removal
1280 	 * of the ICV.
1281 	 */
1282 	if (!ii->ipsec_in_v4) {
1283 		ip6_t *ip6h = (ip6_t *)data_mp->b_rptr;
1284 		ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) -
1285 		    ipsa->ipsa_mac_len);
1286 	} else {
1287 		ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
1288 		ipha->ipha_length = htons(ntohs(ipha->ipha_length) -
1289 		    ipsa->ipsa_mac_len);
1290 	}
1291 
1292 	/* submit the request to the crypto framework */
1293 	return (esp_submit_req_inbound(ipsec_in_mp, ipsa,
1294 	    (uint8_t *)esph - data_mp->b_rptr));
1295 }
1296 
1297 /*
1298  * Perform the really difficult work of inserting the proposed situation.
1299  * Called while holding the algorithm lock.
1300  */
1301 static void
1302 esp_insert_prop(sadb_prop_t *prop, ipsacq_t *acqrec, uint_t combs)
1303 {
1304 	sadb_comb_t *comb = (sadb_comb_t *)(prop + 1);
1305 	ipsec_out_t *io;
1306 	ipsec_action_t *ap;
1307 	ipsec_prot_t *prot;
1308 	netstack_t *ns;
1309 	ipsecesp_stack_t *espstack;
1310 	ipsec_stack_t *ipss;
1311 
1312 	io = (ipsec_out_t *)acqrec->ipsacq_mp->b_rptr;
1313 	ASSERT(io->ipsec_out_type == IPSEC_OUT);
1314 	ns = io->ipsec_out_ns;
1315 	espstack = ns->netstack_ipsecesp;
1316 	ipss = ns->netstack_ipsec;
1317 	ASSERT(MUTEX_HELD(&ipss->ipsec_alg_lock));
1318 
1319 	prop->sadb_prop_exttype = SADB_EXT_PROPOSAL;
1320 	prop->sadb_prop_len = SADB_8TO64(sizeof (sadb_prop_t));
1321 	*(uint32_t *)(&prop->sadb_prop_replay) = 0;	/* Quick zero-out! */
1322 
1323 	prop->sadb_prop_replay = espstack->ipsecesp_replay_size;
1324 
1325 	/*
1326 	 * Based upon algorithm properties, and what-not, prioritize
1327 	 * a proposal.  If the IPSEC_OUT message has an algorithm specified,
1328 	 * use it first and foremost.
1329 	 *
1330 	 * For each action in policy list
1331 	 *   Add combination.  If I've hit limit, return.
1332 	 */
1333 
1334 	for (ap = acqrec->ipsacq_act; ap != NULL;
1335 	    ap = ap->ipa_next) {
1336 		ipsec_alginfo_t *ealg = NULL;
1337 		ipsec_alginfo_t *aalg = NULL;
1338 
1339 		if (ap->ipa_act.ipa_type != IPSEC_POLICY_APPLY)
1340 			continue;
1341 
1342 		prot = &ap->ipa_act.ipa_apply;
1343 
1344 		if (!(prot->ipp_use_esp))
1345 			continue;
1346 
1347 		if (prot->ipp_esp_auth_alg != 0) {
1348 			aalg = ipss->ipsec_alglists[IPSEC_ALG_AUTH]
1349 			    [prot->ipp_esp_auth_alg];
1350 			if (aalg == NULL || !ALG_VALID(aalg))
1351 				continue;
1352 		}
1353 
1354 		ASSERT(prot->ipp_encr_alg > 0);
1355 		ealg = ipss->ipsec_alglists[IPSEC_ALG_ENCR]
1356 		    [prot->ipp_encr_alg];
1357 		if (ealg == NULL || !ALG_VALID(ealg))
1358 			continue;
1359 
1360 		comb->sadb_comb_flags = 0;
1361 		comb->sadb_comb_reserved = 0;
1362 		comb->sadb_comb_encrypt = ealg->alg_id;
1363 		comb->sadb_comb_encrypt_minbits =
1364 		    MAX(prot->ipp_espe_minbits, ealg->alg_ef_minbits);
1365 		comb->sadb_comb_encrypt_maxbits =
1366 		    MIN(prot->ipp_espe_maxbits, ealg->alg_ef_maxbits);
1367 		if (aalg == NULL) {
1368 			comb->sadb_comb_auth = 0;
1369 			comb->sadb_comb_auth_minbits = 0;
1370 			comb->sadb_comb_auth_maxbits = 0;
1371 		} else {
1372 			comb->sadb_comb_auth = aalg->alg_id;
1373 			comb->sadb_comb_auth_minbits =
1374 			    MAX(prot->ipp_espa_minbits, aalg->alg_ef_minbits);
1375 			comb->sadb_comb_auth_maxbits =
1376 			    MIN(prot->ipp_espa_maxbits, aalg->alg_ef_maxbits);
1377 		}
1378 
1379 		/*
1380 		 * The following may be based on algorithm
1381 		 * properties, but in the meantime, we just pick
1382 		 * some good, sensible numbers.  Key mgmt. can
1383 		 * (and perhaps should) be the place to finalize
1384 		 * such decisions.
1385 		 */
1386 
1387 		/*
1388 		 * No limits on allocations, since we really don't
1389 		 * support that concept currently.
1390 		 */
1391 		comb->sadb_comb_soft_allocations = 0;
1392 		comb->sadb_comb_hard_allocations = 0;
1393 
1394 		/*
1395 		 * These may want to come from policy rule..
1396 		 */
1397 		comb->sadb_comb_soft_bytes =
1398 		    espstack->ipsecesp_default_soft_bytes;
1399 		comb->sadb_comb_hard_bytes =
1400 		    espstack->ipsecesp_default_hard_bytes;
1401 		comb->sadb_comb_soft_addtime =
1402 		    espstack->ipsecesp_default_soft_addtime;
1403 		comb->sadb_comb_hard_addtime =
1404 		    espstack->ipsecesp_default_hard_addtime;
1405 		comb->sadb_comb_soft_usetime =
1406 		    espstack->ipsecesp_default_soft_usetime;
1407 		comb->sadb_comb_hard_usetime =
1408 		    espstack->ipsecesp_default_hard_usetime;
1409 
1410 		prop->sadb_prop_len += SADB_8TO64(sizeof (*comb));
1411 		if (--combs == 0)
1412 			break;	/* out of space.. */
1413 		comb++;
1414 	}
1415 }
1416 
1417 /*
1418  * Prepare and actually send the SADB_ACQUIRE message to PF_KEY.
1419  */
1420 static void
1421 esp_send_acquire(ipsacq_t *acqrec, mblk_t *extended, netstack_t *ns)
1422 {
1423 	uint_t combs;
1424 	sadb_msg_t *samsg;
1425 	sadb_prop_t *prop;
1426 	mblk_t *pfkeymp, *msgmp;
1427 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1428 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1429 
1430 	ESP_BUMP_STAT(espstack, acquire_requests);
1431 
1432 	if (espstack->esp_pfkey_q == NULL) {
1433 		mutex_exit(&acqrec->ipsacq_lock);
1434 		return;
1435 	}
1436 
1437 	/* Set up ACQUIRE. */
1438 	pfkeymp = sadb_setup_acquire(acqrec, SADB_SATYPE_ESP,
1439 	    ns->netstack_ipsec);
1440 	if (pfkeymp == NULL) {
1441 		esp0dbg(("sadb_setup_acquire failed.\n"));
1442 		mutex_exit(&acqrec->ipsacq_lock);
1443 		return;
1444 	}
1445 	ASSERT(MUTEX_HELD(&ipss->ipsec_alg_lock));
1446 	combs = ipss->ipsec_nalgs[IPSEC_ALG_AUTH] *
1447 	    ipss->ipsec_nalgs[IPSEC_ALG_ENCR];
1448 	msgmp = pfkeymp->b_cont;
1449 	samsg = (sadb_msg_t *)(msgmp->b_rptr);
1450 
1451 	/* Insert proposal here. */
1452 
1453 	prop = (sadb_prop_t *)(((uint64_t *)samsg) + samsg->sadb_msg_len);
1454 	esp_insert_prop(prop, acqrec, combs);
1455 	samsg->sadb_msg_len += prop->sadb_prop_len;
1456 	msgmp->b_wptr += SADB_64TO8(samsg->sadb_msg_len);
1457 
1458 	mutex_exit(&ipss->ipsec_alg_lock);
1459 
1460 	/*
1461 	 * Must mutex_exit() before sending PF_KEY message up, in
1462 	 * order to avoid recursive mutex_enter() if there are no registered
1463 	 * listeners.
1464 	 *
1465 	 * Once I've sent the message, I'm cool anyway.
1466 	 */
1467 	mutex_exit(&acqrec->ipsacq_lock);
1468 	if (extended != NULL) {
1469 		putnext(espstack->esp_pfkey_q, extended);
1470 	}
1471 	putnext(espstack->esp_pfkey_q, pfkeymp);
1472 }
1473 
1474 /*
1475  * Handle the SADB_GETSPI message.  Create a larval SA.
1476  */
1477 static void
1478 esp_getspi(mblk_t *mp, keysock_in_t *ksi, ipsecesp_stack_t *espstack)
1479 {
1480 	ipsa_t *newbie, *target;
1481 	isaf_t *outbound, *inbound;
1482 	int rc, diagnostic;
1483 	sadb_sa_t *assoc;
1484 	keysock_out_t *kso;
1485 	uint32_t newspi;
1486 
1487 	/*
1488 	 * Randomly generate a proposed SPI value
1489 	 */
1490 	if (cl_inet_getspi != NULL) {
1491 		cl_inet_getspi(espstack->ipsecesp_netstack->netstack_stackid,
1492 		    IPPROTO_ESP, (uint8_t *)&newspi, sizeof (uint32_t), NULL);
1493 	} else {
1494 		(void) random_get_pseudo_bytes((uint8_t *)&newspi,
1495 		    sizeof (uint32_t));
1496 	}
1497 	newbie = sadb_getspi(ksi, newspi, &diagnostic,
1498 	    espstack->ipsecesp_netstack, IPPROTO_ESP);
1499 
1500 	if (newbie == NULL) {
1501 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, ENOMEM, diagnostic,
1502 		    ksi->ks_in_serial);
1503 		return;
1504 	} else if (newbie == (ipsa_t *)-1) {
1505 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, EINVAL, diagnostic,
1506 		    ksi->ks_in_serial);
1507 		return;
1508 	}
1509 
1510 	/*
1511 	 * XXX - We may randomly collide.  We really should recover from this.
1512 	 *	 Unfortunately, that could require spending way-too-much-time
1513 	 *	 in here.  For now, let the user retry.
1514 	 */
1515 
1516 	if (newbie->ipsa_addrfam == AF_INET6) {
1517 		outbound = OUTBOUND_BUCKET_V6(&espstack->esp_sadb.s_v6,
1518 		    *(uint32_t *)(newbie->ipsa_dstaddr));
1519 		inbound = INBOUND_BUCKET(&espstack->esp_sadb.s_v6,
1520 		    newbie->ipsa_spi);
1521 	} else {
1522 		ASSERT(newbie->ipsa_addrfam == AF_INET);
1523 		outbound = OUTBOUND_BUCKET_V4(&espstack->esp_sadb.s_v4,
1524 		    *(uint32_t *)(newbie->ipsa_dstaddr));
1525 		inbound = INBOUND_BUCKET(&espstack->esp_sadb.s_v4,
1526 		    newbie->ipsa_spi);
1527 	}
1528 
1529 	mutex_enter(&outbound->isaf_lock);
1530 	mutex_enter(&inbound->isaf_lock);
1531 
1532 	/*
1533 	 * Check for collisions (i.e. did sadb_getspi() return with something
1534 	 * that already exists?).
1535 	 *
1536 	 * Try outbound first.  Even though SADB_GETSPI is traditionally
1537 	 * for inbound SAs, you never know what a user might do.
1538 	 */
1539 	target = ipsec_getassocbyspi(outbound, newbie->ipsa_spi,
1540 	    newbie->ipsa_srcaddr, newbie->ipsa_dstaddr, newbie->ipsa_addrfam);
1541 	if (target == NULL) {
1542 		target = ipsec_getassocbyspi(inbound, newbie->ipsa_spi,
1543 		    newbie->ipsa_srcaddr, newbie->ipsa_dstaddr,
1544 		    newbie->ipsa_addrfam);
1545 	}
1546 
1547 	/*
1548 	 * I don't have collisions elsewhere!
1549 	 * (Nor will I because I'm still holding inbound/outbound locks.)
1550 	 */
1551 
1552 	if (target != NULL) {
1553 		rc = EEXIST;
1554 		IPSA_REFRELE(target);
1555 	} else {
1556 		/*
1557 		 * sadb_insertassoc() also checks for collisions, so
1558 		 * if there's a colliding entry, rc will be set
1559 		 * to EEXIST.
1560 		 */
1561 		rc = sadb_insertassoc(newbie, inbound);
1562 		newbie->ipsa_hardexpiretime = gethrestime_sec();
1563 		newbie->ipsa_hardexpiretime +=
1564 		    espstack->ipsecesp_larval_timeout;
1565 	}
1566 
1567 	/*
1568 	 * Can exit outbound mutex.  Hold inbound until we're done
1569 	 * with newbie.
1570 	 */
1571 	mutex_exit(&outbound->isaf_lock);
1572 
1573 	if (rc != 0) {
1574 		mutex_exit(&inbound->isaf_lock);
1575 		IPSA_REFRELE(newbie);
1576 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, rc,
1577 		    SADB_X_DIAGNOSTIC_NONE, ksi->ks_in_serial);
1578 		return;
1579 	}
1580 
1581 
1582 	/* Can write here because I'm still holding the bucket lock. */
1583 	newbie->ipsa_type = SADB_SATYPE_ESP;
1584 
1585 	/*
1586 	 * Construct successful return message. We have one thing going
1587 	 * for us in PF_KEY v2.  That's the fact that
1588 	 *	sizeof (sadb_spirange_t) == sizeof (sadb_sa_t)
1589 	 */
1590 	assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SPIRANGE];
1591 	assoc->sadb_sa_exttype = SADB_EXT_SA;
1592 	assoc->sadb_sa_spi = newbie->ipsa_spi;
1593 	*((uint64_t *)(&assoc->sadb_sa_replay)) = 0;
1594 	mutex_exit(&inbound->isaf_lock);
1595 
1596 	/* Convert KEYSOCK_IN to KEYSOCK_OUT. */
1597 	kso = (keysock_out_t *)ksi;
1598 	kso->ks_out_len = sizeof (*kso);
1599 	kso->ks_out_serial = ksi->ks_in_serial;
1600 	kso->ks_out_type = KEYSOCK_OUT;
1601 
1602 	/*
1603 	 * Can safely putnext() to esp_pfkey_q, because this is a turnaround
1604 	 * from the esp_pfkey_q.
1605 	 */
1606 	putnext(espstack->esp_pfkey_q, mp);
1607 }
1608 
1609 /*
1610  * Insert the ESP header into a packet.  Duplicate an mblk, and insert a newly
1611  * allocated mblk with the ESP header in between the two.
1612  */
1613 static boolean_t
1614 esp_insert_esp(mblk_t *mp, mblk_t *esp_mp, uint_t divpoint,
1615     ipsecesp_stack_t *espstack)
1616 {
1617 	mblk_t *split_mp = mp;
1618 	uint_t wheretodiv = divpoint;
1619 
1620 	while ((split_mp->b_wptr - split_mp->b_rptr) < wheretodiv) {
1621 		wheretodiv -= (split_mp->b_wptr - split_mp->b_rptr);
1622 		split_mp = split_mp->b_cont;
1623 		ASSERT(split_mp != NULL);
1624 	}
1625 
1626 	if (split_mp->b_wptr - split_mp->b_rptr != wheretodiv) {
1627 		mblk_t *scratch;
1628 
1629 		/* "scratch" is the 2nd half, split_mp is the first. */
1630 		scratch = dupb(split_mp);
1631 		if (scratch == NULL) {
1632 			esp1dbg(espstack,
1633 			    ("esp_insert_esp: can't allocate scratch.\n"));
1634 			return (B_FALSE);
1635 		}
1636 		/* NOTE:  dupb() doesn't set b_cont appropriately. */
1637 		scratch->b_cont = split_mp->b_cont;
1638 		scratch->b_rptr += wheretodiv;
1639 		split_mp->b_wptr = split_mp->b_rptr + wheretodiv;
1640 		split_mp->b_cont = scratch;
1641 	}
1642 	/*
1643 	 * At this point, split_mp is exactly "wheretodiv" bytes long, and
1644 	 * holds the end of the pre-ESP part of the datagram.
1645 	 */
1646 	esp_mp->b_cont = split_mp->b_cont;
1647 	split_mp->b_cont = esp_mp;
1648 
1649 	return (B_TRUE);
1650 }
1651 
1652 /*
1653  * Section 7 of RFC 3947 says:
1654  *
1655  * 7.  Recovering from the Expiring NAT Mappings
1656  *
1657  *    There are cases where NAT box decides to remove mappings that are still
1658  *    alive (for example, when the keepalive interval is too long, or when the
1659  *    NAT box is rebooted).  To recover from this, ends that are NOT behind
1660  *    NAT SHOULD use the last valid UDP encapsulated IKE or IPsec packet from
1661  *    the other end to determine which IP and port addresses should be used.
1662  *    The host behind dynamic NAT MUST NOT do this, as otherwise it opens a
1663  *    DoS attack possibility because the IP address or port of the other host
1664  *    will not change (it is not behind NAT).
1665  *
1666  *    Keepalives cannot be used for these purposes, as they are not
1667  *    authenticated, but any IKE authenticated IKE packet or ESP packet can be
1668  *    used to detect whether the IP address or the port has changed.
1669  *
1670  * The following function will check an SA and its explicitly-set pair to see
1671  * if the NAT-T remote port matches the received packet (which must have
1672  * passed ESP authentication, see esp_in_done() for the caller context).  If
1673  * there is a mismatch, the SAs are updated.  It is not important if we race
1674  * with a transmitting thread, as if there is a transmitting thread, it will
1675  * merely emit a packet that will most-likely be dropped.
1676  *
1677  * "ports" are ordered src,dst, and assoc is an inbound SA, where src should
1678  * match ipsa_remote_nat_port and dst should match ipsa_local_nat_port.
1679  */
1680 #ifdef _LITTLE_ENDIAN
1681 #define	FIRST_16(x) ((x) & 0xFFFF)
1682 #define	NEXT_16(x) (((x) >> 16) & 0xFFFF)
1683 #else
1684 #define	FIRST_16(x) (((x) >> 16) & 0xFFFF)
1685 #define	NEXT_16(x) ((x) & 0xFFFF)
1686 #endif
1687 static void
1688 esp_port_freshness(uint32_t ports, ipsa_t *assoc)
1689 {
1690 	uint16_t remote = FIRST_16(ports);
1691 	uint16_t local = NEXT_16(ports);
1692 	ipsa_t *outbound_peer;
1693 	isaf_t *bucket;
1694 	ipsecesp_stack_t *espstack = assoc->ipsa_netstack->netstack_ipsecesp;
1695 
1696 	/* We found a conn_t, therefore local != 0. */
1697 	ASSERT(local != 0);
1698 	/* Assume an IPv4 SA. */
1699 	ASSERT(assoc->ipsa_addrfam == AF_INET);
1700 
1701 	/*
1702 	 * On-the-wire rport == 0 means something's very wrong.
1703 	 * An unpaired SA is also useless to us.
1704 	 * If we are behind the NAT, don't bother.
1705 	 * A zero local NAT port defaults to 4500, so check that too.
1706 	 * And, of course, if the ports already match, we don't need to
1707 	 * bother.
1708 	 */
1709 	if (remote == 0 || assoc->ipsa_otherspi == 0 ||
1710 	    (assoc->ipsa_flags & IPSA_F_BEHIND_NAT) ||
1711 	    (assoc->ipsa_remote_nat_port == 0 &&
1712 	    remote == htons(IPPORT_IKE_NATT)) ||
1713 	    remote == assoc->ipsa_remote_nat_port)
1714 		return;
1715 
1716 	/* Try and snag the peer.   NOTE:  Assume IPv4 for now. */
1717 	bucket = OUTBOUND_BUCKET_V4(&(espstack->esp_sadb.s_v4),
1718 	    assoc->ipsa_srcaddr[0]);
1719 	mutex_enter(&bucket->isaf_lock);
1720 	outbound_peer = ipsec_getassocbyspi(bucket, assoc->ipsa_otherspi,
1721 	    assoc->ipsa_dstaddr, assoc->ipsa_srcaddr, AF_INET);
1722 	mutex_exit(&bucket->isaf_lock);
1723 
1724 	/* We probably lost a race to a deleting or expiring thread. */
1725 	if (outbound_peer == NULL)
1726 		return;
1727 
1728 	/*
1729 	 * Hold the mutexes for both SAs so we don't race another inbound
1730 	 * thread.  A lock-entry order shouldn't matter, since all other
1731 	 * per-ipsa locks are individually held-then-released.
1732 	 *
1733 	 * Luckily, this has nothing to do with the remote-NAT address,
1734 	 * so we don't have to re-scribble the cached-checksum differential.
1735 	 */
1736 	mutex_enter(&outbound_peer->ipsa_lock);
1737 	mutex_enter(&assoc->ipsa_lock);
1738 	outbound_peer->ipsa_remote_nat_port = assoc->ipsa_remote_nat_port =
1739 	    remote;
1740 	mutex_exit(&assoc->ipsa_lock);
1741 	mutex_exit(&outbound_peer->ipsa_lock);
1742 	IPSA_REFRELE(outbound_peer);
1743 	ESP_BUMP_STAT(espstack, sa_port_renumbers);
1744 }
1745 /*
1746  * Finish processing of an inbound ESP packet after processing by the
1747  * crypto framework.
1748  * - Remove the ESP header.
1749  * - Send packet back to IP.
1750  * If authentication was performed on the packet, this function is called
1751  * only if the authentication succeeded.
1752  * On success returns B_TRUE, on failure returns B_FALSE and frees the
1753  * mblk chain ipsec_in_mp.
1754  */
1755 static ipsec_status_t
1756 esp_in_done(mblk_t *ipsec_in_mp)
1757 {
1758 	ipsec_in_t *ii = (ipsec_in_t *)ipsec_in_mp->b_rptr;
1759 	mblk_t *data_mp;
1760 	ipsa_t *assoc;
1761 	uint_t espstart;
1762 	uint32_t ivlen = 0;
1763 	uint_t processed_len;
1764 	esph_t *esph;
1765 	kstat_named_t *counter;
1766 	boolean_t is_natt;
1767 	netstack_t	*ns = ii->ipsec_in_ns;
1768 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1769 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1770 
1771 	assoc = ii->ipsec_in_esp_sa;
1772 	ASSERT(assoc != NULL);
1773 
1774 	is_natt = ((assoc->ipsa_flags & IPSA_F_NATT) != 0);
1775 
1776 	/* get the pointer to the ESP header */
1777 	if (assoc->ipsa_encr_alg == SADB_EALG_NULL) {
1778 		/* authentication-only ESP */
1779 		espstart = ii->ipsec_in_crypto_data.cd_offset;
1780 		processed_len = ii->ipsec_in_crypto_data.cd_length;
1781 	} else {
1782 		/* encryption present */
1783 		ivlen = assoc->ipsa_iv_len;
1784 		if (assoc->ipsa_auth_alg == SADB_AALG_NONE) {
1785 			/* encryption-only ESP */
1786 			espstart = ii->ipsec_in_crypto_data.cd_offset -
1787 			    sizeof (esph_t) - assoc->ipsa_iv_len;
1788 			processed_len = ii->ipsec_in_crypto_data.cd_length +
1789 			    ivlen;
1790 		} else {
1791 			/* encryption with authentication */
1792 			espstart = ii->ipsec_in_crypto_dual_data.dd_offset1;
1793 			processed_len = ii->ipsec_in_crypto_dual_data.dd_len2 +
1794 			    ivlen;
1795 		}
1796 	}
1797 
1798 	data_mp = ipsec_in_mp->b_cont;
1799 	esph = (esph_t *)(data_mp->b_rptr + espstart);
1800 
1801 	if (assoc->ipsa_auth_alg != IPSA_AALG_NONE) {
1802 		/* authentication passed if we reach this point */
1803 		ESP_BUMP_STAT(espstack, good_auth);
1804 		data_mp->b_wptr -= assoc->ipsa_mac_len;
1805 
1806 		/*
1807 		 * Check replay window here!
1808 		 * For right now, assume keysock will set the replay window
1809 		 * size to zero for SAs that have an unspecified sender.
1810 		 * This may change...
1811 		 */
1812 
1813 		if (!sadb_replay_check(assoc, esph->esph_replay)) {
1814 			/*
1815 			 * Log the event. As of now we print out an event.
1816 			 * Do not print the replay failure number, or else
1817 			 * syslog cannot collate the error messages.  Printing
1818 			 * the replay number that failed opens a denial-of-
1819 			 * service attack.
1820 			 */
1821 			ipsec_assocfailure(info.mi_idnum, 0, 0,
1822 			    SL_ERROR | SL_WARN,
1823 			    "Replay failed for ESP spi 0x%x, dst %s.\n",
1824 			    assoc->ipsa_spi, assoc->ipsa_dstaddr,
1825 			    assoc->ipsa_addrfam, espstack->ipsecesp_netstack);
1826 			ESP_BUMP_STAT(espstack, replay_failures);
1827 			counter = DROPPER(ipss, ipds_esp_replay);
1828 			goto drop_and_bail;
1829 		}
1830 
1831 		if (is_natt)
1832 			esp_port_freshness(ii->ipsec_in_esp_udp_ports, assoc);
1833 	}
1834 
1835 	esp_set_usetime(assoc, B_TRUE);
1836 
1837 	if (!esp_age_bytes(assoc, processed_len, B_TRUE)) {
1838 		/* The ipsa has hit hard expiration, LOG and AUDIT. */
1839 		ipsec_assocfailure(info.mi_idnum, 0, 0,
1840 		    SL_ERROR | SL_WARN,
1841 		    "ESP association 0x%x, dst %s had bytes expire.\n",
1842 		    assoc->ipsa_spi, assoc->ipsa_dstaddr, assoc->ipsa_addrfam,
1843 		    espstack->ipsecesp_netstack);
1844 		ESP_BUMP_STAT(espstack, bytes_expired);
1845 		counter = DROPPER(ipss, ipds_esp_bytes_expire);
1846 		goto drop_and_bail;
1847 	}
1848 
1849 	/*
1850 	 * Remove ESP header and padding from packet.  I hope the compiler
1851 	 * spews "branch, predict taken" code for this.
1852 	 */
1853 
1854 	if (esp_strip_header(data_mp, ii->ipsec_in_v4, ivlen, &counter,
1855 	    espstack)) {
1856 		if (is_natt)
1857 			return (esp_fix_natt_checksums(data_mp, assoc));
1858 
1859 		if (assoc->ipsa_state == IPSA_STATE_IDLE) {
1860 			/*
1861 			 * Cluster buffering case.  Tell caller that we're
1862 			 * handling the packet.
1863 			 */
1864 			sadb_buf_pkt(assoc, ipsec_in_mp, ns);
1865 			return (IPSEC_STATUS_PENDING);
1866 		}
1867 
1868 		return (IPSEC_STATUS_SUCCESS);
1869 	}
1870 
1871 	esp1dbg(espstack, ("esp_in_done: esp_strip_header() failed\n"));
1872 drop_and_bail:
1873 	IP_ESP_BUMP_STAT(ipss, in_discards);
1874 	/*
1875 	 * TODO: Extract inbound interface from the IPSEC_IN message's
1876 	 * ii->ipsec_in_rill_index.
1877 	 */
1878 	ip_drop_packet(ipsec_in_mp, B_TRUE, NULL, NULL, counter,
1879 	    &espstack->esp_dropper);
1880 	return (IPSEC_STATUS_FAILED);
1881 }
1882 
1883 /*
1884  * Called upon failing the inbound ICV check. The message passed as
1885  * argument is freed.
1886  */
1887 static void
1888 esp_log_bad_auth(mblk_t *ipsec_in)
1889 {
1890 	ipsec_in_t *ii = (ipsec_in_t *)ipsec_in->b_rptr;
1891 	ipsa_t *assoc = ii->ipsec_in_esp_sa;
1892 	netstack_t	*ns = ii->ipsec_in_ns;
1893 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1894 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1895 
1896 	/*
1897 	 * Log the event. Don't print to the console, block
1898 	 * potential denial-of-service attack.
1899 	 */
1900 	ESP_BUMP_STAT(espstack, bad_auth);
1901 
1902 	ipsec_assocfailure(info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
1903 	    "ESP Authentication failed for spi 0x%x, dst %s.\n",
1904 	    assoc->ipsa_spi, assoc->ipsa_dstaddr, assoc->ipsa_addrfam,
1905 	    espstack->ipsecesp_netstack);
1906 
1907 	IP_ESP_BUMP_STAT(ipss, in_discards);
1908 	/*
1909 	 * TODO: Extract inbound interface from the IPSEC_IN
1910 	 * message's ii->ipsec_in_rill_index.
1911 	 */
1912 	ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL,
1913 	    DROPPER(ipss, ipds_esp_bad_auth),
1914 	    &espstack->esp_dropper);
1915 }
1916 
1917 
1918 /*
1919  * Invoked for outbound packets after ESP processing. If the packet
1920  * also requires AH, performs the AH SA selection and AH processing.
1921  * Returns B_TRUE if the AH processing was not needed or if it was
1922  * performed successfully. Returns B_FALSE and consumes the passed mblk
1923  * if AH processing was required but could not be performed.
1924  */
1925 static boolean_t
1926 esp_do_outbound_ah(mblk_t *ipsec_mp)
1927 {
1928 	ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr;
1929 	ipsec_status_t ipsec_rc;
1930 	ipsec_action_t *ap;
1931 
1932 	ap = io->ipsec_out_act;
1933 	if (ap == NULL) {
1934 		ipsec_policy_t *pp = io->ipsec_out_policy;
1935 		ap = pp->ipsp_act;
1936 	}
1937 
1938 	if (!ap->ipa_want_ah)
1939 		return (B_TRUE);
1940 
1941 	ASSERT(io->ipsec_out_ah_done == B_FALSE);
1942 
1943 	if (io->ipsec_out_ah_sa == NULL) {
1944 		if (!ipsec_outbound_sa(ipsec_mp, IPPROTO_AH)) {
1945 			sadb_acquire(ipsec_mp, io, B_TRUE, B_FALSE);
1946 			return (B_FALSE);
1947 		}
1948 	}
1949 	ASSERT(io->ipsec_out_ah_sa != NULL);
1950 
1951 	io->ipsec_out_ah_done = B_TRUE;
1952 	ipsec_rc = io->ipsec_out_ah_sa->ipsa_output_func(ipsec_mp);
1953 	return (ipsec_rc == IPSEC_STATUS_SUCCESS);
1954 }
1955 
1956 
1957 /*
1958  * Kernel crypto framework callback invoked after completion of async
1959  * crypto requests.
1960  */
1961 static void
1962 esp_kcf_callback(void *arg, int status)
1963 {
1964 	mblk_t *ipsec_mp = (mblk_t *)arg;
1965 	ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr;
1966 	ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr;
1967 	boolean_t is_inbound = (ii->ipsec_in_type == IPSEC_IN);
1968 	netstackid_t	stackid;
1969 	netstack_t	*ns, *ns_arg;
1970 	ipsecesp_stack_t *espstack;
1971 	ipsec_stack_t	*ipss;
1972 
1973 	ASSERT(ipsec_mp->b_cont != NULL);
1974 
1975 	if (is_inbound) {
1976 		stackid = ii->ipsec_in_stackid;
1977 		ns_arg = ii->ipsec_in_ns;
1978 	} else {
1979 		stackid = io->ipsec_out_stackid;
1980 		ns_arg = io->ipsec_out_ns;
1981 	}
1982 
1983 	/*
1984 	 * Verify that the netstack is still around; could have vanished
1985 	 * while kEf was doing its work.
1986 	 */
1987 	ns = netstack_find_by_stackid(stackid);
1988 	if (ns == NULL || ns != ns_arg) {
1989 		/* Disappeared on us */
1990 		if (ns != NULL)
1991 			netstack_rele(ns);
1992 		freemsg(ipsec_mp);
1993 		return;
1994 	}
1995 
1996 	espstack = ns->netstack_ipsecesp;
1997 	ipss = ns->netstack_ipsec;
1998 
1999 	if (status == CRYPTO_SUCCESS) {
2000 		if (is_inbound) {
2001 			if (esp_in_done(ipsec_mp) != IPSEC_STATUS_SUCCESS) {
2002 				netstack_rele(ns);
2003 				return;
2004 			}
2005 			/* finish IPsec processing */
2006 			ip_fanout_proto_again(ipsec_mp, NULL, NULL, NULL);
2007 		} else {
2008 			/*
2009 			 * If a ICV was computed, it was stored by the
2010 			 * crypto framework at the end of the packet.
2011 			 */
2012 			ipha_t *ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr;
2013 
2014 			esp_set_usetime(io->ipsec_out_esp_sa, B_FALSE);
2015 			/* NAT-T packet. */
2016 			if (ipha->ipha_protocol == IPPROTO_UDP)
2017 				esp_prepare_udp(ns, ipsec_mp->b_cont, ipha);
2018 
2019 			/* do AH processing if needed */
2020 			if (!esp_do_outbound_ah(ipsec_mp)) {
2021 				netstack_rele(ns);
2022 				return;
2023 			}
2024 			/* finish IPsec processing */
2025 			if (IPH_HDR_VERSION(ipha) == IP_VERSION) {
2026 				ip_wput_ipsec_out(NULL, ipsec_mp, ipha, NULL,
2027 				    NULL);
2028 			} else {
2029 				ip6_t *ip6h = (ip6_t *)ipha;
2030 				ip_wput_ipsec_out_v6(NULL, ipsec_mp, ip6h,
2031 				    NULL, NULL);
2032 			}
2033 		}
2034 
2035 	} else if (status == CRYPTO_INVALID_MAC) {
2036 		esp_log_bad_auth(ipsec_mp);
2037 
2038 	} else {
2039 		esp1dbg(espstack,
2040 		    ("esp_kcf_callback: crypto failed with 0x%x\n",
2041 		    status));
2042 		ESP_BUMP_STAT(espstack, crypto_failures);
2043 		if (is_inbound)
2044 			IP_ESP_BUMP_STAT(ipss, in_discards);
2045 		else
2046 			ESP_BUMP_STAT(espstack, out_discards);
2047 		ip_drop_packet(ipsec_mp, is_inbound, NULL, NULL,
2048 		    DROPPER(ipss, ipds_esp_crypto_failed),
2049 		    &espstack->esp_dropper);
2050 	}
2051 	netstack_rele(ns);
2052 }
2053 
2054 /*
2055  * Invoked on crypto framework failure during inbound and outbound processing.
2056  */
2057 static void
2058 esp_crypto_failed(mblk_t *mp, boolean_t is_inbound, int kef_rc,
2059     ipsecesp_stack_t *espstack)
2060 {
2061 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
2062 
2063 	esp1dbg(espstack, ("crypto failed for %s ESP with 0x%x\n",
2064 	    is_inbound ? "inbound" : "outbound", kef_rc));
2065 	ip_drop_packet(mp, is_inbound, NULL, NULL,
2066 	    DROPPER(ipss, ipds_esp_crypto_failed),
2067 	    &espstack->esp_dropper);
2068 	ESP_BUMP_STAT(espstack, crypto_failures);
2069 	if (is_inbound)
2070 		IP_ESP_BUMP_STAT(ipss, in_discards);
2071 	else
2072 		ESP_BUMP_STAT(espstack, out_discards);
2073 }
2074 
/*
 * Fill in a crypto call request for an ESP operation.  Completion is
 * reported through esp_kcf_callback(), whose argument is the variable
 * named ipsec_mp in the invoking function's scope; the macro can only be
 * used where such a variable is visible.
 */
#define	ESP_INIT_CALLREQ(_cr) {						\
	(_cr)->cr_callback_func = esp_kcf_callback;			\
	(_cr)->cr_callback_arg = ipsec_mp;				\
	(_cr)->cr_flag = CRYPTO_SKIP_REQID | CRYPTO_RESTRICTED;		\
}
2080 
/*
 * Describe an ICV buffer to the crypto framework as a raw, contiguous
 * region of icvlen bytes starting at icvbuf.
 *
 * The icvlen and icvbuf arguments are parenthesized in the expansion so
 * that expressions (e.g. pointer arithmetic) are evaluated in full before
 * the cast to char * is applied.
 */
#define	ESP_INIT_CRYPTO_MAC(mac, icvlen, icvbuf) {			\
	(mac)->cd_format = CRYPTO_DATA_RAW;				\
	(mac)->cd_offset = 0;						\
	(mac)->cd_length = (icvlen);					\
	(mac)->cd_raw.iov_base = (char *)(icvbuf);			\
	(mac)->cd_raw.iov_len = (icvlen);				\
}
2088 
/*
 * Describe len bytes starting off bytes into mp to the crypto framework.
 * If the first mblk holds the whole region it is passed as a raw buffer
 * covering that mblk; otherwise the mblk chain itself is passed.
 */
#define	ESP_INIT_CRYPTO_DATA(data, mp, off, len) {			\
	(data)->cd_offset = (off);					\
	(data)->cd_length = (len);					\
	if (MBLKL(mp) < (len) + (off)) {				\
		(data)->cd_format = CRYPTO_DATA_MBLK;			\
		(data)->cd_mp = (mp);					\
	} else {							\
		(data)->cd_format = CRYPTO_DATA_RAW;			\
		(data)->cd_raw.iov_base = (char *)(mp)->b_rptr;		\
		(data)->cd_raw.iov_len = MBLKL(mp);			\
	}								\
}
2102 
/*
 * Describe two regions of the same mblk chain (offset/length pairs 1
 * and 2) for a dual crypto framework operation.  Which pair goes with
 * which operation is decided by the framework call the data is given to.
 */
#define	ESP_INIT_CRYPTO_DUAL_DATA(data, mp, off1, len1, off2, len2) {	\
	(data)->dd_format = CRYPTO_DATA_MBLK;				\
	(data)->dd_mp = (mp);						\
	(data)->dd_offset1 = (off1);					\
	(data)->dd_len1 = (len1);					\
	(data)->dd_offset2 = (off2);					\
	(data)->dd_len2 = (len2);					\
}
2111 
2112 static ipsec_status_t
2113 esp_submit_req_inbound(mblk_t *ipsec_mp, ipsa_t *assoc, uint_t esph_offset)
2114 {
2115 	ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr;
2116 	boolean_t do_auth;
2117 	uint_t auth_offset, msg_len, auth_len;
2118 	crypto_call_req_t call_req;
2119 	mblk_t *esp_mp;
2120 	int kef_rc = CRYPTO_FAILED;
2121 	uint_t icv_len = assoc->ipsa_mac_len;
2122 	crypto_ctx_template_t auth_ctx_tmpl;
2123 	boolean_t do_encr;
2124 	uint_t encr_offset, encr_len;
2125 	uint_t iv_len = assoc->ipsa_iv_len;
2126 	crypto_ctx_template_t encr_ctx_tmpl;
2127 	netstack_t	*ns = ii->ipsec_in_ns;
2128 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
2129 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
2130 
2131 	ASSERT(ii->ipsec_in_type == IPSEC_IN);
2132 
2133 	/*
2134 	 * In case kEF queues and calls back, make sure we have the
2135 	 * netstackid_t for verification that the IP instance is still around
2136 	 * in esp_kcf_callback().
2137 	 */
2138 	ASSERT(ii->ipsec_in_stackid == ns->netstack_stackid);
2139 
2140 	do_auth = assoc->ipsa_auth_alg != SADB_AALG_NONE;
2141 	do_encr = assoc->ipsa_encr_alg != SADB_EALG_NULL;
2142 
2143 	/*
2144 	 * An inbound packet is of the form:
2145 	 * IPSEC_IN -> [IP,options,ESP,IV,data,ICV,pad]
2146 	 */
2147 	esp_mp = ipsec_mp->b_cont;
2148 	msg_len = MBLKL(esp_mp);
2149 
2150 	ESP_INIT_CALLREQ(&call_req);
2151 
2152 	if (do_auth) {
2153 		/* force asynchronous processing? */
2154 		if (ipss->ipsec_algs_exec_mode[IPSEC_ALG_AUTH] ==
2155 		    IPSEC_ALGS_EXEC_ASYNC)
2156 			call_req.cr_flag |= CRYPTO_ALWAYS_QUEUE;
2157 
2158 		/* authentication context template */
2159 		IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH,
2160 		    auth_ctx_tmpl);
2161 
2162 		/* ICV to be verified */
2163 		ESP_INIT_CRYPTO_MAC(&ii->ipsec_in_crypto_mac,
2164 		    icv_len, esp_mp->b_wptr - icv_len);
2165 
2166 		/* authentication starts at the ESP header */
2167 		auth_offset = esph_offset;
2168 		auth_len = msg_len - auth_offset - icv_len;
2169 		if (!do_encr) {
2170 			/* authentication only */
2171 			/* initialize input data argument */
2172 			ESP_INIT_CRYPTO_DATA(&ii->ipsec_in_crypto_data,
2173 			    esp_mp, auth_offset, auth_len);
2174 
2175 			/* call the crypto framework */
2176 			kef_rc = crypto_mac_verify(&assoc->ipsa_amech,
2177 			    &ii->ipsec_in_crypto_data,
2178 			    &assoc->ipsa_kcfauthkey, auth_ctx_tmpl,
2179 			    &ii->ipsec_in_crypto_mac, &call_req);
2180 		}
2181 	}
2182 
2183 	if (do_encr) {
2184 		/* force asynchronous processing? */
2185 		if (ipss->ipsec_algs_exec_mode[IPSEC_ALG_ENCR] ==
2186 		    IPSEC_ALGS_EXEC_ASYNC)
2187 			call_req.cr_flag |= CRYPTO_ALWAYS_QUEUE;
2188 
2189 		/* encryption template */
2190 		IPSEC_CTX_TMPL(assoc, ipsa_encrtmpl, IPSEC_ALG_ENCR,
2191 		    encr_ctx_tmpl);
2192 
2193 		/* skip IV, since it is passed separately */
2194 		encr_offset = esph_offset + sizeof (esph_t) + iv_len;
2195 		encr_len = msg_len - encr_offset;
2196 
2197 		if (!do_auth) {
2198 			/* decryption only */
2199 			/* initialize input data argument */
2200 			ESP_INIT_CRYPTO_DATA(&ii->ipsec_in_crypto_data,
2201 			    esp_mp, encr_offset, encr_len);
2202 
2203 			/* specify IV */
2204 			ii->ipsec_in_crypto_data.cd_miscdata =
2205 			    (char *)esp_mp->b_rptr + sizeof (esph_t) +
2206 			    esph_offset;
2207 
2208 			/* call the crypto framework */
2209 			kef_rc = crypto_decrypt(&assoc->ipsa_emech,
2210 			    &ii->ipsec_in_crypto_data,
2211 			    &assoc->ipsa_kcfencrkey, encr_ctx_tmpl,
2212 			    NULL, &call_req);
2213 		}
2214 	}
2215 
2216 	if (do_auth && do_encr) {
2217 		/* dual operation */
2218 		/* initialize input data argument */
2219 		ESP_INIT_CRYPTO_DUAL_DATA(&ii->ipsec_in_crypto_dual_data,
2220 		    esp_mp, auth_offset, auth_len,
2221 		    encr_offset, encr_len - icv_len);
2222 
2223 		/* specify IV */
2224 		ii->ipsec_in_crypto_dual_data.dd_miscdata =
2225 		    (char *)esp_mp->b_rptr + sizeof (esph_t) + esph_offset;
2226 
2227 		/* call the framework */
2228 		kef_rc = crypto_mac_verify_decrypt(&assoc->ipsa_amech,
2229 		    &assoc->ipsa_emech, &ii->ipsec_in_crypto_dual_data,
2230 		    &assoc->ipsa_kcfauthkey, &assoc->ipsa_kcfencrkey,
2231 		    auth_ctx_tmpl, encr_ctx_tmpl, &ii->ipsec_in_crypto_mac,
2232 		    NULL, &call_req);
2233 	}
2234 
2235 	switch (kef_rc) {
2236 	case CRYPTO_SUCCESS:
2237 		ESP_BUMP_STAT(espstack, crypto_sync);
2238 		return (esp_in_done(ipsec_mp));
2239 	case CRYPTO_QUEUED:
2240 		/* esp_kcf_callback() will be invoked on completion */
2241 		ESP_BUMP_STAT(espstack, crypto_async);
2242 		return (IPSEC_STATUS_PENDING);
2243 	case CRYPTO_INVALID_MAC:
2244 		ESP_BUMP_STAT(espstack, crypto_sync);
2245 		esp_log_bad_auth(ipsec_mp);
2246 		return (IPSEC_STATUS_FAILED);
2247 	}
2248 
2249 	esp_crypto_failed(ipsec_mp, B_TRUE, kef_rc, espstack);
2250 	return (IPSEC_STATUS_FAILED);
2251 }
2252 
2253 /*
2254  * Compute the IP and UDP checksums -- common code for both keepalives and
2255  * actual ESP-in-UDP packets.  Be flexible with multiple mblks because ESP
2256  * uses mblk-insertion to insert the UDP header.
2257  * TODO - If there is an easy way to prep a packet for HW checksums, make
2258  * it happen here.
2259  */
2260 static void
2261 esp_prepare_udp(netstack_t *ns, mblk_t *mp, ipha_t *ipha)
2262 {
2263 	int offset;
2264 	uint32_t cksum;
2265 	uint16_t *arr;
2266 	mblk_t *udpmp = mp;
2267 	uint_t hlen = IPH_HDR_LENGTH(ipha);
2268 
2269 	ASSERT(MBLKL(mp) >= sizeof (ipha_t));
2270 
2271 	ipha->ipha_hdr_checksum = 0;
2272 	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2273 
2274 	if (ns->netstack_udp->us_do_checksum) {
2275 		ASSERT(MBLKL(udpmp) >= sizeof (udpha_t));
2276 		/* arr points to the IP header. */
2277 		arr = (uint16_t *)ipha;
2278 		IP_STAT(ns->netstack_ip, ip_out_sw_cksum);
2279 		IP_STAT_UPDATE(ns->netstack_ip, ip_udp_out_sw_cksum_bytes,
2280 		    ntohs(htons(ipha->ipha_length) - hlen));
2281 		/* arr[6-9] are the IP addresses. */
2282 		cksum = IP_UDP_CSUM_COMP + arr[6] + arr[7] + arr[8] + arr[9] +
2283 		    ntohs(htons(ipha->ipha_length) - hlen);
2284 		cksum = IP_CSUM(mp, hlen, cksum);
2285 		offset = hlen + UDP_CHECKSUM_OFFSET;
2286 		while (offset >= MBLKL(udpmp)) {
2287 			offset -= MBLKL(udpmp);
2288 			udpmp = udpmp->b_cont;
2289 		}
2290 		/* arr points to the UDP header's checksum field. */
2291 		arr = (uint16_t *)(udpmp->b_rptr + offset);
2292 		*arr = cksum;
2293 	}
2294 }
2295 
2296 /*
2297  * taskq handler so we can send the NAT-T keepalive on a separate thread.
2298  */
2299 static void
2300 actually_send_keepalive(void *arg)
2301 {
2302 	mblk_t *ipsec_mp = (mblk_t *)arg;
2303 	ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr;
2304 	ipha_t *ipha;
2305 	netstack_t *ns;
2306 
2307 	ASSERT(DB_TYPE(ipsec_mp) == M_CTL);
2308 	ASSERT(io->ipsec_out_type == IPSEC_OUT);
2309 	ASSERT(ipsec_mp->b_cont != NULL);
2310 	ASSERT(DB_TYPE(ipsec_mp->b_cont) == M_DATA);
2311 
2312 	ns = netstack_find_by_stackid(io->ipsec_out_stackid);
2313 	if (ns == NULL || ns != io->ipsec_out_ns) {
2314 		/* Just freemsg(). */
2315 		if (ns != NULL)
2316 			netstack_rele(ns);
2317 		freemsg(ipsec_mp);
2318 		return;
2319 	}
2320 
2321 	ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr;
2322 	ip_wput_ipsec_out(NULL, ipsec_mp, ipha, NULL, NULL);
2323 	netstack_rele(ns);
2324 }
2325 
2326 /*
2327  * Send a one-byte UDP NAT-T keepalive.  Construct an IPSEC_OUT too that'll
2328  * get fed into esp_send_udp/ip_wput_ipsec_out.
2329  */
2330 void
2331 ipsecesp_send_keepalive(ipsa_t *assoc)
2332 {
2333 	mblk_t *mp = NULL, *ipsec_mp = NULL;
2334 	ipha_t *ipha;
2335 	udpha_t *udpha;
2336 	ipsec_out_t *io;
2337 
2338 	ASSERT(MUTEX_NOT_HELD(&assoc->ipsa_lock));
2339 
2340 	mp = allocb(sizeof (ipha_t) + sizeof (udpha_t) + 1, BPRI_HI);
2341 	if (mp == NULL)
2342 		return;
2343 	ipha = (ipha_t *)mp->b_rptr;
2344 	ipha->ipha_version_and_hdr_length = IP_SIMPLE_HDR_VERSION;
2345 	ipha->ipha_type_of_service = 0;
2346 	ipha->ipha_length = htons(sizeof (ipha_t) + sizeof (udpha_t) + 1);
2347 	/* Use the low-16 of the SPI so we have some clue where it came from. */
2348 	ipha->ipha_ident = *(((uint16_t *)(&assoc->ipsa_spi)) + 1);
2349 	ipha->ipha_fragment_offset_and_flags = 0;  /* Too small to fragment! */
2350 	ipha->ipha_ttl = 0xFF;
2351 	ipha->ipha_protocol = IPPROTO_UDP;
2352 	ipha->ipha_hdr_checksum = 0;
2353 	ipha->ipha_src = assoc->ipsa_srcaddr[0];
2354 	ipha->ipha_dst = assoc->ipsa_dstaddr[0];
2355 	udpha = (udpha_t *)(ipha + 1);
2356 	udpha->uha_src_port = (assoc->ipsa_local_nat_port != 0) ?
2357 	    assoc->ipsa_local_nat_port : htons(IPPORT_IKE_NATT);
2358 	udpha->uha_dst_port = (assoc->ipsa_remote_nat_port != 0) ?
2359 	    assoc->ipsa_remote_nat_port : htons(IPPORT_IKE_NATT);
2360 	udpha->uha_length = htons(sizeof (udpha_t) + 1);
2361 	udpha->uha_checksum = 0;
2362 	mp->b_wptr = (uint8_t *)(udpha + 1);
2363 	*(mp->b_wptr++) = 0xFF;
2364 
2365 	ipsec_mp = ipsec_alloc_ipsec_out(assoc->ipsa_netstack);
2366 	if (ipsec_mp == NULL) {
2367 		freeb(mp);
2368 		return;
2369 	}
2370 	ipsec_mp->b_cont = mp;
2371 	io = (ipsec_out_t *)ipsec_mp->b_rptr;
2372 	io->ipsec_out_zoneid =
2373 	    netstackid_to_zoneid(assoc->ipsa_netstack->netstack_stackid);
2374 	io->ipsec_out_stackid = assoc->ipsa_netstack->netstack_stackid;
2375 
2376 	esp_prepare_udp(assoc->ipsa_netstack, mp, ipha);
2377 	/*
2378 	 * We're holding an isaf_t bucket lock, so pawn off the actual
2379 	 * packet transmission to another thread.  Just in case syncq
2380 	 * processing causes a same-bucket packet to be processed.
2381 	 */
2382 	if (taskq_dispatch(esp_taskq, actually_send_keepalive, ipsec_mp,
2383 	    TQ_NOSLEEP) == 0) {
2384 		/* Assume no memory if taskq_dispatch() fails. */
2385 		ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL,
2386 		    DROPPER(assoc->ipsa_netstack->netstack_ipsec,
2387 		    ipds_esp_nomem),
2388 		    &assoc->ipsa_netstack->netstack_ipsecesp->esp_dropper);
2389 	}
2390 }
2391 
2392 static ipsec_status_t
2393 esp_submit_req_outbound(mblk_t *ipsec_mp, ipsa_t *assoc, uchar_t *icv_buf,
2394     uint_t payload_len)
2395 {
2396 	ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr;
2397 	uint_t auth_len;
2398 	crypto_call_req_t call_req;
2399 	mblk_t *esp_mp;
2400 	int kef_rc = CRYPTO_FAILED;
2401 	uint_t icv_len = assoc->ipsa_mac_len;
2402 	crypto_ctx_template_t auth_ctx_tmpl;
2403 	boolean_t do_auth;
2404 	boolean_t do_encr;
2405 	uint_t iv_len = assoc->ipsa_iv_len;
2406 	crypto_ctx_template_t encr_ctx_tmpl;
2407 	boolean_t is_natt = ((assoc->ipsa_flags & IPSA_F_NATT) != 0);
2408 	size_t esph_offset = (is_natt ? UDPH_SIZE : 0);
2409 	netstack_t	*ns = io->ipsec_out_ns;
2410 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
2411 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
2412 
2413 	esp3dbg(espstack, ("esp_submit_req_outbound:%s",
2414 	    is_natt ? "natt" : "not natt"));
2415 
2416 	ASSERT(io->ipsec_out_type == IPSEC_OUT);
2417 
2418 	/*
2419 	 * In case kEF queues and calls back, keep netstackid_t for
2420 	 * verification that the IP instance is still around in
2421 	 * esp_kcf_callback().
2422 	 */
2423 	io->ipsec_out_stackid = ns->netstack_stackid;
2424 
2425 	do_encr = assoc->ipsa_encr_alg != SADB_EALG_NULL;
2426 	do_auth = assoc->ipsa_auth_alg != SADB_AALG_NONE;
2427 
2428 	/*
2429 	 * Outbound IPsec packets are of the form:
2430 	 * IPSEC_OUT -> [IP,options] -> [ESP,IV] -> [data] -> [pad,ICV]
2431 	 * unless it's NATT, then it's
2432 	 * IPSEC_OUT -> [IP,options] -> [udp][ESP,IV] -> [data] -> [pad,ICV]
2433 	 * Get a pointer to the mblk containing the ESP header.
2434 	 */
2435 	ASSERT(ipsec_mp->b_cont != NULL && ipsec_mp->b_cont->b_cont != NULL);
2436 	esp_mp = ipsec_mp->b_cont->b_cont;
2437 
2438 	ESP_INIT_CALLREQ(&call_req);
2439 
2440 	if (do_auth) {
2441 		/* force asynchronous processing? */
2442 		if (ipss->ipsec_algs_exec_mode[IPSEC_ALG_AUTH] ==
2443 		    IPSEC_ALGS_EXEC_ASYNC)
2444 			call_req.cr_flag |= CRYPTO_ALWAYS_QUEUE;
2445 
2446 		/* authentication context template */
2447 		IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH,
2448 		    auth_ctx_tmpl);
2449 
2450 		/* where to store the computed mac */
2451 		ESP_INIT_CRYPTO_MAC(&io->ipsec_out_crypto_mac,
2452 		    icv_len, icv_buf);
2453 
2454 		/* authentication starts at the ESP header */
2455 		auth_len = payload_len + iv_len + sizeof (esph_t);
2456 		if (!do_encr) {
2457 			/* authentication only */
2458 			/* initialize input data argument */
2459 			ESP_INIT_CRYPTO_DATA(&io->ipsec_out_crypto_data,
2460 			    esp_mp, esph_offset, auth_len);
2461 
2462 			/* call the crypto framework */
2463 			kef_rc = crypto_mac(&assoc->ipsa_amech,
2464 			    &io->ipsec_out_crypto_data,
2465 			    &assoc->ipsa_kcfauthkey, auth_ctx_tmpl,
2466 			    &io->ipsec_out_crypto_mac, &call_req);
2467 		}
2468 	}
2469 
2470 	if (do_encr) {
2471 		/* force asynchronous processing? */
2472 		if (ipss->ipsec_algs_exec_mode[IPSEC_ALG_ENCR] ==
2473 		    IPSEC_ALGS_EXEC_ASYNC)
2474 			call_req.cr_flag |= CRYPTO_ALWAYS_QUEUE;
2475 
2476 		/* encryption context template */
2477 		IPSEC_CTX_TMPL(assoc, ipsa_encrtmpl, IPSEC_ALG_ENCR,
2478 		    encr_ctx_tmpl);
2479 
2480 		if (!do_auth) {
2481 			/* encryption only, skip mblk that contains ESP hdr */
2482 			/* initialize input data argument */
2483 			ESP_INIT_CRYPTO_DATA(&io->ipsec_out_crypto_data,
2484 			    esp_mp->b_cont, 0, payload_len);
2485 
2486 			/* specify IV */
2487 			io->ipsec_out_crypto_data.cd_miscdata =
2488 			    (char *)esp_mp->b_rptr + sizeof (esph_t) +
2489 			    esph_offset;
2490 
2491 			/* call the crypto framework */
2492 			kef_rc = crypto_encrypt(&assoc->ipsa_emech,
2493 			    &io->ipsec_out_crypto_data,
2494 			    &assoc->ipsa_kcfencrkey, encr_ctx_tmpl,
2495 			    NULL, &call_req);
2496 		}
2497 	}
2498 
2499 	if (do_auth && do_encr) {
2500 		/*
2501 		 * Encryption and authentication:
2502 		 * Pass the pointer to the mblk chain starting at the ESP
2503 		 * header to the framework. Skip the ESP header mblk
2504 		 * for encryption, which is reflected by an encryption
2505 		 * offset equal to the length of that mblk. Start
2506 		 * the authentication at the ESP header, i.e. use an
2507 		 * authentication offset of zero.
2508 		 */
2509 		ESP_INIT_CRYPTO_DUAL_DATA(&io->ipsec_out_crypto_dual_data,
2510 		    esp_mp, MBLKL(esp_mp), payload_len, esph_offset, auth_len);
2511 
2512 		/* specify IV */
2513 		io->ipsec_out_crypto_dual_data.dd_miscdata =
2514 		    (char *)esp_mp->b_rptr + sizeof (esph_t) + esph_offset;
2515 
2516 		/* call the framework */
2517 		kef_rc = crypto_encrypt_mac(&assoc->ipsa_emech,
2518 		    &assoc->ipsa_amech, NULL,
2519 		    &assoc->ipsa_kcfencrkey, &assoc->ipsa_kcfauthkey,
2520 		    encr_ctx_tmpl, auth_ctx_tmpl,
2521 		    &io->ipsec_out_crypto_dual_data,
2522 		    &io->ipsec_out_crypto_mac, &call_req);
2523 	}
2524 
2525 	switch (kef_rc) {
2526 	case CRYPTO_SUCCESS:
2527 		ESP_BUMP_STAT(espstack, crypto_sync);
2528 		esp_set_usetime(assoc, B_FALSE);
2529 		if (is_natt)
2530 			esp_prepare_udp(ns, ipsec_mp->b_cont,
2531 			    (ipha_t *)ipsec_mp->b_cont->b_rptr);
2532 		return (IPSEC_STATUS_SUCCESS);
2533 	case CRYPTO_QUEUED:
2534 		/* esp_kcf_callback() will be invoked on completion */
2535 		ESP_BUMP_STAT(espstack, crypto_async);
2536 		return (IPSEC_STATUS_PENDING);
2537 	}
2538 
2539 	esp_crypto_failed(ipsec_mp, B_TRUE, kef_rc, espstack);
2540 	return (IPSEC_STATUS_FAILED);
2541 }
2542 
2543 /*
2544  * Handle outbound IPsec processing for IPv4 and IPv6
2545  * On success returns B_TRUE, on failure returns B_FALSE and frees the
2546  * mblk chain ipsec_in_mp.
2547  */
2548 static ipsec_status_t
2549 esp_outbound(mblk_t *mp)
2550 {
2551 	mblk_t *ipsec_out_mp, *data_mp, *espmp, *tailmp;
2552 	ipsec_out_t *io;
2553 	ipha_t *ipha;
2554 	ip6_t *ip6h;
2555 	esph_t *esph;
2556 	uint_t af;
2557 	uint8_t *nhp;
2558 	uintptr_t divpoint, datalen, adj, padlen, i, alloclen;
2559 	uintptr_t esplen = sizeof (esph_t);
2560 	uint8_t protocol;
2561 	ipsa_t *assoc;
2562 	uint_t iv_len, mac_len = 0;
2563 	uchar_t *icv_buf;
2564 	udpha_t *udpha;
2565 	boolean_t is_natt = B_FALSE;
2566 	netstack_t	*ns;
2567 	ipsecesp_stack_t *espstack;
2568 	ipsec_stack_t	*ipss;
2569 
2570 	ipsec_out_mp = mp;
2571 	data_mp = ipsec_out_mp->b_cont;
2572 
2573 	io = (ipsec_out_t *)ipsec_out_mp->b_rptr;
2574 	ns = io->ipsec_out_ns;
2575 	espstack = ns->netstack_ipsecesp;
2576 	ipss = ns->netstack_ipsec;
2577 
2578 	ESP_BUMP_STAT(espstack, out_requests);
2579 
2580 	/*
2581 	 * <sigh> We have to copy the message here, because TCP (for example)
2582 	 * keeps a dupb() of the message lying around for retransmission.
2583 	 * Since ESP changes the whole of the datagram, we have to create our
2584 	 * own copy lest we clobber TCP's data.  Since we have to copy anyway,
2585 	 * we might as well make use of msgpullup() and get the mblk into one
2586 	 * contiguous piece!
2587 	 */
2588 	ipsec_out_mp->b_cont = msgpullup(data_mp, -1);
2589 	if (ipsec_out_mp->b_cont == NULL) {
2590 		esp0dbg(("esp_outbound: msgpullup() failed, "
2591 		    "dropping packet.\n"));
2592 		ipsec_out_mp->b_cont = data_mp;
2593 		/*
2594 		 * TODO:  Find the outbound IRE for this packet and
2595 		 * pass it to ip_drop_packet().
2596 		 */
2597 		ip_drop_packet(ipsec_out_mp, B_FALSE, NULL, NULL,
2598 		    DROPPER(ipss, ipds_esp_nomem),
2599 		    &espstack->esp_dropper);
2600 		return (IPSEC_STATUS_FAILED);
2601 	} else {
2602 		freemsg(data_mp);
2603 		data_mp = ipsec_out_mp->b_cont;
2604 	}
2605 
2606 	/*
2607 	 * Reality check....
2608 	 */
2609 
2610 	ipha = (ipha_t *)data_mp->b_rptr;  /* So we can call esp_acquire(). */
2611 
2612 	if (io->ipsec_out_v4) {
2613 		af = AF_INET;
2614 		divpoint = IPH_HDR_LENGTH(ipha);
2615 		datalen = ntohs(ipha->ipha_length) - divpoint;
2616 		nhp = (uint8_t *)&ipha->ipha_protocol;
2617 	} else {
2618 		ip6_pkt_t ipp;
2619 
2620 		af = AF_INET6;
2621 		ip6h = (ip6_t *)ipha;
2622 		bzero(&ipp, sizeof (ipp));
2623 		divpoint = ip_find_hdr_v6(data_mp, ip6h, &ipp, NULL);
2624 		if (ipp.ipp_dstopts != NULL &&
2625 		    ipp.ipp_dstopts->ip6d_nxt != IPPROTO_ROUTING) {
2626 			/*
2627 			 * Destination options are tricky.  If we get in here,
2628 			 * then we have a terminal header following the
2629 			 * destination options.  We need to adjust backwards
2630 			 * so we insert ESP BEFORE the destination options
2631 			 * bag.  (So that the dstopts get encrypted!)
2632 			 *
2633 			 * Since this is for outbound packets only, we know
2634 			 * that non-terminal destination options only precede
2635 			 * routing headers.
2636 			 */
2637 			divpoint -= ipp.ipp_dstoptslen;
2638 		}
2639 		datalen = ntohs(ip6h->ip6_plen) + sizeof (ip6_t) - divpoint;
2640 
2641 		if (ipp.ipp_rthdr != NULL) {
2642 			nhp = &ipp.ipp_rthdr->ip6r_nxt;
2643 		} else if (ipp.ipp_hopopts != NULL) {
2644 			nhp = &ipp.ipp_hopopts->ip6h_nxt;
2645 		} else {
2646 			ASSERT(divpoint == sizeof (ip6_t));
2647 			/* It's probably IP + ESP. */
2648 			nhp = &ip6h->ip6_nxt;
2649 		}
2650 	}
2651 	assoc = io->ipsec_out_esp_sa;
2652 	ASSERT(assoc != NULL);
2653 
2654 	if (assoc->ipsa_auth_alg != SADB_AALG_NONE)
2655 		mac_len = assoc->ipsa_mac_len;
2656 
2657 	if (assoc->ipsa_flags & IPSA_F_NATT) {
2658 		/* wedge in fake UDP */
2659 		is_natt = B_TRUE;
2660 		esplen += UDPH_SIZE;
2661 	}
2662 
2663 	/*
2664 	 * Set up ESP header and encryption padding for ENCR PI request.
2665 	 */
2666 
2667 	/* Determine the padding length.  Pad to 4-bytes for no-encryption. */
2668 	if (assoc->ipsa_encr_alg != SADB_EALG_NULL) {
2669 		iv_len = assoc->ipsa_iv_len;
2670 
2671 		/*
2672 		 * Include the two additional bytes (hence the - 2) for the
2673 		 * padding length and the next header.  Take this into account
2674 		 * when calculating the actual length of the padding.
2675 		 */
2676 		ASSERT(ISP2(iv_len));
2677 		padlen = ((unsigned)(iv_len - datalen - 2)) & (iv_len - 1);
2678 	} else {
2679 		iv_len = 0;
2680 		padlen = ((unsigned)(sizeof (uint32_t) - datalen - 2)) &
2681 		    (sizeof (uint32_t) - 1);
2682 	}
2683 
2684 	/* Allocate ESP header and IV. */
2685 	esplen += iv_len;
2686 
2687 	/*
2688 	 * Update association byte-count lifetimes.  Don't forget to take
2689 	 * into account the padding length and next-header (hence the + 2).
2690 	 *
2691 	 * Use the amount of data fed into the "encryption algorithm".  This
2692 	 * is the IV, the data length, the padding length, and the final two
2693 	 * bytes (padlen, and next-header).
2694 	 *
2695 	 */
2696 
2697 	if (!esp_age_bytes(assoc, datalen + padlen + iv_len + 2, B_FALSE)) {
2698 		/*
2699 		 * TODO:  Find the outbound IRE for this packet and
2700 		 * pass it to ip_drop_packet().
2701 		 */
2702 		ip_drop_packet(mp, B_FALSE, NULL, NULL,
2703 		    DROPPER(ipss, ipds_esp_bytes_expire),
2704 		    &espstack->esp_dropper);
2705 		return (IPSEC_STATUS_FAILED);
2706 	}
2707 
2708 	espmp = allocb(esplen, BPRI_HI);
2709 	if (espmp == NULL) {
2710 		ESP_BUMP_STAT(espstack, out_discards);
2711 		esp1dbg(espstack, ("esp_outbound: can't allocate espmp.\n"));
2712 		/*
2713 		 * TODO:  Find the outbound IRE for this packet and
2714 		 * pass it to ip_drop_packet().
2715 		 */
2716 		ip_drop_packet(mp, B_FALSE, NULL, NULL,
2717 		    DROPPER(ipss, ipds_esp_nomem),
2718 		    &espstack->esp_dropper);
2719 		return (IPSEC_STATUS_FAILED);
2720 	}
2721 	espmp->b_wptr += esplen;
2722 	esph = (esph_t *)espmp->b_rptr;
2723 
2724 	if (is_natt) {
2725 		esp3dbg(espstack, ("esp_outbound: NATT"));
2726 
2727 		udpha = (udpha_t *)espmp->b_rptr;
2728 		udpha->uha_src_port = (assoc->ipsa_local_nat_port != 0) ?
2729 		    assoc->ipsa_local_nat_port : htons(IPPORT_IKE_NATT);
2730 		udpha->uha_dst_port = (assoc->ipsa_remote_nat_port != 0) ?
2731 		    assoc->ipsa_remote_nat_port : htons(IPPORT_IKE_NATT);
2732 		/*
2733 		 * Set the checksum to 0, so that the esp_prepare_udp() call
2734 		 * can do the right thing.
2735 		 */
2736 		udpha->uha_checksum = 0;
2737 		esph = (esph_t *)(udpha + 1);
2738 	}
2739 
2740 	esph->esph_spi = assoc->ipsa_spi;
2741 
2742 	esph->esph_replay = htonl(atomic_add_32_nv(&assoc->ipsa_replay, 1));
2743 	if (esph->esph_replay == 0 && assoc->ipsa_replay_wsize != 0) {
2744 		/*
2745 		 * XXX We have replay counter wrapping.
2746 		 * We probably want to nuke this SA (and its peer).
2747 		 */
2748 		ipsec_assocfailure(info.mi_idnum, 0, 0,
2749 		    SL_ERROR | SL_CONSOLE | SL_WARN,
2750 		    "Outbound ESP SA (0x%x, %s) has wrapped sequence.\n",
2751 		    esph->esph_spi, assoc->ipsa_dstaddr, af,
2752 		    espstack->ipsecesp_netstack);
2753 
2754 		ESP_BUMP_STAT(espstack, out_discards);
2755 		sadb_replay_delete(assoc);
2756 		/*
2757 		 * TODO:  Find the outbound IRE for this packet and
2758 		 * pass it to ip_drop_packet().
2759 		 */
2760 		ip_drop_packet(mp, B_FALSE, NULL, NULL,
2761 		    DROPPER(ipss, ipds_esp_replay),
2762 		    &espstack->esp_dropper);
2763 		return (IPSEC_STATUS_FAILED);
2764 	}
2765 
2766 	/*
2767 	 * Set the IV to a random quantity.  We do not require the
2768 	 * highest quality random bits, but for best security with CBC
2769 	 * mode ciphers, the value must be unlikely to repeat and also
2770 	 * must not be known in advance to an adversary capable of
2771 	 * influencing the plaintext.
2772 	 */
2773 	(void) random_get_pseudo_bytes((uint8_t *)(esph + 1), iv_len);
2774 
2775 	/* Fix the IP header. */
2776 	alloclen = padlen + 2 + mac_len;
2777 	adj = alloclen + (espmp->b_wptr - espmp->b_rptr);
2778 
2779 	protocol = *nhp;
2780 
2781 	if (io->ipsec_out_v4) {
2782 		ipha->ipha_length = htons(ntohs(ipha->ipha_length) + adj);
2783 		if (is_natt) {
2784 			*nhp = IPPROTO_UDP;
2785 			udpha->uha_length = htons(ntohs(ipha->ipha_length) -
2786 			    IPH_HDR_LENGTH(ipha));
2787 		} else {
2788 			*nhp = IPPROTO_ESP;
2789 		}
2790 		ipha->ipha_hdr_checksum = 0;
2791 		ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
2792 	} else {
2793 		ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) + adj);
2794 		*nhp = IPPROTO_ESP;
2795 	}
2796 
2797 	/* I've got the two ESP mblks, now insert them. */
2798 
2799 	esp2dbg(espstack, ("data_mp before outbound ESP adjustment:\n"));
2800 	esp2dbg(espstack, (dump_msg(data_mp)));
2801 
2802 	if (!esp_insert_esp(data_mp, espmp, divpoint, espstack)) {
2803 		ESP_BUMP_STAT(espstack, out_discards);
2804 		/* NOTE:  esp_insert_esp() only fails if there's no memory. */
2805 		/*
2806 		 * TODO:  Find the outbound IRE for this packet and
2807 		 * pass it to ip_drop_packet().
2808 		 */
2809 		ip_drop_packet(mp, B_FALSE, NULL, NULL,
2810 		    DROPPER(ipss, ipds_esp_nomem),
2811 		    &espstack->esp_dropper);
2812 		freeb(espmp);
2813 		return (IPSEC_STATUS_FAILED);
2814 	}
2815 
2816 	/* Append padding (and leave room for ICV). */
2817 	for (tailmp = data_mp; tailmp->b_cont != NULL; tailmp = tailmp->b_cont)
2818 		;
2819 	if (tailmp->b_wptr + alloclen > tailmp->b_datap->db_lim) {
2820 		tailmp->b_cont = allocb(alloclen, BPRI_HI);
2821 		if (tailmp->b_cont == NULL) {
2822 			ESP_BUMP_STAT(espstack, out_discards);
2823 			esp0dbg(("esp_outbound:  Can't allocate tailmp.\n"));
2824 			/*
2825 			 * TODO:  Find the outbound IRE for this packet and
2826 			 * pass it to ip_drop_packet().
2827 			 */
2828 			ip_drop_packet(mp, B_FALSE, NULL, NULL,
2829 			    DROPPER(ipss, ipds_esp_nomem),
2830 			    &espstack->esp_dropper);
2831 			return (IPSEC_STATUS_FAILED);
2832 		}
2833 		tailmp = tailmp->b_cont;
2834 	}
2835 
2836 	/*
2837 	 * If there's padding, N bytes of padding must be of the form 0x1,
2838 	 * 0x2, 0x3... 0xN.
2839 	 */
2840 	for (i = 0; i < padlen; ) {
2841 		i++;
2842 		*tailmp->b_wptr++ = i;
2843 	}
2844 	*tailmp->b_wptr++ = i;
2845 	*tailmp->b_wptr++ = protocol;
2846 
2847 	esp2dbg(espstack, ("data_Mp before encryption:\n"));
2848 	esp2dbg(espstack, (dump_msg(data_mp)));
2849 
2850 	/*
2851 	 * The packet is eligible for hardware acceleration if the
2852 	 * following conditions are satisfied:
2853 	 *
2854 	 * 1. the packet will not be fragmented
2855 	 * 2. the provider supports the algorithms specified by SA
2856 	 * 3. there is no pending control message being exchanged
2857 	 * 4. snoop is not attached
2858 	 * 5. the destination address is not a multicast address
2859 	 *
2860 	 * All five of these conditions are checked by IP prior to
2861 	 * sending the packet to ESP.
2862 	 *
2863 	 * But We, and We Alone, can, nay MUST check if the packet
2864 	 * is over NATT, and then disqualify it from hardware
2865 	 * acceleration.
2866 	 */
2867 
2868 	if (io->ipsec_out_is_capab_ill && !(assoc->ipsa_flags & IPSA_F_NATT)) {
2869 		return (esp_outbound_accelerated(ipsec_out_mp, mac_len));
2870 	}
2871 	ESP_BUMP_STAT(espstack, noaccel);
2872 
2873 	/*
2874 	 * Okay.  I've set up the pre-encryption ESP.  Let's do it!
2875 	 */
2876 
2877 	if (mac_len > 0) {
2878 		ASSERT(tailmp->b_wptr + mac_len <= tailmp->b_datap->db_lim);
2879 		icv_buf = tailmp->b_wptr;
2880 		tailmp->b_wptr += mac_len;
2881 	} else {
2882 		icv_buf = NULL;
2883 	}
2884 
2885 	return (esp_submit_req_outbound(ipsec_out_mp, assoc, icv_buf,
2886 	    datalen + padlen + 2));
2887 }
2888 
2889 /*
2890  * IP calls this to validate the ICMP errors that
2891  * we got from the network.
2892  */
2893 ipsec_status_t
2894 ipsecesp_icmp_error(mblk_t *ipsec_mp)
2895 {
2896 	ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr;
2897 	boolean_t is_inbound = (ii->ipsec_in_type == IPSEC_IN);
2898 	netstack_t	*ns;
2899 	ipsecesp_stack_t *espstack;
2900 	ipsec_stack_t	*ipss;
2901 
2902 	if (is_inbound) {
2903 		ns = ii->ipsec_in_ns;
2904 	} else {
2905 		ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr;
2906 
2907 		ns = io->ipsec_out_ns;
2908 	}
2909 	espstack = ns->netstack_ipsecesp;
2910 	ipss = ns->netstack_ipsec;
2911 
2912 	/*
2913 	 * Unless we get an entire packet back, this function is useless.
2914 	 * Why?
2915 	 *
2916 	 * 1.)	Partial packets are useless, because the "next header"
2917 	 *	is at the end of the decrypted ESP packet.  Without the
2918 	 *	whole packet, this is useless.
2919 	 *
2920 	 * 2.)	If we every use a stateful cipher, such as a stream or a
2921 	 *	one-time pad, we can't do anything.
2922 	 *
2923 	 * Since the chances of us getting an entire packet back are very
2924 	 * very small, we discard here.
2925 	 */
2926 	IP_ESP_BUMP_STAT(ipss, in_discards);
2927 	ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL,
2928 	    DROPPER(ipss, ipds_esp_icmp),
2929 	    &espstack->esp_dropper);
2930 	return (IPSEC_STATUS_FAILED);
2931 }
2932 
2933 /*
2934  * ESP module read put routine.
2935  */
2936 /* ARGSUSED */
2937 static void
2938 ipsecesp_rput(queue_t *q, mblk_t *mp)
2939 {
2940 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
2941 
2942 	ASSERT(mp->b_datap->db_type != M_CTL);	/* No more IRE_DB_REQ. */
2943 
2944 	switch (mp->b_datap->db_type) {
2945 	case M_PROTO:
2946 	case M_PCPROTO:
2947 		/* TPI message of some sort. */
2948 		switch (*((t_scalar_t *)mp->b_rptr)) {
2949 		case T_BIND_ACK:
2950 			esp3dbg(espstack,
2951 			    ("Thank you IP from ESP for T_BIND_ACK\n"));
2952 			break;
2953 		case T_ERROR_ACK:
2954 			cmn_err(CE_WARN,
2955 			    "ipsecesp:  ESP received T_ERROR_ACK from IP.");
2956 			/*
2957 			 * Make esp_sadb.s_ip_q NULL, and in the
2958 			 * future, perhaps try again.
2959 			 */
2960 			espstack->esp_sadb.s_ip_q = NULL;
2961 			break;
2962 		case T_OK_ACK:
2963 			/* Probably from a (rarely sent) T_UNBIND_REQ. */
2964 			break;
2965 		default:
2966 			esp0dbg(("Unknown M_{,PC}PROTO message.\n"));
2967 		}
2968 		freemsg(mp);
2969 		break;
2970 	default:
2971 		/* For now, passthru message. */
2972 		esp2dbg(espstack, ("ESP got unknown mblk type %d.\n",
2973 		    mp->b_datap->db_type));
2974 		putnext(q, mp);
2975 	}
2976 }
2977 
/*
 * Construct an SADB_REGISTER message with the current algorithms and send
 * it up esp_pfkey_q.
 *
 * The message is a KEYSOCK_OUT mblk (built by sadb_keysock_out(serial))
 * followed by one mblk holding, in order: the sadb_msg_t header, an
 * optional "supported auth" extension and an optional "supported encrypt"
 * extension, each followed by one sadb_alg_t per valid algorithm.
 *
 * sequence and pid are copied verbatim into the sadb_msg_t header.
 *
 * Returns B_TRUE once the message has been passed to putnext().  Returns
 * B_FALSE if either allocation fails or if esp_pfkey_q is NULL; in those
 * cases anything allocated here has been freed.
 */
static boolean_t
esp_register_out(uint32_t sequence, uint32_t pid, uint_t serial,
    ipsecesp_stack_t *espstack)
{
	mblk_t *pfkey_msg_mp, *keysock_out_mp;
	sadb_msg_t *samsg;
	sadb_supported_t *sasupp_auth = NULL;
	sadb_supported_t *sasupp_encr = NULL;
	sadb_alg_t *saalg;
	uint_t allocsize = sizeof (*samsg);
	uint_t i, numalgs_snap;
	int current_aalgs;
	ipsec_alginfo_t **authalgs;
	uint_t num_aalgs;
	int current_ealgs;
	ipsec_alginfo_t **encralgs;
	uint_t num_ealgs;
	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;

	/* Allocate the KEYSOCK_OUT. */
	keysock_out_mp = sadb_keysock_out(serial);
	if (keysock_out_mp == NULL) {
		esp0dbg(("esp_register_out: couldn't allocate mblk.\n"));
		return (B_FALSE);
	}

	/*
	 * Allocate the PF_KEY message that follows KEYSOCK_OUT.
	 */

	mutex_enter(&ipss->ipsec_alg_lock);

	/*
	 * Fill SADB_REGISTER message's algorithm descriptors.  Hold
	 * down the lock while filling it.
	 *
	 * Return only valid algorithms, so the number of algorithms
	 * to send up may be less than the number of algorithm entries
	 * in the table.
	 */

	/* First pass: count valid auth algorithms to size the message. */
	authalgs = ipss->ipsec_alglists[IPSEC_ALG_AUTH];
	for (num_aalgs = 0, i = 0; i < IPSEC_MAX_ALGS; i++)
		if (authalgs[i] != NULL && ALG_VALID(authalgs[i]))
			num_aalgs++;

	/* The extension header is only included when it has entries. */
	if (num_aalgs != 0) {
		allocsize += (num_aalgs * sizeof (*saalg));
		allocsize += sizeof (*sasupp_auth);
	}

	/* Same counting pass for the encryption algorithms. */
	encralgs = ipss->ipsec_alglists[IPSEC_ALG_ENCR];
	for (num_ealgs = 0, i = 0; i < IPSEC_MAX_ALGS; i++)
		if (encralgs[i] != NULL && ALG_VALID(encralgs[i]))
			num_ealgs++;

	if (num_ealgs != 0) {
		allocsize += (num_ealgs * sizeof (*saalg));
		allocsize += sizeof (*sasupp_encr);
	}

	/* Allocated while still holding the lock so the counts stay valid. */
	keysock_out_mp->b_cont = allocb(allocsize, BPRI_HI);
	if (keysock_out_mp->b_cont == NULL) {
		mutex_exit(&ipss->ipsec_alg_lock);
		freemsg(keysock_out_mp);
		return (B_FALSE);
	}

	/* Claim the whole buffer up front; the fields are filled in below. */
	pfkey_msg_mp = keysock_out_mp->b_cont;
	pfkey_msg_mp->b_wptr += allocsize;
	if (num_aalgs != 0) {
		/* Auth extension sits directly after the sadb_msg_t header. */
		sasupp_auth = (sadb_supported_t *)
		    (pfkey_msg_mp->b_rptr + sizeof (*samsg));
		saalg = (sadb_alg_t *)(sasupp_auth + 1);

		/* The descriptors are expected to start 8-byte aligned. */
		ASSERT(((ulong_t)saalg & 0x7) == 0);

		/* Second pass: emit one descriptor per valid auth algorithm. */
		numalgs_snap = 0;
		for (i = 0;
		    ((i < IPSEC_MAX_ALGS) && (numalgs_snap < num_aalgs));
		    i++) {
			if (authalgs[i] == NULL || !ALG_VALID(authalgs[i]))
				continue;

			saalg->sadb_alg_id = authalgs[i]->alg_id;
			/* Auth algorithms are reported with an IV length of 0. */
			saalg->sadb_alg_ivlen = 0;
			saalg->sadb_alg_minbits	= authalgs[i]->alg_ef_minbits;
			saalg->sadb_alg_maxbits	= authalgs[i]->alg_ef_maxbits;
			saalg->sadb_x_alg_defincr = authalgs[i]->alg_ef_default;
			saalg->sadb_x_alg_increment =
			    authalgs[i]->alg_increment;
			numalgs_snap++;
			saalg++;
		}
		ASSERT(numalgs_snap == num_aalgs);
#ifdef DEBUG
		/*
		 * Reality check to make sure I snagged all of the
		 * algorithms.  This continues from where the fill loop
		 * stopped (i is deliberately not reset).
		 */
		for (; i < IPSEC_MAX_ALGS; i++) {
			if (authalgs[i] != NULL && ALG_VALID(authalgs[i])) {
				cmn_err(CE_PANIC, "esp_register_out()! "
				    "Missed aalg #%d.\n", i);
			}
		}
#endif /* DEBUG */
	} else {
		/* No auth extension: the next extension follows the header. */
		saalg = (sadb_alg_t *)(pfkey_msg_mp->b_rptr + sizeof (*samsg));
	}

	if (num_ealgs != 0) {
		/* saalg currently points just past whatever was written. */
		sasupp_encr = (sadb_supported_t *)saalg;
		saalg = (sadb_alg_t *)(sasupp_encr + 1);

		numalgs_snap = 0;
		for (i = 0;
		    ((i < IPSEC_MAX_ALGS) && (numalgs_snap < num_ealgs)); i++) {
			if (encralgs[i] == NULL || !ALG_VALID(encralgs[i]))
				continue;
			saalg->sadb_alg_id = encralgs[i]->alg_id;
			/* For ciphers the IV length comes from alg_datalen. */
			saalg->sadb_alg_ivlen = encralgs[i]->alg_datalen;
			saalg->sadb_alg_minbits	= encralgs[i]->alg_ef_minbits;
			saalg->sadb_alg_maxbits	= encralgs[i]->alg_ef_maxbits;
			saalg->sadb_x_alg_defincr = encralgs[i]->alg_ef_default;
			saalg->sadb_x_alg_increment =
			    encralgs[i]->alg_increment;
			numalgs_snap++;
			saalg++;
		}
		ASSERT(numalgs_snap == num_ealgs);
#ifdef DEBUG
		/*
		 * Reality check to make sure I snagged all of the
		 * algorithms.  As above, i carries over from the fill loop.
		 */
		for (; i < IPSEC_MAX_ALGS; i++) {
			if (encralgs[i] != NULL && ALG_VALID(encralgs[i])) {
				cmn_err(CE_PANIC, "esp_register_out()! "
				    "Missed ealg #%d.\n", i);
			}
		}
#endif /* DEBUG */
	}

	/* Snapshot the counts for the extension lengths computed below. */
	current_aalgs = num_aalgs;
	current_ealgs = num_ealgs;

	mutex_exit(&ipss->ipsec_alg_lock);

	/* Now fill the rest of the SADB_REGISTER message. */

	samsg = (sadb_msg_t *)pfkey_msg_mp->b_rptr;
	samsg->sadb_msg_version = PF_KEY_V2;
	samsg->sadb_msg_type = SADB_REGISTER;
	samsg->sadb_msg_errno = 0;
	samsg->sadb_msg_satype = SADB_SATYPE_ESP;
	/* Lengths are stored via SADB_8TO64() of the byte count. */
	samsg->sadb_msg_len = SADB_8TO64(allocsize);
	samsg->sadb_msg_reserved = 0;
	/*
	 * Assume caller has sufficient sequence/pid number info.  If it's one
	 * from me over a new alg., I could give two hoots about sequence.
	 */
	samsg->sadb_msg_seq = sequence;
	samsg->sadb_msg_pid = pid;

	/* Extension headers exist only if the matching list was non-empty. */
	if (sasupp_auth != NULL) {
		sasupp_auth->sadb_supported_len = SADB_8TO64(
		    sizeof (*sasupp_auth) + sizeof (*saalg) * current_aalgs);
		sasupp_auth->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH;
		sasupp_auth->sadb_supported_reserved = 0;
	}

	if (sasupp_encr != NULL) {
		sasupp_encr->sadb_supported_len = SADB_8TO64(
		    sizeof (*sasupp_encr) + sizeof (*saalg) * current_ealgs);
		sasupp_encr->sadb_supported_exttype =
		    SADB_EXT_SUPPORTED_ENCRYPT;
		sasupp_encr->sadb_supported_reserved = 0;
	}

	/* Without a PF_KEY queue there is nobody to tell; discard. */
	if (espstack->esp_pfkey_q != NULL)
		putnext(espstack->esp_pfkey_q, keysock_out_mp);
	else {
		freemsg(keysock_out_mp);
		return (B_FALSE);
	}

	return (B_TRUE);
}
3168 
3169 /*
3170  * Invoked when the algorithm table changes. Causes SADB_REGISTER
3171  * messages continaining the current list of algorithms to be
3172  * sent up to the ESP listeners.
3173  */
3174 void
3175 ipsecesp_algs_changed(netstack_t *ns)
3176 {
3177 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
3178 
3179 	/*
3180 	 * Time to send a PF_KEY SADB_REGISTER message to ESP listeners
3181 	 * everywhere.  (The function itself checks for NULL esp_pfkey_q.)
3182 	 */
3183 	(void) esp_register_out(0, 0, 0, espstack);
3184 }
3185 
3186 /*
3187  * Stub function that taskq_dispatch() invokes to take the mblk (in arg)
3188  * and put() it into AH and STREAMS again.
3189  */
3190 static void
3191 inbound_task(void *arg)
3192 {
3193 	esph_t *esph;
3194 	mblk_t *mp = (mblk_t *)arg;
3195 	ipsec_in_t *ii = (ipsec_in_t *)mp->b_rptr;
3196 	netstack_t *ns;
3197 	ipsecesp_stack_t *espstack;
3198 	int ipsec_rc;
3199 
3200 	ns = netstack_find_by_stackid(ii->ipsec_in_stackid);
3201 	if (ns == NULL || ns != ii->ipsec_in_ns) {
3202 		/* Just freemsg(). */
3203 		if (ns != NULL)
3204 			netstack_rele(ns);
3205 		freemsg(mp);
3206 		return;
3207 	}
3208 
3209 	espstack = ns->netstack_ipsecesp;
3210 
3211 	esp2dbg(espstack, ("in ESP inbound_task"));
3212 	ASSERT(espstack != NULL);
3213 
3214 	esph = ipsec_inbound_esp_sa(mp, ns);
3215 	if (esph != NULL) {
3216 		ASSERT(ii->ipsec_in_esp_sa != NULL);
3217 		ipsec_rc = ii->ipsec_in_esp_sa->ipsa_input_func(mp, esph);
3218 		if (ipsec_rc == IPSEC_STATUS_SUCCESS)
3219 			ip_fanout_proto_again(mp, NULL, NULL, NULL);
3220 	}
3221 	netstack_rele(ns);
3222 }
3223 
3224 /*
3225  * Now that weak-key passed, actually ADD the security association, and
3226  * send back a reply ADD message.
3227  */
3228 static int
3229 esp_add_sa_finish(mblk_t *mp, sadb_msg_t *samsg, keysock_in_t *ksi,
3230     int *diagnostic, ipsecesp_stack_t *espstack)
3231 {
3232 	isaf_t *primary = NULL, *secondary, *inbound, *outbound;
3233 	sadb_sa_t *assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SA];
3234 	sadb_address_t *dstext =
3235 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3236 	struct sockaddr_in *dst;
3237 	struct sockaddr_in6 *dst6;
3238 	boolean_t is_ipv4, clone = B_FALSE, is_inbound = B_FALSE;
3239 	uint32_t *dstaddr;
3240 	ipsa_t *larval = NULL;
3241 	ipsacq_t *acqrec;
3242 	iacqf_t *acq_bucket;
3243 	mblk_t *acq_msgs = NULL;
3244 	int rc;
3245 	sadb_t *sp;
3246 	int outhash;
3247 	mblk_t *lpkt;
3248 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
3249 
3250 	/*
3251 	 * Locate the appropriate table(s).
3252 	 */
3253 
3254 	dst = (struct sockaddr_in *)(dstext + 1);
3255 	dst6 = (struct sockaddr_in6 *)dst;
3256 	is_ipv4 = (dst->sin_family == AF_INET);
3257 	if (is_ipv4) {
3258 		sp = &espstack->esp_sadb.s_v4;
3259 		dstaddr = (uint32_t *)(&dst->sin_addr);
3260 		outhash = OUTBOUND_HASH_V4(sp, *(ipaddr_t *)dstaddr);
3261 	} else {
3262 		sp = &espstack->esp_sadb.s_v6;
3263 		dstaddr = (uint32_t *)(&dst6->sin6_addr);
3264 		outhash = OUTBOUND_HASH_V6(sp, *(in6_addr_t *)dstaddr);
3265 	}
3266 
3267 	inbound = INBOUND_BUCKET(sp, assoc->sadb_sa_spi);
3268 	outbound = &sp->sdb_of[outhash];
3269 
3270 	/*
3271 	 * Use the direction flags provided by the KMD to determine
3272 	 * if the inbound or outbound table should be the primary
3273 	 * for this SA. If these flags were absent then make this
3274 	 * decision based on the addresses.
3275 	 */
3276 	if (assoc->sadb_sa_flags & IPSA_F_INBOUND) {
3277 		primary = inbound;
3278 		secondary = outbound;
3279 		is_inbound = B_TRUE;
3280 		if (assoc->sadb_sa_flags & IPSA_F_OUTBOUND)
3281 			clone = B_TRUE;
3282 	} else {
3283 		if (assoc->sadb_sa_flags & IPSA_F_OUTBOUND) {
3284 			primary = outbound;
3285 			secondary = inbound;
3286 		}
3287 	}
3288 
3289 	if (primary == NULL) {
3290 		/*
3291 		 * The KMD did not set a direction flag, determine which
3292 		 * table to insert the SA into based on addresses.
3293 		 */
3294 		switch (ksi->ks_in_dsttype) {
3295 		case KS_IN_ADDR_MBCAST:
3296 			clone = B_TRUE;	/* All mcast SAs can be bidirectional */
3297 			assoc->sadb_sa_flags |= IPSA_F_OUTBOUND;
3298 			/* FALLTHRU */
3299 		/*
3300 		 * If the source address is either one of mine, or unspecified
3301 		 * (which is best summed up by saying "not 'not mine'"),
3302 		 * then the association is potentially bi-directional,
3303 		 * in that it can be used for inbound traffic and outbound
3304 		 * traffic.  The best example of such an SA is a multicast
3305 		 * SA (which allows me to receive the outbound traffic).
3306 		 */
3307 		case KS_IN_ADDR_ME:
3308 			assoc->sadb_sa_flags |= IPSA_F_INBOUND;
3309 			primary = inbound;
3310 			secondary = outbound;
3311 			if (ksi->ks_in_srctype != KS_IN_ADDR_NOTME)
3312 				clone = B_TRUE;
3313 			is_inbound = B_TRUE;
3314 			break;
3315 		/*
3316 		 * If the source address literally not mine (either
3317 		 * unspecified or not mine), then this SA may have an
3318 		 * address that WILL be mine after some configuration.
3319 		 * We pay the price for this by making it a bi-directional
3320 		 * SA.
3321 		 */
3322 		case KS_IN_ADDR_NOTME:
3323 			assoc->sadb_sa_flags |= IPSA_F_OUTBOUND;
3324 			primary = outbound;
3325 			secondary = inbound;
3326 			if (ksi->ks_in_srctype != KS_IN_ADDR_ME) {
3327 				assoc->sadb_sa_flags |= IPSA_F_INBOUND;
3328 				clone = B_TRUE;
3329 			}
3330 			break;
3331 		default:
3332 			*diagnostic = SADB_X_DIAGNOSTIC_BAD_DST;
3333 			return (EINVAL);
3334 		}
3335 	}
3336 
3337 	/*
3338 	 * Find a ACQUIRE list entry if possible.  If we've added an SA that
3339 	 * suits the needs of an ACQUIRE list entry, we can eliminate the
3340 	 * ACQUIRE list entry and transmit the enqueued packets.  Use the
3341 	 * high-bit of the sequence number to queue it.  Key off destination
3342 	 * addr, and change acqrec's state.
3343 	 */
3344 
3345 	if (samsg->sadb_msg_seq & IACQF_LOWEST_SEQ) {
3346 		acq_bucket = &sp->sdb_acq[outhash];
3347 		mutex_enter(&acq_bucket->iacqf_lock);
3348 		for (acqrec = acq_bucket->iacqf_ipsacq; acqrec != NULL;
3349 		    acqrec = acqrec->ipsacq_next) {
3350 			mutex_enter(&acqrec->ipsacq_lock);
3351 			/*
3352 			 * Q:  I only check sequence.  Should I check dst?
3353 			 * A: Yes, check dest because those are the packets
3354 			 *    that are queued up.
3355 			 */
3356 			if (acqrec->ipsacq_seq == samsg->sadb_msg_seq &&
3357 			    IPSA_ARE_ADDR_EQUAL(dstaddr,
3358 			    acqrec->ipsacq_dstaddr, acqrec->ipsacq_addrfam))
3359 				break;
3360 			mutex_exit(&acqrec->ipsacq_lock);
3361 		}
3362 		if (acqrec != NULL) {
3363 			/*
3364 			 * AHA!  I found an ACQUIRE record for this SA.
3365 			 * Grab the msg list, and free the acquire record.
3366 			 * I already am holding the lock for this record,
3367 			 * so all I have to do is free it.
3368 			 */
3369 			acq_msgs = acqrec->ipsacq_mp;
3370 			acqrec->ipsacq_mp = NULL;
3371 			mutex_exit(&acqrec->ipsacq_lock);
3372 			sadb_destroy_acquire(acqrec,
3373 			    espstack->ipsecesp_netstack);
3374 		}
3375 		mutex_exit(&acq_bucket->iacqf_lock);
3376 	}
3377 
3378 	/*
3379 	 * Find PF_KEY message, and see if I'm an update.  If so, find entry
3380 	 * in larval list (if there).
3381 	 */
3382 
3383 	if (samsg->sadb_msg_type == SADB_UPDATE) {
3384 		mutex_enter(&inbound->isaf_lock);
3385 		larval = ipsec_getassocbyspi(inbound, assoc->sadb_sa_spi,
3386 		    ALL_ZEROES_PTR, dstaddr, dst->sin_family);
3387 		mutex_exit(&inbound->isaf_lock);
3388 
3389 		if ((larval == NULL) ||
3390 		    (larval->ipsa_state != IPSA_STATE_LARVAL)) {
3391 			*diagnostic = SADB_X_DIAGNOSTIC_SA_NOTFOUND;
3392 			if (larval != NULL) {
3393 				IPSA_REFRELE(larval);
3394 			}
3395 			esp0dbg(("Larval update, but larval disappeared.\n"));
3396 			return (ESRCH);
3397 		} /* Else sadb_common_add unlinks it for me! */
3398 	}
3399 
3400 	lpkt = NULL;
3401 	if (larval != NULL)
3402 		lpkt = sadb_clear_lpkt(larval);
3403 
3404 	rc = sadb_common_add(espstack->esp_sadb.s_ip_q, espstack->esp_pfkey_q,
3405 	    mp, samsg, ksi, primary, secondary, larval, clone, is_inbound,
3406 	    diagnostic, espstack->ipsecesp_netstack, &espstack->esp_sadb);
3407 
3408 	if (rc == 0 && lpkt != NULL)
3409 		rc = !taskq_dispatch(esp_taskq, inbound_task, lpkt, TQ_NOSLEEP);
3410 
3411 	if (rc != 0) {
3412 		ip_drop_packet(lpkt, B_TRUE, NULL, NULL,
3413 		    DROPPER(ipss, ipds_sadb_inlarval_timeout),
3414 		    &espstack->esp_dropper);
3415 	}
3416 
3417 	/*
3418 	 * How much more stack will I create with all of these
3419 	 * esp_outbound() calls?
3420 	 */
3421 
3422 	while (acq_msgs != NULL) {
3423 		mblk_t *mp = acq_msgs;
3424 
3425 		acq_msgs = acq_msgs->b_next;
3426 		mp->b_next = NULL;
3427 		if (rc == 0) {
3428 			if (ipsec_outbound_sa(mp, IPPROTO_ESP)) {
3429 				((ipsec_out_t *)(mp->b_rptr))->
3430 				    ipsec_out_esp_done = B_TRUE;
3431 				if (esp_outbound(mp) == IPSEC_STATUS_SUCCESS) {
3432 					ipha_t *ipha;
3433 
3434 					/* do AH processing if needed */
3435 					if (!esp_do_outbound_ah(mp))
3436 						continue;
3437 
3438 					ipha = (ipha_t *)mp->b_cont->b_rptr;
3439 
3440 					/* finish IPsec processing */
3441 					if (is_ipv4) {
3442 						ip_wput_ipsec_out(NULL, mp,
3443 						    ipha, NULL, NULL);
3444 					} else {
3445 						ip6_t *ip6h = (ip6_t *)ipha;
3446 						ip_wput_ipsec_out_v6(NULL,
3447 						    mp, ip6h, NULL, NULL);
3448 					}
3449 				}
3450 				continue;
3451 			}
3452 		}
3453 		ESP_BUMP_STAT(espstack, out_discards);
3454 		ip_drop_packet(mp, B_FALSE, NULL, NULL,
3455 		    DROPPER(ipss, ipds_sadb_acquire_timeout),
3456 		    &espstack->esp_dropper);
3457 	}
3458 
3459 	return (rc);
3460 }
3461 
/*
 * Add new ESP security association.  This may become a generic AH/ESP
 * routine eventually.
 *
 * This routine only validates the request; the actual insertion is done
 * by esp_add_sa_finish(), whose return value is passed back on success.
 *
 * mp		- keysock message; mp->b_cont holds the PF_KEY message.
 * ksi		- parsed extension vector for the message.
 * diagnostic	- out: PF_KEY diagnostic describing a validation failure.
 * ns		- netstack whose ESP and IPsec state is used.
 *
 * Returns EINVAL (with *diagnostic set) for a missing or inconsistent
 * extension, bad SA state/flags/algorithm or bad key; EOPNOTSUPP if a
 * current-lifetime or sensitivity extension is present.  The checks run
 * in the order written, so the first failing one decides the diagnostic.
 */
static int
esp_add_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic, netstack_t *ns)
{
	sadb_sa_t *assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SA];
	sadb_address_t *srcext =
	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_SRC];
	sadb_address_t *dstext =
	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
	sadb_address_t *isrcext =
	    (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_INNER_SRC];
	sadb_address_t *idstext =
	    (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_INNER_DST];
	sadb_address_t *nttext_loc =
	    (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_NATT_LOC];
	sadb_address_t *nttext_rem =
	    (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_NATT_REM];
	sadb_key_t *akey = (sadb_key_t *)ksi->ks_in_extv[SADB_EXT_KEY_AUTH];
	sadb_key_t *ekey = (sadb_key_t *)ksi->ks_in_extv[SADB_EXT_KEY_ENCRYPT];
	struct sockaddr_in *src, *dst;
	struct sockaddr_in *natt_loc, *natt_rem;
	struct sockaddr_in6 *natt_loc6, *natt_rem6;
	sadb_lifetime_t *soft =
	    (sadb_lifetime_t *)ksi->ks_in_extv[SADB_EXT_LIFETIME_SOFT];
	sadb_lifetime_t *hard =
	    (sadb_lifetime_t *)ksi->ks_in_extv[SADB_EXT_LIFETIME_HARD];
	sadb_lifetime_t *idle =
	    (sadb_lifetime_t *)ksi->ks_in_extv[SADB_X_EXT_LIFETIME_IDLE];
	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
	ipsec_stack_t	*ipss = ns->netstack_ipsec;

	/* I need certain extensions present for an ADD message. */
	if (srcext == NULL) {
		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_SRC;
		return (EINVAL);
	}
	if (dstext == NULL) {
		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_DST;
		return (EINVAL);
	}
	/* Inner addresses must come as a pair or not at all. */
	if (isrcext == NULL && idstext != NULL) {
		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_INNER_SRC;
		return (EINVAL);
	}
	if (isrcext != NULL && idstext == NULL) {
		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_INNER_DST;
		return (EINVAL);
	}
	if (assoc == NULL) {
		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_SA;
		return (EINVAL);
	}
	/* Only the NULL cipher may be used without an encryption key. */
	if (ekey == NULL && assoc->sadb_sa_encrypt != SADB_EALG_NULL) {
		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_EKEY;
		return (EINVAL);
	}

	/*
	 * The sockaddr follows each address extension header.  The NAT-T
	 * pointers are computed unconditionally but are only dereferenced
	 * below after the matching extension has been checked for NULL.
	 */
	src = (struct sockaddr_in *)(srcext + 1);
	dst = (struct sockaddr_in *)(dstext + 1);
	natt_loc = (struct sockaddr_in *)(nttext_loc + 1);
	natt_loc6 = (struct sockaddr_in6 *)(nttext_loc + 1);
	natt_rem = (struct sockaddr_in *)(nttext_rem + 1);
	natt_rem6 = (struct sockaddr_in6 *)(nttext_rem + 1);

	/* Sundry ADD-specific reality checks. */
	/* XXX STATS :  Logging/stats here? */

	if ((assoc->sadb_sa_state != SADB_SASTATE_MATURE) &&
	    (assoc->sadb_sa_state != SADB_X_SASTATE_ACTIVE_ELSEWHERE)) {
		*diagnostic = SADB_X_DIAGNOSTIC_BAD_SASTATE;
		return (EINVAL);
	}
	if (assoc->sadb_sa_encrypt == SADB_EALG_NONE) {
		*diagnostic = SADB_X_DIAGNOSTIC_BAD_EALG;
		return (EINVAL);
	}

	/* NULL encryption with no authentication is refused. */
	if (assoc->sadb_sa_encrypt == SADB_EALG_NULL &&
	    assoc->sadb_sa_auth == SADB_AALG_NONE) {
		*diagnostic = SADB_X_DIAGNOSTIC_BAD_AALG;
		return (EINVAL);
	}

	/* Reject any flag bit outside this SADB's s_addflags mask. */
	if (assoc->sadb_sa_flags & ~espstack->esp_sadb.s_addflags) {
		*diagnostic = SADB_X_DIAGNOSTIC_BAD_SAFLAGS;
		return (EINVAL);
	}

	/* A non-zero result is itself the diagnostic code. */
	if ((*diagnostic = sadb_hardsoftchk(hard, soft, idle)) != 0) {
		return (EINVAL);
	}
	/* NOTE(review): presumably guaranteed by earlier address checks. */
	ASSERT(src->sin_family == dst->sin_family);

	if (assoc->sadb_sa_flags & SADB_X_SAFLAGS_NATT_LOC) {
		if (nttext_loc == NULL) {
			*diagnostic = SADB_X_DIAGNOSTIC_MISSING_NATT_LOC;
			return (EINVAL);
		}

		/* An AF_INET6 NAT-T address must be a v4-mapped one. */
		if (natt_loc->sin_family == AF_INET6 &&
		    !IN6_IS_ADDR_V4MAPPED(&natt_loc6->sin6_addr)) {
			*diagnostic = SADB_X_DIAGNOSTIC_MALFORMED_NATT_LOC;
			return (EINVAL);
		}
	}

	if (assoc->sadb_sa_flags & SADB_X_SAFLAGS_NATT_REM) {
		if (nttext_rem == NULL) {
			*diagnostic = SADB_X_DIAGNOSTIC_MISSING_NATT_REM;
			return (EINVAL);
		}
		/* Same v4-mapped requirement for the remote NAT-T address. */
		if (natt_rem->sin_family == AF_INET6 &&
		    !IN6_IS_ADDR_V4MAPPED(&natt_rem6->sin6_addr)) {
			*diagnostic = SADB_X_DIAGNOSTIC_MALFORMED_NATT_REM;
			return (EINVAL);
		}
	}


	/* Stuff I don't support, for now.  XXX Diagnostic? */
	if (ksi->ks_in_extv[SADB_EXT_LIFETIME_CURRENT] != NULL ||
	    ksi->ks_in_extv[SADB_EXT_SENSITIVITY] != NULL)
		return (EOPNOTSUPP);

	/*
	 * XXX Policy :  I'm not checking identities or sensitivity
	 * labels at this time, but if I did, I'd do them here, before I sent
	 * the weak key check up to the algorithm.
	 */

	/*
	 * The algorithm tables are consulted under ipsec_alg_lock; every
	 * return path below releases it first.
	 */
	mutex_enter(&ipss->ipsec_alg_lock);

	/*
	 * First locate the authentication algorithm.
	 */
	if (akey != NULL) {
		ipsec_alginfo_t *aalg;

		aalg = ipss->ipsec_alglists[IPSEC_ALG_AUTH]
		    [assoc->sadb_sa_auth];
		if (aalg == NULL || !ALG_VALID(aalg)) {
			mutex_exit(&ipss->ipsec_alg_lock);
			esp1dbg(espstack, ("Couldn't find auth alg #%d.\n",
			    assoc->sadb_sa_auth));
			*diagnostic = SADB_X_DIAGNOSTIC_BAD_AALG;
			return (EINVAL);
		}

		/*
		 * Sanity check key sizes.
		 * Note: It's not possible to use SADB_AALG_NONE because
		 * this auth_alg is not defined with ALG_FLAG_VALID. If this
		 * ever changes, the same check for SADB_AALG_NONE and
		 * a auth_key != NULL should be made here ( see below).
		 */
		if (!ipsec_valid_key_size(akey->sadb_key_bits, aalg)) {
			mutex_exit(&ipss->ipsec_alg_lock);
			*diagnostic = SADB_X_DIAGNOSTIC_BAD_AKEYBITS;
			return (EINVAL);
		}
		ASSERT(aalg->alg_mech_type != CRYPTO_MECHANISM_INVALID);

		/* check key and fix parity if needed */
		if (ipsec_check_key(aalg->alg_mech_type, akey, B_TRUE,
		    diagnostic) != 0) {
			mutex_exit(&ipss->ipsec_alg_lock);
			return (EINVAL);
		}
	}

	/*
	 * Then locate the encryption algorithm.
	 */
	if (ekey != NULL) {
		ipsec_alginfo_t *ealg;

		ealg = ipss->ipsec_alglists[IPSEC_ALG_ENCR]
		    [assoc->sadb_sa_encrypt];
		if (ealg == NULL || !ALG_VALID(ealg)) {
			mutex_exit(&ipss->ipsec_alg_lock);
			esp1dbg(espstack, ("Couldn't find encr alg #%d.\n",
			    assoc->sadb_sa_encrypt));
			*diagnostic = SADB_X_DIAGNOSTIC_BAD_EALG;
			return (EINVAL);
		}

		/*
		 * Sanity check key sizes. If the encryption algorithm is
		 * SADB_EALG_NULL but the encryption key is NOT
		 * NULL then complain.
		 */
		if ((assoc->sadb_sa_encrypt == SADB_EALG_NULL) ||
		    (!ipsec_valid_key_size(ekey->sadb_key_bits, ealg))) {
			mutex_exit(&ipss->ipsec_alg_lock);
			*diagnostic = SADB_X_DIAGNOSTIC_BAD_EKEYBITS;
			return (EINVAL);
		}
		ASSERT(ealg->alg_mech_type != CRYPTO_MECHANISM_INVALID);

		/* check key */
		if (ipsec_check_key(ealg->alg_mech_type, ekey, B_FALSE,
		    diagnostic) != 0) {
			mutex_exit(&ipss->ipsec_alg_lock);
			return (EINVAL);
		}
	}
	mutex_exit(&ipss->ipsec_alg_lock);

	/* Validation passed; hand the PF_KEY header and the rest onward. */
	return (esp_add_sa_finish(mp, (sadb_msg_t *)mp->b_cont->b_rptr, ksi,
	    diagnostic, espstack));
}
3676 
3677 /*
3678  * Update a security association.  Updates come in two varieties.  The first
3679  * is an update of lifetimes on a non-larval SA.  The second is an update of
3680  * a larval SA, which ends up looking a lot more like an add.
3681  */
3682 static int
3683 esp_update_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic,
3684     ipsecesp_stack_t *espstack, uint8_t sadb_msg_type)
3685 {
3686 	sadb_sa_t *assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SA];
3687 	mblk_t    *buf_pkt;
3688 	int rcode;
3689 
3690 	sadb_address_t *dstext =
3691 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3692 
3693 	if (dstext == NULL) {
3694 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_DST;
3695 		return (EINVAL);
3696 	}
3697 
3698 	rcode = sadb_update_sa(mp, ksi, &buf_pkt, &espstack->esp_sadb,
3699 	    diagnostic, espstack->esp_pfkey_q, esp_add_sa,
3700 	    espstack->ipsecesp_netstack, sadb_msg_type);
3701 
3702 	if ((assoc->sadb_sa_state != SADB_X_SASTATE_ACTIVE) ||
3703 	    (rcode != 0)) {
3704 		return (rcode);
3705 	}
3706 
3707 	HANDLE_BUF_PKT(esp_taskq, espstack->ipsecesp_netstack->netstack_ipsec,
3708 	    espstack->esp_dropper, buf_pkt);
3709 
3710 	return (rcode);
3711 }
3712 
3713 /*
3714  * Delete a security association.  This is REALLY likely to be code common to
3715  * both AH and ESP.  Find the association, then unlink it.
3716  */
3717 static int
3718 esp_del_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic,
3719     ipsecesp_stack_t *espstack, uint8_t sadb_msg_type)
3720 {
3721 	sadb_sa_t *assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SA];
3722 	sadb_address_t *dstext =
3723 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3724 	sadb_address_t *srcext =
3725 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_SRC];
3726 	struct sockaddr_in *sin;
3727 
3728 	if (assoc == NULL) {
3729 		if (dstext != NULL) {
3730 			sin = (struct sockaddr_in *)(dstext + 1);
3731 		} else if (srcext != NULL) {
3732 			sin = (struct sockaddr_in *)(srcext + 1);
3733 		} else {
3734 			*diagnostic = SADB_X_DIAGNOSTIC_MISSING_SA;
3735 			return (EINVAL);
3736 		}
3737 		return (sadb_purge_sa(mp, ksi,
3738 		    (sin->sin_family == AF_INET6) ? &espstack->esp_sadb.s_v6 :
3739 		    &espstack->esp_sadb.s_v4, espstack->esp_pfkey_q,
3740 		    espstack->esp_sadb.s_ip_q));
3741 	}
3742 
3743 	return (sadb_delget_sa(mp, ksi, &espstack->esp_sadb, diagnostic,
3744 	    espstack->esp_pfkey_q, sadb_msg_type));
3745 }
3746 
3747 /*
3748  * Convert the entire contents of all of ESP's SA tables into PF_KEY SADB_DUMP
3749  * messages.
3750  */
3751 static void
3752 esp_dump(mblk_t *mp, keysock_in_t *ksi, ipsecesp_stack_t *espstack)
3753 {
3754 	int error;
3755 	sadb_msg_t *samsg;
3756 
3757 	/*
3758 	 * Dump each fanout, bailing if error is non-zero.
3759 	 */
3760 
3761 	error = sadb_dump(espstack->esp_pfkey_q, mp, ksi,
3762 	    &espstack->esp_sadb.s_v4);
3763 	if (error != 0)
3764 		goto bail;
3765 
3766 	error = sadb_dump(espstack->esp_pfkey_q, mp, ksi,
3767 	    &espstack->esp_sadb.s_v6);
3768 bail:
3769 	ASSERT(mp->b_cont != NULL);
3770 	samsg = (sadb_msg_t *)mp->b_cont->b_rptr;
3771 	samsg->sadb_msg_errno = (uint8_t)error;
3772 	sadb_pfkey_echo(espstack->esp_pfkey_q, mp,
3773 	    (sadb_msg_t *)mp->b_cont->b_rptr, ksi, NULL);
3774 }
3775 
3776 /*
3777  * First-cut reality check for an inbound PF_KEY message.
3778  */
3779 static boolean_t
3780 esp_pfkey_reality_failures(mblk_t *mp, keysock_in_t *ksi,
3781     ipsecesp_stack_t *espstack)
3782 {
3783 	int diagnostic;
3784 
3785 	if (ksi->ks_in_extv[SADB_EXT_PROPOSAL] != NULL) {
3786 		diagnostic = SADB_X_DIAGNOSTIC_PROP_PRESENT;
3787 		goto badmsg;
3788 	}
3789 	if (ksi->ks_in_extv[SADB_EXT_SUPPORTED_AUTH] != NULL ||
3790 	    ksi->ks_in_extv[SADB_EXT_SUPPORTED_ENCRYPT] != NULL) {
3791 		diagnostic = SADB_X_DIAGNOSTIC_SUPP_PRESENT;
3792 		goto badmsg;
3793 	}
3794 	return (B_FALSE);	/* False ==> no failures */
3795 
3796 badmsg:
3797 	sadb_pfkey_error(espstack->esp_pfkey_q, mp, EINVAL, diagnostic,
3798 	    ksi->ks_in_serial);
3799 	return (B_TRUE);	/* True ==> failures */
3800 }
3801 
3802 /*
3803  * ESP parsing of PF_KEY messages.  Keysock did most of the really silly
3804  * error cases.  What I receive is a fully-formed, syntactically legal
3805  * PF_KEY message.  I then need to check semantics...
3806  *
3807  * This code may become common to AH and ESP.  Stay tuned.
3808  *
3809  * I also make the assumption that db_ref's are cool.  If this assumption
3810  * is wrong, this means that someone other than keysock or me has been
3811  * mucking with PF_KEY messages.
3812  */
3813 static void
3814 esp_parse_pfkey(mblk_t *mp, ipsecesp_stack_t *espstack)
3815 {
3816 	mblk_t *msg = mp->b_cont;
3817 	sadb_msg_t *samsg;
3818 	keysock_in_t *ksi;
3819 	int error;
3820 	int diagnostic = SADB_X_DIAGNOSTIC_NONE;
3821 
3822 	ASSERT(msg != NULL);
3823 
3824 	samsg = (sadb_msg_t *)msg->b_rptr;
3825 	ksi = (keysock_in_t *)mp->b_rptr;
3826 
3827 	/*
3828 	 * If applicable, convert unspecified AF_INET6 to unspecified
3829 	 * AF_INET.  And do other address reality checks.
3830 	 */
3831 	if (!sadb_addrfix(ksi, espstack->esp_pfkey_q, mp,
3832 	    espstack->ipsecesp_netstack) ||
3833 	    esp_pfkey_reality_failures(mp, ksi, espstack)) {
3834 		return;
3835 	}
3836 
3837 	switch (samsg->sadb_msg_type) {
3838 	case SADB_ADD:
3839 		error = esp_add_sa(mp, ksi, &diagnostic,
3840 		    espstack->ipsecesp_netstack);
3841 		if (error != 0) {
3842 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3843 			    diagnostic, ksi->ks_in_serial);
3844 		}
3845 		/* else esp_add_sa() took care of things. */
3846 		break;
3847 	case SADB_DELETE:
3848 	case SADB_X_DELPAIR:
3849 	case SADB_X_DELPAIR_STATE:
3850 		error = esp_del_sa(mp, ksi, &diagnostic, espstack,
3851 		    samsg->sadb_msg_type);
3852 		if (error != 0) {
3853 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3854 			    diagnostic, ksi->ks_in_serial);
3855 		}
3856 		/* Else esp_del_sa() took care of things. */
3857 		break;
3858 	case SADB_GET:
3859 		error = sadb_delget_sa(mp, ksi, &espstack->esp_sadb,
3860 		    &diagnostic, espstack->esp_pfkey_q, samsg->sadb_msg_type);
3861 		if (error != 0) {
3862 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3863 			    diagnostic, ksi->ks_in_serial);
3864 		}
3865 		/* Else sadb_get_sa() took care of things. */
3866 		break;
3867 	case SADB_FLUSH:
3868 		sadbp_flush(&espstack->esp_sadb, espstack->ipsecesp_netstack);
3869 		sadb_pfkey_echo(espstack->esp_pfkey_q, mp, samsg, ksi, NULL);
3870 		break;
3871 	case SADB_REGISTER:
3872 		/*
3873 		 * Hmmm, let's do it!  Check for extensions (there should
3874 		 * be none), extract the fields, call esp_register_out(),
3875 		 * then either free or report an error.
3876 		 *
3877 		 * Keysock takes care of the PF_KEY bookkeeping for this.
3878 		 */
3879 		if (esp_register_out(samsg->sadb_msg_seq, samsg->sadb_msg_pid,
3880 		    ksi->ks_in_serial, espstack)) {
3881 			freemsg(mp);
3882 		} else {
3883 			/*
3884 			 * Only way this path hits is if there is a memory
3885 			 * failure.  It will not return B_FALSE because of
3886 			 * lack of esp_pfkey_q if I am in wput().
3887 			 */
3888 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, ENOMEM,
3889 			    diagnostic, ksi->ks_in_serial);
3890 		}
3891 		break;
3892 	case SADB_UPDATE:
3893 	case SADB_X_UPDATEPAIR:
3894 		/*
3895 		 * Find a larval, if not there, find a full one and get
3896 		 * strict.
3897 		 */
3898 		error = esp_update_sa(mp, ksi, &diagnostic, espstack,
3899 		    samsg->sadb_msg_type);
3900 		if (error != 0) {
3901 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3902 			    diagnostic, ksi->ks_in_serial);
3903 		}
3904 		/* else esp_update_sa() took care of things. */
3905 		break;
3906 	case SADB_GETSPI:
3907 		/*
3908 		 * Reserve a new larval entry.
3909 		 */
3910 		esp_getspi(mp, ksi, espstack);
3911 		break;
3912 	case SADB_ACQUIRE:
3913 		/*
3914 		 * Find larval and/or ACQUIRE record and kill it (them), I'm
3915 		 * most likely an error.  Inbound ACQUIRE messages should only
3916 		 * have the base header.
3917 		 */
3918 		sadb_in_acquire(samsg, &espstack->esp_sadb,
3919 		    espstack->esp_pfkey_q, espstack->ipsecesp_netstack);
3920 		freemsg(mp);
3921 		break;
3922 	case SADB_DUMP:
3923 		/*
3924 		 * Dump all entries.
3925 		 */
3926 		esp_dump(mp, ksi, espstack);
3927 		/* esp_dump will take care of the return message, etc. */
3928 		break;
3929 	case SADB_EXPIRE:
3930 		/* Should never reach me. */
3931 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, EOPNOTSUPP,
3932 		    diagnostic, ksi->ks_in_serial);
3933 		break;
3934 	default:
3935 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, EINVAL,
3936 		    SADB_X_DIAGNOSTIC_UNKNOWN_MSG, ksi->ks_in_serial);
3937 		break;
3938 	}
3939 }
3940 
3941 /*
3942  * Handle case where PF_KEY says it can't find a keysock for one of my
3943  * ACQUIRE messages.
3944  */
3945 static void
3946 esp_keysock_no_socket(mblk_t *mp, ipsecesp_stack_t *espstack)
3947 {
3948 	sadb_msg_t *samsg;
3949 	keysock_out_err_t *kse = (keysock_out_err_t *)mp->b_rptr;
3950 
3951 	if (mp->b_cont == NULL) {
3952 		freemsg(mp);
3953 		return;
3954 	}
3955 	samsg = (sadb_msg_t *)mp->b_cont->b_rptr;
3956 
3957 	/*
3958 	 * If keysock can't find any registered, delete the acquire record
3959 	 * immediately, and handle errors.
3960 	 */
3961 	if (samsg->sadb_msg_type == SADB_ACQUIRE) {
3962 		samsg->sadb_msg_errno = kse->ks_err_errno;
3963 		samsg->sadb_msg_len = SADB_8TO64(sizeof (*samsg));
3964 		/*
3965 		 * Use the write-side of the esp_pfkey_q, in case there is
3966 		 * no esp_sadb.s_ip_q.
3967 		 */
3968 		sadb_in_acquire(samsg, &espstack->esp_sadb,
3969 		    WR(espstack->esp_pfkey_q), espstack->ipsecesp_netstack);
3970 	}
3971 
3972 	freemsg(mp);
3973 }
3974 
3975 /*
3976  * ESP module write put routine.
3977  */
3978 static void
3979 ipsecesp_wput(queue_t *q, mblk_t *mp)
3980 {
3981 	ipsec_info_t *ii;
3982 	struct iocblk *iocp;
3983 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
3984 
3985 	esp3dbg(espstack, ("In esp_wput().\n"));
3986 
3987 	/* NOTE: Each case must take care of freeing or passing mp. */
3988 	switch (mp->b_datap->db_type) {
3989 	case M_CTL:
3990 		if ((mp->b_wptr - mp->b_rptr) < sizeof (ipsec_info_t)) {
3991 			/* Not big enough message. */
3992 			freemsg(mp);
3993 			break;
3994 		}
3995 		ii = (ipsec_info_t *)mp->b_rptr;
3996 
3997 		switch (ii->ipsec_info_type) {
3998 		case KEYSOCK_OUT_ERR:
3999 			esp1dbg(espstack, ("Got KEYSOCK_OUT_ERR message.\n"));
4000 			esp_keysock_no_socket(mp, espstack);
4001 			break;
4002 		case KEYSOCK_IN:
4003 			ESP_BUMP_STAT(espstack, keysock_in);
4004 			esp3dbg(espstack, ("Got KEYSOCK_IN message.\n"));
4005 
4006 			/* Parse the message. */
4007 			esp_parse_pfkey(mp, espstack);
4008 			break;
4009 		case KEYSOCK_HELLO:
4010 			sadb_keysock_hello(&espstack->esp_pfkey_q, q, mp,
4011 			    esp_ager, (void *)espstack, &espstack->esp_event,
4012 			    SADB_SATYPE_ESP);
4013 			break;
4014 		default:
4015 			esp2dbg(espstack, ("Got M_CTL from above of 0x%x.\n",
4016 			    ii->ipsec_info_type));
4017 			freemsg(mp);
4018 			break;
4019 		}
4020 		break;
4021 	case M_IOCTL:
4022 		iocp = (struct iocblk *)mp->b_rptr;
4023 		switch (iocp->ioc_cmd) {
4024 		case ND_SET:
4025 		case ND_GET:
4026 			if (nd_getset(q, espstack->ipsecesp_g_nd, mp)) {
4027 				qreply(q, mp);
4028 				return;
4029 			} else {
4030 				iocp->ioc_error = ENOENT;
4031 			}
4032 			/* FALLTHRU */
4033 		default:
4034 			/* We really don't support any other ioctls, do we? */
4035 
4036 			/* Return EINVAL */
4037 			if (iocp->ioc_error != ENOENT)
4038 				iocp->ioc_error = EINVAL;
4039 			iocp->ioc_count = 0;
4040 			mp->b_datap->db_type = M_IOCACK;
4041 			qreply(q, mp);
4042 			return;
4043 		}
4044 	default:
4045 		esp3dbg(espstack,
4046 		    ("Got default message, type %d, passing to IP.\n",
4047 		    mp->b_datap->db_type));
4048 		putnext(q, mp);
4049 	}
4050 }
4051 
4052 /*
4053  * Process an outbound ESP packet that can be accelerated by a IPsec
4054  * hardware acceleration capable Provider.
4055  * The caller already inserted and initialized the ESP header.
4056  * This function allocates a tagging M_CTL, and adds room at the end
4057  * of the packet to hold the ICV if authentication is needed.
4058  *
4059  * On success returns B_TRUE, on failure returns B_FALSE and frees the
4060  * mblk chain ipsec_out.
4061  */
4062 static ipsec_status_t
4063 esp_outbound_accelerated(mblk_t *ipsec_out, uint_t icv_len)
4064 {
4065 	ipsec_out_t *io;
4066 	mblk_t *lastmp;
4067 	netstack_t	*ns;
4068 	ipsecesp_stack_t *espstack;
4069 	ipsec_stack_t	*ipss;
4070 
4071 	io = (ipsec_out_t *)ipsec_out->b_rptr;
4072 	ns = io->ipsec_out_ns;
4073 	espstack = ns->netstack_ipsecesp;
4074 	ipss = ns->netstack_ipsec;
4075 
4076 	ESP_BUMP_STAT(espstack, out_accelerated);
4077 
4078 	/* mark packet as being accelerated in IPSEC_OUT */
4079 	ASSERT(io->ipsec_out_accelerated == B_FALSE);
4080 	io->ipsec_out_accelerated = B_TRUE;
4081 
4082 	/*
4083 	 * add room at the end of the packet for the ICV if needed
4084 	 */
4085 	if (icv_len > 0) {
4086 		/* go to last mblk */
4087 		lastmp = ipsec_out;	/* For following while loop. */
4088 		do {
4089 			lastmp = lastmp->b_cont;
4090 		} while (lastmp->b_cont != NULL);
4091 
4092 		/* if not enough available room, allocate new mblk */
4093 		if ((lastmp->b_wptr + icv_len) > lastmp->b_datap->db_lim) {
4094 			lastmp->b_cont = allocb(icv_len, BPRI_HI);
4095 			if (lastmp->b_cont == NULL) {
4096 				ESP_BUMP_STAT(espstack, out_discards);
4097 				ip_drop_packet(ipsec_out, B_FALSE, NULL, NULL,
4098 				    DROPPER(ipss, ipds_esp_nomem),
4099 				    &espstack->esp_dropper);
4100 				return (IPSEC_STATUS_FAILED);
4101 			}
4102 			lastmp = lastmp->b_cont;
4103 		}
4104 		lastmp->b_wptr += icv_len;
4105 	}
4106 
4107 	return (IPSEC_STATUS_SUCCESS);
4108 }
4109 
4110 /*
4111  * Process an inbound accelerated ESP packet.
4112  * On success returns B_TRUE, on failure returns B_FALSE and frees the
4113  * mblk chain ipsec_in.
4114  */
4115 static ipsec_status_t
4116 esp_inbound_accelerated(mblk_t *ipsec_in, mblk_t *data_mp, boolean_t isv4,
4117     ipsa_t *assoc)
4118 {
4119 	ipsec_in_t *ii = (ipsec_in_t *)ipsec_in->b_rptr;
4120 	mblk_t *hada_mp;
4121 	uint32_t icv_len = 0;
4122 	da_ipsec_t *hada;
4123 	ipha_t *ipha;
4124 	ip6_t *ip6h;
4125 	kstat_named_t *counter;
4126 	netstack_t	*ns = ii->ipsec_in_ns;
4127 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
4128 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
4129 
4130 	ESP_BUMP_STAT(espstack, in_accelerated);
4131 
4132 	hada_mp = ii->ipsec_in_da;
4133 	ASSERT(hada_mp != NULL);
4134 	hada = (da_ipsec_t *)hada_mp->b_rptr;
4135 
4136 	/*
4137 	 * We only support one level of decapsulation in hardware, so
4138 	 * nuke the pointer.
4139 	 */
4140 	ii->ipsec_in_da = NULL;
4141 	ii->ipsec_in_accelerated = B_FALSE;
4142 
4143 	if (assoc->ipsa_auth_alg != IPSA_AALG_NONE) {
4144 		/*
4145 		 * ESP with authentication. We expect the Provider to have
4146 		 * computed the ICV and placed it in the hardware acceleration
4147 		 * data attributes.
4148 		 *
4149 		 * Extract ICV length from attributes M_CTL and sanity check
4150 		 * its value. We allow the mblk to be smaller than da_ipsec_t
4151 		 * for a small ICV, as long as the entire ICV fits within the
4152 		 * mblk.
4153 		 *
4154 		 * Also ensures that the ICV length computed by Provider
4155 		 * corresponds to the ICV length of the agorithm specified by
4156 		 * the SA.
4157 		 */
4158 		icv_len = hada->da_icv_len;
4159 		if ((icv_len != assoc->ipsa_mac_len) ||
4160 		    (icv_len > DA_ICV_MAX_LEN) || (MBLKL(hada_mp) <
4161 		    (sizeof (da_ipsec_t) - DA_ICV_MAX_LEN + icv_len))) {
4162 			esp0dbg(("esp_inbound_accelerated: "
4163 			    "ICV len (%u) incorrect or mblk too small (%u)\n",
4164 			    icv_len, (uint32_t)(MBLKL(hada_mp))));
4165 			counter = DROPPER(ipss, ipds_esp_bad_auth);
4166 			goto esp_in_discard;
4167 		}
4168 	}
4169 
4170 	/* get pointers to IP header */
4171 	if (isv4) {
4172 		ipha = (ipha_t *)data_mp->b_rptr;
4173 	} else {
4174 		ip6h = (ip6_t *)data_mp->b_rptr;
4175 	}
4176 
4177 	/*
4178 	 * Compare ICV in ESP packet vs ICV computed by adapter.
4179 	 * We also remove the ICV from the end of the packet since
4180 	 * it will no longer be needed.
4181 	 *
4182 	 * Assume that esp_inbound() already ensured that the pkt
4183 	 * was in one mblk.
4184 	 */
4185 	ASSERT(data_mp->b_cont == NULL);
4186 	data_mp->b_wptr -= icv_len;
4187 	/* adjust IP header */
4188 	if (isv4)
4189 		ipha->ipha_length = htons(ntohs(ipha->ipha_length) - icv_len);
4190 	else
4191 		ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - icv_len);
4192 	if (icv_len && bcmp(hada->da_icv, data_mp->b_wptr, icv_len)) {
4193 		int af;
4194 		void *addr;
4195 
4196 		if (isv4) {
4197 			addr = &ipha->ipha_dst;
4198 			af = AF_INET;
4199 		} else {
4200 			addr = &ip6h->ip6_dst;
4201 			af = AF_INET6;
4202 		}
4203 
4204 		/*
4205 		 * Log the event. Don't print to the console, block
4206 		 * potential denial-of-service attack.
4207 		 */
4208 		ESP_BUMP_STAT(espstack, bad_auth);
4209 		ipsec_assocfailure(info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
4210 		    "ESP Authentication failed spi %x, dst_addr %s",
4211 		    assoc->ipsa_spi, addr, af, espstack->ipsecesp_netstack);
4212 		counter = DROPPER(ipss, ipds_esp_bad_auth);
4213 		goto esp_in_discard;
4214 	}
4215 
4216 	esp3dbg(espstack, ("esp_inbound_accelerated: ESP authentication "
4217 	    "succeeded, checking replay\n"));
4218 
4219 	ipsec_in->b_cont = data_mp;
4220 
4221 	/*
4222 	 * Remove ESP header and padding from packet.
4223 	 */
4224 	if (!esp_strip_header(data_mp, ii->ipsec_in_v4, assoc->ipsa_iv_len,
4225 	    &counter, espstack)) {
4226 		esp1dbg(espstack, ("esp_inbound_accelerated: "
4227 		    "esp_strip_header() failed\n"));
4228 		goto esp_in_discard;
4229 	}
4230 
4231 	freeb(hada_mp);
4232 
4233 	/*
4234 	 * Account for usage..
4235 	 */
4236 	if (!esp_age_bytes(assoc, msgdsize(data_mp), B_TRUE)) {
4237 		/* The ipsa has hit hard expiration, LOG and AUDIT. */
4238 		ESP_BUMP_STAT(espstack, bytes_expired);
4239 		IP_ESP_BUMP_STAT(ipss, in_discards);
4240 		ipsec_assocfailure(info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
4241 		    "ESP association 0x%x, dst %s had bytes expire.\n",
4242 		    assoc->ipsa_spi, assoc->ipsa_dstaddr, assoc->ipsa_addrfam,
4243 		    espstack->ipsecesp_netstack);
4244 		ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL,
4245 		    DROPPER(ipss, ipds_esp_bytes_expire),
4246 		    &espstack->esp_dropper);
4247 		return (IPSEC_STATUS_FAILED);
4248 	}
4249 
4250 	/* done processing the packet */
4251 	return (IPSEC_STATUS_SUCCESS);
4252 
4253 esp_in_discard:
4254 	IP_ESP_BUMP_STAT(ipss, in_discards);
4255 	freeb(hada_mp);
4256 
4257 	ipsec_in->b_cont = data_mp;	/* For ip_drop_packet()'s sake... */
4258 	ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL, counter,
4259 	    &espstack->esp_dropper);
4260 
4261 	return (IPSEC_STATUS_FAILED);
4262 }
4263 
4264 /*
4265  * Wrapper to allow IP to trigger an ESP association failure message
4266  * during inbound SA selection.
4267  */
4268 void
4269 ipsecesp_in_assocfailure(mblk_t *mp, char level, ushort_t sl, char *fmt,
4270     uint32_t spi, void *addr, int af, ipsecesp_stack_t *espstack)
4271 {
4272 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
4273 
4274 	if (espstack->ipsecesp_log_unknown_spi) {
4275 		ipsec_assocfailure(info.mi_idnum, 0, level, sl, fmt, spi,
4276 		    addr, af, espstack->ipsecesp_netstack);
4277 	}
4278 
4279 	ip_drop_packet(mp, B_TRUE, NULL, NULL,
4280 	    DROPPER(ipss, ipds_esp_no_sa),
4281 	    &espstack->esp_dropper);
4282 }
4283 
4284 /*
4285  * Initialize the ESP input and output processing functions.
4286  */
4287 void
4288 ipsecesp_init_funcs(ipsa_t *sa)
4289 {
4290 	if (sa->ipsa_output_func == NULL)
4291 		sa->ipsa_output_func = esp_outbound;
4292 	if (sa->ipsa_input_func == NULL)
4293 		sa->ipsa_input_func = esp_inbound;
4294 }
4295