xref: /titanic_52/usr/src/uts/common/inet/ip/ipsecesp.c (revision 5f149bca52352f45598e5563debe72ce04bd7a21)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/stropts.h>
31 #include <sys/errno.h>
32 #include <sys/strlog.h>
33 #include <sys/tihdr.h>
34 #include <sys/socket.h>
35 #include <sys/ddi.h>
36 #include <sys/sunddi.h>
37 #include <sys/kmem.h>
38 #include <sys/zone.h>
39 #include <sys/sysmacros.h>
40 #include <sys/cmn_err.h>
41 #include <sys/vtrace.h>
42 #include <sys/debug.h>
43 #include <sys/atomic.h>
44 #include <sys/strsun.h>
45 #include <sys/random.h>
46 #include <netinet/in.h>
47 #include <net/if.h>
48 #include <netinet/ip6.h>
49 #include <net/pfkeyv2.h>
50 
51 #include <inet/common.h>
52 #include <inet/mi.h>
53 #include <inet/nd.h>
54 #include <inet/ip.h>
55 #include <inet/ip_impl.h>
56 #include <inet/ip6.h>
57 #include <inet/sadb.h>
58 #include <inet/ipsec_info.h>
59 #include <inet/ipsec_impl.h>
60 #include <inet/ipsecesp.h>
61 #include <inet/ipdrop.h>
62 #include <inet/tcp.h>
63 #include <sys/kstat.h>
64 #include <sys/policy.h>
65 #include <sys/strsun.h>
66 #include <inet/udp_impl.h>
67 #include <sys/taskq.h>
68 #include <sys/note.h>
69 
70 #include <sys/iphada.h>
71 
/*
 * Table of ND variables supported by ipsecesp.  This array is the template:
 * ipsecesp_stack_init() copies it into each stack instance
 * (espstack->ipsecesp_params) and registers the copy's entries with that
 * stack's ipsecesp_g_nd through ipsecesp_param_register().
 * All of these are alterable, within the min/max values given, at run time.
 */
static	ipsecespparam_t	lcl_param_arr[] = {
	/* min	max			value	name */
	{ 0,	3,			0,	"ipsecesp_debug"},
	{ 125,	32000, SADB_AGE_INTERVAL_DEFAULT, "ipsecesp_age_interval"},
	{ 1,	10,			1,	"ipsecesp_reap_delay"},
	{ 1,	SADB_MAX_REPLAY,	64,	"ipsecesp_replay_size"},
	{ 1,	300,			15,	"ipsecesp_acquire_timeout"},
	{ 1,	1800,			90,	"ipsecesp_larval_timeout"},
	/* Default lifetime values for ACQUIRE messages. */
	{ 0,	0xffffffffU,	0,	"ipsecesp_default_soft_bytes"},
	{ 0,	0xffffffffU,	0,	"ipsecesp_default_hard_bytes"},
	{ 0,	0xffffffffU,	24000,	"ipsecesp_default_soft_addtime"},
	{ 0,	0xffffffffU,	28800,	"ipsecesp_default_hard_addtime"},
	{ 0,	0xffffffffU,	0,	"ipsecesp_default_soft_usetime"},
	{ 0,	0xffffffffU,	0,	"ipsecesp_default_hard_usetime"},
	{ 0,	1,		0,	"ipsecesp_log_unknown_spi"},
	{ 0,	2,		1,	"ipsecesp_padding_check"},
	{ 0,	600,		20,	"ipsecesp_nat_keepalive_interval"},
};
/*
 * Accessors for the per-stack copy of the table above.  They are written to
 * be used as a member reference, e.g. espstack->ipsecesp_debug, which expands
 * to espstack->ipsecesp_params[0].ipsecesp_param_value.  Each index must stay
 * in step with the position of the matching entry in lcl_param_arr[].
 */
#define	ipsecesp_debug	ipsecesp_params[0].ipsecesp_param_value
#define	ipsecesp_age_interval ipsecesp_params[1].ipsecesp_param_value
#define	ipsecesp_age_int_max	ipsecesp_params[1].ipsecesp_param_max
#define	ipsecesp_reap_delay	ipsecesp_params[2].ipsecesp_param_value
#define	ipsecesp_replay_size	ipsecesp_params[3].ipsecesp_param_value
#define	ipsecesp_acquire_timeout	\
	ipsecesp_params[4].ipsecesp_param_value
#define	ipsecesp_larval_timeout	\
	ipsecesp_params[5].ipsecesp_param_value
#define	ipsecesp_default_soft_bytes	\
	ipsecesp_params[6].ipsecesp_param_value
#define	ipsecesp_default_hard_bytes	\
	ipsecesp_params[7].ipsecesp_param_value
#define	ipsecesp_default_soft_addtime	\
	ipsecesp_params[8].ipsecesp_param_value
#define	ipsecesp_default_hard_addtime	\
	ipsecesp_params[9].ipsecesp_param_value
#define	ipsecesp_default_soft_usetime	\
	ipsecesp_params[10].ipsecesp_param_value
#define	ipsecesp_default_hard_usetime	\
	ipsecesp_params[11].ipsecesp_param_value
#define	ipsecesp_log_unknown_spi	\
	ipsecesp_params[12].ipsecesp_param_value
#define	ipsecesp_padding_check	\
	ipsecesp_params[13].ipsecesp_param_value
/* For ipsecesp_nat_keepalive_interval, see ipsecesp.h. */
122 
/*
 * Debug printing.  esp0dbg() always prints; esp1dbg(), esp2dbg() and
 * esp3dbg() print only when the stack's ipsecesp_debug value is at least
 * 1, 2 or 3 respectively.  The argument list "a" carries its own
 * parentheses, e.g. esp1dbg(espstack, ("value %d\n", v));
 *
 * The conditional forms are wrapped in do { } while (0) so that each one is
 * a single statement and cannot capture an "else" that follows it, and the
 * stack argument is parenthesized so any pointer expression may be passed.
 */
#define	esp0dbg(a)	printf a
/* NOTE:  != 0 instead of > 0 so lint doesn't complain. */
#define	esp1dbg(espstack, a)						\
do {									\
	if ((espstack)->ipsecesp_debug != 0)				\
		printf a;						\
_NOTE(CONSTCOND)							\
} while (0)

#define	esp2dbg(espstack, a)						\
do {									\
	if ((espstack)->ipsecesp_debug > 1)				\
		printf a;						\
_NOTE(CONSTCOND)							\
} while (0)

#define	esp3dbg(espstack, a)						\
do {									\
	if ((espstack)->ipsecesp_debug > 2)				\
		printf a;						\
_NOTE(CONSTCOND)							\
} while (0)
128 
/* STREAMS entry points; referenced by the rinit/winit qinit structures. */
static int ipsecesp_open(queue_t *, dev_t *, int, int, cred_t *);
static int ipsecesp_close(queue_t *);
static void ipsecesp_rput(queue_t *, mblk_t *);
static void ipsecesp_wput(queue_t *, mblk_t *);
/* Per-netstack create/destroy hooks, registered in ipsecesp_ddi_init(). */
static void	*ipsecesp_stack_init(netstackid_t stackid, netstack_t *ns);
static void	ipsecesp_stack_fini(netstackid_t stackid, void *arg);
/* Installed as esp_sadb.s_acqfn by ipsecesp_stack_init(). */
static void esp_send_acquire(ipsacq_t *, mblk_t *, netstack_t *);

static void esp_prepare_udp(netstack_t *, mblk_t *, ipha_t *);
static ipsec_status_t esp_outbound_accelerated(mblk_t *, uint_t);
static ipsec_status_t esp_inbound_accelerated(mblk_t *, mblk_t *,
    boolean_t, ipsa_t *);

static boolean_t esp_register_out(uint32_t, uint32_t, uint_t,
    ipsecesp_stack_t *);
static boolean_t esp_strip_header(mblk_t *, boolean_t, uint32_t,
    kstat_named_t **, ipsecesp_stack_t *);
static ipsec_status_t esp_submit_req_inbound(mblk_t *, ipsa_t *, uint_t);
static ipsec_status_t esp_submit_req_outbound(mblk_t *, ipsa_t *, uchar_t *,
    uint_t);
/*
 * Settable in /etc/system.  Passed to sadbp_init() as the hash size when a
 * stack instance is created.
 */
uint32_t esp_hash_size = IPSEC_DEFAULT_HASH_SIZE;

/*
 * STREAMS module_info: id number, name, min/max packet size, high/low water
 * marks.  info.mi_idnum is also handed to sadb_retimeout() and
 * ipsec_rl_strlog() elsewhere in this file.
 */
static struct module_info info = {
	5137, "ipsecesp", 0, INFPSZ, 65536, 1024
};

/* Read side: put procedure only, no service procedure. */
static struct qinit rinit = {
	(pfi_t)ipsecesp_rput, NULL, ipsecesp_open, ipsecesp_close, NULL, &info,
	NULL
};

/* Write side: put procedure only, no service procedure. */
static struct qinit winit = {
	(pfi_t)ipsecesp_wput, NULL, ipsecesp_open, ipsecesp_close, NULL, &info,
	NULL
};

/* Module streamtab; no multiplexor (lower) queues. */
struct streamtab ipsecespinfo = {
	&rinit, &winit, NULL, NULL
};

/* Created in ipsecesp_ddi_init(), destroyed in ipsecesp_ddi_destroy(). */
static taskq_t *esp_taskq;
171 
172 /*
173  * OTOH, this one is set at open/close, and I'm D_MTQPAIR for now.
174  *
175  * Question:	Do I need this, given that all instance's esps->esps_wq point
176  *		to IP?
177  *
178  * Answer:	Yes, because I need to know which queue is BOUND to
179  *		IPPROTO_ESP
180  */
181 
/*
 * Stats.  This may eventually become a full-blown SNMP MIB once that spec
 * stabilizes.
 *
 * Every member is a named kstat; esp_kstat_init() sizes the kstat as
 * sizeof (esp_kstats_t) / sizeof (kstat_named_t), so this structure must
 * contain nothing but kstat_named_t members, and each member needs a
 * matching kstat_named_init() call there.
 */

typedef struct esp_kstats_s {
	/* num_aalgs and num_ealgs are refreshed by esp_kstat_update(). */
	kstat_named_t esp_stat_num_aalgs;
	kstat_named_t esp_stat_good_auth;
	kstat_named_t esp_stat_bad_auth;
	kstat_named_t esp_stat_bad_padding;
	kstat_named_t esp_stat_replay_failures;
	kstat_named_t esp_stat_replay_early_failures;
	kstat_named_t esp_stat_keysock_in;
	kstat_named_t esp_stat_out_requests;
	kstat_named_t esp_stat_acquire_requests;
	kstat_named_t esp_stat_bytes_expired;
	kstat_named_t esp_stat_out_discards;
	kstat_named_t esp_stat_in_accelerated;
	kstat_named_t esp_stat_out_accelerated;
	kstat_named_t esp_stat_noaccel;
	kstat_named_t esp_stat_crypto_sync;
	kstat_named_t esp_stat_crypto_async;
	kstat_named_t esp_stat_crypto_failures;
	kstat_named_t esp_stat_num_ealgs;
	kstat_named_t esp_stat_bad_decrypt;
} esp_kstats_t;
208 
/*
 * espstack->esp_kstats is equal to espstack->esp_ksp->ks_data if
 * kstat_create_netstack for espstack->esp_ksp succeeds, but when it
 * fails, it will be NULL. Note this is done for all stack instances,
 * so it *could* fail. Hence ESP_BUMP_STAT and ESP_DEBUMP_STAT check for
 * NULL before touching a counter.
 *
 * espstack may be any expression yielding an ipsecesp_stack_t pointer (it
 * is evaluated more than once, so it must have no side effects); x is the
 * counter name without its esp_stat_ prefix.  The increment/decrement is a
 * plain, non-atomic update of the ui64 value.
 */
#define	ESP_BUMP_STAT(espstack, x)					\
do {									\
	if ((espstack)->esp_kstats != NULL)				\
		((espstack)->esp_kstats->esp_stat_ ## x).value.ui64++;	\
_NOTE(CONSTCOND)							\
} while (0)

#define	ESP_DEBUMP_STAT(espstack, x)					\
do {									\
	if ((espstack)->esp_kstats != NULL)				\
		((espstack)->esp_kstats->esp_stat_ ## x).value.ui64--;	\
_NOTE(CONSTCOND)							\
} while (0)
229 
230 static int	esp_kstat_update(kstat_t *, int);
231 
232 static boolean_t
233 esp_kstat_init(ipsecesp_stack_t *espstack, netstackid_t stackid)
234 {
235 	espstack->esp_ksp = kstat_create_netstack("ipsecesp", 0, "esp_stat",
236 	    "net", KSTAT_TYPE_NAMED,
237 	    sizeof (esp_kstats_t) / sizeof (kstat_named_t),
238 	    KSTAT_FLAG_PERSISTENT, stackid);
239 
240 	if (espstack->esp_ksp == NULL || espstack->esp_ksp->ks_data == NULL)
241 		return (B_FALSE);
242 
243 	espstack->esp_kstats = espstack->esp_ksp->ks_data;
244 
245 	espstack->esp_ksp->ks_update = esp_kstat_update;
246 	espstack->esp_ksp->ks_private = (void *)(uintptr_t)stackid;
247 
248 #define	K64 KSTAT_DATA_UINT64
249 #define	KI(x) kstat_named_init(&(espstack->esp_kstats->esp_stat_##x), #x, K64)
250 
251 	KI(num_aalgs);
252 	KI(num_ealgs);
253 	KI(good_auth);
254 	KI(bad_auth);
255 	KI(bad_padding);
256 	KI(replay_failures);
257 	KI(replay_early_failures);
258 	KI(keysock_in);
259 	KI(out_requests);
260 	KI(acquire_requests);
261 	KI(bytes_expired);
262 	KI(out_discards);
263 	KI(in_accelerated);
264 	KI(out_accelerated);
265 	KI(noaccel);
266 	KI(crypto_sync);
267 	KI(crypto_async);
268 	KI(crypto_failures);
269 	KI(bad_decrypt);
270 
271 #undef KI
272 #undef K64
273 
274 	kstat_install(espstack->esp_ksp);
275 
276 	return (B_TRUE);
277 }
278 
279 static int
280 esp_kstat_update(kstat_t *kp, int rw)
281 {
282 	esp_kstats_t *ekp;
283 	netstackid_t	stackid = (zoneid_t)(uintptr_t)kp->ks_private;
284 	netstack_t	*ns;
285 	ipsec_stack_t	*ipss;
286 
287 	if ((kp == NULL) || (kp->ks_data == NULL))
288 		return (EIO);
289 
290 	if (rw == KSTAT_WRITE)
291 		return (EACCES);
292 
293 	ns = netstack_find_by_stackid(stackid);
294 	if (ns == NULL)
295 		return (-1);
296 	ipss = ns->netstack_ipsec;
297 	if (ipss == NULL) {
298 		netstack_rele(ns);
299 		return (-1);
300 	}
301 	ekp = (esp_kstats_t *)kp->ks_data;
302 
303 	mutex_enter(&ipss->ipsec_alg_lock);
304 	ekp->esp_stat_num_aalgs.value.ui64 =
305 	    ipss->ipsec_nalgs[IPSEC_ALG_AUTH];
306 	ekp->esp_stat_num_ealgs.value.ui64 =
307 	    ipss->ipsec_nalgs[IPSEC_ALG_ENCR];
308 	mutex_exit(&ipss->ipsec_alg_lock);
309 
310 	netstack_rele(ns);
311 	return (0);
312 }
313 
314 #ifdef DEBUG
315 /*
316  * Debug routine, useful to see pre-encryption data.
317  */
318 static char *
319 dump_msg(mblk_t *mp)
320 {
321 	char tmp_str[3], tmp_line[256];
322 
323 	while (mp != NULL) {
324 		unsigned char *ptr;
325 
326 		printf("mblk address 0x%p, length %ld, db_ref %d "
327 		    "type %d, base 0x%p, lim 0x%p\n",
328 		    (void *) mp, (long)(mp->b_wptr - mp->b_rptr),
329 		    mp->b_datap->db_ref, mp->b_datap->db_type,
330 		    (void *)mp->b_datap->db_base, (void *)mp->b_datap->db_lim);
331 		ptr = mp->b_rptr;
332 
333 		tmp_line[0] = '\0';
334 		while (ptr < mp->b_wptr) {
335 			uint_t diff;
336 
337 			diff = (ptr - mp->b_rptr);
338 			if (!(diff & 0x1f)) {
339 				if (strlen(tmp_line) > 0) {
340 					printf("bytes: %s\n", tmp_line);
341 					tmp_line[0] = '\0';
342 				}
343 			}
344 			if (!(diff & 0x3))
345 				(void) strcat(tmp_line, " ");
346 			(void) sprintf(tmp_str, "%02x", *ptr);
347 			(void) strcat(tmp_line, tmp_str);
348 			ptr++;
349 		}
350 		if (strlen(tmp_line) > 0)
351 			printf("bytes: %s\n", tmp_line);
352 
353 		mp = mp->b_cont;
354 	}
355 
356 	return ("\n");
357 }
358 
359 #else /* DEBUG */
360 static char *
361 dump_msg(mblk_t *mp)
362 {
363 	printf("Find value of mp %p.\n", mp);
364 	return ("\n");
365 }
366 #endif /* DEBUG */
367 
368 /*
369  * Don't have to lock age_interval, as only one thread will access it at
370  * a time, because I control the one function that does with timeout().
371  */
372 static void
373 esp_ager(void *arg)
374 {
375 	ipsecesp_stack_t *espstack = (ipsecesp_stack_t *)arg;
376 	netstack_t	*ns = espstack->ipsecesp_netstack;
377 	hrtime_t begin = gethrtime();
378 
379 	sadb_ager(&espstack->esp_sadb.s_v4, espstack->esp_pfkey_q,
380 	    espstack->esp_sadb.s_ip_q, espstack->ipsecesp_reap_delay, ns);
381 	sadb_ager(&espstack->esp_sadb.s_v6, espstack->esp_pfkey_q,
382 	    espstack->esp_sadb.s_ip_q, espstack->ipsecesp_reap_delay, ns);
383 
384 	espstack->esp_event = sadb_retimeout(begin, espstack->esp_pfkey_q,
385 	    esp_ager, espstack,
386 	    &espstack->ipsecesp_age_interval, espstack->ipsecesp_age_int_max,
387 	    info.mi_idnum);
388 }
389 
390 /*
391  * Get an ESP NDD parameter.
392  */
393 /* ARGSUSED */
394 static int
395 ipsecesp_param_get(q, mp, cp, cr)
396 	queue_t	*q;
397 	mblk_t	*mp;
398 	caddr_t	cp;
399 	cred_t *cr;
400 {
401 	ipsecespparam_t	*ipsecesppa = (ipsecespparam_t *)cp;
402 	uint_t value;
403 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
404 
405 	mutex_enter(&espstack->ipsecesp_param_lock);
406 	value = ipsecesppa->ipsecesp_param_value;
407 	mutex_exit(&espstack->ipsecesp_param_lock);
408 
409 	(void) mi_mpprintf(mp, "%u", value);
410 	return (0);
411 }
412 
413 /*
414  * This routine sets an NDD variable in a ipsecespparam_t structure.
415  */
416 /* ARGSUSED */
417 static int
418 ipsecesp_param_set(q, mp, value, cp, cr)
419 	queue_t	*q;
420 	mblk_t	*mp;
421 	char	*value;
422 	caddr_t	cp;
423 	cred_t *cr;
424 {
425 	ulong_t	new_value;
426 	ipsecespparam_t	*ipsecesppa = (ipsecespparam_t *)cp;
427 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
428 
429 	/*
430 	 * Fail the request if the new value does not lie within the
431 	 * required bounds.
432 	 */
433 	if (ddi_strtoul(value, NULL, 10, &new_value) != 0 ||
434 	    new_value < ipsecesppa->ipsecesp_param_min ||
435 	    new_value > ipsecesppa->ipsecesp_param_max) {
436 		return (EINVAL);
437 	}
438 
439 	/* Set the new value */
440 	mutex_enter(&espstack->ipsecesp_param_lock);
441 	ipsecesppa->ipsecesp_param_value = new_value;
442 	mutex_exit(&espstack->ipsecesp_param_lock);
443 	return (0);
444 }
445 
446 /*
447  * Using lifetime NDD variables, fill in an extended combination's
448  * lifetime information.
449  */
450 void
451 ipsecesp_fill_defs(sadb_x_ecomb_t *ecomb, netstack_t *ns)
452 {
453 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
454 
455 	ecomb->sadb_x_ecomb_soft_bytes = espstack->ipsecesp_default_soft_bytes;
456 	ecomb->sadb_x_ecomb_hard_bytes = espstack->ipsecesp_default_hard_bytes;
457 	ecomb->sadb_x_ecomb_soft_addtime =
458 	    espstack->ipsecesp_default_soft_addtime;
459 	ecomb->sadb_x_ecomb_hard_addtime =
460 	    espstack->ipsecesp_default_hard_addtime;
461 	ecomb->sadb_x_ecomb_soft_usetime =
462 	    espstack->ipsecesp_default_soft_usetime;
463 	ecomb->sadb_x_ecomb_hard_usetime =
464 	    espstack->ipsecesp_default_hard_usetime;
465 }
466 
467 /*
468  * Initialize things for ESP at module load time.
469  */
470 boolean_t
471 ipsecesp_ddi_init(void)
472 {
473 	esp_taskq = taskq_create("esp_taskq", 1, minclsyspri,
474 	    IPSEC_TASKQ_MIN, IPSEC_TASKQ_MAX, 0);
475 
476 	/*
477 	 * We want to be informed each time a stack is created or
478 	 * destroyed in the kernel, so we can maintain the
479 	 * set of ipsecesp_stack_t's.
480 	 */
481 	netstack_register(NS_IPSECESP, ipsecesp_stack_init, NULL,
482 	    ipsecesp_stack_fini);
483 
484 	return (B_TRUE);
485 }
486 
487 /*
488  * Walk through the param array specified registering each element with the
489  * named dispatch handler.
490  */
491 static boolean_t
492 ipsecesp_param_register(IDP *ndp, ipsecespparam_t *espp, int cnt)
493 {
494 	for (; cnt-- > 0; espp++) {
495 		if (espp->ipsecesp_param_name != NULL &&
496 		    espp->ipsecesp_param_name[0]) {
497 			if (!nd_load(ndp,
498 			    espp->ipsecesp_param_name,
499 			    ipsecesp_param_get, ipsecesp_param_set,
500 			    (caddr_t)espp)) {
501 				nd_free(ndp);
502 				return (B_FALSE);
503 			}
504 		}
505 	}
506 	return (B_TRUE);
507 }
508 /*
509  * Initialize things for ESP for each stack instance
510  */
511 static void *
512 ipsecesp_stack_init(netstackid_t stackid, netstack_t *ns)
513 {
514 	ipsecesp_stack_t	*espstack;
515 	ipsecespparam_t		*espp;
516 
517 	espstack = (ipsecesp_stack_t *)kmem_zalloc(sizeof (*espstack),
518 	    KM_SLEEP);
519 	espstack->ipsecesp_netstack = ns;
520 
521 	espp = (ipsecespparam_t *)kmem_alloc(sizeof (lcl_param_arr), KM_SLEEP);
522 	espstack->ipsecesp_params = espp;
523 	bcopy(lcl_param_arr, espp, sizeof (lcl_param_arr));
524 
525 	(void) ipsecesp_param_register(&espstack->ipsecesp_g_nd, espp,
526 	    A_CNT(lcl_param_arr));
527 
528 	(void) esp_kstat_init(espstack, stackid);
529 
530 	espstack->esp_sadb.s_acquire_timeout =
531 	    &espstack->ipsecesp_acquire_timeout;
532 	espstack->esp_sadb.s_acqfn = esp_send_acquire;
533 	sadbp_init("ESP", &espstack->esp_sadb, SADB_SATYPE_ESP, esp_hash_size,
534 	    espstack->ipsecesp_netstack);
535 
536 	mutex_init(&espstack->ipsecesp_param_lock, NULL, MUTEX_DEFAULT, 0);
537 
538 	ip_drop_register(&espstack->esp_dropper, "IPsec ESP");
539 	return (espstack);
540 }
541 
542 /*
543  * Destroy things for ESP at module unload time.
544  */
545 void
546 ipsecesp_ddi_destroy(void)
547 {
548 	netstack_unregister(NS_IPSECESP);
549 	taskq_destroy(esp_taskq);
550 }
551 
552 /*
553  * Destroy things for ESP for one stack instance
554  */
555 static void
556 ipsecesp_stack_fini(netstackid_t stackid, void *arg)
557 {
558 	ipsecesp_stack_t *espstack = (ipsecesp_stack_t *)arg;
559 
560 	if (espstack->esp_pfkey_q != NULL) {
561 		(void) quntimeout(espstack->esp_pfkey_q, espstack->esp_event);
562 	}
563 	espstack->esp_sadb.s_acqfn = NULL;
564 	espstack->esp_sadb.s_acquire_timeout = NULL;
565 	sadbp_destroy(&espstack->esp_sadb, espstack->ipsecesp_netstack);
566 	ip_drop_unregister(&espstack->esp_dropper);
567 	mutex_destroy(&espstack->ipsecesp_param_lock);
568 	nd_free(&espstack->ipsecesp_g_nd);
569 
570 	kmem_free(espstack->ipsecesp_params, sizeof (lcl_param_arr));
571 	espstack->ipsecesp_params = NULL;
572 	kstat_delete_netstack(espstack->esp_ksp, stackid);
573 	espstack->esp_ksp = NULL;
574 	espstack->esp_kstats = NULL;
575 	kmem_free(espstack, sizeof (*espstack));
576 }
577 
578 /*
579  * ESP module open routine.
580  */
581 /* ARGSUSED */
582 static int
583 ipsecesp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
584 {
585 	netstack_t		*ns;
586 	ipsecesp_stack_t	*espstack;
587 
588 	if (secpolicy_ip_config(credp, B_FALSE) != 0) {
589 		esp0dbg(("Non-privileged user trying to open ipsecesp.\n"));
590 		return (EPERM);
591 	}
592 
593 	if (q->q_ptr != NULL)
594 		return (0);  /* Re-open of an already open instance. */
595 
596 	if (sflag != MODOPEN)
597 		return (EINVAL);
598 
599 	ns = netstack_find_by_cred(credp);
600 	ASSERT(ns != NULL);
601 	espstack = ns->netstack_ipsecesp;
602 	ASSERT(espstack != NULL);
603 
604 	/*
605 	 * ASSUMPTIONS (because I'm MT_OCEXCL):
606 	 *
607 	 *	* I'm being pushed on top of IP for all my opens (incl. #1).
608 	 *	* Only ipsecesp_open() can write into esp_sadb.s_ip_q.
609 	 *	* Because of this, I can check lazily for esp_sadb.s_ip_q.
610 	 *
611 	 *  If these assumptions are wrong, I'm in BIG trouble...
612 	 */
613 
614 	q->q_ptr = espstack;
615 	WR(q)->q_ptr = q->q_ptr;
616 
617 	if (espstack->esp_sadb.s_ip_q == NULL) {
618 		struct T_unbind_req *tur;
619 
620 		espstack->esp_sadb.s_ip_q = WR(q);
621 		/* Allocate an unbind... */
622 		espstack->esp_ip_unbind = allocb(sizeof (struct T_unbind_req),
623 		    BPRI_HI);
624 
625 		/*
626 		 * Send down T_BIND_REQ to bind IPPROTO_ESP.
627 		 * Handle the ACK here in ESP.
628 		 */
629 		qprocson(q);
630 		if (espstack->esp_ip_unbind == NULL ||
631 		    !sadb_t_bind_req(espstack->esp_sadb.s_ip_q, IPPROTO_ESP)) {
632 			if (espstack->esp_ip_unbind != NULL) {
633 				freeb(espstack->esp_ip_unbind);
634 				espstack->esp_ip_unbind = NULL;
635 			}
636 			q->q_ptr = NULL;
637 			netstack_rele(espstack->ipsecesp_netstack);
638 			return (ENOMEM);
639 		}
640 
641 		espstack->esp_ip_unbind->b_datap->db_type = M_PROTO;
642 		tur = (struct T_unbind_req *)espstack->esp_ip_unbind->b_rptr;
643 		tur->PRIM_type = T_UNBIND_REQ;
644 	} else {
645 		qprocson(q);
646 	}
647 
648 	/*
649 	 * For now, there's not much I can do.  I'll be getting a message
650 	 * passed down to me from keysock (in my wput), and a T_BIND_ACK
651 	 * up from IP (in my rput).
652 	 */
653 
654 	return (0);
655 }
656 
/*
 * ESP module close routine.
 *
 * q - read queue of the instance being closed; q_ptr is its
 *     ipsecesp_stack_t.
 *
 * Unbinds from IP if this instance is the stack's s_ip_q, turns the queue's
 * procedures off, drops the keysock queue and its ager timeout if this
 * instance is the pfkey queue, tries to move s_ip_q to the pfkey queue, and
 * finally releases the netstack hold taken in ipsecesp_open().
 *
 * Always returns 0.
 */
static int
ipsecesp_close(queue_t *q)
{
	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;

	/*
	 * If esp_sadb.s_ip_q is attached to this instance, send a
	 * T_UNBIND_REQ to IP for the instance before doing
	 * a qprocsoff().
	 */
	if (WR(q) == espstack->esp_sadb.s_ip_q &&
	    espstack->esp_ip_unbind != NULL) {
		/* putnext() hands the mblk over; forget our pointer to it. */
		putnext(WR(q), espstack->esp_ip_unbind);
		espstack->esp_ip_unbind = NULL;
	}

	/*
	 * Clean up q_ptr, if needed.
	 */
	qprocsoff(q);

	/* Keysock queue check is safe, because of OCEXCL perimeter. */

	if (q == espstack->esp_pfkey_q) {
		esp1dbg(espstack,
		    ("ipsecesp_close:  Ummm... keysock is closing ESP.\n"));
		espstack->esp_pfkey_q = NULL;
		/* Detach qtimeouts. */
		(void) quntimeout(q, espstack->esp_event);
	}

	if (WR(q) == espstack->esp_sadb.s_ip_q) {
		/*
		 * If the esp_sadb.s_ip_q is attached to this instance, find
		 * another.  The OCEXCL outer perimeter helps us here.
		 */
		espstack->esp_sadb.s_ip_q = NULL;

		/*
		 * Find a replacement queue for esp_sadb.s_ip_q.
		 */
		if (espstack->esp_pfkey_q != NULL &&
		    espstack->esp_pfkey_q != RD(q)) {
			/*
			 * See if we can use the pfkey_q.
			 */
			espstack->esp_sadb.s_ip_q = WR(espstack->esp_pfkey_q);
		}

		/* No candidate, or the re-bind failed: run without an ip_q. */
		if (espstack->esp_sadb.s_ip_q == NULL ||
		    !sadb_t_bind_req(espstack->esp_sadb.s_ip_q, IPPROTO_ESP)) {
			esp1dbg(espstack, ("ipsecesp: Can't reassign ip_q.\n"));
			espstack->esp_sadb.s_ip_q = NULL;
		} else {
			/* Preallocate the unbind for the new s_ip_q's close. */
			espstack->esp_ip_unbind =
			    allocb(sizeof (struct T_unbind_req), BPRI_HI);

			if (espstack->esp_ip_unbind != NULL) {
				struct T_unbind_req *tur;

				/*
				 * NOTE(review): as in ipsecesp_open(),
				 * b_wptr is left at b_rptr -- confirm IP
				 * accepts a zero-length T_UNBIND_REQ.
				 */
				espstack->esp_ip_unbind->b_datap->db_type =
				    M_PROTO;
				tur = (struct T_unbind_req *)
				    espstack->esp_ip_unbind->b_rptr;
				tur->PRIM_type = T_UNBIND_REQ;
			}
			/* If it's NULL, I can't do much here. */
		}
	}

	/* Drop the hold taken by netstack_find_by_cred() at open time. */
	netstack_rele(espstack->ipsecesp_netstack);
	return (0);
}
733 
/*
 * Add a number of bytes to what the SA has protected so far.  Return
 * B_TRUE if the SA can still protect that many bytes.
 *
 * assoc   - the SA the bytes were protected with.
 * bytes   - number of bytes to charge.
 * inbound - B_TRUE if assoc is the inbound SA, B_FALSE if outbound.
 *
 * If assoc has a peer (ipsa_haspeer), the peer is looked up in the opposite
 * direction's hash bucket and charged too, and the result is B_TRUE only if
 * both can still protect the bytes.  If the peer cannot be found, only
 * assoc is charged.
 *
 * Caller must REFRELE the passed-in assoc.  This function must REFRELE
 * any obtained peer SA.
 */
static boolean_t
esp_age_bytes(ipsa_t *assoc, uint64_t bytes, boolean_t inbound)
{
	ipsa_t *inassoc, *outassoc;
	isaf_t *bucket;
	boolean_t inrc, outrc, isv6;
	sadb_t *sp;
	int outhash;
	netstack_t		*ns = assoc->ipsa_netstack;
	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;

	/* No peer?  No problem! */
	if (!assoc->ipsa_haspeer) {
		return (sadb_age_bytes(espstack->esp_pfkey_q, assoc, bytes,
		    B_TRUE));
	}

	/*
	 * Otherwise, we want to grab both the original assoc and its peer.
	 * There might be a race for this, but if it's a real race, two
	 * expire messages may occur.  We limit this by only sending the
	 * expire message on one of the peers, we'll pick the inbound
	 * arbitrarily.
	 *
	 * If we need tight synchronization on the peer SA, then we need to
	 * reconsider.
	 */

	/* Use the SA's address family to select IPv6/IPv4 */
	isv6 = (assoc->ipsa_addrfam == AF_INET6);
	sp = isv6 ? &espstack->esp_sadb.s_v6 : &espstack->esp_sadb.s_v4;

	if (inbound) {
		inassoc = assoc;
		/* The outbound table is hashed on the destination address. */
		if (isv6) {
			outhash = OUTBOUND_HASH_V6(sp, *((in6_addr_t *)
			    &inassoc->ipsa_dstaddr));
		} else {
			outhash = OUTBOUND_HASH_V4(sp, *((ipaddr_t *)
			    &inassoc->ipsa_dstaddr));
		}
		bucket = &sp->sdb_of[outhash];
		mutex_enter(&bucket->isaf_lock);
		outassoc = ipsec_getassocbyspi(bucket, inassoc->ipsa_spi,
		    inassoc->ipsa_srcaddr, inassoc->ipsa_dstaddr,
		    inassoc->ipsa_addrfam);
		mutex_exit(&bucket->isaf_lock);
		if (outassoc == NULL) {
			/* Q: Do we wish to set haspeer == B_FALSE? */
			esp0dbg(("esp_age_bytes: "
			    "can't find peer for inbound.\n"));
			return (sadb_age_bytes(espstack->esp_pfkey_q, inassoc,
			    bytes, B_TRUE));
		}
	} else {
		outassoc = assoc;
		/* The inbound table is bucketed by SPI. */
		bucket = INBOUND_BUCKET(sp, outassoc->ipsa_spi);
		mutex_enter(&bucket->isaf_lock);
		inassoc = ipsec_getassocbyspi(bucket, outassoc->ipsa_spi,
		    outassoc->ipsa_srcaddr, outassoc->ipsa_dstaddr,
		    outassoc->ipsa_addrfam);
		mutex_exit(&bucket->isaf_lock);
		if (inassoc == NULL) {
			/* Q: Do we wish to set haspeer == B_FALSE? */
			esp0dbg(("esp_age_bytes: "
			    "can't find peer for outbound.\n"));
			return (sadb_age_bytes(espstack->esp_pfkey_q, outassoc,
			    bytes, B_TRUE));
		}
	}

	/*
	 * Charge both SAs.  The last argument differs: B_TRUE for the
	 * inbound SA and B_FALSE for the outbound one, matching the "expire
	 * message on one of the peers only" rule described above.
	 */
	inrc = sadb_age_bytes(espstack->esp_pfkey_q, inassoc, bytes, B_TRUE);
	outrc = sadb_age_bytes(espstack->esp_pfkey_q, outassoc, bytes, B_FALSE);

	/*
	 * REFRELE any peer SA.
	 *
	 * Because of the multi-line macro nature of IPSA_REFRELE, keep
	 * them in { }.
	 */
	if (inbound) {
		IPSA_REFRELE(outassoc);
	} else {
		IPSA_REFRELE(inassoc);
	}

	return (inrc && outrc);
}
829 
830 /*
831  * Do incoming NAT-T manipulations for packet.
832  */
833 static ipsec_status_t
834 esp_fix_natt_checksums(mblk_t *data_mp, ipsa_t *assoc)
835 {
836 	ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
837 	tcpha_t *tcph;
838 	udpha_t *udpha;
839 	/* Initialize to our inbound cksum adjustment... */
840 	uint32_t sum = assoc->ipsa_inbound_cksum;
841 
842 	switch (ipha->ipha_protocol) {
843 	case IPPROTO_TCP:
844 		tcph = (tcpha_t *)(data_mp->b_rptr +
845 		    IPH_HDR_LENGTH(ipha));
846 
847 #define	DOWN_SUM(x) (x) = ((x) & 0xFFFF) +	 ((x) >> 16)
848 		sum += ~ntohs(tcph->tha_sum) & 0xFFFF;
849 		DOWN_SUM(sum);
850 		DOWN_SUM(sum);
851 		tcph->tha_sum = ~htons(sum);
852 		break;
853 	case IPPROTO_UDP:
854 		udpha = (udpha_t *)(data_mp->b_rptr + IPH_HDR_LENGTH(ipha));
855 
856 		if (udpha->uha_checksum != 0) {
857 			/* Adujst if the inbound one was not zero. */
858 			sum += ~ntohs(udpha->uha_checksum) & 0xFFFF;
859 			DOWN_SUM(sum);
860 			DOWN_SUM(sum);
861 			udpha->uha_checksum = ~htons(sum);
862 			if (udpha->uha_checksum == 0)
863 				udpha->uha_checksum = 0xFFFF;
864 		}
865 #undef DOWN_SUM
866 		break;
867 	case IPPROTO_IP:
868 		/*
869 		 * This case is only an issue for self-encapsulated
870 		 * packets.  So for now, fall through.
871 		 */
872 		break;
873 	}
874 	return (IPSEC_STATUS_SUCCESS);
875 }
876 
877 
878 /*
879  * Strip ESP header, check padding, and fix IP header.
 * Returns B_TRUE on success, B_FALSE if an error occurred.
881  */
882 static boolean_t
883 esp_strip_header(mblk_t *data_mp, boolean_t isv4, uint32_t ivlen,
884     kstat_named_t **counter, ipsecesp_stack_t *espstack)
885 {
886 	ipha_t *ipha;
887 	ip6_t *ip6h;
888 	uint_t divpoint;
889 	mblk_t *scratch;
890 	uint8_t nexthdr, padlen;
891 	uint8_t lastpad;
892 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
893 	uint8_t *lastbyte;
894 
895 	/*
896 	 * Strip ESP data and fix IP header.
897 	 *
898 	 * XXX In case the beginning of esp_inbound() changes to not do a
899 	 * pullup, this part of the code can remain unchanged.
900 	 */
901 	if (isv4) {
902 		ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (ipha_t));
903 		ipha = (ipha_t *)data_mp->b_rptr;
904 		ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (esph_t) +
905 		    IPH_HDR_LENGTH(ipha));
906 		divpoint = IPH_HDR_LENGTH(ipha);
907 	} else {
908 		ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (ip6_t));
909 		ip6h = (ip6_t *)data_mp->b_rptr;
910 		divpoint = ip_hdr_length_v6(data_mp, ip6h);
911 	}
912 
913 	scratch = data_mp;
914 	while (scratch->b_cont != NULL)
915 		scratch = scratch->b_cont;
916 
917 	ASSERT((scratch->b_wptr - scratch->b_rptr) >= 3);
918 
919 	/*
920 	 * "Next header" and padding length are the last two bytes in the
921 	 * ESP-protected datagram, thus the explicit - 1 and - 2.
922 	 * lastpad is the last byte of the padding, which can be used for
923 	 * a quick check to see if the padding is correct.
924 	 */
925 	lastbyte = scratch->b_wptr - 1;
926 	nexthdr = *lastbyte--;
927 	padlen = *lastbyte--;
928 
929 	if (isv4) {
930 		/* Fix part of the IP header. */
931 		ipha->ipha_protocol = nexthdr;
932 		/*
933 		 * Reality check the padlen.  The explicit - 2 is for the
934 		 * padding length and the next-header bytes.
935 		 */
936 		if (padlen >= ntohs(ipha->ipha_length) - sizeof (ipha_t) - 2 -
937 		    sizeof (esph_t) - ivlen) {
938 			ESP_BUMP_STAT(espstack, bad_decrypt);
939 			ipsec_rl_strlog(espstack->ipsecesp_netstack,
940 			    info.mi_idnum, 0, 0,
941 			    SL_ERROR | SL_WARN,
942 			    "Corrupt ESP packet (padlen too big).\n");
943 			esp1dbg(espstack, ("padlen (%d) is greater than:\n",
944 			    padlen));
945 			esp1dbg(espstack, ("pkt len(%d) - ip hdr - esp "
946 			    "hdr - ivlen(%d) = %d.\n",
947 			    ntohs(ipha->ipha_length), ivlen,
948 			    (int)(ntohs(ipha->ipha_length) - sizeof (ipha_t) -
949 			    2 - sizeof (esph_t) - ivlen)));
950 			*counter = DROPPER(ipss, ipds_esp_bad_padlen);
951 			return (B_FALSE);
952 		}
953 
954 		/*
955 		 * Fix the rest of the header.  The explicit - 2 is for the
956 		 * padding length and the next-header bytes.
957 		 */
958 		ipha->ipha_length = htons(ntohs(ipha->ipha_length) - padlen -
959 		    2 - sizeof (esph_t) - ivlen);
960 		ipha->ipha_hdr_checksum = 0;
961 		ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
962 	} else {
963 		if (ip6h->ip6_nxt == IPPROTO_ESP) {
964 			ip6h->ip6_nxt = nexthdr;
965 		} else {
966 			ip6_pkt_t ipp;
967 
968 			bzero(&ipp, sizeof (ipp));
969 			(void) ip_find_hdr_v6(data_mp, ip6h, &ipp, NULL);
970 			if (ipp.ipp_dstopts != NULL) {
971 				ipp.ipp_dstopts->ip6d_nxt = nexthdr;
972 			} else if (ipp.ipp_rthdr != NULL) {
973 				ipp.ipp_rthdr->ip6r_nxt = nexthdr;
974 			} else if (ipp.ipp_hopopts != NULL) {
975 				ipp.ipp_hopopts->ip6h_nxt = nexthdr;
976 			} else {
977 				/* Panic a DEBUG kernel. */
978 				ASSERT(ipp.ipp_hopopts != NULL);
979 				/* Otherwise, pretend it's IP + ESP. */
980 				cmn_err(CE_WARN, "ESP IPv6 headers wrong.\n");
981 				ip6h->ip6_nxt = nexthdr;
982 			}
983 		}
984 
985 		if (padlen >= ntohs(ip6h->ip6_plen) - 2 - sizeof (esph_t) -
986 		    ivlen) {
987 			ESP_BUMP_STAT(espstack, bad_decrypt);
988 			ipsec_rl_strlog(espstack->ipsecesp_netstack,
989 			    info.mi_idnum, 0, 0,
990 			    SL_ERROR | SL_WARN,
991 			    "Corrupt ESP packet (v6 padlen too big).\n");
992 			esp1dbg(espstack, ("padlen (%d) is greater than:\n",
993 			    padlen));
994 			esp1dbg(espstack,
995 			    ("pkt len(%u) - ip hdr - esp hdr - ivlen(%d) = "
996 			    "%u.\n", (unsigned)(ntohs(ip6h->ip6_plen)
997 			    + sizeof (ip6_t)), ivlen,
998 			    (unsigned)(ntohs(ip6h->ip6_plen) - 2 -
999 			    sizeof (esph_t) - ivlen)));
1000 			*counter = DROPPER(ipss, ipds_esp_bad_padlen);
1001 			return (B_FALSE);
1002 		}
1003 
1004 
1005 		/*
1006 		 * Fix the rest of the header.  The explicit - 2 is for the
1007 		 * padding length and the next-header bytes.  IPv6 is nice,
1008 		 * because there's no hdr checksum!
1009 		 */
1010 		ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - padlen -
1011 		    2 - sizeof (esph_t) - ivlen);
1012 	}
1013 
1014 	if (espstack->ipsecesp_padding_check > 0 && padlen > 0) {
1015 		/*
1016 		 * Weak padding check: compare last-byte to length, they
1017 		 * should be equal.
1018 		 */
1019 		lastpad = *lastbyte--;
1020 
1021 		if (padlen != lastpad) {
1022 			ipsec_rl_strlog(espstack->ipsecesp_netstack,
1023 			    info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
1024 			    "Corrupt ESP packet (lastpad != padlen).\n");
1025 			esp1dbg(espstack,
1026 			    ("lastpad (%d) not equal to padlen (%d):\n",
1027 			    lastpad, padlen));
1028 			ESP_BUMP_STAT(espstack, bad_padding);
1029 			*counter = DROPPER(ipss, ipds_esp_bad_padding);
1030 			return (B_FALSE);
1031 		}
1032 
1033 		/*
1034 		 * Strong padding check: Check all pad bytes to see that
1035 		 * they're ascending.  Go backwards using a descending counter
1036 		 * to verify.  padlen == 1 is checked by previous block, so
1037 		 * only bother if we've more than 1 byte of padding.
1038 		 * Consequently, start the check one byte before the location
1039 		 * of "lastpad".
1040 		 */
1041 		if (espstack->ipsecesp_padding_check > 1) {
1042 			/*
1043 			 * This assert may have to become an if and a pullup
1044 			 * if we start accepting multi-dblk mblks. For now,
1045 			 * though, any packet here will have been pulled up in
1046 			 * esp_inbound.
1047 			 */
1048 			ASSERT(MBLKL(scratch) >= lastpad + 3);
1049 
1050 			/*
1051 			 * Use "--lastpad" because we already checked the very
1052 			 * last pad byte previously.
1053 			 */
1054 			while (--lastpad != 0) {
1055 				if (lastpad != *lastbyte) {
1056 					ipsec_rl_strlog(
1057 					    espstack->ipsecesp_netstack,
1058 					    info.mi_idnum, 0, 0,
1059 					    SL_ERROR | SL_WARN, "Corrupt ESP "
1060 					    "packet (bad padding).\n");
1061 					esp1dbg(espstack,
1062 					    ("padding not in correct"
1063 					    " format:\n"));
1064 					ESP_BUMP_STAT(espstack, bad_padding);
1065 					*counter = DROPPER(ipss,
1066 					    ipds_esp_bad_padding);
1067 					return (B_FALSE);
1068 				}
1069 				lastbyte--;
1070 			}
1071 		}
1072 	}
1073 
1074 	/* Trim off the padding. */
1075 	ASSERT(data_mp->b_cont == NULL);
1076 	data_mp->b_wptr -= (padlen + 2);
1077 
1078 	/*
1079 	 * Remove the ESP header.
1080 	 *
1081 	 * The above assertions about data_mp's size will make this work.
1082 	 *
1083 	 * XXX  Question:  If I send up and get back a contiguous mblk,
1084 	 * would it be quicker to bcopy over, or keep doing the dupb stuff?
1085 	 * I go with copying for now.
1086 	 */
1087 
1088 	if (IS_P2ALIGNED(data_mp->b_rptr, sizeof (uint32_t)) &&
1089 	    IS_P2ALIGNED(ivlen, sizeof (uint32_t))) {
1090 		uint8_t *start = data_mp->b_rptr;
1091 		uint32_t *src, *dst;
1092 
1093 		src = (uint32_t *)(start + divpoint);
1094 		dst = (uint32_t *)(start + divpoint + sizeof (esph_t) + ivlen);
1095 
1096 		ASSERT(IS_P2ALIGNED(dst, sizeof (uint32_t)) &&
1097 		    IS_P2ALIGNED(src, sizeof (uint32_t)));
1098 
1099 		do {
1100 			src--;
1101 			dst--;
1102 			*dst = *src;
1103 		} while (src != (uint32_t *)start);
1104 
1105 		data_mp->b_rptr = (uchar_t *)dst;
1106 	} else {
1107 		uint8_t *start = data_mp->b_rptr;
1108 		uint8_t *src, *dst;
1109 
1110 		src = start + divpoint;
1111 		dst = src + sizeof (esph_t) + ivlen;
1112 
1113 		do {
1114 			src--;
1115 			dst--;
1116 			*dst = *src;
1117 		} while (src != start);
1118 
1119 		data_mp->b_rptr = dst;
1120 	}
1121 
1122 	esp2dbg(espstack, ("data_mp after inbound ESP adjustment:\n"));
1123 	esp2dbg(espstack, (dump_msg(data_mp)));
1124 
1125 	return (B_TRUE);
1126 }
1127 
1128 /*
1129  * Updating use times can be tricky business if the ipsa_haspeer flag is
1130  * set.  This function is called once in an SA's lifetime.
1131  *
1132  * Caller has to REFRELE "assoc" which is passed in.  This function has
1133  * to REFRELE any peer SA that is obtained.
1134  */
1135 static void
1136 esp_set_usetime(ipsa_t *assoc, boolean_t inbound)
1137 {
1138 	ipsa_t *inassoc, *outassoc;
1139 	isaf_t *bucket;
1140 	sadb_t *sp;
1141 	int outhash;
1142 	boolean_t isv6;
1143 	netstack_t		*ns = assoc->ipsa_netstack;
1144 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
1145 
1146 	/* No peer?  No problem! */
1147 	if (!assoc->ipsa_haspeer) {
1148 		sadb_set_usetime(assoc);
1149 		return;
1150 	}
1151 
1152 	/*
1153 	 * Otherwise, we want to grab both the original assoc and its peer.
1154 	 * There might be a race for this, but if it's a real race, the times
1155 	 * will be out-of-synch by at most a second, and since our time
1156 	 * granularity is a second, this won't be a problem.
1157 	 *
1158 	 * If we need tight synchronization on the peer SA, then we need to
1159 	 * reconsider.
1160 	 */
1161 
1162 	/* Use address length to select IPv6/IPv4 */
1163 	isv6 = (assoc->ipsa_addrfam == AF_INET6);
1164 	sp = isv6 ? &espstack->esp_sadb.s_v6 : &espstack->esp_sadb.s_v4;
1165 
1166 	if (inbound) {
1167 		inassoc = assoc;
1168 		if (isv6) {
1169 			outhash = OUTBOUND_HASH_V6(sp, *((in6_addr_t *)
1170 			    &inassoc->ipsa_dstaddr));
1171 		} else {
1172 			outhash = OUTBOUND_HASH_V4(sp, *((ipaddr_t *)
1173 			    &inassoc->ipsa_dstaddr));
1174 		}
1175 		bucket = &sp->sdb_of[outhash];
1176 		mutex_enter(&bucket->isaf_lock);
1177 		outassoc = ipsec_getassocbyspi(bucket, inassoc->ipsa_spi,
1178 		    inassoc->ipsa_srcaddr, inassoc->ipsa_dstaddr,
1179 		    inassoc->ipsa_addrfam);
1180 		mutex_exit(&bucket->isaf_lock);
1181 		if (outassoc == NULL) {
1182 			/* Q: Do we wish to set haspeer == B_FALSE? */
1183 			esp0dbg(("esp_set_usetime: "
1184 			    "can't find peer for inbound.\n"));
1185 			sadb_set_usetime(inassoc);
1186 			return;
1187 		}
1188 	} else {
1189 		outassoc = assoc;
1190 		bucket = INBOUND_BUCKET(sp, outassoc->ipsa_spi);
1191 		mutex_enter(&bucket->isaf_lock);
1192 		inassoc = ipsec_getassocbyspi(bucket, outassoc->ipsa_spi,
1193 		    outassoc->ipsa_srcaddr, outassoc->ipsa_dstaddr,
1194 		    outassoc->ipsa_addrfam);
1195 		mutex_exit(&bucket->isaf_lock);
1196 		if (inassoc == NULL) {
1197 			/* Q: Do we wish to set haspeer == B_FALSE? */
1198 			esp0dbg(("esp_set_usetime: "
1199 			    "can't find peer for outbound.\n"));
1200 			sadb_set_usetime(outassoc);
1201 			return;
1202 		}
1203 	}
1204 
1205 	/* Update usetime on both. */
1206 	sadb_set_usetime(inassoc);
1207 	sadb_set_usetime(outassoc);
1208 
1209 	/*
1210 	 * REFRELE any peer SA.
1211 	 *
1212 	 * Because of the multi-line macro nature of IPSA_REFRELE, keep
1213 	 * them in { }.
1214 	 */
1215 	if (inbound) {
1216 		IPSA_REFRELE(outassoc);
1217 	} else {
1218 		IPSA_REFRELE(inassoc);
1219 	}
1220 }
1221 
1222 /*
1223  * Handle ESP inbound data for IPv4 and IPv6.
1224  * On success returns B_TRUE, on failure returns B_FALSE and frees the
1225  * mblk chain ipsec_in_mp.
1226  */
1227 ipsec_status_t
1228 esp_inbound(mblk_t *ipsec_in_mp, void *arg)
1229 {
1230 	mblk_t *data_mp = ipsec_in_mp->b_cont;
1231 	ipsec_in_t *ii = (ipsec_in_t *)ipsec_in_mp->b_rptr;
1232 	esph_t *esph = (esph_t *)arg;
1233 	ipsa_t *ipsa = ii->ipsec_in_esp_sa;
1234 	netstack_t	*ns = ii->ipsec_in_ns;
1235 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1236 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1237 
1238 	/*
1239 	 * We may wish to check replay in-range-only here as an optimization.
1240 	 * Include the reality check of ipsa->ipsa_replay >
1241 	 * ipsa->ipsa_replay_wsize for times when it's the first N packets,
1242 	 * where N == ipsa->ipsa_replay_wsize.
1243 	 *
1244 	 * Another check that may come here later is the "collision" check.
1245 	 * If legitimate packets flow quickly enough, this won't be a problem,
1246 	 * but collisions may cause authentication algorithm crunching to
1247 	 * take place when it doesn't need to.
1248 	 */
1249 	if (!sadb_replay_peek(ipsa, esph->esph_replay)) {
1250 		ESP_BUMP_STAT(espstack, replay_early_failures);
1251 		IP_ESP_BUMP_STAT(ipss, in_discards);
1252 		/*
1253 		 * TODO: Extract inbound interface from the IPSEC_IN
1254 		 * message's ii->ipsec_in_rill_index.
1255 		 */
1256 		ip_drop_packet(ipsec_in_mp, B_TRUE, NULL, NULL,
1257 		    DROPPER(ipss, ipds_esp_early_replay),
1258 		    &espstack->esp_dropper);
1259 		return (IPSEC_STATUS_FAILED);
1260 	}
1261 
1262 	/*
1263 	 * Has this packet already been processed by a hardware
1264 	 * IPsec accelerator?
1265 	 */
1266 	if (ii->ipsec_in_accelerated) {
1267 		ipsec_status_t rv;
1268 		esp3dbg(espstack,
1269 		    ("esp_inbound: pkt processed by ill=%d isv6=%d\n",
1270 		    ii->ipsec_in_ill_index, !ii->ipsec_in_v4));
1271 		rv = esp_inbound_accelerated(ipsec_in_mp,
1272 		    data_mp, ii->ipsec_in_v4, ipsa);
1273 		return (rv);
1274 	}
1275 	ESP_BUMP_STAT(espstack, noaccel);
1276 
1277 	/*
1278 	 * Adjust the IP header's payload length to reflect the removal
1279 	 * of the ICV.
1280 	 */
1281 	if (!ii->ipsec_in_v4) {
1282 		ip6_t *ip6h = (ip6_t *)data_mp->b_rptr;
1283 		ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) -
1284 		    ipsa->ipsa_mac_len);
1285 	} else {
1286 		ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
1287 		ipha->ipha_length = htons(ntohs(ipha->ipha_length) -
1288 		    ipsa->ipsa_mac_len);
1289 	}
1290 
1291 	/* submit the request to the crypto framework */
1292 	return (esp_submit_req_inbound(ipsec_in_mp, ipsa,
1293 	    (uint8_t *)esph - data_mp->b_rptr));
1294 }
1295 
1296 /*
1297  * Perform the really difficult work of inserting the proposed situation.
1298  * Called while holding the algorithm lock.
1299  */
1300 static void
1301 esp_insert_prop(sadb_prop_t *prop, ipsacq_t *acqrec, uint_t combs)
1302 {
1303 	sadb_comb_t *comb = (sadb_comb_t *)(prop + 1);
1304 	ipsec_out_t *io;
1305 	ipsec_action_t *ap;
1306 	ipsec_prot_t *prot;
1307 	netstack_t *ns;
1308 	ipsecesp_stack_t *espstack;
1309 	ipsec_stack_t *ipss;
1310 
1311 	io = (ipsec_out_t *)acqrec->ipsacq_mp->b_rptr;
1312 	ASSERT(io->ipsec_out_type == IPSEC_OUT);
1313 	ns = io->ipsec_out_ns;
1314 	espstack = ns->netstack_ipsecesp;
1315 	ipss = ns->netstack_ipsec;
1316 	ASSERT(MUTEX_HELD(&ipss->ipsec_alg_lock));
1317 
1318 	prop->sadb_prop_exttype = SADB_EXT_PROPOSAL;
1319 	prop->sadb_prop_len = SADB_8TO64(sizeof (sadb_prop_t));
1320 	*(uint32_t *)(&prop->sadb_prop_replay) = 0;	/* Quick zero-out! */
1321 
1322 	prop->sadb_prop_replay = espstack->ipsecesp_replay_size;
1323 
1324 	/*
1325 	 * Based upon algorithm properties, and what-not, prioritize
1326 	 * a proposal.  If the IPSEC_OUT message has an algorithm specified,
1327 	 * use it first and foremost.
1328 	 *
1329 	 * For each action in policy list
1330 	 *   Add combination.  If I've hit limit, return.
1331 	 */
1332 
1333 	for (ap = acqrec->ipsacq_act; ap != NULL;
1334 	    ap = ap->ipa_next) {
1335 		ipsec_alginfo_t *ealg = NULL;
1336 		ipsec_alginfo_t *aalg = NULL;
1337 
1338 		if (ap->ipa_act.ipa_type != IPSEC_POLICY_APPLY)
1339 			continue;
1340 
1341 		prot = &ap->ipa_act.ipa_apply;
1342 
1343 		if (!(prot->ipp_use_esp))
1344 			continue;
1345 
1346 		if (prot->ipp_esp_auth_alg != 0) {
1347 			aalg = ipss->ipsec_alglists[IPSEC_ALG_AUTH]
1348 			    [prot->ipp_esp_auth_alg];
1349 			if (aalg == NULL || !ALG_VALID(aalg))
1350 				continue;
1351 		}
1352 
1353 		ASSERT(prot->ipp_encr_alg > 0);
1354 		ealg = ipss->ipsec_alglists[IPSEC_ALG_ENCR]
1355 		    [prot->ipp_encr_alg];
1356 		if (ealg == NULL || !ALG_VALID(ealg))
1357 			continue;
1358 
1359 		comb->sadb_comb_flags = 0;
1360 		comb->sadb_comb_reserved = 0;
1361 		comb->sadb_comb_encrypt = ealg->alg_id;
1362 		comb->sadb_comb_encrypt_minbits =
1363 		    MAX(prot->ipp_espe_minbits, ealg->alg_ef_minbits);
1364 		comb->sadb_comb_encrypt_maxbits =
1365 		    MIN(prot->ipp_espe_maxbits, ealg->alg_ef_maxbits);
1366 		if (aalg == NULL) {
1367 			comb->sadb_comb_auth = 0;
1368 			comb->sadb_comb_auth_minbits = 0;
1369 			comb->sadb_comb_auth_maxbits = 0;
1370 		} else {
1371 			comb->sadb_comb_auth = aalg->alg_id;
1372 			comb->sadb_comb_auth_minbits =
1373 			    MAX(prot->ipp_espa_minbits, aalg->alg_ef_minbits);
1374 			comb->sadb_comb_auth_maxbits =
1375 			    MIN(prot->ipp_espa_maxbits, aalg->alg_ef_maxbits);
1376 		}
1377 
1378 		/*
1379 		 * The following may be based on algorithm
1380 		 * properties, but in the meantime, we just pick
1381 		 * some good, sensible numbers.  Key mgmt. can
1382 		 * (and perhaps should) be the place to finalize
1383 		 * such decisions.
1384 		 */
1385 
1386 		/*
1387 		 * No limits on allocations, since we really don't
1388 		 * support that concept currently.
1389 		 */
1390 		comb->sadb_comb_soft_allocations = 0;
1391 		comb->sadb_comb_hard_allocations = 0;
1392 
1393 		/*
1394 		 * These may want to come from policy rule..
1395 		 */
1396 		comb->sadb_comb_soft_bytes =
1397 		    espstack->ipsecesp_default_soft_bytes;
1398 		comb->sadb_comb_hard_bytes =
1399 		    espstack->ipsecesp_default_hard_bytes;
1400 		comb->sadb_comb_soft_addtime =
1401 		    espstack->ipsecesp_default_soft_addtime;
1402 		comb->sadb_comb_hard_addtime =
1403 		    espstack->ipsecesp_default_hard_addtime;
1404 		comb->sadb_comb_soft_usetime =
1405 		    espstack->ipsecesp_default_soft_usetime;
1406 		comb->sadb_comb_hard_usetime =
1407 		    espstack->ipsecesp_default_hard_usetime;
1408 
1409 		prop->sadb_prop_len += SADB_8TO64(sizeof (*comb));
1410 		if (--combs == 0)
1411 			break;	/* out of space.. */
1412 		comb++;
1413 	}
1414 }
1415 
1416 /*
1417  * Prepare and actually send the SADB_ACQUIRE message to PF_KEY.
1418  */
1419 static void
1420 esp_send_acquire(ipsacq_t *acqrec, mblk_t *extended, netstack_t *ns)
1421 {
1422 	uint_t combs;
1423 	sadb_msg_t *samsg;
1424 	sadb_prop_t *prop;
1425 	mblk_t *pfkeymp, *msgmp;
1426 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1427 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1428 
1429 	ESP_BUMP_STAT(espstack, acquire_requests);
1430 
1431 	if (espstack->esp_pfkey_q == NULL)
1432 		return;
1433 
1434 	/* Set up ACQUIRE. */
1435 	pfkeymp = sadb_setup_acquire(acqrec, SADB_SATYPE_ESP,
1436 	    ns->netstack_ipsec);
1437 	if (pfkeymp == NULL) {
1438 		esp0dbg(("sadb_setup_acquire failed.\n"));
1439 		return;
1440 	}
1441 	ASSERT(MUTEX_HELD(&ipss->ipsec_alg_lock));
1442 	combs = ipss->ipsec_nalgs[IPSEC_ALG_AUTH] *
1443 	    ipss->ipsec_nalgs[IPSEC_ALG_ENCR];
1444 	msgmp = pfkeymp->b_cont;
1445 	samsg = (sadb_msg_t *)(msgmp->b_rptr);
1446 
1447 	/* Insert proposal here. */
1448 
1449 	prop = (sadb_prop_t *)(((uint64_t *)samsg) + samsg->sadb_msg_len);
1450 	esp_insert_prop(prop, acqrec, combs);
1451 	samsg->sadb_msg_len += prop->sadb_prop_len;
1452 	msgmp->b_wptr += SADB_64TO8(samsg->sadb_msg_len);
1453 
1454 	mutex_exit(&ipss->ipsec_alg_lock);
1455 
1456 	/*
1457 	 * Must mutex_exit() before sending PF_KEY message up, in
1458 	 * order to avoid recursive mutex_enter() if there are no registered
1459 	 * listeners.
1460 	 *
1461 	 * Once I've sent the message, I'm cool anyway.
1462 	 */
1463 	mutex_exit(&acqrec->ipsacq_lock);
1464 	if (extended != NULL) {
1465 		putnext(espstack->esp_pfkey_q, extended);
1466 	}
1467 	putnext(espstack->esp_pfkey_q, pfkeymp);
1468 }
1469 
1470 /*
1471  * Handle the SADB_GETSPI message.  Create a larval SA.
1472  */
1473 static void
1474 esp_getspi(mblk_t *mp, keysock_in_t *ksi, ipsecesp_stack_t *espstack)
1475 {
1476 	ipsa_t *newbie, *target;
1477 	isaf_t *outbound, *inbound;
1478 	int rc, diagnostic;
1479 	sadb_sa_t *assoc;
1480 	keysock_out_t *kso;
1481 	uint32_t newspi;
1482 
1483 	/*
1484 	 * Randomly generate a proposed SPI value
1485 	 */
1486 	(void) random_get_pseudo_bytes((uint8_t *)&newspi, sizeof (uint32_t));
1487 	newbie = sadb_getspi(ksi, newspi, &diagnostic,
1488 	    espstack->ipsecesp_netstack);
1489 
1490 	if (newbie == NULL) {
1491 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, ENOMEM, diagnostic,
1492 		    ksi->ks_in_serial);
1493 		return;
1494 	} else if (newbie == (ipsa_t *)-1) {
1495 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, EINVAL, diagnostic,
1496 		    ksi->ks_in_serial);
1497 		return;
1498 	}
1499 
1500 	/*
1501 	 * XXX - We may randomly collide.  We really should recover from this.
1502 	 *	 Unfortunately, that could require spending way-too-much-time
1503 	 *	 in here.  For now, let the user retry.
1504 	 */
1505 
1506 	if (newbie->ipsa_addrfam == AF_INET6) {
1507 		outbound = OUTBOUND_BUCKET_V6(&espstack->esp_sadb.s_v6,
1508 		    *(uint32_t *)(newbie->ipsa_dstaddr));
1509 		inbound = INBOUND_BUCKET(&espstack->esp_sadb.s_v6,
1510 		    newbie->ipsa_spi);
1511 	} else {
1512 		ASSERT(newbie->ipsa_addrfam == AF_INET);
1513 		outbound = OUTBOUND_BUCKET_V4(&espstack->esp_sadb.s_v4,
1514 		    *(uint32_t *)(newbie->ipsa_dstaddr));
1515 		inbound = INBOUND_BUCKET(&espstack->esp_sadb.s_v4,
1516 		    newbie->ipsa_spi);
1517 	}
1518 
1519 	mutex_enter(&outbound->isaf_lock);
1520 	mutex_enter(&inbound->isaf_lock);
1521 
1522 	/*
1523 	 * Check for collisions (i.e. did sadb_getspi() return with something
1524 	 * that already exists?).
1525 	 *
1526 	 * Try outbound first.  Even though SADB_GETSPI is traditionally
1527 	 * for inbound SAs, you never know what a user might do.
1528 	 */
1529 	target = ipsec_getassocbyspi(outbound, newbie->ipsa_spi,
1530 	    newbie->ipsa_srcaddr, newbie->ipsa_dstaddr, newbie->ipsa_addrfam);
1531 	if (target == NULL) {
1532 		target = ipsec_getassocbyspi(inbound, newbie->ipsa_spi,
1533 		    newbie->ipsa_srcaddr, newbie->ipsa_dstaddr,
1534 		    newbie->ipsa_addrfam);
1535 	}
1536 
1537 	/*
1538 	 * I don't have collisions elsewhere!
1539 	 * (Nor will I because I'm still holding inbound/outbound locks.)
1540 	 */
1541 
1542 	if (target != NULL) {
1543 		rc = EEXIST;
1544 		IPSA_REFRELE(target);
1545 	} else {
1546 		/*
1547 		 * sadb_insertassoc() also checks for collisions, so
1548 		 * if there's a colliding entry, rc will be set
1549 		 * to EEXIST.
1550 		 */
1551 		rc = sadb_insertassoc(newbie, inbound);
1552 		newbie->ipsa_hardexpiretime = gethrestime_sec();
1553 		newbie->ipsa_hardexpiretime +=
1554 		    espstack->ipsecesp_larval_timeout;
1555 	}
1556 
1557 	/*
1558 	 * Can exit outbound mutex.  Hold inbound until we're done
1559 	 * with newbie.
1560 	 */
1561 	mutex_exit(&outbound->isaf_lock);
1562 
1563 	if (rc != 0) {
1564 		mutex_exit(&inbound->isaf_lock);
1565 		IPSA_REFRELE(newbie);
1566 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, rc,
1567 		    SADB_X_DIAGNOSTIC_NONE, ksi->ks_in_serial);
1568 		return;
1569 	}
1570 
1571 
1572 	/* Can write here because I'm still holding the bucket lock. */
1573 	newbie->ipsa_type = SADB_SATYPE_ESP;
1574 
1575 	/*
1576 	 * Construct successful return message.  We have one thing going
1577 	 * for us in PF_KEY v2.  That's the fact that
1578 	 *	sizeof (sadb_spirange_t) == sizeof (sadb_sa_t)
1579 	 */
1580 	assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SPIRANGE];
1581 	assoc->sadb_sa_exttype = SADB_EXT_SA;
1582 	assoc->sadb_sa_spi = newbie->ipsa_spi;
1583 	*((uint64_t *)(&assoc->sadb_sa_replay)) = 0;
1584 	mutex_exit(&inbound->isaf_lock);
1585 
1586 	/* Convert KEYSOCK_IN to KEYSOCK_OUT. */
1587 	kso = (keysock_out_t *)ksi;
1588 	kso->ks_out_len = sizeof (*kso);
1589 	kso->ks_out_serial = ksi->ks_in_serial;
1590 	kso->ks_out_type = KEYSOCK_OUT;
1591 
1592 	/*
1593 	 * Can safely putnext() to esp_pfkey_q, because this is a turnaround
1594 	 * from the esp_pfkey_q.
1595 	 */
1596 	putnext(espstack->esp_pfkey_q, mp);
1597 }
1598 
1599 /*
1600  * Insert the ESP header into a packet.  Duplicate an mblk, and insert a newly
1601  * allocated mblk with the ESP header in between the two.
1602  */
1603 static boolean_t
1604 esp_insert_esp(mblk_t *mp, mblk_t *esp_mp, uint_t divpoint,
1605     ipsecesp_stack_t *espstack)
1606 {
1607 	mblk_t *split_mp = mp;
1608 	uint_t wheretodiv = divpoint;
1609 
1610 	while ((split_mp->b_wptr - split_mp->b_rptr) < wheretodiv) {
1611 		wheretodiv -= (split_mp->b_wptr - split_mp->b_rptr);
1612 		split_mp = split_mp->b_cont;
1613 		ASSERT(split_mp != NULL);
1614 	}
1615 
1616 	if (split_mp->b_wptr - split_mp->b_rptr != wheretodiv) {
1617 		mblk_t *scratch;
1618 
1619 		/* "scratch" is the 2nd half, split_mp is the first. */
1620 		scratch = dupb(split_mp);
1621 		if (scratch == NULL) {
1622 			esp1dbg(espstack,
1623 			    ("esp_insert_esp: can't allocate scratch.\n"));
1624 			return (B_FALSE);
1625 		}
1626 		/* NOTE:  dupb() doesn't set b_cont appropriately. */
1627 		scratch->b_cont = split_mp->b_cont;
1628 		scratch->b_rptr += wheretodiv;
1629 		split_mp->b_wptr = split_mp->b_rptr + wheretodiv;
1630 		split_mp->b_cont = scratch;
1631 	}
1632 	/*
1633 	 * At this point, split_mp is exactly "wheretodiv" bytes long, and
1634 	 * holds the end of the pre-ESP part of the datagram.
1635 	 */
1636 	esp_mp->b_cont = split_mp->b_cont;
1637 	split_mp->b_cont = esp_mp;
1638 
1639 	return (B_TRUE);
1640 }
1641 
1642 /*
1643  * Finish processing of an inbound ESP packet after processing by the
1644  * crypto framework.
1645  * - Remove the ESP header.
1646  * - Send packet back to IP.
1647  * If authentication was performed on the packet, this function is called
1648  * only if the authentication succeeded.
1649  * On success returns B_TRUE, on failure returns B_FALSE and frees the
1650  * mblk chain ipsec_in_mp.
1651  */
1652 static ipsec_status_t
1653 esp_in_done(mblk_t *ipsec_in_mp)
1654 {
1655 	ipsec_in_t *ii = (ipsec_in_t *)ipsec_in_mp->b_rptr;
1656 	mblk_t *data_mp;
1657 	ipsa_t *assoc;
1658 	uint_t espstart;
1659 	uint32_t ivlen = 0;
1660 	uint_t processed_len;
1661 	esph_t *esph;
1662 	kstat_named_t *counter;
1663 	boolean_t is_natt;
1664 	netstack_t	*ns = ii->ipsec_in_ns;
1665 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1666 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1667 
1668 	assoc = ii->ipsec_in_esp_sa;
1669 	ASSERT(assoc != NULL);
1670 
1671 	is_natt = ((assoc->ipsa_flags & IPSA_F_NATT) != 0);
1672 
1673 	/* get the pointer to the ESP header */
1674 	if (assoc->ipsa_encr_alg == SADB_EALG_NULL) {
1675 		/* authentication-only ESP */
1676 		espstart = ii->ipsec_in_crypto_data.cd_offset;
1677 		processed_len = ii->ipsec_in_crypto_data.cd_length;
1678 	} else {
1679 		/* encryption present */
1680 		ivlen = assoc->ipsa_iv_len;
1681 		if (assoc->ipsa_auth_alg == SADB_AALG_NONE) {
1682 			/* encryption-only ESP */
1683 			espstart = ii->ipsec_in_crypto_data.cd_offset -
1684 			    sizeof (esph_t) - assoc->ipsa_iv_len;
1685 			processed_len = ii->ipsec_in_crypto_data.cd_length +
1686 			    ivlen;
1687 		} else {
1688 			/* encryption with authentication */
1689 			espstart = ii->ipsec_in_crypto_dual_data.dd_offset1;
1690 			processed_len = ii->ipsec_in_crypto_dual_data.dd_len2 +
1691 			    ivlen;
1692 		}
1693 	}
1694 
1695 	data_mp = ipsec_in_mp->b_cont;
1696 	esph = (esph_t *)(data_mp->b_rptr + espstart);
1697 
1698 	if (assoc->ipsa_auth_alg != IPSA_AALG_NONE) {
1699 		/* authentication passed if we reach this point */
1700 		ESP_BUMP_STAT(espstack, good_auth);
1701 		data_mp->b_wptr -= assoc->ipsa_mac_len;
1702 
1703 		/*
1704 		 * Check replay window here!
1705 		 * For right now, assume keysock will set the replay window
1706 		 * size to zero for SAs that have an unspecified sender.
1707 		 * This may change...
1708 		 */
1709 
1710 		if (!sadb_replay_check(assoc, esph->esph_replay)) {
1711 			/*
1712 			 * Log the event. As of now we print out an event.
1713 			 * Do not print the replay failure number, or else
1714 			 * syslog cannot collate the error messages.  Printing
1715 			 * the replay number that failed opens a denial-of-
1716 			 * service attack.
1717 			 */
1718 			ipsec_assocfailure(info.mi_idnum, 0, 0,
1719 			    SL_ERROR | SL_WARN,
1720 			    "Replay failed for ESP spi 0x%x, dst %s.\n",
1721 			    assoc->ipsa_spi, assoc->ipsa_dstaddr,
1722 			    assoc->ipsa_addrfam, espstack->ipsecesp_netstack);
1723 			ESP_BUMP_STAT(espstack, replay_failures);
1724 			counter = DROPPER(ipss, ipds_esp_replay);
1725 			goto drop_and_bail;
1726 		}
1727 	}
1728 
1729 	esp_set_usetime(assoc, B_TRUE);
1730 
1731 	if (!esp_age_bytes(assoc, processed_len, B_TRUE)) {
1732 		/* The ipsa has hit hard expiration, LOG and AUDIT. */
1733 		ipsec_assocfailure(info.mi_idnum, 0, 0,
1734 		    SL_ERROR | SL_WARN,
1735 		    "ESP association 0x%x, dst %s had bytes expire.\n",
1736 		    assoc->ipsa_spi, assoc->ipsa_dstaddr, assoc->ipsa_addrfam,
1737 		    espstack->ipsecesp_netstack);
1738 		ESP_BUMP_STAT(espstack, bytes_expired);
1739 		counter = DROPPER(ipss, ipds_esp_bytes_expire);
1740 		goto drop_and_bail;
1741 	}
1742 
1743 	/*
1744 	 * Remove ESP header and padding from packet.  I hope the compiler
1745 	 * spews "branch, predict taken" code for this.
1746 	 */
1747 
1748 	if (esp_strip_header(data_mp, ii->ipsec_in_v4, ivlen, &counter,
1749 	    espstack)) {
1750 		if (is_natt)
1751 			return (esp_fix_natt_checksums(data_mp, assoc));
1752 		return (IPSEC_STATUS_SUCCESS);
1753 	}
1754 
1755 	esp1dbg(espstack, ("esp_in_done: esp_strip_header() failed\n"));
1756 drop_and_bail:
1757 	IP_ESP_BUMP_STAT(ipss, in_discards);
1758 	/*
1759 	 * TODO: Extract inbound interface from the IPSEC_IN message's
1760 	 * ii->ipsec_in_rill_index.
1761 	 */
1762 	ip_drop_packet(ipsec_in_mp, B_TRUE, NULL, NULL, counter,
1763 	    &espstack->esp_dropper);
1764 	return (IPSEC_STATUS_FAILED);
1765 }
1766 
1767 /*
1768  * Called upon failing the inbound ICV check. The message passed as
1769  * argument is freed.
1770  */
1771 static void
1772 esp_log_bad_auth(mblk_t *ipsec_in)
1773 {
1774 	ipsec_in_t *ii = (ipsec_in_t *)ipsec_in->b_rptr;
1775 	ipsa_t *assoc = ii->ipsec_in_esp_sa;
1776 	netstack_t	*ns = ii->ipsec_in_ns;
1777 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1778 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1779 
1780 	/*
1781 	 * Log the event. Don't print to the console, block
1782 	 * potential denial-of-service attack.
1783 	 */
1784 	ESP_BUMP_STAT(espstack, bad_auth);
1785 
1786 	ipsec_assocfailure(info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
1787 	    "ESP Authentication failed for spi 0x%x, dst %s.\n",
1788 	    assoc->ipsa_spi, assoc->ipsa_dstaddr, assoc->ipsa_addrfam,
1789 	    espstack->ipsecesp_netstack);
1790 
1791 	IP_ESP_BUMP_STAT(ipss, in_discards);
1792 	/*
1793 	 * TODO: Extract inbound interface from the IPSEC_IN
1794 	 * message's ii->ipsec_in_rill_index.
1795 	 */
1796 	ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL,
1797 	    DROPPER(ipss, ipds_esp_bad_auth),
1798 	    &espstack->esp_dropper);
1799 }
1800 
1801 
1802 /*
1803  * Invoked for outbound packets after ESP processing. If the packet
1804  * also requires AH, performs the AH SA selection and AH processing.
1805  * Returns B_TRUE if the AH processing was not needed or if it was
1806  * performed successfully. Returns B_FALSE and consumes the passed mblk
1807  * if AH processing was required but could not be performed.
1808  */
1809 static boolean_t
1810 esp_do_outbound_ah(mblk_t *ipsec_mp)
1811 {
1812 	ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr;
1813 	ipsec_status_t ipsec_rc;
1814 	ipsec_action_t *ap;
1815 
1816 	ap = io->ipsec_out_act;
1817 	if (ap == NULL) {
1818 		ipsec_policy_t *pp = io->ipsec_out_policy;
1819 		ap = pp->ipsp_act;
1820 	}
1821 
1822 	if (!ap->ipa_want_ah)
1823 		return (B_TRUE);
1824 
1825 	ASSERT(io->ipsec_out_ah_done == B_FALSE);
1826 
1827 	if (io->ipsec_out_ah_sa == NULL) {
1828 		if (!ipsec_outbound_sa(ipsec_mp, IPPROTO_AH)) {
1829 			sadb_acquire(ipsec_mp, io, B_TRUE, B_FALSE);
1830 			return (B_FALSE);
1831 		}
1832 	}
1833 	ASSERT(io->ipsec_out_ah_sa != NULL);
1834 
1835 	io->ipsec_out_ah_done = B_TRUE;
1836 	ipsec_rc = io->ipsec_out_ah_sa->ipsa_output_func(ipsec_mp);
1837 	return (ipsec_rc == IPSEC_STATUS_SUCCESS);
1838 }
1839 
1840 
1841 /*
1842  * Kernel crypto framework callback invoked after completion of async
1843  * crypto requests.
1844  */
1845 static void
1846 esp_kcf_callback(void *arg, int status)
1847 {
1848 	mblk_t *ipsec_mp = (mblk_t *)arg;
1849 	ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr;
1850 	ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr;
1851 	boolean_t is_inbound = (ii->ipsec_in_type == IPSEC_IN);
1852 	netstackid_t	stackid;
1853 	netstack_t	*ns, *ns_arg;
1854 	ipsecesp_stack_t *espstack;
1855 	ipsec_stack_t	*ipss;
1856 
1857 	ASSERT(ipsec_mp->b_cont != NULL);
1858 
1859 	if (is_inbound) {
1860 		stackid = ii->ipsec_in_stackid;
1861 		ns_arg = ii->ipsec_in_ns;
1862 	} else {
1863 		stackid = io->ipsec_out_stackid;
1864 		ns_arg = io->ipsec_out_ns;
1865 	}
1866 
1867 	/*
1868 	 * Verify that the netstack is still around; could have vanished
1869 	 * while kEf was doing its work.
1870 	 */
1871 	ns = netstack_find_by_stackid(stackid);
1872 	if (ns == NULL || ns != ns_arg) {
1873 		/* Disappeared on us */
1874 		if (ns != NULL)
1875 			netstack_rele(ns);
1876 		freemsg(ipsec_mp);
1877 		return;
1878 	}
1879 
1880 	espstack = ns->netstack_ipsecesp;
1881 	ipss = ns->netstack_ipsec;
1882 
1883 	if (status == CRYPTO_SUCCESS) {
1884 		if (is_inbound) {
1885 			if (esp_in_done(ipsec_mp) != IPSEC_STATUS_SUCCESS) {
1886 				netstack_rele(ns);
1887 				return;
1888 			}
1889 			/* finish IPsec processing */
1890 			ip_fanout_proto_again(ipsec_mp, NULL, NULL, NULL);
1891 		} else {
1892 			/*
1893 			 * If a ICV was computed, it was stored by the
1894 			 * crypto framework at the end of the packet.
1895 			 */
1896 			ipha_t *ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr;
1897 
1898 			esp_set_usetime(io->ipsec_out_esp_sa, B_FALSE);
1899 			/* NAT-T packet. */
1900 			if (ipha->ipha_protocol == IPPROTO_UDP)
1901 				esp_prepare_udp(ns, ipsec_mp->b_cont, ipha);
1902 
1903 			/* do AH processing if needed */
1904 			if (!esp_do_outbound_ah(ipsec_mp)) {
1905 				netstack_rele(ns);
1906 				return;
1907 			}
1908 			/* finish IPsec processing */
1909 			if (IPH_HDR_VERSION(ipha) == IP_VERSION) {
1910 				ip_wput_ipsec_out(NULL, ipsec_mp, ipha, NULL,
1911 				    NULL);
1912 			} else {
1913 				ip6_t *ip6h = (ip6_t *)ipha;
1914 				ip_wput_ipsec_out_v6(NULL, ipsec_mp, ip6h,
1915 				    NULL, NULL);
1916 			}
1917 		}
1918 
1919 	} else if (status == CRYPTO_INVALID_MAC) {
1920 		esp_log_bad_auth(ipsec_mp);
1921 
1922 	} else {
1923 		esp1dbg(espstack,
1924 		    ("esp_kcf_callback: crypto failed with 0x%x\n",
1925 		    status));
1926 		ESP_BUMP_STAT(espstack, crypto_failures);
1927 		if (is_inbound)
1928 			IP_ESP_BUMP_STAT(ipss, in_discards);
1929 		else
1930 			ESP_BUMP_STAT(espstack, out_discards);
1931 		ip_drop_packet(ipsec_mp, is_inbound, NULL, NULL,
1932 		    DROPPER(ipss, ipds_esp_crypto_failed),
1933 		    &espstack->esp_dropper);
1934 	}
1935 	netstack_rele(ns);
1936 }
1937 
1938 /*
1939  * Invoked on crypto framework failure during inbound and outbound processing.
1940  */
1941 static void
1942 esp_crypto_failed(mblk_t *mp, boolean_t is_inbound, int kef_rc,
1943     ipsecesp_stack_t *espstack)
1944 {
1945 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
1946 
1947 	esp1dbg(espstack, ("crypto failed for %s ESP with 0x%x\n",
1948 	    is_inbound ? "inbound" : "outbound", kef_rc));
1949 	ip_drop_packet(mp, is_inbound, NULL, NULL,
1950 	    DROPPER(ipss, ipds_esp_crypto_failed),
1951 	    &espstack->esp_dropper);
1952 	ESP_BUMP_STAT(espstack, crypto_failures);
1953 	if (is_inbound)
1954 		IP_ESP_BUMP_STAT(ipss, in_discards);
1955 	else
1956 		ESP_BUMP_STAT(espstack, out_discards);
1957 }
1958 
/*
 * Fill in an asynchronous crypto call request so that completion is
 * reported to esp_kcf_callback().
 *
 * NOTE: the expansion refers to a variable named "ipsec_mp", which must be
 * in scope where the macro is used; it becomes the callback argument.
 */
#define	ESP_INIT_CALLREQ(_cr) {						\
	(_cr)->cr_callback_func = esp_kcf_callback;			\
	(_cr)->cr_callback_arg = ipsec_mp;				\
	(_cr)->cr_flag = CRYPTO_SKIP_REQID | CRYPTO_RESTRICTED;		\
}
1964 
/*
 * Describe an ICV buffer of icvlen bytes starting at icvbuf as a raw
 * (single iovec) crypto_data_t.
 *
 * The arguments are parenthesized so that an expression such as
 * "ptr - len" is evaluated in its own type before being cast to char *;
 * without that the cast would bind to the first operand only.
 */
#define	ESP_INIT_CRYPTO_MAC(mac, icvlen, icvbuf) {			\
	(mac)->cd_format = CRYPTO_DATA_RAW;				\
	(mac)->cd_offset = 0;						\
	(mac)->cd_length = (icvlen);					\
	(mac)->cd_raw.iov_base = (char *)(icvbuf);			\
	(mac)->cd_raw.iov_len = (icvlen);				\
}
1972 
/*
 * Describe len bytes at offset off of mblk mp as a crypto_data_t.
 *
 * When the first mblk alone holds at least off + len bytes the data is
 * described as a raw buffer covering that whole mblk; otherwise the mblk
 * chain itself is passed.  The offset and length are recorded in both
 * cases.
 */
#define	ESP_INIT_CRYPTO_DATA(data, mp, off, len) {			\
	(data)->cd_offset = (off);					\
	(data)->cd_length = (len);					\
	if (MBLKL(mp) >= (len) + (off)) {				\
		(data)->cd_format = CRYPTO_DATA_RAW;			\
		(data)->cd_raw.iov_base = (char *)(mp)->b_rptr;		\
		(data)->cd_raw.iov_len = MBLKL(mp);			\
	} else {							\
		(data)->cd_format = CRYPTO_DATA_MBLK;			\
		(data)->cd_mp = (mp);					\
	}								\
}
1986 
/*
 * Describe two regions (off1/len1 and off2/len2) of the mblk chain mp as a
 * crypto_dual_data_t for a combined two-step crypto operation.  The data
 * is always passed in mblk form.
 */
#define	ESP_INIT_CRYPTO_DUAL_DATA(data, mp, off1, len1, off2, len2) {	\
	(data)->dd_format = CRYPTO_DATA_MBLK;				\
	(data)->dd_mp = (mp);						\
	(data)->dd_offset1 = (off1);					\
	(data)->dd_len1 = (len1);					\
	(data)->dd_offset2 = (off2);					\
	(data)->dd_len2 = (len2);					\
}
1995 
1996 static ipsec_status_t
1997 esp_submit_req_inbound(mblk_t *ipsec_mp, ipsa_t *assoc, uint_t esph_offset)
1998 {
1999 	ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr;
2000 	boolean_t do_auth;
2001 	uint_t auth_offset, msg_len, auth_len;
2002 	crypto_call_req_t call_req;
2003 	mblk_t *esp_mp;
2004 	int kef_rc = CRYPTO_FAILED;
2005 	uint_t icv_len = assoc->ipsa_mac_len;
2006 	crypto_ctx_template_t auth_ctx_tmpl;
2007 	boolean_t do_encr;
2008 	uint_t encr_offset, encr_len;
2009 	uint_t iv_len = assoc->ipsa_iv_len;
2010 	crypto_ctx_template_t encr_ctx_tmpl;
2011 	netstack_t	*ns = ii->ipsec_in_ns;
2012 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
2013 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
2014 
2015 	ASSERT(ii->ipsec_in_type == IPSEC_IN);
2016 
2017 	/*
2018 	 * In case kEF queues and calls back, keep netstackid_t for
2019 	 * verification that the IP instance is still around in
2020 	 * esp_kcf_callback().
2021 	 */
2022 	ii->ipsec_in_stackid = ns->netstack_stackid;
2023 
2024 	do_auth = assoc->ipsa_auth_alg != SADB_AALG_NONE;
2025 	do_encr = assoc->ipsa_encr_alg != SADB_EALG_NULL;
2026 
2027 	/*
2028 	 * An inbound packet is of the form:
2029 	 * IPSEC_IN -> [IP,options,ESP,IV,data,ICV,pad]
2030 	 */
2031 	esp_mp = ipsec_mp->b_cont;
2032 	msg_len = MBLKL(esp_mp);
2033 
2034 	ESP_INIT_CALLREQ(&call_req);
2035 
2036 	if (do_auth) {
2037 		/* force asynchronous processing? */
2038 		if (ipss->ipsec_algs_exec_mode[IPSEC_ALG_AUTH] ==
2039 		    IPSEC_ALGS_EXEC_ASYNC)
2040 			call_req.cr_flag |= CRYPTO_ALWAYS_QUEUE;
2041 
2042 		/* authentication context template */
2043 		IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH,
2044 		    auth_ctx_tmpl);
2045 
2046 		/* ICV to be verified */
2047 		ESP_INIT_CRYPTO_MAC(&ii->ipsec_in_crypto_mac,
2048 		    icv_len, esp_mp->b_wptr - icv_len);
2049 
2050 		/* authentication starts at the ESP header */
2051 		auth_offset = esph_offset;
2052 		auth_len = msg_len - auth_offset - icv_len;
2053 		if (!do_encr) {
2054 			/* authentication only */
2055 			/* initialize input data argument */
2056 			ESP_INIT_CRYPTO_DATA(&ii->ipsec_in_crypto_data,
2057 			    esp_mp, auth_offset, auth_len);
2058 
2059 			/* call the crypto framework */
2060 			kef_rc = crypto_mac_verify(&assoc->ipsa_amech,
2061 			    &ii->ipsec_in_crypto_data,
2062 			    &assoc->ipsa_kcfauthkey, auth_ctx_tmpl,
2063 			    &ii->ipsec_in_crypto_mac, &call_req);
2064 		}
2065 	}
2066 
2067 	if (do_encr) {
2068 		/* force asynchronous processing? */
2069 		if (ipss->ipsec_algs_exec_mode[IPSEC_ALG_ENCR] ==
2070 		    IPSEC_ALGS_EXEC_ASYNC)
2071 			call_req.cr_flag |= CRYPTO_ALWAYS_QUEUE;
2072 
2073 		/* encryption template */
2074 		IPSEC_CTX_TMPL(assoc, ipsa_encrtmpl, IPSEC_ALG_ENCR,
2075 		    encr_ctx_tmpl);
2076 
2077 		/* skip IV, since it is passed separately */
2078 		encr_offset = esph_offset + sizeof (esph_t) + iv_len;
2079 		encr_len = msg_len - encr_offset;
2080 
2081 		if (!do_auth) {
2082 			/* decryption only */
2083 			/* initialize input data argument */
2084 			ESP_INIT_CRYPTO_DATA(&ii->ipsec_in_crypto_data,
2085 			    esp_mp, encr_offset, encr_len);
2086 
2087 			/* specify IV */
2088 			ii->ipsec_in_crypto_data.cd_miscdata =
2089 			    (char *)esp_mp->b_rptr + sizeof (esph_t) +
2090 			    esph_offset;
2091 
2092 			/* call the crypto framework */
2093 			kef_rc = crypto_decrypt(&assoc->ipsa_emech,
2094 			    &ii->ipsec_in_crypto_data,
2095 			    &assoc->ipsa_kcfencrkey, encr_ctx_tmpl,
2096 			    NULL, &call_req);
2097 		}
2098 	}
2099 
2100 	if (do_auth && do_encr) {
2101 		/* dual operation */
2102 		/* initialize input data argument */
2103 		ESP_INIT_CRYPTO_DUAL_DATA(&ii->ipsec_in_crypto_dual_data,
2104 		    esp_mp, auth_offset, auth_len,
2105 		    encr_offset, encr_len - icv_len);
2106 
2107 		/* specify IV */
2108 		ii->ipsec_in_crypto_dual_data.dd_miscdata =
2109 		    (char *)esp_mp->b_rptr + sizeof (esph_t) + esph_offset;
2110 
2111 		/* call the framework */
2112 		kef_rc = crypto_mac_verify_decrypt(&assoc->ipsa_amech,
2113 		    &assoc->ipsa_emech, &ii->ipsec_in_crypto_dual_data,
2114 		    &assoc->ipsa_kcfauthkey, &assoc->ipsa_kcfencrkey,
2115 		    auth_ctx_tmpl, encr_ctx_tmpl, &ii->ipsec_in_crypto_mac,
2116 		    NULL, &call_req);
2117 	}
2118 
2119 	switch (kef_rc) {
2120 	case CRYPTO_SUCCESS:
2121 		ESP_BUMP_STAT(espstack, crypto_sync);
2122 		return (esp_in_done(ipsec_mp));
2123 	case CRYPTO_QUEUED:
2124 		/* esp_kcf_callback() will be invoked on completion */
2125 		ESP_BUMP_STAT(espstack, crypto_async);
2126 		return (IPSEC_STATUS_PENDING);
2127 	case CRYPTO_INVALID_MAC:
2128 		ESP_BUMP_STAT(espstack, crypto_sync);
2129 		esp_log_bad_auth(ipsec_mp);
2130 		return (IPSEC_STATUS_FAILED);
2131 	}
2132 
2133 	esp_crypto_failed(ipsec_mp, B_TRUE, kef_rc, espstack);
2134 	return (IPSEC_STATUS_FAILED);
2135 }
2136 
2137 /*
2138  * Compute the IP and UDP checksums -- common code for both keepalives and
2139  * actual ESP-in-UDP packets.  Be flexible with multiple mblks because ESP
2140  * uses mblk-insertion to insert the UDP header.
2141  * TODO - If there is an easy way to prep a packet for HW checksums, make
2142  * it happen here.
2143  */
2144 static void
2145 esp_prepare_udp(netstack_t *ns, mblk_t *mp, ipha_t *ipha)
2146 {
2147 	int offset;
2148 	uint32_t cksum;
2149 	uint16_t *arr;
2150 	mblk_t *udpmp = mp;
2151 
2152 	ASSERT(MBLKL(mp) >= sizeof (ipha_t));
2153 
2154 	ipha->ipha_hdr_checksum = 0;
2155 	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2156 
2157 	if (ns->netstack_udp->us_do_checksum) {
2158 		ASSERT(MBLKL(udpmp) >= sizeof (udpha_t));
2159 		/* arr points to the IP header. */
2160 		arr = (uint16_t *)ipha;
2161 		IP_STAT(ns->netstack_ip, ip_out_sw_cksum);
2162 		IP_STAT_UPDATE(ns->netstack_ip, ip_udp_out_sw_cksum_bytes,
2163 		    ntohs(htons(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH));
2164 		/* arr[6-9] are the IP addresses. */
2165 		cksum = IP_UDP_CSUM_COMP + arr[6] + arr[7] + arr[8] + arr[9] +
2166 		    ntohs(htons(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH);
2167 		cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH, cksum);
2168 		offset = IP_SIMPLE_HDR_LENGTH + UDP_CHECKSUM_OFFSET;
2169 		while (offset >= MBLKL(udpmp)) {
2170 			offset -= MBLKL(udpmp);
2171 			udpmp = udpmp->b_cont;
2172 		}
2173 		/* arr points to the UDP header's checksum field. */
2174 		arr = (uint16_t *)(udpmp->b_rptr + offset);
2175 		*arr = cksum;
2176 	}
2177 }
2178 
2179 /*
2180  * Send a one-byte UDP NAT-T keepalive.  Construct an IPSEC_OUT too that'll
2181  * get fed into esp_send_udp/ip_wput_ipsec_out.
2182  */
2183 void
2184 ipsecesp_send_keepalive(ipsa_t *assoc)
2185 {
2186 	mblk_t *mp = NULL, *ipsec_mp = NULL;
2187 	ipha_t *ipha;
2188 	udpha_t *udpha;
2189 	ipsec_out_t *io;
2190 
2191 	ASSERT(!MUTEX_HELD(&assoc->ipsa_lock));
2192 
2193 	mp = allocb(sizeof (ipha_t) + sizeof (udpha_t) + 1, BPRI_HI);
2194 	if (mp == NULL)
2195 		return;
2196 	ipha = (ipha_t *)mp->b_rptr;
2197 	ipha->ipha_version_and_hdr_length = IP_SIMPLE_HDR_VERSION;
2198 	ipha->ipha_type_of_service = 0;
2199 	ipha->ipha_length = htons(sizeof (ipha_t) + sizeof (udpha_t) + 1);
2200 	/* Use the low-16 of the SPI so we have some clue where it came from. */
2201 	ipha->ipha_ident = *(((uint16_t *)(&assoc->ipsa_spi)) + 1);
2202 	ipha->ipha_fragment_offset_and_flags = 0;  /* Too small to fragment! */
2203 	ipha->ipha_ttl = 0xFF;
2204 	ipha->ipha_protocol = IPPROTO_UDP;
2205 	ipha->ipha_hdr_checksum = 0;
2206 	ipha->ipha_src = assoc->ipsa_srcaddr[0];
2207 	ipha->ipha_dst = assoc->ipsa_dstaddr[0];
2208 	udpha = (udpha_t *)(ipha + 1);
2209 	udpha->uha_src_port = (assoc->ipsa_local_nat_port != 0) ?
2210 	    assoc->ipsa_local_nat_port : htons(IPPORT_IKE_NATT);
2211 	udpha->uha_dst_port = (assoc->ipsa_remote_nat_port != 0) ?
2212 	    assoc->ipsa_remote_nat_port : htons(IPPORT_IKE_NATT);
2213 	udpha->uha_length = htons(sizeof (udpha_t) + 1);
2214 	udpha->uha_checksum = 0;
2215 	mp->b_wptr = (uint8_t *)(udpha + 1);
2216 	*(mp->b_wptr++) = 0xFF;
2217 
2218 	ipsec_mp = ipsec_alloc_ipsec_out(assoc->ipsa_netstack);
2219 	if (ipsec_mp == NULL) {
2220 		freeb(mp);
2221 		return;
2222 	}
2223 	ipsec_mp->b_cont = mp;
2224 	io = (ipsec_out_t *)ipsec_mp->b_rptr;
2225 	io->ipsec_out_zoneid =
2226 	    netstackid_to_zoneid(assoc->ipsa_netstack->netstack_stackid);
2227 
2228 	esp_prepare_udp(assoc->ipsa_netstack, mp, ipha);
2229 	ip_wput_ipsec_out(NULL, ipsec_mp, ipha, NULL, NULL);
2230 }
2231 
2232 static ipsec_status_t
2233 esp_submit_req_outbound(mblk_t *ipsec_mp, ipsa_t *assoc, uchar_t *icv_buf,
2234     uint_t payload_len)
2235 {
2236 	ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr;
2237 	uint_t auth_len;
2238 	crypto_call_req_t call_req;
2239 	mblk_t *esp_mp;
2240 	int kef_rc = CRYPTO_FAILED;
2241 	uint_t icv_len = assoc->ipsa_mac_len;
2242 	crypto_ctx_template_t auth_ctx_tmpl;
2243 	boolean_t do_auth;
2244 	boolean_t do_encr;
2245 	uint_t iv_len = assoc->ipsa_iv_len;
2246 	crypto_ctx_template_t encr_ctx_tmpl;
2247 	boolean_t is_natt = ((assoc->ipsa_flags & IPSA_F_NATT) != 0);
2248 	size_t esph_offset = (is_natt ? UDPH_SIZE : 0);
2249 	netstack_t	*ns = io->ipsec_out_ns;
2250 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
2251 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
2252 
2253 	esp3dbg(espstack, ("esp_submit_req_outbound:%s",
2254 	    is_natt ? "natt" : "not natt"));
2255 
2256 	ASSERT(io->ipsec_out_type == IPSEC_OUT);
2257 
2258 	/*
2259 	 * In case kEF queues and calls back, keep netstackid_t for
2260 	 * verification that the IP instance is still around in
2261 	 * esp_kcf_callback().
2262 	 */
2263 	io->ipsec_out_stackid = ns->netstack_stackid;
2264 
2265 	do_encr = assoc->ipsa_encr_alg != SADB_EALG_NULL;
2266 	do_auth = assoc->ipsa_auth_alg != SADB_AALG_NONE;
2267 
2268 	/*
2269 	 * Outbound IPsec packets are of the form:
2270 	 * IPSEC_OUT -> [IP,options] -> [ESP,IV] -> [data] -> [pad,ICV]
2271 	 * unless it's NATT, then it's
2272 	 * IPSEC_OUT -> [IP,options] -> [udp][ESP,IV] -> [data] -> [pad,ICV]
2273 	 * Get a pointer to the mblk containing the ESP header.
2274 	 */
2275 	ASSERT(ipsec_mp->b_cont != NULL && ipsec_mp->b_cont->b_cont != NULL);
2276 	esp_mp = ipsec_mp->b_cont->b_cont;
2277 
2278 	ESP_INIT_CALLREQ(&call_req);
2279 
2280 	if (do_auth) {
2281 		/* force asynchronous processing? */
2282 		if (ipss->ipsec_algs_exec_mode[IPSEC_ALG_AUTH] ==
2283 		    IPSEC_ALGS_EXEC_ASYNC)
2284 			call_req.cr_flag |= CRYPTO_ALWAYS_QUEUE;
2285 
2286 		/* authentication context template */
2287 		IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH,
2288 		    auth_ctx_tmpl);
2289 
2290 		/* where to store the computed mac */
2291 		ESP_INIT_CRYPTO_MAC(&io->ipsec_out_crypto_mac,
2292 		    icv_len, icv_buf);
2293 
2294 		/* authentication starts at the ESP header */
2295 		auth_len = payload_len + iv_len + sizeof (esph_t);
2296 		if (!do_encr) {
2297 			/* authentication only */
2298 			/* initialize input data argument */
2299 			ESP_INIT_CRYPTO_DATA(&io->ipsec_out_crypto_data,
2300 			    esp_mp, esph_offset, auth_len);
2301 
2302 			/* call the crypto framework */
2303 			kef_rc = crypto_mac(&assoc->ipsa_amech,
2304 			    &io->ipsec_out_crypto_data,
2305 			    &assoc->ipsa_kcfauthkey, auth_ctx_tmpl,
2306 			    &io->ipsec_out_crypto_mac, &call_req);
2307 		}
2308 	}
2309 
2310 	if (do_encr) {
2311 		/* force asynchronous processing? */
2312 		if (ipss->ipsec_algs_exec_mode[IPSEC_ALG_ENCR] ==
2313 		    IPSEC_ALGS_EXEC_ASYNC)
2314 			call_req.cr_flag |= CRYPTO_ALWAYS_QUEUE;
2315 
2316 		/* encryption context template */
2317 		IPSEC_CTX_TMPL(assoc, ipsa_encrtmpl, IPSEC_ALG_ENCR,
2318 		    encr_ctx_tmpl);
2319 
2320 		if (!do_auth) {
2321 			/* encryption only, skip mblk that contains ESP hdr */
2322 			/* initialize input data argument */
2323 			ESP_INIT_CRYPTO_DATA(&io->ipsec_out_crypto_data,
2324 			    esp_mp->b_cont, 0, payload_len);
2325 
2326 			/* specify IV */
2327 			io->ipsec_out_crypto_data.cd_miscdata =
2328 			    (char *)esp_mp->b_rptr + sizeof (esph_t) +
2329 			    esph_offset;
2330 
2331 			/* call the crypto framework */
2332 			kef_rc = crypto_encrypt(&assoc->ipsa_emech,
2333 			    &io->ipsec_out_crypto_data,
2334 			    &assoc->ipsa_kcfencrkey, encr_ctx_tmpl,
2335 			    NULL, &call_req);
2336 		}
2337 	}
2338 
2339 	if (do_auth && do_encr) {
2340 		/*
2341 		 * Encryption and authentication:
2342 		 * Pass the pointer to the mblk chain starting at the ESP
2343 		 * header to the framework. Skip the ESP header mblk
2344 		 * for encryption, which is reflected by an encryption
2345 		 * offset equal to the length of that mblk. Start
2346 		 * the authentication at the ESP header, i.e. use an
2347 		 * authentication offset of zero.
2348 		 */
2349 		ESP_INIT_CRYPTO_DUAL_DATA(&io->ipsec_out_crypto_dual_data,
2350 		    esp_mp, MBLKL(esp_mp), payload_len, esph_offset, auth_len);
2351 
2352 		/* specify IV */
2353 		io->ipsec_out_crypto_dual_data.dd_miscdata =
2354 		    (char *)esp_mp->b_rptr + sizeof (esph_t) + esph_offset;
2355 
2356 		/* call the framework */
2357 		kef_rc = crypto_encrypt_mac(&assoc->ipsa_emech,
2358 		    &assoc->ipsa_amech, NULL,
2359 		    &assoc->ipsa_kcfencrkey, &assoc->ipsa_kcfauthkey,
2360 		    encr_ctx_tmpl, auth_ctx_tmpl,
2361 		    &io->ipsec_out_crypto_dual_data,
2362 		    &io->ipsec_out_crypto_mac, &call_req);
2363 	}
2364 
2365 	switch (kef_rc) {
2366 	case CRYPTO_SUCCESS:
2367 		ESP_BUMP_STAT(espstack, crypto_sync);
2368 		esp_set_usetime(assoc, B_FALSE);
2369 		if (is_natt)
2370 			esp_prepare_udp(ns, ipsec_mp->b_cont,
2371 			    (ipha_t *)ipsec_mp->b_cont->b_rptr);
2372 		return (IPSEC_STATUS_SUCCESS);
2373 	case CRYPTO_QUEUED:
2374 		/* esp_kcf_callback() will be invoked on completion */
2375 		ESP_BUMP_STAT(espstack, crypto_async);
2376 		return (IPSEC_STATUS_PENDING);
2377 	}
2378 
2379 	esp_crypto_failed(ipsec_mp, B_TRUE, kef_rc, espstack);
2380 	return (IPSEC_STATUS_FAILED);
2381 }
2382 
2383 /*
2384  * Handle outbound IPsec processing for IPv4 and IPv6
2385  * On success returns B_TRUE, on failure returns B_FALSE and frees the
2386  * mblk chain ipsec_in_mp.
2387  */
2388 static ipsec_status_t
2389 esp_outbound(mblk_t *mp)
2390 {
2391 	mblk_t *ipsec_out_mp, *data_mp, *espmp, *tailmp;
2392 	ipsec_out_t *io;
2393 	ipha_t *ipha;
2394 	ip6_t *ip6h;
2395 	esph_t *esph;
2396 	uint_t af;
2397 	uint8_t *nhp;
2398 	uintptr_t divpoint, datalen, adj, padlen, i, alloclen;
2399 	uintptr_t esplen = sizeof (esph_t);
2400 	uint8_t protocol;
2401 	ipsa_t *assoc;
2402 	uint_t iv_len, mac_len = 0;
2403 	uchar_t *icv_buf;
2404 	udpha_t *udpha;
2405 	boolean_t is_natt = B_FALSE;
2406 	netstack_t	*ns;
2407 	ipsecesp_stack_t *espstack;
2408 	ipsec_stack_t	*ipss;
2409 
2410 	ipsec_out_mp = mp;
2411 	data_mp = ipsec_out_mp->b_cont;
2412 
2413 	io = (ipsec_out_t *)ipsec_out_mp->b_rptr;
2414 	ns = io->ipsec_out_ns;
2415 	espstack = ns->netstack_ipsecesp;
2416 	ipss = ns->netstack_ipsec;
2417 
2418 	ESP_BUMP_STAT(espstack, out_requests);
2419 
2420 	/*
2421 	 * <sigh> We have to copy the message here, because TCP (for example)
2422 	 * keeps a dupb() of the message lying around for retransmission.
2423 	 * Since ESP changes the whole of the datagram, we have to create our
2424 	 * own copy lest we clobber TCP's data.  Since we have to copy anyway,
2425 	 * we might as well make use of msgpullup() and get the mblk into one
2426 	 * contiguous piece!
2427 	 */
2428 	ipsec_out_mp->b_cont = msgpullup(data_mp, -1);
2429 	if (ipsec_out_mp->b_cont == NULL) {
2430 		esp0dbg(("esp_outbound: msgpullup() failed, "
2431 		    "dropping packet.\n"));
2432 		ipsec_out_mp->b_cont = data_mp;
2433 		/*
2434 		 * TODO:  Find the outbound IRE for this packet and
2435 		 * pass it to ip_drop_packet().
2436 		 */
2437 		ip_drop_packet(ipsec_out_mp, B_FALSE, NULL, NULL,
2438 		    DROPPER(ipss, ipds_esp_nomem),
2439 		    &espstack->esp_dropper);
2440 		return (IPSEC_STATUS_FAILED);
2441 	} else {
2442 		freemsg(data_mp);
2443 		data_mp = ipsec_out_mp->b_cont;
2444 	}
2445 
2446 	/*
2447 	 * Reality check....
2448 	 */
2449 
2450 	ipha = (ipha_t *)data_mp->b_rptr;  /* So we can call esp_acquire(). */
2451 
2452 	if (io->ipsec_out_v4) {
2453 		af = AF_INET;
2454 		divpoint = IPH_HDR_LENGTH(ipha);
2455 		datalen = ntohs(ipha->ipha_length) - divpoint;
2456 		nhp = (uint8_t *)&ipha->ipha_protocol;
2457 	} else {
2458 		ip6_pkt_t ipp;
2459 
2460 		af = AF_INET6;
2461 		ip6h = (ip6_t *)ipha;
2462 		bzero(&ipp, sizeof (ipp));
2463 		divpoint = ip_find_hdr_v6(data_mp, ip6h, &ipp, NULL);
2464 		if (ipp.ipp_dstopts != NULL &&
2465 		    ipp.ipp_dstopts->ip6d_nxt != IPPROTO_ROUTING) {
2466 			/*
2467 			 * Destination options are tricky.  If we get in here,
2468 			 * then we have a terminal header following the
2469 			 * destination options.  We need to adjust backwards
2470 			 * so we insert ESP BEFORE the destination options
2471 			 * bag.  (So that the dstopts get encrypted!)
2472 			 *
2473 			 * Since this is for outbound packets only, we know
2474 			 * that non-terminal destination options only precede
2475 			 * routing headers.
2476 			 */
2477 			divpoint -= ipp.ipp_dstoptslen;
2478 		}
2479 		datalen = ntohs(ip6h->ip6_plen) + sizeof (ip6_t) - divpoint;
2480 
2481 		if (ipp.ipp_rthdr != NULL) {
2482 			nhp = &ipp.ipp_rthdr->ip6r_nxt;
2483 		} else if (ipp.ipp_hopopts != NULL) {
2484 			nhp = &ipp.ipp_hopopts->ip6h_nxt;
2485 		} else {
2486 			ASSERT(divpoint == sizeof (ip6_t));
2487 			/* It's probably IP + ESP. */
2488 			nhp = &ip6h->ip6_nxt;
2489 		}
2490 	}
2491 	assoc = io->ipsec_out_esp_sa;
2492 	ASSERT(assoc != NULL);
2493 
2494 	if (assoc->ipsa_auth_alg != SADB_AALG_NONE)
2495 		mac_len = assoc->ipsa_mac_len;
2496 
2497 	if (assoc->ipsa_flags & IPSA_F_NATT) {
2498 		/* wedge in fake UDP */
2499 		is_natt = B_TRUE;
2500 		esplen += UDPH_SIZE;
2501 	}
2502 
2503 	/*
2504 	 * Set up ESP header and encryption padding for ENCR PI request.
2505 	 */
2506 
2507 	/* Determine the padding length.  Pad to 4-bytes for no-encryption. */
2508 	if (assoc->ipsa_encr_alg != SADB_EALG_NULL) {
2509 		iv_len = assoc->ipsa_iv_len;
2510 
2511 		/*
2512 		 * Include the two additional bytes (hence the - 2) for the
2513 		 * padding length and the next header.  Take this into account
2514 		 * when calculating the actual length of the padding.
2515 		 */
2516 		ASSERT(ISP2(iv_len));
2517 		padlen = ((unsigned)(iv_len - datalen - 2)) & (iv_len - 1);
2518 	} else {
2519 		iv_len = 0;
2520 		padlen = ((unsigned)(sizeof (uint32_t) - datalen - 2)) &
2521 		    (sizeof (uint32_t) - 1);
2522 	}
2523 
2524 	/* Allocate ESP header and IV. */
2525 	esplen += iv_len;
2526 
2527 	/*
2528 	 * Update association byte-count lifetimes.  Don't forget to take
2529 	 * into account the padding length and next-header (hence the + 2).
2530 	 *
2531 	 * Use the amount of data fed into the "encryption algorithm".  This
2532 	 * is the IV, the data length, the padding length, and the final two
2533 	 * bytes (padlen, and next-header).
2534 	 *
2535 	 */
2536 
2537 	if (!esp_age_bytes(assoc, datalen + padlen + iv_len + 2, B_FALSE)) {
2538 		/*
2539 		 * TODO:  Find the outbound IRE for this packet and
2540 		 * pass it to ip_drop_packet().
2541 		 */
2542 		ip_drop_packet(mp, B_FALSE, NULL, NULL,
2543 		    DROPPER(ipss, ipds_esp_bytes_expire),
2544 		    &espstack->esp_dropper);
2545 		return (IPSEC_STATUS_FAILED);
2546 	}
2547 
2548 	espmp = allocb(esplen, BPRI_HI);
2549 	if (espmp == NULL) {
2550 		ESP_BUMP_STAT(espstack, out_discards);
2551 		esp1dbg(espstack, ("esp_outbound: can't allocate espmp.\n"));
2552 		/*
2553 		 * TODO:  Find the outbound IRE for this packet and
2554 		 * pass it to ip_drop_packet().
2555 		 */
2556 		ip_drop_packet(mp, B_FALSE, NULL, NULL,
2557 		    DROPPER(ipss, ipds_esp_nomem),
2558 		    &espstack->esp_dropper);
2559 		return (IPSEC_STATUS_FAILED);
2560 	}
2561 	espmp->b_wptr += esplen;
2562 	esph = (esph_t *)espmp->b_rptr;
2563 
2564 	if (is_natt) {
2565 		esp3dbg(espstack, ("esp_outbound: NATT"));
2566 
2567 		udpha = (udpha_t *)espmp->b_rptr;
2568 		udpha->uha_src_port = (assoc->ipsa_local_nat_port != 0) ?
2569 		    assoc->ipsa_local_nat_port : htons(IPPORT_IKE_NATT);
2570 		udpha->uha_dst_port = (assoc->ipsa_remote_nat_port != 0) ?
2571 		    assoc->ipsa_remote_nat_port : htons(IPPORT_IKE_NATT);
2572 		/*
2573 		 * Set the checksum to 0, so that the esp_prepare_udp() call
2574 		 * can do the right thing.
2575 		 */
2576 		udpha->uha_checksum = 0;
2577 		esph = (esph_t *)(udpha + 1);
2578 	}
2579 
2580 	esph->esph_spi = assoc->ipsa_spi;
2581 
2582 	esph->esph_replay = htonl(atomic_add_32_nv(&assoc->ipsa_replay, 1));
2583 	if (esph->esph_replay == 0 && assoc->ipsa_replay_wsize != 0) {
2584 		/*
2585 		 * XXX We have replay counter wrapping.
2586 		 * We probably want to nuke this SA (and its peer).
2587 		 */
2588 		ipsec_assocfailure(info.mi_idnum, 0, 0,
2589 		    SL_ERROR | SL_CONSOLE | SL_WARN,
2590 		    "Outbound ESP SA (0x%x, %s) has wrapped sequence.\n",
2591 		    esph->esph_spi, assoc->ipsa_dstaddr, af,
2592 		    espstack->ipsecesp_netstack);
2593 
2594 		ESP_BUMP_STAT(espstack, out_discards);
2595 		sadb_replay_delete(assoc);
2596 		/*
2597 		 * TODO:  Find the outbound IRE for this packet and
2598 		 * pass it to ip_drop_packet().
2599 		 */
2600 		ip_drop_packet(mp, B_FALSE, NULL, NULL,
2601 		    DROPPER(ipss, ipds_esp_replay),
2602 		    &espstack->esp_dropper);
2603 		return (IPSEC_STATUS_FAILED);
2604 	}
2605 
2606 	/*
2607 	 * Set the IV to a random quantity.  We do not require the
2608 	 * highest quality random bits, but for best security with CBC
2609 	 * mode ciphers, the value must be unlikely to repeat and also
2610 	 * must not be known in advance to an adversary capable of
2611 	 * influencing the plaintext.
2612 	 */
2613 	(void) random_get_pseudo_bytes((uint8_t *)(esph + 1), iv_len);
2614 
2615 	/* Fix the IP header. */
2616 	alloclen = padlen + 2 + mac_len;
2617 	adj = alloclen + (espmp->b_wptr - espmp->b_rptr);
2618 
2619 	protocol = *nhp;
2620 
2621 	if (io->ipsec_out_v4) {
2622 		ipha->ipha_length = htons(ntohs(ipha->ipha_length) + adj);
2623 		if (is_natt) {
2624 			*nhp = IPPROTO_UDP;
2625 			udpha->uha_length = htons(ntohs(ipha->ipha_length) -
2626 			    IPH_HDR_LENGTH(ipha));
2627 		} else {
2628 			*nhp = IPPROTO_ESP;
2629 		}
2630 		ipha->ipha_hdr_checksum = 0;
2631 		ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
2632 	} else {
2633 		ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) + adj);
2634 		*nhp = IPPROTO_ESP;
2635 	}
2636 
2637 	/* I've got the two ESP mblks, now insert them. */
2638 
2639 	esp2dbg(espstack, ("data_mp before outbound ESP adjustment:\n"));
2640 	esp2dbg(espstack, (dump_msg(data_mp)));
2641 
2642 	if (!esp_insert_esp(data_mp, espmp, divpoint, espstack)) {
2643 		ESP_BUMP_STAT(espstack, out_discards);
2644 		/* NOTE:  esp_insert_esp() only fails if there's no memory. */
2645 		/*
2646 		 * TODO:  Find the outbound IRE for this packet and
2647 		 * pass it to ip_drop_packet().
2648 		 */
2649 		ip_drop_packet(mp, B_FALSE, NULL, NULL,
2650 		    DROPPER(ipss, ipds_esp_nomem),
2651 		    &espstack->esp_dropper);
2652 		freeb(espmp);
2653 		return (IPSEC_STATUS_FAILED);
2654 	}
2655 
2656 	/* Append padding (and leave room for ICV). */
2657 	for (tailmp = data_mp; tailmp->b_cont != NULL; tailmp = tailmp->b_cont)
2658 		;
2659 	if (tailmp->b_wptr + alloclen > tailmp->b_datap->db_lim) {
2660 		tailmp->b_cont = allocb(alloclen, BPRI_HI);
2661 		if (tailmp->b_cont == NULL) {
2662 			ESP_BUMP_STAT(espstack, out_discards);
2663 			esp0dbg(("esp_outbound:  Can't allocate tailmp.\n"));
2664 			/*
2665 			 * TODO:  Find the outbound IRE for this packet and
2666 			 * pass it to ip_drop_packet().
2667 			 */
2668 			ip_drop_packet(mp, B_FALSE, NULL, NULL,
2669 			    DROPPER(ipss, ipds_esp_nomem),
2670 			    &espstack->esp_dropper);
2671 			return (IPSEC_STATUS_FAILED);
2672 		}
2673 		tailmp = tailmp->b_cont;
2674 	}
2675 
2676 	/*
2677 	 * If there's padding, N bytes of padding must be of the form 0x1,
2678 	 * 0x2, 0x3... 0xN.
2679 	 */
2680 	for (i = 0; i < padlen; ) {
2681 		i++;
2682 		*tailmp->b_wptr++ = i;
2683 	}
2684 	*tailmp->b_wptr++ = i;
2685 	*tailmp->b_wptr++ = protocol;
2686 
2687 	esp2dbg(espstack, ("data_Mp before encryption:\n"));
2688 	esp2dbg(espstack, (dump_msg(data_mp)));
2689 
2690 	/*
2691 	 * The packet is eligible for hardware acceleration if the
2692 	 * following conditions are satisfied:
2693 	 *
2694 	 * 1. the packet will not be fragmented
2695 	 * 2. the provider supports the algorithms specified by SA
2696 	 * 3. there is no pending control message being exchanged
2697 	 * 4. snoop is not attached
2698 	 * 5. the destination address is not a multicast address
2699 	 *
2700 	 * All five of these conditions are checked by IP prior to
2701 	 * sending the packet to ESP.
2702 	 *
2703 	 * But We, and We Alone, can, nay MUST check if the packet
2704 	 * is over NATT, and then disqualify it from hardware
2705 	 * acceleration.
2706 	 */
2707 
2708 	if (io->ipsec_out_is_capab_ill && !(assoc->ipsa_flags & IPSA_F_NATT)) {
2709 		return (esp_outbound_accelerated(ipsec_out_mp, mac_len));
2710 	}
2711 	ESP_BUMP_STAT(espstack, noaccel);
2712 
2713 	/*
2714 	 * Okay.  I've set up the pre-encryption ESP.  Let's do it!
2715 	 */
2716 
2717 	if (mac_len > 0) {
2718 		ASSERT(tailmp->b_wptr + mac_len <= tailmp->b_datap->db_lim);
2719 		icv_buf = tailmp->b_wptr;
2720 		tailmp->b_wptr += mac_len;
2721 	} else {
2722 		icv_buf = NULL;
2723 	}
2724 
2725 	return (esp_submit_req_outbound(ipsec_out_mp, assoc, icv_buf,
2726 	    datalen + padlen + 2));
2727 }
2728 
2729 /*
2730  * IP calls this to validate the ICMP errors that
2731  * we got from the network.
2732  */
2733 ipsec_status_t
2734 ipsecesp_icmp_error(mblk_t *ipsec_mp)
2735 {
2736 	ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr;
2737 	boolean_t is_inbound = (ii->ipsec_in_type == IPSEC_IN);
2738 	netstack_t	*ns;
2739 	ipsecesp_stack_t *espstack;
2740 	ipsec_stack_t	*ipss;
2741 
2742 	if (is_inbound) {
2743 		ns = ii->ipsec_in_ns;
2744 	} else {
2745 		ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr;
2746 
2747 		ns = io->ipsec_out_ns;
2748 	}
2749 	espstack = ns->netstack_ipsecesp;
2750 	ipss = ns->netstack_ipsec;
2751 
2752 	/*
2753 	 * Unless we get an entire packet back, this function is useless.
2754 	 * Why?
2755 	 *
2756 	 * 1.)	Partial packets are useless, because the "next header"
2757 	 *	is at the end of the decrypted ESP packet.  Without the
2758 	 *	whole packet, this is useless.
2759 	 *
2760 	 * 2.)	If we every use a stateful cipher, such as a stream or a
2761 	 *	one-time pad, we can't do anything.
2762 	 *
2763 	 * Since the chances of us getting an entire packet back are very
2764 	 * very small, we discard here.
2765 	 */
2766 	IP_ESP_BUMP_STAT(ipss, in_discards);
2767 	ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL,
2768 	    DROPPER(ipss, ipds_esp_icmp),
2769 	    &espstack->esp_dropper);
2770 	return (IPSEC_STATUS_FAILED);
2771 }
2772 
2773 /*
2774  * ESP module read put routine.
2775  */
2776 /* ARGSUSED */
2777 static void
2778 ipsecesp_rput(queue_t *q, mblk_t *mp)
2779 {
2780 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
2781 
2782 	ASSERT(mp->b_datap->db_type != M_CTL);	/* No more IRE_DB_REQ. */
2783 
2784 	switch (mp->b_datap->db_type) {
2785 	case M_PROTO:
2786 	case M_PCPROTO:
2787 		/* TPI message of some sort. */
2788 		switch (*((t_scalar_t *)mp->b_rptr)) {
2789 		case T_BIND_ACK:
2790 			esp3dbg(espstack,
2791 			    ("Thank you IP from ESP for T_BIND_ACK\n"));
2792 			break;
2793 		case T_ERROR_ACK:
2794 			cmn_err(CE_WARN,
2795 			    "ipsecesp:  ESP received T_ERROR_ACK from IP.");
2796 			/*
2797 			 * Make esp_sadb.s_ip_q NULL, and in the
2798 			 * future, perhaps try again.
2799 			 */
2800 			espstack->esp_sadb.s_ip_q = NULL;
2801 			break;
2802 		case T_OK_ACK:
2803 			/* Probably from a (rarely sent) T_UNBIND_REQ. */
2804 			break;
2805 		default:
2806 			esp0dbg(("Unknown M_{,PC}PROTO message.\n"));
2807 		}
2808 		freemsg(mp);
2809 		break;
2810 	default:
2811 		/* For now, passthru message. */
2812 		esp2dbg(espstack, ("ESP got unknown mblk type %d.\n",
2813 		    mp->b_datap->db_type));
2814 		putnext(q, mp);
2815 	}
2816 }
2817 
2818 /*
2819  * Construct an SADB_REGISTER message with the current algorithms.
2820  */
2821 static boolean_t
2822 esp_register_out(uint32_t sequence, uint32_t pid, uint_t serial,
2823     ipsecesp_stack_t *espstack)
2824 {
2825 	mblk_t *pfkey_msg_mp, *keysock_out_mp;
2826 	sadb_msg_t *samsg;
2827 	sadb_supported_t *sasupp_auth = NULL;
2828 	sadb_supported_t *sasupp_encr = NULL;
2829 	sadb_alg_t *saalg;
2830 	uint_t allocsize = sizeof (*samsg);
2831 	uint_t i, numalgs_snap;
2832 	int current_aalgs;
2833 	ipsec_alginfo_t **authalgs;
2834 	uint_t num_aalgs;
2835 	int current_ealgs;
2836 	ipsec_alginfo_t **encralgs;
2837 	uint_t num_ealgs;
2838 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
2839 
2840 	/* Allocate the KEYSOCK_OUT. */
2841 	keysock_out_mp = sadb_keysock_out(serial);
2842 	if (keysock_out_mp == NULL) {
2843 		esp0dbg(("esp_register_out: couldn't allocate mblk.\n"));
2844 		return (B_FALSE);
2845 	}
2846 
2847 	/*
2848 	 * Allocate the PF_KEY message that follows KEYSOCK_OUT.
2849 	 */
2850 
2851 	mutex_enter(&ipss->ipsec_alg_lock);
2852 
2853 	/*
2854 	 * Fill SADB_REGISTER message's algorithm descriptors.  Hold
2855 	 * down the lock while filling it.
2856 	 *
2857 	 * Return only valid algorithms, so the number of algorithms
2858 	 * to send up may be less than the number of algorithm entries
2859 	 * in the table.
2860 	 */
2861 	authalgs = ipss->ipsec_alglists[IPSEC_ALG_AUTH];
2862 	for (num_aalgs = 0, i = 0; i < IPSEC_MAX_ALGS; i++)
2863 		if (authalgs[i] != NULL && ALG_VALID(authalgs[i]))
2864 			num_aalgs++;
2865 
2866 	if (num_aalgs != 0) {
2867 		allocsize += (num_aalgs * sizeof (*saalg));
2868 		allocsize += sizeof (*sasupp_auth);
2869 	}
2870 	encralgs = ipss->ipsec_alglists[IPSEC_ALG_ENCR];
2871 	for (num_ealgs = 0, i = 0; i < IPSEC_MAX_ALGS; i++)
2872 		if (encralgs[i] != NULL && ALG_VALID(encralgs[i]))
2873 			num_ealgs++;
2874 
2875 	if (num_ealgs != 0) {
2876 		allocsize += (num_ealgs * sizeof (*saalg));
2877 		allocsize += sizeof (*sasupp_encr);
2878 	}
2879 	keysock_out_mp->b_cont = allocb(allocsize, BPRI_HI);
2880 	if (keysock_out_mp->b_cont == NULL) {
2881 		mutex_exit(&ipss->ipsec_alg_lock);
2882 		freemsg(keysock_out_mp);
2883 		return (B_FALSE);
2884 	}
2885 
2886 	pfkey_msg_mp = keysock_out_mp->b_cont;
2887 	pfkey_msg_mp->b_wptr += allocsize;
2888 	if (num_aalgs != 0) {
2889 		sasupp_auth = (sadb_supported_t *)
2890 		    (pfkey_msg_mp->b_rptr + sizeof (*samsg));
2891 		saalg = (sadb_alg_t *)(sasupp_auth + 1);
2892 
2893 		ASSERT(((ulong_t)saalg & 0x7) == 0);
2894 
2895 		numalgs_snap = 0;
2896 		for (i = 0;
2897 		    ((i < IPSEC_MAX_ALGS) && (numalgs_snap < num_aalgs));
2898 		    i++) {
2899 			if (authalgs[i] == NULL || !ALG_VALID(authalgs[i]))
2900 				continue;
2901 
2902 			saalg->sadb_alg_id = authalgs[i]->alg_id;
2903 			saalg->sadb_alg_ivlen = 0;
2904 			saalg->sadb_alg_minbits	= authalgs[i]->alg_ef_minbits;
2905 			saalg->sadb_alg_maxbits	= authalgs[i]->alg_ef_maxbits;
2906 			saalg->sadb_x_alg_defincr = authalgs[i]->alg_ef_default;
2907 			saalg->sadb_x_alg_increment =
2908 			    authalgs[i]->alg_increment;
2909 			numalgs_snap++;
2910 			saalg++;
2911 		}
2912 		ASSERT(numalgs_snap == num_aalgs);
2913 #ifdef DEBUG
2914 		/*
2915 		 * Reality check to make sure I snagged all of the
2916 		 * algorithms.
2917 		 */
2918 		for (; i < IPSEC_MAX_ALGS; i++) {
2919 			if (authalgs[i] != NULL && ALG_VALID(authalgs[i])) {
2920 				cmn_err(CE_PANIC, "esp_register_out()! "
2921 				    "Missed aalg #%d.\n", i);
2922 			}
2923 		}
2924 #endif /* DEBUG */
2925 	} else {
2926 		saalg = (sadb_alg_t *)(pfkey_msg_mp->b_rptr + sizeof (*samsg));
2927 	}
2928 
2929 	if (num_ealgs != 0) {
2930 		sasupp_encr = (sadb_supported_t *)saalg;
2931 		saalg = (sadb_alg_t *)(sasupp_encr + 1);
2932 
2933 		numalgs_snap = 0;
2934 		for (i = 0;
2935 		    ((i < IPSEC_MAX_ALGS) && (numalgs_snap < num_ealgs)); i++) {
2936 			if (encralgs[i] == NULL || !ALG_VALID(encralgs[i]))
2937 				continue;
2938 			saalg->sadb_alg_id = encralgs[i]->alg_id;
2939 			saalg->sadb_alg_ivlen = encralgs[i]->alg_datalen;
2940 			saalg->sadb_alg_minbits	= encralgs[i]->alg_ef_minbits;
2941 			saalg->sadb_alg_maxbits	= encralgs[i]->alg_ef_maxbits;
2942 			saalg->sadb_x_alg_defincr = encralgs[i]->alg_ef_default;
2943 			saalg->sadb_x_alg_increment =
2944 			    encralgs[i]->alg_increment;
2945 			numalgs_snap++;
2946 			saalg++;
2947 		}
2948 		ASSERT(numalgs_snap == num_ealgs);
2949 #ifdef DEBUG
2950 		/*
2951 		 * Reality check to make sure I snagged all of the
2952 		 * algorithms.
2953 		 */
2954 		for (; i < IPSEC_MAX_ALGS; i++) {
2955 			if (encralgs[i] != NULL && ALG_VALID(encralgs[i])) {
2956 				cmn_err(CE_PANIC, "esp_register_out()! "
2957 				    "Missed ealg #%d.\n", i);
2958 			}
2959 		}
2960 #endif /* DEBUG */
2961 	}
2962 
2963 	current_aalgs = num_aalgs;
2964 	current_ealgs = num_ealgs;
2965 
2966 	mutex_exit(&ipss->ipsec_alg_lock);
2967 
2968 	/* Now fill the rest of the SADB_REGISTER message. */
2969 
2970 	samsg = (sadb_msg_t *)pfkey_msg_mp->b_rptr;
2971 	samsg->sadb_msg_version = PF_KEY_V2;
2972 	samsg->sadb_msg_type = SADB_REGISTER;
2973 	samsg->sadb_msg_errno = 0;
2974 	samsg->sadb_msg_satype = SADB_SATYPE_ESP;
2975 	samsg->sadb_msg_len = SADB_8TO64(allocsize);
2976 	samsg->sadb_msg_reserved = 0;
2977 	/*
2978 	 * Assume caller has sufficient sequence/pid number info.  If it's one
2979 	 * from me over a new alg., I could give two hoots about sequence.
2980 	 */
2981 	samsg->sadb_msg_seq = sequence;
2982 	samsg->sadb_msg_pid = pid;
2983 
2984 	if (sasupp_auth != NULL) {
2985 		sasupp_auth->sadb_supported_len = SADB_8TO64(
2986 		    sizeof (*sasupp_auth) + sizeof (*saalg) * current_aalgs);
2987 		sasupp_auth->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH;
2988 		sasupp_auth->sadb_supported_reserved = 0;
2989 	}
2990 
2991 	if (sasupp_encr != NULL) {
2992 		sasupp_encr->sadb_supported_len = SADB_8TO64(
2993 		    sizeof (*sasupp_encr) + sizeof (*saalg) * current_ealgs);
2994 		sasupp_encr->sadb_supported_exttype =
2995 		    SADB_EXT_SUPPORTED_ENCRYPT;
2996 		sasupp_encr->sadb_supported_reserved = 0;
2997 	}
2998 
2999 	if (espstack->esp_pfkey_q != NULL)
3000 		putnext(espstack->esp_pfkey_q, keysock_out_mp);
3001 	else {
3002 		freemsg(keysock_out_mp);
3003 		return (B_FALSE);
3004 	}
3005 
3006 	return (B_TRUE);
3007 }
3008 
3009 /*
3010  * Invoked when the algorithm table changes. Causes SADB_REGISTER
3011  * messages continaining the current list of algorithms to be
3012  * sent up to the ESP listeners.
3013  */
3014 void
3015 ipsecesp_algs_changed(netstack_t *ns)
3016 {
3017 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
3018 
3019 	/*
3020 	 * Time to send a PF_KEY SADB_REGISTER message to ESP listeners
3021 	 * everywhere.  (The function itself checks for NULL esp_pfkey_q.)
3022 	 */
3023 	(void) esp_register_out(0, 0, 0, espstack);
3024 }
3025 
3026 /*
3027  * taskq_dispatch handler.
3028  */
3029 static void
3030 inbound_task(void *arg)
3031 {
3032 	esph_t *esph;
3033 	mblk_t *mp = (mblk_t *)arg;
3034 	ipsec_in_t *ii = (ipsec_in_t *)mp->b_rptr;
3035 	netstack_t		*ns = ii->ipsec_in_ns;
3036 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
3037 	int ipsec_rc;
3038 
3039 	esp2dbg(espstack, ("in ESP inbound_task"));
3040 	ASSERT(espstack != NULL);
3041 
3042 	esph = ipsec_inbound_esp_sa(mp, ns);
3043 	if (esph == NULL)
3044 		return;
3045 	ASSERT(ii->ipsec_in_esp_sa != NULL);
3046 	ipsec_rc = ii->ipsec_in_esp_sa->ipsa_input_func(mp, esph);
3047 	if (ipsec_rc != IPSEC_STATUS_SUCCESS)
3048 		return;
3049 	ip_fanout_proto_again(mp, NULL, NULL, NULL);
3050 }
3051 
3052 /*
3053  * Now that weak-key passed, actually ADD the security association, and
3054  * send back a reply ADD message.
3055  */
3056 static int
3057 esp_add_sa_finish(mblk_t *mp, sadb_msg_t *samsg, keysock_in_t *ksi,
3058     int *diagnostic, ipsecesp_stack_t *espstack)
3059 {
3060 	isaf_t *primary, *secondary, *inbound, *outbound;
3061 	sadb_sa_t *assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SA];
3062 	sadb_address_t *dstext =
3063 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3064 	struct sockaddr_in *dst;
3065 	struct sockaddr_in6 *dst6;
3066 	boolean_t is_ipv4, clone = B_FALSE, is_inbound = B_FALSE;
3067 	uint32_t *dstaddr;
3068 	ipsa_t *larval = NULL;
3069 	ipsacq_t *acqrec;
3070 	iacqf_t *acq_bucket;
3071 	mblk_t *acq_msgs = NULL;
3072 	int rc;
3073 	sadb_t *sp;
3074 	int outhash;
3075 	mblk_t *lpkt;
3076 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
3077 
3078 	/*
3079 	 * Locate the appropriate table(s).
3080 	 */
3081 
3082 	dst = (struct sockaddr_in *)(dstext + 1);
3083 	dst6 = (struct sockaddr_in6 *)dst;
3084 	is_ipv4 = (dst->sin_family == AF_INET);
3085 	if (is_ipv4) {
3086 		sp = &espstack->esp_sadb.s_v4;
3087 		dstaddr = (uint32_t *)(&dst->sin_addr);
3088 		outhash = OUTBOUND_HASH_V4(sp, *(ipaddr_t *)dstaddr);
3089 	} else {
3090 		sp = &espstack->esp_sadb.s_v6;
3091 		dstaddr = (uint32_t *)(&dst6->sin6_addr);
3092 		outhash = OUTBOUND_HASH_V6(sp, *(in6_addr_t *)dstaddr);
3093 	}
3094 
3095 	inbound = INBOUND_BUCKET(sp, assoc->sadb_sa_spi);
3096 	outbound = &sp->sdb_of[outhash];
3097 
3098 	switch (ksi->ks_in_dsttype) {
3099 	case KS_IN_ADDR_MBCAST:
3100 		clone = B_TRUE;	/* All mcast SAs can be bidirectional */
3101 		/* FALLTHRU */
3102 	case KS_IN_ADDR_ME:
3103 		primary = inbound;
3104 		secondary = outbound;
3105 		/*
3106 		 * If the source address is either one of mine, or unspecified
3107 		 * (which is best summed up by saying "not 'not mine'"),
3108 		 * then the association is potentially bi-directional,
3109 		 * in that it can be used for inbound traffic and outbound
3110 		 * traffic.  The best example of such an SA is a multicast
3111 		 * SA (which allows me to receive the outbound traffic).
3112 		 */
3113 		if (ksi->ks_in_srctype != KS_IN_ADDR_NOTME)
3114 			clone = B_TRUE;
3115 		is_inbound = B_TRUE;
3116 		break;
3117 	case KS_IN_ADDR_NOTME:
3118 		primary = outbound;
3119 		secondary = inbound;
3120 		/*
3121 		 * If the source address literally not mine (either
3122 		 * unspecified or not mine), then this SA may have an
3123 		 * address that WILL be mine after some configuration.
3124 		 * We pay the price for this by making it a bi-directional
3125 		 * SA.
3126 		 */
3127 		if (ksi->ks_in_srctype != KS_IN_ADDR_ME)
3128 			clone = B_TRUE;
3129 		break;
3130 	default:
3131 		*diagnostic = SADB_X_DIAGNOSTIC_BAD_DST;
3132 		return (EINVAL);
3133 	}
3134 
3135 	/*
3136 	 * Find a ACQUIRE list entry if possible.  If we've added an SA that
3137 	 * suits the needs of an ACQUIRE list entry, we can eliminate the
3138 	 * ACQUIRE list entry and transmit the enqueued packets.  Use the
3139 	 * high-bit of the sequence number to queue it.  Key off destination
3140 	 * addr, and change acqrec's state.
3141 	 */
3142 
3143 	if (samsg->sadb_msg_seq & IACQF_LOWEST_SEQ) {
3144 		acq_bucket = &sp->sdb_acq[outhash];
3145 		mutex_enter(&acq_bucket->iacqf_lock);
3146 		for (acqrec = acq_bucket->iacqf_ipsacq; acqrec != NULL;
3147 		    acqrec = acqrec->ipsacq_next) {
3148 			mutex_enter(&acqrec->ipsacq_lock);
3149 			/*
3150 			 * Q:  I only check sequence.  Should I check dst?
3151 			 * A: Yes, check dest because those are the packets
3152 			 *    that are queued up.
3153 			 */
3154 			if (acqrec->ipsacq_seq == samsg->sadb_msg_seq &&
3155 			    IPSA_ARE_ADDR_EQUAL(dstaddr,
3156 			    acqrec->ipsacq_dstaddr, acqrec->ipsacq_addrfam))
3157 				break;
3158 			mutex_exit(&acqrec->ipsacq_lock);
3159 		}
3160 		if (acqrec != NULL) {
3161 			/*
3162 			 * AHA!  I found an ACQUIRE record for this SA.
3163 			 * Grab the msg list, and free the acquire record.
3164 			 * I already am holding the lock for this record,
3165 			 * so all I have to do is free it.
3166 			 */
3167 			acq_msgs = acqrec->ipsacq_mp;
3168 			acqrec->ipsacq_mp = NULL;
3169 			mutex_exit(&acqrec->ipsacq_lock);
3170 			sadb_destroy_acquire(acqrec,
3171 			    espstack->ipsecesp_netstack);
3172 		}
3173 		mutex_exit(&acq_bucket->iacqf_lock);
3174 	}
3175 
3176 	/*
3177 	 * Find PF_KEY message, and see if I'm an update.  If so, find entry
3178 	 * in larval list (if there).
3179 	 */
3180 
3181 	if (samsg->sadb_msg_type == SADB_UPDATE) {
3182 		mutex_enter(&inbound->isaf_lock);
3183 		larval = ipsec_getassocbyspi(inbound, assoc->sadb_sa_spi,
3184 		    ALL_ZEROES_PTR, dstaddr, dst->sin_family);
3185 		mutex_exit(&inbound->isaf_lock);
3186 
3187 		if (larval == NULL) {
3188 			esp0dbg(("Larval update, but larval disappeared.\n"));
3189 			return (ESRCH);
3190 		} /* Else sadb_common_add unlinks it for me! */
3191 	}
3192 
3193 	lpkt = NULL;
3194 	if (larval != NULL)
3195 		lpkt = sadb_clear_lpkt(larval);
3196 
3197 	rc = sadb_common_add(espstack->esp_sadb.s_ip_q, espstack->esp_pfkey_q,
3198 	    mp, samsg, ksi, primary, secondary, larval, clone, is_inbound,
3199 	    diagnostic, espstack->ipsecesp_netstack);
3200 
3201 	if (rc == 0 && lpkt != NULL) {
3202 		rc = !taskq_dispatch(esp_taskq, inbound_task,
3203 		    (void *) lpkt, TQ_NOSLEEP);
3204 	}
3205 
3206 	if (rc != 0) {
3207 		ip_drop_packet(lpkt, B_TRUE, NULL, NULL,
3208 		    DROPPER(ipss, ipds_sadb_inlarval_timeout),
3209 		    &espstack->esp_dropper);
3210 	}
3211 
3212 	/*
3213 	 * How much more stack will I create with all of these
3214 	 * esp_outbound() calls?
3215 	 */
3216 
3217 	while (acq_msgs != NULL) {
3218 		mblk_t *mp = acq_msgs;
3219 
3220 		acq_msgs = acq_msgs->b_next;
3221 		mp->b_next = NULL;
3222 		if (rc == 0) {
3223 			if (ipsec_outbound_sa(mp, IPPROTO_ESP)) {
3224 				((ipsec_out_t *)(mp->b_rptr))->
3225 				    ipsec_out_esp_done = B_TRUE;
3226 				if (esp_outbound(mp) == IPSEC_STATUS_SUCCESS) {
3227 					ipha_t *ipha;
3228 
3229 					/* do AH processing if needed */
3230 					if (!esp_do_outbound_ah(mp))
3231 						continue;
3232 
3233 					ipha = (ipha_t *)mp->b_cont->b_rptr;
3234 
3235 					/* finish IPsec processing */
3236 					if (is_ipv4) {
3237 						ip_wput_ipsec_out(NULL, mp,
3238 						    ipha, NULL, NULL);
3239 					} else {
3240 						ip6_t *ip6h = (ip6_t *)ipha;
3241 						ip_wput_ipsec_out_v6(NULL,
3242 						    mp, ip6h, NULL, NULL);
3243 					}
3244 				}
3245 				continue;
3246 			}
3247 		}
3248 		ESP_BUMP_STAT(espstack, out_discards);
3249 		ip_drop_packet(mp, B_FALSE, NULL, NULL,
3250 		    DROPPER(ipss, ipds_sadb_acquire_timeout),
3251 		    &espstack->esp_dropper);
3252 	}
3253 
3254 	return (rc);
3255 }
3256 
3257 /*
3258  * Add new ESP security association.  This may become a generic AH/ESP
3259  * routine eventually.
3260  */
3261 static int
3262 esp_add_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic, netstack_t *ns)
3263 {
3264 	sadb_sa_t *assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SA];
3265 	sadb_address_t *srcext =
3266 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_SRC];
3267 	sadb_address_t *dstext =
3268 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3269 	sadb_address_t *isrcext =
3270 	    (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_INNER_SRC];
3271 	sadb_address_t *idstext =
3272 	    (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_INNER_DST];
3273 	sadb_address_t *nttext_loc =
3274 	    (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_NATT_LOC];
3275 	sadb_address_t *nttext_rem =
3276 	    (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_NATT_REM];
3277 	sadb_key_t *akey = (sadb_key_t *)ksi->ks_in_extv[SADB_EXT_KEY_AUTH];
3278 	sadb_key_t *ekey = (sadb_key_t *)ksi->ks_in_extv[SADB_EXT_KEY_ENCRYPT];
3279 	struct sockaddr_in *src, *dst;
3280 	struct sockaddr_in *natt_loc, *natt_rem;
3281 	struct sockaddr_in6 *natt_loc6, *natt_rem6;
3282 	sadb_lifetime_t *soft =
3283 	    (sadb_lifetime_t *)ksi->ks_in_extv[SADB_EXT_LIFETIME_SOFT];
3284 	sadb_lifetime_t *hard =
3285 	    (sadb_lifetime_t *)ksi->ks_in_extv[SADB_EXT_LIFETIME_HARD];
3286 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
3287 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
3288 
3289 	/* I need certain extensions present for an ADD message. */
3290 	if (srcext == NULL) {
3291 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_SRC;
3292 		return (EINVAL);
3293 	}
3294 	if (dstext == NULL) {
3295 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_DST;
3296 		return (EINVAL);
3297 	}
3298 	if (isrcext == NULL && idstext != NULL) {
3299 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_INNER_SRC;
3300 		return (EINVAL);
3301 	}
3302 	if (isrcext != NULL && idstext == NULL) {
3303 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_INNER_DST;
3304 		return (EINVAL);
3305 	}
3306 	if (assoc == NULL) {
3307 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_SA;
3308 		return (EINVAL);
3309 	}
3310 	if (ekey == NULL && assoc->sadb_sa_encrypt != SADB_EALG_NULL) {
3311 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_EKEY;
3312 		return (EINVAL);
3313 	}
3314 
3315 	src = (struct sockaddr_in *)(srcext + 1);
3316 	dst = (struct sockaddr_in *)(dstext + 1);
3317 	natt_loc = (struct sockaddr_in *)(nttext_loc + 1);
3318 	natt_loc6 = (struct sockaddr_in6 *)(nttext_loc + 1);
3319 	natt_rem = (struct sockaddr_in *)(nttext_rem + 1);
3320 	natt_rem6 = (struct sockaddr_in6 *)(nttext_rem + 1);
3321 
3322 	/* Sundry ADD-specific reality checks. */
3323 	/* XXX STATS :  Logging/stats here? */
3324 	if (assoc->sadb_sa_state != SADB_SASTATE_MATURE) {
3325 		*diagnostic = SADB_X_DIAGNOSTIC_BAD_SASTATE;
3326 		return (EINVAL);
3327 	}
3328 	if (assoc->sadb_sa_encrypt == SADB_EALG_NONE) {
3329 		*diagnostic = SADB_X_DIAGNOSTIC_BAD_EALG;
3330 		return (EINVAL);
3331 	}
3332 
3333 	if (assoc->sadb_sa_encrypt == SADB_EALG_NULL &&
3334 	    assoc->sadb_sa_auth == SADB_AALG_NONE) {
3335 		*diagnostic = SADB_X_DIAGNOSTIC_BAD_AALG;
3336 		return (EINVAL);
3337 	}
3338 
3339 	if (assoc->sadb_sa_flags & ~(SADB_SAFLAGS_NOREPLAY |
3340 	    SADB_X_SAFLAGS_NATT_LOC | SADB_X_SAFLAGS_NATT_REM |
3341 	    SADB_X_SAFLAGS_TUNNEL)) {
3342 		*diagnostic = SADB_X_DIAGNOSTIC_BAD_SAFLAGS;
3343 		return (EINVAL);
3344 	}
3345 
3346 	if ((*diagnostic = sadb_hardsoftchk(hard, soft)) != 0) {
3347 		return (EINVAL);
3348 	}
3349 	ASSERT(src->sin_family == dst->sin_family);
3350 
3351 	if (assoc->sadb_sa_flags & SADB_X_SAFLAGS_NATT_LOC) {
3352 		if (nttext_loc == NULL) {
3353 			*diagnostic = SADB_X_DIAGNOSTIC_MISSING_NATT_LOC;
3354 			return (EINVAL);
3355 		}
3356 
3357 		if (natt_loc->sin_family == AF_INET6 &&
3358 		    !IN6_IS_ADDR_V4MAPPED(&natt_loc6->sin6_addr)) {
3359 			*diagnostic = SADB_X_DIAGNOSTIC_MALFORMED_NATT_LOC;
3360 			return (EINVAL);
3361 		}
3362 	}
3363 
3364 	if (assoc->sadb_sa_flags & SADB_X_SAFLAGS_NATT_REM) {
3365 		if (nttext_rem == NULL) {
3366 			*diagnostic = SADB_X_DIAGNOSTIC_MISSING_NATT_REM;
3367 			return (EINVAL);
3368 		}
3369 		if (natt_rem->sin_family == AF_INET6 &&
3370 		    !IN6_IS_ADDR_V4MAPPED(&natt_rem6->sin6_addr)) {
3371 			*diagnostic = SADB_X_DIAGNOSTIC_MALFORMED_NATT_REM;
3372 			return (EINVAL);
3373 		}
3374 	}
3375 
3376 
3377 	/* Stuff I don't support, for now.  XXX Diagnostic? */
3378 	if (ksi->ks_in_extv[SADB_EXT_LIFETIME_CURRENT] != NULL ||
3379 	    ksi->ks_in_extv[SADB_EXT_SENSITIVITY] != NULL)
3380 		return (EOPNOTSUPP);
3381 
3382 	/*
3383 	 * XXX Policy :  I'm not checking identities or sensitivity
3384 	 * labels at this time, but if I did, I'd do them here, before I sent
3385 	 * the weak key check up to the algorithm.
3386 	 */
3387 
3388 	mutex_enter(&ipss->ipsec_alg_lock);
3389 
3390 	/*
3391 	 * First locate the authentication algorithm.
3392 	 */
3393 	if (akey != NULL) {
3394 		ipsec_alginfo_t *aalg;
3395 
3396 		aalg = ipss->ipsec_alglists[IPSEC_ALG_AUTH]
3397 		    [assoc->sadb_sa_auth];
3398 		if (aalg == NULL || !ALG_VALID(aalg)) {
3399 			mutex_exit(&ipss->ipsec_alg_lock);
3400 			esp1dbg(espstack, ("Couldn't find auth alg #%d.\n",
3401 			    assoc->sadb_sa_auth));
3402 			*diagnostic = SADB_X_DIAGNOSTIC_BAD_AALG;
3403 			return (EINVAL);
3404 		}
3405 
3406 		/*
3407 		 * Sanity check key sizes.
3408 		 * Note: It's not possible to use SADB_AALG_NONE because
3409 		 * this auth_alg is not defined with ALG_FLAG_VALID. If this
3410 		 * ever changes, the same check for SADB_AALG_NONE and
3411 		 * a auth_key != NULL should be made here ( see below).
3412 		 */
3413 		if (!ipsec_valid_key_size(akey->sadb_key_bits, aalg)) {
3414 			mutex_exit(&ipss->ipsec_alg_lock);
3415 			*diagnostic = SADB_X_DIAGNOSTIC_BAD_AKEYBITS;
3416 			return (EINVAL);
3417 		}
3418 		ASSERT(aalg->alg_mech_type != CRYPTO_MECHANISM_INVALID);
3419 
3420 		/* check key and fix parity if needed */
3421 		if (ipsec_check_key(aalg->alg_mech_type, akey, B_TRUE,
3422 		    diagnostic) != 0) {
3423 			mutex_exit(&ipss->ipsec_alg_lock);
3424 			return (EINVAL);
3425 		}
3426 	}
3427 
3428 	/*
3429 	 * Then locate the encryption algorithm.
3430 	 */
3431 	if (ekey != NULL) {
3432 		ipsec_alginfo_t *ealg;
3433 
3434 		ealg = ipss->ipsec_alglists[IPSEC_ALG_ENCR]
3435 		    [assoc->sadb_sa_encrypt];
3436 		if (ealg == NULL || !ALG_VALID(ealg)) {
3437 			mutex_exit(&ipss->ipsec_alg_lock);
3438 			esp1dbg(espstack, ("Couldn't find encr alg #%d.\n",
3439 			    assoc->sadb_sa_encrypt));
3440 			*diagnostic = SADB_X_DIAGNOSTIC_BAD_EALG;
3441 			return (EINVAL);
3442 		}
3443 
3444 		/*
3445 		 * Sanity check key sizes. If the encryption algorithm is
3446 		 * SADB_EALG_NULL but the encryption key is NOT
3447 		 * NULL then complain.
3448 		 */
3449 		if ((assoc->sadb_sa_encrypt == SADB_EALG_NULL) ||
3450 		    (!ipsec_valid_key_size(ekey->sadb_key_bits, ealg))) {
3451 			mutex_exit(&ipss->ipsec_alg_lock);
3452 			*diagnostic = SADB_X_DIAGNOSTIC_BAD_EKEYBITS;
3453 			return (EINVAL);
3454 		}
3455 		ASSERT(ealg->alg_mech_type != CRYPTO_MECHANISM_INVALID);
3456 
3457 		/* check key */
3458 		if (ipsec_check_key(ealg->alg_mech_type, ekey, B_FALSE,
3459 		    diagnostic) != 0) {
3460 			mutex_exit(&ipss->ipsec_alg_lock);
3461 			return (EINVAL);
3462 		}
3463 	}
3464 	mutex_exit(&ipss->ipsec_alg_lock);
3465 
3466 	return (esp_add_sa_finish(mp, (sadb_msg_t *)mp->b_cont->b_rptr, ksi,
3467 	    diagnostic, espstack));
3468 }
3469 
3470 /*
3471  * Update a security association.  Updates come in two varieties.  The first
3472  * is an update of lifetimes on a non-larval SA.  The second is an update of
3473  * a larval SA, which ends up looking a lot more like an add.
3474  */
3475 static int
3476 esp_update_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic,
3477     ipsecesp_stack_t *espstack)
3478 {
3479 	sadb_address_t *dstext =
3480 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3481 	struct sockaddr_in *sin;
3482 
3483 	if (dstext == NULL) {
3484 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_DST;
3485 		return (EINVAL);
3486 	}
3487 
3488 	sin = (struct sockaddr_in *)(dstext + 1);
3489 	return (sadb_update_sa(mp, ksi,
3490 	    (sin->sin_family == AF_INET6) ? &espstack->esp_sadb.s_v6 :
3491 	    &espstack->esp_sadb.s_v4, diagnostic, espstack->esp_pfkey_q,
3492 	    esp_add_sa, espstack->ipsecesp_netstack));
3493 }
3494 
3495 /*
3496  * Delete a security association.  This is REALLY likely to be code common to
3497  * both AH and ESP.  Find the association, then unlink it.
3498  */
3499 static int
3500 esp_del_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic,
3501     ipsecesp_stack_t *espstack)
3502 {
3503 	sadb_sa_t *assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SA];
3504 	sadb_address_t *dstext =
3505 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3506 	sadb_address_t *srcext =
3507 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_SRC];
3508 	struct sockaddr_in *sin;
3509 
3510 	if (assoc == NULL) {
3511 		if (dstext != NULL) {
3512 			sin = (struct sockaddr_in *)(dstext + 1);
3513 		} else if (srcext != NULL) {
3514 			sin = (struct sockaddr_in *)(srcext + 1);
3515 		} else {
3516 			*diagnostic = SADB_X_DIAGNOSTIC_MISSING_SA;
3517 			return (EINVAL);
3518 		}
3519 		return (sadb_purge_sa(mp, ksi,
3520 		    (sin->sin_family == AF_INET6) ? &espstack->esp_sadb.s_v6 :
3521 		    &espstack->esp_sadb.s_v4, espstack->esp_pfkey_q,
3522 		    espstack->esp_sadb.s_ip_q));
3523 	}
3524 
3525 	return (sadb_del_sa(mp, ksi, &espstack->esp_sadb, diagnostic,
3526 	    espstack->esp_pfkey_q));
3527 }
3528 
3529 /*
3530  * Convert the entire contents of all of ESP's SA tables into PF_KEY SADB_DUMP
3531  * messages.
3532  */
3533 static void
3534 esp_dump(mblk_t *mp, keysock_in_t *ksi, ipsecesp_stack_t *espstack)
3535 {
3536 	int error;
3537 	sadb_msg_t *samsg;
3538 
3539 	/*
3540 	 * Dump each fanout, bailing if error is non-zero.
3541 	 */
3542 
3543 	error = sadb_dump(espstack->esp_pfkey_q, mp, ksi->ks_in_serial,
3544 	    &espstack->esp_sadb.s_v4);
3545 	if (error != 0)
3546 		goto bail;
3547 
3548 	error = sadb_dump(espstack->esp_pfkey_q, mp, ksi->ks_in_serial,
3549 	    &espstack->esp_sadb.s_v6);
3550 bail:
3551 	ASSERT(mp->b_cont != NULL);
3552 	samsg = (sadb_msg_t *)mp->b_cont->b_rptr;
3553 	samsg->sadb_msg_errno = (uint8_t)error;
3554 	sadb_pfkey_echo(espstack->esp_pfkey_q, mp,
3555 	    (sadb_msg_t *)mp->b_cont->b_rptr, ksi, NULL);
3556 }
3557 
3558 /*
3559  * First-cut reality check for an inbound PF_KEY message.
3560  */
3561 static boolean_t
3562 esp_pfkey_reality_failures(mblk_t *mp, keysock_in_t *ksi,
3563     ipsecesp_stack_t *espstack)
3564 {
3565 	int diagnostic;
3566 
3567 	if (ksi->ks_in_extv[SADB_EXT_PROPOSAL] != NULL) {
3568 		diagnostic = SADB_X_DIAGNOSTIC_PROP_PRESENT;
3569 		goto badmsg;
3570 	}
3571 	if (ksi->ks_in_extv[SADB_EXT_SUPPORTED_AUTH] != NULL ||
3572 	    ksi->ks_in_extv[SADB_EXT_SUPPORTED_ENCRYPT] != NULL) {
3573 		diagnostic = SADB_X_DIAGNOSTIC_SUPP_PRESENT;
3574 		goto badmsg;
3575 	}
3576 	return (B_FALSE);	/* False ==> no failures */
3577 
3578 badmsg:
3579 	sadb_pfkey_error(espstack->esp_pfkey_q, mp, EINVAL, diagnostic,
3580 	    ksi->ks_in_serial);
3581 	return (B_TRUE);	/* True ==> failures */
3582 }
3583 
3584 /*
3585  * ESP parsing of PF_KEY messages.  Keysock did most of the really silly
3586  * error cases.  What I receive is a fully-formed, syntactically legal
3587  * PF_KEY message.  I then need to check semantics...
3588  *
3589  * This code may become common to AH and ESP.  Stay tuned.
3590  *
3591  * I also make the assumption that db_ref's are cool.  If this assumption
3592  * is wrong, this means that someone other than keysock or me has been
3593  * mucking with PF_KEY messages.
3594  */
3595 static void
3596 esp_parse_pfkey(mblk_t *mp, ipsecesp_stack_t *espstack)
3597 {
3598 	mblk_t *msg = mp->b_cont;
3599 	sadb_msg_t *samsg;
3600 	keysock_in_t *ksi;
3601 	int error;
3602 	int diagnostic = SADB_X_DIAGNOSTIC_NONE;
3603 
3604 	ASSERT(msg != NULL);
3605 
3606 	samsg = (sadb_msg_t *)msg->b_rptr;
3607 	ksi = (keysock_in_t *)mp->b_rptr;
3608 
3609 	/*
3610 	 * If applicable, convert unspecified AF_INET6 to unspecified
3611 	 * AF_INET.  And do other address reality checks.
3612 	 */
3613 	if (!sadb_addrfix(ksi, espstack->esp_pfkey_q, mp,
3614 	    espstack->ipsecesp_netstack) ||
3615 	    esp_pfkey_reality_failures(mp, ksi, espstack)) {
3616 		return;
3617 	}
3618 
3619 	switch (samsg->sadb_msg_type) {
3620 	case SADB_ADD:
3621 		error = esp_add_sa(mp, ksi, &diagnostic,
3622 		    espstack->ipsecesp_netstack);
3623 		if (error != 0) {
3624 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3625 			    diagnostic, ksi->ks_in_serial);
3626 		}
3627 		/* else esp_add_sa() took care of things. */
3628 		break;
3629 	case SADB_DELETE:
3630 		error = esp_del_sa(mp, ksi, &diagnostic, espstack);
3631 		if (error != 0) {
3632 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3633 			    diagnostic, ksi->ks_in_serial);
3634 		}
3635 		/* Else esp_del_sa() took care of things. */
3636 		break;
3637 	case SADB_GET:
3638 		error = sadb_get_sa(mp, ksi, &espstack->esp_sadb, &diagnostic,
3639 		    espstack->esp_pfkey_q);
3640 		if (error != 0) {
3641 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3642 			    diagnostic, ksi->ks_in_serial);
3643 		}
3644 		/* Else sadb_get_sa() took care of things. */
3645 		break;
3646 	case SADB_FLUSH:
3647 		sadbp_flush(&espstack->esp_sadb, espstack->ipsecesp_netstack);
3648 		sadb_pfkey_echo(espstack->esp_pfkey_q, mp, samsg, ksi, NULL);
3649 		break;
3650 	case SADB_REGISTER:
3651 		/*
3652 		 * Hmmm, let's do it!  Check for extensions (there should
3653 		 * be none), extract the fields, call esp_register_out(),
3654 		 * then either free or report an error.
3655 		 *
3656 		 * Keysock takes care of the PF_KEY bookkeeping for this.
3657 		 */
3658 		if (esp_register_out(samsg->sadb_msg_seq, samsg->sadb_msg_pid,
3659 		    ksi->ks_in_serial, espstack)) {
3660 			freemsg(mp);
3661 		} else {
3662 			/*
3663 			 * Only way this path hits is if there is a memory
3664 			 * failure.  It will not return B_FALSE because of
3665 			 * lack of esp_pfkey_q if I am in wput().
3666 			 */
3667 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, ENOMEM,
3668 			    diagnostic, ksi->ks_in_serial);
3669 		}
3670 		break;
3671 	case SADB_UPDATE:
3672 		/*
3673 		 * Find a larval, if not there, find a full one and get
3674 		 * strict.
3675 		 */
3676 		error = esp_update_sa(mp, ksi, &diagnostic, espstack);
3677 		if (error != 0) {
3678 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3679 			    diagnostic, ksi->ks_in_serial);
3680 		}
3681 		/* else esp_update_sa() took care of things. */
3682 		break;
3683 	case SADB_GETSPI:
3684 		/*
3685 		 * Reserve a new larval entry.
3686 		 */
3687 		esp_getspi(mp, ksi, espstack);
3688 		break;
3689 	case SADB_ACQUIRE:
3690 		/*
3691 		 * Find larval and/or ACQUIRE record and kill it (them), I'm
3692 		 * most likely an error.  Inbound ACQUIRE messages should only
3693 		 * have the base header.
3694 		 */
3695 		sadb_in_acquire(samsg, &espstack->esp_sadb,
3696 		    espstack->esp_pfkey_q, espstack->ipsecesp_netstack);
3697 		freemsg(mp);
3698 		break;
3699 	case SADB_DUMP:
3700 		/*
3701 		 * Dump all entries.
3702 		 */
3703 		esp_dump(mp, ksi, espstack);
3704 		/* esp_dump will take care of the return message, etc. */
3705 		break;
3706 	case SADB_EXPIRE:
3707 		/* Should never reach me. */
3708 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, EOPNOTSUPP,
3709 		    diagnostic, ksi->ks_in_serial);
3710 		break;
3711 	default:
3712 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, EINVAL,
3713 		    SADB_X_DIAGNOSTIC_UNKNOWN_MSG, ksi->ks_in_serial);
3714 		break;
3715 	}
3716 }
3717 
3718 /*
3719  * Handle case where PF_KEY says it can't find a keysock for one of my
3720  * ACQUIRE messages.
3721  */
3722 static void
3723 esp_keysock_no_socket(mblk_t *mp, ipsecesp_stack_t *espstack)
3724 {
3725 	sadb_msg_t *samsg;
3726 	keysock_out_err_t *kse = (keysock_out_err_t *)mp->b_rptr;
3727 
3728 	if (mp->b_cont == NULL) {
3729 		freemsg(mp);
3730 		return;
3731 	}
3732 	samsg = (sadb_msg_t *)mp->b_cont->b_rptr;
3733 
3734 	/*
3735 	 * If keysock can't find any registered, delete the acquire record
3736 	 * immediately, and handle errors.
3737 	 */
3738 	if (samsg->sadb_msg_type == SADB_ACQUIRE) {
3739 		samsg->sadb_msg_errno = kse->ks_err_errno;
3740 		samsg->sadb_msg_len = SADB_8TO64(sizeof (*samsg));
3741 		/*
3742 		 * Use the write-side of the esp_pfkey_q, in case there is
3743 		 * no esp_sadb.s_ip_q.
3744 		 */
3745 		sadb_in_acquire(samsg, &espstack->esp_sadb,
3746 		    WR(espstack->esp_pfkey_q), espstack->ipsecesp_netstack);
3747 	}
3748 
3749 	freemsg(mp);
3750 }
3751 
3752 /*
3753  * ESP module write put routine.
3754  */
3755 static void
3756 ipsecesp_wput(queue_t *q, mblk_t *mp)
3757 {
3758 	ipsec_info_t *ii;
3759 	struct iocblk *iocp;
3760 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
3761 
3762 	esp3dbg(espstack, ("In esp_wput().\n"));
3763 
3764 	/* NOTE: Each case must take care of freeing or passing mp. */
3765 	switch (mp->b_datap->db_type) {
3766 	case M_CTL:
3767 		if ((mp->b_wptr - mp->b_rptr) < sizeof (ipsec_info_t)) {
3768 			/* Not big enough message. */
3769 			freemsg(mp);
3770 			break;
3771 		}
3772 		ii = (ipsec_info_t *)mp->b_rptr;
3773 
3774 		switch (ii->ipsec_info_type) {
3775 		case KEYSOCK_OUT_ERR:
3776 			esp1dbg(espstack, ("Got KEYSOCK_OUT_ERR message.\n"));
3777 			esp_keysock_no_socket(mp, espstack);
3778 			break;
3779 		case KEYSOCK_IN:
3780 			ESP_BUMP_STAT(espstack, keysock_in);
3781 			esp3dbg(espstack, ("Got KEYSOCK_IN message.\n"));
3782 
3783 			/* Parse the message. */
3784 			esp_parse_pfkey(mp, espstack);
3785 			break;
3786 		case KEYSOCK_HELLO:
3787 			sadb_keysock_hello(&espstack->esp_pfkey_q, q, mp,
3788 			    esp_ager, (void *)espstack, &espstack->esp_event,
3789 			    SADB_SATYPE_ESP);
3790 			break;
3791 		default:
3792 			esp2dbg(espstack, ("Got M_CTL from above of 0x%x.\n",
3793 			    ii->ipsec_info_type));
3794 			freemsg(mp);
3795 			break;
3796 		}
3797 		break;
3798 	case M_IOCTL:
3799 		iocp = (struct iocblk *)mp->b_rptr;
3800 		switch (iocp->ioc_cmd) {
3801 		case ND_SET:
3802 		case ND_GET:
3803 			if (nd_getset(q, espstack->ipsecesp_g_nd, mp)) {
3804 				qreply(q, mp);
3805 				return;
3806 			} else {
3807 				iocp->ioc_error = ENOENT;
3808 			}
3809 			/* FALLTHRU */
3810 		default:
3811 			/* We really don't support any other ioctls, do we? */
3812 
3813 			/* Return EINVAL */
3814 			if (iocp->ioc_error != ENOENT)
3815 				iocp->ioc_error = EINVAL;
3816 			iocp->ioc_count = 0;
3817 			mp->b_datap->db_type = M_IOCACK;
3818 			qreply(q, mp);
3819 			return;
3820 		}
3821 	default:
3822 		esp3dbg(espstack,
3823 		    ("Got default message, type %d, passing to IP.\n",
3824 		    mp->b_datap->db_type));
3825 		putnext(q, mp);
3826 	}
3827 }
3828 
3829 /*
3830  * Process an outbound ESP packet that can be accelerated by a IPsec
3831  * hardware acceleration capable Provider.
3832  * The caller already inserted and initialized the ESP header.
3833  * This function allocates a tagging M_CTL, and adds room at the end
3834  * of the packet to hold the ICV if authentication is needed.
3835  *
3836  * On success returns B_TRUE, on failure returns B_FALSE and frees the
3837  * mblk chain ipsec_out.
3838  */
3839 static ipsec_status_t
3840 esp_outbound_accelerated(mblk_t *ipsec_out, uint_t icv_len)
3841 {
3842 	ipsec_out_t *io;
3843 	mblk_t *lastmp;
3844 	netstack_t	*ns;
3845 	ipsecesp_stack_t *espstack;
3846 	ipsec_stack_t	*ipss;
3847 
3848 	io = (ipsec_out_t *)ipsec_out->b_rptr;
3849 	ns = io->ipsec_out_ns;
3850 	espstack = ns->netstack_ipsecesp;
3851 	ipss = ns->netstack_ipsec;
3852 
3853 	ESP_BUMP_STAT(espstack, out_accelerated);
3854 
3855 	/* mark packet as being accelerated in IPSEC_OUT */
3856 	ASSERT(io->ipsec_out_accelerated == B_FALSE);
3857 	io->ipsec_out_accelerated = B_TRUE;
3858 
3859 	/*
3860 	 * add room at the end of the packet for the ICV if needed
3861 	 */
3862 	if (icv_len > 0) {
3863 		/* go to last mblk */
3864 		lastmp = ipsec_out;	/* For following while loop. */
3865 		do {
3866 			lastmp = lastmp->b_cont;
3867 		} while (lastmp->b_cont != NULL);
3868 
3869 		/* if not enough available room, allocate new mblk */
3870 		if ((lastmp->b_wptr + icv_len) > lastmp->b_datap->db_lim) {
3871 			lastmp->b_cont = allocb(icv_len, BPRI_HI);
3872 			if (lastmp->b_cont == NULL) {
3873 				ESP_BUMP_STAT(espstack, out_discards);
3874 				ip_drop_packet(ipsec_out, B_FALSE, NULL, NULL,
3875 				    DROPPER(ipss, ipds_esp_nomem),
3876 				    &espstack->esp_dropper);
3877 				return (IPSEC_STATUS_FAILED);
3878 			}
3879 			lastmp = lastmp->b_cont;
3880 		}
3881 		lastmp->b_wptr += icv_len;
3882 	}
3883 
3884 	return (IPSEC_STATUS_SUCCESS);
3885 }
3886 
3887 /*
3888  * Process an inbound accelerated ESP packet.
3889  * On success returns B_TRUE, on failure returns B_FALSE and frees the
3890  * mblk chain ipsec_in.
3891  */
3892 static ipsec_status_t
3893 esp_inbound_accelerated(mblk_t *ipsec_in, mblk_t *data_mp, boolean_t isv4,
3894     ipsa_t *assoc)
3895 {
3896 	ipsec_in_t *ii = (ipsec_in_t *)ipsec_in->b_rptr;
3897 	mblk_t *hada_mp;
3898 	uint32_t icv_len = 0;
3899 	da_ipsec_t *hada;
3900 	ipha_t *ipha;
3901 	ip6_t *ip6h;
3902 	kstat_named_t *counter;
3903 	netstack_t	*ns = ii->ipsec_in_ns;
3904 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
3905 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
3906 
3907 	ESP_BUMP_STAT(espstack, in_accelerated);
3908 
3909 	hada_mp = ii->ipsec_in_da;
3910 	ASSERT(hada_mp != NULL);
3911 	hada = (da_ipsec_t *)hada_mp->b_rptr;
3912 
3913 	/*
3914 	 * We only support one level of decapsulation in hardware, so
3915 	 * nuke the pointer.
3916 	 */
3917 	ii->ipsec_in_da = NULL;
3918 	ii->ipsec_in_accelerated = B_FALSE;
3919 
3920 	if (assoc->ipsa_auth_alg != IPSA_AALG_NONE) {
3921 		/*
3922 		 * ESP with authentication. We expect the Provider to have
3923 		 * computed the ICV and placed it in the hardware acceleration
3924 		 * data attributes.
3925 		 *
3926 		 * Extract ICV length from attributes M_CTL and sanity check
3927 		 * its value. We allow the mblk to be smaller than da_ipsec_t
3928 		 * for a small ICV, as long as the entire ICV fits within the
3929 		 * mblk.
3930 		 *
3931 		 * Also ensures that the ICV length computed by Provider
3932 		 * corresponds to the ICV length of the agorithm specified by
3933 		 * the SA.
3934 		 */
3935 		icv_len = hada->da_icv_len;
3936 		if ((icv_len != assoc->ipsa_mac_len) ||
3937 		    (icv_len > DA_ICV_MAX_LEN) || (MBLKL(hada_mp) <
3938 		    (sizeof (da_ipsec_t) - DA_ICV_MAX_LEN + icv_len))) {
3939 			esp0dbg(("esp_inbound_accelerated: "
3940 			    "ICV len (%u) incorrect or mblk too small (%u)\n",
3941 			    icv_len, (uint32_t)(MBLKL(hada_mp))));
3942 			counter = DROPPER(ipss, ipds_esp_bad_auth);
3943 			goto esp_in_discard;
3944 		}
3945 	}
3946 
3947 	/* get pointers to IP header */
3948 	if (isv4) {
3949 		ipha = (ipha_t *)data_mp->b_rptr;
3950 	} else {
3951 		ip6h = (ip6_t *)data_mp->b_rptr;
3952 	}
3953 
3954 	/*
3955 	 * Compare ICV in ESP packet vs ICV computed by adapter.
3956 	 * We also remove the ICV from the end of the packet since
3957 	 * it will no longer be needed.
3958 	 *
3959 	 * Assume that esp_inbound() already ensured that the pkt
3960 	 * was in one mblk.
3961 	 */
3962 	ASSERT(data_mp->b_cont == NULL);
3963 	data_mp->b_wptr -= icv_len;
3964 	/* adjust IP header */
3965 	if (isv4)
3966 		ipha->ipha_length = htons(ntohs(ipha->ipha_length) - icv_len);
3967 	else
3968 		ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - icv_len);
3969 	if (icv_len && bcmp(hada->da_icv, data_mp->b_wptr, icv_len)) {
3970 		int af;
3971 		void *addr;
3972 
3973 		if (isv4) {
3974 			addr = &ipha->ipha_dst;
3975 			af = AF_INET;
3976 		} else {
3977 			addr = &ip6h->ip6_dst;
3978 			af = AF_INET6;
3979 		}
3980 
3981 		/*
3982 		 * Log the event. Don't print to the console, block
3983 		 * potential denial-of-service attack.
3984 		 */
3985 		ESP_BUMP_STAT(espstack, bad_auth);
3986 		ipsec_assocfailure(info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
3987 		    "ESP Authentication failed spi %x, dst_addr %s",
3988 		    assoc->ipsa_spi, addr, af, espstack->ipsecesp_netstack);
3989 		counter = DROPPER(ipss, ipds_esp_bad_auth);
3990 		goto esp_in_discard;
3991 	}
3992 
3993 	esp3dbg(espstack, ("esp_inbound_accelerated: ESP authentication "
3994 	    "succeeded, checking replay\n"));
3995 
3996 	ipsec_in->b_cont = data_mp;
3997 
3998 	/*
3999 	 * Remove ESP header and padding from packet.
4000 	 */
4001 	if (!esp_strip_header(data_mp, ii->ipsec_in_v4, assoc->ipsa_iv_len,
4002 	    &counter, espstack)) {
4003 		esp1dbg(espstack, ("esp_inbound_accelerated: "
4004 		    "esp_strip_header() failed\n"));
4005 		goto esp_in_discard;
4006 	}
4007 
4008 	freeb(hada_mp);
4009 
4010 	/*
4011 	 * Account for usage..
4012 	 */
4013 	if (!esp_age_bytes(assoc, msgdsize(data_mp), B_TRUE)) {
4014 		/* The ipsa has hit hard expiration, LOG and AUDIT. */
4015 		ESP_BUMP_STAT(espstack, bytes_expired);
4016 		IP_ESP_BUMP_STAT(ipss, in_discards);
4017 		ipsec_assocfailure(info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
4018 		    "ESP association 0x%x, dst %s had bytes expire.\n",
4019 		    assoc->ipsa_spi, assoc->ipsa_dstaddr, assoc->ipsa_addrfam,
4020 		    espstack->ipsecesp_netstack);
4021 		ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL,
4022 		    DROPPER(ipss, ipds_esp_bytes_expire),
4023 		    &espstack->esp_dropper);
4024 		return (IPSEC_STATUS_FAILED);
4025 	}
4026 
4027 	/* done processing the packet */
4028 	return (IPSEC_STATUS_SUCCESS);
4029 
4030 esp_in_discard:
4031 	IP_ESP_BUMP_STAT(ipss, in_discards);
4032 	freeb(hada_mp);
4033 
4034 	ipsec_in->b_cont = data_mp;	/* For ip_drop_packet()'s sake... */
4035 	ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL, counter,
4036 	    &espstack->esp_dropper);
4037 
4038 	return (IPSEC_STATUS_FAILED);
4039 }
4040 
4041 /*
4042  * Wrapper to allow IP to trigger an ESP association failure message
4043  * during inbound SA selection.
4044  */
4045 void
4046 ipsecesp_in_assocfailure(mblk_t *mp, char level, ushort_t sl, char *fmt,
4047     uint32_t spi, void *addr, int af, ipsecesp_stack_t *espstack)
4048 {
4049 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
4050 
4051 	if (espstack->ipsecesp_log_unknown_spi) {
4052 		ipsec_assocfailure(info.mi_idnum, 0, level, sl, fmt, spi,
4053 		    addr, af, espstack->ipsecesp_netstack);
4054 	}
4055 
4056 	ip_drop_packet(mp, B_TRUE, NULL, NULL,
4057 	    DROPPER(ipss, ipds_esp_no_sa),
4058 	    &espstack->esp_dropper);
4059 }
4060 
4061 /*
4062  * Initialize the ESP input and output processing functions.
4063  */
4064 void
4065 ipsecesp_init_funcs(ipsa_t *sa)
4066 {
4067 	if (sa->ipsa_output_func == NULL)
4068 		sa->ipsa_output_func = esp_outbound;
4069 	if (sa->ipsa_input_func == NULL)
4070 		sa->ipsa_input_func = esp_inbound;
4071 }
4072