xref: /titanic_52/usr/src/uts/common/inet/ip/ipsecesp.c (revision 32ff2b3c67debc0dc66e07986e072d489ea88322)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #include <sys/stropts.h>
29 #include <sys/errno.h>
30 #include <sys/strlog.h>
31 #include <sys/tihdr.h>
32 #include <sys/socket.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/kmem.h>
36 #include <sys/zone.h>
37 #include <sys/sysmacros.h>
38 #include <sys/cmn_err.h>
39 #include <sys/vtrace.h>
40 #include <sys/debug.h>
41 #include <sys/atomic.h>
42 #include <sys/strsun.h>
43 #include <sys/random.h>
44 #include <netinet/in.h>
45 #include <net/if.h>
46 #include <netinet/ip6.h>
47 #include <net/pfkeyv2.h>
48 
49 #include <inet/common.h>
50 #include <inet/mi.h>
51 #include <inet/nd.h>
52 #include <inet/ip.h>
53 #include <inet/ip_impl.h>
54 #include <inet/ip6.h>
55 #include <inet/sadb.h>
56 #include <inet/ipsec_info.h>
57 #include <inet/ipsec_impl.h>
58 #include <inet/ipsecesp.h>
59 #include <inet/ipdrop.h>
60 #include <inet/tcp.h>
61 #include <sys/kstat.h>
62 #include <sys/policy.h>
63 #include <sys/strsun.h>
64 #include <inet/udp_impl.h>
65 #include <sys/taskq.h>
66 #include <sys/note.h>
67 
68 #include <sys/iphada.h>
69 
/*
 * Table of ND (named dispatch) variables supported by ipsecesp.  This is
 * the template: ipsecesp_stack_init() copies it into each stack's
 * ipsecesp_params array and registers the entries of that copy in the
 * stack's ipsecesp_g_nd through ipsecesp_param_register().
 * All of these are alterable, within the min/max values given, at run time.
 *
 * The accessor macros that follow select entries by array position, so the
 * order of the rows below must stay in sync with the indices they use.
 */
static	ipsecespparam_t	lcl_param_arr[] = {
	/* min	max			value	name */
	{ 0,	3,			0,	"ipsecesp_debug"},
	{ 125,	32000, SADB_AGE_INTERVAL_DEFAULT, "ipsecesp_age_interval"},
	{ 1,	10,			1,	"ipsecesp_reap_delay"},
	{ 1,	SADB_MAX_REPLAY,	64,	"ipsecesp_replay_size"},
	{ 1,	300,			15,	"ipsecesp_acquire_timeout"},
	{ 1,	1800,			90,	"ipsecesp_larval_timeout"},
	/* Default lifetime values for ACQUIRE messages. */
	{ 0,	0xffffffffU,	0,	"ipsecesp_default_soft_bytes"},
	{ 0,	0xffffffffU,	0,	"ipsecesp_default_hard_bytes"},
	{ 0,	0xffffffffU,	24000,	"ipsecesp_default_soft_addtime"},
	{ 0,	0xffffffffU,	28800,	"ipsecesp_default_hard_addtime"},
	{ 0,	0xffffffffU,	0,	"ipsecesp_default_soft_usetime"},
	{ 0,	0xffffffffU,	0,	"ipsecesp_default_hard_usetime"},
	{ 0,	1,		0,	"ipsecesp_log_unknown_spi"},
	{ 0,	2,		1,	"ipsecesp_padding_check"},
	{ 0,	600,		20,	"ipsecesp_nat_keepalive_interval"},
};
/*
 * Accessors for the per-stack parameter copy.  Each expands to a member
 * expression, so it is written after a stack pointer, e.g.
 * espstack->ipsecesp_debug.  The number in brackets is the row of the
 * parameter in lcl_param_arr above.
 */
#define	ipsecesp_debug	ipsecesp_params[0].ipsecesp_param_value
#define	ipsecesp_age_interval ipsecesp_params[1].ipsecesp_param_value
/* Upper bound (not the current value) of the age interval. */
#define	ipsecesp_age_int_max	ipsecesp_params[1].ipsecesp_param_max
#define	ipsecesp_reap_delay	ipsecesp_params[2].ipsecesp_param_value
#define	ipsecesp_replay_size	ipsecesp_params[3].ipsecesp_param_value
#define	ipsecesp_acquire_timeout	\
	ipsecesp_params[4].ipsecesp_param_value
#define	ipsecesp_larval_timeout	\
	ipsecesp_params[5].ipsecesp_param_value
#define	ipsecesp_default_soft_bytes	\
	ipsecesp_params[6].ipsecesp_param_value
#define	ipsecesp_default_hard_bytes	\
	ipsecesp_params[7].ipsecesp_param_value
#define	ipsecesp_default_soft_addtime	\
	ipsecesp_params[8].ipsecesp_param_value
#define	ipsecesp_default_hard_addtime	\
	ipsecesp_params[9].ipsecesp_param_value
#define	ipsecesp_default_soft_usetime	\
	ipsecesp_params[10].ipsecesp_param_value
#define	ipsecesp_default_hard_usetime	\
	ipsecesp_params[11].ipsecesp_param_value
#define	ipsecesp_log_unknown_spi	\
	ipsecesp_params[12].ipsecesp_param_value
#define	ipsecesp_padding_check	\
	ipsecesp_params[13].ipsecesp_param_value
/* For ipsecesp_nat_keepalive_interval, see ipsecesp.h. */
120 
/*
 * Debug printing helpers.  The "a" argument is a complete, parenthesized
 * printf argument list, e.g. esp1dbg(espstack, ("value %d\n", v));
 *
 * esp0dbg() always prints.  esp1dbg()/esp2dbg()/esp3dbg() print only when
 * the stack's ipsecesp_debug parameter is at least 1, 2 or 3 respectively.
 *
 * The conditional forms are wrapped in do { } while (0) so that each use
 * is a single statement: a bare "if (...) printf a" would capture the
 * "else" of an enclosing unbraced if/else at the call site.
 */
#define	esp0dbg(a)	printf a
/* NOTE:  != 0 instead of > 0 so lint doesn't complain. */
#define	esp1dbg(espstack, a)						\
do {									\
	if ((espstack)->ipsecesp_debug != 0)				\
		printf a;						\
_NOTE(CONSTCOND)							\
} while (0)
#define	esp2dbg(espstack, a)						\
do {									\
	if ((espstack)->ipsecesp_debug > 1)				\
		printf a;						\
_NOTE(CONSTCOND)							\
} while (0)
#define	esp3dbg(espstack, a)						\
do {									\
	if ((espstack)->ipsecesp_debug > 2)				\
		printf a;						\
_NOTE(CONSTCOND)							\
} while (0)
126 
127 static int ipsecesp_open(queue_t *, dev_t *, int, int, cred_t *);
128 static int ipsecesp_close(queue_t *);
129 static void ipsecesp_rput(queue_t *, mblk_t *);
130 static void ipsecesp_wput(queue_t *, mblk_t *);
131 static void	*ipsecesp_stack_init(netstackid_t stackid, netstack_t *ns);
132 static void	ipsecesp_stack_fini(netstackid_t stackid, void *arg);
133 static void esp_send_acquire(ipsacq_t *, mblk_t *, netstack_t *);
134 
135 static void esp_prepare_udp(netstack_t *, mblk_t *, ipha_t *);
136 static ipsec_status_t esp_outbound_accelerated(mblk_t *, uint_t);
137 static ipsec_status_t esp_inbound_accelerated(mblk_t *, mblk_t *,
138     boolean_t, ipsa_t *);
139 
140 static boolean_t esp_register_out(uint32_t, uint32_t, uint_t,
141     ipsecesp_stack_t *);
142 static boolean_t esp_strip_header(mblk_t *, boolean_t, uint32_t,
143     kstat_named_t **, ipsecesp_stack_t *);
144 static ipsec_status_t esp_submit_req_inbound(mblk_t *, ipsa_t *, uint_t);
145 static ipsec_status_t esp_submit_req_outbound(mblk_t *, ipsa_t *, uchar_t *,
146     uint_t);
/* Settable in /etc/system */
148 uint32_t esp_hash_size = IPSEC_DEFAULT_HASH_SIZE;
149 
/*
 * STREAMS module identification.  info.mi_idnum (5137) is also the module
 * id this file hands to sadb_retimeout() and ipsec_rl_strlog().
 * NOTE(review): remaining initializers presumably follow module_info(9S)
 * order (name, min/max packet size, high/low water marks) -- confirm
 * against sys/stream.h.
 */
static struct module_info info = {
	5137, "ipsecesp", 0, INFPSZ, 65536, 1024
};

/*
 * Read-side queue initialization: ipsecesp_rput handles messages coming
 * up the stream; open and close are shared with the write side.
 */
static struct qinit rinit = {
	(pfi_t)ipsecesp_rput, NULL, ipsecesp_open, ipsecesp_close, NULL, &info,
	NULL
};

/*
 * Write-side queue initialization: ipsecesp_wput handles messages coming
 * down the stream.
 */
static struct qinit winit = {
	(pfi_t)ipsecesp_wput, NULL, ipsecesp_open, ipsecesp_close, NULL, &info,
	NULL
};

/* The streamtab tying the read and write qinit structures together. */
struct streamtab ipsecespinfo = {
	&rinit, &winit, NULL, NULL
};
167 
168 static taskq_t *esp_taskq;
169 
170 /*
171  * OTOH, this one is set at open/close, and I'm D_MTQPAIR for now.
172  *
173  * Question:	Do I need this, given that all instance's esps->esps_wq point
174  *		to IP?
175  *
176  * Answer:	Yes, because I need to know which queue is BOUND to
177  *		IPPROTO_ESP
178  */
179 
180 /*
181  * Stats.  This may eventually become a full-blown SNMP MIB once that spec
182  * stabilizes.
183  */
184 
185 typedef struct esp_kstats_s {
186 	kstat_named_t esp_stat_num_aalgs;
187 	kstat_named_t esp_stat_good_auth;
188 	kstat_named_t esp_stat_bad_auth;
189 	kstat_named_t esp_stat_bad_padding;
190 	kstat_named_t esp_stat_replay_failures;
191 	kstat_named_t esp_stat_replay_early_failures;
192 	kstat_named_t esp_stat_keysock_in;
193 	kstat_named_t esp_stat_out_requests;
194 	kstat_named_t esp_stat_acquire_requests;
195 	kstat_named_t esp_stat_bytes_expired;
196 	kstat_named_t esp_stat_out_discards;
197 	kstat_named_t esp_stat_in_accelerated;
198 	kstat_named_t esp_stat_out_accelerated;
199 	kstat_named_t esp_stat_noaccel;
200 	kstat_named_t esp_stat_crypto_sync;
201 	kstat_named_t esp_stat_crypto_async;
202 	kstat_named_t esp_stat_crypto_failures;
203 	kstat_named_t esp_stat_num_ealgs;
204 	kstat_named_t esp_stat_bad_decrypt;
205 	kstat_named_t esp_stat_sa_port_renumbers;
206 } esp_kstats_t;
207 
/*
 * espstack->esp_kstats is set to espstack->esp_ksp->ks_data only when
 * kstat_create_netstack() succeeds for that stack; if creation fails it
 * stays NULL.  kstats are created for every stack instance, so failure
 * *could* happen.  ESP_BUMP_STAT and ESP_DEBUMP_STAT therefore check for
 * NULL and silently do nothing when the stack has no kstats.
 *
 * "espstack" is evaluated more than once; pass an expression without side
 * effects.  "x" is the counter name without its esp_stat_ prefix.
 */
#define	ESP_BUMP_STAT(espstack, x)					\
do {									\
	if ((espstack)->esp_kstats != NULL)				\
		((espstack)->esp_kstats->esp_stat_ ## x).value.ui64++;	\
_NOTE(CONSTCOND)							\
} while (0)

#define	ESP_DEBUMP_STAT(espstack, x)					\
do {									\
	if ((espstack)->esp_kstats != NULL)				\
		((espstack)->esp_kstats->esp_stat_ ## x).value.ui64--;	\
_NOTE(CONSTCOND)							\
} while (0)
228 
229 static int	esp_kstat_update(kstat_t *, int);
230 
231 static boolean_t
232 esp_kstat_init(ipsecesp_stack_t *espstack, netstackid_t stackid)
233 {
234 	espstack->esp_ksp = kstat_create_netstack("ipsecesp", 0, "esp_stat",
235 	    "net", KSTAT_TYPE_NAMED,
236 	    sizeof (esp_kstats_t) / sizeof (kstat_named_t),
237 	    KSTAT_FLAG_PERSISTENT, stackid);
238 
239 	if (espstack->esp_ksp == NULL || espstack->esp_ksp->ks_data == NULL)
240 		return (B_FALSE);
241 
242 	espstack->esp_kstats = espstack->esp_ksp->ks_data;
243 
244 	espstack->esp_ksp->ks_update = esp_kstat_update;
245 	espstack->esp_ksp->ks_private = (void *)(uintptr_t)stackid;
246 
247 #define	K64 KSTAT_DATA_UINT64
248 #define	KI(x) kstat_named_init(&(espstack->esp_kstats->esp_stat_##x), #x, K64)
249 
250 	KI(num_aalgs);
251 	KI(num_ealgs);
252 	KI(good_auth);
253 	KI(bad_auth);
254 	KI(bad_padding);
255 	KI(replay_failures);
256 	KI(replay_early_failures);
257 	KI(keysock_in);
258 	KI(out_requests);
259 	KI(acquire_requests);
260 	KI(bytes_expired);
261 	KI(out_discards);
262 	KI(in_accelerated);
263 	KI(out_accelerated);
264 	KI(noaccel);
265 	KI(crypto_sync);
266 	KI(crypto_async);
267 	KI(crypto_failures);
268 	KI(bad_decrypt);
269 	KI(sa_port_renumbers);
270 
271 #undef KI
272 #undef K64
273 
274 	kstat_install(espstack->esp_ksp);
275 
276 	return (B_TRUE);
277 }
278 
279 static int
280 esp_kstat_update(kstat_t *kp, int rw)
281 {
282 	esp_kstats_t *ekp;
283 	netstackid_t	stackid = (zoneid_t)(uintptr_t)kp->ks_private;
284 	netstack_t	*ns;
285 	ipsec_stack_t	*ipss;
286 
287 	if ((kp == NULL) || (kp->ks_data == NULL))
288 		return (EIO);
289 
290 	if (rw == KSTAT_WRITE)
291 		return (EACCES);
292 
293 	ns = netstack_find_by_stackid(stackid);
294 	if (ns == NULL)
295 		return (-1);
296 	ipss = ns->netstack_ipsec;
297 	if (ipss == NULL) {
298 		netstack_rele(ns);
299 		return (-1);
300 	}
301 	ekp = (esp_kstats_t *)kp->ks_data;
302 
303 	mutex_enter(&ipss->ipsec_alg_lock);
304 	ekp->esp_stat_num_aalgs.value.ui64 =
305 	    ipss->ipsec_nalgs[IPSEC_ALG_AUTH];
306 	ekp->esp_stat_num_ealgs.value.ui64 =
307 	    ipss->ipsec_nalgs[IPSEC_ALG_ENCR];
308 	mutex_exit(&ipss->ipsec_alg_lock);
309 
310 	netstack_rele(ns);
311 	return (0);
312 }
313 
314 #ifdef DEBUG
315 /*
316  * Debug routine, useful to see pre-encryption data.
317  */
318 static char *
319 dump_msg(mblk_t *mp)
320 {
321 	char tmp_str[3], tmp_line[256];
322 
323 	while (mp != NULL) {
324 		unsigned char *ptr;
325 
326 		printf("mblk address 0x%p, length %ld, db_ref %d "
327 		    "type %d, base 0x%p, lim 0x%p\n",
328 		    (void *) mp, (long)(mp->b_wptr - mp->b_rptr),
329 		    mp->b_datap->db_ref, mp->b_datap->db_type,
330 		    (void *)mp->b_datap->db_base, (void *)mp->b_datap->db_lim);
331 		ptr = mp->b_rptr;
332 
333 		tmp_line[0] = '\0';
334 		while (ptr < mp->b_wptr) {
335 			uint_t diff;
336 
337 			diff = (ptr - mp->b_rptr);
338 			if (!(diff & 0x1f)) {
339 				if (strlen(tmp_line) > 0) {
340 					printf("bytes: %s\n", tmp_line);
341 					tmp_line[0] = '\0';
342 				}
343 			}
344 			if (!(diff & 0x3))
345 				(void) strcat(tmp_line, " ");
346 			(void) sprintf(tmp_str, "%02x", *ptr);
347 			(void) strcat(tmp_line, tmp_str);
348 			ptr++;
349 		}
350 		if (strlen(tmp_line) > 0)
351 			printf("bytes: %s\n", tmp_line);
352 
353 		mp = mp->b_cont;
354 	}
355 
356 	return ("\n");
357 }
358 
359 #else /* DEBUG */
360 static char *
361 dump_msg(mblk_t *mp)
362 {
363 	printf("Find value of mp %p.\n", mp);
364 	return ("\n");
365 }
366 #endif /* DEBUG */
367 
368 /*
369  * Don't have to lock age_interval, as only one thread will access it at
370  * a time, because I control the one function that does with timeout().
371  */
372 static void
373 esp_ager(void *arg)
374 {
375 	ipsecesp_stack_t *espstack = (ipsecesp_stack_t *)arg;
376 	netstack_t	*ns = espstack->ipsecesp_netstack;
377 	hrtime_t begin = gethrtime();
378 
379 	sadb_ager(&espstack->esp_sadb.s_v4, espstack->esp_pfkey_q,
380 	    espstack->esp_sadb.s_ip_q, espstack->ipsecesp_reap_delay, ns);
381 	sadb_ager(&espstack->esp_sadb.s_v6, espstack->esp_pfkey_q,
382 	    espstack->esp_sadb.s_ip_q, espstack->ipsecesp_reap_delay, ns);
383 
384 	espstack->esp_event = sadb_retimeout(begin, espstack->esp_pfkey_q,
385 	    esp_ager, espstack,
386 	    &espstack->ipsecesp_age_interval, espstack->ipsecesp_age_int_max,
387 	    info.mi_idnum);
388 }
389 
390 /*
391  * Get an ESP NDD parameter.
392  */
393 /* ARGSUSED */
394 static int
395 ipsecesp_param_get(q, mp, cp, cr)
396 	queue_t	*q;
397 	mblk_t	*mp;
398 	caddr_t	cp;
399 	cred_t *cr;
400 {
401 	ipsecespparam_t	*ipsecesppa = (ipsecespparam_t *)cp;
402 	uint_t value;
403 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
404 
405 	mutex_enter(&espstack->ipsecesp_param_lock);
406 	value = ipsecesppa->ipsecesp_param_value;
407 	mutex_exit(&espstack->ipsecesp_param_lock);
408 
409 	(void) mi_mpprintf(mp, "%u", value);
410 	return (0);
411 }
412 
413 /*
414  * This routine sets an NDD variable in a ipsecespparam_t structure.
415  */
416 /* ARGSUSED */
417 static int
418 ipsecesp_param_set(q, mp, value, cp, cr)
419 	queue_t	*q;
420 	mblk_t	*mp;
421 	char	*value;
422 	caddr_t	cp;
423 	cred_t *cr;
424 {
425 	ulong_t	new_value;
426 	ipsecespparam_t	*ipsecesppa = (ipsecespparam_t *)cp;
427 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
428 
429 	/*
430 	 * Fail the request if the new value does not lie within the
431 	 * required bounds.
432 	 */
433 	if (ddi_strtoul(value, NULL, 10, &new_value) != 0 ||
434 	    new_value < ipsecesppa->ipsecesp_param_min ||
435 	    new_value > ipsecesppa->ipsecesp_param_max) {
436 		return (EINVAL);
437 	}
438 
439 	/* Set the new value */
440 	mutex_enter(&espstack->ipsecesp_param_lock);
441 	ipsecesppa->ipsecesp_param_value = new_value;
442 	mutex_exit(&espstack->ipsecesp_param_lock);
443 	return (0);
444 }
445 
446 /*
447  * Using lifetime NDD variables, fill in an extended combination's
448  * lifetime information.
449  */
450 void
451 ipsecesp_fill_defs(sadb_x_ecomb_t *ecomb, netstack_t *ns)
452 {
453 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
454 
455 	ecomb->sadb_x_ecomb_soft_bytes = espstack->ipsecesp_default_soft_bytes;
456 	ecomb->sadb_x_ecomb_hard_bytes = espstack->ipsecesp_default_hard_bytes;
457 	ecomb->sadb_x_ecomb_soft_addtime =
458 	    espstack->ipsecesp_default_soft_addtime;
459 	ecomb->sadb_x_ecomb_hard_addtime =
460 	    espstack->ipsecesp_default_hard_addtime;
461 	ecomb->sadb_x_ecomb_soft_usetime =
462 	    espstack->ipsecesp_default_soft_usetime;
463 	ecomb->sadb_x_ecomb_hard_usetime =
464 	    espstack->ipsecesp_default_hard_usetime;
465 }
466 
467 /*
468  * Initialize things for ESP at module load time.
469  */
470 boolean_t
471 ipsecesp_ddi_init(void)
472 {
473 	esp_taskq = taskq_create("esp_taskq", 1, minclsyspri,
474 	    IPSEC_TASKQ_MIN, IPSEC_TASKQ_MAX, 0);
475 
476 	/*
477 	 * We want to be informed each time a stack is created or
478 	 * destroyed in the kernel, so we can maintain the
479 	 * set of ipsecesp_stack_t's.
480 	 */
481 	netstack_register(NS_IPSECESP, ipsecesp_stack_init, NULL,
482 	    ipsecesp_stack_fini);
483 
484 	return (B_TRUE);
485 }
486 
487 /*
488  * Walk through the param array specified registering each element with the
489  * named dispatch handler.
490  */
491 static boolean_t
492 ipsecesp_param_register(IDP *ndp, ipsecespparam_t *espp, int cnt)
493 {
494 	for (; cnt-- > 0; espp++) {
495 		if (espp->ipsecesp_param_name != NULL &&
496 		    espp->ipsecesp_param_name[0]) {
497 			if (!nd_load(ndp,
498 			    espp->ipsecesp_param_name,
499 			    ipsecesp_param_get, ipsecesp_param_set,
500 			    (caddr_t)espp)) {
501 				nd_free(ndp);
502 				return (B_FALSE);
503 			}
504 		}
505 	}
506 	return (B_TRUE);
507 }
508 /*
509  * Initialize things for ESP for each stack instance
510  */
511 static void *
512 ipsecesp_stack_init(netstackid_t stackid, netstack_t *ns)
513 {
514 	ipsecesp_stack_t	*espstack;
515 	ipsecespparam_t		*espp;
516 
517 	espstack = (ipsecesp_stack_t *)kmem_zalloc(sizeof (*espstack),
518 	    KM_SLEEP);
519 	espstack->ipsecesp_netstack = ns;
520 
521 	espp = (ipsecespparam_t *)kmem_alloc(sizeof (lcl_param_arr), KM_SLEEP);
522 	espstack->ipsecesp_params = espp;
523 	bcopy(lcl_param_arr, espp, sizeof (lcl_param_arr));
524 
525 	(void) ipsecesp_param_register(&espstack->ipsecesp_g_nd, espp,
526 	    A_CNT(lcl_param_arr));
527 
528 	(void) esp_kstat_init(espstack, stackid);
529 
530 	espstack->esp_sadb.s_acquire_timeout =
531 	    &espstack->ipsecesp_acquire_timeout;
532 	espstack->esp_sadb.s_acqfn = esp_send_acquire;
533 	sadbp_init("ESP", &espstack->esp_sadb, SADB_SATYPE_ESP, esp_hash_size,
534 	    espstack->ipsecesp_netstack);
535 
536 	mutex_init(&espstack->ipsecesp_param_lock, NULL, MUTEX_DEFAULT, 0);
537 
538 	ip_drop_register(&espstack->esp_dropper, "IPsec ESP");
539 	return (espstack);
540 }
541 
542 /*
543  * Destroy things for ESP at module unload time.
544  */
545 void
546 ipsecesp_ddi_destroy(void)
547 {
548 	netstack_unregister(NS_IPSECESP);
549 	taskq_destroy(esp_taskq);
550 }
551 
552 /*
553  * Destroy things for ESP for one stack instance
554  */
555 static void
556 ipsecesp_stack_fini(netstackid_t stackid, void *arg)
557 {
558 	ipsecesp_stack_t *espstack = (ipsecesp_stack_t *)arg;
559 
560 	if (espstack->esp_pfkey_q != NULL) {
561 		(void) quntimeout(espstack->esp_pfkey_q, espstack->esp_event);
562 	}
563 	espstack->esp_sadb.s_acqfn = NULL;
564 	espstack->esp_sadb.s_acquire_timeout = NULL;
565 	sadbp_destroy(&espstack->esp_sadb, espstack->ipsecesp_netstack);
566 	ip_drop_unregister(&espstack->esp_dropper);
567 	mutex_destroy(&espstack->ipsecesp_param_lock);
568 	nd_free(&espstack->ipsecesp_g_nd);
569 
570 	kmem_free(espstack->ipsecesp_params, sizeof (lcl_param_arr));
571 	espstack->ipsecesp_params = NULL;
572 	kstat_delete_netstack(espstack->esp_ksp, stackid);
573 	espstack->esp_ksp = NULL;
574 	espstack->esp_kstats = NULL;
575 	kmem_free(espstack, sizeof (*espstack));
576 }
577 
578 /*
579  * ESP module open routine.
580  */
581 /* ARGSUSED */
582 static int
583 ipsecesp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
584 {
585 	netstack_t		*ns;
586 	ipsecesp_stack_t	*espstack;
587 
588 	if (secpolicy_ip_config(credp, B_FALSE) != 0)
589 		return (EPERM);
590 
591 	if (q->q_ptr != NULL)
592 		return (0);  /* Re-open of an already open instance. */
593 
594 	if (sflag != MODOPEN)
595 		return (EINVAL);
596 
597 	ns = netstack_find_by_cred(credp);
598 	ASSERT(ns != NULL);
599 	espstack = ns->netstack_ipsecesp;
600 	ASSERT(espstack != NULL);
601 
602 	/*
603 	 * ASSUMPTIONS (because I'm MT_OCEXCL):
604 	 *
605 	 *	* I'm being pushed on top of IP for all my opens (incl. #1).
606 	 *	* Only ipsecesp_open() can write into esp_sadb.s_ip_q.
607 	 *	* Because of this, I can check lazily for esp_sadb.s_ip_q.
608 	 *
609 	 *  If these assumptions are wrong, I'm in BIG trouble...
610 	 */
611 
612 	q->q_ptr = espstack;
613 	WR(q)->q_ptr = q->q_ptr;
614 
615 	if (espstack->esp_sadb.s_ip_q == NULL) {
616 		struct T_unbind_req *tur;
617 
618 		espstack->esp_sadb.s_ip_q = WR(q);
619 		/* Allocate an unbind... */
620 		espstack->esp_ip_unbind = allocb(sizeof (struct T_unbind_req),
621 		    BPRI_HI);
622 
623 		/*
624 		 * Send down T_BIND_REQ to bind IPPROTO_ESP.
625 		 * Handle the ACK here in ESP.
626 		 */
627 		qprocson(q);
628 		if (espstack->esp_ip_unbind == NULL ||
629 		    !sadb_t_bind_req(espstack->esp_sadb.s_ip_q, IPPROTO_ESP)) {
630 			if (espstack->esp_ip_unbind != NULL) {
631 				freeb(espstack->esp_ip_unbind);
632 				espstack->esp_ip_unbind = NULL;
633 			}
634 			q->q_ptr = NULL;
635 			netstack_rele(espstack->ipsecesp_netstack);
636 			return (ENOMEM);
637 		}
638 
639 		espstack->esp_ip_unbind->b_datap->db_type = M_PROTO;
640 		tur = (struct T_unbind_req *)espstack->esp_ip_unbind->b_rptr;
641 		tur->PRIM_type = T_UNBIND_REQ;
642 	} else {
643 		qprocson(q);
644 	}
645 
646 	/*
647 	 * For now, there's not much I can do.  I'll be getting a message
648 	 * passed down to me from keysock (in my wput), and a T_BIND_ACK
649 	 * up from IP (in my rput).
650 	 */
651 
652 	return (0);
653 }
654 
/*
 * ESP module close routine.
 *
 * q is the read queue of the instance being closed; its q_ptr is the ESP
 * stack it was attached to by ipsecesp_open().  In order, this routine:
 *
 *   1. sends the pre-allocated T_UNBIND_REQ down if this instance holds
 *      the stack's IP queue,
 *   2. turns the queue procedures off,
 *   3. forgets the keysock queue and cancels the ager timeout if this
 *      instance was the keysock queue,
 *   4. tries to move the IP queue role to the keysock instance if this
 *      instance held it, and
 *   5. releases the netstack reference.
 *
 * Always returns 0.
 */
static int
ipsecesp_close(queue_t *q)
{
	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;

	/*
	 * If esp_sadb.s_ip_q is attached to this instance, send a
	 * T_UNBIND_REQ to IP for the instance before doing
	 * a qprocsoff().  The message is consumed by putnext(), so the
	 * stack's pointer to it is cleared.
	 */
	if (WR(q) == espstack->esp_sadb.s_ip_q &&
	    espstack->esp_ip_unbind != NULL) {
		putnext(WR(q), espstack->esp_ip_unbind);
		espstack->esp_ip_unbind = NULL;
	}

	/*
	 * Clean up q_ptr, if needed.
	 */
	qprocsoff(q);

	/* Keysock queue check is safe, because of OCEXCL perimeter. */

	if (q == espstack->esp_pfkey_q) {
		esp1dbg(espstack,
		    ("ipsecesp_close:  Ummm... keysock is closing ESP.\n"));
		espstack->esp_pfkey_q = NULL;
		/* Detach qtimeouts. */
		(void) quntimeout(q, espstack->esp_event);
	}

	if (WR(q) == espstack->esp_sadb.s_ip_q) {
		/*
		 * If the esp_sadb.s_ip_q is attached to this instance, find
		 * another.  The OCEXCL outer perimeter helps us here.
		 */
		espstack->esp_sadb.s_ip_q = NULL;

		/*
		 * Find a replacement queue for esp_sadb.s_ip_q.
		 */
		if (espstack->esp_pfkey_q != NULL &&
		    espstack->esp_pfkey_q != RD(q)) {
			/*
			 * See if we can use the pfkey_q.
			 */
			espstack->esp_sadb.s_ip_q = WR(espstack->esp_pfkey_q);
		}

		/*
		 * No candidate, or the new T_BIND_REQ could not be sent:
		 * the stack is left without an IP queue.
		 */
		if (espstack->esp_sadb.s_ip_q == NULL ||
		    !sadb_t_bind_req(espstack->esp_sadb.s_ip_q, IPPROTO_ESP)) {
			esp1dbg(espstack, ("ipsecesp: Can't reassign ip_q.\n"));
			espstack->esp_sadb.s_ip_q = NULL;
		} else {
			/*
			 * Rebound on the replacement queue; pre-allocate the
			 * T_UNBIND_REQ that its own close will send.
			 */
			espstack->esp_ip_unbind =
			    allocb(sizeof (struct T_unbind_req), BPRI_HI);

			if (espstack->esp_ip_unbind != NULL) {
				struct T_unbind_req *tur;

				espstack->esp_ip_unbind->b_datap->db_type =
				    M_PROTO;
				tur = (struct T_unbind_req *)
				    espstack->esp_ip_unbind->b_rptr;
				tur->PRIM_type = T_UNBIND_REQ;
			}
			/* If it's NULL, I can't do much here. */
		}
	}

	/*
	 * NOTE(review): presumably pairs with the reference obtained through
	 * netstack_find_by_cred() in ipsecesp_open() -- confirm.
	 */
	netstack_rele(espstack->ipsecesp_netstack);
	return (0);
}
731 
732 /*
733  * Add a number of bytes to what the SA has protected so far.  Return
734  * B_TRUE if the SA can still protect that many bytes.
735  *
736  * Caller must REFRELE the passed-in assoc.  This function must REFRELE
737  * any obtained peer SA.
738  */
739 static boolean_t
740 esp_age_bytes(ipsa_t *assoc, uint64_t bytes, boolean_t inbound)
741 {
742 	ipsa_t *inassoc, *outassoc;
743 	isaf_t *bucket;
744 	boolean_t inrc, outrc, isv6;
745 	sadb_t *sp;
746 	int outhash;
747 	netstack_t		*ns = assoc->ipsa_netstack;
748 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
749 
750 	/* No peer?  No problem! */
751 	if (!assoc->ipsa_haspeer) {
752 		return (sadb_age_bytes(espstack->esp_pfkey_q, assoc, bytes,
753 		    B_TRUE));
754 	}
755 
756 	/*
757 	 * Otherwise, we want to grab both the original assoc and its peer.
758 	 * There might be a race for this, but if it's a real race, two
759 	 * expire messages may occur.  We limit this by only sending the
760 	 * expire message on one of the peers, we'll pick the inbound
761 	 * arbitrarily.
762 	 *
763 	 * If we need tight synchronization on the peer SA, then we need to
764 	 * reconsider.
765 	 */
766 
767 	/* Use address length to select IPv6/IPv4 */
768 	isv6 = (assoc->ipsa_addrfam == AF_INET6);
769 	sp = isv6 ? &espstack->esp_sadb.s_v6 : &espstack->esp_sadb.s_v4;
770 
771 	if (inbound) {
772 		inassoc = assoc;
773 		if (isv6) {
774 			outhash = OUTBOUND_HASH_V6(sp, *((in6_addr_t *)
775 			    &inassoc->ipsa_dstaddr));
776 		} else {
777 			outhash = OUTBOUND_HASH_V4(sp, *((ipaddr_t *)
778 			    &inassoc->ipsa_dstaddr));
779 		}
780 		bucket = &sp->sdb_of[outhash];
781 		mutex_enter(&bucket->isaf_lock);
782 		outassoc = ipsec_getassocbyspi(bucket, inassoc->ipsa_spi,
783 		    inassoc->ipsa_srcaddr, inassoc->ipsa_dstaddr,
784 		    inassoc->ipsa_addrfam);
785 		mutex_exit(&bucket->isaf_lock);
786 		if (outassoc == NULL) {
787 			/* Q: Do we wish to set haspeer == B_FALSE? */
788 			esp0dbg(("esp_age_bytes: "
789 			    "can't find peer for inbound.\n"));
790 			return (sadb_age_bytes(espstack->esp_pfkey_q, inassoc,
791 			    bytes, B_TRUE));
792 		}
793 	} else {
794 		outassoc = assoc;
795 		bucket = INBOUND_BUCKET(sp, outassoc->ipsa_spi);
796 		mutex_enter(&bucket->isaf_lock);
797 		inassoc = ipsec_getassocbyspi(bucket, outassoc->ipsa_spi,
798 		    outassoc->ipsa_srcaddr, outassoc->ipsa_dstaddr,
799 		    outassoc->ipsa_addrfam);
800 		mutex_exit(&bucket->isaf_lock);
801 		if (inassoc == NULL) {
802 			/* Q: Do we wish to set haspeer == B_FALSE? */
803 			esp0dbg(("esp_age_bytes: "
804 			    "can't find peer for outbound.\n"));
805 			return (sadb_age_bytes(espstack->esp_pfkey_q, outassoc,
806 			    bytes, B_TRUE));
807 		}
808 	}
809 
810 	inrc = sadb_age_bytes(espstack->esp_pfkey_q, inassoc, bytes, B_TRUE);
811 	outrc = sadb_age_bytes(espstack->esp_pfkey_q, outassoc, bytes, B_FALSE);
812 
813 	/*
814 	 * REFRELE any peer SA.
815 	 *
816 	 * Because of the multi-line macro nature of IPSA_REFRELE, keep
817 	 * them in { }.
818 	 */
819 	if (inbound) {
820 		IPSA_REFRELE(outassoc);
821 	} else {
822 		IPSA_REFRELE(inassoc);
823 	}
824 
825 	return (inrc && outrc);
826 }
827 
828 /*
829  * Do incoming NAT-T manipulations for packet.
830  */
831 static ipsec_status_t
832 esp_fix_natt_checksums(mblk_t *data_mp, ipsa_t *assoc)
833 {
834 	ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
835 	tcpha_t *tcph;
836 	udpha_t *udpha;
837 	/* Initialize to our inbound cksum adjustment... */
838 	uint32_t sum = assoc->ipsa_inbound_cksum;
839 
840 	switch (ipha->ipha_protocol) {
841 	case IPPROTO_TCP:
842 		tcph = (tcpha_t *)(data_mp->b_rptr +
843 		    IPH_HDR_LENGTH(ipha));
844 
845 #define	DOWN_SUM(x) (x) = ((x) & 0xFFFF) +	 ((x) >> 16)
846 		sum += ~ntohs(tcph->tha_sum) & 0xFFFF;
847 		DOWN_SUM(sum);
848 		DOWN_SUM(sum);
849 		tcph->tha_sum = ~htons(sum);
850 		break;
851 	case IPPROTO_UDP:
852 		udpha = (udpha_t *)(data_mp->b_rptr + IPH_HDR_LENGTH(ipha));
853 
854 		if (udpha->uha_checksum != 0) {
855 			/* Adujst if the inbound one was not zero. */
856 			sum += ~ntohs(udpha->uha_checksum) & 0xFFFF;
857 			DOWN_SUM(sum);
858 			DOWN_SUM(sum);
859 			udpha->uha_checksum = ~htons(sum);
860 			if (udpha->uha_checksum == 0)
861 				udpha->uha_checksum = 0xFFFF;
862 		}
863 #undef DOWN_SUM
864 		break;
865 	case IPPROTO_IP:
866 		/*
867 		 * This case is only an issue for self-encapsulated
868 		 * packets.  So for now, fall through.
869 		 */
870 		break;
871 	}
872 	return (IPSEC_STATUS_SUCCESS);
873 }
874 
875 
876 /*
877  * Strip ESP header, check padding, and fix IP header.
 * Returns B_TRUE on success, B_FALSE if an error occurred.
879  */
880 static boolean_t
881 esp_strip_header(mblk_t *data_mp, boolean_t isv4, uint32_t ivlen,
882     kstat_named_t **counter, ipsecesp_stack_t *espstack)
883 {
884 	ipha_t *ipha;
885 	ip6_t *ip6h;
886 	uint_t divpoint;
887 	mblk_t *scratch;
888 	uint8_t nexthdr, padlen;
889 	uint8_t lastpad;
890 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
891 	uint8_t *lastbyte;
892 
893 	/*
894 	 * Strip ESP data and fix IP header.
895 	 *
896 	 * XXX In case the beginning of esp_inbound() changes to not do a
897 	 * pullup, this part of the code can remain unchanged.
898 	 */
899 	if (isv4) {
900 		ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (ipha_t));
901 		ipha = (ipha_t *)data_mp->b_rptr;
902 		ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (esph_t) +
903 		    IPH_HDR_LENGTH(ipha));
904 		divpoint = IPH_HDR_LENGTH(ipha);
905 	} else {
906 		ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (ip6_t));
907 		ip6h = (ip6_t *)data_mp->b_rptr;
908 		divpoint = ip_hdr_length_v6(data_mp, ip6h);
909 	}
910 
911 	scratch = data_mp;
912 	while (scratch->b_cont != NULL)
913 		scratch = scratch->b_cont;
914 
915 	ASSERT((scratch->b_wptr - scratch->b_rptr) >= 3);
916 
917 	/*
918 	 * "Next header" and padding length are the last two bytes in the
919 	 * ESP-protected datagram, thus the explicit - 1 and - 2.
920 	 * lastpad is the last byte of the padding, which can be used for
921 	 * a quick check to see if the padding is correct.
922 	 */
923 	lastbyte = scratch->b_wptr - 1;
924 	nexthdr = *lastbyte--;
925 	padlen = *lastbyte--;
926 
927 	if (isv4) {
928 		/* Fix part of the IP header. */
929 		ipha->ipha_protocol = nexthdr;
930 		/*
931 		 * Reality check the padlen.  The explicit - 2 is for the
932 		 * padding length and the next-header bytes.
933 		 */
934 		if (padlen >= ntohs(ipha->ipha_length) - sizeof (ipha_t) - 2 -
935 		    sizeof (esph_t) - ivlen) {
936 			ESP_BUMP_STAT(espstack, bad_decrypt);
937 			ipsec_rl_strlog(espstack->ipsecesp_netstack,
938 			    info.mi_idnum, 0, 0,
939 			    SL_ERROR | SL_WARN,
940 			    "Corrupt ESP packet (padlen too big).\n");
941 			esp1dbg(espstack, ("padlen (%d) is greater than:\n",
942 			    padlen));
943 			esp1dbg(espstack, ("pkt len(%d) - ip hdr - esp "
944 			    "hdr - ivlen(%d) = %d.\n",
945 			    ntohs(ipha->ipha_length), ivlen,
946 			    (int)(ntohs(ipha->ipha_length) - sizeof (ipha_t) -
947 			    2 - sizeof (esph_t) - ivlen)));
948 			*counter = DROPPER(ipss, ipds_esp_bad_padlen);
949 			return (B_FALSE);
950 		}
951 
952 		/*
953 		 * Fix the rest of the header.  The explicit - 2 is for the
954 		 * padding length and the next-header bytes.
955 		 */
956 		ipha->ipha_length = htons(ntohs(ipha->ipha_length) - padlen -
957 		    2 - sizeof (esph_t) - ivlen);
958 		ipha->ipha_hdr_checksum = 0;
959 		ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
960 	} else {
961 		if (ip6h->ip6_nxt == IPPROTO_ESP) {
962 			ip6h->ip6_nxt = nexthdr;
963 		} else {
964 			ip6_pkt_t ipp;
965 
966 			bzero(&ipp, sizeof (ipp));
967 			(void) ip_find_hdr_v6(data_mp, ip6h, &ipp, NULL);
968 			if (ipp.ipp_dstopts != NULL) {
969 				ipp.ipp_dstopts->ip6d_nxt = nexthdr;
970 			} else if (ipp.ipp_rthdr != NULL) {
971 				ipp.ipp_rthdr->ip6r_nxt = nexthdr;
972 			} else if (ipp.ipp_hopopts != NULL) {
973 				ipp.ipp_hopopts->ip6h_nxt = nexthdr;
974 			} else {
975 				/* Panic a DEBUG kernel. */
976 				ASSERT(ipp.ipp_hopopts != NULL);
977 				/* Otherwise, pretend it's IP + ESP. */
978 				cmn_err(CE_WARN, "ESP IPv6 headers wrong.\n");
979 				ip6h->ip6_nxt = nexthdr;
980 			}
981 		}
982 
983 		if (padlen >= ntohs(ip6h->ip6_plen) - 2 - sizeof (esph_t) -
984 		    ivlen) {
985 			ESP_BUMP_STAT(espstack, bad_decrypt);
986 			ipsec_rl_strlog(espstack->ipsecesp_netstack,
987 			    info.mi_idnum, 0, 0,
988 			    SL_ERROR | SL_WARN,
989 			    "Corrupt ESP packet (v6 padlen too big).\n");
990 			esp1dbg(espstack, ("padlen (%d) is greater than:\n",
991 			    padlen));
992 			esp1dbg(espstack,
993 			    ("pkt len(%u) - ip hdr - esp hdr - ivlen(%d) = "
994 			    "%u.\n", (unsigned)(ntohs(ip6h->ip6_plen)
995 			    + sizeof (ip6_t)), ivlen,
996 			    (unsigned)(ntohs(ip6h->ip6_plen) - 2 -
997 			    sizeof (esph_t) - ivlen)));
998 			*counter = DROPPER(ipss, ipds_esp_bad_padlen);
999 			return (B_FALSE);
1000 		}
1001 
1002 
1003 		/*
1004 		 * Fix the rest of the header.  The explicit - 2 is for the
1005 		 * padding length and the next-header bytes.  IPv6 is nice,
1006 		 * because there's no hdr checksum!
1007 		 */
1008 		ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - padlen -
1009 		    2 - sizeof (esph_t) - ivlen);
1010 	}
1011 
1012 	if (espstack->ipsecesp_padding_check > 0 && padlen > 0) {
1013 		/*
1014 		 * Weak padding check: compare last-byte to length, they
1015 		 * should be equal.
1016 		 */
1017 		lastpad = *lastbyte--;
1018 
1019 		if (padlen != lastpad) {
1020 			ipsec_rl_strlog(espstack->ipsecesp_netstack,
1021 			    info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
1022 			    "Corrupt ESP packet (lastpad != padlen).\n");
1023 			esp1dbg(espstack,
1024 			    ("lastpad (%d) not equal to padlen (%d):\n",
1025 			    lastpad, padlen));
1026 			ESP_BUMP_STAT(espstack, bad_padding);
1027 			*counter = DROPPER(ipss, ipds_esp_bad_padding);
1028 			return (B_FALSE);
1029 		}
1030 
1031 		/*
1032 		 * Strong padding check: Check all pad bytes to see that
1033 		 * they're ascending.  Go backwards using a descending counter
1034 		 * to verify.  padlen == 1 is checked by previous block, so
1035 		 * only bother if we've more than 1 byte of padding.
1036 		 * Consequently, start the check one byte before the location
1037 		 * of "lastpad".
1038 		 */
1039 		if (espstack->ipsecesp_padding_check > 1) {
1040 			/*
1041 			 * This assert may have to become an if and a pullup
1042 			 * if we start accepting multi-dblk mblks. For now,
1043 			 * though, any packet here will have been pulled up in
1044 			 * esp_inbound.
1045 			 */
1046 			ASSERT(MBLKL(scratch) >= lastpad + 3);
1047 
1048 			/*
1049 			 * Use "--lastpad" because we already checked the very
1050 			 * last pad byte previously.
1051 			 */
1052 			while (--lastpad != 0) {
1053 				if (lastpad != *lastbyte) {
1054 					ipsec_rl_strlog(
1055 					    espstack->ipsecesp_netstack,
1056 					    info.mi_idnum, 0, 0,
1057 					    SL_ERROR | SL_WARN, "Corrupt ESP "
1058 					    "packet (bad padding).\n");
1059 					esp1dbg(espstack,
1060 					    ("padding not in correct"
1061 					    " format:\n"));
1062 					ESP_BUMP_STAT(espstack, bad_padding);
1063 					*counter = DROPPER(ipss,
1064 					    ipds_esp_bad_padding);
1065 					return (B_FALSE);
1066 				}
1067 				lastbyte--;
1068 			}
1069 		}
1070 	}
1071 
1072 	/* Trim off the padding. */
1073 	ASSERT(data_mp->b_cont == NULL);
1074 	data_mp->b_wptr -= (padlen + 2);
1075 
1076 	/*
1077 	 * Remove the ESP header.
1078 	 *
1079 	 * The above assertions about data_mp's size will make this work.
1080 	 *
1081 	 * XXX  Question:  If I send up and get back a contiguous mblk,
1082 	 * would it be quicker to bcopy over, or keep doing the dupb stuff?
1083 	 * I go with copying for now.
1084 	 */
1085 
1086 	if (IS_P2ALIGNED(data_mp->b_rptr, sizeof (uint32_t)) &&
1087 	    IS_P2ALIGNED(ivlen, sizeof (uint32_t))) {
1088 		uint8_t *start = data_mp->b_rptr;
1089 		uint32_t *src, *dst;
1090 
1091 		src = (uint32_t *)(start + divpoint);
1092 		dst = (uint32_t *)(start + divpoint + sizeof (esph_t) + ivlen);
1093 
1094 		ASSERT(IS_P2ALIGNED(dst, sizeof (uint32_t)) &&
1095 		    IS_P2ALIGNED(src, sizeof (uint32_t)));
1096 
1097 		do {
1098 			src--;
1099 			dst--;
1100 			*dst = *src;
1101 		} while (src != (uint32_t *)start);
1102 
1103 		data_mp->b_rptr = (uchar_t *)dst;
1104 	} else {
1105 		uint8_t *start = data_mp->b_rptr;
1106 		uint8_t *src, *dst;
1107 
1108 		src = start + divpoint;
1109 		dst = src + sizeof (esph_t) + ivlen;
1110 
1111 		do {
1112 			src--;
1113 			dst--;
1114 			*dst = *src;
1115 		} while (src != start);
1116 
1117 		data_mp->b_rptr = dst;
1118 	}
1119 
1120 	esp2dbg(espstack, ("data_mp after inbound ESP adjustment:\n"));
1121 	esp2dbg(espstack, (dump_msg(data_mp)));
1122 
1123 	return (B_TRUE);
1124 }
1125 
1126 /*
1127  * Updating use times can be tricky business if the ipsa_haspeer flag is
1128  * set.  This function is called once in an SA's lifetime.
1129  *
1130  * Caller has to REFRELE "assoc" which is passed in.  This function has
1131  * to REFRELE any peer SA that is obtained.
1132  */
1133 static void
1134 esp_set_usetime(ipsa_t *assoc, boolean_t inbound)
1135 {
1136 	ipsa_t *inassoc, *outassoc;
1137 	isaf_t *bucket;
1138 	sadb_t *sp;
1139 	int outhash;
1140 	boolean_t isv6;
1141 	netstack_t		*ns = assoc->ipsa_netstack;
1142 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
1143 
1144 	/* No peer?  No problem! */
1145 	if (!assoc->ipsa_haspeer) {
1146 		sadb_set_usetime(assoc);
1147 		return;
1148 	}
1149 
1150 	/*
1151 	 * Otherwise, we want to grab both the original assoc and its peer.
1152 	 * There might be a race for this, but if it's a real race, the times
1153 	 * will be out-of-synch by at most a second, and since our time
1154 	 * granularity is a second, this won't be a problem.
1155 	 *
1156 	 * If we need tight synchronization on the peer SA, then we need to
1157 	 * reconsider.
1158 	 */
1159 
1160 	/* Use address length to select IPv6/IPv4 */
1161 	isv6 = (assoc->ipsa_addrfam == AF_INET6);
1162 	sp = isv6 ? &espstack->esp_sadb.s_v6 : &espstack->esp_sadb.s_v4;
1163 
1164 	if (inbound) {
1165 		inassoc = assoc;
1166 		if (isv6) {
1167 			outhash = OUTBOUND_HASH_V6(sp, *((in6_addr_t *)
1168 			    &inassoc->ipsa_dstaddr));
1169 		} else {
1170 			outhash = OUTBOUND_HASH_V4(sp, *((ipaddr_t *)
1171 			    &inassoc->ipsa_dstaddr));
1172 		}
1173 		bucket = &sp->sdb_of[outhash];
1174 		mutex_enter(&bucket->isaf_lock);
1175 		outassoc = ipsec_getassocbyspi(bucket, inassoc->ipsa_spi,
1176 		    inassoc->ipsa_srcaddr, inassoc->ipsa_dstaddr,
1177 		    inassoc->ipsa_addrfam);
1178 		mutex_exit(&bucket->isaf_lock);
1179 		if (outassoc == NULL) {
1180 			/* Q: Do we wish to set haspeer == B_FALSE? */
1181 			esp0dbg(("esp_set_usetime: "
1182 			    "can't find peer for inbound.\n"));
1183 			sadb_set_usetime(inassoc);
1184 			return;
1185 		}
1186 	} else {
1187 		outassoc = assoc;
1188 		bucket = INBOUND_BUCKET(sp, outassoc->ipsa_spi);
1189 		mutex_enter(&bucket->isaf_lock);
1190 		inassoc = ipsec_getassocbyspi(bucket, outassoc->ipsa_spi,
1191 		    outassoc->ipsa_srcaddr, outassoc->ipsa_dstaddr,
1192 		    outassoc->ipsa_addrfam);
1193 		mutex_exit(&bucket->isaf_lock);
1194 		if (inassoc == NULL) {
1195 			/* Q: Do we wish to set haspeer == B_FALSE? */
1196 			esp0dbg(("esp_set_usetime: "
1197 			    "can't find peer for outbound.\n"));
1198 			sadb_set_usetime(outassoc);
1199 			return;
1200 		}
1201 	}
1202 
1203 	/* Update usetime on both. */
1204 	sadb_set_usetime(inassoc);
1205 	sadb_set_usetime(outassoc);
1206 
1207 	/*
1208 	 * REFRELE any peer SA.
1209 	 *
1210 	 * Because of the multi-line macro nature of IPSA_REFRELE, keep
1211 	 * them in { }.
1212 	 */
1213 	if (inbound) {
1214 		IPSA_REFRELE(outassoc);
1215 	} else {
1216 		IPSA_REFRELE(inassoc);
1217 	}
1218 }
1219 
1220 /*
1221  * Handle ESP inbound data for IPv4 and IPv6.
1222  * On success returns B_TRUE, on failure returns B_FALSE and frees the
1223  * mblk chain ipsec_in_mp.
1224  */
1225 ipsec_status_t
1226 esp_inbound(mblk_t *ipsec_in_mp, void *arg)
1227 {
1228 	mblk_t *data_mp = ipsec_in_mp->b_cont;
1229 	ipsec_in_t *ii = (ipsec_in_t *)ipsec_in_mp->b_rptr;
1230 	esph_t *esph = (esph_t *)arg;
1231 	ipsa_t *ipsa = ii->ipsec_in_esp_sa;
1232 	netstack_t	*ns = ii->ipsec_in_ns;
1233 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1234 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1235 
1236 	/*
1237 	 * We may wish to check replay in-range-only here as an optimization.
1238 	 * Include the reality check of ipsa->ipsa_replay >
1239 	 * ipsa->ipsa_replay_wsize for times when it's the first N packets,
1240 	 * where N == ipsa->ipsa_replay_wsize.
1241 	 *
1242 	 * Another check that may come here later is the "collision" check.
1243 	 * If legitimate packets flow quickly enough, this won't be a problem,
1244 	 * but collisions may cause authentication algorithm crunching to
1245 	 * take place when it doesn't need to.
1246 	 */
1247 	if (!sadb_replay_peek(ipsa, esph->esph_replay)) {
1248 		ESP_BUMP_STAT(espstack, replay_early_failures);
1249 		IP_ESP_BUMP_STAT(ipss, in_discards);
1250 		/*
1251 		 * TODO: Extract inbound interface from the IPSEC_IN
1252 		 * message's ii->ipsec_in_rill_index.
1253 		 */
1254 		ip_drop_packet(ipsec_in_mp, B_TRUE, NULL, NULL,
1255 		    DROPPER(ipss, ipds_esp_early_replay),
1256 		    &espstack->esp_dropper);
1257 		return (IPSEC_STATUS_FAILED);
1258 	}
1259 
1260 	/*
1261 	 * Has this packet already been processed by a hardware
1262 	 * IPsec accelerator?
1263 	 */
1264 	if (ii->ipsec_in_accelerated) {
1265 		ipsec_status_t rv;
1266 		esp3dbg(espstack,
1267 		    ("esp_inbound: pkt processed by ill=%d isv6=%d\n",
1268 		    ii->ipsec_in_ill_index, !ii->ipsec_in_v4));
1269 		rv = esp_inbound_accelerated(ipsec_in_mp,
1270 		    data_mp, ii->ipsec_in_v4, ipsa);
1271 		return (rv);
1272 	}
1273 	ESP_BUMP_STAT(espstack, noaccel);
1274 
1275 	/*
1276 	 * Adjust the IP header's payload length to reflect the removal
1277 	 * of the ICV.
1278 	 */
1279 	if (!ii->ipsec_in_v4) {
1280 		ip6_t *ip6h = (ip6_t *)data_mp->b_rptr;
1281 		ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) -
1282 		    ipsa->ipsa_mac_len);
1283 	} else {
1284 		ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
1285 		ipha->ipha_length = htons(ntohs(ipha->ipha_length) -
1286 		    ipsa->ipsa_mac_len);
1287 	}
1288 
1289 	/* submit the request to the crypto framework */
1290 	return (esp_submit_req_inbound(ipsec_in_mp, ipsa,
1291 	    (uint8_t *)esph - data_mp->b_rptr));
1292 }
1293 
1294 /*
1295  * Perform the really difficult work of inserting the proposed situation.
1296  * Called while holding the algorithm lock.
1297  */
1298 static void
1299 esp_insert_prop(sadb_prop_t *prop, ipsacq_t *acqrec, uint_t combs)
1300 {
1301 	sadb_comb_t *comb = (sadb_comb_t *)(prop + 1);
1302 	ipsec_out_t *io;
1303 	ipsec_action_t *ap;
1304 	ipsec_prot_t *prot;
1305 	netstack_t *ns;
1306 	ipsecesp_stack_t *espstack;
1307 	ipsec_stack_t *ipss;
1308 
1309 	io = (ipsec_out_t *)acqrec->ipsacq_mp->b_rptr;
1310 	ASSERT(io->ipsec_out_type == IPSEC_OUT);
1311 	ns = io->ipsec_out_ns;
1312 	espstack = ns->netstack_ipsecesp;
1313 	ipss = ns->netstack_ipsec;
1314 	ASSERT(MUTEX_HELD(&ipss->ipsec_alg_lock));
1315 
1316 	prop->sadb_prop_exttype = SADB_EXT_PROPOSAL;
1317 	prop->sadb_prop_len = SADB_8TO64(sizeof (sadb_prop_t));
1318 	*(uint32_t *)(&prop->sadb_prop_replay) = 0;	/* Quick zero-out! */
1319 
1320 	prop->sadb_prop_replay = espstack->ipsecesp_replay_size;
1321 
1322 	/*
1323 	 * Based upon algorithm properties, and what-not, prioritize
1324 	 * a proposal.  If the IPSEC_OUT message has an algorithm specified,
1325 	 * use it first and foremost.
1326 	 *
1327 	 * For each action in policy list
1328 	 *   Add combination.  If I've hit limit, return.
1329 	 */
1330 
1331 	for (ap = acqrec->ipsacq_act; ap != NULL;
1332 	    ap = ap->ipa_next) {
1333 		ipsec_alginfo_t *ealg = NULL;
1334 		ipsec_alginfo_t *aalg = NULL;
1335 
1336 		if (ap->ipa_act.ipa_type != IPSEC_POLICY_APPLY)
1337 			continue;
1338 
1339 		prot = &ap->ipa_act.ipa_apply;
1340 
1341 		if (!(prot->ipp_use_esp))
1342 			continue;
1343 
1344 		if (prot->ipp_esp_auth_alg != 0) {
1345 			aalg = ipss->ipsec_alglists[IPSEC_ALG_AUTH]
1346 			    [prot->ipp_esp_auth_alg];
1347 			if (aalg == NULL || !ALG_VALID(aalg))
1348 				continue;
1349 		}
1350 
1351 		ASSERT(prot->ipp_encr_alg > 0);
1352 		ealg = ipss->ipsec_alglists[IPSEC_ALG_ENCR]
1353 		    [prot->ipp_encr_alg];
1354 		if (ealg == NULL || !ALG_VALID(ealg))
1355 			continue;
1356 
1357 		comb->sadb_comb_flags = 0;
1358 		comb->sadb_comb_reserved = 0;
1359 		comb->sadb_comb_encrypt = ealg->alg_id;
1360 		comb->sadb_comb_encrypt_minbits =
1361 		    MAX(prot->ipp_espe_minbits, ealg->alg_ef_minbits);
1362 		comb->sadb_comb_encrypt_maxbits =
1363 		    MIN(prot->ipp_espe_maxbits, ealg->alg_ef_maxbits);
1364 		if (aalg == NULL) {
1365 			comb->sadb_comb_auth = 0;
1366 			comb->sadb_comb_auth_minbits = 0;
1367 			comb->sadb_comb_auth_maxbits = 0;
1368 		} else {
1369 			comb->sadb_comb_auth = aalg->alg_id;
1370 			comb->sadb_comb_auth_minbits =
1371 			    MAX(prot->ipp_espa_minbits, aalg->alg_ef_minbits);
1372 			comb->sadb_comb_auth_maxbits =
1373 			    MIN(prot->ipp_espa_maxbits, aalg->alg_ef_maxbits);
1374 		}
1375 
1376 		/*
1377 		 * The following may be based on algorithm
1378 		 * properties, but in the meantime, we just pick
1379 		 * some good, sensible numbers.  Key mgmt. can
1380 		 * (and perhaps should) be the place to finalize
1381 		 * such decisions.
1382 		 */
1383 
1384 		/*
1385 		 * No limits on allocations, since we really don't
1386 		 * support that concept currently.
1387 		 */
1388 		comb->sadb_comb_soft_allocations = 0;
1389 		comb->sadb_comb_hard_allocations = 0;
1390 
1391 		/*
1392 		 * These may want to come from policy rule..
1393 		 */
1394 		comb->sadb_comb_soft_bytes =
1395 		    espstack->ipsecesp_default_soft_bytes;
1396 		comb->sadb_comb_hard_bytes =
1397 		    espstack->ipsecesp_default_hard_bytes;
1398 		comb->sadb_comb_soft_addtime =
1399 		    espstack->ipsecesp_default_soft_addtime;
1400 		comb->sadb_comb_hard_addtime =
1401 		    espstack->ipsecesp_default_hard_addtime;
1402 		comb->sadb_comb_soft_usetime =
1403 		    espstack->ipsecesp_default_soft_usetime;
1404 		comb->sadb_comb_hard_usetime =
1405 		    espstack->ipsecesp_default_hard_usetime;
1406 
1407 		prop->sadb_prop_len += SADB_8TO64(sizeof (*comb));
1408 		if (--combs == 0)
1409 			break;	/* out of space.. */
1410 		comb++;
1411 	}
1412 }
1413 
1414 /*
1415  * Prepare and actually send the SADB_ACQUIRE message to PF_KEY.
1416  */
1417 static void
1418 esp_send_acquire(ipsacq_t *acqrec, mblk_t *extended, netstack_t *ns)
1419 {
1420 	uint_t combs;
1421 	sadb_msg_t *samsg;
1422 	sadb_prop_t *prop;
1423 	mblk_t *pfkeymp, *msgmp;
1424 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1425 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1426 
1427 	ESP_BUMP_STAT(espstack, acquire_requests);
1428 
1429 	if (espstack->esp_pfkey_q == NULL) {
1430 		mutex_exit(&acqrec->ipsacq_lock);
1431 		return;
1432 	}
1433 
1434 	/* Set up ACQUIRE. */
1435 	pfkeymp = sadb_setup_acquire(acqrec, SADB_SATYPE_ESP,
1436 	    ns->netstack_ipsec);
1437 	if (pfkeymp == NULL) {
1438 		esp0dbg(("sadb_setup_acquire failed.\n"));
1439 		mutex_exit(&acqrec->ipsacq_lock);
1440 		return;
1441 	}
1442 	ASSERT(MUTEX_HELD(&ipss->ipsec_alg_lock));
1443 	combs = ipss->ipsec_nalgs[IPSEC_ALG_AUTH] *
1444 	    ipss->ipsec_nalgs[IPSEC_ALG_ENCR];
1445 	msgmp = pfkeymp->b_cont;
1446 	samsg = (sadb_msg_t *)(msgmp->b_rptr);
1447 
1448 	/* Insert proposal here. */
1449 
1450 	prop = (sadb_prop_t *)(((uint64_t *)samsg) + samsg->sadb_msg_len);
1451 	esp_insert_prop(prop, acqrec, combs);
1452 	samsg->sadb_msg_len += prop->sadb_prop_len;
1453 	msgmp->b_wptr += SADB_64TO8(samsg->sadb_msg_len);
1454 
1455 	mutex_exit(&ipss->ipsec_alg_lock);
1456 
1457 	/*
1458 	 * Must mutex_exit() before sending PF_KEY message up, in
1459 	 * order to avoid recursive mutex_enter() if there are no registered
1460 	 * listeners.
1461 	 *
1462 	 * Once I've sent the message, I'm cool anyway.
1463 	 */
1464 	mutex_exit(&acqrec->ipsacq_lock);
1465 	if (extended != NULL) {
1466 		putnext(espstack->esp_pfkey_q, extended);
1467 	}
1468 	putnext(espstack->esp_pfkey_q, pfkeymp);
1469 }
1470 
1471 /*
1472  * Handle the SADB_GETSPI message.  Create a larval SA.
1473  */
1474 static void
1475 esp_getspi(mblk_t *mp, keysock_in_t *ksi, ipsecesp_stack_t *espstack)
1476 {
1477 	ipsa_t *newbie, *target;
1478 	isaf_t *outbound, *inbound;
1479 	int rc, diagnostic;
1480 	sadb_sa_t *assoc;
1481 	keysock_out_t *kso;
1482 	uint32_t newspi;
1483 
1484 	/*
1485 	 * Randomly generate a proposed SPI value
1486 	 */
1487 	(void) random_get_pseudo_bytes((uint8_t *)&newspi, sizeof (uint32_t));
1488 	newbie = sadb_getspi(ksi, newspi, &diagnostic,
1489 	    espstack->ipsecesp_netstack);
1490 
1491 	if (newbie == NULL) {
1492 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, ENOMEM, diagnostic,
1493 		    ksi->ks_in_serial);
1494 		return;
1495 	} else if (newbie == (ipsa_t *)-1) {
1496 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, EINVAL, diagnostic,
1497 		    ksi->ks_in_serial);
1498 		return;
1499 	}
1500 
1501 	/*
1502 	 * XXX - We may randomly collide.  We really should recover from this.
1503 	 *	 Unfortunately, that could require spending way-too-much-time
1504 	 *	 in here.  For now, let the user retry.
1505 	 */
1506 
1507 	if (newbie->ipsa_addrfam == AF_INET6) {
1508 		outbound = OUTBOUND_BUCKET_V6(&espstack->esp_sadb.s_v6,
1509 		    *(uint32_t *)(newbie->ipsa_dstaddr));
1510 		inbound = INBOUND_BUCKET(&espstack->esp_sadb.s_v6,
1511 		    newbie->ipsa_spi);
1512 	} else {
1513 		ASSERT(newbie->ipsa_addrfam == AF_INET);
1514 		outbound = OUTBOUND_BUCKET_V4(&espstack->esp_sadb.s_v4,
1515 		    *(uint32_t *)(newbie->ipsa_dstaddr));
1516 		inbound = INBOUND_BUCKET(&espstack->esp_sadb.s_v4,
1517 		    newbie->ipsa_spi);
1518 	}
1519 
1520 	mutex_enter(&outbound->isaf_lock);
1521 	mutex_enter(&inbound->isaf_lock);
1522 
1523 	/*
1524 	 * Check for collisions (i.e. did sadb_getspi() return with something
1525 	 * that already exists?).
1526 	 *
1527 	 * Try outbound first.  Even though SADB_GETSPI is traditionally
1528 	 * for inbound SAs, you never know what a user might do.
1529 	 */
1530 	target = ipsec_getassocbyspi(outbound, newbie->ipsa_spi,
1531 	    newbie->ipsa_srcaddr, newbie->ipsa_dstaddr, newbie->ipsa_addrfam);
1532 	if (target == NULL) {
1533 		target = ipsec_getassocbyspi(inbound, newbie->ipsa_spi,
1534 		    newbie->ipsa_srcaddr, newbie->ipsa_dstaddr,
1535 		    newbie->ipsa_addrfam);
1536 	}
1537 
1538 	/*
1539 	 * I don't have collisions elsewhere!
1540 	 * (Nor will I because I'm still holding inbound/outbound locks.)
1541 	 */
1542 
1543 	if (target != NULL) {
1544 		rc = EEXIST;
1545 		IPSA_REFRELE(target);
1546 	} else {
1547 		/*
1548 		 * sadb_insertassoc() also checks for collisions, so
1549 		 * if there's a colliding entry, rc will be set
1550 		 * to EEXIST.
1551 		 */
1552 		rc = sadb_insertassoc(newbie, inbound);
1553 		newbie->ipsa_hardexpiretime = gethrestime_sec();
1554 		newbie->ipsa_hardexpiretime +=
1555 		    espstack->ipsecesp_larval_timeout;
1556 	}
1557 
1558 	/*
1559 	 * Can exit outbound mutex.  Hold inbound until we're done
1560 	 * with newbie.
1561 	 */
1562 	mutex_exit(&outbound->isaf_lock);
1563 
1564 	if (rc != 0) {
1565 		mutex_exit(&inbound->isaf_lock);
1566 		IPSA_REFRELE(newbie);
1567 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, rc,
1568 		    SADB_X_DIAGNOSTIC_NONE, ksi->ks_in_serial);
1569 		return;
1570 	}
1571 
1572 
1573 	/* Can write here because I'm still holding the bucket lock. */
1574 	newbie->ipsa_type = SADB_SATYPE_ESP;
1575 
1576 	/*
1577 	 * Construct successful return message. We have one thing going
1578 	 * for us in PF_KEY v2.  That's the fact that
1579 	 *	sizeof (sadb_spirange_t) == sizeof (sadb_sa_t)
1580 	 */
1581 	assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SPIRANGE];
1582 	assoc->sadb_sa_exttype = SADB_EXT_SA;
1583 	assoc->sadb_sa_spi = newbie->ipsa_spi;
1584 	*((uint64_t *)(&assoc->sadb_sa_replay)) = 0;
1585 	mutex_exit(&inbound->isaf_lock);
1586 
1587 	/* Convert KEYSOCK_IN to KEYSOCK_OUT. */
1588 	kso = (keysock_out_t *)ksi;
1589 	kso->ks_out_len = sizeof (*kso);
1590 	kso->ks_out_serial = ksi->ks_in_serial;
1591 	kso->ks_out_type = KEYSOCK_OUT;
1592 
1593 	/*
1594 	 * Can safely putnext() to esp_pfkey_q, because this is a turnaround
1595 	 * from the esp_pfkey_q.
1596 	 */
1597 	putnext(espstack->esp_pfkey_q, mp);
1598 }
1599 
1600 /*
1601  * Insert the ESP header into a packet.  Duplicate an mblk, and insert a newly
1602  * allocated mblk with the ESP header in between the two.
1603  */
1604 static boolean_t
1605 esp_insert_esp(mblk_t *mp, mblk_t *esp_mp, uint_t divpoint,
1606     ipsecesp_stack_t *espstack)
1607 {
1608 	mblk_t *split_mp = mp;
1609 	uint_t wheretodiv = divpoint;
1610 
1611 	while ((split_mp->b_wptr - split_mp->b_rptr) < wheretodiv) {
1612 		wheretodiv -= (split_mp->b_wptr - split_mp->b_rptr);
1613 		split_mp = split_mp->b_cont;
1614 		ASSERT(split_mp != NULL);
1615 	}
1616 
1617 	if (split_mp->b_wptr - split_mp->b_rptr != wheretodiv) {
1618 		mblk_t *scratch;
1619 
1620 		/* "scratch" is the 2nd half, split_mp is the first. */
1621 		scratch = dupb(split_mp);
1622 		if (scratch == NULL) {
1623 			esp1dbg(espstack,
1624 			    ("esp_insert_esp: can't allocate scratch.\n"));
1625 			return (B_FALSE);
1626 		}
1627 		/* NOTE:  dupb() doesn't set b_cont appropriately. */
1628 		scratch->b_cont = split_mp->b_cont;
1629 		scratch->b_rptr += wheretodiv;
1630 		split_mp->b_wptr = split_mp->b_rptr + wheretodiv;
1631 		split_mp->b_cont = scratch;
1632 	}
1633 	/*
1634 	 * At this point, split_mp is exactly "wheretodiv" bytes long, and
1635 	 * holds the end of the pre-ESP part of the datagram.
1636 	 */
1637 	esp_mp->b_cont = split_mp->b_cont;
1638 	split_mp->b_cont = esp_mp;
1639 
1640 	return (B_TRUE);
1641 }
1642 
1643 /*
1644  * Section 7 of RFC 3947 says:
1645  *
1646  * 7.  Recovering from the Expiring NAT Mappings
1647  *
1648  *    There are cases where NAT box decides to remove mappings that are still
1649  *    alive (for example, when the keepalive interval is too long, or when the
1650  *    NAT box is rebooted).  To recover from this, ends that are NOT behind
1651  *    NAT SHOULD use the last valid UDP encapsulated IKE or IPsec packet from
1652  *    the other end to determine which IP and port addresses should be used.
1653  *    The host behind dynamic NAT MUST NOT do this, as otherwise it opens a
1654  *    DoS attack possibility because the IP address or port of the other host
1655  *    will not change (it is not behind NAT).
1656  *
1657  *    Keepalives cannot be used for these purposes, as they are not
1658  *    authenticated, but any IKE authenticated IKE packet or ESP packet can be
1659  *    used to detect whether the IP address or the port has changed.
1660  *
1661  * The following function will check an SA and its explicitly-set pair to see
1662  * if the NAT-T remote port matches the received packet (which must have
1663  * passed ESP authentication, see esp_in_done() for the caller context).  If
1664  * there is a mismatch, the SAs are updated.  It is not important if we race
1665  * with a transmitting thread, as if there is a transmitting thread, it will
1666  * merely emit a packet that will most-likely be dropped.
1667  *
1668  * "ports" are ordered src,dst, and assoc is an inbound SA, where src should
1669  * match ipsa_remote_nat_port and dst should match ipsa_local_nat_port.
1670  */
/*
 * Split a 32-bit word holding two adjacent 16-bit quantities as they sit in
 * memory: FIRST_16 yields the one at the lower address, NEXT_16 the one
 * after it.  Which half of the word that is depends on host byte order.
 */
#ifndef _LITTLE_ENDIAN
#define	FIRST_16(x) (((x) >> 16) & 0xFFFF)
#define	NEXT_16(x) ((x) & 0xFFFF)
#else	/* _LITTLE_ENDIAN */
#define	FIRST_16(x) ((x) & 0xFFFF)
#define	NEXT_16(x) (((x) >> 16) & 0xFFFF)
#endif	/* !_LITTLE_ENDIAN */
1678 static void
1679 esp_port_freshness(uint32_t ports, ipsa_t *assoc)
1680 {
1681 	uint16_t remote = FIRST_16(ports);
1682 	uint16_t local = NEXT_16(ports);
1683 	ipsa_t *outbound_peer;
1684 	isaf_t *bucket;
1685 	ipsecesp_stack_t *espstack = assoc->ipsa_netstack->netstack_ipsecesp;
1686 
1687 	/* We found a conn_t, therefore local != 0. */
1688 	ASSERT(local != 0);
1689 	/* Assume an IPv4 SA. */
1690 	ASSERT(assoc->ipsa_addrfam == AF_INET);
1691 
1692 	/*
1693 	 * On-the-wire rport == 0 means something's very wrong.
1694 	 * An unpaired SA is also useless to us.
1695 	 * If we are behind the NAT, don't bother.
1696 	 * A zero local NAT port defaults to 4500, so check that too.
1697 	 * And, of course, if the ports already match, we don't need to
1698 	 * bother.
1699 	 */
1700 	if (remote == 0 || assoc->ipsa_otherspi == 0 ||
1701 	    (assoc->ipsa_flags & IPSA_F_BEHIND_NAT) ||
1702 	    (assoc->ipsa_remote_nat_port == 0 &&
1703 	    remote == htons(IPPORT_IKE_NATT)) ||
1704 	    remote == assoc->ipsa_remote_nat_port)
1705 		return;
1706 
1707 	/* Try and snag the peer.   NOTE:  Assume IPv4 for now. */
1708 	bucket = OUTBOUND_BUCKET_V4(&(espstack->esp_sadb.s_v4),
1709 	    assoc->ipsa_srcaddr[0]);
1710 	mutex_enter(&bucket->isaf_lock);
1711 	outbound_peer = ipsec_getassocbyspi(bucket, assoc->ipsa_otherspi,
1712 	    assoc->ipsa_dstaddr, assoc->ipsa_srcaddr, AF_INET);
1713 	mutex_exit(&bucket->isaf_lock);
1714 
1715 	/* We probably lost a race to a deleting or expiring thread. */
1716 	if (outbound_peer == NULL)
1717 		return;
1718 
1719 	/*
1720 	 * Hold the mutexes for both SAs so we don't race another inbound
1721 	 * thread.  A lock-entry order shouldn't matter, since all other
1722 	 * per-ipsa locks are individually held-then-released.
1723 	 *
1724 	 * Luckily, this has nothing to do with the remote-NAT address,
1725 	 * so we don't have to re-scribble the cached-checksum differential.
1726 	 */
1727 	mutex_enter(&outbound_peer->ipsa_lock);
1728 	mutex_enter(&assoc->ipsa_lock);
1729 	outbound_peer->ipsa_remote_nat_port = assoc->ipsa_remote_nat_port =
1730 	    remote;
1731 	mutex_exit(&assoc->ipsa_lock);
1732 	mutex_exit(&outbound_peer->ipsa_lock);
1733 	IPSA_REFRELE(outbound_peer);
1734 	ESP_BUMP_STAT(espstack, sa_port_renumbers);
1735 }
1736 
1737 /*
1738  * Finish processing of an inbound ESP packet after processing by the
1739  * crypto framework.
1740  * - Remove the ESP header.
1741  * - Send packet back to IP.
1742  * If authentication was performed on the packet, this function is called
1743  * only if the authentication succeeded.
1744  * On success returns B_TRUE, on failure returns B_FALSE and frees the
1745  * mblk chain ipsec_in_mp.
1746  */
1747 static ipsec_status_t
1748 esp_in_done(mblk_t *ipsec_in_mp)
1749 {
1750 	ipsec_in_t *ii = (ipsec_in_t *)ipsec_in_mp->b_rptr;
1751 	mblk_t *data_mp;
1752 	ipsa_t *assoc;
1753 	uint_t espstart;
1754 	uint32_t ivlen = 0;
1755 	uint_t processed_len;
1756 	esph_t *esph;
1757 	kstat_named_t *counter;
1758 	boolean_t is_natt;
1759 	netstack_t	*ns = ii->ipsec_in_ns;
1760 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1761 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1762 
1763 	assoc = ii->ipsec_in_esp_sa;
1764 	ASSERT(assoc != NULL);
1765 
1766 	is_natt = ((assoc->ipsa_flags & IPSA_F_NATT) != 0);
1767 
1768 	/* get the pointer to the ESP header */
1769 	if (assoc->ipsa_encr_alg == SADB_EALG_NULL) {
1770 		/* authentication-only ESP */
1771 		espstart = ii->ipsec_in_crypto_data.cd_offset;
1772 		processed_len = ii->ipsec_in_crypto_data.cd_length;
1773 	} else {
1774 		/* encryption present */
1775 		ivlen = assoc->ipsa_iv_len;
1776 		if (assoc->ipsa_auth_alg == SADB_AALG_NONE) {
1777 			/* encryption-only ESP */
1778 			espstart = ii->ipsec_in_crypto_data.cd_offset -
1779 			    sizeof (esph_t) - assoc->ipsa_iv_len;
1780 			processed_len = ii->ipsec_in_crypto_data.cd_length +
1781 			    ivlen;
1782 		} else {
1783 			/* encryption with authentication */
1784 			espstart = ii->ipsec_in_crypto_dual_data.dd_offset1;
1785 			processed_len = ii->ipsec_in_crypto_dual_data.dd_len2 +
1786 			    ivlen;
1787 		}
1788 	}
1789 
1790 	data_mp = ipsec_in_mp->b_cont;
1791 	esph = (esph_t *)(data_mp->b_rptr + espstart);
1792 
1793 	if (assoc->ipsa_auth_alg != IPSA_AALG_NONE) {
1794 		/* authentication passed if we reach this point */
1795 		ESP_BUMP_STAT(espstack, good_auth);
1796 		data_mp->b_wptr -= assoc->ipsa_mac_len;
1797 
1798 		/*
1799 		 * Check replay window here!
1800 		 * For right now, assume keysock will set the replay window
1801 		 * size to zero for SAs that have an unspecified sender.
1802 		 * This may change...
1803 		 */
1804 
1805 		if (!sadb_replay_check(assoc, esph->esph_replay)) {
1806 			/*
1807 			 * Log the event. As of now we print out an event.
1808 			 * Do not print the replay failure number, or else
1809 			 * syslog cannot collate the error messages.  Printing
1810 			 * the replay number that failed opens a denial-of-
1811 			 * service attack.
1812 			 */
1813 			ipsec_assocfailure(info.mi_idnum, 0, 0,
1814 			    SL_ERROR | SL_WARN,
1815 			    "Replay failed for ESP spi 0x%x, dst %s.\n",
1816 			    assoc->ipsa_spi, assoc->ipsa_dstaddr,
1817 			    assoc->ipsa_addrfam, espstack->ipsecesp_netstack);
1818 			ESP_BUMP_STAT(espstack, replay_failures);
1819 			counter = DROPPER(ipss, ipds_esp_replay);
1820 			goto drop_and_bail;
1821 		}
1822 
1823 		if (is_natt)
1824 			esp_port_freshness(ii->ipsec_in_esp_udp_ports, assoc);
1825 	}
1826 
1827 	esp_set_usetime(assoc, B_TRUE);
1828 
1829 	if (!esp_age_bytes(assoc, processed_len, B_TRUE)) {
1830 		/* The ipsa has hit hard expiration, LOG and AUDIT. */
1831 		ipsec_assocfailure(info.mi_idnum, 0, 0,
1832 		    SL_ERROR | SL_WARN,
1833 		    "ESP association 0x%x, dst %s had bytes expire.\n",
1834 		    assoc->ipsa_spi, assoc->ipsa_dstaddr, assoc->ipsa_addrfam,
1835 		    espstack->ipsecesp_netstack);
1836 		ESP_BUMP_STAT(espstack, bytes_expired);
1837 		counter = DROPPER(ipss, ipds_esp_bytes_expire);
1838 		goto drop_and_bail;
1839 	}
1840 
1841 	/*
1842 	 * Remove ESP header and padding from packet.  I hope the compiler
1843 	 * spews "branch, predict taken" code for this.
1844 	 */
1845 
1846 	if (esp_strip_header(data_mp, ii->ipsec_in_v4, ivlen, &counter,
1847 	    espstack)) {
1848 		if (is_natt)
1849 			return (esp_fix_natt_checksums(data_mp, assoc));
1850 		return (IPSEC_STATUS_SUCCESS);
1851 	}
1852 
1853 	esp1dbg(espstack, ("esp_in_done: esp_strip_header() failed\n"));
1854 drop_and_bail:
1855 	IP_ESP_BUMP_STAT(ipss, in_discards);
1856 	/*
1857 	 * TODO: Extract inbound interface from the IPSEC_IN message's
1858 	 * ii->ipsec_in_rill_index.
1859 	 */
1860 	ip_drop_packet(ipsec_in_mp, B_TRUE, NULL, NULL, counter,
1861 	    &espstack->esp_dropper);
1862 	return (IPSEC_STATUS_FAILED);
1863 }
1864 
1865 /*
1866  * Called upon failing the inbound ICV check. The message passed as
1867  * argument is freed.
1868  */
1869 static void
1870 esp_log_bad_auth(mblk_t *ipsec_in)
1871 {
1872 	ipsec_in_t *ii = (ipsec_in_t *)ipsec_in->b_rptr;
1873 	ipsa_t *assoc = ii->ipsec_in_esp_sa;
1874 	netstack_t	*ns = ii->ipsec_in_ns;
1875 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1876 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1877 
1878 	/*
1879 	 * Log the event. Don't print to the console, block
1880 	 * potential denial-of-service attack.
1881 	 */
1882 	ESP_BUMP_STAT(espstack, bad_auth);
1883 
1884 	ipsec_assocfailure(info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
1885 	    "ESP Authentication failed for spi 0x%x, dst %s.\n",
1886 	    assoc->ipsa_spi, assoc->ipsa_dstaddr, assoc->ipsa_addrfam,
1887 	    espstack->ipsecesp_netstack);
1888 
1889 	IP_ESP_BUMP_STAT(ipss, in_discards);
1890 	/*
1891 	 * TODO: Extract inbound interface from the IPSEC_IN
1892 	 * message's ii->ipsec_in_rill_index.
1893 	 */
1894 	ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL,
1895 	    DROPPER(ipss, ipds_esp_bad_auth),
1896 	    &espstack->esp_dropper);
1897 }
1898 
1899 
1900 /*
1901  * Invoked for outbound packets after ESP processing. If the packet
1902  * also requires AH, performs the AH SA selection and AH processing.
1903  * Returns B_TRUE if the AH processing was not needed or if it was
1904  * performed successfully. Returns B_FALSE and consumes the passed mblk
1905  * if AH processing was required but could not be performed.
1906  */
1907 static boolean_t
1908 esp_do_outbound_ah(mblk_t *ipsec_mp)
1909 {
1910 	ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr;
1911 	ipsec_status_t ipsec_rc;
1912 	ipsec_action_t *ap;
1913 
1914 	ap = io->ipsec_out_act;
1915 	if (ap == NULL) {
1916 		ipsec_policy_t *pp = io->ipsec_out_policy;
1917 		ap = pp->ipsp_act;
1918 	}
1919 
1920 	if (!ap->ipa_want_ah)
1921 		return (B_TRUE);
1922 
1923 	ASSERT(io->ipsec_out_ah_done == B_FALSE);
1924 
1925 	if (io->ipsec_out_ah_sa == NULL) {
1926 		if (!ipsec_outbound_sa(ipsec_mp, IPPROTO_AH)) {
1927 			sadb_acquire(ipsec_mp, io, B_TRUE, B_FALSE);
1928 			return (B_FALSE);
1929 		}
1930 	}
1931 	ASSERT(io->ipsec_out_ah_sa != NULL);
1932 
1933 	io->ipsec_out_ah_done = B_TRUE;
1934 	ipsec_rc = io->ipsec_out_ah_sa->ipsa_output_func(ipsec_mp);
1935 	return (ipsec_rc == IPSEC_STATUS_SUCCESS);
1936 }
1937 
1938 
1939 /*
1940  * Kernel crypto framework callback invoked after completion of async
1941  * crypto requests.
1942  */
1943 static void
1944 esp_kcf_callback(void *arg, int status)
1945 {
1946 	mblk_t *ipsec_mp = (mblk_t *)arg;
1947 	ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr;
1948 	ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr;
1949 	boolean_t is_inbound = (ii->ipsec_in_type == IPSEC_IN);
1950 	netstackid_t	stackid;
1951 	netstack_t	*ns, *ns_arg;
1952 	ipsecesp_stack_t *espstack;
1953 	ipsec_stack_t	*ipss;
1954 
1955 	ASSERT(ipsec_mp->b_cont != NULL);
1956 
1957 	if (is_inbound) {
1958 		stackid = ii->ipsec_in_stackid;
1959 		ns_arg = ii->ipsec_in_ns;
1960 	} else {
1961 		stackid = io->ipsec_out_stackid;
1962 		ns_arg = io->ipsec_out_ns;
1963 	}
1964 
1965 	/*
1966 	 * Verify that the netstack is still around; could have vanished
1967 	 * while kEf was doing its work.
1968 	 */
1969 	ns = netstack_find_by_stackid(stackid);
1970 	if (ns == NULL || ns != ns_arg) {
1971 		/* Disappeared on us */
1972 		if (ns != NULL)
1973 			netstack_rele(ns);
1974 		freemsg(ipsec_mp);
1975 		return;
1976 	}
1977 
1978 	espstack = ns->netstack_ipsecesp;
1979 	ipss = ns->netstack_ipsec;
1980 
1981 	if (status == CRYPTO_SUCCESS) {
1982 		if (is_inbound) {
1983 			if (esp_in_done(ipsec_mp) != IPSEC_STATUS_SUCCESS) {
1984 				netstack_rele(ns);
1985 				return;
1986 			}
1987 			/* finish IPsec processing */
1988 			ip_fanout_proto_again(ipsec_mp, NULL, NULL, NULL);
1989 		} else {
1990 			/*
1991 			 * If a ICV was computed, it was stored by the
1992 			 * crypto framework at the end of the packet.
1993 			 */
1994 			ipha_t *ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr;
1995 
1996 			esp_set_usetime(io->ipsec_out_esp_sa, B_FALSE);
1997 			/* NAT-T packet. */
1998 			if (ipha->ipha_protocol == IPPROTO_UDP)
1999 				esp_prepare_udp(ns, ipsec_mp->b_cont, ipha);
2000 
2001 			/* do AH processing if needed */
2002 			if (!esp_do_outbound_ah(ipsec_mp)) {
2003 				netstack_rele(ns);
2004 				return;
2005 			}
2006 			/* finish IPsec processing */
2007 			if (IPH_HDR_VERSION(ipha) == IP_VERSION) {
2008 				ip_wput_ipsec_out(NULL, ipsec_mp, ipha, NULL,
2009 				    NULL);
2010 			} else {
2011 				ip6_t *ip6h = (ip6_t *)ipha;
2012 				ip_wput_ipsec_out_v6(NULL, ipsec_mp, ip6h,
2013 				    NULL, NULL);
2014 			}
2015 		}
2016 
2017 	} else if (status == CRYPTO_INVALID_MAC) {
2018 		esp_log_bad_auth(ipsec_mp);
2019 
2020 	} else {
2021 		esp1dbg(espstack,
2022 		    ("esp_kcf_callback: crypto failed with 0x%x\n",
2023 		    status));
2024 		ESP_BUMP_STAT(espstack, crypto_failures);
2025 		if (is_inbound)
2026 			IP_ESP_BUMP_STAT(ipss, in_discards);
2027 		else
2028 			ESP_BUMP_STAT(espstack, out_discards);
2029 		ip_drop_packet(ipsec_mp, is_inbound, NULL, NULL,
2030 		    DROPPER(ipss, ipds_esp_crypto_failed),
2031 		    &espstack->esp_dropper);
2032 	}
2033 	netstack_rele(ns);
2034 }
2035 
2036 /*
2037  * Invoked on crypto framework failure during inbound and outbound processing.
2038  */
2039 static void
2040 esp_crypto_failed(mblk_t *mp, boolean_t is_inbound, int kef_rc,
2041     ipsecesp_stack_t *espstack)
2042 {
2043 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
2044 
2045 	esp1dbg(espstack, ("crypto failed for %s ESP with 0x%x\n",
2046 	    is_inbound ? "inbound" : "outbound", kef_rc));
2047 	ip_drop_packet(mp, is_inbound, NULL, NULL,
2048 	    DROPPER(ipss, ipds_esp_crypto_failed),
2049 	    &espstack->esp_dropper);
2050 	ESP_BUMP_STAT(espstack, crypto_failures);
2051 	if (is_inbound)
2052 		IP_ESP_BUMP_STAT(ipss, in_discards);
2053 	else
2054 		ESP_BUMP_STAT(espstack, out_discards);
2055 }
2056 
/*
 * Helpers that fill in the argument structures passed to the kernel crypto
 * framework.  Each macro expands to a brace-enclosed block and is meant to
 * be used as a complete statement.  Every macro argument is parenthesized
 * in the expansion so that arbitrary expressions can be passed safely.
 */

/*
 * Initialize the call request _cr for a crypto submission whose completion
 * is reported through esp_kcf_callback().
 * NOTE: the callback argument is the variable `ipsec_mp', which must exist
 * in the scope where the macro is used.
 */
#define	ESP_INIT_CALLREQ(_cr) {						\
	(_cr)->cr_flag = CRYPTO_SKIP_REQID|CRYPTO_RESTRICTED;		\
	(_cr)->cr_callback_arg = ipsec_mp;				\
	(_cr)->cr_callback_func = esp_kcf_callback;			\
}

/*
 * Describe the ICV buffer: mac becomes a raw crypto data descriptor that
 * covers icvlen bytes starting at icvbuf.
 */
#define	ESP_INIT_CRYPTO_MAC(mac, icvlen, icvbuf) {			\
	(mac)->cd_format = CRYPTO_DATA_RAW;				\
	(mac)->cd_offset = 0;						\
	(mac)->cd_length = (icvlen);					\
	(mac)->cd_raw.iov_base = (char *)(icvbuf);			\
	(mac)->cd_raw.iov_len = (icvlen);				\
}

/*
 * Describe len bytes of input starting off bytes into mp.  When the whole
 * range lies inside the first mblk a raw descriptor over that mblk is used;
 * otherwise the descriptor refers to the mblk chain itself.
 */
#define	ESP_INIT_CRYPTO_DATA(data, mp, off, len) {			\
	if (MBLKL(mp) >= (len) + (off)) {				\
		(data)->cd_format = CRYPTO_DATA_RAW;			\
		(data)->cd_raw.iov_base = (char *)(mp)->b_rptr;		\
		(data)->cd_raw.iov_len = MBLKL(mp);			\
	} else {							\
		(data)->cd_format = CRYPTO_DATA_MBLK;			\
		(data)->cd_mp = (mp);					\
	}								\
	(data)->cd_offset = (off);					\
	(data)->cd_length = (len);					\
}

/*
 * Describe two (offset, length) ranges over the mblk chain mp for a dual
 * crypto operation.
 */
#define	ESP_INIT_CRYPTO_DUAL_DATA(data, mp, off1, len1, off2, len2) {	\
	(data)->dd_format = CRYPTO_DATA_MBLK;				\
	(data)->dd_mp = (mp);						\
	(data)->dd_len1 = (len1);					\
	(data)->dd_offset1 = (off1);					\
	(data)->dd_len2 = (len2);					\
	(data)->dd_offset2 = (off2);					\
}
2093 
2094 static ipsec_status_t
2095 esp_submit_req_inbound(mblk_t *ipsec_mp, ipsa_t *assoc, uint_t esph_offset)
2096 {
2097 	ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr;
2098 	boolean_t do_auth;
2099 	uint_t auth_offset, msg_len, auth_len;
2100 	crypto_call_req_t call_req;
2101 	mblk_t *esp_mp;
2102 	int kef_rc = CRYPTO_FAILED;
2103 	uint_t icv_len = assoc->ipsa_mac_len;
2104 	crypto_ctx_template_t auth_ctx_tmpl;
2105 	boolean_t do_encr;
2106 	uint_t encr_offset, encr_len;
2107 	uint_t iv_len = assoc->ipsa_iv_len;
2108 	crypto_ctx_template_t encr_ctx_tmpl;
2109 	netstack_t	*ns = ii->ipsec_in_ns;
2110 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
2111 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
2112 
2113 	ASSERT(ii->ipsec_in_type == IPSEC_IN);
2114 
2115 	/*
2116 	 * In case kEF queues and calls back, keep netstackid_t for
2117 	 * verification that the IP instance is still around in
2118 	 * esp_kcf_callback().
2119 	 */
2120 	ii->ipsec_in_stackid = ns->netstack_stackid;
2121 
2122 	do_auth = assoc->ipsa_auth_alg != SADB_AALG_NONE;
2123 	do_encr = assoc->ipsa_encr_alg != SADB_EALG_NULL;
2124 
2125 	/*
2126 	 * An inbound packet is of the form:
2127 	 * IPSEC_IN -> [IP,options,ESP,IV,data,ICV,pad]
2128 	 */
2129 	esp_mp = ipsec_mp->b_cont;
2130 	msg_len = MBLKL(esp_mp);
2131 
2132 	ESP_INIT_CALLREQ(&call_req);
2133 
2134 	if (do_auth) {
2135 		/* force asynchronous processing? */
2136 		if (ipss->ipsec_algs_exec_mode[IPSEC_ALG_AUTH] ==
2137 		    IPSEC_ALGS_EXEC_ASYNC)
2138 			call_req.cr_flag |= CRYPTO_ALWAYS_QUEUE;
2139 
2140 		/* authentication context template */
2141 		IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH,
2142 		    auth_ctx_tmpl);
2143 
2144 		/* ICV to be verified */
2145 		ESP_INIT_CRYPTO_MAC(&ii->ipsec_in_crypto_mac,
2146 		    icv_len, esp_mp->b_wptr - icv_len);
2147 
2148 		/* authentication starts at the ESP header */
2149 		auth_offset = esph_offset;
2150 		auth_len = msg_len - auth_offset - icv_len;
2151 		if (!do_encr) {
2152 			/* authentication only */
2153 			/* initialize input data argument */
2154 			ESP_INIT_CRYPTO_DATA(&ii->ipsec_in_crypto_data,
2155 			    esp_mp, auth_offset, auth_len);
2156 
2157 			/* call the crypto framework */
2158 			kef_rc = crypto_mac_verify(&assoc->ipsa_amech,
2159 			    &ii->ipsec_in_crypto_data,
2160 			    &assoc->ipsa_kcfauthkey, auth_ctx_tmpl,
2161 			    &ii->ipsec_in_crypto_mac, &call_req);
2162 		}
2163 	}
2164 
2165 	if (do_encr) {
2166 		/* force asynchronous processing? */
2167 		if (ipss->ipsec_algs_exec_mode[IPSEC_ALG_ENCR] ==
2168 		    IPSEC_ALGS_EXEC_ASYNC)
2169 			call_req.cr_flag |= CRYPTO_ALWAYS_QUEUE;
2170 
2171 		/* encryption template */
2172 		IPSEC_CTX_TMPL(assoc, ipsa_encrtmpl, IPSEC_ALG_ENCR,
2173 		    encr_ctx_tmpl);
2174 
2175 		/* skip IV, since it is passed separately */
2176 		encr_offset = esph_offset + sizeof (esph_t) + iv_len;
2177 		encr_len = msg_len - encr_offset;
2178 
2179 		if (!do_auth) {
2180 			/* decryption only */
2181 			/* initialize input data argument */
2182 			ESP_INIT_CRYPTO_DATA(&ii->ipsec_in_crypto_data,
2183 			    esp_mp, encr_offset, encr_len);
2184 
2185 			/* specify IV */
2186 			ii->ipsec_in_crypto_data.cd_miscdata =
2187 			    (char *)esp_mp->b_rptr + sizeof (esph_t) +
2188 			    esph_offset;
2189 
2190 			/* call the crypto framework */
2191 			kef_rc = crypto_decrypt(&assoc->ipsa_emech,
2192 			    &ii->ipsec_in_crypto_data,
2193 			    &assoc->ipsa_kcfencrkey, encr_ctx_tmpl,
2194 			    NULL, &call_req);
2195 		}
2196 	}
2197 
2198 	if (do_auth && do_encr) {
2199 		/* dual operation */
2200 		/* initialize input data argument */
2201 		ESP_INIT_CRYPTO_DUAL_DATA(&ii->ipsec_in_crypto_dual_data,
2202 		    esp_mp, auth_offset, auth_len,
2203 		    encr_offset, encr_len - icv_len);
2204 
2205 		/* specify IV */
2206 		ii->ipsec_in_crypto_dual_data.dd_miscdata =
2207 		    (char *)esp_mp->b_rptr + sizeof (esph_t) + esph_offset;
2208 
2209 		/* call the framework */
2210 		kef_rc = crypto_mac_verify_decrypt(&assoc->ipsa_amech,
2211 		    &assoc->ipsa_emech, &ii->ipsec_in_crypto_dual_data,
2212 		    &assoc->ipsa_kcfauthkey, &assoc->ipsa_kcfencrkey,
2213 		    auth_ctx_tmpl, encr_ctx_tmpl, &ii->ipsec_in_crypto_mac,
2214 		    NULL, &call_req);
2215 	}
2216 
2217 	switch (kef_rc) {
2218 	case CRYPTO_SUCCESS:
2219 		ESP_BUMP_STAT(espstack, crypto_sync);
2220 		return (esp_in_done(ipsec_mp));
2221 	case CRYPTO_QUEUED:
2222 		/* esp_kcf_callback() will be invoked on completion */
2223 		ESP_BUMP_STAT(espstack, crypto_async);
2224 		return (IPSEC_STATUS_PENDING);
2225 	case CRYPTO_INVALID_MAC:
2226 		ESP_BUMP_STAT(espstack, crypto_sync);
2227 		esp_log_bad_auth(ipsec_mp);
2228 		return (IPSEC_STATUS_FAILED);
2229 	}
2230 
2231 	esp_crypto_failed(ipsec_mp, B_TRUE, kef_rc, espstack);
2232 	return (IPSEC_STATUS_FAILED);
2233 }
2234 
2235 /*
2236  * Compute the IP and UDP checksums -- common code for both keepalives and
2237  * actual ESP-in-UDP packets.  Be flexible with multiple mblks because ESP
2238  * uses mblk-insertion to insert the UDP header.
2239  * TODO - If there is an easy way to prep a packet for HW checksums, make
2240  * it happen here.
2241  */
2242 static void
2243 esp_prepare_udp(netstack_t *ns, mblk_t *mp, ipha_t *ipha)
2244 {
2245 	int offset;
2246 	uint32_t cksum;
2247 	uint16_t *arr;
2248 	mblk_t *udpmp = mp;
2249 	uint_t hlen = IPH_HDR_LENGTH(ipha);
2250 
2251 	ASSERT(MBLKL(mp) >= sizeof (ipha_t));
2252 
2253 	ipha->ipha_hdr_checksum = 0;
2254 	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2255 
2256 	if (ns->netstack_udp->us_do_checksum) {
2257 		ASSERT(MBLKL(udpmp) >= sizeof (udpha_t));
2258 		/* arr points to the IP header. */
2259 		arr = (uint16_t *)ipha;
2260 		IP_STAT(ns->netstack_ip, ip_out_sw_cksum);
2261 		IP_STAT_UPDATE(ns->netstack_ip, ip_udp_out_sw_cksum_bytes,
2262 		    ntohs(htons(ipha->ipha_length) - hlen));
2263 		/* arr[6-9] are the IP addresses. */
2264 		cksum = IP_UDP_CSUM_COMP + arr[6] + arr[7] + arr[8] + arr[9] +
2265 		    ntohs(htons(ipha->ipha_length) - hlen);
2266 		cksum = IP_CSUM(mp, hlen, cksum);
2267 		offset = hlen + UDP_CHECKSUM_OFFSET;
2268 		while (offset >= MBLKL(udpmp)) {
2269 			offset -= MBLKL(udpmp);
2270 			udpmp = udpmp->b_cont;
2271 		}
2272 		/* arr points to the UDP header's checksum field. */
2273 		arr = (uint16_t *)(udpmp->b_rptr + offset);
2274 		*arr = cksum;
2275 	}
2276 }
2277 
2278 /*
2279  * Send a one-byte UDP NAT-T keepalive.  Construct an IPSEC_OUT too that'll
2280  * get fed into esp_send_udp/ip_wput_ipsec_out.
2281  */
2282 void
2283 ipsecesp_send_keepalive(ipsa_t *assoc)
2284 {
2285 	mblk_t *mp = NULL, *ipsec_mp = NULL;
2286 	ipha_t *ipha;
2287 	udpha_t *udpha;
2288 	ipsec_out_t *io;
2289 
2290 	ASSERT(MUTEX_NOT_HELD(&assoc->ipsa_lock));
2291 
2292 	mp = allocb(sizeof (ipha_t) + sizeof (udpha_t) + 1, BPRI_HI);
2293 	if (mp == NULL)
2294 		return;
2295 	ipha = (ipha_t *)mp->b_rptr;
2296 	ipha->ipha_version_and_hdr_length = IP_SIMPLE_HDR_VERSION;
2297 	ipha->ipha_type_of_service = 0;
2298 	ipha->ipha_length = htons(sizeof (ipha_t) + sizeof (udpha_t) + 1);
2299 	/* Use the low-16 of the SPI so we have some clue where it came from. */
2300 	ipha->ipha_ident = *(((uint16_t *)(&assoc->ipsa_spi)) + 1);
2301 	ipha->ipha_fragment_offset_and_flags = 0;  /* Too small to fragment! */
2302 	ipha->ipha_ttl = 0xFF;
2303 	ipha->ipha_protocol = IPPROTO_UDP;
2304 	ipha->ipha_hdr_checksum = 0;
2305 	ipha->ipha_src = assoc->ipsa_srcaddr[0];
2306 	ipha->ipha_dst = assoc->ipsa_dstaddr[0];
2307 	udpha = (udpha_t *)(ipha + 1);
2308 	udpha->uha_src_port = (assoc->ipsa_local_nat_port != 0) ?
2309 	    assoc->ipsa_local_nat_port : htons(IPPORT_IKE_NATT);
2310 	udpha->uha_dst_port = (assoc->ipsa_remote_nat_port != 0) ?
2311 	    assoc->ipsa_remote_nat_port : htons(IPPORT_IKE_NATT);
2312 	udpha->uha_length = htons(sizeof (udpha_t) + 1);
2313 	udpha->uha_checksum = 0;
2314 	mp->b_wptr = (uint8_t *)(udpha + 1);
2315 	*(mp->b_wptr++) = 0xFF;
2316 
2317 	ipsec_mp = ipsec_alloc_ipsec_out(assoc->ipsa_netstack);
2318 	if (ipsec_mp == NULL) {
2319 		freeb(mp);
2320 		return;
2321 	}
2322 	ipsec_mp->b_cont = mp;
2323 	io = (ipsec_out_t *)ipsec_mp->b_rptr;
2324 	io->ipsec_out_zoneid =
2325 	    netstackid_to_zoneid(assoc->ipsa_netstack->netstack_stackid);
2326 
2327 	esp_prepare_udp(assoc->ipsa_netstack, mp, ipha);
2328 	ip_wput_ipsec_out(NULL, ipsec_mp, ipha, NULL, NULL);
2329 }
2330 
2331 static ipsec_status_t
2332 esp_submit_req_outbound(mblk_t *ipsec_mp, ipsa_t *assoc, uchar_t *icv_buf,
2333     uint_t payload_len)
2334 {
2335 	ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr;
2336 	uint_t auth_len;
2337 	crypto_call_req_t call_req;
2338 	mblk_t *esp_mp;
2339 	int kef_rc = CRYPTO_FAILED;
2340 	uint_t icv_len = assoc->ipsa_mac_len;
2341 	crypto_ctx_template_t auth_ctx_tmpl;
2342 	boolean_t do_auth;
2343 	boolean_t do_encr;
2344 	uint_t iv_len = assoc->ipsa_iv_len;
2345 	crypto_ctx_template_t encr_ctx_tmpl;
2346 	boolean_t is_natt = ((assoc->ipsa_flags & IPSA_F_NATT) != 0);
2347 	size_t esph_offset = (is_natt ? UDPH_SIZE : 0);
2348 	netstack_t	*ns = io->ipsec_out_ns;
2349 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
2350 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
2351 
2352 	esp3dbg(espstack, ("esp_submit_req_outbound:%s",
2353 	    is_natt ? "natt" : "not natt"));
2354 
2355 	ASSERT(io->ipsec_out_type == IPSEC_OUT);
2356 
2357 	/*
2358 	 * In case kEF queues and calls back, keep netstackid_t for
2359 	 * verification that the IP instance is still around in
2360 	 * esp_kcf_callback().
2361 	 */
2362 	io->ipsec_out_stackid = ns->netstack_stackid;
2363 
2364 	do_encr = assoc->ipsa_encr_alg != SADB_EALG_NULL;
2365 	do_auth = assoc->ipsa_auth_alg != SADB_AALG_NONE;
2366 
2367 	/*
2368 	 * Outbound IPsec packets are of the form:
2369 	 * IPSEC_OUT -> [IP,options] -> [ESP,IV] -> [data] -> [pad,ICV]
2370 	 * unless it's NATT, then it's
2371 	 * IPSEC_OUT -> [IP,options] -> [udp][ESP,IV] -> [data] -> [pad,ICV]
2372 	 * Get a pointer to the mblk containing the ESP header.
2373 	 */
2374 	ASSERT(ipsec_mp->b_cont != NULL && ipsec_mp->b_cont->b_cont != NULL);
2375 	esp_mp = ipsec_mp->b_cont->b_cont;
2376 
2377 	ESP_INIT_CALLREQ(&call_req);
2378 
2379 	if (do_auth) {
2380 		/* force asynchronous processing? */
2381 		if (ipss->ipsec_algs_exec_mode[IPSEC_ALG_AUTH] ==
2382 		    IPSEC_ALGS_EXEC_ASYNC)
2383 			call_req.cr_flag |= CRYPTO_ALWAYS_QUEUE;
2384 
2385 		/* authentication context template */
2386 		IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH,
2387 		    auth_ctx_tmpl);
2388 
2389 		/* where to store the computed mac */
2390 		ESP_INIT_CRYPTO_MAC(&io->ipsec_out_crypto_mac,
2391 		    icv_len, icv_buf);
2392 
2393 		/* authentication starts at the ESP header */
2394 		auth_len = payload_len + iv_len + sizeof (esph_t);
2395 		if (!do_encr) {
2396 			/* authentication only */
2397 			/* initialize input data argument */
2398 			ESP_INIT_CRYPTO_DATA(&io->ipsec_out_crypto_data,
2399 			    esp_mp, esph_offset, auth_len);
2400 
2401 			/* call the crypto framework */
2402 			kef_rc = crypto_mac(&assoc->ipsa_amech,
2403 			    &io->ipsec_out_crypto_data,
2404 			    &assoc->ipsa_kcfauthkey, auth_ctx_tmpl,
2405 			    &io->ipsec_out_crypto_mac, &call_req);
2406 		}
2407 	}
2408 
2409 	if (do_encr) {
2410 		/* force asynchronous processing? */
2411 		if (ipss->ipsec_algs_exec_mode[IPSEC_ALG_ENCR] ==
2412 		    IPSEC_ALGS_EXEC_ASYNC)
2413 			call_req.cr_flag |= CRYPTO_ALWAYS_QUEUE;
2414 
2415 		/* encryption context template */
2416 		IPSEC_CTX_TMPL(assoc, ipsa_encrtmpl, IPSEC_ALG_ENCR,
2417 		    encr_ctx_tmpl);
2418 
2419 		if (!do_auth) {
2420 			/* encryption only, skip mblk that contains ESP hdr */
2421 			/* initialize input data argument */
2422 			ESP_INIT_CRYPTO_DATA(&io->ipsec_out_crypto_data,
2423 			    esp_mp->b_cont, 0, payload_len);
2424 
2425 			/* specify IV */
2426 			io->ipsec_out_crypto_data.cd_miscdata =
2427 			    (char *)esp_mp->b_rptr + sizeof (esph_t) +
2428 			    esph_offset;
2429 
2430 			/* call the crypto framework */
2431 			kef_rc = crypto_encrypt(&assoc->ipsa_emech,
2432 			    &io->ipsec_out_crypto_data,
2433 			    &assoc->ipsa_kcfencrkey, encr_ctx_tmpl,
2434 			    NULL, &call_req);
2435 		}
2436 	}
2437 
2438 	if (do_auth && do_encr) {
2439 		/*
2440 		 * Encryption and authentication:
2441 		 * Pass the pointer to the mblk chain starting at the ESP
2442 		 * header to the framework. Skip the ESP header mblk
2443 		 * for encryption, which is reflected by an encryption
2444 		 * offset equal to the length of that mblk. Start
2445 		 * the authentication at the ESP header, i.e. use an
2446 		 * authentication offset of zero.
2447 		 */
2448 		ESP_INIT_CRYPTO_DUAL_DATA(&io->ipsec_out_crypto_dual_data,
2449 		    esp_mp, MBLKL(esp_mp), payload_len, esph_offset, auth_len);
2450 
2451 		/* specify IV */
2452 		io->ipsec_out_crypto_dual_data.dd_miscdata =
2453 		    (char *)esp_mp->b_rptr + sizeof (esph_t) + esph_offset;
2454 
2455 		/* call the framework */
2456 		kef_rc = crypto_encrypt_mac(&assoc->ipsa_emech,
2457 		    &assoc->ipsa_amech, NULL,
2458 		    &assoc->ipsa_kcfencrkey, &assoc->ipsa_kcfauthkey,
2459 		    encr_ctx_tmpl, auth_ctx_tmpl,
2460 		    &io->ipsec_out_crypto_dual_data,
2461 		    &io->ipsec_out_crypto_mac, &call_req);
2462 	}
2463 
2464 	switch (kef_rc) {
2465 	case CRYPTO_SUCCESS:
2466 		ESP_BUMP_STAT(espstack, crypto_sync);
2467 		esp_set_usetime(assoc, B_FALSE);
2468 		if (is_natt)
2469 			esp_prepare_udp(ns, ipsec_mp->b_cont,
2470 			    (ipha_t *)ipsec_mp->b_cont->b_rptr);
2471 		return (IPSEC_STATUS_SUCCESS);
2472 	case CRYPTO_QUEUED:
2473 		/* esp_kcf_callback() will be invoked on completion */
2474 		ESP_BUMP_STAT(espstack, crypto_async);
2475 		return (IPSEC_STATUS_PENDING);
2476 	}
2477 
2478 	esp_crypto_failed(ipsec_mp, B_TRUE, kef_rc, espstack);
2479 	return (IPSEC_STATUS_FAILED);
2480 }
2481 
2482 /*
2483  * Handle outbound IPsec processing for IPv4 and IPv6
2484  * On success returns B_TRUE, on failure returns B_FALSE and frees the
2485  * mblk chain ipsec_in_mp.
2486  */
2487 static ipsec_status_t
2488 esp_outbound(mblk_t *mp)
2489 {
2490 	mblk_t *ipsec_out_mp, *data_mp, *espmp, *tailmp;
2491 	ipsec_out_t *io;
2492 	ipha_t *ipha;
2493 	ip6_t *ip6h;
2494 	esph_t *esph;
2495 	uint_t af;
2496 	uint8_t *nhp;
2497 	uintptr_t divpoint, datalen, adj, padlen, i, alloclen;
2498 	uintptr_t esplen = sizeof (esph_t);
2499 	uint8_t protocol;
2500 	ipsa_t *assoc;
2501 	uint_t iv_len, mac_len = 0;
2502 	uchar_t *icv_buf;
2503 	udpha_t *udpha;
2504 	boolean_t is_natt = B_FALSE;
2505 	netstack_t	*ns;
2506 	ipsecesp_stack_t *espstack;
2507 	ipsec_stack_t	*ipss;
2508 
2509 	ipsec_out_mp = mp;
2510 	data_mp = ipsec_out_mp->b_cont;
2511 
2512 	io = (ipsec_out_t *)ipsec_out_mp->b_rptr;
2513 	ns = io->ipsec_out_ns;
2514 	espstack = ns->netstack_ipsecesp;
2515 	ipss = ns->netstack_ipsec;
2516 
2517 	ESP_BUMP_STAT(espstack, out_requests);
2518 
2519 	/*
2520 	 * <sigh> We have to copy the message here, because TCP (for example)
2521 	 * keeps a dupb() of the message lying around for retransmission.
2522 	 * Since ESP changes the whole of the datagram, we have to create our
2523 	 * own copy lest we clobber TCP's data.  Since we have to copy anyway,
2524 	 * we might as well make use of msgpullup() and get the mblk into one
2525 	 * contiguous piece!
2526 	 */
2527 	ipsec_out_mp->b_cont = msgpullup(data_mp, -1);
2528 	if (ipsec_out_mp->b_cont == NULL) {
2529 		esp0dbg(("esp_outbound: msgpullup() failed, "
2530 		    "dropping packet.\n"));
2531 		ipsec_out_mp->b_cont = data_mp;
2532 		/*
2533 		 * TODO:  Find the outbound IRE for this packet and
2534 		 * pass it to ip_drop_packet().
2535 		 */
2536 		ip_drop_packet(ipsec_out_mp, B_FALSE, NULL, NULL,
2537 		    DROPPER(ipss, ipds_esp_nomem),
2538 		    &espstack->esp_dropper);
2539 		return (IPSEC_STATUS_FAILED);
2540 	} else {
2541 		freemsg(data_mp);
2542 		data_mp = ipsec_out_mp->b_cont;
2543 	}
2544 
2545 	/*
2546 	 * Reality check....
2547 	 */
2548 
2549 	ipha = (ipha_t *)data_mp->b_rptr;  /* So we can call esp_acquire(). */
2550 
2551 	if (io->ipsec_out_v4) {
2552 		af = AF_INET;
2553 		divpoint = IPH_HDR_LENGTH(ipha);
2554 		datalen = ntohs(ipha->ipha_length) - divpoint;
2555 		nhp = (uint8_t *)&ipha->ipha_protocol;
2556 	} else {
2557 		ip6_pkt_t ipp;
2558 
2559 		af = AF_INET6;
2560 		ip6h = (ip6_t *)ipha;
2561 		bzero(&ipp, sizeof (ipp));
2562 		divpoint = ip_find_hdr_v6(data_mp, ip6h, &ipp, NULL);
2563 		if (ipp.ipp_dstopts != NULL &&
2564 		    ipp.ipp_dstopts->ip6d_nxt != IPPROTO_ROUTING) {
2565 			/*
2566 			 * Destination options are tricky.  If we get in here,
2567 			 * then we have a terminal header following the
2568 			 * destination options.  We need to adjust backwards
2569 			 * so we insert ESP BEFORE the destination options
2570 			 * bag.  (So that the dstopts get encrypted!)
2571 			 *
2572 			 * Since this is for outbound packets only, we know
2573 			 * that non-terminal destination options only precede
2574 			 * routing headers.
2575 			 */
2576 			divpoint -= ipp.ipp_dstoptslen;
2577 		}
2578 		datalen = ntohs(ip6h->ip6_plen) + sizeof (ip6_t) - divpoint;
2579 
2580 		if (ipp.ipp_rthdr != NULL) {
2581 			nhp = &ipp.ipp_rthdr->ip6r_nxt;
2582 		} else if (ipp.ipp_hopopts != NULL) {
2583 			nhp = &ipp.ipp_hopopts->ip6h_nxt;
2584 		} else {
2585 			ASSERT(divpoint == sizeof (ip6_t));
2586 			/* It's probably IP + ESP. */
2587 			nhp = &ip6h->ip6_nxt;
2588 		}
2589 	}
2590 	assoc = io->ipsec_out_esp_sa;
2591 	ASSERT(assoc != NULL);
2592 
2593 	if (assoc->ipsa_auth_alg != SADB_AALG_NONE)
2594 		mac_len = assoc->ipsa_mac_len;
2595 
2596 	if (assoc->ipsa_flags & IPSA_F_NATT) {
2597 		/* wedge in fake UDP */
2598 		is_natt = B_TRUE;
2599 		esplen += UDPH_SIZE;
2600 	}
2601 
2602 	/*
2603 	 * Set up ESP header and encryption padding for ENCR PI request.
2604 	 */
2605 
2606 	/* Determine the padding length.  Pad to 4-bytes for no-encryption. */
2607 	if (assoc->ipsa_encr_alg != SADB_EALG_NULL) {
2608 		iv_len = assoc->ipsa_iv_len;
2609 
2610 		/*
2611 		 * Include the two additional bytes (hence the - 2) for the
2612 		 * padding length and the next header.  Take this into account
2613 		 * when calculating the actual length of the padding.
2614 		 */
2615 		ASSERT(ISP2(iv_len));
2616 		padlen = ((unsigned)(iv_len - datalen - 2)) & (iv_len - 1);
2617 	} else {
2618 		iv_len = 0;
2619 		padlen = ((unsigned)(sizeof (uint32_t) - datalen - 2)) &
2620 		    (sizeof (uint32_t) - 1);
2621 	}
2622 
2623 	/* Allocate ESP header and IV. */
2624 	esplen += iv_len;
2625 
2626 	/*
2627 	 * Update association byte-count lifetimes.  Don't forget to take
2628 	 * into account the padding length and next-header (hence the + 2).
2629 	 *
2630 	 * Use the amount of data fed into the "encryption algorithm".  This
2631 	 * is the IV, the data length, the padding length, and the final two
2632 	 * bytes (padlen, and next-header).
2633 	 *
2634 	 */
2635 
2636 	if (!esp_age_bytes(assoc, datalen + padlen + iv_len + 2, B_FALSE)) {
2637 		/*
2638 		 * TODO:  Find the outbound IRE for this packet and
2639 		 * pass it to ip_drop_packet().
2640 		 */
2641 		ip_drop_packet(mp, B_FALSE, NULL, NULL,
2642 		    DROPPER(ipss, ipds_esp_bytes_expire),
2643 		    &espstack->esp_dropper);
2644 		return (IPSEC_STATUS_FAILED);
2645 	}
2646 
2647 	espmp = allocb(esplen, BPRI_HI);
2648 	if (espmp == NULL) {
2649 		ESP_BUMP_STAT(espstack, out_discards);
2650 		esp1dbg(espstack, ("esp_outbound: can't allocate espmp.\n"));
2651 		/*
2652 		 * TODO:  Find the outbound IRE for this packet and
2653 		 * pass it to ip_drop_packet().
2654 		 */
2655 		ip_drop_packet(mp, B_FALSE, NULL, NULL,
2656 		    DROPPER(ipss, ipds_esp_nomem),
2657 		    &espstack->esp_dropper);
2658 		return (IPSEC_STATUS_FAILED);
2659 	}
2660 	espmp->b_wptr += esplen;
2661 	esph = (esph_t *)espmp->b_rptr;
2662 
2663 	if (is_natt) {
2664 		esp3dbg(espstack, ("esp_outbound: NATT"));
2665 
2666 		udpha = (udpha_t *)espmp->b_rptr;
2667 		udpha->uha_src_port = (assoc->ipsa_local_nat_port != 0) ?
2668 		    assoc->ipsa_local_nat_port : htons(IPPORT_IKE_NATT);
2669 		udpha->uha_dst_port = (assoc->ipsa_remote_nat_port != 0) ?
2670 		    assoc->ipsa_remote_nat_port : htons(IPPORT_IKE_NATT);
2671 		/*
2672 		 * Set the checksum to 0, so that the esp_prepare_udp() call
2673 		 * can do the right thing.
2674 		 */
2675 		udpha->uha_checksum = 0;
2676 		esph = (esph_t *)(udpha + 1);
2677 	}
2678 
2679 	esph->esph_spi = assoc->ipsa_spi;
2680 
2681 	esph->esph_replay = htonl(atomic_add_32_nv(&assoc->ipsa_replay, 1));
2682 	if (esph->esph_replay == 0 && assoc->ipsa_replay_wsize != 0) {
2683 		/*
2684 		 * XXX We have replay counter wrapping.
2685 		 * We probably want to nuke this SA (and its peer).
2686 		 */
2687 		ipsec_assocfailure(info.mi_idnum, 0, 0,
2688 		    SL_ERROR | SL_CONSOLE | SL_WARN,
2689 		    "Outbound ESP SA (0x%x, %s) has wrapped sequence.\n",
2690 		    esph->esph_spi, assoc->ipsa_dstaddr, af,
2691 		    espstack->ipsecesp_netstack);
2692 
2693 		ESP_BUMP_STAT(espstack, out_discards);
2694 		sadb_replay_delete(assoc);
2695 		/*
2696 		 * TODO:  Find the outbound IRE for this packet and
2697 		 * pass it to ip_drop_packet().
2698 		 */
2699 		ip_drop_packet(mp, B_FALSE, NULL, NULL,
2700 		    DROPPER(ipss, ipds_esp_replay),
2701 		    &espstack->esp_dropper);
2702 		return (IPSEC_STATUS_FAILED);
2703 	}
2704 
2705 	/*
2706 	 * Set the IV to a random quantity.  We do not require the
2707 	 * highest quality random bits, but for best security with CBC
2708 	 * mode ciphers, the value must be unlikely to repeat and also
2709 	 * must not be known in advance to an adversary capable of
2710 	 * influencing the plaintext.
2711 	 */
2712 	(void) random_get_pseudo_bytes((uint8_t *)(esph + 1), iv_len);
2713 
2714 	/* Fix the IP header. */
2715 	alloclen = padlen + 2 + mac_len;
2716 	adj = alloclen + (espmp->b_wptr - espmp->b_rptr);
2717 
2718 	protocol = *nhp;
2719 
2720 	if (io->ipsec_out_v4) {
2721 		ipha->ipha_length = htons(ntohs(ipha->ipha_length) + adj);
2722 		if (is_natt) {
2723 			*nhp = IPPROTO_UDP;
2724 			udpha->uha_length = htons(ntohs(ipha->ipha_length) -
2725 			    IPH_HDR_LENGTH(ipha));
2726 		} else {
2727 			*nhp = IPPROTO_ESP;
2728 		}
2729 		ipha->ipha_hdr_checksum = 0;
2730 		ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
2731 	} else {
2732 		ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) + adj);
2733 		*nhp = IPPROTO_ESP;
2734 	}
2735 
2736 	/* I've got the two ESP mblks, now insert them. */
2737 
2738 	esp2dbg(espstack, ("data_mp before outbound ESP adjustment:\n"));
2739 	esp2dbg(espstack, (dump_msg(data_mp)));
2740 
2741 	if (!esp_insert_esp(data_mp, espmp, divpoint, espstack)) {
2742 		ESP_BUMP_STAT(espstack, out_discards);
2743 		/* NOTE:  esp_insert_esp() only fails if there's no memory. */
2744 		/*
2745 		 * TODO:  Find the outbound IRE for this packet and
2746 		 * pass it to ip_drop_packet().
2747 		 */
2748 		ip_drop_packet(mp, B_FALSE, NULL, NULL,
2749 		    DROPPER(ipss, ipds_esp_nomem),
2750 		    &espstack->esp_dropper);
2751 		freeb(espmp);
2752 		return (IPSEC_STATUS_FAILED);
2753 	}
2754 
2755 	/* Append padding (and leave room for ICV). */
2756 	for (tailmp = data_mp; tailmp->b_cont != NULL; tailmp = tailmp->b_cont)
2757 		;
2758 	if (tailmp->b_wptr + alloclen > tailmp->b_datap->db_lim) {
2759 		tailmp->b_cont = allocb(alloclen, BPRI_HI);
2760 		if (tailmp->b_cont == NULL) {
2761 			ESP_BUMP_STAT(espstack, out_discards);
2762 			esp0dbg(("esp_outbound:  Can't allocate tailmp.\n"));
2763 			/*
2764 			 * TODO:  Find the outbound IRE for this packet and
2765 			 * pass it to ip_drop_packet().
2766 			 */
2767 			ip_drop_packet(mp, B_FALSE, NULL, NULL,
2768 			    DROPPER(ipss, ipds_esp_nomem),
2769 			    &espstack->esp_dropper);
2770 			return (IPSEC_STATUS_FAILED);
2771 		}
2772 		tailmp = tailmp->b_cont;
2773 	}
2774 
2775 	/*
2776 	 * If there's padding, N bytes of padding must be of the form 0x1,
2777 	 * 0x2, 0x3... 0xN.
2778 	 */
2779 	for (i = 0; i < padlen; ) {
2780 		i++;
2781 		*tailmp->b_wptr++ = i;
2782 	}
2783 	*tailmp->b_wptr++ = i;
2784 	*tailmp->b_wptr++ = protocol;
2785 
2786 	esp2dbg(espstack, ("data_Mp before encryption:\n"));
2787 	esp2dbg(espstack, (dump_msg(data_mp)));
2788 
2789 	/*
2790 	 * The packet is eligible for hardware acceleration if the
2791 	 * following conditions are satisfied:
2792 	 *
2793 	 * 1. the packet will not be fragmented
2794 	 * 2. the provider supports the algorithms specified by SA
2795 	 * 3. there is no pending control message being exchanged
2796 	 * 4. snoop is not attached
2797 	 * 5. the destination address is not a multicast address
2798 	 *
2799 	 * All five of these conditions are checked by IP prior to
2800 	 * sending the packet to ESP.
2801 	 *
2802 	 * But We, and We Alone, can, nay MUST check if the packet
2803 	 * is over NATT, and then disqualify it from hardware
2804 	 * acceleration.
2805 	 */
2806 
2807 	if (io->ipsec_out_is_capab_ill && !(assoc->ipsa_flags & IPSA_F_NATT)) {
2808 		return (esp_outbound_accelerated(ipsec_out_mp, mac_len));
2809 	}
2810 	ESP_BUMP_STAT(espstack, noaccel);
2811 
2812 	/*
2813 	 * Okay.  I've set up the pre-encryption ESP.  Let's do it!
2814 	 */
2815 
2816 	if (mac_len > 0) {
2817 		ASSERT(tailmp->b_wptr + mac_len <= tailmp->b_datap->db_lim);
2818 		icv_buf = tailmp->b_wptr;
2819 		tailmp->b_wptr += mac_len;
2820 	} else {
2821 		icv_buf = NULL;
2822 	}
2823 
2824 	return (esp_submit_req_outbound(ipsec_out_mp, assoc, icv_buf,
2825 	    datalen + padlen + 2));
2826 }
2827 
2828 /*
2829  * IP calls this to validate the ICMP errors that
2830  * we got from the network.
2831  */
2832 ipsec_status_t
2833 ipsecesp_icmp_error(mblk_t *ipsec_mp)
2834 {
2835 	ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr;
2836 	boolean_t is_inbound = (ii->ipsec_in_type == IPSEC_IN);
2837 	netstack_t	*ns;
2838 	ipsecesp_stack_t *espstack;
2839 	ipsec_stack_t	*ipss;
2840 
2841 	if (is_inbound) {
2842 		ns = ii->ipsec_in_ns;
2843 	} else {
2844 		ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr;
2845 
2846 		ns = io->ipsec_out_ns;
2847 	}
2848 	espstack = ns->netstack_ipsecesp;
2849 	ipss = ns->netstack_ipsec;
2850 
2851 	/*
2852 	 * Unless we get an entire packet back, this function is useless.
2853 	 * Why?
2854 	 *
2855 	 * 1.)	Partial packets are useless, because the "next header"
2856 	 *	is at the end of the decrypted ESP packet.  Without the
2857 	 *	whole packet, this is useless.
2858 	 *
2859 	 * 2.)	If we every use a stateful cipher, such as a stream or a
2860 	 *	one-time pad, we can't do anything.
2861 	 *
2862 	 * Since the chances of us getting an entire packet back are very
2863 	 * very small, we discard here.
2864 	 */
2865 	IP_ESP_BUMP_STAT(ipss, in_discards);
2866 	ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL,
2867 	    DROPPER(ipss, ipds_esp_icmp),
2868 	    &espstack->esp_dropper);
2869 	return (IPSEC_STATUS_FAILED);
2870 }
2871 
2872 /*
2873  * ESP module read put routine.
2874  */
2875 /* ARGSUSED */
2876 static void
2877 ipsecesp_rput(queue_t *q, mblk_t *mp)
2878 {
2879 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
2880 
2881 	ASSERT(mp->b_datap->db_type != M_CTL);	/* No more IRE_DB_REQ. */
2882 
2883 	switch (mp->b_datap->db_type) {
2884 	case M_PROTO:
2885 	case M_PCPROTO:
2886 		/* TPI message of some sort. */
2887 		switch (*((t_scalar_t *)mp->b_rptr)) {
2888 		case T_BIND_ACK:
2889 			esp3dbg(espstack,
2890 			    ("Thank you IP from ESP for T_BIND_ACK\n"));
2891 			break;
2892 		case T_ERROR_ACK:
2893 			cmn_err(CE_WARN,
2894 			    "ipsecesp:  ESP received T_ERROR_ACK from IP.");
2895 			/*
2896 			 * Make esp_sadb.s_ip_q NULL, and in the
2897 			 * future, perhaps try again.
2898 			 */
2899 			espstack->esp_sadb.s_ip_q = NULL;
2900 			break;
2901 		case T_OK_ACK:
2902 			/* Probably from a (rarely sent) T_UNBIND_REQ. */
2903 			break;
2904 		default:
2905 			esp0dbg(("Unknown M_{,PC}PROTO message.\n"));
2906 		}
2907 		freemsg(mp);
2908 		break;
2909 	default:
2910 		/* For now, passthru message. */
2911 		esp2dbg(espstack, ("ESP got unknown mblk type %d.\n",
2912 		    mp->b_datap->db_type));
2913 		putnext(q, mp);
2914 	}
2915 }
2916 
2917 /*
2918  * Construct an SADB_REGISTER message with the current algorithms.
2919  */
2920 static boolean_t
2921 esp_register_out(uint32_t sequence, uint32_t pid, uint_t serial,
2922     ipsecesp_stack_t *espstack)
2923 {
2924 	mblk_t *pfkey_msg_mp, *keysock_out_mp;
2925 	sadb_msg_t *samsg;
2926 	sadb_supported_t *sasupp_auth = NULL;
2927 	sadb_supported_t *sasupp_encr = NULL;
2928 	sadb_alg_t *saalg;
2929 	uint_t allocsize = sizeof (*samsg);
2930 	uint_t i, numalgs_snap;
2931 	int current_aalgs;
2932 	ipsec_alginfo_t **authalgs;
2933 	uint_t num_aalgs;
2934 	int current_ealgs;
2935 	ipsec_alginfo_t **encralgs;
2936 	uint_t num_ealgs;
2937 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
2938 
2939 	/* Allocate the KEYSOCK_OUT. */
2940 	keysock_out_mp = sadb_keysock_out(serial);
2941 	if (keysock_out_mp == NULL) {
2942 		esp0dbg(("esp_register_out: couldn't allocate mblk.\n"));
2943 		return (B_FALSE);
2944 	}
2945 
2946 	/*
2947 	 * Allocate the PF_KEY message that follows KEYSOCK_OUT.
2948 	 */
2949 
2950 	mutex_enter(&ipss->ipsec_alg_lock);
2951 
2952 	/*
2953 	 * Fill SADB_REGISTER message's algorithm descriptors.  Hold
2954 	 * down the lock while filling it.
2955 	 *
2956 	 * Return only valid algorithms, so the number of algorithms
2957 	 * to send up may be less than the number of algorithm entries
2958 	 * in the table.
2959 	 */
2960 	authalgs = ipss->ipsec_alglists[IPSEC_ALG_AUTH];
2961 	for (num_aalgs = 0, i = 0; i < IPSEC_MAX_ALGS; i++)
2962 		if (authalgs[i] != NULL && ALG_VALID(authalgs[i]))
2963 			num_aalgs++;
2964 
2965 	if (num_aalgs != 0) {
2966 		allocsize += (num_aalgs * sizeof (*saalg));
2967 		allocsize += sizeof (*sasupp_auth);
2968 	}
2969 	encralgs = ipss->ipsec_alglists[IPSEC_ALG_ENCR];
2970 	for (num_ealgs = 0, i = 0; i < IPSEC_MAX_ALGS; i++)
2971 		if (encralgs[i] != NULL && ALG_VALID(encralgs[i]))
2972 			num_ealgs++;
2973 
2974 	if (num_ealgs != 0) {
2975 		allocsize += (num_ealgs * sizeof (*saalg));
2976 		allocsize += sizeof (*sasupp_encr);
2977 	}
2978 	keysock_out_mp->b_cont = allocb(allocsize, BPRI_HI);
2979 	if (keysock_out_mp->b_cont == NULL) {
2980 		mutex_exit(&ipss->ipsec_alg_lock);
2981 		freemsg(keysock_out_mp);
2982 		return (B_FALSE);
2983 	}
2984 
2985 	pfkey_msg_mp = keysock_out_mp->b_cont;
2986 	pfkey_msg_mp->b_wptr += allocsize;
2987 	if (num_aalgs != 0) {
2988 		sasupp_auth = (sadb_supported_t *)
2989 		    (pfkey_msg_mp->b_rptr + sizeof (*samsg));
2990 		saalg = (sadb_alg_t *)(sasupp_auth + 1);
2991 
2992 		ASSERT(((ulong_t)saalg & 0x7) == 0);
2993 
2994 		numalgs_snap = 0;
2995 		for (i = 0;
2996 		    ((i < IPSEC_MAX_ALGS) && (numalgs_snap < num_aalgs));
2997 		    i++) {
2998 			if (authalgs[i] == NULL || !ALG_VALID(authalgs[i]))
2999 				continue;
3000 
3001 			saalg->sadb_alg_id = authalgs[i]->alg_id;
3002 			saalg->sadb_alg_ivlen = 0;
3003 			saalg->sadb_alg_minbits	= authalgs[i]->alg_ef_minbits;
3004 			saalg->sadb_alg_maxbits	= authalgs[i]->alg_ef_maxbits;
3005 			saalg->sadb_x_alg_defincr = authalgs[i]->alg_ef_default;
3006 			saalg->sadb_x_alg_increment =
3007 			    authalgs[i]->alg_increment;
3008 			numalgs_snap++;
3009 			saalg++;
3010 		}
3011 		ASSERT(numalgs_snap == num_aalgs);
3012 #ifdef DEBUG
3013 		/*
3014 		 * Reality check to make sure I snagged all of the
3015 		 * algorithms.
3016 		 */
3017 		for (; i < IPSEC_MAX_ALGS; i++) {
3018 			if (authalgs[i] != NULL && ALG_VALID(authalgs[i])) {
3019 				cmn_err(CE_PANIC, "esp_register_out()! "
3020 				    "Missed aalg #%d.\n", i);
3021 			}
3022 		}
3023 #endif /* DEBUG */
3024 	} else {
3025 		saalg = (sadb_alg_t *)(pfkey_msg_mp->b_rptr + sizeof (*samsg));
3026 	}
3027 
3028 	if (num_ealgs != 0) {
3029 		sasupp_encr = (sadb_supported_t *)saalg;
3030 		saalg = (sadb_alg_t *)(sasupp_encr + 1);
3031 
3032 		numalgs_snap = 0;
3033 		for (i = 0;
3034 		    ((i < IPSEC_MAX_ALGS) && (numalgs_snap < num_ealgs)); i++) {
3035 			if (encralgs[i] == NULL || !ALG_VALID(encralgs[i]))
3036 				continue;
3037 			saalg->sadb_alg_id = encralgs[i]->alg_id;
3038 			saalg->sadb_alg_ivlen = encralgs[i]->alg_datalen;
3039 			saalg->sadb_alg_minbits	= encralgs[i]->alg_ef_minbits;
3040 			saalg->sadb_alg_maxbits	= encralgs[i]->alg_ef_maxbits;
3041 			saalg->sadb_x_alg_defincr = encralgs[i]->alg_ef_default;
3042 			saalg->sadb_x_alg_increment =
3043 			    encralgs[i]->alg_increment;
3044 			numalgs_snap++;
3045 			saalg++;
3046 		}
3047 		ASSERT(numalgs_snap == num_ealgs);
3048 #ifdef DEBUG
3049 		/*
3050 		 * Reality check to make sure I snagged all of the
3051 		 * algorithms.
3052 		 */
3053 		for (; i < IPSEC_MAX_ALGS; i++) {
3054 			if (encralgs[i] != NULL && ALG_VALID(encralgs[i])) {
3055 				cmn_err(CE_PANIC, "esp_register_out()! "
3056 				    "Missed ealg #%d.\n", i);
3057 			}
3058 		}
3059 #endif /* DEBUG */
3060 	}
3061 
3062 	current_aalgs = num_aalgs;
3063 	current_ealgs = num_ealgs;
3064 
3065 	mutex_exit(&ipss->ipsec_alg_lock);
3066 
3067 	/* Now fill the rest of the SADB_REGISTER message. */
3068 
3069 	samsg = (sadb_msg_t *)pfkey_msg_mp->b_rptr;
3070 	samsg->sadb_msg_version = PF_KEY_V2;
3071 	samsg->sadb_msg_type = SADB_REGISTER;
3072 	samsg->sadb_msg_errno = 0;
3073 	samsg->sadb_msg_satype = SADB_SATYPE_ESP;
3074 	samsg->sadb_msg_len = SADB_8TO64(allocsize);
3075 	samsg->sadb_msg_reserved = 0;
3076 	/*
3077 	 * Assume caller has sufficient sequence/pid number info.  If it's one
3078 	 * from me over a new alg., I could give two hoots about sequence.
3079 	 */
3080 	samsg->sadb_msg_seq = sequence;
3081 	samsg->sadb_msg_pid = pid;
3082 
3083 	if (sasupp_auth != NULL) {
3084 		sasupp_auth->sadb_supported_len = SADB_8TO64(
3085 		    sizeof (*sasupp_auth) + sizeof (*saalg) * current_aalgs);
3086 		sasupp_auth->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH;
3087 		sasupp_auth->sadb_supported_reserved = 0;
3088 	}
3089 
3090 	if (sasupp_encr != NULL) {
3091 		sasupp_encr->sadb_supported_len = SADB_8TO64(
3092 		    sizeof (*sasupp_encr) + sizeof (*saalg) * current_ealgs);
3093 		sasupp_encr->sadb_supported_exttype =
3094 		    SADB_EXT_SUPPORTED_ENCRYPT;
3095 		sasupp_encr->sadb_supported_reserved = 0;
3096 	}
3097 
3098 	if (espstack->esp_pfkey_q != NULL)
3099 		putnext(espstack->esp_pfkey_q, keysock_out_mp);
3100 	else {
3101 		freemsg(keysock_out_mp);
3102 		return (B_FALSE);
3103 	}
3104 
3105 	return (B_TRUE);
3106 }
3107 
3108 /*
3109  * Invoked when the algorithm table changes. Causes SADB_REGISTER
3110  * messages continaining the current list of algorithms to be
3111  * sent up to the ESP listeners.
3112  */
3113 void
3114 ipsecesp_algs_changed(netstack_t *ns)
3115 {
3116 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
3117 
3118 	/*
3119 	 * Time to send a PF_KEY SADB_REGISTER message to ESP listeners
3120 	 * everywhere.  (The function itself checks for NULL esp_pfkey_q.)
3121 	 */
3122 	(void) esp_register_out(0, 0, 0, espstack);
3123 }
3124 
3125 /*
3126  * taskq_dispatch handler.
3127  */
3128 static void
3129 inbound_task(void *arg)
3130 {
3131 	esph_t *esph;
3132 	mblk_t *mp = (mblk_t *)arg;
3133 	ipsec_in_t *ii = (ipsec_in_t *)mp->b_rptr;
3134 	netstack_t		*ns = ii->ipsec_in_ns;
3135 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
3136 	int ipsec_rc;
3137 
3138 	esp2dbg(espstack, ("in ESP inbound_task"));
3139 	ASSERT(espstack != NULL);
3140 
3141 	esph = ipsec_inbound_esp_sa(mp, ns);
3142 	if (esph == NULL)
3143 		return;
3144 	ASSERT(ii->ipsec_in_esp_sa != NULL);
3145 	ipsec_rc = ii->ipsec_in_esp_sa->ipsa_input_func(mp, esph);
3146 	if (ipsec_rc != IPSEC_STATUS_SUCCESS)
3147 		return;
3148 	ip_fanout_proto_again(mp, NULL, NULL, NULL);
3149 }
3150 
/*
 * Now that weak-key passed, actually ADD the security association, and
 * send back a reply ADD message.
 *
 * mp         - the keysock message; handed on to sadb_common_add().
 * samsg      - PF_KEY base header of the request (type and sequence are
 *              examined here).
 * ksi        - keysock_in_t holding the parsed extension vector and the
 *              source/destination address classifications.
 * diagnostic - out: set to an SADB_X_DIAGNOSTIC_* value on the failures
 *              detected here.
 * espstack   - per-netstack ESP state.
 *
 * Returns EINVAL if no direction flag was given and the destination type
 * is not one of the handled classes, ESRCH if an SADB_UPDATE finds no
 * larval SA, and otherwise the result of sadb_common_add() (or a non-zero
 * value if dispatching the larval SA's queued packet failed).
 */
static int
esp_add_sa_finish(mblk_t *mp, sadb_msg_t *samsg, keysock_in_t *ksi,
    int *diagnostic, ipsecesp_stack_t *espstack)
{
	isaf_t *primary = NULL, *secondary, *inbound, *outbound;
	sadb_sa_t *assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SA];
	sadb_address_t *dstext =
	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
	struct sockaddr_in *dst;
	struct sockaddr_in6 *dst6;
	boolean_t is_ipv4, clone = B_FALSE, is_inbound = B_FALSE;
	uint32_t *dstaddr;
	ipsa_t *larval = NULL;
	ipsacq_t *acqrec;
	iacqf_t *acq_bucket;
	mblk_t *acq_msgs = NULL;
	int rc;
	sadb_t *sp;
	int outhash;
	mblk_t *lpkt;
	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;

	/*
	 * Locate the appropriate table(s).
	 */

	/* The sockaddr immediately follows the address extension header. */
	dst = (struct sockaddr_in *)(dstext + 1);
	dst6 = (struct sockaddr_in6 *)dst;
	is_ipv4 = (dst->sin_family == AF_INET);
	if (is_ipv4) {
		sp = &espstack->esp_sadb.s_v4;
		dstaddr = (uint32_t *)(&dst->sin_addr);
		outhash = OUTBOUND_HASH_V4(sp, *(ipaddr_t *)dstaddr);
	} else {
		sp = &espstack->esp_sadb.s_v6;
		dstaddr = (uint32_t *)(&dst6->sin6_addr);
		outhash = OUTBOUND_HASH_V6(sp, *(in6_addr_t *)dstaddr);
	}

	/* Inbound bucket is keyed by SPI, outbound by destination hash. */
	inbound = INBOUND_BUCKET(sp, assoc->sadb_sa_spi);
	outbound = &sp->sdb_of[outhash];

	/*
	 * Use the direction flags provided by the KMD to determine
	 * if the inbound or outbound table should be the primary
	 * for this SA. If these flags were absent then make this
	 * decision based on the addresses.
	 */
	if (assoc->sadb_sa_flags & IPSA_F_INBOUND) {
		primary = inbound;
		secondary = outbound;
		is_inbound = B_TRUE;
		if (assoc->sadb_sa_flags & IPSA_F_OUTBOUND)
			clone = B_TRUE;
	} else {
		if (assoc->sadb_sa_flags & IPSA_F_OUTBOUND) {
			primary = outbound;
			secondary = inbound;
		}
	}

	if (primary == NULL) {
		/*
		 * The KMD did not set a direction flag, determine which
		 * table to insert the SA into based on addresses.
		 */
		switch (ksi->ks_in_dsttype) {
		case KS_IN_ADDR_MBCAST:
			clone = B_TRUE;	/* All mcast SAs can be bidirectional */
			assoc->sadb_sa_flags |= IPSA_F_OUTBOUND;
			/* FALLTHRU */
		/*
		 * If the source address is either one of mine, or unspecified
		 * (which is best summed up by saying "not 'not mine'"),
		 * then the association is potentially bi-directional,
		 * in that it can be used for inbound traffic and outbound
		 * traffic.  The best example of such an SA is a multicast
		 * SA (which allows me to receive the outbound traffic).
		 */
		case KS_IN_ADDR_ME:
			assoc->sadb_sa_flags |= IPSA_F_INBOUND;
			primary = inbound;
			secondary = outbound;
			if (ksi->ks_in_srctype != KS_IN_ADDR_NOTME)
				clone = B_TRUE;
			is_inbound = B_TRUE;
			break;
		/*
		 * If the source address literally not mine (either
		 * unspecified or not mine), then this SA may have an
		 * address that WILL be mine after some configuration.
		 * We pay the price for this by making it a bi-directional
		 * SA.
		 */
		case KS_IN_ADDR_NOTME:
			assoc->sadb_sa_flags |= IPSA_F_OUTBOUND;
			primary = outbound;
			secondary = inbound;
			if (ksi->ks_in_srctype != KS_IN_ADDR_ME) {
				assoc->sadb_sa_flags |= IPSA_F_INBOUND;
				clone = B_TRUE;
			}
			break;
		default:
			*diagnostic = SADB_X_DIAGNOSTIC_BAD_DST;
			return (EINVAL);
		}
	}

	/*
	 * Find a ACQUIRE list entry if possible.  If we've added an SA that
	 * suits the needs of an ACQUIRE list entry, we can eliminate the
	 * ACQUIRE list entry and transmit the enqueued packets.  Use the
	 * high-bit of the sequence number to queue it.  Key off destination
	 * addr, and change acqrec's state.
	 */

	if (samsg->sadb_msg_seq & IACQF_LOWEST_SEQ) {
		acq_bucket = &sp->sdb_acq[outhash];
		mutex_enter(&acq_bucket->iacqf_lock);
		/*
		 * Each record's lock is taken while it is examined; on a
		 * match the loop exits with that record's lock still held.
		 */
		for (acqrec = acq_bucket->iacqf_ipsacq; acqrec != NULL;
		    acqrec = acqrec->ipsacq_next) {
			mutex_enter(&acqrec->ipsacq_lock);
			/*
			 * Q:  I only check sequence.  Should I check dst?
			 * A: Yes, check dest because those are the packets
			 *    that are queued up.
			 */
			if (acqrec->ipsacq_seq == samsg->sadb_msg_seq &&
			    IPSA_ARE_ADDR_EQUAL(dstaddr,
			    acqrec->ipsacq_dstaddr, acqrec->ipsacq_addrfam))
				break;
			mutex_exit(&acqrec->ipsacq_lock);
		}
		if (acqrec != NULL) {
			/*
			 * AHA!  I found an ACQUIRE record for this SA.
			 * Grab the msg list, and free the acquire record.
			 * I already am holding the lock for this record,
			 * so all I have to do is free it.
			 */
			acq_msgs = acqrec->ipsacq_mp;
			acqrec->ipsacq_mp = NULL;
			mutex_exit(&acqrec->ipsacq_lock);
			sadb_destroy_acquire(acqrec,
			    espstack->ipsecesp_netstack);
		}
		mutex_exit(&acq_bucket->iacqf_lock);
	}

	/*
	 * Find PF_KEY message, and see if I'm an update.  If so, find entry
	 * in larval list (if there).
	 */

	if (samsg->sadb_msg_type == SADB_UPDATE) {
		mutex_enter(&inbound->isaf_lock);
		larval = ipsec_getassocbyspi(inbound, assoc->sadb_sa_spi,
		    ALL_ZEROES_PTR, dstaddr, dst->sin_family);
		mutex_exit(&inbound->isaf_lock);

		if ((larval == NULL) ||
		    (larval->ipsa_state != IPSA_STATE_LARVAL)) {
			*diagnostic = SADB_X_DIAGNOSTIC_SA_NOTFOUND;
			if (larval != NULL) {
				IPSA_REFRELE(larval);
			}
			esp0dbg(("Larval update, but larval disappeared.\n"));
			/*
			 * NOTE(review): any packets already moved onto
			 * acq_msgs above are not sent or dropped on this
			 * return path -- confirm whether that can happen
			 * for an SADB_UPDATE.
			 */
			return (ESRCH);
		} /* Else sadb_common_add unlinks it for me! */
	}

	/* Take the packet (if any) that was parked on the larval SA. */
	lpkt = NULL;
	if (larval != NULL)
		lpkt = sadb_clear_lpkt(larval);

	rc = sadb_common_add(espstack->esp_sadb.s_ip_q, espstack->esp_pfkey_q,
	    mp, samsg, ksi, primary, secondary, larval, clone, is_inbound,
	    diagnostic, espstack->ipsecesp_netstack, &espstack->esp_sadb);

	/*
	 * The SA is in place; process the parked inbound packet on the
	 * taskq.  A failed dispatch makes rc non-zero (1, not an errno
	 * constant), and that value is what the caller receives.
	 */
	if (rc == 0 && lpkt != NULL) {
		rc = !taskq_dispatch(esp_taskq, inbound_task,
		    (void *) lpkt, TQ_NOSLEEP);
	}

	/*
	 * NOTE(review): lpkt may be NULL here (e.g. sadb_common_add()
	 * failed and there was no larval SA); this presumes
	 * ip_drop_packet() tolerates a NULL mblk -- confirm.
	 */
	if (rc != 0) {
		ip_drop_packet(lpkt, B_TRUE, NULL, NULL,
		    DROPPER(ipss, ipds_sadb_inlarval_timeout),
		    &espstack->esp_dropper);
	}

	/*
	 * How much more stack will I create with all of these
	 * esp_outbound() calls?
	 */

	/*
	 * Drain the packets that were waiting on the ACQUIRE record: send
	 * each one if the add succeeded and an outbound SA can be found,
	 * otherwise count and drop it.
	 */
	while (acq_msgs != NULL) {
		/* Note: this local shadows the function's mp parameter. */
		mblk_t *mp = acq_msgs;

		acq_msgs = acq_msgs->b_next;
		mp->b_next = NULL;
		if (rc == 0) {
			if (ipsec_outbound_sa(mp, IPPROTO_ESP)) {
				((ipsec_out_t *)(mp->b_rptr))->
				    ipsec_out_esp_done = B_TRUE;
				if (esp_outbound(mp) == IPSEC_STATUS_SUCCESS) {
					ipha_t *ipha;

					/* do AH processing if needed */
					if (!esp_do_outbound_ah(mp))
						continue;

					ipha = (ipha_t *)mp->b_cont->b_rptr;

					/* finish IPsec processing */
					if (is_ipv4) {
						ip_wput_ipsec_out(NULL, mp,
						    ipha, NULL, NULL);
					} else {
						ip6_t *ip6h = (ip6_t *)ipha;
						ip_wput_ipsec_out_v6(NULL,
						    mp, ip6h, NULL, NULL);
					}
				}
				/*
				 * Whatever esp_outbound() returned, the
				 * packet is not touched again here.
				 */
				continue;
			}
		}
		ESP_BUMP_STAT(espstack, out_discards);
		ip_drop_packet(mp, B_FALSE, NULL, NULL,
		    DROPPER(ipss, ipds_sadb_acquire_timeout),
		    &espstack->esp_dropper);
	}

	return (rc);
}
3390 
3391 /*
3392  * Add new ESP security association.  This may become a generic AH/ESP
3393  * routine eventually.
3394  */
3395 static int
3396 esp_add_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic, netstack_t *ns)
3397 {
3398 	sadb_sa_t *assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SA];
3399 	sadb_address_t *srcext =
3400 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_SRC];
3401 	sadb_address_t *dstext =
3402 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3403 	sadb_address_t *isrcext =
3404 	    (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_INNER_SRC];
3405 	sadb_address_t *idstext =
3406 	    (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_INNER_DST];
3407 	sadb_address_t *nttext_loc =
3408 	    (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_NATT_LOC];
3409 	sadb_address_t *nttext_rem =
3410 	    (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_NATT_REM];
3411 	sadb_key_t *akey = (sadb_key_t *)ksi->ks_in_extv[SADB_EXT_KEY_AUTH];
3412 	sadb_key_t *ekey = (sadb_key_t *)ksi->ks_in_extv[SADB_EXT_KEY_ENCRYPT];
3413 	struct sockaddr_in *src, *dst;
3414 	struct sockaddr_in *natt_loc, *natt_rem;
3415 	struct sockaddr_in6 *natt_loc6, *natt_rem6;
3416 	sadb_lifetime_t *soft =
3417 	    (sadb_lifetime_t *)ksi->ks_in_extv[SADB_EXT_LIFETIME_SOFT];
3418 	sadb_lifetime_t *hard =
3419 	    (sadb_lifetime_t *)ksi->ks_in_extv[SADB_EXT_LIFETIME_HARD];
3420 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
3421 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
3422 
3423 	/* I need certain extensions present for an ADD message. */
3424 	if (srcext == NULL) {
3425 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_SRC;
3426 		return (EINVAL);
3427 	}
3428 	if (dstext == NULL) {
3429 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_DST;
3430 		return (EINVAL);
3431 	}
3432 	if (isrcext == NULL && idstext != NULL) {
3433 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_INNER_SRC;
3434 		return (EINVAL);
3435 	}
3436 	if (isrcext != NULL && idstext == NULL) {
3437 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_INNER_DST;
3438 		return (EINVAL);
3439 	}
3440 	if (assoc == NULL) {
3441 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_SA;
3442 		return (EINVAL);
3443 	}
3444 	if (ekey == NULL && assoc->sadb_sa_encrypt != SADB_EALG_NULL) {
3445 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_EKEY;
3446 		return (EINVAL);
3447 	}
3448 
3449 	src = (struct sockaddr_in *)(srcext + 1);
3450 	dst = (struct sockaddr_in *)(dstext + 1);
3451 	natt_loc = (struct sockaddr_in *)(nttext_loc + 1);
3452 	natt_loc6 = (struct sockaddr_in6 *)(nttext_loc + 1);
3453 	natt_rem = (struct sockaddr_in *)(nttext_rem + 1);
3454 	natt_rem6 = (struct sockaddr_in6 *)(nttext_rem + 1);
3455 
3456 	/* Sundry ADD-specific reality checks. */
3457 	/* XXX STATS :  Logging/stats here? */
3458 	if (assoc->sadb_sa_state != SADB_SASTATE_MATURE) {
3459 		*diagnostic = SADB_X_DIAGNOSTIC_BAD_SASTATE;
3460 		return (EINVAL);
3461 	}
3462 	if (assoc->sadb_sa_encrypt == SADB_EALG_NONE) {
3463 		*diagnostic = SADB_X_DIAGNOSTIC_BAD_EALG;
3464 		return (EINVAL);
3465 	}
3466 
3467 	if (assoc->sadb_sa_encrypt == SADB_EALG_NULL &&
3468 	    assoc->sadb_sa_auth == SADB_AALG_NONE) {
3469 		*diagnostic = SADB_X_DIAGNOSTIC_BAD_AALG;
3470 		return (EINVAL);
3471 	}
3472 
3473 	if (assoc->sadb_sa_flags & ~espstack->esp_sadb.s_addflags) {
3474 		*diagnostic = SADB_X_DIAGNOSTIC_BAD_SAFLAGS;
3475 		return (EINVAL);
3476 	}
3477 
3478 	if ((*diagnostic = sadb_hardsoftchk(hard, soft)) != 0) {
3479 		return (EINVAL);
3480 	}
3481 	ASSERT(src->sin_family == dst->sin_family);
3482 
3483 	if (assoc->sadb_sa_flags & SADB_X_SAFLAGS_NATT_LOC) {
3484 		if (nttext_loc == NULL) {
3485 			*diagnostic = SADB_X_DIAGNOSTIC_MISSING_NATT_LOC;
3486 			return (EINVAL);
3487 		}
3488 
3489 		if (natt_loc->sin_family == AF_INET6 &&
3490 		    !IN6_IS_ADDR_V4MAPPED(&natt_loc6->sin6_addr)) {
3491 			*diagnostic = SADB_X_DIAGNOSTIC_MALFORMED_NATT_LOC;
3492 			return (EINVAL);
3493 		}
3494 	}
3495 
3496 	if (assoc->sadb_sa_flags & SADB_X_SAFLAGS_NATT_REM) {
3497 		if (nttext_rem == NULL) {
3498 			*diagnostic = SADB_X_DIAGNOSTIC_MISSING_NATT_REM;
3499 			return (EINVAL);
3500 		}
3501 		if (natt_rem->sin_family == AF_INET6 &&
3502 		    !IN6_IS_ADDR_V4MAPPED(&natt_rem6->sin6_addr)) {
3503 			*diagnostic = SADB_X_DIAGNOSTIC_MALFORMED_NATT_REM;
3504 			return (EINVAL);
3505 		}
3506 	}
3507 
3508 
3509 	/* Stuff I don't support, for now.  XXX Diagnostic? */
3510 	if (ksi->ks_in_extv[SADB_EXT_LIFETIME_CURRENT] != NULL ||
3511 	    ksi->ks_in_extv[SADB_EXT_SENSITIVITY] != NULL)
3512 		return (EOPNOTSUPP);
3513 
3514 	/*
3515 	 * XXX Policy :  I'm not checking identities or sensitivity
3516 	 * labels at this time, but if I did, I'd do them here, before I sent
3517 	 * the weak key check up to the algorithm.
3518 	 */
3519 
3520 	mutex_enter(&ipss->ipsec_alg_lock);
3521 
3522 	/*
3523 	 * First locate the authentication algorithm.
3524 	 */
3525 	if (akey != NULL) {
3526 		ipsec_alginfo_t *aalg;
3527 
3528 		aalg = ipss->ipsec_alglists[IPSEC_ALG_AUTH]
3529 		    [assoc->sadb_sa_auth];
3530 		if (aalg == NULL || !ALG_VALID(aalg)) {
3531 			mutex_exit(&ipss->ipsec_alg_lock);
3532 			esp1dbg(espstack, ("Couldn't find auth alg #%d.\n",
3533 			    assoc->sadb_sa_auth));
3534 			*diagnostic = SADB_X_DIAGNOSTIC_BAD_AALG;
3535 			return (EINVAL);
3536 		}
3537 
3538 		/*
3539 		 * Sanity check key sizes.
3540 		 * Note: It's not possible to use SADB_AALG_NONE because
3541 		 * this auth_alg is not defined with ALG_FLAG_VALID. If this
3542 		 * ever changes, the same check for SADB_AALG_NONE and
3543 		 * a auth_key != NULL should be made here ( see below).
3544 		 */
3545 		if (!ipsec_valid_key_size(akey->sadb_key_bits, aalg)) {
3546 			mutex_exit(&ipss->ipsec_alg_lock);
3547 			*diagnostic = SADB_X_DIAGNOSTIC_BAD_AKEYBITS;
3548 			return (EINVAL);
3549 		}
3550 		ASSERT(aalg->alg_mech_type != CRYPTO_MECHANISM_INVALID);
3551 
3552 		/* check key and fix parity if needed */
3553 		if (ipsec_check_key(aalg->alg_mech_type, akey, B_TRUE,
3554 		    diagnostic) != 0) {
3555 			mutex_exit(&ipss->ipsec_alg_lock);
3556 			return (EINVAL);
3557 		}
3558 	}
3559 
3560 	/*
3561 	 * Then locate the encryption algorithm.
3562 	 */
3563 	if (ekey != NULL) {
3564 		ipsec_alginfo_t *ealg;
3565 
3566 		ealg = ipss->ipsec_alglists[IPSEC_ALG_ENCR]
3567 		    [assoc->sadb_sa_encrypt];
3568 		if (ealg == NULL || !ALG_VALID(ealg)) {
3569 			mutex_exit(&ipss->ipsec_alg_lock);
3570 			esp1dbg(espstack, ("Couldn't find encr alg #%d.\n",
3571 			    assoc->sadb_sa_encrypt));
3572 			*diagnostic = SADB_X_DIAGNOSTIC_BAD_EALG;
3573 			return (EINVAL);
3574 		}
3575 
3576 		/*
3577 		 * Sanity check key sizes. If the encryption algorithm is
3578 		 * SADB_EALG_NULL but the encryption key is NOT
3579 		 * NULL then complain.
3580 		 */
3581 		if ((assoc->sadb_sa_encrypt == SADB_EALG_NULL) ||
3582 		    (!ipsec_valid_key_size(ekey->sadb_key_bits, ealg))) {
3583 			mutex_exit(&ipss->ipsec_alg_lock);
3584 			*diagnostic = SADB_X_DIAGNOSTIC_BAD_EKEYBITS;
3585 			return (EINVAL);
3586 		}
3587 		ASSERT(ealg->alg_mech_type != CRYPTO_MECHANISM_INVALID);
3588 
3589 		/* check key */
3590 		if (ipsec_check_key(ealg->alg_mech_type, ekey, B_FALSE,
3591 		    diagnostic) != 0) {
3592 			mutex_exit(&ipss->ipsec_alg_lock);
3593 			return (EINVAL);
3594 		}
3595 	}
3596 	mutex_exit(&ipss->ipsec_alg_lock);
3597 
3598 	return (esp_add_sa_finish(mp, (sadb_msg_t *)mp->b_cont->b_rptr, ksi,
3599 	    diagnostic, espstack));
3600 }
3601 
3602 /*
3603  * Update a security association.  Updates come in two varieties.  The first
3604  * is an update of lifetimes on a non-larval SA.  The second is an update of
3605  * a larval SA, which ends up looking a lot more like an add.
3606  */
3607 static int
3608 esp_update_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic,
3609     ipsecesp_stack_t *espstack, uint8_t sadb_msg_type)
3610 {
3611 	sadb_address_t *dstext =
3612 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3613 
3614 	if (dstext == NULL) {
3615 		*diagnostic = SADB_X_DIAGNOSTIC_MISSING_DST;
3616 		return (EINVAL);
3617 	}
3618 
3619 	return (sadb_update_sa(mp, ksi, &espstack->esp_sadb,
3620 	    diagnostic, espstack->esp_pfkey_q,
3621 	    esp_add_sa, espstack->ipsecesp_netstack, sadb_msg_type));
3622 }
3623 
3624 /*
3625  * Delete a security association.  This is REALLY likely to be code common to
3626  * both AH and ESP.  Find the association, then unlink it.
3627  */
3628 static int
3629 esp_del_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic,
3630     ipsecesp_stack_t *espstack, uint8_t sadb_msg_type)
3631 {
3632 	sadb_sa_t *assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SA];
3633 	sadb_address_t *dstext =
3634 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3635 	sadb_address_t *srcext =
3636 	    (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_SRC];
3637 	struct sockaddr_in *sin;
3638 
3639 	if (assoc == NULL) {
3640 		if (dstext != NULL) {
3641 			sin = (struct sockaddr_in *)(dstext + 1);
3642 		} else if (srcext != NULL) {
3643 			sin = (struct sockaddr_in *)(srcext + 1);
3644 		} else {
3645 			*diagnostic = SADB_X_DIAGNOSTIC_MISSING_SA;
3646 			return (EINVAL);
3647 		}
3648 		return (sadb_purge_sa(mp, ksi,
3649 		    (sin->sin_family == AF_INET6) ? &espstack->esp_sadb.s_v6 :
3650 		    &espstack->esp_sadb.s_v4, espstack->esp_pfkey_q,
3651 		    espstack->esp_sadb.s_ip_q));
3652 	}
3653 
3654 	return (sadb_delget_sa(mp, ksi, &espstack->esp_sadb, diagnostic,
3655 	    espstack->esp_pfkey_q, sadb_msg_type));
3656 }
3657 
3658 /*
3659  * Convert the entire contents of all of ESP's SA tables into PF_KEY SADB_DUMP
3660  * messages.
3661  */
3662 static void
3663 esp_dump(mblk_t *mp, keysock_in_t *ksi, ipsecesp_stack_t *espstack)
3664 {
3665 	int error;
3666 	sadb_msg_t *samsg;
3667 
3668 	/*
3669 	 * Dump each fanout, bailing if error is non-zero.
3670 	 */
3671 
3672 	error = sadb_dump(espstack->esp_pfkey_q, mp, ksi->ks_in_serial,
3673 	    &espstack->esp_sadb.s_v4);
3674 	if (error != 0)
3675 		goto bail;
3676 
3677 	error = sadb_dump(espstack->esp_pfkey_q, mp, ksi->ks_in_serial,
3678 	    &espstack->esp_sadb.s_v6);
3679 bail:
3680 	ASSERT(mp->b_cont != NULL);
3681 	samsg = (sadb_msg_t *)mp->b_cont->b_rptr;
3682 	samsg->sadb_msg_errno = (uint8_t)error;
3683 	sadb_pfkey_echo(espstack->esp_pfkey_q, mp,
3684 	    (sadb_msg_t *)mp->b_cont->b_rptr, ksi, NULL);
3685 }
3686 
3687 /*
3688  * First-cut reality check for an inbound PF_KEY message.
3689  */
3690 static boolean_t
3691 esp_pfkey_reality_failures(mblk_t *mp, keysock_in_t *ksi,
3692     ipsecesp_stack_t *espstack)
3693 {
3694 	int diagnostic;
3695 
3696 	if (ksi->ks_in_extv[SADB_EXT_PROPOSAL] != NULL) {
3697 		diagnostic = SADB_X_DIAGNOSTIC_PROP_PRESENT;
3698 		goto badmsg;
3699 	}
3700 	if (ksi->ks_in_extv[SADB_EXT_SUPPORTED_AUTH] != NULL ||
3701 	    ksi->ks_in_extv[SADB_EXT_SUPPORTED_ENCRYPT] != NULL) {
3702 		diagnostic = SADB_X_DIAGNOSTIC_SUPP_PRESENT;
3703 		goto badmsg;
3704 	}
3705 	return (B_FALSE);	/* False ==> no failures */
3706 
3707 badmsg:
3708 	sadb_pfkey_error(espstack->esp_pfkey_q, mp, EINVAL, diagnostic,
3709 	    ksi->ks_in_serial);
3710 	return (B_TRUE);	/* True ==> failures */
3711 }
3712 
3713 /*
3714  * ESP parsing of PF_KEY messages.  Keysock did most of the really silly
3715  * error cases.  What I receive is a fully-formed, syntactically legal
3716  * PF_KEY message.  I then need to check semantics...
3717  *
3718  * This code may become common to AH and ESP.  Stay tuned.
3719  *
3720  * I also make the assumption that db_ref's are cool.  If this assumption
3721  * is wrong, this means that someone other than keysock or me has been
3722  * mucking with PF_KEY messages.
3723  */
3724 static void
3725 esp_parse_pfkey(mblk_t *mp, ipsecesp_stack_t *espstack)
3726 {
3727 	mblk_t *msg = mp->b_cont;
3728 	sadb_msg_t *samsg;
3729 	keysock_in_t *ksi;
3730 	int error;
3731 	int diagnostic = SADB_X_DIAGNOSTIC_NONE;
3732 
3733 	ASSERT(msg != NULL);
3734 
3735 	samsg = (sadb_msg_t *)msg->b_rptr;
3736 	ksi = (keysock_in_t *)mp->b_rptr;
3737 
3738 	/*
3739 	 * If applicable, convert unspecified AF_INET6 to unspecified
3740 	 * AF_INET.  And do other address reality checks.
3741 	 */
3742 	if (!sadb_addrfix(ksi, espstack->esp_pfkey_q, mp,
3743 	    espstack->ipsecesp_netstack) ||
3744 	    esp_pfkey_reality_failures(mp, ksi, espstack)) {
3745 		return;
3746 	}
3747 
3748 	switch (samsg->sadb_msg_type) {
3749 	case SADB_ADD:
3750 		error = esp_add_sa(mp, ksi, &diagnostic,
3751 		    espstack->ipsecesp_netstack);
3752 		if (error != 0) {
3753 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3754 			    diagnostic, ksi->ks_in_serial);
3755 		}
3756 		/* else esp_add_sa() took care of things. */
3757 		break;
3758 	case SADB_DELETE:
3759 	case SADB_X_DELPAIR:
3760 		error = esp_del_sa(mp, ksi, &diagnostic, espstack,
3761 		    samsg->sadb_msg_type);
3762 		if (error != 0) {
3763 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3764 			    diagnostic, ksi->ks_in_serial);
3765 		}
3766 		/* Else esp_del_sa() took care of things. */
3767 		break;
3768 	case SADB_GET:
3769 		error = sadb_delget_sa(mp, ksi, &espstack->esp_sadb,
3770 		    &diagnostic, espstack->esp_pfkey_q, samsg->sadb_msg_type);
3771 		if (error != 0) {
3772 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3773 			    diagnostic, ksi->ks_in_serial);
3774 		}
3775 		/* Else sadb_get_sa() took care of things. */
3776 		break;
3777 	case SADB_FLUSH:
3778 		sadbp_flush(&espstack->esp_sadb, espstack->ipsecesp_netstack);
3779 		sadb_pfkey_echo(espstack->esp_pfkey_q, mp, samsg, ksi, NULL);
3780 		break;
3781 	case SADB_REGISTER:
3782 		/*
3783 		 * Hmmm, let's do it!  Check for extensions (there should
3784 		 * be none), extract the fields, call esp_register_out(),
3785 		 * then either free or report an error.
3786 		 *
3787 		 * Keysock takes care of the PF_KEY bookkeeping for this.
3788 		 */
3789 		if (esp_register_out(samsg->sadb_msg_seq, samsg->sadb_msg_pid,
3790 		    ksi->ks_in_serial, espstack)) {
3791 			freemsg(mp);
3792 		} else {
3793 			/*
3794 			 * Only way this path hits is if there is a memory
3795 			 * failure.  It will not return B_FALSE because of
3796 			 * lack of esp_pfkey_q if I am in wput().
3797 			 */
3798 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, ENOMEM,
3799 			    diagnostic, ksi->ks_in_serial);
3800 		}
3801 		break;
3802 	case SADB_UPDATE:
3803 	case SADB_X_UPDATEPAIR:
3804 		/*
3805 		 * Find a larval, if not there, find a full one and get
3806 		 * strict.
3807 		 */
3808 		error = esp_update_sa(mp, ksi, &diagnostic, espstack,
3809 		    samsg->sadb_msg_type);
3810 		if (error != 0) {
3811 			sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3812 			    diagnostic, ksi->ks_in_serial);
3813 		}
3814 		/* else esp_update_sa() took care of things. */
3815 		break;
3816 	case SADB_GETSPI:
3817 		/*
3818 		 * Reserve a new larval entry.
3819 		 */
3820 		esp_getspi(mp, ksi, espstack);
3821 		break;
3822 	case SADB_ACQUIRE:
3823 		/*
3824 		 * Find larval and/or ACQUIRE record and kill it (them), I'm
3825 		 * most likely an error.  Inbound ACQUIRE messages should only
3826 		 * have the base header.
3827 		 */
3828 		sadb_in_acquire(samsg, &espstack->esp_sadb,
3829 		    espstack->esp_pfkey_q, espstack->ipsecesp_netstack);
3830 		freemsg(mp);
3831 		break;
3832 	case SADB_DUMP:
3833 		/*
3834 		 * Dump all entries.
3835 		 */
3836 		esp_dump(mp, ksi, espstack);
3837 		/* esp_dump will take care of the return message, etc. */
3838 		break;
3839 	case SADB_EXPIRE:
3840 		/* Should never reach me. */
3841 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, EOPNOTSUPP,
3842 		    diagnostic, ksi->ks_in_serial);
3843 		break;
3844 	default:
3845 		sadb_pfkey_error(espstack->esp_pfkey_q, mp, EINVAL,
3846 		    SADB_X_DIAGNOSTIC_UNKNOWN_MSG, ksi->ks_in_serial);
3847 		break;
3848 	}
3849 }
3850 
3851 /*
3852  * Handle case where PF_KEY says it can't find a keysock for one of my
3853  * ACQUIRE messages.
3854  */
3855 static void
3856 esp_keysock_no_socket(mblk_t *mp, ipsecesp_stack_t *espstack)
3857 {
3858 	sadb_msg_t *samsg;
3859 	keysock_out_err_t *kse = (keysock_out_err_t *)mp->b_rptr;
3860 
3861 	if (mp->b_cont == NULL) {
3862 		freemsg(mp);
3863 		return;
3864 	}
3865 	samsg = (sadb_msg_t *)mp->b_cont->b_rptr;
3866 
3867 	/*
3868 	 * If keysock can't find any registered, delete the acquire record
3869 	 * immediately, and handle errors.
3870 	 */
3871 	if (samsg->sadb_msg_type == SADB_ACQUIRE) {
3872 		samsg->sadb_msg_errno = kse->ks_err_errno;
3873 		samsg->sadb_msg_len = SADB_8TO64(sizeof (*samsg));
3874 		/*
3875 		 * Use the write-side of the esp_pfkey_q, in case there is
3876 		 * no esp_sadb.s_ip_q.
3877 		 */
3878 		sadb_in_acquire(samsg, &espstack->esp_sadb,
3879 		    WR(espstack->esp_pfkey_q), espstack->ipsecesp_netstack);
3880 	}
3881 
3882 	freemsg(mp);
3883 }
3884 
3885 /*
3886  * ESP module write put routine.
3887  */
3888 static void
3889 ipsecesp_wput(queue_t *q, mblk_t *mp)
3890 {
3891 	ipsec_info_t *ii;
3892 	struct iocblk *iocp;
3893 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
3894 
3895 	esp3dbg(espstack, ("In esp_wput().\n"));
3896 
3897 	/* NOTE: Each case must take care of freeing or passing mp. */
3898 	switch (mp->b_datap->db_type) {
3899 	case M_CTL:
3900 		if ((mp->b_wptr - mp->b_rptr) < sizeof (ipsec_info_t)) {
3901 			/* Not big enough message. */
3902 			freemsg(mp);
3903 			break;
3904 		}
3905 		ii = (ipsec_info_t *)mp->b_rptr;
3906 
3907 		switch (ii->ipsec_info_type) {
3908 		case KEYSOCK_OUT_ERR:
3909 			esp1dbg(espstack, ("Got KEYSOCK_OUT_ERR message.\n"));
3910 			esp_keysock_no_socket(mp, espstack);
3911 			break;
3912 		case KEYSOCK_IN:
3913 			ESP_BUMP_STAT(espstack, keysock_in);
3914 			esp3dbg(espstack, ("Got KEYSOCK_IN message.\n"));
3915 
3916 			/* Parse the message. */
3917 			esp_parse_pfkey(mp, espstack);
3918 			break;
3919 		case KEYSOCK_HELLO:
3920 			sadb_keysock_hello(&espstack->esp_pfkey_q, q, mp,
3921 			    esp_ager, (void *)espstack, &espstack->esp_event,
3922 			    SADB_SATYPE_ESP);
3923 			break;
3924 		default:
3925 			esp2dbg(espstack, ("Got M_CTL from above of 0x%x.\n",
3926 			    ii->ipsec_info_type));
3927 			freemsg(mp);
3928 			break;
3929 		}
3930 		break;
3931 	case M_IOCTL:
3932 		iocp = (struct iocblk *)mp->b_rptr;
3933 		switch (iocp->ioc_cmd) {
3934 		case ND_SET:
3935 		case ND_GET:
3936 			if (nd_getset(q, espstack->ipsecesp_g_nd, mp)) {
3937 				qreply(q, mp);
3938 				return;
3939 			} else {
3940 				iocp->ioc_error = ENOENT;
3941 			}
3942 			/* FALLTHRU */
3943 		default:
3944 			/* We really don't support any other ioctls, do we? */
3945 
3946 			/* Return EINVAL */
3947 			if (iocp->ioc_error != ENOENT)
3948 				iocp->ioc_error = EINVAL;
3949 			iocp->ioc_count = 0;
3950 			mp->b_datap->db_type = M_IOCACK;
3951 			qreply(q, mp);
3952 			return;
3953 		}
3954 	default:
3955 		esp3dbg(espstack,
3956 		    ("Got default message, type %d, passing to IP.\n",
3957 		    mp->b_datap->db_type));
3958 		putnext(q, mp);
3959 	}
3960 }
3961 
3962 /*
3963  * Process an outbound ESP packet that can be accelerated by a IPsec
3964  * hardware acceleration capable Provider.
3965  * The caller already inserted and initialized the ESP header.
3966  * This function allocates a tagging M_CTL, and adds room at the end
3967  * of the packet to hold the ICV if authentication is needed.
3968  *
3969  * On success returns B_TRUE, on failure returns B_FALSE and frees the
3970  * mblk chain ipsec_out.
3971  */
3972 static ipsec_status_t
3973 esp_outbound_accelerated(mblk_t *ipsec_out, uint_t icv_len)
3974 {
3975 	ipsec_out_t *io;
3976 	mblk_t *lastmp;
3977 	netstack_t	*ns;
3978 	ipsecesp_stack_t *espstack;
3979 	ipsec_stack_t	*ipss;
3980 
3981 	io = (ipsec_out_t *)ipsec_out->b_rptr;
3982 	ns = io->ipsec_out_ns;
3983 	espstack = ns->netstack_ipsecesp;
3984 	ipss = ns->netstack_ipsec;
3985 
3986 	ESP_BUMP_STAT(espstack, out_accelerated);
3987 
3988 	/* mark packet as being accelerated in IPSEC_OUT */
3989 	ASSERT(io->ipsec_out_accelerated == B_FALSE);
3990 	io->ipsec_out_accelerated = B_TRUE;
3991 
3992 	/*
3993 	 * add room at the end of the packet for the ICV if needed
3994 	 */
3995 	if (icv_len > 0) {
3996 		/* go to last mblk */
3997 		lastmp = ipsec_out;	/* For following while loop. */
3998 		do {
3999 			lastmp = lastmp->b_cont;
4000 		} while (lastmp->b_cont != NULL);
4001 
4002 		/* if not enough available room, allocate new mblk */
4003 		if ((lastmp->b_wptr + icv_len) > lastmp->b_datap->db_lim) {
4004 			lastmp->b_cont = allocb(icv_len, BPRI_HI);
4005 			if (lastmp->b_cont == NULL) {
4006 				ESP_BUMP_STAT(espstack, out_discards);
4007 				ip_drop_packet(ipsec_out, B_FALSE, NULL, NULL,
4008 				    DROPPER(ipss, ipds_esp_nomem),
4009 				    &espstack->esp_dropper);
4010 				return (IPSEC_STATUS_FAILED);
4011 			}
4012 			lastmp = lastmp->b_cont;
4013 		}
4014 		lastmp->b_wptr += icv_len;
4015 	}
4016 
4017 	return (IPSEC_STATUS_SUCCESS);
4018 }
4019 
4020 /*
4021  * Process an inbound accelerated ESP packet.
4022  * On success returns B_TRUE, on failure returns B_FALSE and frees the
4023  * mblk chain ipsec_in.
4024  */
4025 static ipsec_status_t
4026 esp_inbound_accelerated(mblk_t *ipsec_in, mblk_t *data_mp, boolean_t isv4,
4027     ipsa_t *assoc)
4028 {
4029 	ipsec_in_t *ii = (ipsec_in_t *)ipsec_in->b_rptr;
4030 	mblk_t *hada_mp;
4031 	uint32_t icv_len = 0;
4032 	da_ipsec_t *hada;
4033 	ipha_t *ipha;
4034 	ip6_t *ip6h;
4035 	kstat_named_t *counter;
4036 	netstack_t	*ns = ii->ipsec_in_ns;
4037 	ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
4038 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
4039 
4040 	ESP_BUMP_STAT(espstack, in_accelerated);
4041 
4042 	hada_mp = ii->ipsec_in_da;
4043 	ASSERT(hada_mp != NULL);
4044 	hada = (da_ipsec_t *)hada_mp->b_rptr;
4045 
4046 	/*
4047 	 * We only support one level of decapsulation in hardware, so
4048 	 * nuke the pointer.
4049 	 */
4050 	ii->ipsec_in_da = NULL;
4051 	ii->ipsec_in_accelerated = B_FALSE;
4052 
4053 	if (assoc->ipsa_auth_alg != IPSA_AALG_NONE) {
4054 		/*
4055 		 * ESP with authentication. We expect the Provider to have
4056 		 * computed the ICV and placed it in the hardware acceleration
4057 		 * data attributes.
4058 		 *
4059 		 * Extract ICV length from attributes M_CTL and sanity check
4060 		 * its value. We allow the mblk to be smaller than da_ipsec_t
4061 		 * for a small ICV, as long as the entire ICV fits within the
4062 		 * mblk.
4063 		 *
4064 		 * Also ensures that the ICV length computed by Provider
4065 		 * corresponds to the ICV length of the agorithm specified by
4066 		 * the SA.
4067 		 */
4068 		icv_len = hada->da_icv_len;
4069 		if ((icv_len != assoc->ipsa_mac_len) ||
4070 		    (icv_len > DA_ICV_MAX_LEN) || (MBLKL(hada_mp) <
4071 		    (sizeof (da_ipsec_t) - DA_ICV_MAX_LEN + icv_len))) {
4072 			esp0dbg(("esp_inbound_accelerated: "
4073 			    "ICV len (%u) incorrect or mblk too small (%u)\n",
4074 			    icv_len, (uint32_t)(MBLKL(hada_mp))));
4075 			counter = DROPPER(ipss, ipds_esp_bad_auth);
4076 			goto esp_in_discard;
4077 		}
4078 	}
4079 
4080 	/* get pointers to IP header */
4081 	if (isv4) {
4082 		ipha = (ipha_t *)data_mp->b_rptr;
4083 	} else {
4084 		ip6h = (ip6_t *)data_mp->b_rptr;
4085 	}
4086 
4087 	/*
4088 	 * Compare ICV in ESP packet vs ICV computed by adapter.
4089 	 * We also remove the ICV from the end of the packet since
4090 	 * it will no longer be needed.
4091 	 *
4092 	 * Assume that esp_inbound() already ensured that the pkt
4093 	 * was in one mblk.
4094 	 */
4095 	ASSERT(data_mp->b_cont == NULL);
4096 	data_mp->b_wptr -= icv_len;
4097 	/* adjust IP header */
4098 	if (isv4)
4099 		ipha->ipha_length = htons(ntohs(ipha->ipha_length) - icv_len);
4100 	else
4101 		ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - icv_len);
4102 	if (icv_len && bcmp(hada->da_icv, data_mp->b_wptr, icv_len)) {
4103 		int af;
4104 		void *addr;
4105 
4106 		if (isv4) {
4107 			addr = &ipha->ipha_dst;
4108 			af = AF_INET;
4109 		} else {
4110 			addr = &ip6h->ip6_dst;
4111 			af = AF_INET6;
4112 		}
4113 
4114 		/*
4115 		 * Log the event. Don't print to the console, block
4116 		 * potential denial-of-service attack.
4117 		 */
4118 		ESP_BUMP_STAT(espstack, bad_auth);
4119 		ipsec_assocfailure(info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
4120 		    "ESP Authentication failed spi %x, dst_addr %s",
4121 		    assoc->ipsa_spi, addr, af, espstack->ipsecesp_netstack);
4122 		counter = DROPPER(ipss, ipds_esp_bad_auth);
4123 		goto esp_in_discard;
4124 	}
4125 
4126 	esp3dbg(espstack, ("esp_inbound_accelerated: ESP authentication "
4127 	    "succeeded, checking replay\n"));
4128 
4129 	ipsec_in->b_cont = data_mp;
4130 
4131 	/*
4132 	 * Remove ESP header and padding from packet.
4133 	 */
4134 	if (!esp_strip_header(data_mp, ii->ipsec_in_v4, assoc->ipsa_iv_len,
4135 	    &counter, espstack)) {
4136 		esp1dbg(espstack, ("esp_inbound_accelerated: "
4137 		    "esp_strip_header() failed\n"));
4138 		goto esp_in_discard;
4139 	}
4140 
4141 	freeb(hada_mp);
4142 
4143 	/*
4144 	 * Account for usage..
4145 	 */
4146 	if (!esp_age_bytes(assoc, msgdsize(data_mp), B_TRUE)) {
4147 		/* The ipsa has hit hard expiration, LOG and AUDIT. */
4148 		ESP_BUMP_STAT(espstack, bytes_expired);
4149 		IP_ESP_BUMP_STAT(ipss, in_discards);
4150 		ipsec_assocfailure(info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
4151 		    "ESP association 0x%x, dst %s had bytes expire.\n",
4152 		    assoc->ipsa_spi, assoc->ipsa_dstaddr, assoc->ipsa_addrfam,
4153 		    espstack->ipsecesp_netstack);
4154 		ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL,
4155 		    DROPPER(ipss, ipds_esp_bytes_expire),
4156 		    &espstack->esp_dropper);
4157 		return (IPSEC_STATUS_FAILED);
4158 	}
4159 
4160 	/* done processing the packet */
4161 	return (IPSEC_STATUS_SUCCESS);
4162 
4163 esp_in_discard:
4164 	IP_ESP_BUMP_STAT(ipss, in_discards);
4165 	freeb(hada_mp);
4166 
4167 	ipsec_in->b_cont = data_mp;	/* For ip_drop_packet()'s sake... */
4168 	ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL, counter,
4169 	    &espstack->esp_dropper);
4170 
4171 	return (IPSEC_STATUS_FAILED);
4172 }
4173 
4174 /*
4175  * Wrapper to allow IP to trigger an ESP association failure message
4176  * during inbound SA selection.
4177  */
4178 void
4179 ipsecesp_in_assocfailure(mblk_t *mp, char level, ushort_t sl, char *fmt,
4180     uint32_t spi, void *addr, int af, ipsecesp_stack_t *espstack)
4181 {
4182 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
4183 
4184 	if (espstack->ipsecesp_log_unknown_spi) {
4185 		ipsec_assocfailure(info.mi_idnum, 0, level, sl, fmt, spi,
4186 		    addr, af, espstack->ipsecesp_netstack);
4187 	}
4188 
4189 	ip_drop_packet(mp, B_TRUE, NULL, NULL,
4190 	    DROPPER(ipss, ipds_esp_no_sa),
4191 	    &espstack->esp_dropper);
4192 }
4193 
4194 /*
4195  * Initialize the ESP input and output processing functions.
4196  */
4197 void
4198 ipsecesp_init_funcs(ipsa_t *sa)
4199 {
4200 	if (sa->ipsa_output_func == NULL)
4201 		sa->ipsa_output_func = esp_outbound;
4202 	if (sa->ipsa_input_func == NULL)
4203 		sa->ipsa_input_func = esp_inbound;
4204 }
4205