xref: /freebsd/sys/crypto/ccp/ccp_hardware.c (revision 54e9e4e72d711fb41f88f793f6c64df1126112f9)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Chelsio Communications, Inc.
5  * Copyright (c) 2017 Conrad Meyer <cem@FreeBSD.org>
6  * All rights reserved.
7  * Largely borrowed from ccr(4), Written by: John Baldwin <jhb@FreeBSD.org>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_ddb.h"
35 
36 #include <sys/param.h>
37 #include <sys/bus.h>
38 #include <sys/lock.h>
39 #include <sys/kernel.h>
40 #include <sys/malloc.h>
41 #include <sys/mutex.h>
42 #include <sys/module.h>
43 #include <sys/rman.h>
44 #include <sys/sglist.h>
45 #include <sys/sysctl.h>
46 
47 #ifdef DDB
48 #include <ddb/ddb.h>
49 #endif
50 
51 #include <dev/pci/pcireg.h>
52 #include <dev/pci/pcivar.h>
53 
54 #include <machine/bus.h>
55 #include <machine/resource.h>
56 #include <machine/vmparam.h>
57 
58 #include <opencrypto/cryptodev.h>
59 #include <opencrypto/xform.h>
60 
61 #include <vm/vm.h>
62 #include <vm/pmap.h>
63 
64 #include "cryptodev_if.h"
65 
66 #include "ccp.h"
67 #include "ccp_hardware.h"
68 #include "ccp_lsb.h"
69 
70 CTASSERT(sizeof(struct ccp_desc) == 32);
71 
72 static struct ccp_xts_unitsize_map_entry {
73 	enum ccp_xts_unitsize cxu_id;
74 	unsigned cxu_size;
75 } ccp_xts_unitsize_map[] = {
76 	{ CCP_XTS_AES_UNIT_SIZE_16, 16 },
77 	{ CCP_XTS_AES_UNIT_SIZE_512, 512 },
78 	{ CCP_XTS_AES_UNIT_SIZE_1024, 1024 },
79 	{ CCP_XTS_AES_UNIT_SIZE_2048, 2048 },
80 	{ CCP_XTS_AES_UNIT_SIZE_4096, 4096 },
81 };
82 
83 SYSCTL_NODE(_hw, OID_AUTO, ccp, CTLFLAG_RD, 0, "ccp node");
84 
85 unsigned g_ccp_ring_order = 11;
86 SYSCTL_UINT(_hw_ccp, OID_AUTO, ring_order, CTLFLAG_RDTUN, &g_ccp_ring_order,
87     0, "Set CCP ring order.  (1 << this) == ring size.  Min: 6, Max: 16");
88 
89 /*
90  * Zero buffer, sufficient for padding LSB entries, that does not span a page
91  * boundary
92  */
93 static const char g_zeroes[32] __aligned(32);
94 
95 static inline uint32_t
96 ccp_read_4(struct ccp_softc *sc, uint32_t offset)
97 {
98 	return (bus_space_read_4(sc->pci_bus_tag, sc->pci_bus_handle, offset));
99 }
100 
101 static inline void
102 ccp_write_4(struct ccp_softc *sc, uint32_t offset, uint32_t value)
103 {
104 	bus_space_write_4(sc->pci_bus_tag, sc->pci_bus_handle, offset, value);
105 }
106 
107 static inline uint32_t
108 ccp_read_queue_4(struct ccp_softc *sc, unsigned queue, uint32_t offset)
109 {
110 	/*
111 	 * Each queue gets its own 4kB register space.  Queue 0 is at 0x1000.
112 	 */
113 	return (ccp_read_4(sc, (CMD_Q_STATUS_INCR * (1 + queue)) + offset));
114 }
115 
116 static inline void
117 ccp_write_queue_4(struct ccp_softc *sc, unsigned queue, uint32_t offset,
118     uint32_t value)
119 {
120 	ccp_write_4(sc, (CMD_Q_STATUS_INCR * (1 + queue)) + offset, value);
121 }
122 
123 void
124 ccp_queue_write_tail(struct ccp_queue *qp)
125 {
126 	ccp_write_queue_4(qp->cq_softc, qp->cq_qindex, CMD_Q_TAIL_LO_BASE,
127 	    ((uint32_t)qp->desc_ring_bus_addr) + (Q_DESC_SIZE * qp->cq_tail));
128 }
129 
130 /*
131  * Given a queue and a reserved LSB entry index, compute the LSB *entry id* of
132  * that entry for the queue's private LSB region.
133  */
134 static inline uint8_t
135 ccp_queue_lsb_entry(struct ccp_queue *qp, unsigned lsb_entry)
136 {
137 	return ((qp->private_lsb * LSB_REGION_LENGTH + lsb_entry));
138 }
139 
140 /*
141  * Given a queue and a reserved LSB entry index, compute the LSB *address* of
142  * that entry for the queue's private LSB region.
143  */
144 static inline uint32_t
145 ccp_queue_lsb_address(struct ccp_queue *qp, unsigned lsb_entry)
146 {
147 	return (ccp_queue_lsb_entry(qp, lsb_entry) * LSB_ENTRY_SIZE);
148 }
149 
150 /*
151  * Some terminology:
152  *
153  * LSB - Local Storage Block
154  * =========================
155  *
156  * 8 segments/regions, each containing 16 entries.
157  *
158  * Each entry contains 256 bits (32 bytes).
159  *
160  * Segments are virtually addressed in commands, but accesses cannot cross
161  * segment boundaries.  Virtual map uses an identity mapping by default
162  * (virtual segment N corresponds to physical segment N).
163  *
164  * Access to a physical region can be restricted to any subset of all five
165  * queues.
166  *
167  * "Pass-through" mode
168  * ===================
169  *
170  * Pass-through is a generic DMA engine, much like ioat(4).  Some nice
171  * features:
172  *
173  * - Supports byte-swapping for endian conversion (32- or 256-bit words)
174  * - AND, OR, XOR with fixed 256-bit mask
175  * - CRC32 of data (may be used in tandem with bswap, but not bit operations)
176  * - Read/write of LSB
177  * - Memset
178  *
179  * If bit manipulation mode is enabled, input must be a multiple of 256 bits
180  * (32 bytes).
181  *
182  * If byte-swapping is enabled, input must be a multiple of the word size.
183  *
184  * Zlib mode -- only usable from one queue at a time, single job at a time.
185  * ========================================================================
186  *
187  * Only usable from private host, aka PSP?  Not host processor?
188  *
189  * RNG.
190  * ====
191  *
192  * Raw bits are conditioned with AES and fed through CTR_DRBG.  Output goes in
193  * a ring buffer readable by software.
194  *
195  * NIST SP 800-90B Repetition Count and Adaptive Proportion health checks are
196  * implemented on the raw input stream and may be enabled to verify min-entropy
197  * of 0.5 bits per bit.
198  */
199 
200 static void
201 ccp_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
202 {
203 	bus_addr_t *baddr;
204 
205 	KASSERT(error == 0, ("%s: error:%d", __func__, error));
206 	baddr = arg;
207 	*baddr = segs->ds_addr;
208 }
209 
210 static int
211 ccp_hw_attach_queue(device_t dev, uint64_t lsbmask, unsigned queue)
212 {
213 	struct ccp_softc *sc;
214 	struct ccp_queue *qp;
215 	void *desc;
216 	size_t ringsz, num_descriptors;
217 	int error;
218 
219 	desc = NULL;
220 	sc = device_get_softc(dev);
221 	qp = &sc->queues[queue];
222 
223 	/*
224 	 * Don't bother allocating a ring for queues the host isn't allowed to
225 	 * drive.
226 	 */
227 	if ((sc->valid_queues & (1 << queue)) == 0)
228 		return (0);
229 
230 	ccp_queue_decode_lsb_regions(sc, lsbmask, queue);
231 
232 	/* Ignore queues that do not have any LSB access. */
233 	if (qp->lsb_mask == 0) {
234 		device_printf(dev, "Ignoring queue %u with no LSB access\n",
235 		    queue);
236 		sc->valid_queues &= ~(1 << queue);
237 		return (0);
238 	}
239 
240 	num_descriptors = 1 << sc->ring_size_order;
241 	ringsz = sizeof(struct ccp_desc) * num_descriptors;
242 
243 	/*
244 	 * "Queue_Size" is order - 1.
245 	 *
246 	 * Queue must be aligned to 5+Queue_Size+1 == 5 + order bits.
247 	 */
248 	error = bus_dma_tag_create(bus_get_dma_tag(dev),
249 	    1 << (5 + sc->ring_size_order),
250 #if defined(__i386__) && !defined(PAE)
251 	    0, BUS_SPACE_MAXADDR,
252 #else
253 	    (bus_addr_t)1 << 32, BUS_SPACE_MAXADDR_48BIT,
254 #endif
255 	    BUS_SPACE_MAXADDR, NULL, NULL, ringsz, 1,
256 	    ringsz, 0, NULL, NULL, &qp->ring_desc_tag);
257 	if (error != 0)
258 		goto out;
259 
260 	error = bus_dmamem_alloc(qp->ring_desc_tag, &desc,
261 	    BUS_DMA_ZERO | BUS_DMA_WAITOK, &qp->ring_desc_map);
262 	if (error != 0)
263 		goto out;
264 
265 	error = bus_dmamap_load(qp->ring_desc_tag, qp->ring_desc_map, desc,
266 	    ringsz, ccp_dmamap_cb, &qp->desc_ring_bus_addr, BUS_DMA_WAITOK);
267 	if (error != 0)
268 		goto out;
269 
270 	qp->desc_ring = desc;
271 	qp->completions_ring = malloc(num_descriptors *
272 	    sizeof(*qp->completions_ring), M_CCP, M_ZERO | M_WAITOK);
273 
274 	/* Zero control register; among other things, clears the RUN flag. */
275 	qp->qcontrol = 0;
276 	ccp_write_queue_4(sc, queue, CMD_Q_CONTROL_BASE, qp->qcontrol);
277 	ccp_write_queue_4(sc, queue, CMD_Q_INT_ENABLE_BASE, 0);
278 
279 	/* Clear any leftover interrupt status flags */
280 	ccp_write_queue_4(sc, queue, CMD_Q_INTERRUPT_STATUS_BASE,
281 	    ALL_INTERRUPTS);
282 
283 	qp->qcontrol |= (sc->ring_size_order - 1) << CMD_Q_SIZE_SHIFT;
284 
285 	ccp_write_queue_4(sc, queue, CMD_Q_TAIL_LO_BASE,
286 	    (uint32_t)qp->desc_ring_bus_addr);
287 	ccp_write_queue_4(sc, queue, CMD_Q_HEAD_LO_BASE,
288 	    (uint32_t)qp->desc_ring_bus_addr);
289 
290 	/*
291 	 * Enable completion interrupts, as well as error or administrative
292 	 * halt interrupts.  We don't use administrative halts, but they
293 	 * shouldn't trip unless we do, so it ought to be harmless.
294 	 */
295 	ccp_write_queue_4(sc, queue, CMD_Q_INT_ENABLE_BASE,
296 	    INT_COMPLETION | INT_ERROR | INT_QUEUE_STOPPED);
297 
298 	qp->qcontrol |= (qp->desc_ring_bus_addr >> 32) << CMD_Q_PTR_HI_SHIFT;
299 	qp->qcontrol |= CMD_Q_RUN;
300 	ccp_write_queue_4(sc, queue, CMD_Q_CONTROL_BASE, qp->qcontrol);
301 
302 out:
303 	if (error != 0) {
304 		if (qp->desc_ring != NULL)
305 			bus_dmamap_unload(qp->ring_desc_tag,
306 			    qp->ring_desc_map);
307 		if (desc != NULL)
308 			bus_dmamem_free(qp->ring_desc_tag, desc,
309 			    qp->ring_desc_map);
310 		if (qp->ring_desc_tag != NULL)
311 			bus_dma_tag_destroy(qp->ring_desc_tag);
312 	}
313 	return (error);
314 }
315 
316 static void
317 ccp_hw_detach_queue(device_t dev, unsigned queue)
318 {
319 	struct ccp_softc *sc;
320 	struct ccp_queue *qp;
321 
322 	sc = device_get_softc(dev);
323 	qp = &sc->queues[queue];
324 
325 	/*
326 	 * Don't bother allocating a ring for queues the host isn't allowed to
327 	 * drive.
328 	 */
329 	if ((sc->valid_queues & (1 << queue)) == 0)
330 		return;
331 
332 	free(qp->completions_ring, M_CCP);
333 	bus_dmamap_unload(qp->ring_desc_tag, qp->ring_desc_map);
334 	bus_dmamem_free(qp->ring_desc_tag, qp->desc_ring, qp->ring_desc_map);
335 	bus_dma_tag_destroy(qp->ring_desc_tag);
336 }
337 
338 static int
339 ccp_map_pci_bar(device_t dev)
340 {
341 	struct ccp_softc *sc;
342 
343 	sc = device_get_softc(dev);
344 
345 	sc->pci_resource_id = PCIR_BAR(2);
346 	sc->pci_resource = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
347 	    &sc->pci_resource_id, RF_ACTIVE);
348 	if (sc->pci_resource == NULL) {
349 		device_printf(dev, "unable to allocate pci resource\n");
350 		return (ENODEV);
351 	}
352 
353 	sc->pci_resource_id_msix = PCIR_BAR(5);
354 	sc->pci_resource_msix = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
355 	    &sc->pci_resource_id_msix, RF_ACTIVE);
356 	if (sc->pci_resource_msix == NULL) {
357 		device_printf(dev, "unable to allocate pci resource msix\n");
358 		bus_release_resource(dev, SYS_RES_MEMORY, sc->pci_resource_id,
359 		    sc->pci_resource);
360 		return (ENODEV);
361 	}
362 
363 	sc->pci_bus_tag = rman_get_bustag(sc->pci_resource);
364 	sc->pci_bus_handle = rman_get_bushandle(sc->pci_resource);
365 	return (0);
366 }
367 
368 static void
369 ccp_unmap_pci_bar(device_t dev)
370 {
371 	struct ccp_softc *sc;
372 
373 	sc = device_get_softc(dev);
374 
375 	bus_release_resource(dev, SYS_RES_MEMORY, sc->pci_resource_id_msix,
376 	    sc->pci_resource_msix);
377 	bus_release_resource(dev, SYS_RES_MEMORY, sc->pci_resource_id,
378 	    sc->pci_resource);
379 }
380 
/*
 * Table used to decode the error field of a queue's status register.
 *
 * ce_code  - value of the hardware error field
 * ce_name  - short symbolic name printed in the log
 * ce_errno - errno value handed to the completion callback
 * ce_desc  - longer description printed in the log (may be empty)
 *
 * Codes missing from the table are reported as "(reserved)".
 */
static const struct ccp_error_code {
	uint8_t		ce_code;
	const char	*ce_name;
	int		ce_errno;
	const char	*ce_desc;
} ccp_error_codes[] = {
	{ 0x01, "ILLEGAL_ENGINE", EIO, "Requested engine was invalid" },
	{ 0x03, "ILLEGAL_FUNCTION_TYPE", EIO,
	    "A non-supported function type was specified" },
	{ 0x04, "ILLEGAL_FUNCTION_MODE", EIO,
	    "A non-supported function mode was specified" },
	{ 0x05, "ILLEGAL_FUNCTION_ENCRYPT", EIO,
	    "A CMAC type was specified when ENCRYPT was not specified" },
	{ 0x06, "ILLEGAL_FUNCTION_SIZE", EIO,
	    "A non-supported function size was specified.\n"
	    "AES-CFB: Size was not 127 or 7;\n"
	    "3DES-CFB: Size was not 7;\n"
	    "RSA: See supported size table (7.4.2);\n"
	    "ECC: Size was greater than 576 bits." },
	{ 0x07, "Zlib_MISSING_INIT_EOM", EIO,
	    "Zlib command does not have INIT and EOM set" },
	{ 0x08, "ILLEGAL_FUNCTION_RSVD", EIO,
	    "Reserved bits in a function specification were not 0" },
	{ 0x09, "ILLEGAL_BUFFER_LENGTH", EIO,
	    "The buffer length specified was not correct for the selected engine"
	},
	{ 0x0A, "VLSB_FAULT", EIO, "Illegal VLSB segment mapping:\n"
	    "Undefined VLSB segment mapping or\n"
	    "mapping to unsupported LSB segment id" },
	{ 0x0B, "ILLEGAL_MEM_ADDR", EFAULT,
	    "The specified source/destination buffer access was illegal:\n"
	    "Data buffer located in a LSB location disallowed by the LSB protection masks; or\n"
	    "Data buffer not completely contained within a single segment; or\n"
	    "Pointer with Fixed=1 is not 32-bit aligned; or\n"
	    "Pointer with Fixed=1 attempted to reference non-AXI1 (local) memory."
	},
	{ 0x0C, "ILLEGAL_MEM_SEL", EIO,
	    "A src_mem, dst_mem, or key_mem field was illegal:\n"
	    "A field was set to a reserved value; or\n"
	    "A public command attempted to reference AXI1 (local) or GART memory; or\n"
	    "A Zlib command attempted to use the LSB." },
	{ 0x0D, "ILLEGAL_CONTEXT_ADDR", EIO,
	    "The specified context location was illegal:\n"
	    "Context located in a LSB location disallowed by the LSB protection masks; or\n"
	    "Context not completely contained within a single segment." },
	{ 0x0E, "ILLEGAL_KEY_ADDR", EIO,
	    "The specified key location was illegal:\n"
	    "Key located in a LSB location disallowed by the LSB protection masks; or\n"
	    "Key not completely contained within a single segment." },
	{ 0x12, "CMD_TIMEOUT", EIO, "A command timeout violation occurred" },
	/* XXX Could fill out these descriptions too */
	{ 0x13, "IDMA0_AXI_SLVERR", EIO, "" },
	{ 0x14, "IDMA0_AXI_DECERR", EIO, "" },
	{ 0x16, "IDMA1_AXI_SLVERR", EIO, "" },
	{ 0x17, "IDMA1_AXI_DECERR", EIO, "" },
	{ 0x19, "ZLIBVHB_AXI_SLVERR", EIO, "" },
	{ 0x1A, "ZLIBVHB_AXI_DECERR", EIO, "" },
	{ 0x1C, "ZLIB_UNEXPECTED_EOM", EIO, "" },
	{ 0x1D, "ZLIB_EXTRA_DATA", EIO, "" },
	{ 0x1E, "ZLIB_BTYPE", EIO, "" },
	{ 0x20, "ZLIB_UNDEFINED_DISTANCE_SYMBOL", EIO, "" },
	{ 0x21, "ZLIB_CODE_LENGTH_SYMBOL", EIO, "" },
	{ 0x22, "ZLIB_VHB_ILLEGAL_FETCH", EIO, "" },
	{ 0x23, "ZLIB_UNCOMPRESSED_LEN", EIO, "" },
	{ 0x24, "ZLIB_LIMIT_REACHED", EIO, "" },
	{ 0x25, "ZLIB_CHECKSUM_MISMATCH", EIO, "" },
	{ 0x26, "ODMA0_AXI_SLVERR", EIO, "" },
	{ 0x27, "ODMA0_AXI_DECERR", EIO, "" },
	{ 0x29, "ODMA1_AXI_SLVERR", EIO, "" },
	{ 0x2A, "ODMA1_AXI_DECERR", EIO, "" },
	{ 0x2B, "LSB_PARITY_ERR", EIO,
	    "A read from the LSB encountered a parity error" },
};
454 
455 static void
456 ccp_intr_handle_error(struct ccp_queue *qp, const struct ccp_desc *desc)
457 {
458 	struct ccp_completion_ctx *cctx;
459 	const struct ccp_error_code *ec;
460 	struct ccp_softc *sc;
461 	uint32_t status, error, esource, faultblock;
462 	unsigned q, idx;
463 	int errno;
464 
465 	sc = qp->cq_softc;
466 	q = qp->cq_qindex;
467 
468 	status = ccp_read_queue_4(sc, q, CMD_Q_STATUS_BASE);
469 
470 	error = status & STATUS_ERROR_MASK;
471 
472 	/* Decode error status */
473 	ec = NULL;
474 	for (idx = 0; idx < nitems(ccp_error_codes); idx++)
475 		if (ccp_error_codes[idx].ce_code == error) {
476 			ec = &ccp_error_codes[idx];
477 			break;
478 		}
479 
480 	esource = (status >> STATUS_ERRORSOURCE_SHIFT) &
481 	    STATUS_ERRORSOURCE_MASK;
482 	faultblock = (status >> STATUS_VLSB_FAULTBLOCK_SHIFT) &
483 	    STATUS_VLSB_FAULTBLOCK_MASK;
484 	device_printf(sc->dev, "Error: %s (%u) Source: %u Faulting LSB block: %u\n",
485 	    (ec != NULL) ? ec->ce_name : "(reserved)", error, esource,
486 	    faultblock);
487 	if (ec != NULL)
488 		device_printf(sc->dev, "Error description: %s\n", ec->ce_desc);
489 
490 	/* TODO Could format the desc nicely here */
491 	idx = desc - qp->desc_ring;
492 	DPRINTF(sc->dev, "Bad descriptor index: %u contents: %32D\n", idx,
493 	    (const void *)desc, " ");
494 
495 	/*
496 	 * TODO Per § 14.4 "Error Handling," DMA_Status, DMA_Read/Write_Status,
497 	 * Zlib Decompress status may be interesting.
498 	 */
499 
500 	while (true) {
501 		/* Keep unused descriptors zero for next use. */
502 		memset(&qp->desc_ring[idx], 0, sizeof(qp->desc_ring[idx]));
503 
504 		cctx = &qp->completions_ring[idx];
505 
506 		/*
507 		 * Restart procedure described in § 14.2.5.  Could be used by HoC if we
508 		 * used that.
509 		 *
510 		 * Advance HEAD_LO past bad descriptor + any remaining in
511 		 * transaction manually, then restart queue.
512 		 */
513 		idx = (idx + 1) % (1 << sc->ring_size_order);
514 
515 		/* Callback function signals end of transaction */
516 		if (cctx->callback_fn != NULL) {
517 			if (ec == NULL)
518 				errno = EIO;
519 			else
520 				errno = ec->ce_errno;
521 			/* TODO More specific error code */
522 			cctx->callback_fn(qp, cctx->session, cctx->callback_arg, errno);
523 			cctx->callback_fn = NULL;
524 			break;
525 		}
526 	}
527 
528 	qp->cq_head = idx;
529 	qp->cq_waiting = false;
530 	wakeup(&qp->cq_tail);
531 	DPRINTF(sc->dev, "%s: wrote sw head:%u\n", __func__, qp->cq_head);
532 	ccp_write_queue_4(sc, q, CMD_Q_HEAD_LO_BASE,
533 	    (uint32_t)qp->desc_ring_bus_addr + (idx * Q_DESC_SIZE));
534 	ccp_write_queue_4(sc, q, CMD_Q_CONTROL_BASE, qp->qcontrol);
535 	DPRINTF(sc->dev, "%s: Restarted queue\n", __func__);
536 }
537 
538 static void
539 ccp_intr_run_completions(struct ccp_queue *qp, uint32_t ints)
540 {
541 	struct ccp_completion_ctx *cctx;
542 	struct ccp_softc *sc;
543 	const struct ccp_desc *desc;
544 	uint32_t headlo, idx;
545 	unsigned q, completed;
546 
547 	sc = qp->cq_softc;
548 	q = qp->cq_qindex;
549 
550 	mtx_lock(&qp->cq_lock);
551 
552 	/*
553 	 * Hardware HEAD_LO points to the first incomplete descriptor.  Process
554 	 * any submitted and completed descriptors, up to but not including
555 	 * HEAD_LO.
556 	 */
557 	headlo = ccp_read_queue_4(sc, q, CMD_Q_HEAD_LO_BASE);
558 	idx = (headlo - (uint32_t)qp->desc_ring_bus_addr) / Q_DESC_SIZE;
559 
560 	DPRINTF(sc->dev, "%s: hw head:%u sw head:%u\n", __func__, idx,
561 	    qp->cq_head);
562 	completed = 0;
563 	while (qp->cq_head != idx) {
564 		DPRINTF(sc->dev, "%s: completing:%u\n", __func__, qp->cq_head);
565 
566 		cctx = &qp->completions_ring[qp->cq_head];
567 		if (cctx->callback_fn != NULL) {
568 			cctx->callback_fn(qp, cctx->session,
569 			    cctx->callback_arg, 0);
570 			cctx->callback_fn = NULL;
571 		}
572 
573 		/* Keep unused descriptors zero for next use. */
574 		memset(&qp->desc_ring[qp->cq_head], 0,
575 		    sizeof(qp->desc_ring[qp->cq_head]));
576 
577 		qp->cq_head = (qp->cq_head + 1) % (1 << sc->ring_size_order);
578 		completed++;
579 	}
580 	if (completed > 0) {
581 		qp->cq_waiting = false;
582 		wakeup(&qp->cq_tail);
583 	}
584 
585 	DPRINTF(sc->dev, "%s: wrote sw head:%u\n", __func__, qp->cq_head);
586 
587 	/*
588 	 * Desc points to the first incomplete descriptor, at the time we read
589 	 * HEAD_LO.  If there was an error flagged in interrupt status, the HW
590 	 * will not proceed past the erroneous descriptor by itself.
591 	 */
592 	desc = &qp->desc_ring[idx];
593 	if ((ints & INT_ERROR) != 0)
594 		ccp_intr_handle_error(qp, desc);
595 
596 	mtx_unlock(&qp->cq_lock);
597 }
598 
599 static void
600 ccp_intr_handler(void *arg)
601 {
602 	struct ccp_softc *sc = arg;
603 	size_t i;
604 	uint32_t ints;
605 
606 	DPRINTF(sc->dev, "%s: interrupt\n", __func__);
607 
608 	/*
609 	 * We get one global interrupt per PCI device, shared over all of
610 	 * its queues.  Scan each valid queue on interrupt for flags indicating
611 	 * activity.
612 	 */
613 	for (i = 0; i < nitems(sc->queues); i++) {
614 		if ((sc->valid_queues & (1 << i)) == 0)
615 			continue;
616 
617 		ints = ccp_read_queue_4(sc, i, CMD_Q_INTERRUPT_STATUS_BASE);
618 		if (ints == 0)
619 			continue;
620 
621 #if 0
622 		DPRINTF(sc->dev, "%s: %x interrupts on queue %zu\n", __func__,
623 		    (unsigned)ints, i);
624 #endif
625 		/* Write back 1s to clear interrupt status bits. */
626 		ccp_write_queue_4(sc, i, CMD_Q_INTERRUPT_STATUS_BASE, ints);
627 
628 		/*
629 		 * If there was an error, we still need to run completions on
630 		 * any descriptors prior to the error.  The completions handler
631 		 * invoked below will also handle the error descriptor.
632 		 */
633 		if ((ints & (INT_COMPLETION | INT_ERROR)) != 0)
634 			ccp_intr_run_completions(&sc->queues[i], ints);
635 
636 		if ((ints & INT_QUEUE_STOPPED) != 0)
637 			device_printf(sc->dev, "%s: queue %zu stopped\n",
638 			    __func__, i);
639 	}
640 
641 	/* Re-enable interrupts after processing */
642 	for (i = 0; i < nitems(sc->queues); i++) {
643 		if ((sc->valid_queues & (1 << i)) == 0)
644 			continue;
645 		ccp_write_queue_4(sc, i, CMD_Q_INT_ENABLE_BASE,
646 		    INT_COMPLETION | INT_ERROR | INT_QUEUE_STOPPED);
647 	}
648 }
649 
650 static int
651 ccp_intr_filter(void *arg)
652 {
653 	struct ccp_softc *sc = arg;
654 	size_t i;
655 
656 	/* TODO: Split individual queues into separate taskqueues? */
657 	for (i = 0; i < nitems(sc->queues); i++) {
658 		if ((sc->valid_queues & (1 << i)) == 0)
659 			continue;
660 
661 		/* Mask interrupt until task completes */
662 		ccp_write_queue_4(sc, i, CMD_Q_INT_ENABLE_BASE, 0);
663 	}
664 
665 	return (FILTER_SCHEDULE_THREAD);
666 }
667 
668 static int
669 ccp_setup_interrupts(struct ccp_softc *sc)
670 {
671 	uint32_t nvec;
672 	int rid, error, n, ridcopy;
673 
674 	n = pci_msix_count(sc->dev);
675 	if (n < 1) {
676 		device_printf(sc->dev, "%s: msix_count: %d\n", __func__, n);
677 		return (ENXIO);
678 	}
679 
680 	nvec = n;
681 	error = pci_alloc_msix(sc->dev, &nvec);
682 	if (error != 0) {
683 		device_printf(sc->dev, "%s: alloc_msix error: %d\n", __func__,
684 		    error);
685 		return (error);
686 	}
687 	if (nvec < 1) {
688 		device_printf(sc->dev, "%s: alloc_msix: 0 vectors\n",
689 		    __func__);
690 		return (ENXIO);
691 	}
692 	if (nvec > nitems(sc->intr_res)) {
693 		device_printf(sc->dev, "%s: too many vectors: %u\n", __func__,
694 		    nvec);
695 		nvec = nitems(sc->intr_res);
696 	}
697 
698 	for (rid = 1; rid < 1 + nvec; rid++) {
699 		ridcopy = rid;
700 		sc->intr_res[rid - 1] = bus_alloc_resource_any(sc->dev,
701 		    SYS_RES_IRQ, &ridcopy, RF_ACTIVE);
702 		if (sc->intr_res[rid - 1] == NULL) {
703 			device_printf(sc->dev, "%s: Failed to alloc IRQ resource\n",
704 			    __func__);
705 			return (ENXIO);
706 		}
707 
708 		sc->intr_tag[rid - 1] = NULL;
709 		error = bus_setup_intr(sc->dev, sc->intr_res[rid - 1],
710 		    INTR_MPSAFE | INTR_TYPE_MISC, ccp_intr_filter,
711 		    ccp_intr_handler, sc, &sc->intr_tag[rid - 1]);
712 		if (error != 0)
713 			device_printf(sc->dev, "%s: setup_intr: %d\n",
714 			    __func__, error);
715 	}
716 	sc->intr_count = nvec;
717 
718 	return (error);
719 }
720 
721 static void
722 ccp_release_interrupts(struct ccp_softc *sc)
723 {
724 	unsigned i;
725 
726 	for (i = 0; i < sc->intr_count; i++) {
727 		if (sc->intr_tag[i] != NULL)
728 			bus_teardown_intr(sc->dev, sc->intr_res[i],
729 			    sc->intr_tag[i]);
730 		if (sc->intr_res[i] != NULL)
731 			bus_release_resource(sc->dev, SYS_RES_IRQ,
732 			    rman_get_rid(sc->intr_res[i]), sc->intr_res[i]);
733 	}
734 
735 	pci_release_msi(sc->dev);
736 }
737 
738 int
739 ccp_hw_attach(device_t dev)
740 {
741 	struct ccp_softc *sc;
742 	uint64_t lsbmask;
743 	uint32_t version, lsbmasklo, lsbmaskhi;
744 	unsigned queue_idx, j;
745 	int error;
746 	bool bars_mapped, interrupts_setup;
747 
748 	queue_idx = 0;
749 	bars_mapped = interrupts_setup = false;
750 	sc = device_get_softc(dev);
751 
752 	error = ccp_map_pci_bar(dev);
753 	if (error != 0) {
754 		device_printf(dev, "%s: couldn't map BAR(s)\n", __func__);
755 		goto out;
756 	}
757 	bars_mapped = true;
758 
759 	error = pci_enable_busmaster(dev);
760 	if (error != 0) {
761 		device_printf(dev, "%s: couldn't enable busmaster\n",
762 		    __func__);
763 		goto out;
764 	}
765 
766 	sc->ring_size_order = g_ccp_ring_order;
767 	if (sc->ring_size_order < 6 || sc->ring_size_order > 16) {
768 		device_printf(dev, "bogus hw.ccp.ring_order\n");
769 		error = EINVAL;
770 		goto out;
771 	}
772 	sc->valid_queues = ccp_read_4(sc, CMD_QUEUE_MASK_OFFSET);
773 
774 	version = ccp_read_4(sc, VERSION_REG);
775 	if ((version & VERSION_NUM_MASK) < 5) {
776 		device_printf(dev,
777 		    "driver supports version 5 and later hardware\n");
778 		error = ENXIO;
779 		goto out;
780 	}
781 
782 	error = ccp_setup_interrupts(sc);
783 	if (error != 0)
784 		goto out;
785 	interrupts_setup = true;
786 
787 	sc->hw_version = version & VERSION_NUM_MASK;
788 	sc->num_queues = (version >> VERSION_NUMVQM_SHIFT) &
789 	    VERSION_NUMVQM_MASK;
790 	sc->num_lsb_entries = (version >> VERSION_LSBSIZE_SHIFT) &
791 	    VERSION_LSBSIZE_MASK;
792 	sc->hw_features = version & VERSION_CAP_MASK;
793 
794 	/*
795 	 * Copy private LSB mask to public registers to enable access to LSB
796 	 * from all queues allowed by BIOS.
797 	 */
798 	lsbmasklo = ccp_read_4(sc, LSB_PRIVATE_MASK_LO_OFFSET);
799 	lsbmaskhi = ccp_read_4(sc, LSB_PRIVATE_MASK_HI_OFFSET);
800 	ccp_write_4(sc, LSB_PUBLIC_MASK_LO_OFFSET, lsbmasklo);
801 	ccp_write_4(sc, LSB_PUBLIC_MASK_HI_OFFSET, lsbmaskhi);
802 
803 	lsbmask = ((uint64_t)lsbmaskhi << 30) | lsbmasklo;
804 
805 	for (; queue_idx < nitems(sc->queues); queue_idx++) {
806 		error = ccp_hw_attach_queue(dev, lsbmask, queue_idx);
807 		if (error != 0) {
808 			device_printf(dev, "%s: couldn't attach queue %u\n",
809 			    __func__, queue_idx);
810 			goto out;
811 		}
812 	}
813 	ccp_assign_lsb_regions(sc, lsbmask);
814 
815 out:
816 	if (error != 0) {
817 		if (interrupts_setup)
818 			ccp_release_interrupts(sc);
819 		for (j = 0; j < queue_idx; j++)
820 			ccp_hw_detach_queue(dev, j);
821 		if (sc->ring_size_order != 0)
822 			pci_disable_busmaster(dev);
823 		if (bars_mapped)
824 			ccp_unmap_pci_bar(dev);
825 	}
826 	return (error);
827 }
828 
829 void
830 ccp_hw_detach(device_t dev)
831 {
832 	struct ccp_softc *sc;
833 	unsigned i;
834 
835 	sc = device_get_softc(dev);
836 
837 	for (i = 0; i < nitems(sc->queues); i++)
838 		ccp_hw_detach_queue(dev, i);
839 
840 	ccp_release_interrupts(sc);
841 	pci_disable_busmaster(dev);
842 	ccp_unmap_pci_bar(dev);
843 }
844 
845 static int __must_check
846 ccp_passthrough(struct ccp_queue *qp, bus_addr_t dst,
847     enum ccp_memtype dst_type, bus_addr_t src, enum ccp_memtype src_type,
848     bus_size_t len, enum ccp_passthru_byteswap swapmode,
849     enum ccp_passthru_bitwise bitmode, bool interrupt,
850     const struct ccp_completion_ctx *cctx)
851 {
852 	struct ccp_desc *desc;
853 
854 	if (ccp_queue_get_ring_space(qp) == 0)
855 		return (EAGAIN);
856 
857 	desc = &qp->desc_ring[qp->cq_tail];
858 
859 	memset(desc, 0, sizeof(*desc));
860 	desc->engine = CCP_ENGINE_PASSTHRU;
861 
862 	desc->pt.ioc = interrupt;
863 	desc->pt.byteswap = swapmode;
864 	desc->pt.bitwise = bitmode;
865 	desc->length = len;
866 
867 	desc->src_lo = (uint32_t)src;
868 	desc->src_hi = src >> 32;
869 	desc->src_mem = src_type;
870 
871 	desc->dst_lo = (uint32_t)dst;
872 	desc->dst_hi = dst >> 32;
873 	desc->dst_mem = dst_type;
874 
875 	if (bitmode != CCP_PASSTHRU_BITWISE_NOOP)
876 		desc->lsb_ctx_id = ccp_queue_lsb_entry(qp, LSB_ENTRY_KEY);
877 
878 	if (cctx != NULL)
879 		memcpy(&qp->completions_ring[qp->cq_tail], cctx, sizeof(*cctx));
880 
881 	qp->cq_tail = (qp->cq_tail + 1) % (1 << qp->cq_softc->ring_size_order);
882 	return (0);
883 }
884 
885 static int __must_check
886 ccp_passthrough_sgl(struct ccp_queue *qp, bus_addr_t lsb_addr, bool tolsb,
887     struct sglist *sgl, bus_size_t len, bool interrupt,
888     const struct ccp_completion_ctx *cctx)
889 {
890 	struct sglist_seg *seg;
891 	size_t i, remain, nb;
892 	int error;
893 
894 	remain = len;
895 	for (i = 0; i < sgl->sg_nseg && remain != 0; i++) {
896 		seg = &sgl->sg_segs[i];
897 		/* crd_len is int, so 32-bit min() is ok. */
898 		nb = min(remain, seg->ss_len);
899 
900 		if (tolsb)
901 			error = ccp_passthrough(qp, lsb_addr, CCP_MEMTYPE_SB,
902 			    seg->ss_paddr, CCP_MEMTYPE_SYSTEM, nb,
903 			    CCP_PASSTHRU_BYTESWAP_NOOP,
904 			    CCP_PASSTHRU_BITWISE_NOOP,
905 			    (nb == remain) && interrupt, cctx);
906 		else
907 			error = ccp_passthrough(qp, seg->ss_paddr,
908 			    CCP_MEMTYPE_SYSTEM, lsb_addr, CCP_MEMTYPE_SB, nb,
909 			    CCP_PASSTHRU_BYTESWAP_NOOP,
910 			    CCP_PASSTHRU_BITWISE_NOOP,
911 			    (nb == remain) && interrupt, cctx);
912 		if (error != 0)
913 			return (error);
914 
915 		remain -= nb;
916 	}
917 	return (0);
918 }
919 
920 /*
921  * Note that these vectors are in reverse of the usual order.
922  */
923 const struct SHA_vectors {
924 	uint32_t SHA1[8];
925 	uint32_t SHA224[8];
926 	uint32_t SHA256[8];
927 	uint64_t SHA384[8];
928 	uint64_t SHA512[8];
929 } SHA_H __aligned(PAGE_SIZE) = {
930 	.SHA1 = {
931 		0xc3d2e1f0ul,
932 		0x10325476ul,
933 		0x98badcfeul,
934 		0xefcdab89ul,
935 		0x67452301ul,
936 		0,
937 		0,
938 		0,
939 	},
940 	.SHA224 = {
941 		0xbefa4fa4ul,
942 		0x64f98fa7ul,
943 		0x68581511ul,
944 		0xffc00b31ul,
945 		0xf70e5939ul,
946 		0x3070dd17ul,
947 		0x367cd507ul,
948 		0xc1059ed8ul,
949 	},
950 	.SHA256 = {
951 		0x5be0cd19ul,
952 		0x1f83d9abul,
953 		0x9b05688cul,
954 		0x510e527ful,
955 		0xa54ff53aul,
956 		0x3c6ef372ul,
957 		0xbb67ae85ul,
958 		0x6a09e667ul,
959 	},
960 	.SHA384 = {
961 		0x47b5481dbefa4fa4ull,
962 		0xdb0c2e0d64f98fa7ull,
963 		0x8eb44a8768581511ull,
964 		0x67332667ffc00b31ull,
965 		0x152fecd8f70e5939ull,
966 		0x9159015a3070dd17ull,
967 		0x629a292a367cd507ull,
968 		0xcbbb9d5dc1059ed8ull,
969 	},
970 	.SHA512 = {
971 		0x5be0cd19137e2179ull,
972 		0x1f83d9abfb41bd6bull,
973 		0x9b05688c2b3e6c1full,
974 		0x510e527fade682d1ull,
975 		0xa54ff53a5f1d36f1ull,
976 		0x3c6ef372fe94f82bull,
977 		0xbb67ae8584caa73bull,
978 		0x6a09e667f3bcc908ull,
979 	},
980 };
981 /*
982  * Ensure vectors do not cross a page boundary.
983  *
984  * Disabled due to a new Clang error:  "expression is not an integral constant
985  * expression."  GCC (cross toolchain) seems to handle this assertion with
986  * _Static_assert just fine.
987  */
988 #if 0
989 CTASSERT(PAGE_SIZE - ((uintptr_t)&SHA_H % PAGE_SIZE) >= sizeof(SHA_H));
990 #endif
991 
992 const struct SHA_Defn {
993 	enum sha_version version;
994 	const void *H_vectors;
995 	size_t H_size;
996 	struct auth_hash *axf;
997 	enum ccp_sha_type engine_type;
998 } SHA_definitions[] = {
999 	{
1000 		.version = SHA1,
1001 		.H_vectors = SHA_H.SHA1,
1002 		.H_size = sizeof(SHA_H.SHA1),
1003 		.axf = &auth_hash_hmac_sha1,
1004 		.engine_type = CCP_SHA_TYPE_1,
1005 	},
1006 #if 0
1007 	{
1008 		.version = SHA2_224,
1009 		.H_vectors = SHA_H.SHA224,
1010 		.H_size = sizeof(SHA_H.SHA224),
1011 		.axf = &auth_hash_hmac_sha2_224,
1012 		.engine_type = CCP_SHA_TYPE_224,
1013 	},
1014 #endif
1015 	{
1016 		.version = SHA2_256,
1017 		.H_vectors = SHA_H.SHA256,
1018 		.H_size = sizeof(SHA_H.SHA256),
1019 		.axf = &auth_hash_hmac_sha2_256,
1020 		.engine_type = CCP_SHA_TYPE_256,
1021 	},
1022 	{
1023 		.version = SHA2_384,
1024 		.H_vectors = SHA_H.SHA384,
1025 		.H_size = sizeof(SHA_H.SHA384),
1026 		.axf = &auth_hash_hmac_sha2_384,
1027 		.engine_type = CCP_SHA_TYPE_384,
1028 	},
1029 	{
1030 		.version = SHA2_512,
1031 		.H_vectors = SHA_H.SHA512,
1032 		.H_size = sizeof(SHA_H.SHA512),
1033 		.axf = &auth_hash_hmac_sha2_512,
1034 		.engine_type = CCP_SHA_TYPE_512,
1035 	},
1036 };
1037 
1038 static int __must_check
1039 ccp_sha_single_desc(struct ccp_queue *qp, const struct SHA_Defn *defn,
1040     vm_paddr_t addr, size_t len, bool start, bool end, uint64_t msgbits)
1041 {
1042 	struct ccp_desc *desc;
1043 
1044 	if (ccp_queue_get_ring_space(qp) == 0)
1045 		return (EAGAIN);
1046 
1047 	desc = &qp->desc_ring[qp->cq_tail];
1048 
1049 	memset(desc, 0, sizeof(*desc));
1050 	desc->engine = CCP_ENGINE_SHA;
1051 	desc->som = start;
1052 	desc->eom = end;
1053 
1054 	desc->sha.type = defn->engine_type;
1055 	desc->length = len;
1056 
1057 	if (end) {
1058 		desc->sha_len_lo = (uint32_t)msgbits;
1059 		desc->sha_len_hi = msgbits >> 32;
1060 	}
1061 
1062 	desc->src_lo = (uint32_t)addr;
1063 	desc->src_hi = addr >> 32;
1064 	desc->src_mem = CCP_MEMTYPE_SYSTEM;
1065 
1066 	desc->lsb_ctx_id = ccp_queue_lsb_entry(qp, LSB_ENTRY_SHA);
1067 
1068 	qp->cq_tail = (qp->cq_tail + 1) % (1 << qp->cq_softc->ring_size_order);
1069 	return (0);
1070 }
1071 
1072 static int __must_check
1073 ccp_sha(struct ccp_queue *qp, enum sha_version version, struct sglist *sgl_src,
1074     struct sglist *sgl_dst, const struct ccp_completion_ctx *cctx)
1075 {
1076 	const struct SHA_Defn *defn;
1077 	struct sglist_seg *seg;
1078 	size_t i, msgsize, remaining, nb;
1079 	uint32_t lsbaddr;
1080 	int error;
1081 
1082 	for (i = 0; i < nitems(SHA_definitions); i++)
1083 		if (SHA_definitions[i].version == version)
1084 			break;
1085 	if (i == nitems(SHA_definitions))
1086 		return (EINVAL);
1087 	defn = &SHA_definitions[i];
1088 
1089 	/* XXX validate input ??? */
1090 
1091 	/* Load initial SHA state into LSB */
1092 	/* XXX ensure H_vectors don't span page boundaries */
1093 	error = ccp_passthrough(qp, ccp_queue_lsb_address(qp, LSB_ENTRY_SHA),
1094 	    CCP_MEMTYPE_SB, pmap_kextract((vm_offset_t)defn->H_vectors),
1095 	    CCP_MEMTYPE_SYSTEM, roundup2(defn->H_size, LSB_ENTRY_SIZE),
1096 	    CCP_PASSTHRU_BYTESWAP_NOOP, CCP_PASSTHRU_BITWISE_NOOP, false,
1097 	    NULL);
1098 	if (error != 0)
1099 		return (error);
1100 
1101 	/* Execute series of SHA updates on correctly sized buffers */
1102 	msgsize = 0;
1103 	for (i = 0; i < sgl_src->sg_nseg; i++) {
1104 		seg = &sgl_src->sg_segs[i];
1105 		msgsize += seg->ss_len;
1106 		error = ccp_sha_single_desc(qp, defn, seg->ss_paddr,
1107 		    seg->ss_len, i == 0, i == sgl_src->sg_nseg - 1,
1108 		    msgsize << 3);
1109 		if (error != 0)
1110 			return (error);
1111 	}
1112 
1113 	/* Copy result out to sgl_dst */
1114 	remaining = roundup2(defn->H_size, LSB_ENTRY_SIZE);
1115 	lsbaddr = ccp_queue_lsb_address(qp, LSB_ENTRY_SHA);
1116 	for (i = 0; i < sgl_dst->sg_nseg; i++) {
1117 		seg = &sgl_dst->sg_segs[i];
1118 		/* crd_len is int, so 32-bit min() is ok. */
1119 		nb = min(remaining, seg->ss_len);
1120 
1121 		error = ccp_passthrough(qp, seg->ss_paddr, CCP_MEMTYPE_SYSTEM,
1122 		    lsbaddr, CCP_MEMTYPE_SB, nb, CCP_PASSTHRU_BYTESWAP_NOOP,
1123 		    CCP_PASSTHRU_BITWISE_NOOP,
1124 		    (cctx != NULL) ? (nb == remaining) : false,
1125 		    (nb == remaining) ? cctx : NULL);
1126 		if (error != 0)
1127 			return (error);
1128 
1129 		remaining -= nb;
1130 		lsbaddr += nb;
1131 		if (remaining == 0)
1132 			break;
1133 	}
1134 
1135 	return (0);
1136 }
1137 
/*
 * Reverse the byte order of a 256-bit value held in four 64-bit words, in
 * place: the words trade places end-for-end and each word is byte-swapped.
 */
static void
byteswap256(uint64_t *buffer)
{
	uint64_t front, back;
	unsigned w;

	for (w = 0; w < 2; w++) {
		front = buffer[w];
		back = buffer[3 - w];
		buffer[w] = bswap64(back);
		buffer[3 - w] = bswap64(front);
	}
}
1151 
1152 /*
1153  * Translate CCP internal LSB hash format into a standard hash ouput.
1154  *
1155  * Manipulates input buffer with byteswap256 operation.
1156  */
1157 static void
1158 ccp_sha_copy_result(char *output, char *buffer, enum sha_version version)
1159 {
1160 	const struct SHA_Defn *defn;
1161 	size_t i;
1162 
1163 	for (i = 0; i < nitems(SHA_definitions); i++)
1164 		if (SHA_definitions[i].version == version)
1165 			break;
1166 	if (i == nitems(SHA_definitions))
1167 		panic("bogus sha version auth_mode %u\n", (unsigned)version);
1168 
1169 	defn = &SHA_definitions[i];
1170 
1171 	/* Swap 256bit manually -- DMA engine can, but with limitations */
1172 	byteswap256((void *)buffer);
1173 	if (defn->axf->hashsize > LSB_ENTRY_SIZE)
1174 		byteswap256((void *)(buffer + LSB_ENTRY_SIZE));
1175 
1176 	switch (defn->version) {
1177 	case SHA1:
1178 		memcpy(output, buffer + 12, defn->axf->hashsize);
1179 		break;
1180 #if 0
1181 	case SHA2_224:
1182 		memcpy(output, buffer + XXX, defn->axf->hashsize);
1183 		break;
1184 #endif
1185 	case SHA2_256:
1186 		memcpy(output, buffer, defn->axf->hashsize);
1187 		break;
1188 	case SHA2_384:
1189 		memcpy(output,
1190 		    buffer + LSB_ENTRY_SIZE * 3 - defn->axf->hashsize,
1191 		    defn->axf->hashsize - LSB_ENTRY_SIZE);
1192 		memcpy(output + defn->axf->hashsize - LSB_ENTRY_SIZE, buffer,
1193 		    LSB_ENTRY_SIZE);
1194 		break;
1195 	case SHA2_512:
1196 		memcpy(output, buffer + LSB_ENTRY_SIZE, LSB_ENTRY_SIZE);
1197 		memcpy(output + LSB_ENTRY_SIZE, buffer, LSB_ENTRY_SIZE);
1198 		break;
1199 	}
1200 }
1201 
1202 static void
1203 ccp_do_hmac_done(struct ccp_queue *qp, struct ccp_session *s,
1204     struct cryptop *crp, struct cryptodesc *crd, int error)
1205 {
1206 	char ihash[SHA2_512_HASH_LEN /* max hash len */];
1207 	union authctx auth_ctx;
1208 	struct auth_hash *axf;
1209 
1210 	axf = s->hmac.auth_hash;
1211 
1212 	s->pending--;
1213 
1214 	if (error != 0) {
1215 		crp->crp_etype = error;
1216 		goto out;
1217 	}
1218 
1219 	/* Do remaining outer hash over small inner hash in software */
1220 	axf->Init(&auth_ctx);
1221 	axf->Update(&auth_ctx, s->hmac.opad, axf->blocksize);
1222 	ccp_sha_copy_result(ihash, s->hmac.ipad, s->hmac.auth_mode);
1223 #if 0
1224 	INSECURE_DEBUG(dev, "%s sha intermediate=%64D\n", __func__,
1225 	    (u_char *)ihash, " ");
1226 #endif
1227 	axf->Update(&auth_ctx, ihash, axf->hashsize);
1228 	axf->Final(s->hmac.ipad, &auth_ctx);
1229 
1230 	crypto_copyback(crp->crp_flags, crp->crp_buf, crd->crd_inject,
1231 	    s->hmac.hash_len, s->hmac.ipad);
1232 
1233 	/* Avoid leaking key material */
1234 	explicit_bzero(&auth_ctx, sizeof(auth_ctx));
1235 	explicit_bzero(s->hmac.ipad, sizeof(s->hmac.ipad));
1236 	explicit_bzero(s->hmac.opad, sizeof(s->hmac.opad));
1237 
1238 out:
1239 	crypto_done(crp);
1240 }
1241 
1242 static void
1243 ccp_hmac_done(struct ccp_queue *qp, struct ccp_session *s, void *vcrp,
1244     int error)
1245 {
1246 	struct cryptodesc *crd;
1247 	struct cryptop *crp;
1248 
1249 	crp = vcrp;
1250 	crd = crp->crp_desc;
1251 	ccp_do_hmac_done(qp, s, crp, crd, error);
1252 }
1253 
1254 static int __must_check
1255 ccp_do_hmac(struct ccp_queue *qp, struct ccp_session *s, struct cryptop *crp,
1256     struct cryptodesc *crd, const struct ccp_completion_ctx *cctx)
1257 {
1258 	device_t dev;
1259 	struct auth_hash *axf;
1260 	int error;
1261 
1262 	dev = qp->cq_softc->dev;
1263 	axf = s->hmac.auth_hash;
1264 
1265 	/*
1266 	 * Populate the SGL describing inside hash contents.  We want to hash
1267 	 * the ipad (key XOR fixed bit pattern) concatenated with the user
1268 	 * data.
1269 	 */
1270 	sglist_reset(qp->cq_sg_ulptx);
1271 	error = sglist_append(qp->cq_sg_ulptx, s->hmac.ipad, axf->blocksize);
1272 	if (error != 0)
1273 		return (error);
1274 	error = sglist_append_sglist(qp->cq_sg_ulptx, qp->cq_sg_crp,
1275 	    crd->crd_skip, crd->crd_len);
1276 	if (error != 0) {
1277 		DPRINTF(dev, "%s: sglist too short\n", __func__);
1278 		return (error);
1279 	}
1280 	/* Populate SGL for output -- just reuse hmac.ipad buffer. */
1281 	sglist_reset(qp->cq_sg_dst);
1282 	error = sglist_append(qp->cq_sg_dst, s->hmac.ipad,
1283 	    roundup2(axf->hashsize, LSB_ENTRY_SIZE));
1284 	if (error != 0)
1285 		return (error);
1286 
1287 	error = ccp_sha(qp, s->hmac.auth_mode, qp->cq_sg_ulptx, qp->cq_sg_dst,
1288 	    cctx);
1289 	if (error != 0) {
1290 		DPRINTF(dev, "%s: ccp_sha error\n", __func__);
1291 		return (error);
1292 	}
1293 	return (0);
1294 }
1295 
1296 int __must_check
1297 ccp_hmac(struct ccp_queue *qp, struct ccp_session *s, struct cryptop *crp)
1298 {
1299 	struct ccp_completion_ctx ctx;
1300 	struct cryptodesc *crd;
1301 
1302 	crd = crp->crp_desc;
1303 
1304 	ctx.callback_fn = ccp_hmac_done;
1305 	ctx.callback_arg = crp;
1306 	ctx.session = s;
1307 
1308 	return (ccp_do_hmac(qp, s, crp, crd, &ctx));
1309 }
1310 
/*
 * Reverse the order of the `len` bytes at `data`, in place.
 *
 * Buffers shorter than two bytes are left untouched.  In particular a
 * zero length is a no-op and `data` is not dereferenced, so (NULL, 0) is
 * safe to pass.
 */
static void
ccp_byteswap(char *data, size_t len)
{
	size_t lo, hi;
	char t;

	/* Nothing to do, and `len - 1` below must not wrap around. */
	if (len < 2)
		return;

	for (lo = 0, hi = len - 1; lo < hi; lo++, hi--) {
		t = data[lo];
		data[lo] = data[hi];
		data[hi] = t;
	}
}
1324 
1325 static void
1326 ccp_blkcipher_done(struct ccp_queue *qp, struct ccp_session *s, void *vcrp,
1327     int error)
1328 {
1329 	struct cryptop *crp;
1330 
1331 	explicit_bzero(&s->blkcipher, sizeof(s->blkcipher));
1332 
1333 	crp = vcrp;
1334 
1335 	s->pending--;
1336 
1337 	if (error != 0)
1338 		crp->crp_etype = error;
1339 
1340 	DPRINTF(qp->cq_softc->dev, "%s: qp=%p crp=%p\n", __func__, qp, crp);
1341 	crypto_done(crp);
1342 }
1343 
1344 static void
1345 ccp_collect_iv(struct ccp_session *s, struct cryptop *crp,
1346     struct cryptodesc *crd)
1347 {
1348 
1349 	if (crd->crd_flags & CRD_F_ENCRYPT) {
1350 		if (crd->crd_flags & CRD_F_IV_EXPLICIT)
1351 			memcpy(s->blkcipher.iv, crd->crd_iv,
1352 			    s->blkcipher.iv_len);
1353 		else
1354 			arc4rand(s->blkcipher.iv, s->blkcipher.iv_len, 0);
1355 		if ((crd->crd_flags & CRD_F_IV_PRESENT) == 0)
1356 			crypto_copyback(crp->crp_flags, crp->crp_buf,
1357 			    crd->crd_inject, s->blkcipher.iv_len,
1358 			    s->blkcipher.iv);
1359 	} else {
1360 		if (crd->crd_flags & CRD_F_IV_EXPLICIT)
1361 			memcpy(s->blkcipher.iv, crd->crd_iv,
1362 			    s->blkcipher.iv_len);
1363 		else
1364 			crypto_copydata(crp->crp_flags, crp->crp_buf,
1365 			    crd->crd_inject, s->blkcipher.iv_len,
1366 			    s->blkcipher.iv);
1367 	}
1368 
1369 	/*
1370 	 * If the input IV is 12 bytes, append an explicit counter of 1.
1371 	 */
1372 	if (crd->crd_alg == CRYPTO_AES_NIST_GCM_16 &&
1373 	    s->blkcipher.iv_len == 12) {
1374 		*(uint32_t *)&s->blkcipher.iv[12] = htobe32(1);
1375 		s->blkcipher.iv_len = AES_BLOCK_LEN;
1376 	}
1377 
1378 	if (crd->crd_alg == CRYPTO_AES_XTS && s->blkcipher.iv_len != AES_BLOCK_LEN) {
1379 		DPRINTF(NULL, "got ivlen != 16: %u\n", s->blkcipher.iv_len);
1380 		if (s->blkcipher.iv_len < AES_BLOCK_LEN)
1381 			memset(&s->blkcipher.iv[s->blkcipher.iv_len], 0,
1382 			    AES_BLOCK_LEN - s->blkcipher.iv_len);
1383 		s->blkcipher.iv_len = AES_BLOCK_LEN;
1384 	}
1385 
1386 	/* Reverse order of IV material for HW */
1387 	INSECURE_DEBUG(NULL, "%s: IV: %16D len: %u\n", __func__,
1388 	    s->blkcipher.iv, " ", s->blkcipher.iv_len);
1389 
1390 	/*
1391 	 * For unknown reasons, XTS mode expects the IV in the reverse byte
1392 	 * order to every other AES mode.
1393 	 */
1394 	if (crd->crd_alg != CRYPTO_AES_XTS)
1395 		ccp_byteswap(s->blkcipher.iv, s->blkcipher.iv_len);
1396 }
1397 
1398 static int __must_check
1399 ccp_do_pst_to_lsb(struct ccp_queue *qp, uint32_t lsbaddr, const void *src,
1400     size_t len)
1401 {
1402 	int error;
1403 
1404 	sglist_reset(qp->cq_sg_ulptx);
1405 	error = sglist_append(qp->cq_sg_ulptx, __DECONST(void *, src), len);
1406 	if (error != 0)
1407 		return (error);
1408 
1409 	error = ccp_passthrough_sgl(qp, lsbaddr, true, qp->cq_sg_ulptx, len,
1410 	    false, NULL);
1411 	return (error);
1412 }
1413 
1414 static int __must_check
1415 ccp_do_xts(struct ccp_queue *qp, struct ccp_session *s, struct cryptop *crp,
1416     struct cryptodesc *crd, enum ccp_cipher_dir dir,
1417     const struct ccp_completion_ctx *cctx)
1418 {
1419 	struct ccp_desc *desc;
1420 	device_t dev;
1421 	unsigned i;
1422 	enum ccp_xts_unitsize usize;
1423 
1424 	/* IV and Key data are already loaded */
1425 
1426 	dev = qp->cq_softc->dev;
1427 
1428 	for (i = 0; i < nitems(ccp_xts_unitsize_map); i++)
1429 		if (ccp_xts_unitsize_map[i].cxu_size == crd->crd_len) {
1430 			usize = ccp_xts_unitsize_map[i].cxu_id;
1431 			break;
1432 		}
1433 	if (i >= nitems(ccp_xts_unitsize_map))
1434 		return (EINVAL);
1435 
1436 	for (i = 0; i < qp->cq_sg_ulptx->sg_nseg; i++) {
1437 		struct sglist_seg *seg;
1438 
1439 		seg = &qp->cq_sg_ulptx->sg_segs[i];
1440 
1441 		desc = &qp->desc_ring[qp->cq_tail];
1442 		desc->engine = CCP_ENGINE_XTS_AES;
1443 		desc->som = (i == 0);
1444 		desc->eom = (i == qp->cq_sg_ulptx->sg_nseg - 1);
1445 		desc->ioc = (desc->eom && cctx != NULL);
1446 		DPRINTF(dev, "%s: XTS %u: som:%d eom:%d ioc:%d dir:%d\n",
1447 		    __func__, qp->cq_tail, (int)desc->som, (int)desc->eom,
1448 		    (int)desc->ioc, (int)dir);
1449 
1450 		if (desc->ioc)
1451 			memcpy(&qp->completions_ring[qp->cq_tail], cctx,
1452 			    sizeof(*cctx));
1453 
1454 		desc->aes_xts.encrypt = dir;
1455 		desc->aes_xts.type = s->blkcipher.cipher_type;
1456 		desc->aes_xts.size = usize;
1457 
1458 		DPRINTF(dev, "XXX %s: XTS %u: type:%u size:%u\n", __func__,
1459 		    qp->cq_tail, (unsigned)desc->aes_xts.type,
1460 		    (unsigned)desc->aes_xts.size);
1461 
1462 		desc->length = seg->ss_len;
1463 		desc->src_lo = (uint32_t)seg->ss_paddr;
1464 		desc->src_hi = (seg->ss_paddr >> 32);
1465 		desc->src_mem = CCP_MEMTYPE_SYSTEM;
1466 
1467 		/* Crypt in-place */
1468 		desc->dst_lo = desc->src_lo;
1469 		desc->dst_hi = desc->src_hi;
1470 		desc->dst_mem = desc->src_mem;
1471 
1472 		desc->key_lo = ccp_queue_lsb_address(qp, LSB_ENTRY_KEY);
1473 		desc->key_hi = 0;
1474 		desc->key_mem = CCP_MEMTYPE_SB;
1475 
1476 		desc->lsb_ctx_id = ccp_queue_lsb_entry(qp, LSB_ENTRY_IV);
1477 
1478 		qp->cq_tail = (qp->cq_tail + 1) %
1479 		    (1 << qp->cq_softc->ring_size_order);
1480 	}
1481 	return (0);
1482 }
1483 
1484 static int __must_check
1485 ccp_do_blkcipher(struct ccp_queue *qp, struct ccp_session *s,
1486     struct cryptop *crp, struct cryptodesc *crd,
1487     const struct ccp_completion_ctx *cctx)
1488 {
1489 	struct ccp_desc *desc;
1490 	char *keydata;
1491 	device_t dev;
1492 	enum ccp_cipher_dir dir;
1493 	int error;
1494 	size_t keydata_len;
1495 	unsigned i, j;
1496 
1497 	dev = qp->cq_softc->dev;
1498 
1499 	if (s->blkcipher.key_len == 0 || crd->crd_len == 0) {
1500 		DPRINTF(dev, "%s: empty\n", __func__);
1501 		return (EINVAL);
1502 	}
1503 	if ((crd->crd_len % AES_BLOCK_LEN) != 0) {
1504 		DPRINTF(dev, "%s: len modulo: %d\n", __func__, crd->crd_len);
1505 		return (EINVAL);
1506 	}
1507 
1508 	/*
1509 	 * Individual segments must be multiples of AES block size for the HW
1510 	 * to process it.  Non-compliant inputs aren't bogus, just not doable
1511 	 * on this hardware.
1512 	 */
1513 	for (i = 0; i < qp->cq_sg_crp->sg_nseg; i++)
1514 		if ((qp->cq_sg_crp->sg_segs[i].ss_len % AES_BLOCK_LEN) != 0) {
1515 			DPRINTF(dev, "%s: seg modulo: %zu\n", __func__,
1516 			    qp->cq_sg_crp->sg_segs[i].ss_len);
1517 			return (EINVAL);
1518 		}
1519 
1520 	/* Gather IV/nonce data */
1521 	ccp_collect_iv(s, crp, crd);
1522 
1523 	if ((crd->crd_flags & CRD_F_ENCRYPT) != 0)
1524 		dir = CCP_CIPHER_DIR_ENCRYPT;
1525 	else
1526 		dir = CCP_CIPHER_DIR_DECRYPT;
1527 
1528 	/* Set up passthrough op(s) to copy IV into LSB */
1529 	error = ccp_do_pst_to_lsb(qp, ccp_queue_lsb_address(qp, LSB_ENTRY_IV),
1530 	    s->blkcipher.iv, s->blkcipher.iv_len);
1531 	if (error != 0)
1532 		return (error);
1533 
1534 	/*
1535 	 * Initialize keydata and keydata_len for GCC.  The default case of the
1536 	 * following switch is impossible to reach, but GCC doesn't know that.
1537 	 */
1538 	keydata_len = 0;
1539 	keydata = NULL;
1540 
1541 	switch (crd->crd_alg) {
1542 	case CRYPTO_AES_XTS:
1543 		for (j = 0; j < nitems(ccp_xts_unitsize_map); j++)
1544 			if (ccp_xts_unitsize_map[j].cxu_size == crd->crd_len)
1545 				break;
1546 		/* Input buffer must be a supported UnitSize */
1547 		if (j >= nitems(ccp_xts_unitsize_map)) {
1548 			device_printf(dev, "%s: rejected block size: %u\n",
1549 			    __func__, crd->crd_len);
1550 			return (EOPNOTSUPP);
1551 		}
1552 		/* FALLTHROUGH */
1553 	case CRYPTO_AES_CBC:
1554 	case CRYPTO_AES_ICM:
1555 		keydata = s->blkcipher.enckey;
1556 		keydata_len = s->blkcipher.key_len;
1557 		break;
1558 	}
1559 
1560 	INSECURE_DEBUG(dev, "%s: KEY(%zu): %16D\n", __func__, keydata_len,
1561 	    keydata, " ");
1562 	if (crd->crd_alg == CRYPTO_AES_XTS)
1563 		INSECURE_DEBUG(dev, "%s: KEY(XTS): %64D\n", __func__, keydata, " ");
1564 
1565 	/* Reverse order of key material for HW */
1566 	ccp_byteswap(keydata, keydata_len);
1567 
1568 	/* Store key material into LSB to avoid page boundaries */
1569 	if (crd->crd_alg == CRYPTO_AES_XTS) {
1570 		/*
1571 		 * XTS mode uses 2 256-bit vectors for the primary key and the
1572 		 * tweak key.  For 128-bit keys, the vectors are zero-padded.
1573 		 *
1574 		 * After byteswapping the combined OCF-provided K1:K2 vector
1575 		 * above, we need to reverse the order again so the hardware
1576 		 * gets the swapped keys in the order K1':K2'.
1577 		 */
1578 		error = ccp_do_pst_to_lsb(qp,
1579 		    ccp_queue_lsb_address(qp, LSB_ENTRY_KEY + 1), keydata,
1580 		    keydata_len / 2);
1581 		if (error != 0)
1582 			return (error);
1583 		error = ccp_do_pst_to_lsb(qp,
1584 		    ccp_queue_lsb_address(qp, LSB_ENTRY_KEY),
1585 		    keydata + (keydata_len / 2), keydata_len / 2);
1586 
1587 		/* Zero-pad 128 bit keys */
1588 		if (keydata_len == 32) {
1589 			if (error != 0)
1590 				return (error);
1591 			error = ccp_do_pst_to_lsb(qp,
1592 			    ccp_queue_lsb_address(qp, LSB_ENTRY_KEY) +
1593 			    keydata_len / 2, g_zeroes, keydata_len / 2);
1594 			if (error != 0)
1595 				return (error);
1596 			error = ccp_do_pst_to_lsb(qp,
1597 			    ccp_queue_lsb_address(qp, LSB_ENTRY_KEY + 1) +
1598 			    keydata_len / 2, g_zeroes, keydata_len / 2);
1599 		}
1600 	} else
1601 		error = ccp_do_pst_to_lsb(qp,
1602 		    ccp_queue_lsb_address(qp, LSB_ENTRY_KEY), keydata,
1603 		    keydata_len);
1604 	if (error != 0)
1605 		return (error);
1606 
1607 	/*
1608 	 * Point SGLs at the subset of cryptop buffer contents representing the
1609 	 * data.
1610 	 */
1611 	sglist_reset(qp->cq_sg_ulptx);
1612 	error = sglist_append_sglist(qp->cq_sg_ulptx, qp->cq_sg_crp,
1613 	    crd->crd_skip, crd->crd_len);
1614 	if (error != 0)
1615 		return (error);
1616 
1617 	INSECURE_DEBUG(dev, "%s: Contents: %16D\n", __func__,
1618 	    (void *)PHYS_TO_DMAP(qp->cq_sg_ulptx->sg_segs[0].ss_paddr), " ");
1619 
1620 	DPRINTF(dev, "%s: starting AES ops @ %u\n", __func__, qp->cq_tail);
1621 
1622 	if (ccp_queue_get_ring_space(qp) < qp->cq_sg_ulptx->sg_nseg)
1623 		return (EAGAIN);
1624 
1625 	if (crd->crd_alg == CRYPTO_AES_XTS)
1626 		return (ccp_do_xts(qp, s, crp, crd, dir, cctx));
1627 
1628 	for (i = 0; i < qp->cq_sg_ulptx->sg_nseg; i++) {
1629 		struct sglist_seg *seg;
1630 
1631 		seg = &qp->cq_sg_ulptx->sg_segs[i];
1632 
1633 		desc = &qp->desc_ring[qp->cq_tail];
1634 		desc->engine = CCP_ENGINE_AES;
1635 		desc->som = (i == 0);
1636 		desc->eom = (i == qp->cq_sg_ulptx->sg_nseg - 1);
1637 		desc->ioc = (desc->eom && cctx != NULL);
1638 		DPRINTF(dev, "%s: AES %u: som:%d eom:%d ioc:%d dir:%d\n",
1639 		    __func__, qp->cq_tail, (int)desc->som, (int)desc->eom,
1640 		    (int)desc->ioc, (int)dir);
1641 
1642 		if (desc->ioc)
1643 			memcpy(&qp->completions_ring[qp->cq_tail], cctx,
1644 			    sizeof(*cctx));
1645 
1646 		desc->aes.encrypt = dir;
1647 		desc->aes.mode = s->blkcipher.cipher_mode;
1648 		desc->aes.type = s->blkcipher.cipher_type;
1649 		if (crd->crd_alg == CRYPTO_AES_ICM)
1650 			/*
1651 			 * Size of CTR value in bits, - 1.  ICM mode uses all
1652 			 * 128 bits as counter.
1653 			 */
1654 			desc->aes.size = 127;
1655 
1656 		DPRINTF(dev, "%s: AES %u: mode:%u type:%u size:%u\n", __func__,
1657 		    qp->cq_tail, (unsigned)desc->aes.mode,
1658 		    (unsigned)desc->aes.type, (unsigned)desc->aes.size);
1659 
1660 		desc->length = seg->ss_len;
1661 		desc->src_lo = (uint32_t)seg->ss_paddr;
1662 		desc->src_hi = (seg->ss_paddr >> 32);
1663 		desc->src_mem = CCP_MEMTYPE_SYSTEM;
1664 
1665 		/* Crypt in-place */
1666 		desc->dst_lo = desc->src_lo;
1667 		desc->dst_hi = desc->src_hi;
1668 		desc->dst_mem = desc->src_mem;
1669 
1670 		desc->key_lo = ccp_queue_lsb_address(qp, LSB_ENTRY_KEY);
1671 		desc->key_hi = 0;
1672 		desc->key_mem = CCP_MEMTYPE_SB;
1673 
1674 		desc->lsb_ctx_id = ccp_queue_lsb_entry(qp, LSB_ENTRY_IV);
1675 
1676 		qp->cq_tail = (qp->cq_tail + 1) %
1677 		    (1 << qp->cq_softc->ring_size_order);
1678 	}
1679 	return (0);
1680 }
1681 
1682 int __must_check
1683 ccp_blkcipher(struct ccp_queue *qp, struct ccp_session *s, struct cryptop *crp)
1684 {
1685 	struct ccp_completion_ctx ctx;
1686 	struct cryptodesc *crd;
1687 
1688 	crd = crp->crp_desc;
1689 
1690 	ctx.callback_fn = ccp_blkcipher_done;
1691 	ctx.session = s;
1692 	ctx.callback_arg = crp;
1693 
1694 	return (ccp_do_blkcipher(qp, s, crp, crd, &ctx));
1695 }
1696 
1697 static void
1698 ccp_authenc_done(struct ccp_queue *qp, struct ccp_session *s, void *vcrp,
1699     int error)
1700 {
1701 	struct cryptodesc *crda;
1702 	struct cryptop *crp;
1703 
1704 	explicit_bzero(&s->blkcipher, sizeof(s->blkcipher));
1705 
1706 	crp = vcrp;
1707 	if (s->cipher_first)
1708 		crda = crp->crp_desc->crd_next;
1709 	else
1710 		crda = crp->crp_desc;
1711 
1712 	ccp_do_hmac_done(qp, s, crp, crda, error);
1713 }
1714 
1715 int __must_check
1716 ccp_authenc(struct ccp_queue *qp, struct ccp_session *s, struct cryptop *crp,
1717     struct cryptodesc *crda, struct cryptodesc *crde)
1718 {
1719 	struct ccp_completion_ctx ctx;
1720 	int error;
1721 
1722 	ctx.callback_fn = ccp_authenc_done;
1723 	ctx.session = s;
1724 	ctx.callback_arg = crp;
1725 
1726 	/* Perform first operation */
1727 	if (s->cipher_first)
1728 		error = ccp_do_blkcipher(qp, s, crp, crde, NULL);
1729 	else
1730 		error = ccp_do_hmac(qp, s, crp, crda, NULL);
1731 	if (error != 0)
1732 		return (error);
1733 
1734 	/* Perform second operation */
1735 	if (s->cipher_first)
1736 		error = ccp_do_hmac(qp, s, crp, crda, &ctx);
1737 	else
1738 		error = ccp_do_blkcipher(qp, s, crp, crde, &ctx);
1739 	return (error);
1740 }
1741 
1742 static int __must_check
1743 ccp_do_ghash_aad(struct ccp_queue *qp, struct ccp_session *s)
1744 {
1745 	struct ccp_desc *desc;
1746 	struct sglist_seg *seg;
1747 	unsigned i;
1748 
1749 	if (ccp_queue_get_ring_space(qp) < qp->cq_sg_ulptx->sg_nseg)
1750 		return (EAGAIN);
1751 
1752 	for (i = 0; i < qp->cq_sg_ulptx->sg_nseg; i++) {
1753 		seg = &qp->cq_sg_ulptx->sg_segs[i];
1754 
1755 		desc = &qp->desc_ring[qp->cq_tail];
1756 
1757 		desc->engine = CCP_ENGINE_AES;
1758 		desc->aes.mode = CCP_AES_MODE_GHASH;
1759 		desc->aes.type = s->blkcipher.cipher_type;
1760 		desc->aes.encrypt = CCP_AES_MODE_GHASH_AAD;
1761 
1762 		desc->som = (i == 0);
1763 		desc->length = seg->ss_len;
1764 
1765 		desc->src_lo = (uint32_t)seg->ss_paddr;
1766 		desc->src_hi = (seg->ss_paddr >> 32);
1767 		desc->src_mem = CCP_MEMTYPE_SYSTEM;
1768 
1769 		desc->lsb_ctx_id = ccp_queue_lsb_entry(qp, LSB_ENTRY_IV);
1770 
1771 		desc->key_lo = ccp_queue_lsb_address(qp, LSB_ENTRY_KEY);
1772 		desc->key_mem = CCP_MEMTYPE_SB;
1773 
1774 		qp->cq_tail = (qp->cq_tail + 1) %
1775 		    (1 << qp->cq_softc->ring_size_order);
1776 	}
1777 	return (0);
1778 }
1779 
1780 static int __must_check
1781 ccp_do_gctr(struct ccp_queue *qp, struct ccp_session *s,
1782     enum ccp_cipher_dir dir, struct sglist_seg *seg, bool som, bool eom)
1783 {
1784 	struct ccp_desc *desc;
1785 
1786 	if (ccp_queue_get_ring_space(qp) == 0)
1787 		return (EAGAIN);
1788 
1789 	desc = &qp->desc_ring[qp->cq_tail];
1790 
1791 	desc->engine = CCP_ENGINE_AES;
1792 	desc->aes.mode = CCP_AES_MODE_GCTR;
1793 	desc->aes.type = s->blkcipher.cipher_type;
1794 	desc->aes.encrypt = dir;
1795 	desc->aes.size = 8 * (seg->ss_len % GMAC_BLOCK_LEN) - 1;
1796 
1797 	desc->som = som;
1798 	desc->eom = eom;
1799 
1800 	/* Trailing bytes will be masked off by aes.size above. */
1801 	desc->length = roundup2(seg->ss_len, GMAC_BLOCK_LEN);
1802 
1803 	desc->dst_lo = desc->src_lo = (uint32_t)seg->ss_paddr;
1804 	desc->dst_hi = desc->src_hi = seg->ss_paddr >> 32;
1805 	desc->dst_mem = desc->src_mem = CCP_MEMTYPE_SYSTEM;
1806 
1807 	desc->lsb_ctx_id = ccp_queue_lsb_entry(qp, LSB_ENTRY_IV);
1808 
1809 	desc->key_lo = ccp_queue_lsb_address(qp, LSB_ENTRY_KEY);
1810 	desc->key_mem = CCP_MEMTYPE_SB;
1811 
1812 	qp->cq_tail = (qp->cq_tail + 1) %
1813 	    (1 << qp->cq_softc->ring_size_order);
1814 	return (0);
1815 }
1816 
1817 static int __must_check
1818 ccp_do_ghash_final(struct ccp_queue *qp, struct ccp_session *s)
1819 {
1820 	struct ccp_desc *desc;
1821 
1822 	if (ccp_queue_get_ring_space(qp) == 0)
1823 		return (EAGAIN);
1824 
1825 	desc = &qp->desc_ring[qp->cq_tail];
1826 
1827 	desc->engine = CCP_ENGINE_AES;
1828 	desc->aes.mode = CCP_AES_MODE_GHASH;
1829 	desc->aes.type = s->blkcipher.cipher_type;
1830 	desc->aes.encrypt = CCP_AES_MODE_GHASH_FINAL;
1831 
1832 	desc->length = GMAC_BLOCK_LEN;
1833 
1834 	desc->src_lo = ccp_queue_lsb_address(qp, LSB_ENTRY_GHASH_IN);
1835 	desc->src_mem = CCP_MEMTYPE_SB;
1836 
1837 	desc->lsb_ctx_id = ccp_queue_lsb_entry(qp, LSB_ENTRY_IV);
1838 
1839 	desc->key_lo = ccp_queue_lsb_address(qp, LSB_ENTRY_KEY);
1840 	desc->key_mem = CCP_MEMTYPE_SB;
1841 
1842 	desc->dst_lo = ccp_queue_lsb_address(qp, LSB_ENTRY_GHASH);
1843 	desc->dst_mem = CCP_MEMTYPE_SB;
1844 
1845 	qp->cq_tail = (qp->cq_tail + 1) %
1846 	    (1 << qp->cq_softc->ring_size_order);
1847 	return (0);
1848 }
1849 
1850 static void
1851 ccp_gcm_done(struct ccp_queue *qp, struct ccp_session *s, void *vcrp,
1852     int error)
1853 {
1854 	char tag[GMAC_DIGEST_LEN];
1855 	struct cryptodesc *crde, *crda;
1856 	struct cryptop *crp;
1857 
1858 	crp = vcrp;
1859 	if (s->cipher_first) {
1860 		crde = crp->crp_desc;
1861 		crda = crp->crp_desc->crd_next;
1862 	} else {
1863 		crde = crp->crp_desc->crd_next;
1864 		crda = crp->crp_desc;
1865 	}
1866 
1867 	s->pending--;
1868 
1869 	if (error != 0) {
1870 		crp->crp_etype = error;
1871 		goto out;
1872 	}
1873 
1874 	/* Encrypt is done.  Decrypt needs to verify tag. */
1875 	if ((crde->crd_flags & CRD_F_ENCRYPT) != 0)
1876 		goto out;
1877 
1878 	/* Copy in message tag. */
1879 	crypto_copydata(crp->crp_flags, crp->crp_buf, crda->crd_inject,
1880 	    sizeof(tag), tag);
1881 
1882 	/* Verify tag against computed GMAC */
1883 	if (timingsafe_bcmp(tag, s->gmac.final_block, s->gmac.hash_len) != 0)
1884 		crp->crp_etype = EBADMSG;
1885 
1886 out:
1887 	explicit_bzero(&s->blkcipher, sizeof(s->blkcipher));
1888 	explicit_bzero(&s->gmac, sizeof(s->gmac));
1889 	crypto_done(crp);
1890 }
1891 
1892 int __must_check
1893 ccp_gcm(struct ccp_queue *qp, struct ccp_session *s, struct cryptop *crp,
1894     struct cryptodesc *crda, struct cryptodesc *crde)
1895 {
1896 	struct ccp_completion_ctx ctx;
1897 	enum ccp_cipher_dir dir;
1898 	device_t dev;
1899 	unsigned i;
1900 	int error;
1901 
1902 	if (s->blkcipher.key_len == 0)
1903 		return (EINVAL);
1904 
1905 	/*
1906 	 * AAD is only permitted before the cipher/plain text, not
1907 	 * after.
1908 	 */
1909 	if (crda->crd_len + crda->crd_skip > crde->crd_len + crde->crd_skip)
1910 		return (EINVAL);
1911 
1912 	dev = qp->cq_softc->dev;
1913 
1914 	if ((crde->crd_flags & CRD_F_ENCRYPT) != 0)
1915 		dir = CCP_CIPHER_DIR_ENCRYPT;
1916 	else
1917 		dir = CCP_CIPHER_DIR_DECRYPT;
1918 
1919 	/* Zero initial GHASH portion of context */
1920 	memset(s->blkcipher.iv, 0, sizeof(s->blkcipher.iv));
1921 
1922 	/* Gather IV data */
1923 	ccp_collect_iv(s, crp, crde);
1924 
1925 	/* Reverse order of key material for HW */
1926 	ccp_byteswap(s->blkcipher.enckey, s->blkcipher.key_len);
1927 
1928 	/* Prepare input buffer of concatenated lengths for final GHASH */
1929 	be64enc(s->gmac.final_block, (uint64_t)crda->crd_len * 8);
1930 	be64enc(&s->gmac.final_block[8], (uint64_t)crde->crd_len * 8);
1931 
1932 	/* Send IV + initial zero GHASH, key data, and lengths buffer to LSB */
1933 	error = ccp_do_pst_to_lsb(qp, ccp_queue_lsb_address(qp, LSB_ENTRY_IV),
1934 	    s->blkcipher.iv, 32);
1935 	if (error != 0)
1936 		return (error);
1937 	error = ccp_do_pst_to_lsb(qp, ccp_queue_lsb_address(qp, LSB_ENTRY_KEY),
1938 	    s->blkcipher.enckey, s->blkcipher.key_len);
1939 	if (error != 0)
1940 		return (error);
1941 	error = ccp_do_pst_to_lsb(qp,
1942 	    ccp_queue_lsb_address(qp, LSB_ENTRY_GHASH_IN), s->gmac.final_block,
1943 	    GMAC_BLOCK_LEN);
1944 	if (error != 0)
1945 		return (error);
1946 
1947 	/* First step - compute GHASH over AAD */
1948 	if (crda->crd_len != 0) {
1949 		sglist_reset(qp->cq_sg_ulptx);
1950 		error = sglist_append_sglist(qp->cq_sg_ulptx, qp->cq_sg_crp,
1951 		    crda->crd_skip, crda->crd_len);
1952 		if (error != 0)
1953 			return (error);
1954 
1955 		/* This engine cannot process non-block multiple AAD data. */
1956 		for (i = 0; i < qp->cq_sg_ulptx->sg_nseg; i++)
1957 			if ((qp->cq_sg_ulptx->sg_segs[i].ss_len %
1958 			    GMAC_BLOCK_LEN) != 0) {
1959 				DPRINTF(dev, "%s: AD seg modulo: %zu\n",
1960 				    __func__,
1961 				    qp->cq_sg_ulptx->sg_segs[i].ss_len);
1962 				return (EINVAL);
1963 			}
1964 
1965 		error = ccp_do_ghash_aad(qp, s);
1966 		if (error != 0)
1967 			return (error);
1968 	}
1969 
1970 	/* Feed data piece by piece into GCTR */
1971 	sglist_reset(qp->cq_sg_ulptx);
1972 	error = sglist_append_sglist(qp->cq_sg_ulptx, qp->cq_sg_crp,
1973 	    crde->crd_skip, crde->crd_len);
1974 	if (error != 0)
1975 		return (error);
1976 
1977 	/*
1978 	 * All segments except the last must be even multiples of AES block
1979 	 * size for the HW to process it.  Non-compliant inputs aren't bogus,
1980 	 * just not doable on this hardware.
1981 	 *
1982 	 * XXX: Well, the hardware will produce a valid tag for shorter final
1983 	 * segment inputs, but it will still write out a block-sized plaintext
1984 	 * or ciphertext chunk.  For a typical CRP this tramples trailing data,
1985 	 * including the provided message tag.  So, reject such inputs for now.
1986 	 */
1987 	for (i = 0; i < qp->cq_sg_ulptx->sg_nseg; i++)
1988 		if ((qp->cq_sg_ulptx->sg_segs[i].ss_len % AES_BLOCK_LEN) != 0) {
1989 			DPRINTF(dev, "%s: seg modulo: %zu\n", __func__,
1990 			    qp->cq_sg_ulptx->sg_segs[i].ss_len);
1991 			return (EINVAL);
1992 		}
1993 
1994 	for (i = 0; i < qp->cq_sg_ulptx->sg_nseg; i++) {
1995 		struct sglist_seg *seg;
1996 
1997 		seg = &qp->cq_sg_ulptx->sg_segs[i];
1998 		error = ccp_do_gctr(qp, s, dir, seg,
1999 		    (i == 0 && crda->crd_len == 0),
2000 		    i == (qp->cq_sg_ulptx->sg_nseg - 1));
2001 		if (error != 0)
2002 			return (error);
2003 	}
2004 
2005 	/* Send just initial IV (not GHASH!) to LSB again */
2006 	error = ccp_do_pst_to_lsb(qp, ccp_queue_lsb_address(qp, LSB_ENTRY_IV),
2007 	    s->blkcipher.iv, s->blkcipher.iv_len);
2008 	if (error != 0)
2009 		return (error);
2010 
2011 	ctx.callback_fn = ccp_gcm_done;
2012 	ctx.session = s;
2013 	ctx.callback_arg = crp;
2014 
2015 	/* Compute final hash and copy result back */
2016 	error = ccp_do_ghash_final(qp, s);
2017 	if (error != 0)
2018 		return (error);
2019 
2020 	/* When encrypting, copy computed tag out to caller buffer. */
2021 	sglist_reset(qp->cq_sg_ulptx);
2022 	if (dir == CCP_CIPHER_DIR_ENCRYPT)
2023 		error = sglist_append_sglist(qp->cq_sg_ulptx, qp->cq_sg_crp,
2024 		    crda->crd_inject, s->gmac.hash_len);
2025 	else
2026 		/*
2027 		 * For decrypting, copy the computed tag out to our session
2028 		 * buffer to verify in our callback.
2029 		 */
2030 		error = sglist_append(qp->cq_sg_ulptx, s->gmac.final_block,
2031 		    s->gmac.hash_len);
2032 	if (error != 0)
2033 		return (error);
2034 	error = ccp_passthrough_sgl(qp,
2035 	    ccp_queue_lsb_address(qp, LSB_ENTRY_GHASH), false, qp->cq_sg_ulptx,
2036 	    s->gmac.hash_len, true, &ctx);
2037 	return (error);
2038 }
2039 
2040 #define MAX_TRNG_RETRIES	10
2041 u_int
2042 random_ccp_read(void *v, u_int c)
2043 {
2044 	uint32_t *buf;
2045 	u_int i, j;
2046 
2047 	KASSERT(c % sizeof(*buf) == 0, ("%u not multiple of u_long", c));
2048 
2049 	buf = v;
2050 	for (i = c; i > 0; i -= sizeof(*buf)) {
2051 		for (j = 0; j < MAX_TRNG_RETRIES; j++) {
2052 			*buf = ccp_read_4(g_ccp_softc, TRNG_OUT_OFFSET);
2053 			if (*buf != 0)
2054 				break;
2055 		}
2056 		if (j == MAX_TRNG_RETRIES)
2057 			return (0);
2058 		buf++;
2059 	}
2060 	return (c);
2061 
2062 }
2063 
2064 #ifdef DDB
2065 void
2066 db_ccp_show_hw(struct ccp_softc *sc)
2067 {
2068 
2069 	db_printf("  queue mask: 0x%x\n",
2070 	    ccp_read_4(sc, CMD_QUEUE_MASK_OFFSET));
2071 	db_printf("  queue prio: 0x%x\n",
2072 	    ccp_read_4(sc, CMD_QUEUE_PRIO_OFFSET));
2073 	db_printf("  reqid: 0x%x\n", ccp_read_4(sc, CMD_REQID_CONFIG_OFFSET));
2074 	db_printf("  trng output: 0x%x\n", ccp_read_4(sc, TRNG_OUT_OFFSET));
2075 	db_printf("  cmd timeout: 0x%x\n",
2076 	    ccp_read_4(sc, CMD_CMD_TIMEOUT_OFFSET));
2077 	db_printf("  lsb public mask lo: 0x%x\n",
2078 	    ccp_read_4(sc, LSB_PUBLIC_MASK_LO_OFFSET));
2079 	db_printf("  lsb public mask hi: 0x%x\n",
2080 	    ccp_read_4(sc, LSB_PUBLIC_MASK_HI_OFFSET));
2081 	db_printf("  lsb private mask lo: 0x%x\n",
2082 	    ccp_read_4(sc, LSB_PRIVATE_MASK_LO_OFFSET));
2083 	db_printf("  lsb private mask hi: 0x%x\n",
2084 	    ccp_read_4(sc, LSB_PRIVATE_MASK_HI_OFFSET));
2085 	db_printf("  version: 0x%x\n", ccp_read_4(sc, VERSION_REG));
2086 }
2087 
2088 void
2089 db_ccp_show_queue_hw(struct ccp_queue *qp)
2090 {
2091 	const struct ccp_error_code *ec;
2092 	struct ccp_softc *sc;
2093 	uint32_t status, error, esource, faultblock, headlo, qcontrol;
2094 	unsigned q, i;
2095 
2096 	sc = qp->cq_softc;
2097 	q = qp->cq_qindex;
2098 
2099 	qcontrol = ccp_read_queue_4(sc, q, CMD_Q_CONTROL_BASE);
2100 	db_printf("  qcontrol: 0x%x%s%s\n", qcontrol,
2101 	    (qcontrol & CMD_Q_RUN) ? " RUN" : "",
2102 	    (qcontrol & CMD_Q_HALTED) ? " HALTED" : "");
2103 	db_printf("  tail_lo: 0x%x\n",
2104 	    ccp_read_queue_4(sc, q, CMD_Q_TAIL_LO_BASE));
2105 	headlo = ccp_read_queue_4(sc, q, CMD_Q_HEAD_LO_BASE);
2106 	db_printf("  head_lo: 0x%x\n", headlo);
2107 	db_printf("  int enable: 0x%x\n",
2108 	    ccp_read_queue_4(sc, q, CMD_Q_INT_ENABLE_BASE));
2109 	db_printf("  interrupt status: 0x%x\n",
2110 	    ccp_read_queue_4(sc, q, CMD_Q_INTERRUPT_STATUS_BASE));
2111 	status = ccp_read_queue_4(sc, q, CMD_Q_STATUS_BASE);
2112 	db_printf("  status: 0x%x\n", status);
2113 	db_printf("  int stats: 0x%x\n",
2114 	    ccp_read_queue_4(sc, q, CMD_Q_INT_STATUS_BASE));
2115 
2116 	error = status & STATUS_ERROR_MASK;
2117 	if (error == 0)
2118 		return;
2119 
2120 	esource = (status >> STATUS_ERRORSOURCE_SHIFT) &
2121 	    STATUS_ERRORSOURCE_MASK;
2122 	faultblock = (status >> STATUS_VLSB_FAULTBLOCK_SHIFT) &
2123 	    STATUS_VLSB_FAULTBLOCK_MASK;
2124 
2125 	ec = NULL;
2126 	for (i = 0; i < nitems(ccp_error_codes); i++)
2127 		if (ccp_error_codes[i].ce_code == error)
2128 			break;
2129 	if (i < nitems(ccp_error_codes))
2130 		ec = &ccp_error_codes[i];
2131 
2132 	db_printf("  Error: %s (%u) Source: %u Faulting LSB block: %u\n",
2133 	    (ec != NULL) ? ec->ce_name : "(reserved)", error, esource,
2134 	    faultblock);
2135 	if (ec != NULL)
2136 		db_printf("  Error description: %s\n", ec->ce_desc);
2137 
2138 	i = (headlo - (uint32_t)qp->desc_ring_bus_addr) / Q_DESC_SIZE;
2139 	db_printf("  Bad descriptor idx: %u contents:\n  %32D\n", i,
2140 	    (void *)&qp->desc_ring[i], " ");
2141 }
2142 #endif
2143