1 // SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only 2 /* 3 * Implement fast Fletcher4 using superscalar pipelines. 4 * 5 * Use regular C code to compute 6 * Fletcher4 in two incremental 64-bit parallel accumulator streams, 7 * and then combine the streams to form the final four checksum words. 8 * This implementation is a derivative of the AVX SIMD implementation by 9 * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c). 10 * 11 * Copyright (C) 2016 Romain Dolbeau. 12 * 13 * Authors: 14 * Romain Dolbeau <romain.dolbeau@atos.net> 15 * 16 * This software is available to you under a choice of one of two 17 * licenses. You may choose to be licensed under the terms of the GNU 18 * General Public License (GPL) Version 2, available from the file 19 * COPYING in the main directory of this source tree, or the 20 * OpenIB.org BSD license below: 21 * 22 * Redistribution and use in source and binary forms, with or 23 * without modification, are permitted provided that the following 24 * conditions are met: 25 * 26 * - Redistributions of source code must retain the above 27 * copyright notice, this list of conditions and the following 28 * disclaimer. 29 * 30 * - Redistributions in binary form must reproduce the above 31 * copyright notice, this list of conditions and the following 32 * disclaimer in the documentation and/or other materials 33 * provided with the distribution. 34 * 35 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 36 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 37 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 38 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 39 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 40 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 41 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 42 * SOFTWARE. 43 */ 44 45 #include <sys/param.h> 46 #include <sys/byteorder.h> 47 #include <sys/spa_checksum.h> 48 #include <sys/string.h> 49 #include <zfs_fletcher.h> 50 51 static void 52 fletcher_4_superscalar_init(fletcher_4_ctx_t *ctx) 53 { 54 memset(ctx->superscalar, 0, 4 * sizeof (zfs_fletcher_superscalar_t)); 55 } 56 57 static void 58 fletcher_4_superscalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) 59 { 60 uint64_t A, B, C, D; 61 A = ctx->superscalar[0].v[0] + ctx->superscalar[0].v[1]; 62 B = 2 * ctx->superscalar[1].v[0] + 2 * ctx->superscalar[1].v[1] - 63 ctx->superscalar[0].v[1]; 64 C = 4 * ctx->superscalar[2].v[0] - ctx->superscalar[1].v[0] + 65 4 * ctx->superscalar[2].v[1] - 3 * ctx->superscalar[1].v[1]; 66 D = 8 * ctx->superscalar[3].v[0] - 4 * ctx->superscalar[2].v[0] + 67 8 * ctx->superscalar[3].v[1] - 8 * ctx->superscalar[2].v[1] + 68 ctx->superscalar[1].v[1]; 69 ZIO_SET_CHECKSUM(zcp, A, B, C, D); 70 } 71 72 static void 73 fletcher_4_superscalar_native(fletcher_4_ctx_t *ctx, 74 const void *buf, uint64_t size) 75 { 76 const uint32_t *ip = buf; 77 const uint32_t *ipend = ip + (size / sizeof (uint32_t)); 78 uint64_t a, b, c, d; 79 uint64_t a2, b2, c2, d2; 80 81 a = ctx->superscalar[0].v[0]; 82 b = ctx->superscalar[1].v[0]; 83 c = ctx->superscalar[2].v[0]; 84 d = ctx->superscalar[3].v[0]; 85 a2 = ctx->superscalar[0].v[1]; 86 b2 = ctx->superscalar[1].v[1]; 87 c2 = ctx->superscalar[2].v[1]; 88 d2 = ctx->superscalar[3].v[1]; 89 90 do { 91 a += ip[0]; 92 a2 += ip[1]; 93 b += a; 94 b2 += a2; 95 c += b; 96 c2 += b2; 97 d += c; 98 d2 += c2; 99 } while ((ip += 2) < ipend); 100 101 ctx->superscalar[0].v[0] = a; 102 ctx->superscalar[1].v[0] = b; 103 ctx->superscalar[2].v[0] = c; 104 ctx->superscalar[3].v[0] = d; 105 ctx->superscalar[0].v[1] = a2; 106 ctx->superscalar[1].v[1] = b2; 107 ctx->superscalar[2].v[1] = c2; 108 ctx->superscalar[3].v[1] = d2; 109 } 110 111 static void 112 fletcher_4_superscalar_byteswap(fletcher_4_ctx_t *ctx, 113 const void *buf, uint64_t size) 114 { 115 const uint32_t *ip = buf; 116 const uint32_t *ipend = ip + (size / sizeof (uint32_t)); 117 uint64_t a, b, c, d; 118 uint64_t a2, b2, c2, d2; 119 120 a = ctx->superscalar[0].v[0]; 121 b = ctx->superscalar[1].v[0]; 122 c = ctx->superscalar[2].v[0]; 123 d = ctx->superscalar[3].v[0]; 124 a2 = ctx->superscalar[0].v[1]; 125 b2 = ctx->superscalar[1].v[1]; 126 c2 = ctx->superscalar[2].v[1]; 127 d2 = ctx->superscalar[3].v[1]; 128 129 do { 130 a += BSWAP_32(ip[0]); 131 a2 += BSWAP_32(ip[1]); 132 b += a; 133 b2 += a2; 134 c += b; 135 c2 += b2; 136 d += c; 137 d2 += c2; 138 } while ((ip += 2) < ipend); 139 140 ctx->superscalar[0].v[0] = a; 141 ctx->superscalar[1].v[0] = b; 142 ctx->superscalar[2].v[0] = c; 143 ctx->superscalar[3].v[0] = d; 144 ctx->superscalar[0].v[1] = a2; 145 ctx->superscalar[1].v[1] = b2; 146 ctx->superscalar[2].v[1] = c2; 147 ctx->superscalar[3].v[1] = d2; 148 } 149 150 static boolean_t fletcher_4_superscalar_valid(void) 151 { 152 return (B_TRUE); 153 } 154 155 const fletcher_4_ops_t fletcher_4_superscalar_ops = { 156 .init_native = fletcher_4_superscalar_init, 157 .compute_native = fletcher_4_superscalar_native, 158 .fini_native = fletcher_4_superscalar_fini, 159 .init_byteswap = fletcher_4_superscalar_init, 160 .compute_byteswap = fletcher_4_superscalar_byteswap, 161 .fini_byteswap = fletcher_4_superscalar_fini, 162 .valid = fletcher_4_superscalar_valid, 163 .uses_fpu = B_FALSE, 164 .name = "superscalar" 165 }; 166