1 // SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only 2 /* 3 * Implement fast Fletcher4 using superscalar pipelines. 4 * 5 * Use regular C code to compute 6 * Fletcher4 in four incremental 64-bit parallel accumulator streams, 7 * and then combine the streams to form the final four checksum words. 8 * This implementation is a derivative of the AVX SIMD implementation by 9 * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c). 10 * 11 * Copyright (C) 2016 Romain Dolbeau. 12 * 13 * Authors: 14 * Romain Dolbeau <romain.dolbeau@atos.net> 15 * 16 * This software is available to you under a choice of one of two 17 * licenses. You may choose to be licensed under the terms of the GNU 18 * General Public License (GPL) Version 2, available from the file 19 * COPYING in the main directory of this source tree, or the 20 * OpenIB.org BSD license below: 21 * 22 * Redistribution and use in source and binary forms, with or 23 * without modification, are permitted provided that the following 24 * conditions are met: 25 * 26 * - Redistributions of source code must retain the above 27 * copyright notice, this list of conditions and the following 28 * disclaimer. 29 * 30 * - Redistributions in binary form must reproduce the above 31 * copyright notice, this list of conditions and the following 32 * disclaimer in the documentation and/or other materials 33 * provided with the distribution. 34 * 35 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 36 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 37 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 38 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 39 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 40 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 41 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 42 * SOFTWARE. 43 */ 44 45 #include <sys/param.h> 46 #include <sys/byteorder.h> 47 #include <sys/spa_checksum.h> 48 #include <sys/string.h> 49 #include <zfs_fletcher.h> 50 51 static void 52 fletcher_4_superscalar4_init(fletcher_4_ctx_t *ctx) 53 { 54 memset(ctx->superscalar, 0, 4 * sizeof (zfs_fletcher_superscalar_t)); 55 } 56 57 static void 58 fletcher_4_superscalar4_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) 59 { 60 uint64_t A, B, C, D; 61 62 A = ctx->superscalar[0].v[0] + ctx->superscalar[0].v[1] + 63 ctx->superscalar[0].v[2] + ctx->superscalar[0].v[3]; 64 B = 0 - ctx->superscalar[0].v[1] - 2 * ctx->superscalar[0].v[2] - 65 3 * ctx->superscalar[0].v[3] + 4 * ctx->superscalar[1].v[0] + 66 4 * ctx->superscalar[1].v[1] + 4 * ctx->superscalar[1].v[2] + 67 4 * ctx->superscalar[1].v[3]; 68 69 C = ctx->superscalar[0].v[2] + 3 * ctx->superscalar[0].v[3] - 70 6 * ctx->superscalar[1].v[0] - 10 * ctx->superscalar[1].v[1] - 71 14 * ctx->superscalar[1].v[2] - 18 * ctx->superscalar[1].v[3] + 72 16 * ctx->superscalar[2].v[0] + 16 * ctx->superscalar[2].v[1] + 73 16 * ctx->superscalar[2].v[2] + 16 * ctx->superscalar[2].v[3]; 74 75 D = 0 - ctx->superscalar[0].v[3] + 4 * ctx->superscalar[1].v[0] + 76 10 * ctx->superscalar[1].v[1] + 20 * ctx->superscalar[1].v[2] + 77 34 * ctx->superscalar[1].v[3] - 48 * ctx->superscalar[2].v[0] - 78 64 * ctx->superscalar[2].v[1] - 80 * ctx->superscalar[2].v[2] - 79 96 * ctx->superscalar[2].v[3] + 64 * ctx->superscalar[3].v[0] + 80 64 * ctx->superscalar[3].v[1] + 64 * ctx->superscalar[3].v[2] + 81 64 * ctx->superscalar[3].v[3]; 82 83 ZIO_SET_CHECKSUM(zcp, A, B, C, D); 84 } 85 86 static void 87 fletcher_4_superscalar4_native(fletcher_4_ctx_t *ctx, 88 const void *buf, uint64_t size) 89 { 90 const uint32_t *ip = buf; 91 const uint32_t *ipend = ip + (size / sizeof (uint32_t)); 92 uint64_t a, b, c, d; 93 uint64_t a2, b2, c2, d2; 94 uint64_t a3, b3, c3, d3; 95 uint64_t a4, b4, c4, d4; 96 97 a = ctx->superscalar[0].v[0]; 98 b = ctx->superscalar[1].v[0]; 99 c = ctx->superscalar[2].v[0]; 100 d = ctx->superscalar[3].v[0]; 101 a2 = ctx->superscalar[0].v[1]; 102 b2 = ctx->superscalar[1].v[1]; 103 c2 = ctx->superscalar[2].v[1]; 104 d2 = ctx->superscalar[3].v[1]; 105 a3 = ctx->superscalar[0].v[2]; 106 b3 = ctx->superscalar[1].v[2]; 107 c3 = ctx->superscalar[2].v[2]; 108 d3 = ctx->superscalar[3].v[2]; 109 a4 = ctx->superscalar[0].v[3]; 110 b4 = ctx->superscalar[1].v[3]; 111 c4 = ctx->superscalar[2].v[3]; 112 d4 = ctx->superscalar[3].v[3]; 113 114 do { 115 a += ip[0]; 116 a2 += ip[1]; 117 a3 += ip[2]; 118 a4 += ip[3]; 119 b += a; 120 b2 += a2; 121 b3 += a3; 122 b4 += a4; 123 c += b; 124 c2 += b2; 125 c3 += b3; 126 c4 += b4; 127 d += c; 128 d2 += c2; 129 d3 += c3; 130 d4 += c4; 131 } while ((ip += 4) < ipend); 132 133 ctx->superscalar[0].v[0] = a; 134 ctx->superscalar[1].v[0] = b; 135 ctx->superscalar[2].v[0] = c; 136 ctx->superscalar[3].v[0] = d; 137 ctx->superscalar[0].v[1] = a2; 138 ctx->superscalar[1].v[1] = b2; 139 ctx->superscalar[2].v[1] = c2; 140 ctx->superscalar[3].v[1] = d2; 141 ctx->superscalar[0].v[2] = a3; 142 ctx->superscalar[1].v[2] = b3; 143 ctx->superscalar[2].v[2] = c3; 144 ctx->superscalar[3].v[2] = d3; 145 ctx->superscalar[0].v[3] = a4; 146 ctx->superscalar[1].v[3] = b4; 147 ctx->superscalar[2].v[3] = c4; 148 ctx->superscalar[3].v[3] = d4; 149 } 150 151 static void 152 fletcher_4_superscalar4_byteswap(fletcher_4_ctx_t *ctx, 153 const void *buf, uint64_t size) 154 { 155 const uint32_t *ip = buf; 156 const uint32_t *ipend = ip + (size / sizeof (uint32_t)); 157 uint64_t a, b, c, d; 158 uint64_t a2, b2, c2, d2; 159 uint64_t a3, b3, c3, d3; 160 uint64_t a4, b4, c4, d4; 161 162 a = ctx->superscalar[0].v[0]; 163 b = ctx->superscalar[1].v[0]; 164 c = ctx->superscalar[2].v[0]; 165 d = ctx->superscalar[3].v[0]; 166 a2 = ctx->superscalar[0].v[1]; 167 b2 = ctx->superscalar[1].v[1]; 168 c2 = ctx->superscalar[2].v[1]; 169 d2 = ctx->superscalar[3].v[1]; 170 a3 = ctx->superscalar[0].v[2]; 171 b3 = ctx->superscalar[1].v[2]; 172 c3 = ctx->superscalar[2].v[2]; 173 d3 = ctx->superscalar[3].v[2]; 174 a4 = ctx->superscalar[0].v[3]; 175 b4 = ctx->superscalar[1].v[3]; 176 c4 = ctx->superscalar[2].v[3]; 177 d4 = ctx->superscalar[3].v[3]; 178 179 do { 180 a += BSWAP_32(ip[0]); 181 a2 += BSWAP_32(ip[1]); 182 a3 += BSWAP_32(ip[2]); 183 a4 += BSWAP_32(ip[3]); 184 b += a; 185 b2 += a2; 186 b3 += a3; 187 b4 += a4; 188 c += b; 189 c2 += b2; 190 c3 += b3; 191 c4 += b4; 192 d += c; 193 d2 += c2; 194 d3 += c3; 195 d4 += c4; 196 } while ((ip += 4) < ipend); 197 198 ctx->superscalar[0].v[0] = a; 199 ctx->superscalar[1].v[0] = b; 200 ctx->superscalar[2].v[0] = c; 201 ctx->superscalar[3].v[0] = d; 202 ctx->superscalar[0].v[1] = a2; 203 ctx->superscalar[1].v[1] = b2; 204 ctx->superscalar[2].v[1] = c2; 205 ctx->superscalar[3].v[1] = d2; 206 ctx->superscalar[0].v[2] = a3; 207 ctx->superscalar[1].v[2] = b3; 208 ctx->superscalar[2].v[2] = c3; 209 ctx->superscalar[3].v[2] = d3; 210 ctx->superscalar[0].v[3] = a4; 211 ctx->superscalar[1].v[3] = b4; 212 ctx->superscalar[2].v[3] = c4; 213 ctx->superscalar[3].v[3] = d4; 214 } 215 216 static boolean_t fletcher_4_superscalar4_valid(void) 217 { 218 return (B_TRUE); 219 } 220 221 const fletcher_4_ops_t fletcher_4_superscalar4_ops = { 222 .init_native = fletcher_4_superscalar4_init, 223 .compute_native = fletcher_4_superscalar4_native, 224 .fini_native = fletcher_4_superscalar4_fini, 225 .init_byteswap = fletcher_4_superscalar4_init, 226 .compute_byteswap = fletcher_4_superscalar4_byteswap, 227 .fini_byteswap = fletcher_4_superscalar4_fini, 228 .valid = fletcher_4_superscalar4_valid, 229 .uses_fpu = B_FALSE, 230 .name = "superscalar4" 231 }; 232