/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
 *		    Optimized by Joe Taylor
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

#include <asm/errno.h>
#include <linux/linkage.h>
#include <variant/core.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *                           unsigned int sum);
 *    a2 = buf
 *    a3 = len
 *    a4 = sum
 *
 * This function assumes 2- or 4-byte alignment.  Other alignments will fail!
 */

/* ONES_ADD converts twos-complement math to ones-complement. */
#define ONES_ADD(sum, val)	  \
	add	sum, sum, val	; \
	bgeu	sum, val, 99f	; \
	addi	sum, sum, 1	; \
99:				;

.text
ENTRY(csum_partial)
	/*
	 * Experiments with Ethernet and SLIP connections show that buf
	 * is aligned on either a 2-byte or 4-byte boundary.
	 */
	entry	sp, 32
	extui	a5, a2, 0, 2
	bnez	a5, 8f		/* branch if 2-byte aligned */
	/* Fall-through on common case, 4-byte alignment */
1:
	srli	a5, a3, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 5
	add	a5, a5, a2	/* a5 = end of last 32-byte chunk */
.Loop1:
#endif
	l32i	a6, a2, 0
	l32i	a7, a2, 4
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 8
	l32i	a7, a2, 12
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 16
	l32i	a7, a2, 20
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 24
	l32i	a7, a2, 28
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	addi	a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop1
#endif
2:
	extui	a5, a3, 2, 3	/* remaining 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 3f
#else
	beqz	a5, 3f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop2:
#endif
	l32i	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop2
#endif
3:
	_bbci.l	a3, 1, 5f	/* remaining 2-byte chunk */
	l16ui	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 2
5:
	_bbci.l	a3, 0, 7f	/* remaining 1-byte chunk */
6:	l8ui	a6, a2, 0
#ifdef __XTENSA_EB__
	slli	a6, a6, 8	/* load byte into bits 8..15 */
#endif
	ONES_ADD(a4, a6)
7:
	mov	a2, a4
	retw

	/* uncommon case, buf is 2-byte aligned */
8:
	beqz	a3, 7b		/* branch if len == 0 */
	beqi	a3, 1, 6b	/* branch if len == 1 */

	extui	a5, a2, 0, 1
	bnez	a5, 8f		/* branch if 1-byte aligned */

	l16ui	a6, a2, 0	/* common case, len >= 2 */
	ONES_ADD(a4, a6)
	addi	a2, a2, 2	/* adjust buf */
	addi	a3, a3, -2	/* adjust len */
	j	1b		/* now buf is 4-byte aligned */

	/* case: odd-byte aligned, len > 1
	 * This case is dog slow, so don't give us an odd address.
	 * (I don't think this ever happens, but just in case.)
	 */
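	/*
	 * For reference, one iteration of the loop below is roughly the
	 * following C (a simplified sketch, not the generated code; the
	 * little-endian merge is shown and the function name is made up):
	 *
	 *	unsigned int odd_aligned_word(const unsigned char *buf,
	 *				      unsigned int sum)
	 *	{
	 *		unsigned int w;
	 *
	 *		// l8ui/l16ui/l8ui merged into the word an aligned
	 *		// l32i would have produced
	 *		w = buf[0] | (buf[1] << 8) | (buf[2] << 16) |
	 *		    ((unsigned int)buf[3] << 24);
	 *
	 *		sum += w;	// ONES_ADD: add, then fold the carry
	 *		if (sum < w)	// back in so the result stays a valid
	 *			sum++;	// ones-complement partial sum
	 *		return sum;
	 *	}
	 */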
8:
	srli	a5, a3, 2	/* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop3:
#endif
	l8ui	a6, a2, 0	/* bits 24..31 */
	l16ui	a7, a2, 1	/* bits  8..23 */
	l8ui	a8, a2, 3	/* bits  0.. 7 */
#ifdef	__XTENSA_EB__
	slli	a6, a6, 24
#else
	slli	a8, a8, 24
#endif
	slli	a7, a7, 8
	or	a7, a7, a6
	or	a7, a7, a8
	ONES_ADD(a4, a7)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop3
#endif
2:
	_bbci.l	a3, 1, 3f	/* remaining 2-byte chunk, still odd addr */
	l8ui	a6, a2, 0
	l8ui	a7, a2, 1
#ifdef	__XTENSA_EB__
	slli	a6, a6, 8
#else
	slli	a7, a7, 8
#endif
	or	a7, a7, a6
	ONES_ADD(a4, a7)
	addi	a2, a2, 2
3:
	j	5b		/* branch to handle the remaining byte */

ENDPROC(csum_partial)

/*
 * Copy from src while checksumming, otherwise like csum_partial
 *
 * The macros SRC and DST specify the type of access for the instruction;
 * thus we can call a custom exception handler for each access type.
 */

#define SRC(y...)	\
	9999: y;	\
	.section __ex_table, "a";	\
	.long 9999b, 6001f	;	\
	.previous

#define DST(y...)	\
	9999: y;	\
	.section __ex_table, "a";	\
	.long 9999b, 6002f	;	\
	.previous

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst, int len,
					int sum, int *src_err_ptr, int *dst_err_ptr)
	a2  = src
	a3  = dst
	a4  = len
	a5  = sum
	a6  = src_err_ptr
	a7  = dst_err_ptr
	a8  = temp
	a9  = temp
	a10 = temp
	a11 = original len for exception handling
	a12 = original dst for exception handling

	This function is optimized for 4-byte aligned addresses.  Other
	alignments work, but not nearly as efficiently.
 */

ENTRY(csum_partial_copy_generic)

	entry	sp, 32
	mov	a12, a3
	mov	a11, a4
	or	a10, a2, a3

	/* We optimize the following alignment tests for the 4-byte
	   aligned case.  Two bbsi.l instructions might seem more optimal
	   (commented out below).  However, both labels 5: and 3: are out
	   of the imm8 range, so the assembler relaxes them into
	   equivalent bbci.l, j combinations, which is actually
	   slower. */
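	/*
	 * The three-way dispatch below corresponds roughly to this C helper
	 * (a reference sketch only; the name and return values are just
	 * labels for the branch targets):
	 *
	 *	// 1 = 32-bit fast path (label 1), 3 = 16-bit path (label 3),
	 *	// 5 = byte-at-a-time path (label 5)
	 *	static int copy_path(unsigned long src, unsigned long dst)
	 *	{
	 *		unsigned long both = src | dst;
	 *
	 *		if ((both & 3) == 0)
	 *			return 1;	// src and dst 4-byte aligned
	 *		if (both & 1)
	 *			return 5;	// at least one address is odd
	 *		return 3;		// both at least 2-byte aligned
	 *	}
	 */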

	extui	a9, a10, 0, 2
	beqz	a9, 1f		/* branch if both are 4-byte aligned */
	bbsi.l	a10, 0, 5f	/* branch if one address is odd */
	j	3f		/* one address is 2-byte aligned */

/*	_bbsi.l	a10, 0, 5f */	/* branch if odd address */
/*	_bbsi.l	a10, 1, 3f */	/* branch if 2-byte-aligned address */

1:
	/* src and dst are both 4-byte aligned */
	srli	a10, a4, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 2f
#else
	beqz	a10, 2f
	slli	a10, a10, 5
	add	a10, a10, a2	/* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
SRC(	l32i	a9, a2, 0	)
SRC(	l32i	a8, a2, 4	)
DST(	s32i	a9, a3, 0	)
DST(	s32i	a8, a3, 4	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 8	)
SRC(	l32i	a8, a2, 12	)
DST(	s32i	a9, a3, 8	)
DST(	s32i	a8, a3, 12	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 16	)
SRC(	l32i	a8, a2, 20	)
DST(	s32i	a9, a3, 16	)
DST(	s32i	a8, a3, 20	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 24	)
SRC(	l32i	a8, a2, 28	)
DST(	s32i	a9, a3, 24	)
DST(	s32i	a8, a3, 28	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
	addi	a2, a2, 32
	addi	a3, a3, 32
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop5
#endif
2:
	extui	a10, a4, 2, 3	/* remaining 4-byte chunks */
	extui	a4, a4, 0, 2	/* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 3f
#else
	beqz	a10, 3f
	slli	a10, a10, 2
	add	a10, a10, a2	/* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
SRC(	l32i	a9, a2, 0	)
DST(	s32i	a9, a3, 0	)
	ONES_ADD(a5, a9)
	addi	a2, a2, 4
	addi	a3, a3, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop6
#endif
3:
	/*
	   Control comes to here in two cases: (1) It may fall through
	   to here from the 4-byte alignment case to process, at most,
	   one 2-byte chunk.  (2) It branches to here from above if
	   either src or dst is 2-byte aligned, and we process all bytes
	   here, except for perhaps a trailing odd byte.  It's
	   inefficient, so align your addresses to 4-byte boundaries.

	   a2 = src
	   a3 = dst
	   a4 = len
	   a5 = sum
	 */
	srli	a10, a4, 1	/* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 4f
#else
	beqz	a10, 4f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
SRC(	l16ui	a9, a2, 0	)
DST(	s16i	a9, a3, 0	)
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop7
#endif
4:
	/* This section processes a possible trailing odd byte. */
	_bbci.l	a4, 0, 8f	/* 1-byte chunk */
SRC(	l8ui	a9, a2, 0	)
DST(	s8i	a9, a3, 0	)
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* shift byte to bits 8..15 */
#endif
	ONES_ADD(a5, a9)
8:
	mov	a2, a5
	retw

5:
	/* Control branches to here when either src or dst is odd.  We
	   process all bytes using 8-bit accesses.  Grossly inefficient,
	   so don't feed us an odd address. */
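	/*
	 * One iteration of the byte-pair loop below, sketched in C for
	 * reference (a simplified model, little-endian shown; the function
	 * name is illustrative):
	 *
	 *	unsigned int copy_byte_pair(const unsigned char *src,
	 *				    unsigned char *dst,
	 *				    unsigned int sum)
	 *	{
	 *		unsigned int hw;
	 *
	 *		dst[0] = src[0];	// byte accesses work at any
	 *		dst[1] = src[1];	// alignment
	 *		hw = src[0] | (src[1] << 8);	// same halfword an
	 *						// aligned l16ui would load
	 *		sum += hw;
	 *		if (sum < hw)		// ONES_ADD carry fold
	 *			sum++;
	 *		return sum;
	 *	}
	 */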

	srli	a10, a4, 1	/* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 6f
#else
	beqz	a10, 6f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
SRC(	l8ui	a9, a2, 0	)
SRC(	l8ui	a8, a2, 1	)
DST(	s8i	a9, a3, 0	)
DST(	s8i	a8, a3, 1	)
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* combine into a single 16-bit value */
#else				/* for checksum computation */
	slli	a8, a8, 8
#endif
	or	a9, a9, a8
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop8
#endif
6:
	j	4b		/* process the possible trailing odd byte */

ENDPROC(csum_partial_copy_generic)


# Exception handler:
.section .fixup, "ax"
/*
	a6  = src_err_ptr
	a7  = dst_err_ptr
	a11 = original len for exception handling
	a12 = original dst for exception handling
*/

6001:
	_movi	a2, -EFAULT
	s32i	a2, a6, 0	/* src_err_ptr */

	# clear the complete destination - computing the rest
	# is too much work
	movi	a2, 0
#if XCHAL_HAVE_LOOPS
	loopgtz	a11, 2f
#else
	beqz	a11, 2f
	add	a11, a11, a12	/* a11 = ending address */
.Leloop:
#endif
	s8i	a2, a12, 0
	addi	a12, a12, 1
#if !XCHAL_HAVE_LOOPS
	blt	a12, a11, .Leloop
#endif
2:
	retw

6002:
	movi	a2, -EFAULT
	s32i	a2, a7, 0	/* dst_err_ptr */
	movi	a2, 0
	retw

.previous
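
/*
 * For reference, the fixup handlers above behave roughly like the C below
 * (a sketch only; they are actually reached through the __ex_table entries
 * emitted by the SRC and DST macros, and both paths return 0 as the
 * checksum value):
 *
 *	// a faulting source load lands in 6001:
 *	void src_fault(int *src_err_ptr, char *orig_dst, int orig_len)
 *	{
 *		*src_err_ptr = -EFAULT;
 *		memset(orig_dst, 0, orig_len);	// clearing the destination is
 *						// cheaper than finishing the
 *						// checksum after a fault
 *	}
 *
 *	// a faulting destination store lands in 6002:
 *	void dst_fault(int *dst_err_ptr)
 *	{
 *		*dst_err_ptr = -EFAULT;
 *	}
 */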