/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
 *		    Optimized by Joe Taylor
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <asm/errno.h>
#include <linux/linkage.h>
#define _ASMLANGUAGE
#include <xtensa/config/core.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *			     unsigned int sum);
 *    a2 = buf
 *    a3 = len
 *    a4 = sum
 *
 * This function assumes 2- or 4-byte alignment.  Other alignments will fail!
 */

/* ONES_ADD converts twos-complement math to ones-complement. */
#define ONES_ADD(sum, val)	  \
	add	sum, sum, val	; \
	bgeu	sum, val, 99f	; \
	addi	sum, sum, 1	; \
99:				;

.text
ENTRY(csum_partial)
	/*
	 * Experiments with Ethernet and SLIP connections show that buf
	 * is aligned on either a 2-byte or 4-byte boundary.
	 */
	entry	sp, 32
	extui	a5, a2, 0, 2
	bnez	a5, 8f		/* branch if 2-byte aligned */
	/* Fall-through on common case, 4-byte alignment */
1:
	srli	a5, a3, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 5
	add	a5, a5, a2	/* a5 = end of last 32-byte chunk */
.Loop1:
#endif
	l32i	a6, a2, 0
	l32i	a7, a2, 4
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 8
	l32i	a7, a2, 12
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 16
	l32i	a7, a2, 20
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 24
	l32i	a7, a2, 28
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	addi	a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop1
#endif
2:
	extui	a5, a3, 2, 3	/* remaining 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 3f
#else
	beqz	a5, 3f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop2:
#endif
	l32i	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop2
#endif
3:
	_bbci.l	a3, 1, 5f	/* remaining 2-byte chunk */
	l16ui	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 2
5:
	_bbci.l	a3, 0, 7f	/* remaining 1-byte chunk */
6:	l8ui	a6, a2, 0
#ifdef __XTENSA_EB__
	slli	a6, a6, 8	/* load byte into bits 8..15 */
#endif
	ONES_ADD(a4, a6)
7:
	mov	a2, a4
	retw

	/* uncommon case, buf is 2-byte aligned */
8:
	beqz	a3, 7b		/* branch if len == 0 */
	beqi	a3, 1, 6b	/* branch if len == 1 */

	extui	a5, a2, 0, 1
	bnez	a5, 8f		/* branch if 1-byte aligned */

	l16ui	a6, a2, 0	/* common case, len >= 2 */
	ONES_ADD(a4, a6)
	addi	a2, a2, 2	/* adjust buf */
	addi	a3, a3, -2	/* adjust len */
	j	1b		/* now buf is 4-byte aligned */

	/* case:  odd-byte aligned, len > 1
	 * This case is dog slow, so don't give us an odd address.
	 * (I don't think this ever happens, but just in case.)
	 */
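	/*
	 * For illustration: with bytes B0 B1 B2 B3 at buf..buf+3, the loop
	 * below picks up B0 with l8ui, B1:B2 with l16ui (buf+1 is even, so
	 * that halfword load is naturally aligned), and B3 with l8ui.
	 * After the endian-dependent shifts and ORs, a7 holds the same
	 * 32-bit value an aligned l32i would have returned, so ONES_ADD
	 * folds it into the checksum exactly as in the aligned loops above.
	 */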
8:
	srli	a5, a3, 2	/* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop3:
#endif
	l8ui	a6, a2, 0	/* bits 24..31 */
	l16ui	a7, a2, 1	/* bits  8..23 */
	l8ui	a8, a2, 3	/* bits  0..7  */
#ifdef	__XTENSA_EB__
	slli	a6, a6, 24
#else
	slli	a8, a8, 24
#endif
	slli	a7, a7, 8
	or	a7, a7, a6
	or	a7, a7, a8
	ONES_ADD(a4, a7)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop3
#endif
2:
	_bbci.l	a3, 1, 3f	/* remaining 2-byte chunk, still odd addr */
	l8ui	a6, a2, 0
	l8ui	a7, a2, 1
#ifdef	__XTENSA_EB__
	slli	a6, a6, 8
#else
	slli	a7, a7, 8
#endif
	or	a7, a7, a6
	ONES_ADD(a4, a7)
	addi	a2, a2, 2
3:
	j	5b		/* branch to handle the remaining byte */



/*
 * Copy from src to dst while checksumming, otherwise like csum_partial.
 *
 * The macros SRC and DST specify the type of access for the instruction,
 * so we can call a custom exception handler for each access type.
 */

#define SRC(y...)			\
	9999: y;			\
	.section __ex_table, "a";	\
	.long 9999b, 6001f	;	\
	.previous

#define DST(y...)			\
	9999: y;			\
	.section __ex_table, "a";	\
	.long 9999b, 6002f	;	\
	.previous

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst, int len,
					int sum, int *src_err_ptr, int *dst_err_ptr)
	a2  = src
	a3  = dst
	a4  = len
	a5  = sum
	a6  = src_err_ptr
	a7  = dst_err_ptr
	a8  = temp
	a9  = temp
	a10 = temp
	a11 = original len for exception handling
	a12 = original dst for exception handling

	This function is optimized for 4-byte aligned addresses.  Other
	alignments work, but not nearly as efficiently.
 */

ENTRY(csum_partial_copy_generic)
	entry	sp, 32
	mov	a12, a3
	mov	a11, a4
	or	a10, a2, a3

	/* We optimize the following alignment tests for the 4-byte
	   aligned case.  Two bbsi.l instructions might seem more optimal
	   (commented out below).  However, both labels 5: and 3: are out
	   of the imm8 range, so the assembler relaxes them into
	   equivalent bbci.l, j combinations, which is actually
	   slower. */
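	/* For example: or'ing the two addresses means the low two bits of
	   a10 are nonzero iff either src or dst is misaligned.  If src
	   ends in ...10 and dst ends in ...00, a10 ends in ...10, so the
	   beqz below is not taken (not both 4-byte aligned), bit 0 is
	   clear (neither is odd), and control falls into the 2-byte path
	   at 3f.  Only when both addresses end in ...00 does the 32-byte
	   loop at 1f run. */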

	extui	a9, a10, 0, 2
	beqz	a9, 1f		/* branch if both are 4-byte aligned */
	bbsi.l	a10, 0, 5f	/* branch if one address is odd */
	j	3f		/* one address is 2-byte aligned */

/*	_bbsi.l	a10, 0, 5f */	/* branch if odd address */
/*	_bbsi.l	a10, 1, 3f */	/* branch if 2-byte-aligned address */

1:
	/* src and dst are both 4-byte aligned */
	srli	a10, a4, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 2f
#else
	beqz	a10, 2f
	slli	a10, a10, 5
	add	a10, a10, a2	/* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
SRC(	l32i	a9, a2, 0	)
SRC(	l32i	a8, a2, 4	)
DST(	s32i	a9, a3, 0	)
DST(	s32i	a8, a3, 4	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 8	)
SRC(	l32i	a8, a2, 12	)
DST(	s32i	a9, a3, 8	)
DST(	s32i	a8, a3, 12	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 16	)
SRC(	l32i	a8, a2, 20	)
DST(	s32i	a9, a3, 16	)
DST(	s32i	a8, a3, 20	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 24	)
SRC(	l32i	a8, a2, 28	)
DST(	s32i	a9, a3, 24	)
DST(	s32i	a8, a3, 28	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
	addi	a2, a2, 32
	addi	a3, a3, 32
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop5
#endif
2:
	extui	a10, a4, 2, 3	/* remaining 4-byte chunks */
	extui	a4, a4, 0, 2	/* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 3f
#else
	beqz	a10, 3f
	slli	a10, a10, 2
	add	a10, a10, a2	/* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
SRC(	l32i	a9, a2, 0	)
DST(	s32i	a9, a3, 0	)
	ONES_ADD(a5, a9)
	addi	a2, a2, 4
	addi	a3, a3, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop6
#endif
3:
	/*
	   Control comes to here in two cases: (1) It may fall through
	   to here from the 4-byte alignment case to process, at most,
	   one 2-byte chunk.  (2) It branches to here from above if
	   either src or dst is 2-byte aligned, and we process all bytes
	   here, except for perhaps a trailing odd byte.  It's
	   inefficient, so align your addresses to 4-byte boundaries.

	   a2 = src
	   a3 = dst
	   a4 = len
	   a5 = sum
	*/
	srli	a10, a4, 1	/* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 4f
#else
	beqz	a10, 4f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
SRC(	l16ui	a9, a2, 0	)
DST(	s16i	a9, a3, 0	)
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop7
#endif
4:
	/* This section processes a possible trailing odd byte. */
	_bbci.l	a4, 0, 8f	/* 1-byte chunk */
SRC(	l8ui	a9, a2, 0	)
DST(	s8i	a9, a3, 0	)
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* shift byte to bits 8..15 */
#endif
	ONES_ADD(a5, a9)
8:
	mov	a2, a5
	retw

5:
	/* Control branches to here when either src or dst is odd.  We
	   process all bytes using 8-bit accesses.  Grossly inefficient,
	   so don't feed us an odd address. */
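	/* The Internet checksum is defined over 16-bit words, so even this
	   byte-at-a-time path combines bytes in pairs before ONES_ADD.
	   For example, bytes 0x12 0x34 yield a9 = 0x1234 on big-endian and
	   a9 = 0x3412 on little-endian, matching what l16ui would return
	   at an even address.  A trailing odd byte is handled by jumping
	   back to 4b above. */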

	srli	a10, a4, 1	/* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 6f
#else
	beqz	a10, 6f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
SRC(	l8ui	a9, a2, 0	)
SRC(	l8ui	a8, a2, 1	)
DST(	s8i	a9, a3, 0	)
DST(	s8i	a8, a3, 1	)
#ifdef	__XTENSA_EB__
	slli	a9, a9, 8	/* combine into a single 16-bit value */
#else				/* for checksum computation */
	slli	a8, a8, 8
#endif
	or	a9, a9, a8
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop8
#endif
6:
	j	4b		/* process the possible trailing odd byte */


# Exception handler:
.section .fixup, "ax"
/*
	a6  = src_err_ptr
	a7  = dst_err_ptr
	a11 = original len for exception handling
	a12 = original dst for exception handling
*/

6001:
	_movi	a2, -EFAULT
	s32i	a2, a6, 0	/* src_err_ptr */

	# clear the complete destination - computing the rest
	# is too much work
	movi	a2, 0
#if XCHAL_HAVE_LOOPS
	loopgtz	a11, 2f
#else
	beqz	a11, 2f
	add	a11, a11, a12	/* a11 = ending address */
.Leloop:
#endif
	s8i	a2, a12, 0
	addi	a12, a12, 1
#if !XCHAL_HAVE_LOOPS
	blt	a12, a11, .Leloop
#endif
2:
	retw

6002:
	movi	a2, -EFAULT
	s32i	a2, a7, 0	/* dst_err_ptr */
	movi	a2, 0
	retw

.previous
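/*
 * For reference, a minimal sketch of how a caller typically folds the
 * 32-bit partial sum returned in a2 into the final 16-bit Internet
 * checksum.  The helper name is illustrative and not part of this file;
 * it assumes the usual csum_fold() behaviour:
 *
 *	static inline unsigned short fold_csum(unsigned int sum)
 *	{
 *		sum = (sum & 0xffff) + (sum >> 16);	// add the two halves
 *		sum = (sum & 0xffff) + (sum >> 16);	// absorb any carry
 *		return (unsigned short)~sum;		// one's complement
 *	}
 */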