/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
 *                  Optimized by Joe Taylor
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <asm/errno.h>
#include <linux/linkage.h>
#include <variant/core.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *                           unsigned int sum);
 *    a2 = buf
 *    a3 = len
 *    a4 = sum
 *
 *    This function is optimized for 2- and 4-byte aligned buffers.
 *    Odd alignment is handled, but very slowly.
 */

/* ONES_ADD converts twos-complement math to ones-complement (end-around
 * carry); see the reference C sketch at the end of this file. */
#define ONES_ADD(sum, val)        \
        add     sum, sum, val   ; \
        bgeu    sum, val, 99f   ; \
        addi    sum, sum, 1     ; \
99:                             ;

.text
ENTRY(csum_partial)

        /*
         * Experiments with Ethernet and SLIP connections show that buf
         * is aligned on either a 2-byte or 4-byte boundary.
         */
        entry   sp, 32
        extui   a5, a2, 0, 2
        bnez    a5, 8f          /* branch if not 4-byte aligned */
        /* Fall-through on common case, 4-byte alignment */
1:
        srli    a5, a3, 5       /* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a5, 2f
#else
        beqz    a5, 2f
        slli    a5, a5, 5
        add     a5, a5, a2      /* a5 = end of last 32-byte chunk */
.Loop1:
#endif
        l32i    a6, a2, 0
        l32i    a7, a2, 4
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        l32i    a6, a2, 8
        l32i    a7, a2, 12
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        l32i    a6, a2, 16
        l32i    a7, a2, 20
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        l32i    a6, a2, 24
        l32i    a7, a2, 28
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        addi    a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
        blt     a2, a5, .Loop1
#endif
2:
        extui   a5, a3, 2, 3    /* remaining 4-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a5, 3f
#else
        beqz    a5, 3f
        slli    a5, a5, 2
        add     a5, a5, a2      /* a5 = end of last 4-byte chunk */
.Loop2:
#endif
        l32i    a6, a2, 0
        ONES_ADD(a4, a6)
        addi    a2, a2, 4
#if !XCHAL_HAVE_LOOPS
        blt     a2, a5, .Loop2
#endif
3:
        _bbci.l a3, 1, 5f       /* remaining 2-byte chunk */
        l16ui   a6, a2, 0
        ONES_ADD(a4, a6)
        addi    a2, a2, 2
5:
        _bbci.l a3, 0, 7f       /* remaining 1-byte chunk */
6:      l8ui    a6, a2, 0
#ifdef __XTENSA_EB__
        slli    a6, a6, 8       /* load byte into bits 8..15 */
#endif
        ONES_ADD(a4, a6)
7:
        mov     a2, a4
        retw

        /* uncommon case, buf is not 4-byte aligned */
8:
        beqz    a3, 7b          /* branch if len == 0 */
        beqi    a3, 1, 6b       /* branch if len == 1 */

        extui   a5, a2, 0, 1
        bnez    a5, 8f          /* branch if odd (1-byte aligned) */

        l16ui   a6, a2, 0       /* common case, len >= 2 */
        ONES_ADD(a4, a6)
        addi    a2, a2, 2       /* adjust buf */
        addi    a3, a3, -2      /* adjust len */
        j       1b              /* now buf is 4-byte aligned */

        /* case: odd-byte aligned, len > 1
         * This case is dog slow, so don't give us an odd address.
         * (I don't think this ever happens, but just in case.)
         */
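        /* Illustrative sketch only (assumed-equivalent C, not part of this
         * file; "p", "sum" and ones_add() are invented names): the loop
         * below rebuilds each misaligned word from one byte, one halfword,
         * and one byte load, so the word can be summed exactly as if it had
         * been fetched with an aligned l32i.  For a little-endian
         * configuration and an odd pointer p:
         *
         *      unsigned int w = (unsigned int)p[0]
         *                     | ((unsigned int)p[1] << 8)
         *                     | ((unsigned int)p[2] << 16)
         *                     | ((unsigned int)p[3] << 24);
         *      sum = ones_add(sum, w);  // hypothetical C twin of ONES_ADD
         */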
8:
        srli    a5, a3, 2       /* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a5, 2f
#else
        beqz    a5, 2f
        slli    a5, a5, 2
        add     a5, a5, a2      /* a5 = end of last 4-byte chunk */
.Loop3:
#endif
        l8ui    a6, a2, 0       /* bits 24..31 */
        l16ui   a7, a2, 1       /* bits  8..23 */
        l8ui    a8, a2, 3       /* bits  0.. 7 */
#ifdef __XTENSA_EB__
        slli    a6, a6, 24
#else
        slli    a8, a8, 24
#endif
        slli    a7, a7, 8
        or      a7, a7, a6
        or      a7, a7, a8
        ONES_ADD(a4, a7)
        addi    a2, a2, 4
#if !XCHAL_HAVE_LOOPS
        blt     a2, a5, .Loop3
#endif
2:
        _bbci.l a3, 1, 3f       /* remaining 2-byte chunk, still odd addr */
        l8ui    a6, a2, 0
        l8ui    a7, a2, 1
#ifdef __XTENSA_EB__
        slli    a6, a6, 8
#else
        slli    a7, a7, 8
#endif
        or      a7, a7, a6
        ONES_ADD(a4, a7)
        addi    a2, a2, 2
3:
        j       5b              /* branch to handle the remaining byte */

ENDPROC(csum_partial)

/*
 * Copy from src to dst while checksumming, otherwise like csum_partial.
 *
 * The macros SRC and DST specify the type of access for the instruction,
 * so that we can call a custom exception handler for each access type.
 */

#define SRC(y...)       \
        9999: y;        \
        .section __ex_table, "a"; \
        .long 9999b, 6001f; \
        .previous

#define DST(y...)       \
        9999: y;        \
        .section __ex_table, "a"; \
        .long 9999b, 6002f; \
        .previous

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst, int len,
                                        int sum, int *src_err_ptr, int *dst_err_ptr)
        a2  = src
        a3  = dst
        a4  = len
        a5  = sum
        a6  = src_err_ptr
        a7  = dst_err_ptr
        a8  = temp
        a9  = temp
        a10 = temp
        a11 = original len for exception handling
        a12 = original dst for exception handling

        This function is optimized for 4-byte aligned addresses.  Other
        alignments work, but not nearly as efficiently.
 */

ENTRY(csum_partial_copy_generic)

        entry   sp, 32
        mov     a12, a3
        mov     a11, a4
        or      a10, a2, a3

        /* We optimize the following alignment tests for the 4-byte
           aligned case.  Two bbsi.l instructions might seem more optimal
           (commented out below).  However, both labels 5: and 3: are out
           of the imm8 range, so the assembler relaxes them into
           equivalent bbci.l, j combinations, which is actually
           slower.  */
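        /* Roughly, the alignment dispatch below does the following
         * (illustrative C only; the label names are invented here for
         * readability):
         *
         *      unsigned long bits = (unsigned long)src | (unsigned long)dst;
         *      if ((bits & 3) == 0)
         *              goto aligned_4byte;     // label 1: 32-bit copy loops
         *      if (bits & 1)
         *              goto odd_address;       // label 5: 8-bit copy loop
         *      goto aligned_2byte;             // label 3: 16-bit copy loop
         */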

        extui   a9, a10, 0, 2
        beqz    a9, 1f          /* branch if both are 4-byte aligned */
        bbsi.l  a10, 0, 5f      /* branch if one address is odd */
        j       3f              /* one address is 2-byte aligned */

/*      _bbsi.l a10, 0, 5f */   /* branch if odd address */
/*      _bbsi.l a10, 1, 3f */   /* branch if 2-byte-aligned address */

1:
        /* src and dst are both 4-byte aligned */
        srli    a10, a4, 5      /* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 2f
#else
        beqz    a10, 2f
        slli    a10, a10, 5
        add     a10, a10, a2    /* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
SRC(    l32i    a9, a2, 0       )
SRC(    l32i    a8, a2, 4       )
DST(    s32i    a9, a3, 0       )
DST(    s32i    a8, a3, 4       )
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
SRC(    l32i    a9, a2, 8       )
SRC(    l32i    a8, a2, 12      )
DST(    s32i    a9, a3, 8       )
DST(    s32i    a8, a3, 12      )
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
SRC(    l32i    a9, a2, 16      )
SRC(    l32i    a8, a2, 20      )
DST(    s32i    a9, a3, 16      )
DST(    s32i    a8, a3, 20      )
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
SRC(    l32i    a9, a2, 24      )
SRC(    l32i    a8, a2, 28      )
DST(    s32i    a9, a3, 24      )
DST(    s32i    a8, a3, 28      )
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
        addi    a2, a2, 32
        addi    a3, a3, 32
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop5
#endif
2:
        extui   a10, a4, 2, 3   /* remaining 4-byte chunks */
        extui   a4, a4, 0, 2    /* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 3f
#else
        beqz    a10, 3f
        slli    a10, a10, 2
        add     a10, a10, a2    /* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
SRC(    l32i    a9, a2, 0       )
DST(    s32i    a9, a3, 0       )
        ONES_ADD(a5, a9)
        addi    a2, a2, 4
        addi    a3, a3, 4
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop6
#endif
3:
        /*
           Control comes here in two cases: (1) it may fall through from
           the 4-byte alignment case above to process, at most, one 2-byte
           chunk; (2) it is branched to from above when either src or dst
           is only 2-byte aligned, and we process all remaining bytes here,
           except for perhaps a trailing odd byte.  This path is
           inefficient, so align your addresses to 4-byte boundaries.

           a2 = src
           a3 = dst
           a4 = len
           a5 = sum
        */
        srli    a10, a4, 1      /* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 4f
#else
        beqz    a10, 4f
        slli    a10, a10, 1
        add     a10, a10, a2    /* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
SRC(    l16ui   a9, a2, 0       )
DST(    s16i    a9, a3, 0       )
        ONES_ADD(a5, a9)
        addi    a2, a2, 2
        addi    a3, a3, 2
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop7
#endif
4:
        /* This section processes a possible trailing odd byte. */
        _bbci.l a4, 0, 8f       /* 1-byte chunk */
SRC(    l8ui    a9, a2, 0       )
DST(    s8i     a9, a3, 0       )
#ifdef __XTENSA_EB__
        slli    a9, a9, 8       /* shift byte to bits 8..15 */
#endif
        ONES_ADD(a5, a9)
8:
        mov     a2, a5
        retw

5:
        /* Control branches here when either src or dst is odd.  We
           process all bytes using 8-bit accesses.  Grossly inefficient,
           so don't feed us an odd address.
         */
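        /* Sketch of the loop below (illustrative C only; little-endian
         * shown, the big-endian build swaps which byte is shifted): bytes
         * are copied one at a time, but summed in pairs so that each pair
         * contributes a single 16-bit term to the ones-complement sum:
         *
         *      unsigned int pair = src[0] | ((unsigned int)src[1] << 8);
         *      dst[0] = src[0];
         *      dst[1] = src[1];
         *      sum = ones_add(sum, pair);  // same end-around carry as ONES_ADD
         */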

        srli    a10, a4, 1      /* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 6f
#else
        beqz    a10, 6f
        slli    a10, a10, 1
        add     a10, a10, a2    /* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
SRC(    l8ui    a9, a2, 0       )
SRC(    l8ui    a8, a2, 1       )
DST(    s8i     a9, a3, 0       )
DST(    s8i     a8, a3, 1       )
#ifdef __XTENSA_EB__
        slli    a9, a9, 8       /* combine into a single 16-bit value */
#else                           /* for checksum computation */
        slli    a8, a8, 8
#endif
        or      a9, a9, a8
        ONES_ADD(a5, a9)
        addi    a2, a2, 2
        addi    a3, a3, 2
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop8
#endif
6:
        j       4b              /* process the possible trailing odd byte */

ENDPROC(csum_partial_copy_generic)


# Exception handler:
.section .fixup, "ax"
/*
        a6  = src_err_ptr
        a7  = dst_err_ptr
        a11 = original len for exception handling
        a12 = original dst for exception handling
*/

6001:
        movi    a2, -EFAULT
        s32i    a2, a6, 0       /* src_err_ptr */

        # clear the complete destination - computing the rest
        # is too much work
        movi    a2, 0
#if XCHAL_HAVE_LOOPS
        loopgtz a11, 2f
#else
        beqz    a11, 2f
        add     a11, a11, a12   /* a11 = ending address */
.Leloop:
#endif
        s8i     a2, a12, 0
        addi    a12, a12, 1
#if !XCHAL_HAVE_LOOPS
        blt     a12, a11, .Leloop
#endif
2:
        retw

6002:
        movi    a2, -EFAULT
        s32i    a2, a7, 0       /* dst_err_ptr */
        movi    a2, 0
        retw

.previous
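/*
 * For reference only (this sketch is not assembled or built): the ONES_ADD
 * macro used throughout is roughly equivalent to the C helper below.  It is
 * a 32-bit two's-complement add whose carry out of bit 31 is folded back in
 * (end-around carry), which keeps the running value a valid ones-complement
 * partial checksum.  The name ones_add() is invented for this sketch.
 *
 *      static inline unsigned int ones_add(unsigned int sum, unsigned int val)
 *      {
 *              sum += val;
 *              if (sum < val)          // the add wrapped around, i.e. carried out
 *                      sum += 1;       // fold the carry back into the sum
 *              return sum;
 *      }
 *
 * csum_partial() and csum_partial_copy_generic() apply this to the buffer in
 * 32-, 16-, and 8-bit chunks, exactly as the loops above do.
 */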