/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in words and is always >= 5.
 *
 * In practice len == 5, but this is not guaranteed.  So this code does not
 * attempt to use doubleword instructions.
 */
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)
	lwzu	r5,4(r3)
	addic.	r4,r4,-2
	addc	r0,r0,r5
	mtctr	r4
	blelr-
1:	lwzu	r4,4(r3)
	adde	r0,r0,r4
	bdnz	1b
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32-bit halves together */
	add	r0,r0,r4
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31		/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr
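
/*
 * For reference, a rough C equivalent of the routine above.  This is an
 * illustrative sketch only, not the kernel's ip_fast_csum: the name
 * ip_fast_csum_ref and the plain C types are made up for this comment,
 * ihl is the header length in 32-bit words, and big-endian byte order is
 * assumed (as on the CPUs this file was written for).
 *
 *	static unsigned short ip_fast_csum_ref(const void *iph,
 *					       unsigned int ihl)
 *	{
 *		const unsigned int *p = iph;
 *		unsigned long long sum = 0;
 *		unsigned int i;
 *
 *		for (i = 0; i < ihl; i++)
 *			sum += p[i];
 *		while (sum >> 16)
 *			sum = (sum & 0xffff) + (sum >> 16);
 *		return (unsigned short)~sum;
 *	}
 */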

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * csum_partial(r3=buff, r4=len, r5=sum)
 */
_GLOBAL(csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a doubleword.  Since odd-aligned
	 * addresses should be rare and would require more work to calculate
	 * the correct checksum, we ignore that case and take the potential
	 * slowdown of unaligned loads.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords	/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back-to-back addes take 2 cycles because of
	 * the XER dependency.  This means the fastest this loop can go is
	 * 16 cycles per iteration.  The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32-bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
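
/*
 * Rough C model of what csum_partial computes, for readers following the
 * assembly.  This is an illustrative sketch only, not kernel code: the
 * name csum_partial_ref and the plain C types are made up here, and
 * big-endian byte order is assumed.  The 32-bit value it returns may
 * differ bit-for-bit from the one produced above, but it folds to the
 * same 16-bit checksum.
 *
 *	static unsigned int csum_partial_ref(const void *buff, int len,
 *					     unsigned int sum)
 *	{
 *		const unsigned char *p = buff;
 *		unsigned long long s = sum;
 *
 *		for (; len > 1; len -= 2, p += 2)
 *			s += (p[0] << 8) + p[1];
 *		if (len)
 *			s += p[0] << 8;
 *		while (s >> 32)
 *			s = (s & 0xffffffff) + (s >> 32);
 *		return (unsigned int)s;
 *	}
 */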

	.macro srcnr
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error_nr
	.previous
	.endm

	.macro source
150:
	.section __ex_table,"a"
	.align 3
	.llong 150b,.Lsrc_error
	.previous
	.endm

	.macro dstnr
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error_nr
	.previous
	.endm

	.macro dest
250:
	.section __ex_table,"a"
	.align 3
	.llong 250b,.Ldest_error
	.previous
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively.  The caller must take any action
 * required in this case (zeroing memory, recalculating the partial
 * checksum, etc.).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
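
/*
 * Illustrative C sketch of the copy-and-checksum behaviour described
 * above (not the kernel's implementation; the name csum_copy_ref and the
 * plain C types are made up, and big-endian byte order is assumed).  The
 * fault reporting through src_err/dst_err cannot be modelled in plain C;
 * in the assembly it is provided by the exception-table macros above and
 * the error labels at the end of this file.
 *
 *	static unsigned int csum_copy_ref(const unsigned char *src,
 *					  unsigned char *dst, int len,
 *					  unsigned int sum)
 *	{
 *		unsigned long long s = sum;
 *
 *		for (; len > 1; len -= 2, src += 2, dst += 2) {
 *			dst[0] = src[0];
 *			dst[1] = src[1];
 *			s += (src[0] << 8) + src[1];
 *		}
 *		if (len) {
 *			dst[0] = src[0];
 *			s += src[0] << 8;
 *		}
 *		while (s >> 32)
 *			s = (s & 0xffffffff) + (s >> 32);
 *		return (unsigned int)s;
 *	}
 */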
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a doubleword.  Since odd-aligned
	 * addresses should be rare and would require more work to calculate
	 * the correct checksum, we ignore that case and take the potential
	 * slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned, we only
	 * align the source.  This keeps things simple.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords	/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back-to-back addes take 2 cycles because of
	 * the XER dependency.  This means the fastest this loop can go is
	 * 16 cycles per iteration.  The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32-bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr
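
/*
 * Note on the error paths above: every load tagged with the "source" or
 * "srcnr" macro and every store tagged with "dest" or "dstnr" gets an
 * __ex_table entry pairing the address of that instruction with one of
 * the labels above.  For example, a use such as
 *
 *	srcnr;	lhz	r6,0(r3)
 *
 * assembles to the lhz plus an entry equivalent to
 *
 *	.section __ex_table,"a"
 *	.align 3
 *	.llong <address of the lhz>,.Lsrc_error_nr
 *	.previous
 *
 * so a fault on that load is redirected to .Lsrc_error_nr, which reports
 * -EFAULT through the src_err pointer.  The "source"/"dest" variants
 * point at .Lsrc_error/.Ldest_error instead, which first restore the
 * non-volatile registers and pop the stack frame used by the unrolled
 * 64-byte loop.
 */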