/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in words and is always >= 5.
 *
 * In practice len == 5, but this is not guaranteed. So this code does not
 * attempt to use doubleword instructions.
 */
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)
	lwzu	r5,4(r3)
	addic.	r4,r4,-2
	addc	r0,r0,r5
	mtctr	r4
	blelr-
1:	lwzu	r4,4(r3)
	adde	r0,r0,r4
	bdnz	1b
	addze	r0,r0		/* add in final carry */
	rldicl	r4,r0,32,0	/* fold two 32-bit halves together */
	add	r0,r0,r4
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr

/*
 * Compute checksum of TCP or UDP pseudo-header:
 * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
 * No real gain trying to do this specially for 64 bit, but
 * the 32 bit addition may spill into the upper bits of
 * the doubleword so we still must fold it down from 64.
 */
_GLOBAL(csum_tcpudp_magic)
	rlwimi	r5,r6,16,0,15	/* put proto in upper half of len */
	addc	r0,r3,r4	/* add 4 32-bit words together */
	adde	r0,r0,r5
	adde	r0,r0,r7
	rldicl	r4,r0,32,0	/* fold 64 bit value */
	add	r0,r4,r0
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * csum_partial(r3=buff, r4=len, r5=sum)
 */
_GLOBAL(csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords	/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)
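	/*
	 * The unrolled loop below keeps a running 64-bit sum in r0,
	 * folding each carry back in through the adde chain and a
	 * final addze.  A rough C sketch of the arithmetic
	 * (illustrative only; "buf64" and "nwords" are not real
	 * symbols here):
	 *
	 *	u64 s = sum, carry = 0;
	 *	for (i = 0; i < nwords; i++) {
	 *		unsigned __int128 t = (unsigned __int128)s + buf64[i] + carry;
	 *		s = (u64)t;
	 *		carry = (u64)(t >> 64);		// the CA bit
	 *	}
	 *	s += carry;				// the final addze
	 */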
	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr


	.macro srcnr
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error_nr
	.previous
	.endm

	.macro source
150:
	.section __ex_table,"a"
	.align 3
	.llong 150b,.Lsrc_error
	.previous
	.endm

	.macro dstnr
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error_nr
	.previous
	.endm

	.macro dest
250:
	.section __ex_table,"a"
	.align 3
	.llong 250b,.Ldest_error
	.previous
	.endm
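/*
 * Each of the macros above tags the load or store that immediately
 * follows it with an exception-table entry.  For example
 *
 *	srcnr;	lhz	r6,0(r3)
 *
 * expands to roughly
 *
 *	100:	lhz	r6,0(r3)
 *
 * plus an (address, fixup) pair (100b, .Lsrc_error_nr) in __ex_table,
 * so a fault on that access branches to the fixup instead of oopsing.
 * The "nr" variants are used by accesses made while no stack frame is
 * active; .Lsrc_error/.Ldest_error additionally restore r14-r16 and
 * pop the frame before reporting the error.
 */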
/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords	/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2
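	/*
	 * As in .Lcsum_tail_byte above, a trailing odd byte is padded
	 * out to a halfword before being added (the sldi below), i.e.
	 * it is treated as the high-order byte of a 16-bit word,
	 * matching where a big-endian halfword load would have placed
	 * it.
	 */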
.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr
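/*
 * Note on the final fold used by .Lcsum_finish and .Lcopy_finish
 * above: rotating the 64-bit sum by 32 bits and adding sums the two
 * 32-bit halves while the carry between them propagates into the
 * upper half, so the 32-bit result can be taken with a single shift.
 * Roughly, in C:
 *
 *	u64 t = sum + rol64(sum, 32);
 *	return (u32)(t >> 32);		// 32-bit partial checksum
 */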