/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 *
 * Returns the 32-bit folded sum in r3.
 *
 * Register usage:
 *	r0		running 64-bit sum; carries are chained through
 *			XER[CA] by adde and folded in at the end by addze
 *	r3		current read pointer
 *	r4		bytes remaining
 *	r6,r7,r9-r12	scratch / loaded data
 *	r14-r16		loaded data in the unrolled loop only; they are
 *			saved to and restored from a temporary stack frame
 *			around that loop
 *	ctr		loop counter
 *
 * Every instruction between the initial addic and the final addze must
 * leave XER[CA] alone unless it is an adde on the running sum.
 */
_GLOBAL(__csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6		/* halfwords up to the next 8 byte boundary */
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1			/* the exit limb handles the last 64 bytes */
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	/* Entry limb: preload the first four doublewords */
	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align	5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)		/* preload for the next 64 byte block */
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	/* Exit limb: sum the final 64 byte block, nothing left to preload */
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
#ifdef __LITTLE_ENDIAN__
	/*
	 * The multi-byte loads above already place an even-offset byte in
	 * the low half of its 16-bit lane, so the last byte goes in as is.
	 */
	adde	r0,r0,r6
#else
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#endif

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0		/* upper word = hi + lo + carry from the lower word */
	srdi	r3,r3,32
	blr


/*
 * Exception table helpers for csum_partial_copy_generic. Each macro plants
 * a local label at the instruction that follows it and emits an __ex_table
 * entry pairing that address with a fixup label:
 *
 *	source / dest	-> .Lsrc_error / .Ldest_error, which first restore
 *			   r14-r16 and pop the stack frame. Use these only
 *			   while that frame is live.
 *	srcnr / dstnr	-> .Lsrc_error_nr / .Ldest_error_nr, which skip the
 *			   restore. Use these everywhere else.
 */
	.macro srcnr
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error_nr
	.previous
	.endm

	.macro source
150:
	.section __ex_table,"a"
	.align 3
	.llong 150b,.Lsrc_error
	.previous
	.endm

	.macro dstnr
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error_nr
	.previous
	.endm

	.macro dest
250:
	.section __ex_table,"a"
	.align 3
	.llong 250b,.Ldest_error
	.previous
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * (as a 32-bit word) to *src_err or *dst_err respectively, provided that
 * pointer is not NULL, and returns immediately. The value left in r3 is
 * not a valid checksum in that case. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 *
 * Returns the 32-bit folded sum in r3 when no exception occurred.
 *
 * Register usage:
 *	r0		running 64-bit sum; carries are chained through
 *			XER[CA] by adde and folded in at the end by addze
 *	r3, r4		current source / destination pointers
 *	r5		bytes remaining
 *	r7, r8		src_err / dst_err; must stay intact for the fixup
 *			handlers, so they are never used as scratch
 *	r6,r9-r12	scratch / loaded data
 *	r14-r16		loaded data in the unrolled loop only; they are
 *			saved to and restored from a temporary stack frame
 *			around that loop
 *	ctr		loop counter
 */
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6		/* halfwords up to the next 8 byte boundary */
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1			/* the exit limb handles the last 64 bytes */
	mtctr	r6

	/*
	 * From here until the frame is popped below, faulting accesses
	 * must be tagged with source/dest rather than srcnr/dstnr.
	 */
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	/* Entry limb: preload the first four doublewords */
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align	5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)		/* preload for the next 64 byte block */
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	/* Exit limb: sum and store the final 64 byte block, no preload */
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
#ifdef __LITTLE_ENDIAN__
	/*
	 * The multi-byte loads above already place an even-offset byte in
	 * the low half of its 16-bit lane, so the last byte goes in as is.
	 */
	adde	r0,r0,r6
#else
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#endif
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0		/* upper word = hi + lo + carry from the lower word */
	srdi	r3,r3,32
	blr

/*
 * Fixup targets named in the __ex_table entries above. The plain variants
 * are entered with the unrolled loop's stack frame still live, so they
 * restore r14-r16 and pop the frame before falling through into the _nr
 * variants, which report the error and return to the caller.
 */
.Lsrc_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
	cmpdi	0,r7,0			/* src_err == NULL? */
	beqlr				/* yes: nothing to report */
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
	cmpdi	0,r8,0			/* dst_err == NULL? */
	beqlr				/* yes: nothing to report */
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr