/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
_GLOBAL(__csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align	5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
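	/*
	 * The rotate/add/shift sequence below folds the 64-bit sum into
	 * 32 bits: after adding r0 to itself rotated by 32, the upper half
	 * of the result holds (high 32 bits + low 32 bits) with the carry
	 * out of the low-order addition already included, so the srdi
	 * leaves the folded 32-bit partial checksum in r3.
	 */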
.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
EXPORT_SYMBOL(__csum_partial)


	.macro srcnr
100:
	EX_TABLE(100b,.Lsrc_error_nr)
	.endm

	.macro source
150:
	EX_TABLE(150b,.Lsrc_error)
	.endm

	.macro dstnr
200:
	EX_TABLE(200b,.Ldest_error_nr)
	.endm

	.macro dest
250:
	EX_TABLE(250b,.Ldest_error)
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
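	/*
	 * The loop below is software pipelined over 64-byte blocks: the
	 * first four doublewords of a block are loaded above (or at the
	 * bottom of the previous iteration), the remaining four are loaded
	 * mid-iteration, and the adds and stores are interleaved with the
	 * loads for the next block.  The copy of the body after the bdnz
	 * is the exit limb that consumes the final group of loads.
	 */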
	.align	5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr
EXPORT_SYMBOL(csum_partial_copy_generic)
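/*
 * Note on the error paths above: faults on accesses marked "source"/"dest"
 * inside the unrolled loop land on .Lsrc_error/.Ldest_error, which restore
 * r14-r16 and pop the stack frame before reporting, while accesses marked
 * "srcnr"/"dstnr" run with no frame active and go straight to the _nr
 * labels.  In both cases -EFAULT is stored to *src_err or *dst_err only if
 * the corresponding pointer is non-NULL.
 */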