/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in words and is always >= 5.
 *
 * In practice len == 5, but this is not guaranteed. So this code does not
 * attempt to use doubleword instructions.
 */
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)
	lwzu	r5,4(r3)
	addic.	r4,r4,-2
	addc	r0,r0,r5
	mtctr	r4
	blelr-
1:	lwzu	r4,4(r3)
	adde	r0,r0,r4
	bdnz	1b
	addze	r0,r0		/* add in final carry */
	rldicl	r4,r0,32,0	/* fold two 32-bit halves together */
	add	r0,r0,r4
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr

/*
 * Compute checksum of TCP or UDP pseudo-header:
 * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
 * No real gain trying to do this specially for 64 bit, but
 * the 32 bit addition may spill into the upper bits of
 * the doubleword so we still must fold it down from 64.
 */
_GLOBAL(csum_tcpudp_magic)
	rlwimi	r5,r6,16,0,15	/* put proto in upper half of len */
	addc	r0,r3,r4	/* add 4 32-bit words together */
	adde	r0,r0,r5
	adde	r0,r0,r7
	rldicl	r4,r0,32,0	/* fold 64 bit value */
	add	r0,r4,r0
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr
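
/*
 * For reference, the fold-and-complement sequence used by ip_fast_csum
 * and csum_tcpudp_magic above corresponds roughly to the C sketch below.
 * This is illustrative only, not part of the kernel sources; the helper
 * name fold_to_csum16() is made up for this comment.
 *
 *	#include <stdint.h>
 *
 *	static inline uint16_t fold_to_csum16(uint64_t sum)
 *	{
 *		// fold the two 32-bit halves, keeping the end-around carry
 *		sum = (sum & 0xffffffffULL) + (sum >> 32);
 *		sum = (sum & 0xffffffffULL) + (sum >> 32);
 *		// fold the two 16-bit halves the same way
 *		sum = (sum & 0xffff) + (sum >> 16);
 *		sum = (sum & 0xffff) + (sum >> 16);
 *		// return the 1's complement of the folded sum
 *		return (uint16_t)~sum;
 *	}
 */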

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * csum_partial(r3=buff, r4=len, r5=sum)
 */
_GLOBAL(csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
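
/*
 * Semantically, csum_partial above behaves like the byte-at-a-time C
 * sketch below (illustrative only, not part of the kernel sources; the
 * name csum_partial_ref() is made up). The sketch assumes big-endian
 * byte order, matching the sldi padding above, and its 32-bit return
 * value may differ bit-for-bit from the assembly's, but both fold down
 * to the same 16-bit 1's complement checksum.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static uint32_t csum_partial_ref(const uint8_t *buff, size_t len,
 *					 uint32_t sum)
 *	{
 *		uint64_t s = sum;
 *
 *		while (len >= 2) {		// sum 16-bit big-endian words
 *			s += ((uint32_t)buff[0] << 8) | buff[1];
 *			buff += 2;
 *			len -= 2;
 *		}
 *		if (len)			// pad a trailing byte to 16 bits
 *			s += (uint32_t)buff[0] << 8;
 *
 *		// fold 64 bits down to 32, keeping the end-around carry
 *		s = (s & 0xffffffffULL) + (s >> 32);
 *		s = (s & 0xffffffffULL) + (s >> 32);
 *		return (uint32_t)s;
 *	}
 */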

	.macro source
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error
	.previous
	.endm

	.macro dest
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error
	.previous
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
	beq	.Lcopy_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
source;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dest;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
source;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dest;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

source;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dest;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

source;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dest;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

source;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dest;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr
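
/*
 * Caller-side sketch of the error contract described above (illustrative
 * only; the wrapper name copy_and_csum() is made up, the prototype uses
 * plain integer types where the kernel uses __wsum, and real callers
 * differ). On a fault the routine stores -EFAULT through the relevant
 * error pointer and the caller must recover, e.g. by zeroing the
 * destination or recomputing the checksum.
 *
 *	#include <stdint.h>
 *
 *	extern uint32_t csum_partial_copy_generic(const void *src, void *dst,
 *						  int len, uint32_t sum,
 *						  int *src_err, int *dst_err);
 *
 *	static int copy_and_csum(const void *src, void *dst, int len,
 *				 uint32_t *sump)
 *	{
 *		int src_err = 0, dst_err = 0;
 *		uint32_t sum;
 *
 *		sum = csum_partial_copy_generic(src, dst, len, *sump,
 *						&src_err, &dst_err);
 *		if (src_err || dst_err)		// -EFAULT was stored
 *			return -1;		// caller must clean up / retry
 *		*sump = sum;
 *		return 0;
 *	}
 */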