/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in words and is always >= 5.
 *
 * In practice len == 5, but this is not guaranteed.  So this code does not
 * attempt to use doubleword instructions.
 */
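/*
 * Rough C model of the value the routine below returns (illustrative only,
 * not part of the build; the helper name is made up).  It mirrors the
 * result (the complemented 1's complement sum of 'len' 32-bit words)
 * rather than the exact instruction sequence used:
 *
 *	static inline unsigned short ip_fast_csum_model(const unsigned int *buf,
 *							unsigned int len)
 *	{
 *		unsigned long long sum = 0;
 *		unsigned int i;
 *
 *		for (i = 0; i < len; i++)
 *			sum += buf[i];			// sum the 32-bit words
 *		while (sum >> 16)			// fold, keeping end-around carries
 *			sum = (sum & 0xffff) + (sum >> 16);
 *		return (unsigned short)~sum;		// 1's complement of the folded sum
 *	}
 */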
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)
	lwzu	r5,4(r3)
	addic.	r4,r4,-2
	addc	r0,r0,r5
	mtctr	r4
	blelr-
1:	lwzu	r4,4(r3)
	adde	r0,r0,r4
	bdnz	1b
	addze	r0,r0		/* add in final carry */
	rldicl	r4,r0,32,0	/* fold two 32-bit halves together */
	add	r0,r0,r4
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr

/*
 * Compute the checksum of a TCP or UDP pseudo-header:
 * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
 *
 * There is no real gain in doing this specially for 64 bit, but the
 * 32-bit additions may spill into the upper bits of the doubleword,
 * so we still have to fold the result down from 64 bits.
 */
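/*
 * Rough C model of the value computed below (illustrative only, not part
 * of the build; the helper name is made up).  As in the assembly, 'proto'
 * is placed in the upper halfword of the len word before the additions:
 *
 *	static inline unsigned short csum_tcpudp_model(unsigned int saddr,
 *						       unsigned int daddr,
 *						       unsigned int len,
 *						       unsigned int proto,
 *						       unsigned int sum)
 *	{
 *		unsigned long long s = sum;
 *
 *		s += saddr;
 *		s += daddr;
 *		s += (proto << 16) | (len & 0xffff);	// proto in upper half of len
 *		while (s >> 16)				// fold, keeping end-around carries
 *			s = (s & 0xffff) + (s >> 16);
 *		return (unsigned short)~s;		// 1's complement of the folded sum
 *	}
 */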
_GLOBAL(csum_tcpudp_magic)
	rlwimi	r5,r6,16,0,15	/* put proto in upper half of len */
	addc	r0,r3,r4	/* add 4 32-bit words together */
	adde	r0,r0,r5
	adde	r0,r0,r7
	rldicl	r4,r0,32,0	/* fold 64 bit value */
	add	r0,r4,r0
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr

#define STACKFRAMESIZE	256
#define STK_REG(i)	(112 + ((i)-14)*8)	/* stack slot used below to save nonvolatile reg i */

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * csum_partial(r3=buff, r4=len, r5=sum)
 */
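/*
 * Rough C model of the result (illustrative only, not part of the build;
 * the helper name is made up).  A 1's complement sum does not depend on
 * how the bytes are grouped, so summing doublewords, words, halfwords and
 * a padded trailing byte (as the code below does) folds down to the same
 * 16-bit checksum as this simple halfword loop, shown here for big endian:
 *
 *	static inline unsigned int csum_partial_model(const unsigned char *buff,
 *						      unsigned long len,
 *						      unsigned int sum)
 *	{
 *		unsigned long long s = sum;
 *
 *		while (len >= 2) {
 *			s += (buff[0] << 8) | buff[1];	// next big-endian halfword
 *			buff += 2;
 *			len -= 2;
 *		}
 *		if (len)
 *			s += buff[0] << 8;		// pad the odd trailing byte
 *		while (s >> 32)				// fold carries back into 32 bits
 *			s = (s & 0xffffffffULL) + (s >> 32);
 *		return (unsigned int)s;			// 32-bit partial sum, not complemented
 *	}
 */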
_GLOBAL(csum_partial)
	addic	r0,r5,0				/* clear carry */

	srdi.	r6,r4,3				/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a doubleword. Since
	 * odd-aligned addresses should be rare and they would require
	 * more work to calculate the correct checksum, we ignore that
	 * case and take the potential slowdown of unaligned loads.
	 */
	rldicl.	r6,r3,64-1,64-2			/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6			/* halfwords needed to reach doubleword alignment */
	mtctr	r6

1:
	lhz	r6,0(r3)			/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency.  With eight addes per 64-byte iteration, the
	 * fastest this loop can go is 16 cycles per iteration.  The
	 * scheduling of the loop below has been shown to hit this on both
	 * POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32-bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively, and (for an error on
 * src) zeroes the rest of dst.
 *
 * This code needs to be reworked to take advantage of 64-bit sum+copy.
 * However, due to token-ring halfword alignment problems this will be
 * very tricky.  For now we'll leave it until we instrument it somehow.
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
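/*
 * Rough C model of the success-path contract (illustrative only, not part
 * of the build; the helper names are made up, and csum_partial_model is
 * the sketch shown earlier in this file).  The real routine does the copy
 * and the sum in a single pass, and reports faults through exception-table
 * fixups rather than explicit checks:
 *
 *	static inline unsigned int csum_copy_model(const unsigned char *src,
 *						   unsigned char *dst,
 *						   unsigned long len,
 *						   unsigned int sum)
 *	{
 *		unsigned long i;
 *
 *		for (i = 0; i < len; i++)	// copy the block to dst
 *			dst[i] = src[i];
 *		return csum_partial_model(dst, len, sum);	// checksum the copied bytes
 *	}
 */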
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0
	subi	r3,r3,4
	subi	r4,r4,4
	srwi.	r6,r5,2
	beq	3f		/* if we're doing < 4 bytes */
	andi.	r9,r4,2		/* Align dst to longword boundary */
	beq+	1f
81:	lhz	r6,4(r3)	/* do 2 bytes to get aligned */
	addi	r3,r3,2
	subi	r5,r5,2
91:	sth	r6,4(r4)
	addi	r4,r4,2
	addc	r0,r0,r6
	srwi.	r6,r5,2		/* # words to do */
	beq	3f
1:	mtctr	r6
82:	lwzu	r6,4(r3)	/* the bdnz has zero overhead, so it should */
92:	stwu	r6,4(r4)	/* be unnecessary to unroll this loop */
	adde	r0,r0,r6
	bdnz	82b
	andi.	r5,r5,3
3:	cmpwi	0,r5,2
	blt+	4f
83:	lhz	r6,4(r3)
	addi	r3,r3,2
	subi	r5,r5,2
93:	sth	r6,4(r4)
	addi	r4,r4,2
	adde	r0,r0,r6
4:	cmpwi	0,r5,1
	bne+	5f
84:	lbz	r6,4(r3)
94:	stb	r6,4(r4)
	slwi	r6,r6,8		/* Upper byte of word */
	adde	r0,r0,r6
5:	addze	r3,r0		/* add in final carry (unlikely with 64-bit regs) */
	rldicl	r4,r3,32,0	/* fold 64 bit value */
	add	r3,r4,r3
	srdi	r3,r3,32
	blr

/* These shouldn't go in the fixup section, since that would
   cause the ex_table addresses to get out of order. */

	/* Source fault on the aligning halfword load (label 81): zero the rest of dst */
	.globl src_error_1
src_error_1:
	li	r6,0
	subi	r5,r5,2
95:	sth	r6,4(r4)
	addi	r4,r4,2
	srwi.	r6,r5,2
	beq	3f
	mtctr	r6
	/* Source fault in the word copy loop (label 82) */
	.globl src_error_2
src_error_2:
	li	r6,0
96:	stwu	r6,4(r4)
	bdnz	96b
3:	andi.	r5,r5,3
	beq	src_error
	/* Source fault in the trailing halfword/byte copy (labels 83, 84) */
	.globl src_error_3
src_error_3:
	li	r6,0
	mtctr	r5
	addi	r4,r4,3
97:	stbu	r6,1(r4)
	bdnz	97b
	.globl src_error
src_error:
	cmpdi	0,r7,0		/* skip the store if src_err is NULL */
	beq	1f
	li	r6,-EFAULT
	stw	r6,0(r7)
1:	addze	r3,r0
	blr

	.globl dst_error
dst_error:
	cmpdi	0,r8,0		/* skip the store if dst_err is NULL */
	beq	1f
	li	r6,-EFAULT
	stw	r6,0(r8)
1:	addze	r3,r0
	blr

	.section __ex_table,"a"
	.align	3
	.llong	81b,src_error_1
	.llong	91b,dst_error
	.llong	82b,src_error_2
	.llong	92b,dst_error
	.llong	83b,src_error_3
	.llong	93b,dst_error
	.llong	84b,src_error_3
	.llong	94b,dst_error
	.llong	95b,dst_error
	.llong	96b,dst_error
	.llong	97b,dst_error