/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 * Author: Huang Ying <ying.huang@intel.com>
 *         Vinodh Gopal <vinodh.gopal@intel.com>
 *         Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *          Aidan O'Mahony (aidan.o.mahony@intel.com)
 *          Adrian Hoban <adrian.hoban@intel.com>
 *          James Guilford (james.guilford@intel.com)
 *          Gabriele Paoloni <gabriele.paoloni@intel.com>
 *          Tadeusz Struk (tadeusz.struk@intel.com)
 *          Wajdi Feghali (wajdi.k.feghali@intel.com)
 * Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 * Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */
31
32#include <linux/linkage.h>
Huang Yingb369e522009-11-23 19:54:06 +080033#include <asm/inst.h>
Huang Ying54b6a1b2009-01-18 16:28:34 +110034
#ifdef __x86_64__
.data
# POLY is the bit-reflected GCM reduction polynomial constant used by the
# GHASH code below (see GHASH_MUL: mod (128,127,126,121,0)).
POLY:        .octa 0xC2000000000000000000000000000001
TWOONE:      .octa 0x00000001000000000000000000000001

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and ZERO should follow ALL_F

SHUF_MASK:   .octa 0x000102030405060708090A0B0C0D0E0F	# byte-reversal mask for PSHUFB
MASK1:       .octa 0x0000000000000000ffffffffffffffff	# keep low 64 bits
MASK2:       .octa 0xffffffffffffffff0000000000000000	# keep high 64 bits
SHIFT_MASK:  .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:       .octa 0xffffffffffffffffffffffffffffffff
ZERO:        .octa 0x00000000000000000000000000000000
ONE:         .octa 0x00000000000000000000000000000001	# counter increment
F_MIN_MASK:  .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
dec:         .octa 0x1
enc:         .octa 0x2


.text


# Scratch-area layout on the stack used by the GCM routines.
# STACK_OFFSET accounts for the three registers pushed before %r14
# is set up as the frame base (arg7..arg10 are read relative to it).
#define STACK_OFFSET    8*3
#define HashKey         16*0	// store HashKey <<1 mod poly here
#define HashKey_2       16*1	// store HashKey^2 <<1 mod poly here
#define HashKey_3       16*2	// store HashKey^3 <<1 mod poly here
#define HashKey_4       16*3	// store HashKey^4 <<1 mod poly here
#define HashKey_k       16*4	// store XOR of High 64 bits and Low 64
				// bits of HashKey <<1 mod poly here
				//(for Karatsuba purposes)
#define HashKey_2_k     16*5	// store XOR of High 64 bits and Low 64
				// bits of HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define HashKey_3_k     16*6	// store XOR of High 64 bits and Low 64
				// bits of HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define HashKey_4_k     16*7	// store XOR of High 64 bits and Low 64
				// bits of HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
#define VARIABLE_OFFSET 16*8

# SysV AMD64 integer argument registers (args 1-6), then stack slots
# reached through %r14 for args 7-10.
#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
#endif

90
# Register aliases for the (non-GCM) AES routines.  STATE1..4 / IN1..4
# allow four-block parallel processing; STATE/IN are the single-block views.
#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#ifdef __x86_64__
# 64-bit register assignments (SysV AMD64).
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
# 32-bit register assignments; note OUTP aliases AREG and UKEYP aliases
# OUTP, so those roles cannot be live at the same time.
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif

Huang Ying54b6a1b2009-01-18 16:28:34 +1100133
Tadeusz Struk0bd82f52010-11-04 15:00:45 -0400134
Mathias Krause559ad0f2010-11-29 08:35:39 +0800135#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
* Result is left in GH.  TMP1..TMP5 are clobbered scratch registers.
* Uses Karatsuba: the 128x128 carry-less multiply is split into three
* 64x64 PCLMULQDQ multiplies.
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	   \GH, \TMP1
	pshufd	   $78, \GH, \TMP2
	pshufd	   $78, \HK, \TMP3
	pxor	   \GH, \TMP2           # TMP2 = a1+a0
	pxor	   \HK, \TMP3           # TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1      # TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH        # GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2    # TMP2 = (a0+a1)*(b1+b0)
	pxor	   \GH, \TMP2
	pxor	   \TMP1, \TMP2         # TMP2 = middle Karatsuba term (a0*b1)+(a1*b0)
	movdqa	   \TMP2, \TMP3
	pslldq	   $8, \TMP3            # left shift TMP3 2 DWs
	psrldq	   $8, \TMP2            # right shift TMP2 2 DWs
	pxor	   \TMP3, \GH
	pxor	   \TMP2, \TMP1         # TMP1:GH holds the 256-bit result of GH*HK

	# first phase of the reduction

	movdqa	   \GH, \TMP2
	movdqa	   \GH, \TMP3
	movdqa	   \GH, \TMP4           # copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld	   $31, \TMP2           # packed left shift of each dword by 31
	pslld	   $30, \TMP3           # packed left shift of each dword by 30
	pslld	   $25, \TMP4           # packed left shift of each dword by 25
	pxor	   \TMP3, \TMP2         # xor the shifted versions
	pxor	   \TMP4, \TMP2
	movdqa	   \TMP2, \TMP5
	psrldq	   $4, \TMP5            # right shift TMP5 1 DW
	pslldq	   $12, \TMP2           # left shift TMP2 3 DWs
	pxor	   \TMP2, \GH

	# second phase of the reduction

	movdqa	   \GH,\TMP2            # copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa	   \GH,\TMP3
	movdqa	   \GH,\TMP4
	psrld	   $1,\TMP2             # packed right shift of each dword by 1
	psrld	   $2,\TMP3             # packed right shift of each dword by 2
	psrld	   $7,\TMP4             # packed right shift of each dword by 7
	pxor	   \TMP3,\TMP2          # xor the shifted versions
	pxor	   \TMP4,\TMP2
	pxor	   \TMP5, \TMP2
	pxor	   \TMP2, \GH
	pxor	   \TMP1, \GH           # fold in high half; result is in GH
.endm

195
/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* decrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
*
* \i selects which of %xmm5..%xmm7 accumulates the AAD hash and, with
* \i_seq, which xmm registers hold the initial counter blocks.
* %xmm14 is clobbered (SHUF_MASK); %r13 holds the remaining length.
*/


.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	mov	   arg7, %r10           # %r10 = AAD
	mov	   arg8, %r12           # %r12 = aadLen
	mov	   %r12, %r11
	pxor	   %xmm\i, %xmm\i
_get_AAD_loop\num_initial_blocks\operation:
	# shift the AAD into %xmm\i one dword at a time
	movd	   (%r10), \TMP1
	pslldq	   $12, \TMP1
	psrldq	   $4, %xmm\i
	pxor	   \TMP1, %xmm\i
	add	   $4, %r10
	sub	   $4, %r12
	jne	   _get_AAD_loop\num_initial_blocks\operation
	cmp	   $16, %r11
	je	   _get_AAD_loop2_done\num_initial_blocks\operation
	mov	   $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
	# right-align AAD shorter than 16 bytes
	psrldq	   $4, %xmm\i
	sub	   $4, %r12
	cmp	   %r11, %r12
	jne	   _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, %xmm\i       # byte-reflect the AAD data

	xor	   %r11, %r11           # initialise the data pointer offset as zero

	# start AES for num_initial_blocks blocks

	mov	   %arg5, %rax          # %rax = *Y0
	movdqu	   (%rax), \XMM0        # XMM0 = Y0
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
.irpc index, \i_seq
	paddd	   ONE(%rip), \XMM0     # INCR Y0
	movdqa	   \XMM0, %xmm\index
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, %xmm\index   # perform a 16 byte swap

.endr
.irpc index, \i_seq
	pxor	   16*0(%arg1), %xmm\index	# whitening: XOR round-0 key
.endr
.irpc index, \i_seq
	movaps 0x10(%rdi), \TMP1	# NOTE: %rdi is arg1 (key schedule)
	AESENC     \TMP1, %xmm\index    # Round 1
.endr
.irpc index, \i_seq
	movaps 0x20(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index    # Round 2
.endr
.irpc index, \i_seq
	movaps 0x30(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index    # Round 3
.endr
.irpc index, \i_seq
	movaps 0x40(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index    # Round 4
.endr
.irpc index, \i_seq
	movaps 0x50(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index    # Round 5
.endr
.irpc index, \i_seq
	movaps 0x60(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index    # Round 6
.endr
.irpc index, \i_seq
	movaps 0x70(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index    # Round 7
.endr
.irpc index, \i_seq
	movaps 0x80(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index    # Round 8
.endr
.irpc index, \i_seq
	movaps 0x90(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index    # Round 9
.endr
.irpc index, \i_seq
	movaps 0xa0(%arg1), \TMP1
	AESENCLAST \TMP1, %xmm\index    # Round 10
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11

	# decrypt path: GHASH the ciphertext just read (TMP1), not the output
	movdqa     \TMP1, %xmm\index
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, %xmm\index

	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor	   %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	   _initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
*/
	paddd	   ONE(%rip), \XMM0     # INCR Y0
	movdqa	   \XMM0, \XMM1
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM1        # perform a 16 byte swap

	paddd	   ONE(%rip), \XMM0     # INCR Y0
	movdqa	   \XMM0, \XMM2
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM2        # perform a 16 byte swap

	paddd	   ONE(%rip), \XMM0     # INCR Y0
	movdqa	   \XMM0, \XMM3
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap

	paddd	   ONE(%rip), \XMM0     # INCR Y0
	movdqa	   \XMM0, \XMM4
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap

	pxor	   16*0(%arg1), \XMM1
	pxor	   16*0(%arg1), \XMM2
	pxor	   16*0(%arg1), \XMM3
	pxor	   16*0(%arg1), \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	movaps 0xa0(%arg1), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	# decrypt path: preserve the loaded ciphertext in XMM1..XMM4 for GHASH
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqa     \TMP1, \XMM1
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqa     \TMP1, \XMM2
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqa     \TMP1, \XMM3
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
	movdqa     \TMP1, \XMM4
	add	   $64, %r11
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM1        # perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM2        # perform a 16 byte swap
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm
432
433
/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
*
* Encrypt counterpart of INITIAL_BLOCKS_DEC: identical flow, but the
* GHASH input is the freshly produced ciphertext rather than the data
* read from the source buffer.  %xmm14 is clobbered (SHUF_MASK).
*/


.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	mov	   arg7, %r10           # %r10 = AAD
	mov	   arg8, %r12           # %r12 = aadLen
	mov	   %r12, %r11
	pxor	   %xmm\i, %xmm\i
_get_AAD_loop\num_initial_blocks\operation:
	# shift the AAD into %xmm\i one dword at a time
	movd	   (%r10), \TMP1
	pslldq	   $12, \TMP1
	psrldq	   $4, %xmm\i
	pxor	   \TMP1, %xmm\i
	add	   $4, %r10
	sub	   $4, %r12
	jne	   _get_AAD_loop\num_initial_blocks\operation
	cmp	   $16, %r11
	je	   _get_AAD_loop2_done\num_initial_blocks\operation
	mov	   $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
	# right-align AAD shorter than 16 bytes
	psrldq	   $4, %xmm\i
	sub	   $4, %r12
	cmp	   %r11, %r12
	jne	   _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, %xmm\i       # byte-reflect the AAD data

	xor	   %r11, %r11           # initialise the data pointer offset as zero

	# start AES for num_initial_blocks blocks

	mov	   %arg5, %rax          # %rax = *Y0
	movdqu	   (%rax), \XMM0        # XMM0 = Y0
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
.irpc index, \i_seq
	paddd	   ONE(%rip), \XMM0     # INCR Y0
	movdqa	   \XMM0, %xmm\index
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, %xmm\index   # perform a 16 byte swap

.endr
.irpc index, \i_seq
	pxor	   16*0(%arg1), %xmm\index	# whitening: XOR round-0 key
.endr
.irpc index, \i_seq
	movaps 0x10(%rdi), \TMP1	# NOTE: %rdi is arg1 (key schedule)
	AESENC     \TMP1, %xmm\index    # Round 1
.endr
.irpc index, \i_seq
	movaps 0x20(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index    # Round 2
.endr
.irpc index, \i_seq
	movaps 0x30(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index    # Round 3
.endr
.irpc index, \i_seq
	movaps 0x40(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index    # Round 4
.endr
.irpc index, \i_seq
	movaps 0x50(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index    # Round 5
.endr
.irpc index, \i_seq
	movaps 0x60(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index    # Round 6
.endr
.irpc index, \i_seq
	movaps 0x70(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index    # Round 7
.endr
.irpc index, \i_seq
	movaps 0x80(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index    # Round 8
.endr
.irpc index, \i_seq
	movaps 0x90(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index    # Round 9
.endr
.irpc index, \i_seq
	movaps 0xa0(%arg1), \TMP1
	AESENCLAST \TMP1, %xmm\index    # Round 10
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11

	# encrypt path: GHASH the ciphertext just produced (still in %xmm\index)
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, %xmm\index

	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor	   %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	   _initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
*/
	paddd	   ONE(%rip), \XMM0     # INCR Y0
	movdqa	   \XMM0, \XMM1
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM1        # perform a 16 byte swap

	paddd	   ONE(%rip), \XMM0     # INCR Y0
	movdqa	   \XMM0, \XMM2
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM2        # perform a 16 byte swap

	paddd	   ONE(%rip), \XMM0     # INCR Y0
	movdqa	   \XMM0, \XMM3
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap

	paddd	   ONE(%rip), \XMM0     # INCR Y0
	movdqa	   \XMM0, \XMM4
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap

	pxor	   16*0(%arg1), \XMM1
	pxor	   16*0(%arg1), \XMM2
	pxor	   16*0(%arg1), \XMM3
	pxor	   16*0(%arg1), \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	movaps 0xa0(%arg1), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	# encrypt path: XOR in plaintext; ciphertext stays in XMM1..XMM4 for GHASH
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)

	add	   $64, %r11
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM1        # perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM2        # perform a 16 byte swap
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm
666
/*
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
* arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
*
* The 4 GHASH multiplications (against HashKey^4..HashKey, Karatsuba
* form, using the precomputed HashKey_*_k sums) are interleaved with
* the 10 AES rounds of the next 4 counter blocks to hide PCLMULQDQ
* latency.  %xmm15 is clobbered (SHUF_MASK).
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	# save the previous round's ciphertext for GHASH
	movdqa	   \XMM1, \XMM5
	movdqa	   \XMM2, \XMM6
	movdqa	   \XMM3, \XMM7
	movdqa	   \XMM4, \XMM8

	movdqa	   SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba

	movdqa	   \XMM5, \TMP4
	pshufd	   $78, \XMM5, \TMP6
	pxor	   \XMM5, \TMP6
	paddd	   ONE(%rip), \XMM0     # INCR CNT
	movdqa	   HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4    # TMP4 = a1*b1
	movdqa	   \XMM0, \XMM1
	paddd	   ONE(%rip), \XMM0     # INCR CNT
	movdqa	   \XMM0, \XMM2
	paddd	   ONE(%rip), \XMM0     # INCR CNT
	movdqa	   \XMM0, \XMM3
	paddd	   ONE(%rip), \XMM0     # INCR CNT
	movdqa	   \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5    # XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap

	pxor	   (%arg1), \XMM1       # whitening: XOR round-0 key
	pxor	   (%arg1), \XMM2
	pxor	   (%arg1), \XMM3
	pxor	   (%arg1), \XMM4
	movdqa	   HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6    # TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1         # Round 1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1         # Round 2
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
	movdqa	   \XMM6, \TMP1
	pshufd	   $78, \XMM6, \TMP2
	pxor	   \XMM6, \TMP2
	movdqa	   HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC	   \TMP3, \XMM1         # Round 3
	AESENC	   \TMP3, \XMM2
	AESENC	   \TMP3, \XMM3
	AESENC	   \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6    # XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	   \TMP3, \XMM1         # Round 4
	AESENC	   \TMP3, \XMM2
	AESENC	   \TMP3, \XMM3
	AESENC	   \TMP3, \XMM4
	movdqa	   HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	   \TMP3, \XMM1         # Round 5
	AESENC	   \TMP3, \XMM2
	AESENC	   \TMP3, \XMM3
	AESENC	   \TMP3, \XMM4
	pxor	   \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	   \XMM6, \XMM5
	pxor	   \TMP2, \TMP6
	movdqa	   \XMM7, \TMP1
	pshufd	   $78, \XMM7, \TMP2
	pxor	   \XMM7, \TMP2
	movdqa	   HashKey_2(%rsp ), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	   \TMP3, \XMM1         # Round 6
	AESENC	   \TMP3, \XMM2
	AESENC	   \TMP3, \XMM3
	AESENC	   \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7    # XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	   \TMP3, \XMM1         # Round 7
	AESENC	   \TMP3, \XMM2
	AESENC	   \TMP3, \XMM3
	AESENC	   \TMP3, \XMM4
	movdqa	   HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	   \TMP3, \XMM1         # Round 8
	AESENC	   \TMP3, \XMM2
	AESENC	   \TMP3, \XMM3
	AESENC	   \TMP3, \XMM4
	pxor	   \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	   \XMM7, \XMM5
	pxor	   \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	   \XMM8, \TMP1
	pshufd	   $78, \XMM8, \TMP2
	pxor	   \XMM8, \TMP2
	movdqa	   HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	   \TMP3, \XMM1         # Round 9
	AESENC	   \TMP3, \XMM2
	AESENC	   \TMP3, \XMM3
	AESENC	   \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8    # XMM8 = a0*b0
	movaps 0xa0(%arg1), \TMP3
	AESENCLAST \TMP3, \XMM1         # Round 10
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	   HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
	movdqu	   (%arg3,%r11,1), \TMP3
	pxor	   \TMP3, \XMM1         # Ciphertext/Plaintext XOR EK
	movdqu	   16(%arg3,%r11,1), \TMP3
	pxor	   \TMP3, \XMM2         # Ciphertext/Plaintext XOR EK
	movdqu	   32(%arg3,%r11,1), \TMP3
	pxor	   \TMP3, \XMM3         # Ciphertext/Plaintext XOR EK
	movdqu	   48(%arg3,%r11,1), \TMP3
	pxor	   \TMP3, \XMM4         # Ciphertext/Plaintext XOR EK
	movdqu	   \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
	movdqu	   \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
	movdqu	   \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
	movdqu	   \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap

	# combine the four Karatsuba partial products
	pxor	   \TMP4, \TMP1
	pxor	   \XMM8, \XMM5
	pxor	   \TMP6, \TMP2
	pxor	   \TMP1, \TMP2
	pxor	   \XMM5, \TMP2
	movdqa	   \TMP2, \TMP3
	pslldq	   $8, \TMP3            # left shift TMP3 2 DWs
	psrldq	   $8, \TMP2            # right shift TMP2 2 DWs
	pxor	   \TMP3, \XMM5
	pxor	   \TMP2, \TMP1         # accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	   \XMM5, \TMP2
	movdqa	   \XMM5, \TMP3
	movdqa	   \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	   $31, \TMP2           # packed left shift << 31
	pslld	   $30, \TMP3           # packed left shift << 30
	pslld	   $25, \TMP4           # packed left shift << 25
	pxor	   \TMP3, \TMP2         # xor the shifted versions
	pxor	   \TMP4, \TMP2
	movdqa	   \TMP2, \TMP5
	psrldq	   $4, \TMP5            # right shift T5 1 DW
	pslldq	   $12, \TMP2           # left shift T2 3 DWs
	pxor	   \TMP2, \XMM5

	# second phase of reduction

	movdqa	   \XMM5,\TMP2          # make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	   \XMM5,\TMP3
	movdqa	   \XMM5,\TMP4
	psrld	   $1, \TMP2            # packed right shift >>1
	psrld	   $2, \TMP3            # packed right shift >>2
	psrld	   $7, \TMP4            # packed right shift >>7
	pxor	   \TMP3,\TMP2          # xor the shifted versions
	pxor	   \TMP4,\TMP2
	pxor	   \TMP5, \TMP2
	pxor	   \TMP2, \XMM5
	pxor	   \TMP1, \XMM5         # result is in XMM5

	pxor	   \XMM5, \XMM1         # fold GHASH value into XMM1 for the next iteration
.endm
858
/*
* decrypt 4 blocks at a time
* ghash the 4 previously decrypted ciphertext blocks
* arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

# Save the four incoming ciphertext blocks (XMM1..XMM4) into XMM5..XMM8 for
# GHASH, while XMM1..XMM4 are reloaded below with the next counter blocks.
	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6		# swap qwords: TMP6 = {a0, a1}
	pxor	  \XMM5, \TMP6			# TMP6 = a1+a0 (Karatsuba middle term)
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

# AES rounds on the four counter blocks are interleaved with the GHASH
# carry-less multiplies to hide the latencies of both instruction groups.
	pxor	  (%arg1), \XMM1		# round 0 key (whitening)
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps	  0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps	  0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps	  0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 3
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps	  0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps	  0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	movaps	  0xa0(%arg1), \TMP3
	AESENCLAST \TMP3, \XMM1			# Round 10 (AES-128 schedule)
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	  HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg2,%r11,1)		# Write to plaintext buffer
	movdqa	  \TMP3, \XMM1			# keep ciphertext for next GHASH
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM2, 16(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM2			# keep ciphertext for next GHASH
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM3, 32(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM3			# keep ciphertext for next GHASH
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM4, 48(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM4			# keep ciphertext for next GHASH
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1		# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2		# packed left shift << 31
	pslld	  $30, \TMP3		# packed left shift << 30
	pslld	  $25, \TMP4		# packed left shift << 25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# right shift T5 1 DW
	pslldq	  $12, \TMP2		# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2	# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2		# packed right shift >> 1
	psrld	  $2, \TMP3		# packed right shift >> 2
	psrld	  $7, \TMP4		# packed right shift >> 7
	pxor	  \TMP3,\TMP2		# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5		# reduced result is in XMM5
	pxor	  \XMM5, \XMM1		# fold hash into XMM1 for next iteration
.endm
1054
/* GHASH the last 4 ciphertext blocks. */
.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

	# Multiply XMM1 * HashKey^4 (using Karatsuba)

	movdqa	  \XMM1, \TMP6
	pshufd	  $78, \XMM1, \TMP2	# swap qwords for the middle term
	pxor	  \XMM1, \TMP2		# TMP2 = a1+a0
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6	# TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1	# XMM1 = a0*b0
	movdqa	  HashKey_4_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1		# result in TMP6, XMMDst, XMM1

	# Multiply XMM2 * HashKey^3 (using Karatsuba)

	movdqa	  \XMM2, \TMP1
	pshufd	  $78, \XMM2, \TMP2
	pxor	  \XMM2, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2	# XMM2 = a0*b0
	movdqa	  HashKey_3_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM2, \XMMDst
	pxor	  \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

	# Multiply XMM3 * HashKey^2 (using Karatsuba)

	movdqa	  \XMM3, \TMP1
	pshufd	  $78, \XMM3, \TMP2
	pxor	  \XMM3, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3	# XMM3 = a0*b0
	movdqa	  HashKey_2_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM3, \XMMDst
	pxor	  \TMP2, \XMM1		# results accumulated in TMP6, XMMDst, XMM1

	# Multiply XMM4 * HashKey (using Karatsuba)
	movdqa	  \XMM4, \TMP1
	pshufd	  $78, \XMM4, \TMP2
	pxor	  \XMM4, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4	# XMM4 = a0*b0
	movdqa	  HashKey_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM4, \XMMDst
	pxor	  \XMM1, \TMP2
	pxor	  \TMP6, \TMP2
	pxor	  \XMMDst, \TMP2
	# middle section of the temp results combined as in karatsuba algorithm
	movdqa	  \TMP2, \TMP4
	pslldq	  $8, \TMP4		# left shift TMP4 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP4, \XMMDst
	pxor	  \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa	  \XMMDst, \TMP2
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld	  $31, \TMP2		# packed left shifting << 31
	pslld	  $30, \TMP3		# packed left shifting << 30
	pslld	  $25, \TMP4		# packed left shifting << 25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP7
	psrldq	  $4, \TMP7		# right shift TMP7 1 DW
	pslldq	  $12, \TMP2		# left shift TMP2 3 DWs
	pxor	  \TMP2, \XMMDst

	# second phase of the reduction
	movdqa	  \XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
	psrld	  $1, \TMP2		# packed right shift >> 1
	psrld	  $2, \TMP3		# packed right shift >> 2
	psrld	  $7, \TMP4		# packed right shift >> 7
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	pxor	  \TMP7, \TMP2
	pxor	  \TMP2, \XMMDst
	pxor	  \TMP6, \XMMDst	# reduced result is in XMMDst
.endm
1151
/*
 * Encrypt the single 16-byte block in \XMM0 in place with the expanded key
 * schedule at %arg1.  NOTE: hard-coded for a 10-round schedule (the 11
 * round keys of AES-128, matching the "first set of 11 keys" assumption
 * documented above).  \TMP1 is clobbered as round-key scratch.
 */
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	pxor	   (%arg1), \XMM0	# round 0: whitening key
	movaps	   16(%arg1), \TMP1
	AESENC	   \TMP1, \XMM0		# round 1
	movaps	   32(%arg1), \TMP1
	AESENC	   \TMP1, \XMM0		# round 2
	movaps	   48(%arg1), \TMP1
	AESENC	   \TMP1, \XMM0		# round 3
	movaps	   64(%arg1), \TMP1
	AESENC	   \TMP1, \XMM0		# round 4
	movaps	   80(%arg1), \TMP1
	AESENC	   \TMP1, \XMM0		# round 5
	movaps	   96(%arg1), \TMP1
	AESENC	   \TMP1, \XMM0		# round 6
	movaps	   112(%arg1), \TMP1
	AESENC	   \TMP1, \XMM0		# round 7
	movaps	   128(%arg1), \TMP1
	AESENC	   \TMP1, \XMM0		# round 8
	movaps	   144(%arg1), \TMP1
	AESENC	   \TMP1, \XMM0		# round 9
	movaps	   160(%arg1), \TMP1
	AESENCLAST \TMP1, \XMM0		# round 10 (final)
.endm
1177
1178
1179/*****************************************************************************
1180* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1181* u8 *out, // Plaintext output. Encrypt in-place is allowed.
1182* const u8 *in, // Ciphertext input
1183* u64 plaintext_len, // Length of data in bytes for decryption.
1184* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1185* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1186* // concatenated with 0x00000001. 16-byte aligned pointer.
1187* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1188* const u8 *aad, // Additional Authentication Data (AAD)
1189* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1190* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1191* // given authentication tag and only return the plaintext if they match.
1192* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1193* // (most likely), 12 or 8.
1194*
1195* Assumptions:
1196*
1197* keys:
1198* keys are pre-expanded and aligned to 16 bytes. we are using the first
1199* set of 11 keys in the data structure void *aes_ctx
1200*
1201* iv:
1202* 0 1 2 3
1203* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1204* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1205* | Salt (From the SA) |
1206* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1207* | Initialization Vector |
1208* | (This is the sequence number from IPSec header) |
1209* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1210* | 0x1 |
1211* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1212*
1213*
1214*
1215* AAD:
1216* AAD padded to 128 bits with 0
1217* for example, assume AAD is a u32 vector
1218*
1219* if AAD is 8 bytes:
1220* AAD[3] = {A0, A1};
1221* padded AAD in xmm register = {A1 A0 0 0}
1222*
1223* 0 1 2 3
1224* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1225* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1226* | SPI (A1) |
1227* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1228* | 32-bit Sequence Number (A0) |
1229* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1230* | 0x0 |
1231* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1232*
1233* AAD Format with 32-bit Sequence Number
1234*
1235* if AAD is 12 bytes:
1236* AAD[3] = {A0, A1, A2};
1237* padded AAD in xmm register = {A2 A1 A0 0}
1238*
1239* 0 1 2 3
1240* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1244* | SPI (A2) |
1245* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1246* | 64-bit Extended Sequence Number {A1,A0} |
1247* | |
1248* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1249* | 0x0 |
1250* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1251*
1252* AAD Format with 64-bit Extended Sequence Number
1253*
1254* aadLen:
1255* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1256* The code supports 16 too but for other sizes, the code will fail.
1257*
1258* TLen:
1259* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1260* For other sizes, the code will fail.
1261*
1262* poly = x^128 + x^127 + x^126 + x^121 + 1
1263*
1264*****************************************************************************/
ENTRY(aesni_gcm_dec)
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14		# %r14 preserves the caller's %rsp
/*
* states of %xmm registers %xmm6:%xmm15 not saved
* all %xmm registers are clobbered
*/
	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp		# align rsp to 64 bytes
	mov	%arg6, %r12
	movdqu	(%r12), %xmm13		# %xmm13 = HashKey
	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13	# byte-swap HashKey (16 byte swap)


# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13		# shift each qword left by 1
	psrlq	$63, %xmm2		# carry bits out of each qword
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2		# carry from low qword into high qword
	psrldq	$8, %xmm1		# %xmm1 = carry out of the high qword
	por	%xmm2, %xmm13

	# Reduction

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd	TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2	# conditionally xor in the GCM polynomial
	pxor	%xmm2, %xmm13		# %xmm13 holds the HashKey<<1 (mod poly)


	# Decrypt first few blocks

	movdqa	%xmm13, HashKey(%rsp)	# store HashKey<<1 (mod poly)
	mov	%arg4, %r13	# save the number of bytes of plaintext/ciphertext
	and	$-16, %r13	# %r13 = %r13 - (%r13 mod 16)
	mov	%r13, %r12
	and	$(3<<4), %r12	# %r12 = 16 * ((number of blocks) mod 4)
	jz	_initial_num_blocks_is_0_decrypt
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_decrypt
	je	_initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
	INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
	sub	$48, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
	INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
	sub	$32, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
	INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
	sub	$16, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
	INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
	cmp	$0, %r13
	je	_zero_cipher_left_decrypt
	sub	$64, %r13
	je	_four_cipher_left_decrypt
_decrypt_by_4:
	# main loop: decrypt 4 blocks and GHASH the previous 4 per iteration
	GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
	add	$64, %r11
	sub	$64, %r13
	jne	_decrypt_by_4
_four_cipher_left_decrypt:
	GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
	mov	%arg4, %r13
	and	$15, %r13		# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_decrypt

	# Handle the last <16 byte block separately

	paddd	ONE(%rip), %xmm0	# increment CNT to get Yn
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Yn)
	sub	$16, %r11
	add	%r13, %r11
	movdqu	(%arg3,%r11,1), %xmm1	# receive the last <16 byte block
	lea	SHIFT_MASK+16(%rip), %r12
	sub	%r13, %r12
# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
# (%r13 is the number of bytes in plaintext mod 16)
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm1		# right shift 16-%r13 bytes

	movdqa	%xmm1, %xmm2		# keep ciphertext copy for GHASH
	pxor	%xmm1, %xmm0		# Ciphertext XOR E(K, Yn)
	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
	pand	%xmm1, %xmm0		# mask out top 16-%r13 bytes of %xmm0
	pand	%xmm1, %xmm2
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10 ,%xmm2

	pxor	%xmm2, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# GHASH computation for the last <16 byte block
	sub	%r13, %r11
	add	$16, %r11

	# output %r13 bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_decrypt
	mov	%rax, (%arg2 , %r11, 1)	# write low 8 plaintext bytes
	add	$8, %r11
	psrldq	$8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_decrypt:
	mov	%al, (%arg2, %r11, 1)	# write remaining bytes one at a time
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	shl	$3, %arg4		# len(C) in bits (*8)
	MOVQ_R64_XMM %arg4, %xmm1
	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

	mov	%arg5, %rax		# %rax = *Y0
	movdqu	(%rax), %xmm0		# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Y0)
	pxor	%xmm8, %xmm0		# tag T = GHASH xor E(K, Y0)
_return_T_decrypt:
	mov	arg9, %r10		# %r10 = authTag
	mov	arg10, %r11		# %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_decrypt
	cmp	$12, %r11
	je	_T_12_decrypt
_T_8_decrypt:
	MOVQ_R64_XMM %xmm0, %rax
	mov	%rax, (%r10)		# store 8-byte tag
	jmp	_return_T_done_decrypt
_T_12_decrypt:
	MOVQ_R64_XMM %xmm0, %rax
	mov	%rax, (%r10)		# store low 8 bytes of tag
	psrldq	$8, %xmm0
	movd	%xmm0, %eax
	mov	%eax, 8(%r10)		# store remaining 4 bytes of tag
	jmp	_return_T_done_decrypt
_T_16_decrypt:
	movdqu	%xmm0, (%r10)		# store full 16-byte tag
_return_T_done_decrypt:
	mov	%r14, %rsp		# restore caller's stack pointer
	pop	%r14
	pop	%r13
	pop	%r12
	ret
ENDPROC(aesni_gcm_dec)
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001440
1441
1442/*****************************************************************************
1443* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1444* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1445* const u8 *in, // Plaintext input
1446* u64 plaintext_len, // Length of data in bytes for encryption.
1447* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1448* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1449* // concatenated with 0x00000001. 16-byte aligned pointer.
1450* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1451* const u8 *aad, // Additional Authentication Data (AAD)
1452* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1453* u8 *auth_tag, // Authenticated Tag output.
1454* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1455* // 12 or 8.
1456*
1457* Assumptions:
1458*
1459* keys:
1460* keys are pre-expanded and aligned to 16 bytes. we are using the
1461* first set of 11 keys in the data structure void *aes_ctx
1462*
1463*
1464* iv:
1465* 0 1 2 3
1466* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1467* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1468* | Salt (From the SA) |
1469* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1470* | Initialization Vector |
1471* | (This is the sequence number from IPSec header) |
1472* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1473* | 0x1 |
1474* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1475*
1476*
1477*
1478* AAD:
1479* AAD padded to 128 bits with 0
1480* for example, assume AAD is a u32 vector
1481*
1482* if AAD is 8 bytes:
1483* AAD[3] = {A0, A1};
1484* padded AAD in xmm register = {A1 A0 0 0}
1485*
1486* 0 1 2 3
1487* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1488* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1489* | SPI (A1) |
1490* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1491* | 32-bit Sequence Number (A0) |
1492* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1493* | 0x0 |
1494* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1495*
1496* AAD Format with 32-bit Sequence Number
1497*
1498* if AAD is 12 bytes:
1499* AAD[3] = {A0, A1, A2};
1500* padded AAD in xmm register = {A2 A1 A0 0}
1501*
1502* 0 1 2 3
1503* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1504* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1505* | SPI (A2) |
1506* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1507* | 64-bit Extended Sequence Number {A1,A0} |
1508* | |
1509* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1510* | 0x0 |
1511* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1512*
1513* AAD Format with 64-bit Extended Sequence Number
1514*
1515* aadLen:
1516* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1517* The code supports 16 too but for other sizes, the code will fail.
1518*
1519* TLen:
1520* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1521* For other sizes, the code will fail.
1522*
1523* poly = x^128 + x^127 + x^126 + x^121 + 1
1524***************************************************************************/
ENTRY(aesni_gcm_enc)
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14		# %r14 preserves the caller's %rsp
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp		# align rsp to 64 bytes
	mov	%arg6, %r12
	movdqu	(%r12), %xmm13		# %xmm13 = HashKey
	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13	# byte-swap HashKey (16 byte swap)


# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13		# shift each qword left by 1
	psrlq	$63, %xmm2		# carry bits out of each qword
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2		# carry from low qword into high qword
	psrldq	$8, %xmm1		# %xmm1 = carry out of the high qword
	por	%xmm2, %xmm13

	# reduce HashKey<<1

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd	TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2	# conditionally xor in the GCM polynomial
	pxor	%xmm2, %xmm13
	movdqa	%xmm13, HashKey(%rsp)	# %xmm13 holds HashKey<<1 (mod poly)
	mov	%arg4, %r13
	and	$-16, %r13		# %r13 = plaintext_len - (mod 16 part)
	mov	%r13, %r12

	# Encrypt first few blocks

	and	$(3<<4), %r12		# %r12 = 16 * ((number of blocks) mod 4)
	jz	_initial_num_blocks_is_0_encrypt
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_encrypt
	je	_initial_num_blocks_is_2_encrypt
_initial_num_blocks_is_3_encrypt:
	INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
	sub	$48, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_2_encrypt:
	INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
	sub	$32, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_1_encrypt:
	INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
	sub	$16, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_0_encrypt:
	INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
_initial_blocks_encrypted:

	# Main loop - Encrypt remaining blocks

	cmp	$0, %r13
	je	_zero_cipher_left_encrypt
	sub	$64, %r13
	je	_four_cipher_left_encrypt
_encrypt_by_4_encrypt:
	# encrypt 4 blocks and GHASH the previous 4 per iteration
	GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_encrypt_by_4_encrypt
_four_cipher_left_encrypt:
	GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_encrypt:
	mov	%arg4, %r13
	and	$15, %r13		# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_encrypt

	# Handle the last <16 Byte block separately
	paddd	ONE(%rip), %xmm0	# INCR CNT to get Yn
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0


	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# Encrypt(K, Yn)
	sub	$16, %r11
	add	%r13, %r11
	movdqu	(%arg3,%r11,1), %xmm1	# receive the last <16 byte blocks
	lea	SHIFT_MASK+16(%rip), %r12
	sub	%r13, %r12
	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (%r13 is the number of bytes in plaintext mod 16)
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm1		# shift right 16-r13 bytes
	pxor	%xmm1, %xmm0		# Plaintext XOR Encrypt(K, Yn)
	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0		# mask out top 16-r13 bytes of xmm0
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10,%xmm0

	pxor	%xmm0, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# GHASH computation for the last <16 byte block
	sub	%r13, %r11
	add	$16, %r11

	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	# shuffle xmm0 back to output as ciphertext

	# Output %r13 bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_encrypt
	mov	%rax, (%arg2 , %r11, 1)	# write low 8 ciphertext bytes
	add	$8, %r11
	psrldq	$8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_encrypt:
	mov	%al, (%arg2, %r11, 1)	# write remaining bytes one at a time
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_encrypt
_multiple_of_16_bytes_encrypt:
	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	shl	$3, %arg4		# len(C) in bits (*8)
	MOVQ_R64_XMM %arg4, %xmm1
	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8	# perform a 16 byte swap

	mov	%arg5, %rax		# %rax = *Y0
	movdqu	(%rax), %xmm0		# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15	# Encrypt(K, Y0)
	pxor	%xmm8, %xmm0		# tag T = GHASH xor E(K, Y0)
_return_T_encrypt:
	mov	arg9, %r10		# %r10 = authTag
	mov	arg10, %r11		# %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_encrypt
	cmp	$12, %r11
	je	_T_12_encrypt
_T_8_encrypt:
	MOVQ_R64_XMM %xmm0, %rax
	mov	%rax, (%r10)		# store 8-byte tag
	jmp	_return_T_done_encrypt
_T_12_encrypt:
	MOVQ_R64_XMM %xmm0, %rax
	mov	%rax, (%r10)		# store low 8 bytes of tag
	psrldq	$8, %xmm0
	movd	%xmm0, %eax
	mov	%eax, 8(%r10)		# store remaining 4 bytes of tag
	jmp	_return_T_done_encrypt
_T_16_encrypt:
	movdqu	%xmm0, (%r10)		# store full 16-byte tag
_return_T_done_encrypt:
	mov	%r14, %rsp		# restore caller's stack pointer
	pop	%r14
	pop	%r13
	pop	%r12
	ret
ENDPROC(aesni_gcm_enc)
Tadeusz Struk3c097b82010-12-13 19:51:15 +08001704
Mathias Krause559ad0f2010-11-29 08:35:39 +08001705#endif
Tadeusz Struk0bd82f52010-11-04 15:00:45 -04001706
1707
.align 4
/*
 * Derive the next round key for AES-128 (also the "a" step of the AES-256
 * schedule, which shares the same transform).
 * In:  %xmm0 = previous round key, %xmm1 = AESKEYGENASSIST output,
 *      TKEYP = pointer to the next round-key slot.
 * Out: %xmm0 = new round key, also stored at (TKEYP); TKEYP advanced 16.
 * NOTE(review): the shufps steps merge from %xmm4, which appears to be
 * expected zero on entry (zeroed by the caller) -- confirm in aesni_set_key.
 */
_key_expansion_128:
_key_expansion_256a:
	pshufd $0b11111111, %xmm1, %xmm1	# broadcast rot/sub word
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0			# xor in key words shifted by 1
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0			# xor in key words shifted by 2
	pxor %xmm1, %xmm0			# fold in keygenassist result
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001722
.align 4
/*
 * AES-192 key expansion, variant that produces two 16-byte output slots.
 * In:  %xmm0 = previous round key, %xmm1 = AESKEYGENASSIST output,
 *      %xmm2 = high 8 key bytes (third key word pair), TKEYP = output ptr.
 * Out: two round-key slots written at (TKEYP); TKEYP advanced 32.
 * Clobbers %xmm3, %xmm5, %xmm6.
 * NOTE(review): as with _key_expansion_128, %xmm4 looks to be assumed
 * zero on entry -- confirm in aesni_set_key.
 */
_key_expansion_192a:
	pshufd $0b01010101, %xmm1, %xmm1	# broadcast rot/sub word
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0			# xor in key words shifted by 1
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0			# xor in key words shifted by 2
	pxor %xmm1, %xmm0			# fold in keygenassist result

	movaps %xmm2, %xmm5
	movaps %xmm2, %xmm6
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2			# extend schedule into %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, %xmm1
	shufps $0b01000100, %xmm0, %xmm6	# pack first output slot
	movaps %xmm6, (TKEYP)
	shufps $0b01001110, %xmm2, %xmm1	# pack second output slot
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
	ret
ENDPROC(_key_expansion_192a)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001747
/*
 * _key_expansion_192b: internal ABI
 *	AES-192 expansion step that emits a single 16-byte round key
 *	(companion to _key_expansion_192a, which emits 32 bytes).
 * input:
 *	%xmm0:	low 16 bytes of the running key state
 *	%xmm2:	high 8 bytes of the running key state
 *	%xmm1:	AESKEYGENASSIST result for this round
 *	%xmm4:	must be zero (cleared once by aesni_set_key)
 *	TKEYP:	pointer to the next free round-key slot
 * output:
 *	%xmm0, %xmm2: updated key state; %xmm0 stored at (TKEYP)
 *	TKEYP:	advanced by 0x10
 * changed:
 *	%xmm1, %xmm3, %xmm5 (scratch)
 */
.align 4
_key_expansion_192b:
	pshufd $0b01010101, %xmm1, %xmm1	# broadcast keygenassist dword 1
	shufps $0b00010000, %xmm0, %xmm4	# sliding xor of xmm0 into
	pxor %xmm4, %xmm0			# itself (xmm4 starts zero)
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0			# mix in SubWord/Rcon material

	movaps %xmm2, %xmm5
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3	# broadcast last dword of xmm0
	pxor %xmm3, %xmm2			# update high key half
	pxor %xmm5, %xmm2

	movaps %xmm0, (TKEYP)			# append round key to schedule
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_192b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001767
/*
 * _key_expansion_256b: internal ABI
 *	"b" (odd) half of AES-256 expansion: same sliding-xor transform as
 *	_key_expansion_256a but applied to %xmm2, the second 16 key bytes.
 * input:
 *	%xmm2:	previous odd round key
 *	%xmm1:	AESKEYGENASSIST result (derived from %xmm0 by the caller)
 *	%xmm4:	must be zero (cleared once by aesni_set_key)
 *	TKEYP:	pointer to the next free round-key slot
 * output:
 *	%xmm2:	new round key, also stored at (TKEYP)
 *	TKEYP:	advanced by 0x10
 */
.align 4
_key_expansion_256b:
	pshufd $0b10101010, %xmm1, %xmm1	# broadcast keygenassist dword 2
	shufps $0b00010000, %xmm2, %xmm4	# sliding xor of xmm2 into
	pxor %xmm4, %xmm2			# itself (xmm4 starts zero)
	shufps $0b10001100, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	pxor %xmm1, %xmm2			# mix in SubWord material
	movaps %xmm2, (TKEYP)			# append round key to schedule
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_256b)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001780
/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *		     unsigned int key_len)
 *
 * Expands the user key into both the encryption schedule (stored forward
 * from the start of ctx) and the decryption schedule (stored at offset 240,
 * built by running AESIMC over the encryption round keys in reverse).
 * The key length is stashed at offset 480 of ctx for later use by the
 * enc/dec entry points.  Always returns 0 (AREG is cleared on exit).
 */
ENTRY(aesni_set_key)
#ifndef __x86_64__
	pushl KEYP
	movl 8(%esp), KEYP		# ctx
	movl 12(%esp), UKEYP		# in_key
	movl 16(%esp), %edx		# key_len
#endif
	movups (UKEYP), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (KEYP)		# round-0 key = raw user key
	lea 0x10(KEYP), TKEYP		# key addr (schedule write cursor)
	movl %edx, 480(KEYP)		# remember key_len for enc/dec
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl			# dispatch on key length in bytes:
	jb .Lenc_key128			#   <24 -> AES-128
	je .Lenc_key192			#   =24 -> AES-192, else AES-256
	movups 0x10(UKEYP), %xmm2	# other user key (bytes 16..31)
	movaps %xmm2, (TKEYP)		# round-1 key = raw high user key
	add $0x10, TKEYP
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_256a
	AESKEYGENASSIST 0x1 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_256a
	AESKEYGENASSIST 0x2 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_256a
	AESKEYGENASSIST 0x4 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_256a
	AESKEYGENASSIST 0x8 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_256a
	AESKEYGENASSIST 0x10 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_256a
	AESKEYGENASSIST 0x20 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	movq 0x10(UKEYP), %xmm2		# other user key (bytes 16..23)
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_192a
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_192b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_192a
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_192b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_192a
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_192b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_192a
	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
	call _key_expansion_128
	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
	call _key_expansion_128
	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
	call _key_expansion_128
	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
	call _key_expansion_128
	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
	call _key_expansion_128
	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
	call _key_expansion_128
	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
	call _key_expansion_128
	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
	call _key_expansion_128
	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
	call _key_expansion_128
	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	# Build the decryption schedule at offset 240: first/last round keys
	# are swapped verbatim, intermediate keys pass through AESIMC.
	sub $0x10, TKEYP		# TKEYP -> last encryption round key
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)	# dec schedule ends with round-0 key
	movaps %xmm1, 240(KEYP)		# dec schedule starts with last key
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP	# UKEYP = dec schedule write cursor
.align 4
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	AESIMC %xmm0 %xmm1		# InvMixColumns for equivalent-inverse
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP			# walk enc schedule forward ...
	sub $0x10, UKEYP		# ... dec schedule backward
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
	xor AREG, AREG			# return 0
#ifndef __x86_64__
	popl KEYP
#endif
	ret
ENDPROC(aesni_set_key)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001893
/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Encrypt one 16-byte block: dst = AES-Enc(ctx, src).
 * On 32-bit, arguments come off the stack and the clobbered KEYP/KLEN
 * registers are preserved around the call.
 */
ENTRY(aesni_enc)
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl 12(%esp), KEYP
	movl 16(%esp), OUTP
	movl 20(%esp), INP
#endif
	movl 480(KEYP), KLEN		# key length (stored by aesni_set_key)
	movups (INP), STATE		# input (may be unaligned)
	call _aesni_enc1
	movups STATE, (OUTP)	# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	ret
ENDPROC(aesni_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001915
/*
 * _aesni_enc1: internal ABI
 *	Encrypt one block already loaded in STATE.
 * input:
 *	KEYP:	key struct pointer
 *	KLEN:	key length in bytes (16/24/32, stored by aesni_set_key);
 *		selects how many extra rounds run before the common tail
 *	STATE:	initial state (input)
 * output:
 *	STATE:	final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc1:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0 (AddRoundKey)
	add $0x30, TKEYP
	cmp $24, KLEN			# <24: AES-128, =24: AES-192, else 256
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP
	je .Lenc192
	# AES-256: two extra rounds before falling into the 192 tail
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc192:
	# AES-192: two extra rounds before the common 128 tail
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc128:
	# common tail: nine AESENC rounds plus the final AESENCLAST
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps (TKEYP), KEY
	AESENC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE
	ret
ENDPROC(_aesni_enc1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11001973
/*
 * _aesni_enc4: internal ABI
 *	Encrypt four independent blocks in parallel (interleaved to hide
 *	AESENC latency).  Same round structure as _aesni_enc1.
 * input:
 *	KEYP:	key struct pointer
 *	KLEN:	key length in bytes (16/24/32, stored by aesni_set_key)
 *	STATE1:	initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:	final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc4:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0 (AddRoundKey)
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN			# <24: AES-128, =24: AES-192, else 256
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP
	je .L4enc192
	# AES-256: two extra rounds before falling into the 192 tail
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc192:
	# AES-192: two extra rounds before the common 128 tail
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc128:
	# common tail: nine AESENC rounds plus the final AESENCLAST
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps (TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE1		# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret
ENDPROC(_aesni_enc4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002082
/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Decrypt one 16-byte block: dst = AES-Dec(ctx, src).
 * The decryption schedule lives at offset 240 of ctx (built by
 * aesni_set_key), hence the KEYP adjustment before calling _aesni_dec1.
 */
ENTRY(aesni_dec)
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl 12(%esp), KEYP
	movl 16(%esp), OUTP
	movl 20(%esp), INP
#endif
	mov 480(KEYP), KLEN		# key length (stored by aesni_set_key)
	add $240, KEYP			# point at the decryption schedule
	movups (INP), STATE		# input (may be unaligned)
	call _aesni_dec1
	movups STATE, (OUTP)	#output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	ret
ENDPROC(aesni_dec)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002105
/*
 * _aesni_dec1: internal ABI
 *	Decrypt one block already loaded in STATE.  KEYP must point at the
 *	decryption schedule (ctx + 240, see aesni_dec).
 * input:
 *	KEYP:	key struct pointer (decryption schedule)
 *	KLEN:	key length in bytes (16/24/32); selects extra rounds
 *	STATE:	initial state (input)
 * output:
 *	STATE:	final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec1:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0 (AddRoundKey)
	add $0x30, TKEYP
	cmp $24, KLEN			# <24: AES-128, =24: AES-192, else 256
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP
	je .Ldec192
	# AES-256: two extra rounds before falling into the 192 tail
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec192:
	# AES-192: two extra rounds before the common 128 tail
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec128:
	# common tail: nine AESDEC rounds plus the final AESDECLAST
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps (TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE
	ret
ENDPROC(_aesni_dec1)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002163
/*
 * _aesni_dec4: internal ABI
 *	Decrypt four independent blocks in parallel (interleaved to hide
 *	AESDEC latency).  KEYP must point at the decryption schedule.
 * input:
 *	KEYP:	key struct pointer (decryption schedule)
 *	KLEN:	key length in bytes (16/24/32)
 *	STATE1:	initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:	final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec4:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0 (AddRoundKey)
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN			# <24: AES-128, =24: AES-192, else 256
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP
	je .L4dec192
	# AES-256: two extra rounds before falling into the 192 tail
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec192:
	# AES-192: two extra rounds before the common 128 tail
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec128:
	# common tail: nine AESDEC rounds plus the final AESDECLAST
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps (TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE1		# last round
	AESDECLAST KEY STATE2
	AESDECLAST KEY STATE3
	AESDECLAST KEY STATE4
	ret
ENDPROC(_aesni_dec4)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002272
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * ECB-encrypt len bytes (processed in whole 16-byte blocks; a trailing
 * partial block is ignored).  Uses the 4-way path while >= 64 bytes
 * remain, then single blocks.
 */
ENTRY(aesni_ecb_enc)
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 16(%esp), KEYP
	movl 20(%esp), OUTP
	movl 24(%esp), INP
	movl 28(%esp), LEN
#endif
	test LEN, LEN		# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN		# less than one block: nothing to do
	jb .Lecb_enc_ret
	cmp $64, LEN		# 4-way path only for >= 4 blocks
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	# remainder: one block at a time
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	ret
ENDPROC(aesni_ecb_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002330
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * ECB-decrypt len bytes (whole 16-byte blocks; a trailing partial block
 * is ignored).  Mirrors aesni_ecb_enc but uses the decryption schedule
 * at ctx + 240.
 */
ENTRY(aesni_ecb_dec)
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 16(%esp), KEYP
	movl 20(%esp), OUTP
	movl 24(%esp), INP
	movl 28(%esp), LEN
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP			# point at the decryption schedule
	cmp $16, LEN			# less than one block: nothing to do
	jb .Lecb_dec_ret
	cmp $64, LEN			# 4-way path only for >= 4 blocks
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	# remainder: one block at a time
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	ret
ENDPROC(aesni_ecb_dec)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002389
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-encrypt len bytes (whole 16-byte blocks).  CBC encryption is a
 * serial chain (each block depends on the previous ciphertext), so there
 * is no 4-way path.  The final ciphertext block is written back to *iv
 * for chaining across calls.
 */
ENTRY(aesni_cbc_enc)
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 20(%esp), KEYP
	movl 24(%esp), OUTP
	movl 28(%esp), INP
	movl 32(%esp), LEN
	movl 36(%esp), IVP
#endif
	cmp $16, LEN			# less than one block: nothing to do
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE			# chain: state = prev_ct ^ plaintext
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)		# save last ciphertext as next IV
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	ret
ENDPROC(aesni_cbc_enc)
Huang Ying54b6a1b2009-01-18 16:28:34 +11002431
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-decrypt len bytes (whole 16-byte blocks).  Decryption parallelizes
 * (each plaintext needs only the previous *ciphertext*), so a 4-way path
 * is used while >= 64 bytes remain.  On 32-bit there are not enough XMM
 * registers to keep all four input blocks live across _aesni_dec4, so the
 * #else path reloads ciphertext blocks from INP after the call — this
 * requires that INP is not advanced until after the stores.  The last
 * ciphertext block is written back to *iv for chaining.
 */
ENTRY(aesni_cbc_dec)
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 20(%esp), KEYP
	movl 24(%esp), OUTP
	movl 28(%esp), INP
	movl 32(%esp), LEN
	movl 36(%esp), IVP
#endif
	cmp $16, LEN			# less than one block: nothing to do
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN
	add $240, KEYP			# point at the decryption schedule
	movups (IVP), IV
	cmp $64, LEN			# 4-way path only for >= 4 blocks
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	# 32-bit: IN3/IN4 don't exist; reuse IN1/IN2 for blocks 2 and 3
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1			# first block chains with the IV
#ifdef __x86_64__
	pxor IN1, STATE2		# remaining blocks chain with the
	pxor IN2, STATE3		# preceding ciphertext block
	pxor IN3, STATE4
	movaps IN4, IV			# last ciphertext becomes next IV
#else
	pxor IN1, STATE4
	movaps IN2, IV			# last ciphertext becomes next IV
	movups (INP), IN1		# reload ciphertext blocks 0 and 1
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	# remainder: one block at a time
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE			# chain with previous ciphertext/IV
	movups STATE, (OUTP)
	movaps IN, IV			# this ciphertext chains the next
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)		# save IV for chaining across calls
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	ret
ENDPROC(aesni_cbc_dec)
Huang Ying12387a42010-03-10 18:28:55 +08002522
Mathias Krause0d258ef2010-11-27 16:34:46 +08002523#ifdef __x86_64__
.align 16
.Lbswap_mask:
	# PSHUFB control to byte-reverse a 128-bit value (big <-> little endian)
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/*
 * _aesni_inc_init: internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
.align 4
_aesni_inc_init:
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR	# convert big-endian IV to LE counter
	mov $1, TCTR_LOW
	MOVQ_R64_XMM TCTR_LOW INC	# INC = 1 in the low qword
	MOVQ_R64_XMM CTR TCTR_LOW	# mirror CTR's low qword in a GPR
	ret
ENDPROC(_aesni_inc_init)
Huang Ying12387a42010-03-10 18:28:55 +08002549
/*
 * _aesni_inc: internal ABI
 *	Increase IV by 1, IV is in big endian
 *	(128-bit increment: the GPR mirror TCTR_LOW detects carry out of
 *	the low qword, in which case the high qword is bumped too.)
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	Increase by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
.align 4
_aesni_inc:
	paddq INC, CTR			# low qword += 1 (paddq has no carry)
	add $1, TCTR_LOW		# mirror add in GPR to get CF
	jnc .Linc_low			# no carry: done
	pslldq $8, INC			# carry: move the 1 to the high qword
	paddq INC, CTR			# propagate carry into high qword
	psrldq $8, INC			# restore INC for the next call
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV	# convert LE counter back to BE IV
	ret
ENDPROC(_aesni_inc)
Huang Ying12387a42010-03-10 18:28:55 +08002578
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR-mode encrypt/decrypt len bytes (whole 16-byte blocks; x86_64 only).
 * Each block's keystream is AES-Enc(counter); counters increment per block
 * via _aesni_inc.  4-way path while >= 64 bytes remain.  The advanced
 * counter is written back to *iv for chaining across calls.
 */
ENTRY(aesni_ctr_enc)
	cmp $16, LEN			# less than one block: nothing to do
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init		# set up CTR/INC/BSWAP_MASK/TCTR_LOW
	cmp $64, LEN			# 4-way path only for >= 4 blocks
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	# generate four consecutive counter blocks, loading inputs between
	# increments to overlap latency
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1		# xor keystream with input
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	# remainder: one block at a time
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)		# save advanced counter for chaining
.Lctr_enc_just_ret:
	ret
ENDPROC(aesni_ctr_enc)
Mathias Krause0d258ef2010-11-27 16:34:46 +08002639#endif