/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/inst.h>

#ifdef __x86_64__
.data
POLY:   .octa 0xC2000000000000000000000000000001
TWOONE: .octa 0x00000001000000000000000000000001

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and ZERO should follow ALL_F

SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
MASK1:      .octa 0x0000000000000000ffffffffffffffff
MASK2:      .octa 0xffffffffffffffff0000000000000000
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
ZERO:       .octa 0x00000000000000000000000000000000
ONE:        .octa 0x00000000000000000000000000000001
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
dec:        .octa 0x1
enc:        .octa 0x2
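
# Notes on the constants above:
# - POLY encodes the GHASH reduction polynomial x^128 + x^7 + x^2 + x + 1 in
#   the bit-reflected representation used throughout this file.
# - SHUF_MASK is a PSHUFB control that reverses the 16 bytes of a register,
#   converting between memory byte order and the bit-reflected GHASH order.
# - ONE is added to the counter block Y0 to increment it between blocks.
# - SHIFT_MASK, ALL_F, ZERO and F_MIN_MASK are presumably used together when
#   handling the final partial block, which is why their relative order matters.
# - dec and enc are small flag values distinguishing the two operations.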


.text


#define	STACK_OFFSET    8*3
#define	HashKey		16*0	// store HashKey <<1 mod poly here
#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
				// bits of HashKey <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
				// bits of HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
				// bits of HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
				// bits of HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
#define	VARIABLE_OFFSET	16*8

#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
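
# arg1-arg6 follow the x86_64 SysV calling convention (rdi, rsi, rdx, rcx,
# r8, r9); arg7-arg10 are the stack-passed arguments, fetched relative to
# %r14.  STACK_OFFSET (8*3) presumably accounts for three registers saved by
# the GCM entry points before %r14 is set up as a frame pointer.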
#endif


#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif


#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input.
* GH = GH * HK * x mod poly, which is equivalent to GH*HashKey mod poly.
*
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
movdqa	  \GH, \TMP1
pshufd	  $78, \GH, \TMP2
pshufd	  $78, \HK, \TMP3
pxor	  \GH, \TMP2            # TMP2 = a1+a0
pxor	  \HK, \TMP3            # TMP3 = b1+b0
PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
pxor	  \GH, \TMP2
pxor	  \TMP1, \TMP2          # TMP2 = (a1*b0)+(a0*b1), the middle term
movdqa	  \TMP2, \TMP3
pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
pxor	  \TMP3, \GH
pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK

# first phase of the reduction

movdqa    \GH, \TMP2
movdqa    \GH, \TMP3
movdqa    \GH, \TMP4            # copy GH into TMP2, TMP3 and TMP4
				# in order to perform
				# independent shifts
pslld     $31, \TMP2            # packed left shift <<31
pslld     $30, \TMP3            # packed left shift <<30
pslld     $25, \TMP4            # packed left shift <<25
pxor      \TMP3, \TMP2          # xor the shifted versions
pxor      \TMP4, \TMP2
movdqa    \TMP2, \TMP5
psrldq    $4, \TMP5             # right shift TMP5 1 DW
pslldq    $12, \TMP2            # left shift TMP2 3 DWs
pxor      \TMP2, \GH

# second phase of the reduction

movdqa    \GH,\TMP2             # copy GH into TMP2, TMP3 and TMP4
				# in order to perform
				# independent shifts
movdqa    \GH,\TMP3
movdqa    \GH,\TMP4
psrld     $1,\TMP2              # packed right shift >>1
psrld     $2,\TMP3              # packed right shift >>2
psrld     $7,\TMP4              # packed right shift >>7
pxor      \TMP3,\TMP2		# xor the shifted versions
pxor      \TMP4,\TMP2
pxor      \TMP5, \TMP2
pxor      \TMP2, \GH
pxor      \TMP1, \GH            # result is in GH
.endm
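
/*
* Illustrative sketch (plain C, not assembled): a software model of the
* Karatsuba carry-less multiply performed above, where the 128x128-bit
* product is built from three 64x64 PCLMULQDQ-style multiplies instead of
* four.  The GF(2^128) reduction done by the shift/XOR sequences above is
* intentionally not repeated here.  Names such as clmul64() are ad hoc,
* for illustration only; the C version needs <stdint.h> for uint64_t.
*
*	struct u128 { uint64_t lo, hi; };
*
*	// software model of one PCLMULQDQ: 64x64 -> 128-bit carry-less multiply
*	static struct u128 clmul64(uint64_t a, uint64_t b)
*	{
*		struct u128 r = { 0, 0 };
*		int i;
*
*		for (i = 0; i < 64; i++) {
*			if ((b >> i) & 1) {
*				r.lo ^= a << i;
*				if (i)
*					r.hi ^= a >> (64 - i);
*			}
*		}
*		return r;
*	}
*
*	// A = a1*x^64 + a0, B = b1*x^64 + b0; out[0..3] = 256-bit product
*	static void clmul128_karatsuba(struct u128 a, struct u128 b,
*				       uint64_t out[4])
*	{
*		struct u128 hh = clmul64(a.hi, b.hi);               // a1*b1
*		struct u128 ll = clmul64(a.lo, b.lo);               // a0*b0
*		struct u128 mm = clmul64(a.hi ^ a.lo, b.hi ^ b.lo); // (a1+a0)*(b1+b0)
*
*		mm.lo ^= hh.lo ^ ll.lo;	// middle term = a1*b0 + a0*b1
*		mm.hi ^= hh.hi ^ ll.hi;
*
*		out[0] = ll.lo;
*		out[1] = ll.hi ^ mm.lo;	// middle term lands at bit offset 64
*		out[2] = hh.lo ^ mm.hi;
*		out[3] = hh.hi;
*	}
*/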

/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8 and %xmm9 registers
* are clobbered
* %arg1, %arg2, %arg3 and %r14 are used as pointers only, not modified
*/
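
/*
* Illustrative sketch (C, not assembled): how the block counts described
* above relate to the total length.  The dispatch between the
* num_initial_blocks variants is presumably done elsewhere in this file by
* the code that invokes these macros; the helper name below is ad hoc.
*
*	// a = total plaintext bytes, b = floor(a/16) whole blocks;
*	// the main loop handles 4 blocks per iteration, so
*	// num_initial_blocks = b mod 4 blocks are peeled off first.
*	static unsigned int num_initial_blocks(uint64_t a)
*	{
*		uint64_t b = a / 16;
*
*		return (unsigned int)(b % 4);
*	}
*/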


.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
mov	   arg7, %r10           # %r10 = AAD
mov	   arg8, %r12           # %r12 = aadLen
mov	   %r12, %r11
pxor	   %xmm\i, %xmm\i
_get_AAD_loop\num_initial_blocks\operation:
movd	   (%r10), \TMP1
pslldq	   $12, \TMP1
psrldq	   $4, %xmm\i
pxor	   \TMP1, %xmm\i
add	   $4, %r10
sub	   $4, %r12
jne	   _get_AAD_loop\num_initial_blocks\operation
cmp	   $16, %r11
je	   _get_AAD_loop2_done\num_initial_blocks\operation
mov	   $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
psrldq	   $4, %xmm\i
sub	   $4, %r12
cmp	   %r11, %r12
jne	   _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
movdqa     SHUF_MASK(%rip), %xmm14
PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data

xor	   %r11, %r11 # initialise the data pointer offset as zero

# start AES for num_initial_blocks blocks

mov	   %arg5, %rax                      # %rax = *Y0
movdqu	   (%rax), \XMM0                    # XMM0 = Y0
movdqa     SHUF_MASK(%rip), %xmm14
PSHUFB_XMM   %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
.irpc index, \i_seq
paddd	   ONE(%rip), \XMM0                 # INCR Y0
movdqa	   \XMM0, %xmm\index
movdqa     SHUF_MASK(%rip), %xmm14
PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap

.endr
.irpc index, \i_seq
pxor	   16*0(%arg1), %xmm\index
.endr
.irpc index, \i_seq
movaps 0x10(%rdi), \TMP1
AESENC     \TMP1, %xmm\index          # Round 1
.endr
.irpc index, \i_seq
movaps 0x20(%arg1), \TMP1
AESENC     \TMP1, %xmm\index          # Round 2
.endr
.irpc index, \i_seq
movaps 0x30(%arg1), \TMP1
AESENC     \TMP1, %xmm\index          # Round 3
.endr
.irpc index, \i_seq
movaps 0x40(%arg1), \TMP1
AESENC     \TMP1, %xmm\index          # Round 4
.endr
.irpc index, \i_seq
movaps 0x50(%arg1), \TMP1
AESENC     \TMP1, %xmm\index          # Round 5
.endr
.irpc index, \i_seq
movaps 0x60(%arg1), \TMP1
AESENC     \TMP1, %xmm\index          # Round 6
.endr
.irpc index, \i_seq
movaps 0x70(%arg1), \TMP1
AESENC     \TMP1, %xmm\index          # Round 7
.endr
.irpc index, \i_seq
movaps 0x80(%arg1), \TMP1
AESENC     \TMP1, %xmm\index          # Round 8
.endr
.irpc index, \i_seq
movaps 0x90(%arg1), \TMP1
AESENC     \TMP1, %xmm\index          # Round 9
.endr
.irpc index, \i_seq
movaps 0xa0(%arg1), \TMP1
AESENCLAST \TMP1, %xmm\index         # Round 10
.endr
.irpc index, \i_seq
movdqu	   (%arg3 , %r11, 1), \TMP1
pxor	   \TMP1, %xmm\index
movdqu	   %xmm\index, (%arg2 , %r11, 1)
# write back plaintext/ciphertext for num_initial_blocks
add	   $16, %r11

movdqa     \TMP1, %xmm\index
movdqa     SHUF_MASK(%rip), %xmm14
PSHUFB_XMM	   %xmm14, %xmm\index

# prepare plaintext/ciphertext for GHASH computation
.endr
.endif
GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
# apply GHASH on num_initial_blocks blocks

.if \i == 5
pxor       %xmm5, %xmm6
GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
pxor       %xmm6, %xmm7
GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
pxor       %xmm7, %xmm8
GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
pxor       %xmm6, %xmm7
GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
pxor       %xmm7, %xmm8
GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
pxor       %xmm7, %xmm8
GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
cmp	   $64, %r13
jl	_initial_blocks_done\num_initial_blocks\operation
# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
*/
paddd	   ONE(%rip), \XMM0              # INCR Y0
movdqa	   \XMM0, \XMM1
movdqa     SHUF_MASK(%rip), %xmm14
PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap

paddd	   ONE(%rip), \XMM0              # INCR Y0
movdqa	   \XMM0, \XMM2
movdqa     SHUF_MASK(%rip), %xmm14
PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap

paddd	   ONE(%rip), \XMM0              # INCR Y0
movdqa	   \XMM0, \XMM3
movdqa     SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap

paddd	   ONE(%rip), \XMM0              # INCR Y0
movdqa	   \XMM0, \XMM4
movdqa     SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap

pxor	   16*0(%arg1), \XMM1
pxor	   16*0(%arg1), \XMM2
pxor	   16*0(%arg1), \XMM3
pxor	   16*0(%arg1), \XMM4
movdqa	   \TMP3, \TMP5
pshufd	   $78, \TMP3, \TMP1
pxor	   \TMP3, \TMP1
movdqa	   \TMP1, HashKey_k(%rsp)
GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
pshufd	   $78, \TMP5, \TMP1
pxor	   \TMP5, \TMP1
movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
movaps 0x10*\index(%arg1), \TMP1
AESENC	   \TMP1, \XMM1
AESENC	   \TMP1, \XMM2
AESENC	   \TMP1, \XMM3
AESENC	   \TMP1, \XMM4
.endr
GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
movdqa	   \TMP5, HashKey_3(%rsp)
pshufd	   $78, \TMP5, \TMP1
pxor	   \TMP5, \TMP1
movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
movaps 0x10*\index(%arg1), \TMP1
AESENC	   \TMP1, \XMM1
AESENC	   \TMP1, \XMM2
AESENC	   \TMP1, \XMM3
AESENC	   \TMP1, \XMM4
.endr
GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
movdqa	   \TMP5, HashKey_4(%rsp)
pshufd	   $78, \TMP5, \TMP1
pxor	   \TMP5, \TMP1
movdqa	   \TMP1, HashKey_4_k(%rsp)
movaps 0xa0(%arg1), \TMP2
AESENCLAST \TMP2, \XMM1
AESENCLAST \TMP2, \XMM2
AESENCLAST \TMP2, \XMM3
AESENCLAST \TMP2, \XMM4
movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
pxor	   \TMP1, \XMM1
movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
movdqa     \TMP1, \XMM1
movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
pxor	   \TMP1, \XMM2
movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
movdqa     \TMP1, \XMM2
movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
pxor	   \TMP1, \XMM3
movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
movdqa     \TMP1, \XMM3
movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
pxor	   \TMP1, \XMM4
movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
movdqa     \TMP1, \XMM4
add	   $64, %r11
movdqa     SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
movdqa     SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
movdqa     SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
movdqa     SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm


/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8 and %xmm9 registers
* are clobbered
* %arg1, %arg2, %arg3 and %r14 are used as pointers only, not modified
*/


.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
mov	   arg7, %r10           # %r10 = AAD
mov	   arg8, %r12           # %r12 = aadLen
mov	   %r12, %r11
pxor	   %xmm\i, %xmm\i
_get_AAD_loop\num_initial_blocks\operation:
movd	   (%r10), \TMP1
pslldq	   $12, \TMP1
psrldq	   $4, %xmm\i
pxor	   \TMP1, %xmm\i
add	   $4, %r10
sub	   $4, %r12
jne	   _get_AAD_loop\num_initial_blocks\operation
cmp	   $16, %r11
je	   _get_AAD_loop2_done\num_initial_blocks\operation
mov	   $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
psrldq	   $4, %xmm\i
sub	   $4, %r12
cmp	   %r11, %r12
jne	   _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
movdqa     SHUF_MASK(%rip), %xmm14
PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data

xor	   %r11, %r11 # initialise the data pointer offset as zero

# start AES for num_initial_blocks blocks

mov	   %arg5, %rax                      # %rax = *Y0
movdqu	   (%rax), \XMM0                    # XMM0 = Y0
movdqa     SHUF_MASK(%rip), %xmm14
PSHUFB_XMM   %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
.irpc index, \i_seq
paddd	   ONE(%rip), \XMM0                 # INCR Y0
movdqa	   \XMM0, %xmm\index
movdqa     SHUF_MASK(%rip), %xmm14
PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap

.endr
.irpc index, \i_seq
pxor	   16*0(%arg1), %xmm\index
.endr
.irpc index, \i_seq
movaps 0x10(%rdi), \TMP1
AESENC     \TMP1, %xmm\index          # Round 1
.endr
.irpc index, \i_seq
movaps 0x20(%arg1), \TMP1
AESENC     \TMP1, %xmm\index          # Round 2
.endr
.irpc index, \i_seq
movaps 0x30(%arg1), \TMP1
AESENC     \TMP1, %xmm\index          # Round 3
.endr
.irpc index, \i_seq
movaps 0x40(%arg1), \TMP1
AESENC     \TMP1, %xmm\index          # Round 4
.endr
.irpc index, \i_seq
movaps 0x50(%arg1), \TMP1
AESENC     \TMP1, %xmm\index          # Round 5
.endr
.irpc index, \i_seq
movaps 0x60(%arg1), \TMP1
AESENC     \TMP1, %xmm\index          # Round 6
.endr
.irpc index, \i_seq
movaps 0x70(%arg1), \TMP1
AESENC     \TMP1, %xmm\index          # Round 7
.endr
.irpc index, \i_seq
movaps 0x80(%arg1), \TMP1
AESENC     \TMP1, %xmm\index          # Round 8
.endr
.irpc index, \i_seq
movaps 0x90(%arg1), \TMP1
AESENC     \TMP1, %xmm\index          # Round 9
.endr
.irpc index, \i_seq
movaps 0xa0(%arg1), \TMP1
AESENCLAST \TMP1, %xmm\index         # Round 10
.endr
.irpc index, \i_seq
movdqu	   (%arg3 , %r11, 1), \TMP1
pxor	   \TMP1, %xmm\index
movdqu	   %xmm\index, (%arg2 , %r11, 1)
# write back plaintext/ciphertext for num_initial_blocks
add	   $16, %r11

movdqa     SHUF_MASK(%rip), %xmm14
PSHUFB_XMM	   %xmm14, %xmm\index

# prepare plaintext/ciphertext for GHASH computation
.endr
.endif
GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
# apply GHASH on num_initial_blocks blocks

.if \i == 5
pxor       %xmm5, %xmm6
GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
pxor       %xmm6, %xmm7
GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
pxor       %xmm7, %xmm8
GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
pxor       %xmm6, %xmm7
GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
pxor       %xmm7, %xmm8
GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
pxor       %xmm7, %xmm8
GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
cmp	   $64, %r13
jl	_initial_blocks_done\num_initial_blocks\operation
# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
*/
paddd	   ONE(%rip), \XMM0              # INCR Y0
movdqa	   \XMM0, \XMM1
movdqa     SHUF_MASK(%rip), %xmm14
PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap

paddd	   ONE(%rip), \XMM0              # INCR Y0
movdqa	   \XMM0, \XMM2
movdqa     SHUF_MASK(%rip), %xmm14
PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap

paddd	   ONE(%rip), \XMM0              # INCR Y0
movdqa	   \XMM0, \XMM3
movdqa     SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap

paddd	   ONE(%rip), \XMM0              # INCR Y0
movdqa	   \XMM0, \XMM4
movdqa     SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap

pxor	   16*0(%arg1), \XMM1
pxor	   16*0(%arg1), \XMM2
pxor	   16*0(%arg1), \XMM3
pxor	   16*0(%arg1), \XMM4
movdqa	   \TMP3, \TMP5
pshufd	   $78, \TMP3, \TMP1
pxor	   \TMP3, \TMP1
movdqa	   \TMP1, HashKey_k(%rsp)
GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
pshufd	   $78, \TMP5, \TMP1
pxor	   \TMP5, \TMP1
movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
movaps 0x10*\index(%arg1), \TMP1
AESENC	   \TMP1, \XMM1
AESENC	   \TMP1, \XMM2
AESENC	   \TMP1, \XMM3
AESENC	   \TMP1, \XMM4
.endr
GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
movdqa	   \TMP5, HashKey_3(%rsp)
pshufd	   $78, \TMP5, \TMP1
pxor	   \TMP5, \TMP1
movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
movaps 0x10*\index(%arg1), \TMP1
AESENC	   \TMP1, \XMM1
AESENC	   \TMP1, \XMM2
AESENC	   \TMP1, \XMM3
AESENC	   \TMP1, \XMM4
.endr
GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
movdqa	   \TMP5, HashKey_4(%rsp)
pshufd	   $78, \TMP5, \TMP1
pxor	   \TMP5, \TMP1
movdqa	   \TMP1, HashKey_4_k(%rsp)
movaps 0xa0(%arg1), \TMP2
AESENCLAST \TMP2, \XMM1
AESENCLAST \TMP2, \XMM2
AESENCLAST \TMP2, \XMM3
AESENCLAST \TMP2, \XMM4
movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
pxor	   \TMP1, \XMM1
movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
pxor	   \TMP1, \XMM2
movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
pxor	   \TMP1, \XMM3
movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
pxor	   \TMP1, \XMM4
movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)

add	   $64, %r11
movdqa     SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
movdqa     SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
movdqa     SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
movdqa     SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm

/*
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
* %arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
*/
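/*
* Scheduling note (descriptive): the macro below interleaves the AES-CTR
* rounds for four fresh counter blocks with the GHASH (Karatsuba multiply)
* of the four ciphertext blocks produced by the previous iteration, using
* HashKey^4..HashKey^1 so that the four hash contributions can be folded
* together in a single reduction at the end.
*/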
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

movdqa	  \XMM1, \XMM5
movdqa	  \XMM2, \XMM6
movdqa	  \XMM3, \XMM7
movdqa	  \XMM4, \XMM8

movdqa    SHUF_MASK(%rip), %xmm15
# multiply TMP5 * HashKey using karatsuba

movdqa	  \XMM5, \TMP4
pshufd	  $78, \XMM5, \TMP6
pxor	  \XMM5, \TMP6
paddd     ONE(%rip), \XMM0		# INCR CNT
movdqa	  HashKey_4(%rsp), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
movdqa    \XMM0, \XMM1
paddd     ONE(%rip), \XMM0		# INCR CNT
movdqa    \XMM0, \XMM2
paddd     ONE(%rip), \XMM0		# INCR CNT
movdqa    \XMM0, \XMM3
paddd     ONE(%rip), \XMM0		# INCR CNT
movdqa    \XMM0, \XMM4
PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

pxor	  (%arg1), \XMM1
pxor	  (%arg1), \XMM2
pxor	  (%arg1), \XMM3
pxor	  (%arg1), \XMM4
movdqa	  HashKey_4_k(%rsp), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
movaps 0x10(%arg1), \TMP1
AESENC	  \TMP1, \XMM1              # Round 1
AESENC	  \TMP1, \XMM2
AESENC	  \TMP1, \XMM3
AESENC	  \TMP1, \XMM4
movaps 0x20(%arg1), \TMP1
AESENC	  \TMP1, \XMM1              # Round 2
AESENC	  \TMP1, \XMM2
AESENC	  \TMP1, \XMM3
AESENC	  \TMP1, \XMM4
movdqa	  \XMM6, \TMP1
pshufd	  $78, \XMM6, \TMP2
pxor	  \XMM6, \TMP2
movdqa	  HashKey_3(%rsp), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
movaps 0x30(%arg1), \TMP3
AESENC    \TMP3, \XMM1              # Round 3
AESENC    \TMP3, \XMM2
AESENC    \TMP3, \XMM3
AESENC    \TMP3, \XMM4
PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
movaps 0x40(%arg1), \TMP3
AESENC	  \TMP3, \XMM1              # Round 4
AESENC	  \TMP3, \XMM2
AESENC	  \TMP3, \XMM3
AESENC	  \TMP3, \XMM4
movdqa	  HashKey_3_k(%rsp), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
movaps 0x50(%arg1), \TMP3
AESENC	  \TMP3, \XMM1              # Round 5
AESENC	  \TMP3, \XMM2
AESENC	  \TMP3, \XMM3
AESENC	  \TMP3, \XMM4
pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
pxor	  \XMM6, \XMM5
pxor	  \TMP2, \TMP6
movdqa	  \XMM7, \TMP1
pshufd	  $78, \XMM7, \TMP2
pxor	  \XMM7, \TMP2
movdqa	  HashKey_2(%rsp), \TMP5

# Multiply TMP5 * HashKey using karatsuba

PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
movaps 0x60(%arg1), \TMP3
AESENC	  \TMP3, \XMM1              # Round 6
AESENC	  \TMP3, \XMM2
AESENC	  \TMP3, \XMM3
AESENC	  \TMP3, \XMM4
PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
movaps 0x70(%arg1), \TMP3
AESENC	  \TMP3, \XMM1             # Round 7
AESENC	  \TMP3, \XMM2
AESENC	  \TMP3, \XMM3
AESENC	  \TMP3, \XMM4
movdqa	  HashKey_2_k(%rsp), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
movaps 0x80(%arg1), \TMP3
AESENC	  \TMP3, \XMM1             # Round 8
AESENC	  \TMP3, \XMM2
AESENC	  \TMP3, \XMM3
AESENC	  \TMP3, \XMM4
pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
pxor	  \XMM7, \XMM5
pxor	  \TMP2, \TMP6

# Multiply XMM8 * HashKey
# XMM8 and TMP5 hold the values for the two operands

movdqa	  \XMM8, \TMP1
pshufd	  $78, \XMM8, \TMP2
pxor	  \XMM8, \TMP2
movdqa	  HashKey(%rsp), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
movaps 0x90(%arg1), \TMP3
AESENC	  \TMP3, \XMM1            # Round 9
AESENC	  \TMP3, \XMM2
AESENC	  \TMP3, \XMM3
AESENC	  \TMP3, \XMM4
PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
movaps 0xa0(%arg1), \TMP3
AESENCLAST \TMP3, \XMM1           # Round 10
AESENCLAST \TMP3, \XMM2
AESENCLAST \TMP3, \XMM3
AESENCLAST \TMP3, \XMM4
movdqa    HashKey_k(%rsp), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
movdqu	  (%arg3,%r11,1), \TMP3
pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
movdqu	  16(%arg3,%r11,1), \TMP3
pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
movdqu	  32(%arg3,%r11,1), \TMP3
pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
movdqu	  48(%arg3,%r11,1), \TMP3
pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

pxor	  \TMP4, \TMP1
pxor	  \XMM8, \XMM5
pxor	  \TMP6, \TMP2
pxor	  \TMP1, \TMP2
pxor	  \XMM5, \TMP2
movdqa	  \TMP2, \TMP3
pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
pxor	  \TMP3, \XMM5
pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5

# first phase of reduction

movdqa    \XMM5, \TMP2
movdqa    \XMM5, \TMP3
movdqa    \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
pslld     $31, \TMP2                   # packed left shift << 31
pslld     $30, \TMP3                   # packed left shift << 30
pslld     $25, \TMP4                   # packed left shift << 25
pxor      \TMP3, \TMP2	               # xor the shifted versions
pxor      \TMP4, \TMP2
movdqa    \TMP2, \TMP5
psrldq    $4, \TMP5                    # right shift TMP5 1 DW
pslldq    $12, \TMP2                   # left shift TMP2 3 DWs
pxor      \TMP2, \XMM5

# second phase of reduction

movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
movdqa    \XMM5,\TMP3
movdqa    \XMM5,\TMP4
psrld     $1, \TMP2                    # packed right shift >>1
psrld     $2, \TMP3                    # packed right shift >>2
psrld     $7, \TMP4                    # packed right shift >>7
pxor      \TMP3,\TMP2		       # xor the shifted versions
pxor      \TMP4,\TMP2
pxor      \TMP5, \TMP2
pxor      \TMP2, \XMM5
pxor      \TMP1, \XMM5                 # result is in XMM5

pxor	  \XMM5, \XMM1
.endm

/*
* decrypt 4 blocks at a time
* ghash the 4 previously decrypted ciphertext blocks
* %arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
*/
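/*
* Note (descriptive): this variant mirrors the _ENC macro above, but GHASH is
* applied to the incoming ciphertext rather than to freshly produced output;
* after each XOR the original ciphertext block is copied back into the XMM
* register (movdqa \TMP3, \XMMn) so it can be hashed in the next iteration.
*/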
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

movdqa	  \XMM1, \XMM5
movdqa	  \XMM2, \XMM6
movdqa	  \XMM3, \XMM7
movdqa	  \XMM4, \XMM8

movdqa    SHUF_MASK(%rip), %xmm15
# multiply TMP5 * HashKey using karatsuba

movdqa	  \XMM5, \TMP4
pshufd	  $78, \XMM5, \TMP6
pxor	  \XMM5, \TMP6
paddd     ONE(%rip), \XMM0		# INCR CNT
movdqa	  HashKey_4(%rsp), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
movdqa    \XMM0, \XMM1
paddd     ONE(%rip), \XMM0		# INCR CNT
movdqa    \XMM0, \XMM2
paddd     ONE(%rip), \XMM0		# INCR CNT
movdqa    \XMM0, \XMM3
paddd     ONE(%rip), \XMM0		# INCR CNT
movdqa    \XMM0, \XMM4
PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

pxor	  (%arg1), \XMM1
pxor	  (%arg1), \XMM2
pxor	  (%arg1), \XMM3
pxor	  (%arg1), \XMM4
movdqa	  HashKey_4_k(%rsp), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
movaps 0x10(%arg1), \TMP1
AESENC	  \TMP1, \XMM1              # Round 1
AESENC	  \TMP1, \XMM2
AESENC	  \TMP1, \XMM3
AESENC	  \TMP1, \XMM4
movaps 0x20(%arg1), \TMP1
AESENC	  \TMP1, \XMM1              # Round 2
AESENC	  \TMP1, \XMM2
AESENC	  \TMP1, \XMM3
AESENC	  \TMP1, \XMM4
movdqa	  \XMM6, \TMP1
pshufd	  $78, \XMM6, \TMP2
pxor	  \XMM6, \TMP2
movdqa	  HashKey_3(%rsp), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
movaps 0x30(%arg1), \TMP3
AESENC    \TMP3, \XMM1              # Round 3
AESENC    \TMP3, \XMM2
AESENC    \TMP3, \XMM3
AESENC    \TMP3, \XMM4
PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
movaps 0x40(%arg1), \TMP3
AESENC	  \TMP3, \XMM1              # Round 4
AESENC	  \TMP3, \XMM2
AESENC	  \TMP3, \XMM3
AESENC	  \TMP3, \XMM4
movdqa	  HashKey_3_k(%rsp), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
movaps 0x50(%arg1), \TMP3
AESENC	  \TMP3, \XMM1              # Round 5
AESENC	  \TMP3, \XMM2
AESENC	  \TMP3, \XMM3
AESENC	  \TMP3, \XMM4
pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
pxor	  \XMM6, \XMM5
pxor	  \TMP2, \TMP6
movdqa	  \XMM7, \TMP1
pshufd	  $78, \XMM7, \TMP2
pxor	  \XMM7, \TMP2
movdqa	  HashKey_2(%rsp), \TMP5

# Multiply TMP5 * HashKey using karatsuba

PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
movaps 0x60(%arg1), \TMP3
AESENC	  \TMP3, \XMM1              # Round 6
AESENC	  \TMP3, \XMM2
AESENC	  \TMP3, \XMM3
AESENC	  \TMP3, \XMM4
PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
movaps 0x70(%arg1), \TMP3
AESENC	  \TMP3, \XMM1             # Round 7
AESENC	  \TMP3, \XMM2
AESENC	  \TMP3, \XMM3
AESENC	  \TMP3, \XMM4
movdqa	  HashKey_2_k(%rsp), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
movaps 0x80(%arg1), \TMP3
AESENC	  \TMP3, \XMM1             # Round 8
AESENC	  \TMP3, \XMM2
AESENC	  \TMP3, \XMM3
AESENC	  \TMP3, \XMM4
pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
pxor	  \XMM7, \XMM5
pxor	  \TMP2, \TMP6

# Multiply XMM8 * HashKey
# XMM8 and TMP5 hold the values for the two operands

movdqa	  \XMM8, \TMP1
pshufd	  $78, \XMM8, \TMP2
pxor	  \XMM8, \TMP2
movdqa	  HashKey(%rsp), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
movaps 0x90(%arg1), \TMP3
AESENC	  \TMP3, \XMM1            # Round 9
AESENC	  \TMP3, \XMM2
AESENC	  \TMP3, \XMM3
AESENC	  \TMP3, \XMM4
PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
movaps 0xa0(%arg1), \TMP3
AESENCLAST \TMP3, \XMM1           # Round 10
AESENCLAST \TMP3, \XMM2
AESENCLAST \TMP3, \XMM3
AESENCLAST \TMP3, \XMM4
movdqa    HashKey_k(%rsp), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
movdqu	  (%arg3,%r11,1), \TMP3
pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
movdqu	  \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
movdqa    \TMP3, \XMM1
movdqu	  16(%arg3,%r11,1), \TMP3
pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
movdqu	  \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
movdqa    \TMP3, \XMM2
movdqu	  32(%arg3,%r11,1), \TMP3
pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
movdqu	  \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
movdqa    \TMP3, \XMM3
movdqu	  48(%arg3,%r11,1), \TMP3
pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
movdqu	  \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
movdqa    \TMP3, \XMM4
PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

pxor	  \TMP4, \TMP1
pxor	  \XMM8, \XMM5
pxor	  \TMP6, \TMP2
pxor	  \TMP1, \TMP2
pxor	  \XMM5, \TMP2
movdqa	  \TMP2, \TMP3
pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
pxor	  \TMP3, \XMM5
pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5

# first phase of reduction

movdqa    \XMM5, \TMP2
movdqa    \XMM5, \TMP3
movdqa    \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
pslld     $31, \TMP2                   # packed left shift << 31
pslld     $30, \TMP3                   # packed left shift << 30
pslld     $25, \TMP4                   # packed left shift << 25
pxor      \TMP3, \TMP2	               # xor the shifted versions
pxor      \TMP4, \TMP2
movdqa    \TMP2, \TMP5
psrldq    $4, \TMP5                    # right shift TMP5 1 DW
pslldq    $12, \TMP2                   # left shift TMP2 3 DWs
pxor      \TMP2, \XMM5

# second phase of reduction

movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
movdqa    \XMM5,\TMP3
movdqa    \XMM5,\TMP4
psrld     $1, \TMP2                    # packed right shift >>1
psrld     $2, \TMP3                    # packed right shift >>2
psrld     $7, \TMP4                    # packed right shift >>7
pxor      \TMP3,\TMP2		       # xor the shifted versions
pxor      \TMP4,\TMP2
pxor      \TMP5, \TMP2
pxor      \TMP2, \XMM5
pxor      \TMP1, \XMM5                 # result is in XMM5

pxor	  \XMM5, \XMM1
.endm

/* GHASH the last 4 ciphertext blocks. */
.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

# Multiply TMP6 * HashKey (using Karatsuba)

movdqa	  \XMM1, \TMP6
pshufd	  $78, \XMM1, \TMP2
pxor	  \XMM1, \TMP2
movdqa	  HashKey_4(%rsp), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
movdqa	  HashKey_4_k(%rsp), \TMP4
PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
movdqa	  \XMM1, \XMMDst
movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1

# Multiply TMP1 * HashKey (using Karatsuba)

movdqa	  \XMM2, \TMP1
pshufd	  $78, \XMM2, \TMP2
pxor	  \XMM2, \TMP2
movdqa	  HashKey_3(%rsp), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
movdqa	  HashKey_3_k(%rsp), \TMP4
PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
pxor	  \TMP1, \TMP6
pxor	  \XMM2, \XMMDst
pxor	  \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

# Multiply TMP1 * HashKey (using Karatsuba)

movdqa	  \XMM3, \TMP1
pshufd	  $78, \XMM3, \TMP2
pxor	  \XMM3, \TMP2
movdqa	  HashKey_2(%rsp), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
movdqa	  HashKey_2_k(%rsp), \TMP4
PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
pxor	  \TMP1, \TMP6
pxor	  \XMM3, \XMMDst
pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1

# Multiply TMP1 * HashKey (using Karatsuba)
movdqa	  \XMM4, \TMP1
pshufd	  $78, \XMM4, \TMP2
pxor	  \XMM4, \TMP2
movdqa	  HashKey(%rsp), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
movdqa	  HashKey_k(%rsp), \TMP4
PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
pxor	  \TMP1, \TMP6
pxor	  \XMM4, \XMMDst
pxor	  \XMM1, \TMP2
pxor	  \TMP6, \TMP2
pxor	  \XMMDst, \TMP2
# middle section of the temp results combined as in karatsuba algorithm
movdqa	  \TMP2, \TMP4
pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
pxor	  \TMP4, \XMMDst
pxor	  \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
|  | 1122 | # first phase of the reduction | 
|  | 1123 | movdqa    \XMMDst, \TMP2 | 
|  | 1124 | movdqa    \XMMDst, \TMP3 | 
|  | 1125 | movdqa    \XMMDst, \TMP4 | 
|  | 1126 | # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently | 
|  | 1127 | pslld     $31, \TMP2                # packed right shifting << 31 | 
|  | 1128 | pslld     $30, \TMP3                # packed right shifting << 30 | 
|  | 1129 | pslld     $25, \TMP4                # packed right shifting << 25 | 
|  | 1130 | pxor      \TMP3, \TMP2              # xor the shifted versions | 
|  | 1131 | pxor      \TMP4, \TMP2 | 
|  | 1132 | movdqa    \TMP2, \TMP7 | 
|  | 1133 | psrldq    $4, \TMP7                 # right shift TMP7 1 DW | 
|  | 1134 | pslldq    $12, \TMP2                # left shift TMP2 3 DWs | 
|  | 1135 | pxor      \TMP2, \XMMDst | 
|  | 1136 |  | 
|  | 1137 | # second phase of the reduction | 
|  | 1138 | movdqa    \XMMDst, \TMP2 | 
|  | 1139 | # make 3 copies of XMMDst for doing 3 shift operations | 
|  | 1140 | movdqa    \XMMDst, \TMP3 | 
|  | 1141 | movdqa    \XMMDst, \TMP4 | 
|  | 1142 | psrld     $1, \TMP2                 # packed right shift >> 1 | 
|  | 1143 | psrld     $2, \TMP3                 # packed right shift >> 2 | 
|  | 1144 | psrld     $7, \TMP4                 # packed right shift >> 7 | 
|  | 1145 | pxor      \TMP3, \TMP2              # xor the shifted versions | 
|  | 1146 | pxor      \TMP4, \TMP2 | 
|  | 1147 | pxor      \TMP7, \TMP2 | 
|  | 1148 | pxor      \TMP2, \XMMDst | 
|  | 1149 | pxor      \TMP6, \XMMDst            # reduced result is in XMMDst | 
|  | 1150 | .endm | 
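|  |  |  | 
|  |  | /* | 
|  |  |  * For reference: a C-intrinsics sketch of the per-block Karatsuba | 
|  |  |  * carry-less multiply used repeatedly above (three PCLMULQDQs plus the | 
|  |  |  * shift/XOR recombination).  ghash_clmul() is a hypothetical helper, not | 
|  |  |  * part of this file, and it stops short of the two-phase reduction: | 
|  |  |  * | 
|  |  |  *   #include <wmmintrin.h>                      // PCLMULQDQ intrinsics | 
|  |  |  * | 
|  |  |  *   static inline void ghash_clmul(__m128i a, __m128i b, | 
|  |  |  *                                  __m128i *lo, __m128i *hi) | 
|  |  |  *   { | 
|  |  |  *           __m128i a1b1 = _mm_clmulepi64_si128(a, b, 0x11);  // a1*b1 | 
|  |  |  *           __m128i a0b0 = _mm_clmulepi64_si128(a, b, 0x00);  // a0*b0 | 
|  |  |  *           __m128i asum = _mm_xor_si128(a, _mm_shuffle_epi32(a, 0x4e)); | 
|  |  |  *           __m128i bsum = _mm_xor_si128(b, _mm_shuffle_epi32(b, 0x4e)); | 
|  |  |  *           __m128i mid  = _mm_clmulepi64_si128(asum, bsum, 0x00); | 
|  |  |  * | 
|  |  |  *           mid = _mm_xor_si128(mid, a1b1);   // (a1+a0)(b1+b0)+a1b1+a0b0 | 
|  |  |  *           mid = _mm_xor_si128(mid, a0b0); | 
|  |  |  *           *lo = _mm_xor_si128(a0b0, _mm_slli_si128(mid, 8)); | 
|  |  |  *           *hi = _mm_xor_si128(a1b1, _mm_srli_si128(mid, 8)); | 
|  |  |  *   } | 
|  |  |  */ | 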
|  | 1151 |  | 
|  | 1152 | /* Encryption of a single block (AES-128, 11 round keys) */ | 
|  | 1153 | .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 | 
|  | 1154 |  | 
|  | 1155 | pxor	(%arg1), \XMM0 | 
|  | 1156 | movaps 16(%arg1), \TMP1 | 
|  | 1157 | AESENC	\TMP1, \XMM0 | 
|  | 1158 | movaps 32(%arg1), \TMP1 | 
|  | 1159 | AESENC	\TMP1, \XMM0 | 
|  | 1160 | movaps 48(%arg1), \TMP1 | 
|  | 1161 | AESENC	\TMP1, \XMM0 | 
|  | 1162 | movaps 64(%arg1), \TMP1 | 
|  | 1163 | AESENC	\TMP1, \XMM0 | 
|  | 1164 | movaps 80(%arg1), \TMP1 | 
|  | 1165 | AESENC	\TMP1, \XMM0 | 
|  | 1166 | movaps 96(%arg1), \TMP1 | 
|  | 1167 | AESENC	\TMP1, \XMM0 | 
|  | 1168 | movaps 112(%arg1), \TMP1 | 
|  | 1169 | AESENC	\TMP1, \XMM0 | 
|  | 1170 | movaps 128(%arg1), \TMP1 | 
|  | 1171 | AESENC	\TMP1, \XMM0 | 
|  | 1172 | movaps 144(%arg1), \TMP1 | 
|  | 1173 | AESENC	\TMP1, \XMM0 | 
|  | 1174 | movaps 160(%arg1), \TMP1 | 
|  | 1175 | AESENCLAST	\TMP1, \XMM0 | 
|  | 1176 | .endm | 
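|  |  |  | 
|  |  | /* | 
|  |  |  * For reference, the same single-block AES-128 encryption written with C | 
|  |  |  * intrinsics (a sketch only; aes128_encrypt_block() is a hypothetical | 
|  |  |  * helper and rk[] is assumed to hold the 11 round keys in the same layout | 
|  |  |  * the macro reads from (%arg1)): | 
|  |  |  * | 
|  |  |  *   #include <wmmintrin.h> | 
|  |  |  * | 
|  |  |  *   static __m128i aes128_encrypt_block(const __m128i rk[11], __m128i blk) | 
|  |  |  *   { | 
|  |  |  *           int i; | 
|  |  |  * | 
|  |  |  *           blk = _mm_xor_si128(blk, rk[0]);            // round 0 | 
|  |  |  *           for (i = 1; i < 10; i++)                    // rounds 1-9 | 
|  |  |  *                   blk = _mm_aesenc_si128(blk, rk[i]); | 
|  |  |  *           return _mm_aesenclast_si128(blk, rk[10]);   // round 10 | 
|  |  |  *   } | 
|  |  |  */ | 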
|  | 1177 |  | 
|  | 1178 |  | 
|  | 1179 | /***************************************************************************** | 
|  | 1180 | * void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary. | 
|  | 1181 | *                   u8 *out,           // Plaintext output. Encrypt in-place is allowed. | 
|  | 1182 | *                   const u8 *in,      // Ciphertext input | 
|  | 1183 | *                   u64 plaintext_len, // Length of data in bytes for decryption. | 
|  | 1184 | *                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association) | 
|  | 1185 | *                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) | 
|  | 1186 | *                                      // concatenated with 0x00000001. 16-byte aligned pointer. | 
|  | 1187 | *                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary. | 
|  | 1188 | *                   const u8 *aad,     // Additional Authentication Data (AAD) | 
|  | 1189 | *                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes | 
|  | 1190 | *                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the | 
|  | 1191 | *                                      // given authentication tag and only return the plaintext if they match. | 
|  | 1192 | *                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 | 
|  | 1193 | *                                      // (most likely), 12 or 8. | 
|  | 1194 | * | 
|  | 1195 | * Assumptions: | 
|  | 1196 | * | 
|  | 1197 | * keys: | 
|  | 1198 | *       keys are pre-expanded and aligned to 16 bytes. we are using the first | 
|  | 1199 | *       set of 11 keys in the data structure void *aes_ctx | 
|  | 1200 | * | 
|  | 1201 | * iv: | 
|  | 1202 | *       0                   1                   2                   3 | 
|  | 1203 | *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | 
|  | 1204 | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | 1205 | *       |                             Salt  (From the SA)               | | 
|  | 1206 | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | 1207 | *       |                     Initialization Vector                     | | 
|  | 1208 | *       |         (This is the sequence number from IPSec header)       | | 
|  | 1209 | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | 1210 | *       |                              0x1                              | | 
|  | 1211 | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | 1212 | * | 
|  | 1213 | * | 
|  | 1214 | * | 
|  | 1215 | * AAD: | 
|  | 1216 | *       AAD padded to 128 bits with 0 | 
|  | 1217 | *       for example, assume AAD is a u32 vector | 
|  | 1218 | * | 
|  | 1219 | *       if AAD is 8 bytes: | 
|  | 1220 | *       AAD[2] = {A0, A1}; | 
|  | 1221 | *       padded AAD in xmm register = {A1 A0 0 0} | 
|  | 1222 | * | 
|  | 1223 | *       0                   1                   2                   3 | 
|  | 1224 | *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | 
|  | 1225 | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | 1226 | *       |                               SPI (A1)                        | | 
|  | 1227 | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | 1228 | *       |                     32-bit Sequence Number (A0)               | | 
|  | 1229 | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | 1230 | *       |                              0x0                              | | 
|  | 1231 | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | 1232 | * | 
|  | 1233 | *                                       AAD Format with 32-bit Sequence Number | 
|  | 1234 | * | 
|  | 1235 | *       if AAD is 12 bytes: | 
|  | 1236 | *       AAD[3] = {A0, A1, A2}; | 
|  | 1237 | *       padded AAD in xmm register = {A2 A1 A0 0} | 
|  | 1238 | * | 
|  | 1239 | *       0                   1                   2                   3 | 
|  | 1240 | *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | 
|  | 1241 | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | 1244 | *       |                               SPI (A2)                        | | 
|  | 1245 | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | 1246 | *       |                 64-bit Extended Sequence Number {A1,A0}       | | 
|  | 1247 | *       |                                                               | | 
|  | 1248 | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | 1249 | *       |                              0x0                              | | 
|  | 1250 | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | 1251 | * | 
|  | 1252 | *                        AAD Format with 64-bit Extended Sequence Number | 
|  | 1253 | * | 
|  | 1254 | * aadLen: | 
|  | 1255 | *       from the definition of the spec, aadLen can only be 8 or 12 bytes. | 
|  | 1256 | *       The code also supports an aadLen of 16 bytes; any other size will fail. | 
|  | 1257 | * | 
|  | 1258 | * TLen: | 
|  | 1259 | *       from the definition of the spec, TLen can only be 8, 12 or 16 bytes. | 
|  | 1260 | *       For other sizes, the code will fail. | 
|  | 1261 | * | 
|  | 1262 | * poly = x^128 + x^127 + x^126 + x^121 + 1 | 
|  | 1263 | * | 
|  | 1264 | *****************************************************************************/ | 
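|  |  |  | 
|  |  | /* | 
|  |  |  * Caller-side sketch (not part of this file): assembling the 16-byte iv | 
|  |  |  * argument described above from the 4-byte salt and the 8-byte ESP IV. | 
|  |  |  * rfc4106_build_j0(), salt[] and esp_iv[] are hypothetical names; only | 
|  |  |  * the salt || IV || 0x00000001 layout is taken from the comment above. | 
|  |  |  * | 
|  |  |  *   #include <stdint.h> | 
|  |  |  *   #include <string.h> | 
|  |  |  * | 
|  |  |  *   static void rfc4106_build_j0(uint8_t j0[16], const uint8_t salt[4], | 
|  |  |  *                                const uint8_t esp_iv[8]) | 
|  |  |  *   { | 
|  |  |  *           memcpy(j0, salt, 4);          // salt from the SA | 
|  |  |  *           memcpy(j0 + 4, esp_iv, 8);    // IV from the ESP payload | 
|  |  |  *           j0[12] = 0x00;                // trailing big-endian 0x00000001 | 
|  |  |  *           j0[13] = 0x00; | 
|  |  |  *           j0[14] = 0x00; | 
|  |  |  *           j0[15] = 0x01; | 
|  |  |  *   } | 
|  |  |  */ | 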
|  | 1265 |  | 
|  | 1266 | ENTRY(aesni_gcm_dec) | 
|  | 1267 | push	%r12 | 
|  | 1268 | push	%r13 | 
|  | 1269 | push	%r14 | 
|  | 1270 | mov	%rsp, %r14 | 
|  | 1271 | /* | 
|  | 1272 | * states of %xmm registers %xmm6:%xmm15 not saved | 
|  | 1273 | * all %xmm registers are clobbered | 
|  | 1274 | */ | 
|  | 1275 | sub	$VARIABLE_OFFSET, %rsp | 
|  | 1276 | and	$~63, %rsp                        # align rsp to 64 bytes | 
|  | 1277 | mov	%arg6, %r12 | 
|  | 1278 | movdqu	(%r12), %xmm13			  # %xmm13 = HashKey | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1279 | movdqa  SHUF_MASK(%rip), %xmm2 | 
|  | 1280 | PSHUFB_XMM %xmm2, %xmm13 | 
|  | 1281 |  | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1282 |  | 
|  | 1283 | # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH) | 
|  | 1284 |  | 
|  | 1285 | movdqa	%xmm13, %xmm2 | 
|  | 1286 | psllq	$1, %xmm13 | 
|  | 1287 | psrlq	$63, %xmm2 | 
|  | 1288 | movdqa	%xmm2, %xmm1 | 
|  | 1289 | pslldq	$8, %xmm2 | 
|  | 1290 | psrldq	$8, %xmm1 | 
|  | 1291 | por	%xmm2, %xmm13 | 
|  | 1292 |  | 
|  | 1293 | # Reduction | 
|  | 1294 |  | 
|  | 1295 | pshufd	$0x24, %xmm1, %xmm2 | 
|  | 1296 | pcmpeqd TWOONE(%rip), %xmm2 | 
|  | 1297 | pand	POLY(%rip), %xmm2 | 
|  | 1298 | pxor	%xmm2, %xmm13     # %xmm13 holds the HashKey<<1 (mod poly) | 
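|  |  |  | 
|  |  | # As a C sketch, the net effect of the shift-and-reduce sequence above is a | 
|  |  | # doubling in GF(2^128) (gf128_shl1() is a hypothetical name; h_hi/h_lo are | 
|  |  | # the two 64-bit halves of the byte-swapped hash subkey): | 
|  |  | # | 
|  |  | #	static void gf128_shl1(uint64_t *h_hi, uint64_t *h_lo) | 
|  |  | #	{ | 
|  |  | #		int carry = *h_hi >> 63;              // bit shifted out | 
|  |  | # | 
|  |  | #		*h_hi = (*h_hi << 1) | (*h_lo >> 63); | 
|  |  | #		*h_lo <<= 1; | 
|  |  | #		if (carry) {                          // reduce with POLY | 
|  |  | #			*h_hi ^= 0xC200000000000000ULL; | 
|  |  | #			*h_lo ^= 0x0000000000000001ULL; | 
|  |  | #		} | 
|  |  | #	} | 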
|  | 1299 |  | 
|  | 1300 |  | 
|  | 1301 | # Decrypt first few blocks | 
|  | 1302 |  | 
|  | 1303 | movdqa %xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly) | 
|  | 1304 | mov %arg4, %r13    # save the number of bytes of plaintext/ciphertext | 
|  | 1305 | and $-16, %r13                      # %r13 = %r13 - (%r13 mod 16) | 
|  | 1306 | mov %r13, %r12 | 
|  | 1307 | and $(3<<4), %r12 | 
|  | 1308 | jz _initial_num_blocks_is_0_decrypt | 
|  | 1309 | cmp $(2<<4), %r12 | 
|  | 1310 | jb _initial_num_blocks_is_1_decrypt | 
|  | 1311 | je _initial_num_blocks_is_2_decrypt | 
|  | 1312 | _initial_num_blocks_is_3_decrypt: | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1313 | INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1314 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec | 
|  | 1315 | sub	$48, %r13 | 
|  | 1316 | jmp	_initial_blocks_decrypted | 
|  | 1317 | _initial_num_blocks_is_2_decrypt: | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1318 | INITIAL_BLOCKS_DEC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1319 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec | 
|  | 1320 | sub	$32, %r13 | 
|  | 1321 | jmp	_initial_blocks_decrypted | 
|  | 1322 | _initial_num_blocks_is_1_decrypt: | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1323 | INITIAL_BLOCKS_DEC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1324 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec | 
|  | 1325 | sub	$16, %r13 | 
|  | 1326 | jmp	_initial_blocks_decrypted | 
|  | 1327 | _initial_num_blocks_is_0_decrypt: | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1328 | INITIAL_BLOCKS_DEC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1329 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec | 
|  | 1330 | _initial_blocks_decrypted: | 
|  | 1331 | cmp	$0, %r13 | 
|  | 1332 | je	_zero_cipher_left_decrypt | 
|  | 1333 | sub	$64, %r13 | 
|  | 1334 | je	_four_cipher_left_decrypt | 
|  | 1335 | _decrypt_by_4: | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1336 | GHASH_4_ENCRYPT_4_PARALLEL_DEC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1337 | %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec | 
|  | 1338 | add	$64, %r11 | 
|  | 1339 | sub	$64, %r13 | 
|  | 1340 | jne	_decrypt_by_4 | 
|  | 1341 | _four_cipher_left_decrypt: | 
|  | 1342 | GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ | 
|  | 1343 | %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 | 
|  | 1344 | _zero_cipher_left_decrypt: | 
|  | 1345 | mov	%arg4, %r13 | 
|  | 1346 | and	$15, %r13				# %r13 = arg4 (mod 16) | 
|  | 1347 | je	_multiple_of_16_bytes_decrypt | 
|  | 1348 |  | 
| Lucas De Marchi | 0d2eb44 | 2011-03-17 16:24:16 -0300 | [diff] [blame] | 1349 | # Handle the last <16 byte block separately | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1350 |  | 
|  | 1351 | paddd ONE(%rip), %xmm0         # increment CNT to get Yn | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1352 | movdqa SHUF_MASK(%rip), %xmm10 | 
|  | 1353 | PSHUFB_XMM %xmm10, %xmm0 | 
|  | 1354 |  | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1355 | ENCRYPT_SINGLE_BLOCK  %xmm0, %xmm1    # E(K, Yn) | 
|  | 1356 | sub $16, %r11 | 
|  | 1357 | add %r13, %r11 | 
| Lucas De Marchi | 0d2eb44 | 2011-03-17 16:24:16 -0300 | [diff] [blame] | 1358 | movdqu (%arg3,%r11,1), %xmm1   # receive the last <16 byte block | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1359 | lea SHIFT_MASK+16(%rip), %r12 | 
|  | 1360 | sub %r13, %r12 | 
|  | 1361 | # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes | 
|  | 1362 | # (%r13 is the number of bytes in plaintext mod 16) | 
|  | 1363 | movdqu (%r12), %xmm2           # get the appropriate shuffle mask | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1364 | PSHUFB_XMM %xmm2, %xmm1            # right shift 16-%r13 bytes | 
|  | 1365 |  | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1366 | movdqa  %xmm1, %xmm2 | 
|  | 1367 | pxor %xmm1, %xmm0            # Ciphertext XOR E(K, Yn) | 
|  | 1368 | movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 | 
|  | 1369 | # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0 | 
|  | 1370 | pand %xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0 | 
|  | 1371 | pand    %xmm1, %xmm2 | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1372 | movdqa SHUF_MASK(%rip), %xmm10 | 
|  | 1373 | PSHUFB_XMM %xmm10 ,%xmm2 | 
|  | 1374 |  | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1375 | pxor %xmm2, %xmm8 | 
|  | 1376 | GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 | 
|  | 1377 | # GHASH computation for the last <16 byte block | 
|  | 1378 | sub %r13, %r11 | 
|  | 1379 | add $16, %r11 | 
|  | 1380 |  | 
|  | 1381 | # output %r13 bytes | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1382 | MOVQ_R64_XMM	%xmm0, %rax | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1383 | cmp	$8, %r13 | 
|  | 1384 | jle	_less_than_8_bytes_left_decrypt | 
|  | 1385 | mov	%rax, (%arg2 , %r11, 1) | 
|  | 1386 | add	$8, %r11 | 
|  | 1387 | psrldq	$8, %xmm0 | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1388 | MOVQ_R64_XMM	%xmm0, %rax | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1389 | sub	$8, %r13 | 
|  | 1390 | _less_than_8_bytes_left_decrypt: | 
|  | 1391 | mov	%al,  (%arg2, %r11, 1) | 
|  | 1392 | add	$1, %r11 | 
|  | 1393 | shr	$8, %rax | 
|  | 1394 | sub	$1, %r13 | 
|  | 1395 | jne	_less_than_8_bytes_left_decrypt | 
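|  |  |  | 
|  |  | # In C terms the partial-block path above is roughly the sketch below | 
|  |  | # (xor_partial(), keystream[] and partial are hypothetical names; keystream | 
|  |  | # is E(K, Yn) and partial is the count of trailing bytes, 1..15).  The | 
|  |  | # masked ciphertext block is zero-padded to 16 bytes before being folded | 
|  |  | # into the running GHASH value in %xmm8. | 
|  |  | # | 
|  |  | #	#include <stddef.h> | 
|  |  | #	#include <stdint.h> | 
|  |  | # | 
|  |  | #	static void xor_partial(uint8_t *out, const uint8_t *in, | 
|  |  | #				const uint8_t keystream[16], size_t partial) | 
|  |  | #	{ | 
|  |  | #		size_t i; | 
|  |  | # | 
|  |  | #		for (i = 0; i < partial; i++) | 
|  |  | #			out[i] = in[i] ^ keystream[i]; | 
|  |  | #	} | 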
|  | 1396 | _multiple_of_16_bytes_decrypt: | 
|  | 1397 | mov	arg8, %r12		  # %r12 = aadLen (number of bytes) | 
|  | 1398 | shl	$3, %r12		  # convert into number of bits | 
|  | 1399 | movd	%r12d, %xmm15		  # len(A) in %xmm15 | 
|  | 1400 | shl	$3, %arg4		  # len(C) in bits (*8) | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1401 | MOVQ_R64_XMM	%arg4, %xmm1 | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1402 | pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000 | 
|  | 1403 | pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C) | 
|  | 1404 | pxor	%xmm15, %xmm8 | 
|  | 1405 | GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 | 
|  | 1406 | # final GHASH computation | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1407 | movdqa SHUF_MASK(%rip), %xmm10 | 
|  | 1408 | PSHUFB_XMM %xmm10, %xmm8 | 
|  | 1409 |  | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1410 | mov	%arg5, %rax		  # %rax = *Y0 | 
|  | 1411 | movdqu	(%rax), %xmm0		  # %xmm0 = Y0 | 
|  | 1412 | ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0) | 
|  | 1413 | pxor	%xmm8, %xmm0 | 
|  | 1414 | _return_T_decrypt: | 
|  | 1415 | mov	arg9, %r10                # %r10 = authTag | 
|  | 1416 | mov	arg10, %r11               # %r11 = auth_tag_len | 
|  | 1417 | cmp	$16, %r11 | 
|  | 1418 | je	_T_16_decrypt | 
|  | 1419 | cmp	$12, %r11 | 
|  | 1420 | je	_T_12_decrypt | 
|  | 1421 | _T_8_decrypt: | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1422 | MOVQ_R64_XMM	%xmm0, %rax | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1423 | mov	%rax, (%r10) | 
|  | 1424 | jmp	_return_T_done_decrypt | 
|  | 1425 | _T_12_decrypt: | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1426 | MOVQ_R64_XMM	%xmm0, %rax | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1427 | mov	%rax, (%r10) | 
|  | 1428 | psrldq	$8, %xmm0 | 
|  | 1429 | movd	%xmm0, %eax | 
|  | 1430 | mov	%eax, 8(%r10) | 
|  | 1431 | jmp	_return_T_done_decrypt | 
|  | 1432 | _T_16_decrypt: | 
|  | 1433 | movdqu	%xmm0, (%r10) | 
|  | 1434 | _return_T_done_decrypt: | 
|  | 1435 | mov	%r14, %rsp | 
|  | 1436 | pop	%r14 | 
|  | 1437 | pop	%r13 | 
|  | 1438 | pop	%r12 | 
|  | 1439 | ret | 
|  | 1440 |  | 
|  | 1441 |  | 
|  | 1442 | /***************************************************************************** | 
|  | 1443 | * void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary. | 
|  | 1444 | *                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed. | 
|  | 1445 | *                    const u8 *in,       // Plaintext input | 
|  | 1446 | *                    u64 plaintext_len,  // Length of data in bytes for encryption. | 
|  | 1447 | *                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association) | 
|  | 1448 | *                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) | 
|  | 1449 | *                                        // concatenated with 0x00000001. 16-byte aligned pointer. | 
|  | 1450 | *                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary. | 
|  | 1451 | *                    const u8 *aad,      // Additional Authentication Data (AAD) | 
|  | 1452 | *                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes | 
|  | 1453 | *                    u8 *auth_tag,       // Authenticated Tag output. | 
|  | 1454 | *                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely), | 
|  | 1455 | *                                        // 12 or 8. | 
|  | 1456 | * | 
|  | 1457 | * Assumptions: | 
|  | 1458 | * | 
|  | 1459 | * keys: | 
|  | 1460 | *       keys are pre-expanded and aligned to 16 bytes. we are using the | 
|  | 1461 | *       first set of 11 keys in the data structure void *aes_ctx | 
|  | 1462 | * | 
|  | 1463 | * | 
|  | 1464 | * iv: | 
|  | 1465 | *       0                   1                   2                   3 | 
|  | 1466 | *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | 
|  | 1467 | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | 1468 | *       |                             Salt  (From the SA)               | | 
|  | 1469 | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | 1470 | *       |                     Initialization Vector                     | | 
|  | 1471 | *       |         (This is the sequence number from IPSec header)       | | 
|  | 1472 | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | 1473 | *       |                              0x1                              | | 
|  | 1474 | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | 1475 | * | 
|  | 1476 | * | 
|  | 1477 | * | 
|  | 1478 | * AAD: | 
|  | 1479 | *       AAD padded to 128 bits with 0 | 
|  | 1480 | *       for example, assume AAD is a u32 vector | 
|  | 1481 | * | 
|  | 1482 | *       if AAD is 8 bytes: | 
|  | 1483 | *       AAD[2] = {A0, A1}; | 
|  | 1484 | *       padded AAD in xmm register = {A1 A0 0 0} | 
|  | 1485 | * | 
|  | 1486 | *       0                   1                   2                   3 | 
|  | 1487 | *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | 
|  | 1488 | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | 1489 | *       |                               SPI (A1)                        | | 
|  | 1490 | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | 1491 | *       |                     32-bit Sequence Number (A0)               | | 
|  | 1492 | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | 1493 | *       |                              0x0                              | | 
|  | 1494 | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | 1495 | * | 
|  | 1496 | *                                 AAD Format with 32-bit Sequence Number | 
|  | 1497 | * | 
|  | 1498 | *       if AAD is 12 bytes: | 
|  | 1499 | *       AAD[3] = {A0, A1, A2}; | 
|  | 1500 | *       padded AAD in xmm register = {A2 A1 A0 0} | 
|  | 1501 | * | 
|  | 1502 | *       0                   1                   2                   3 | 
|  | 1503 | *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | 
|  | 1504 | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | 1505 | *       |                               SPI (A2)                        | | 
|  | 1506 | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | 1507 | *       |                 64-bit Extended Sequence Number {A1,A0}       | | 
|  | 1508 | *       |                                                               | | 
|  | 1509 | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | 1510 | *       |                              0x0                              | | 
|  | 1511 | *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 
|  | 1512 | * | 
|  | 1513 | *                         AAD Format with 64-bit Extended Sequence Number | 
|  | 1514 | * | 
|  | 1515 | * aadLen: | 
|  | 1516 | *       from the definition of the spec, aadLen can only be 8 or 12 bytes. | 
|  | 1517 | *       The code also supports an aadLen of 16 bytes; any other size will fail. | 
|  | 1518 | * | 
|  | 1519 | * TLen: | 
|  | 1520 | *       from the definition of the spec, TLen can only be 8, 12 or 16 bytes. | 
|  | 1521 | *       For other sizes, the code will fail. | 
|  | 1522 | * | 
|  | 1523 | * poly = x^128 + x^127 + x^126 + x^121 + 1 | 
|  | 1524 | ***************************************************************************/ | 
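|  |  |  | 
|  |  | /* | 
|  |  |  * Caller-side sketch using only the prototype documented above (buffer | 
|  |  |  * names are hypothetical; j0 is the salt||IV||0x00000001 block, and | 
|  |  |  * auth_tag_len may be 16, 12 or 8 as noted): | 
|  |  |  * | 
|  |  |  *   u8 tag[16]; | 
|  |  |  * | 
|  |  |  *   aesni_gcm_enc(aes_ctx, dst, src, src_len, j0, hash_subkey, | 
|  |  |  *                 aad, aad_len, tag, sizeof(tag)); | 
|  |  |  */ | 
|  |  |  | 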
|  | 1525 | ENTRY(aesni_gcm_enc) | 
|  | 1526 | push	%r12 | 
|  | 1527 | push	%r13 | 
|  | 1528 | push	%r14 | 
|  | 1529 | mov	%rsp, %r14 | 
|  | 1530 | # | 
|  | 1531 | # states of %xmm registers %xmm6:%xmm15 not saved | 
|  | 1532 | # all %xmm registers are clobbered | 
|  | 1533 | # | 
|  | 1534 | sub	$VARIABLE_OFFSET, %rsp | 
|  | 1535 | and	$~63, %rsp | 
|  | 1536 | mov	%arg6, %r12 | 
|  | 1537 | movdqu	(%r12), %xmm13 | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1538 | movdqa  SHUF_MASK(%rip), %xmm2 | 
|  | 1539 | PSHUFB_XMM %xmm2, %xmm13 | 
|  | 1540 |  | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1541 |  | 
|  | 1542 | # precompute HashKey<<1 mod poly from the HashKey (required for GHASH) | 
|  | 1543 |  | 
|  | 1544 | movdqa	%xmm13, %xmm2 | 
|  | 1545 | psllq	$1, %xmm13 | 
|  | 1546 | psrlq	$63, %xmm2 | 
|  | 1547 | movdqa	%xmm2, %xmm1 | 
|  | 1548 | pslldq	$8, %xmm2 | 
|  | 1549 | psrldq	$8, %xmm1 | 
|  | 1550 | por	%xmm2, %xmm13 | 
|  | 1551 |  | 
|  | 1552 | # reduce HashKey<<1 | 
|  | 1553 |  | 
|  | 1554 | pshufd	$0x24, %xmm1, %xmm2 | 
|  | 1555 | pcmpeqd TWOONE(%rip), %xmm2 | 
|  | 1556 | pand	POLY(%rip), %xmm2 | 
|  | 1557 | pxor	%xmm2, %xmm13 | 
|  | 1558 | movdqa	%xmm13, HashKey(%rsp)  # store HashKey<<1 (mod poly) | 
|  | 1559 | mov	%arg4, %r13            # save the number of bytes of plaintext/ciphertext | 
|  | 1560 | and	$-16, %r13 | 
|  | 1561 | mov	%r13, %r12 | 
|  | 1562 |  | 
|  | 1563 | # Encrypt first few blocks | 
|  | 1564 |  | 
|  | 1565 | and	$(3<<4), %r12 | 
|  | 1566 | jz	_initial_num_blocks_is_0_encrypt | 
|  | 1567 | cmp	$(2<<4), %r12 | 
|  | 1568 | jb	_initial_num_blocks_is_1_encrypt | 
|  | 1569 | je	_initial_num_blocks_is_2_encrypt | 
|  | 1570 | _initial_num_blocks_is_3_encrypt: | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1571 | INITIAL_BLOCKS_ENC	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1572 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc | 
|  | 1573 | sub	$48, %r13 | 
|  | 1574 | jmp	_initial_blocks_encrypted | 
|  | 1575 | _initial_num_blocks_is_2_encrypt: | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1576 | INITIAL_BLOCKS_ENC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1577 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc | 
|  | 1578 | sub	$32, %r13 | 
|  | 1579 | jmp	_initial_blocks_encrypted | 
|  | 1580 | _initial_num_blocks_is_1_encrypt: | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1581 | INITIAL_BLOCKS_ENC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1582 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc | 
|  | 1583 | sub	$16, %r13 | 
|  | 1584 | jmp	_initial_blocks_encrypted | 
|  | 1585 | _initial_num_blocks_is_0_encrypt: | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1586 | INITIAL_BLOCKS_ENC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1587 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc | 
|  | 1588 | _initial_blocks_encrypted: | 
|  | 1589 |  | 
|  | 1590 | # Main loop - Encrypt remaining blocks | 
|  | 1591 |  | 
|  | 1592 | cmp	$0, %r13 | 
|  | 1593 | je	_zero_cipher_left_encrypt | 
|  | 1594 | sub	$64, %r13 | 
|  | 1595 | je	_four_cipher_left_encrypt | 
|  | 1596 | _encrypt_by_4_encrypt: | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1597 | GHASH_4_ENCRYPT_4_PARALLEL_ENC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1598 | %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc | 
|  | 1599 | add	$64, %r11 | 
|  | 1600 | sub	$64, %r13 | 
|  | 1601 | jne	_encrypt_by_4_encrypt | 
|  | 1602 | _four_cipher_left_encrypt: | 
|  | 1603 | GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ | 
|  | 1604 | %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 | 
|  | 1605 | _zero_cipher_left_encrypt: | 
|  | 1606 | mov	%arg4, %r13 | 
|  | 1607 | and	$15, %r13			# %r13 = arg4 (mod 16) | 
|  | 1608 | je	_multiple_of_16_bytes_encrypt | 
|  | 1609 |  | 
| Lucas De Marchi | 0d2eb44 | 2011-03-17 16:24:16 -0300 | [diff] [blame] | 1610 | # Handle the last <16 Byte block separately | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1611 | paddd ONE(%rip), %xmm0                # INCR CNT to get Yn | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1612 | movdqa SHUF_MASK(%rip), %xmm10 | 
|  | 1613 | PSHUFB_XMM %xmm10, %xmm0 | 
|  | 1614 |  | 
| Tadeusz Struk | 60af520 | 2011-03-13 16:56:17 +0800 | [diff] [blame] | 1615 |  | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1616 | ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn) | 
|  | 1617 | sub $16, %r11 | 
|  | 1618 | add %r13, %r11 | 
|  | 1619 | movdqu (%arg3,%r11,1), %xmm1     # receive the last <16 byte block | 
|  | 1620 | lea SHIFT_MASK+16(%rip), %r12 | 
|  | 1621 | sub %r13, %r12 | 
|  | 1622 | # adjust the shuffle mask pointer to be able to shift 16-r13 bytes | 
|  | 1623 | # (%r13 is the number of bytes in plaintext mod 16) | 
|  | 1624 | movdqu	(%r12), %xmm2           # get the appropriate shuffle mask | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1625 | PSHUFB_XMM	%xmm2, %xmm1            # right shift 16-%r13 bytes | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1626 | pxor	%xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn) | 
|  | 1627 | movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1 | 
|  | 1628 | # get the appropriate mask to mask out top 16-r13 bytes of xmm0 | 
|  | 1629 | pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0 | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1630 | movdqa SHUF_MASK(%rip), %xmm10 | 
|  | 1631 | PSHUFB_XMM %xmm10,%xmm0 | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1632 |  | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1633 | pxor	%xmm0, %xmm8 | 
|  | 1634 | GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 | 
|  | 1635 | # GHASH computation for the last <16 byte block | 
|  | 1636 | sub	%r13, %r11 | 
|  | 1637 | add	$16, %r11 | 
| Tadeusz Struk | 60af520 | 2011-03-13 16:56:17 +0800 | [diff] [blame] | 1638 |  | 
|  | 1639 | movdqa SHUF_MASK(%rip), %xmm10 | 
|  | 1640 | PSHUFB_XMM %xmm10, %xmm0 | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1641 |  | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1642 | # shuffle xmm0 back to output as ciphertext | 
|  | 1643 |  | 
|  | 1644 | # Output %r13 bytes | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1645 | MOVQ_R64_XMM %xmm0, %rax | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1646 | cmp $8, %r13 | 
|  | 1647 | jle _less_than_8_bytes_left_encrypt | 
|  | 1648 | mov %rax, (%arg2 , %r11, 1) | 
|  | 1649 | add $8, %r11 | 
|  | 1650 | psrldq $8, %xmm0 | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1651 | MOVQ_R64_XMM %xmm0, %rax | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1652 | sub $8, %r13 | 
|  | 1653 | _less_than_8_bytes_left_encrypt: | 
|  | 1654 | mov %al,  (%arg2, %r11, 1) | 
|  | 1655 | add $1, %r11 | 
|  | 1656 | shr $8, %rax | 
|  | 1657 | sub $1, %r13 | 
|  | 1658 | jne _less_than_8_bytes_left_encrypt | 
|  | 1659 | _multiple_of_16_bytes_encrypt: | 
|  | 1660 | mov	arg8, %r12    # %r12 = aadLen (number of bytes) | 
|  | 1661 | shl	$3, %r12 | 
|  | 1662 | movd	%r12d, %xmm15       # len(A) in %xmm15 | 
|  | 1663 | shl	$3, %arg4               # len(C) in bits (*8) | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1664 | MOVQ_R64_XMM	%arg4, %xmm1 | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1665 | pslldq	$8, %xmm15          # %xmm15 = len(A)||0x0000000000000000 | 
|  | 1666 | pxor	%xmm1, %xmm15       # %xmm15 = len(A)||len(C) | 
|  | 1667 | pxor	%xmm15, %xmm8 | 
|  | 1668 | GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 | 
|  | 1669 | # final GHASH computation | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1670 | movdqa SHUF_MASK(%rip), %xmm10 | 
|  | 1671 | PSHUFB_XMM %xmm10, %xmm8         # perform a 16 byte swap | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1672 |  | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1673 | mov	%arg5, %rax		       # %rax  = *Y0 | 
|  | 1674 | movdqu	(%rax), %xmm0		       # %xmm0 = Y0 | 
|  | 1675 | ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm15         # Encrypt(K, Y0) | 
|  | 1676 | pxor	%xmm8, %xmm0 | 
|  | 1677 | _return_T_encrypt: | 
|  | 1678 | mov	arg9, %r10                     # %r10 = authTag | 
|  | 1679 | mov	arg10, %r11                    # %r11 = auth_tag_len | 
|  | 1680 | cmp	$16, %r11 | 
|  | 1681 | je	_T_16_encrypt | 
|  | 1682 | cmp	$12, %r11 | 
|  | 1683 | je	_T_12_encrypt | 
|  | 1684 | _T_8_encrypt: | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1685 | MOVQ_R64_XMM	%xmm0, %rax | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1686 | mov	%rax, (%r10) | 
|  | 1687 | jmp	_return_T_done_encrypt | 
|  | 1688 | _T_12_encrypt: | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1689 | MOVQ_R64_XMM	%xmm0, %rax | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1690 | mov	%rax, (%r10) | 
|  | 1691 | psrldq	$8, %xmm0 | 
|  | 1692 | movd	%xmm0, %eax | 
|  | 1693 | mov	%eax, 8(%r10) | 
|  | 1694 | jmp	_return_T_done_encrypt | 
|  | 1695 | _T_16_encrypt: | 
|  | 1696 | movdqu	%xmm0, (%r10) | 
|  | 1697 | _return_T_done_encrypt: | 
|  | 1698 | mov	%r14, %rsp | 
|  | 1699 | pop	%r14 | 
|  | 1700 | pop	%r13 | 
|  | 1701 | pop	%r12 | 
|  | 1702 | ret | 
| Tadeusz Struk | 3c097b8 | 2010-12-13 19:51:15 +0800 | [diff] [blame] | 1703 |  | 
| Mathias Krause | 559ad0f | 2010-11-29 08:35:39 +0800 | [diff] [blame] | 1704 | #endif | 
| Tadeusz Struk | 0bd82f5 | 2010-11-04 15:00:45 -0400 | [diff] [blame] | 1705 |  | 
|  | 1706 |  | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1707 | _key_expansion_128: | 
|  | 1708 | _key_expansion_256a: | 
|  | 1709 | pshufd $0b11111111, %xmm1, %xmm1 | 
|  | 1710 | shufps $0b00010000, %xmm0, %xmm4 | 
|  | 1711 | pxor %xmm4, %xmm0 | 
|  | 1712 | shufps $0b10001100, %xmm0, %xmm4 | 
|  | 1713 | pxor %xmm4, %xmm0 | 
|  | 1714 | pxor %xmm1, %xmm0 | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 1715 | movaps %xmm0, (TKEYP) | 
|  | 1716 | add $0x10, TKEYP | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1717 | ret | 
|  | 1718 |  | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 1719 | .align 4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1720 | _key_expansion_192a: | 
|  | 1721 | pshufd $0b01010101, %xmm1, %xmm1 | 
|  | 1722 | shufps $0b00010000, %xmm0, %xmm4 | 
|  | 1723 | pxor %xmm4, %xmm0 | 
|  | 1724 | shufps $0b10001100, %xmm0, %xmm4 | 
|  | 1725 | pxor %xmm4, %xmm0 | 
|  | 1726 | pxor %xmm1, %xmm0 | 
|  | 1727 |  | 
|  | 1728 | movaps %xmm2, %xmm5 | 
|  | 1729 | movaps %xmm2, %xmm6 | 
|  | 1730 | pslldq $4, %xmm5 | 
|  | 1731 | pshufd $0b11111111, %xmm0, %xmm3 | 
|  | 1732 | pxor %xmm3, %xmm2 | 
|  | 1733 | pxor %xmm5, %xmm2 | 
|  | 1734 |  | 
|  | 1735 | movaps %xmm0, %xmm1 | 
|  | 1736 | shufps $0b01000100, %xmm0, %xmm6 | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 1737 | movaps %xmm6, (TKEYP) | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1738 | shufps $0b01001110, %xmm2, %xmm1 | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 1739 | movaps %xmm1, 0x10(TKEYP) | 
|  | 1740 | add $0x20, TKEYP | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1741 | ret | 
|  | 1742 |  | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 1743 | .align 4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1744 | _key_expansion_192b: | 
|  | 1745 | pshufd $0b01010101, %xmm1, %xmm1 | 
|  | 1746 | shufps $0b00010000, %xmm0, %xmm4 | 
|  | 1747 | pxor %xmm4, %xmm0 | 
|  | 1748 | shufps $0b10001100, %xmm0, %xmm4 | 
|  | 1749 | pxor %xmm4, %xmm0 | 
|  | 1750 | pxor %xmm1, %xmm0 | 
|  | 1751 |  | 
|  | 1752 | movaps %xmm2, %xmm5 | 
|  | 1753 | pslldq $4, %xmm5 | 
|  | 1754 | pshufd $0b11111111, %xmm0, %xmm3 | 
|  | 1755 | pxor %xmm3, %xmm2 | 
|  | 1756 | pxor %xmm5, %xmm2 | 
|  | 1757 |  | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 1758 | movaps %xmm0, (TKEYP) | 
|  | 1759 | add $0x10, TKEYP | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1760 | ret | 
|  | 1761 |  | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 1762 | .align 4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1763 | _key_expansion_256b: | 
|  | 1764 | pshufd $0b10101010, %xmm1, %xmm1 | 
|  | 1765 | shufps $0b00010000, %xmm2, %xmm4 | 
|  | 1766 | pxor %xmm4, %xmm2 | 
|  | 1767 | shufps $0b10001100, %xmm2, %xmm4 | 
|  | 1768 | pxor %xmm4, %xmm2 | 
|  | 1769 | pxor %xmm1, %xmm2 | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 1770 | movaps %xmm2, (TKEYP) | 
|  | 1771 | add $0x10, TKEYP | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1772 | ret | 
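|  |  |  | 
|  |  | /* | 
|  |  |  * The shufps/pxor fold used by the helpers above is the usual AES key | 
|  |  |  * expansion step.  A rough C-intrinsics rendering of one AES-128 round of | 
|  |  |  * it (aes128_expand_step() is a hypothetical name; the _mm_slli_si128 form | 
|  |  |  * computes the same prefix-XOR that the shufps sequence achieves with a | 
|  |  |  * zeroed %xmm4): | 
|  |  |  * | 
|  |  |  *   #include <wmmintrin.h> | 
|  |  |  * | 
|  |  |  *   static __m128i aes128_expand_step(__m128i key, __m128i assist) | 
|  |  |  *   { | 
|  |  |  *           assist = _mm_shuffle_epi32(assist, 0xff);  // broadcast word 3 | 
|  |  |  *           key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); | 
|  |  |  *           key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); | 
|  |  |  *           key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); | 
|  |  |  *           return _mm_xor_si128(key, assist); | 
|  |  |  *   } | 
|  |  |  * | 
|  |  |  * used as, e.g., | 
|  |  |  * | 
|  |  |  *   rk[1] = aes128_expand_step(rk[0], | 
|  |  |  *                              _mm_aeskeygenassist_si128(rk[0], 0x01)); | 
|  |  |  */ | 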
|  | 1773 |  | 
|  | 1774 | /* | 
|  | 1775 | * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, | 
|  | 1776 | *                   unsigned int key_len) | 
|  | 1777 | */ | 
|  | 1778 | ENTRY(aesni_set_key) | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 1779 | #ifndef __x86_64__ | 
|  | 1780 | pushl KEYP | 
|  | 1781 | movl 8(%esp), KEYP		# ctx | 
|  | 1782 | movl 12(%esp), UKEYP		# in_key | 
|  | 1783 | movl 16(%esp), %edx		# key_len | 
|  | 1784 | #endif | 
|  | 1785 | movups (UKEYP), %xmm0		# user key (first 16 bytes) | 
|  | 1786 | movaps %xmm0, (KEYP) | 
|  | 1787 | lea 0x10(KEYP), TKEYP		# key addr | 
|  | 1788 | movl %edx, 480(KEYP) | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1789 | pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x | 
|  | 1790 | cmp $24, %dl | 
|  | 1791 | jb .Lenc_key128 | 
|  | 1792 | je .Lenc_key192 | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 1793 | movups 0x10(UKEYP), %xmm2	# other user key | 
|  | 1794 | movaps %xmm2, (TKEYP) | 
|  | 1795 | add $0x10, TKEYP | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1796 | AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1797 | call _key_expansion_256a | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1798 | AESKEYGENASSIST 0x1 %xmm0 %xmm1 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1799 | call _key_expansion_256b | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1800 | AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1801 | call _key_expansion_256a | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1802 | AESKEYGENASSIST 0x2 %xmm0 %xmm1 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1803 | call _key_expansion_256b | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1804 | AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1805 | call _key_expansion_256a | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1806 | AESKEYGENASSIST 0x4 %xmm0 %xmm1 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1807 | call _key_expansion_256b | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1808 | AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1809 | call _key_expansion_256a | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1810 | AESKEYGENASSIST 0x8 %xmm0 %xmm1 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1811 | call _key_expansion_256b | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1812 | AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1813 | call _key_expansion_256a | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1814 | AESKEYGENASSIST 0x10 %xmm0 %xmm1 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1815 | call _key_expansion_256b | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1816 | AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1817 | call _key_expansion_256a | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1818 | AESKEYGENASSIST 0x20 %xmm0 %xmm1 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1819 | call _key_expansion_256b | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1820 | AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1821 | call _key_expansion_256a | 
|  | 1822 | jmp .Ldec_key | 
|  | 1823 | .Lenc_key192: | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 1824 | movq 0x10(UKEYP), %xmm2		# other user key | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1825 | AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1826 | call _key_expansion_192a | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1827 | AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1828 | call _key_expansion_192b | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1829 | AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1830 | call _key_expansion_192a | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1831 | AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1832 | call _key_expansion_192b | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1833 | AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1834 | call _key_expansion_192a | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1835 | AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1836 | call _key_expansion_192b | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1837 | AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1838 | call _key_expansion_192a | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1839 | AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1840 | call _key_expansion_192b | 
|  | 1841 | jmp .Ldec_key | 
|  | 1842 | .Lenc_key128: | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1843 | AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1844 | call _key_expansion_128 | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1845 | AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1846 | call _key_expansion_128 | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1847 | AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1848 | call _key_expansion_128 | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1849 | AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1850 | call _key_expansion_128 | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1851 | AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1852 | call _key_expansion_128 | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1853 | AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1854 | call _key_expansion_128 | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1855 | AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1856 | call _key_expansion_128 | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1857 | AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1858 | call _key_expansion_128 | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1859 | AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1860 | call _key_expansion_128 | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1861 | AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1862 | call _key_expansion_128 | 
|  | 1863 | .Ldec_key: | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 1864 | sub $0x10, TKEYP | 
|  | 1865 | movaps (KEYP), %xmm0 | 
|  | 1866 | movaps (TKEYP), %xmm1 | 
|  | 1867 | movaps %xmm0, 240(TKEYP) | 
|  | 1868 | movaps %xmm1, 240(KEYP) | 
|  | 1869 | add $0x10, KEYP | 
|  | 1870 | lea 240-16(TKEYP), UKEYP | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1871 | .align 4 | 
|  | 1872 | .Ldec_key_loop: | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 1873 | movaps (KEYP), %xmm0 | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1874 | AESIMC %xmm0 %xmm1 | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 1875 | movaps %xmm1, (UKEYP) | 
|  | 1876 | add $0x10, KEYP | 
|  | 1877 | sub $0x10, UKEYP | 
|  | 1878 | cmp TKEYP, KEYP | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1879 | jb .Ldec_key_loop | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 1880 | xor AREG, AREG | 
|  | 1881 | #ifndef __x86_64__ | 
|  | 1882 | popl KEYP | 
|  | 1883 | #endif | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1884 | ret | 
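|  |  |  | 
|  |  | /* | 
|  |  |  * The .Ldec_key loop above derives the equivalent-inverse-cipher schedule: | 
|  |  |  * the encryption round keys are stored in reverse order with InvMixColumns | 
|  |  |  * applied to all but the first and last.  Roughly, in C intrinsics (dec[] | 
|  |  |  * and enc[] are hypothetical arrays, nr the number of rounds): | 
|  |  |  * | 
|  |  |  *   dec[0] = enc[nr]; | 
|  |  |  *   for (i = 1; i < nr; i++) | 
|  |  |  *           dec[i] = _mm_aesimc_si128(enc[nr - i]); | 
|  |  |  *   dec[nr] = enc[0]; | 
|  |  |  */ | 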
|  | 1885 |  | 
|  | 1886 | /* | 
|  | 1887 | * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) | 
|  | 1888 | */ | 
|  | 1889 | ENTRY(aesni_enc) | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 1890 | #ifndef __x86_64__ | 
|  | 1891 | pushl KEYP | 
|  | 1892 | pushl KLEN | 
|  | 1893 | movl 12(%esp), KEYP | 
|  | 1894 | movl 16(%esp), OUTP | 
|  | 1895 | movl 20(%esp), INP | 
|  | 1896 | #endif | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1897 | movl 480(KEYP), KLEN		# key length | 
|  | 1898 | movups (INP), STATE		# input | 
|  | 1899 | call _aesni_enc1 | 
|  | 1900 | movups STATE, (OUTP)		# output | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 1901 | #ifndef __x86_64__ | 
|  | 1902 | popl KLEN | 
|  | 1903 | popl KEYP | 
|  | 1904 | #endif | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1905 | ret | 
|  | 1906 |  | 
|  | 1907 | /* | 
|  | 1908 | * _aesni_enc1:		internal ABI | 
|  | 1909 | * input: | 
|  | 1910 | *	KEYP:		key struct pointer | 
|  | 1911 | *	KLEN:		key length | 
|  | 1912 | *	STATE:		initial state (input) | 
|  | 1913 | * output: | 
|  | 1914 | *	STATE:		final state (output) | 
|  | 1915 | * changed: | 
|  | 1916 | *	KEY | 
|  | 1917 | *	TKEYP (T1) | 
|  | 1918 | */ | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 1919 | .align 4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1920 | _aesni_enc1: | 
|  | 1921 | movaps (KEYP), KEY		# key | 
|  | 1922 | mov KEYP, TKEYP | 
|  | 1923 | pxor KEY, STATE		# round 0 | 
|  | 1924 | add $0x30, TKEYP | 
|  | 1925 | cmp $24, KLEN | 
|  | 1926 | jb .Lenc128 | 
|  | 1927 | lea 0x20(TKEYP), TKEYP | 
|  | 1928 | je .Lenc192 | 
|  | 1929 | add $0x20, TKEYP | 
|  | 1930 | movaps -0x60(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1931 | AESENC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1932 | movaps -0x50(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1933 | AESENC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1934 | .align 4 | 
|  | 1935 | .Lenc192: | 
|  | 1936 | movaps -0x40(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1937 | AESENC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1938 | movaps -0x30(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1939 | AESENC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1940 | .align 4 | 
|  | 1941 | .Lenc128: | 
|  | 1942 | movaps -0x20(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1943 | AESENC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1944 | movaps -0x10(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1945 | AESENC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1946 | movaps (TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1947 | AESENC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1948 | movaps 0x10(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1949 | AESENC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1950 | movaps 0x20(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1951 | AESENC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1952 | movaps 0x30(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1953 | AESENC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1954 | movaps 0x40(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1955 | AESENC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1956 | movaps 0x50(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1957 | AESENC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1958 | movaps 0x60(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1959 | AESENC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1960 | movaps 0x70(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1961 | AESENCLAST KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1962 | ret | 
|  | 1963 |  | 
|  | 1964 | /* | 
|  | 1965 | * _aesni_enc4:	internal ABI | 
|  | 1966 | * input: | 
|  | 1967 | *	KEYP:		key struct pointer | 
|  | 1968 | *	KLEN:		key length | 
|  | 1969 | *	STATE1:		initial state (input) | 
|  | 1970 | *	STATE2 | 
|  | 1971 | *	STATE3 | 
|  | 1972 | *	STATE4 | 
|  | 1973 | * output: | 
|  | 1974 | *	STATE1:		final state (output) | 
|  | 1975 | *	STATE2 | 
|  | 1976 | *	STATE3 | 
|  | 1977 | *	STATE4 | 
|  | 1978 | * changed: | 
|  | 1979 | *	KEY | 
|  | 1980 | *	TKEYP (T1) | 
|  | 1981 | */ | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 1982 | .align 4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 1983 | _aesni_enc4: | 
|  | 1984 | movaps (KEYP), KEY		# key | 
|  | 1985 | mov KEYP, TKEYP | 
|  | 1986 | pxor KEY, STATE1		# round 0 | 
|  | 1987 | pxor KEY, STATE2 | 
|  | 1988 | pxor KEY, STATE3 | 
|  | 1989 | pxor KEY, STATE4 | 
|  | 1990 | add $0x30, TKEYP | 
|  | 1991 | cmp $24, KLEN | 
|  | 1992 | jb .L4enc128 | 
|  | 1993 | lea 0x20(TKEYP), TKEYP | 
|  | 1994 | je .L4enc192 | 
|  | 1995 | add $0x20, TKEYP | 
|  | 1996 | movaps -0x60(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 1997 | AESENC KEY STATE1 | 
|  | 1998 | AESENC KEY STATE2 | 
|  | 1999 | AESENC KEY STATE3 | 
|  | 2000 | AESENC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2001 | movaps -0x50(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2002 | AESENC KEY STATE1 | 
|  | 2003 | AESENC KEY STATE2 | 
|  | 2004 | AESENC KEY STATE3 | 
|  | 2005 | AESENC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2006 | #.align 4 | 
|  | 2007 | .L4enc192: | 
|  | 2008 | movaps -0x40(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2009 | AESENC KEY STATE1 | 
|  | 2010 | AESENC KEY STATE2 | 
|  | 2011 | AESENC KEY STATE3 | 
|  | 2012 | AESENC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2013 | movaps -0x30(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2014 | AESENC KEY STATE1 | 
|  | 2015 | AESENC KEY STATE2 | 
|  | 2016 | AESENC KEY STATE3 | 
|  | 2017 | AESENC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2018 | #.align 4 | 
|  | 2019 | .L4enc128: | 
|  | 2020 | movaps -0x20(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2021 | AESENC KEY STATE1 | 
|  | 2022 | AESENC KEY STATE2 | 
|  | 2023 | AESENC KEY STATE3 | 
|  | 2024 | AESENC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2025 | movaps -0x10(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2026 | AESENC KEY STATE1 | 
|  | 2027 | AESENC KEY STATE2 | 
|  | 2028 | AESENC KEY STATE3 | 
|  | 2029 | AESENC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2030 | movaps (TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2031 | AESENC KEY STATE1 | 
|  | 2032 | AESENC KEY STATE2 | 
|  | 2033 | AESENC KEY STATE3 | 
|  | 2034 | AESENC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2035 | movaps 0x10(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2036 | AESENC KEY STATE1 | 
|  | 2037 | AESENC KEY STATE2 | 
|  | 2038 | AESENC KEY STATE3 | 
|  | 2039 | AESENC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2040 | movaps 0x20(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2041 | AESENC KEY STATE1 | 
|  | 2042 | AESENC KEY STATE2 | 
|  | 2043 | AESENC KEY STATE3 | 
|  | 2044 | AESENC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2045 | movaps 0x30(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2046 | AESENC KEY STATE1 | 
|  | 2047 | AESENC KEY STATE2 | 
|  | 2048 | AESENC KEY STATE3 | 
|  | 2049 | AESENC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2050 | movaps 0x40(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2051 | AESENC KEY STATE1 | 
|  | 2052 | AESENC KEY STATE2 | 
|  | 2053 | AESENC KEY STATE3 | 
|  | 2054 | AESENC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2055 | movaps 0x50(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2056 | AESENC KEY STATE1 | 
|  | 2057 | AESENC KEY STATE2 | 
|  | 2058 | AESENC KEY STATE3 | 
|  | 2059 | AESENC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2060 | movaps 0x60(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2061 | AESENC KEY STATE1 | 
|  | 2062 | AESENC KEY STATE2 | 
|  | 2063 | AESENC KEY STATE3 | 
|  | 2064 | AESENC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2065 | movaps 0x70(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2066 | AESENCLAST KEY STATE1		# last round | 
|  | 2067 | AESENCLAST KEY STATE2 | 
|  | 2068 | AESENCLAST KEY STATE3 | 
|  | 2069 | AESENCLAST KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2070 | ret | 
|  | 2071 |  | 
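|  |  | /* | 
|  |  |  * Round-key addressing in _aesni_enc1/_aesni_enc4 (and the _aesni_dec* | 
|  |  |  * twins below): TKEYP is biased past KEYP so that every key size can | 
|  |  |  * fall through to the same ten-round tail, with the last-round key | 
|  |  |  * always at 0x70(TKEYP).  For illustration (offsets relative to KEYP): | 
|  |  |  * | 
|  |  |  *	key size	TKEYP		first round key	last round key | 
|  |  |  *	128-bit		KEYP + 0x30	0x10		0xa0 | 
|  |  |  *	192-bit		KEYP + 0x50	0x10		0xc0 | 
|  |  |  *	256-bit		KEYP + 0x70	0x10		0xe0 | 
|  |  |  * | 
|  |  |  * i.e. 10/12/14 rounds after the round-0 pxor with the key at (KEYP). | 
|  |  |  */ | 
|  |  |  | 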
|  | 2072 | /* | 
|  | 2073 | * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) | 
|  | 2074 | */ | 
|  | 2075 | ENTRY(aesni_dec) | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 2076 | #ifndef __x86_64__ | 
|  | 2077 | pushl KEYP | 
|  | 2078 | pushl KLEN | 
|  | 2079 | movl 12(%esp), KEYP | 
|  | 2080 | movl 16(%esp), OUTP | 
|  | 2081 | movl 20(%esp), INP | 
|  | 2082 | #endif | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2083 | mov 480(KEYP), KLEN		# key length | 
|  | 2084 | add $240, KEYP | 
|  | 2085 | movups (INP), STATE		# input | 
|  | 2086 | call _aesni_dec1 | 
|  | 2087 | movups STATE, (OUTP)		# output | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 2088 | #ifndef __x86_64__ | 
|  | 2089 | popl KLEN | 
|  | 2090 | popl KEYP | 
|  | 2091 | #endif | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2092 | ret | 
|  | 2093 |  | 
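|  |  | /* | 
|  |  |  * A minimal sketch of how a caller might use aesni_dec() from C (not | 
|  |  |  * the actual glue code; it only assumes that clobbering XMM state must | 
|  |  |  * be guarded with kernel_fpu_begin()/kernel_fpu_end()): | 
|  |  |  * | 
|  |  |  *	kernel_fpu_begin(); | 
|  |  |  *	aesni_dec(ctx, dst, src);	// decrypt one 16-byte block | 
|  |  |  *	kernel_fpu_end(); | 
|  |  |  */ | 
|  |  |  | 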
|  | 2094 | /* | 
|  | 2095 | * _aesni_dec1:		internal ABI | 
|  | 2096 | * input: | 
|  | 2097 | *	KEYP:		key struct pointer | 
|  | 2098 | *	KLEN:		key length | 
|  | 2099 | *	STATE:		initial state (input) | 
|  | 2100 | * output: | 
|  | 2101 | *	STATE:		final state (output) | 
|  | 2102 | * changed: | 
|  | 2103 | *	KEY | 
|  | 2104 | *	TKEYP (T1) | 
|  | 2105 | */ | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 2106 | .align 4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2107 | _aesni_dec1: | 
|  | 2108 | movaps (KEYP), KEY		# key | 
|  | 2109 | mov KEYP, TKEYP | 
|  | 2110 | pxor KEY, STATE		# round 0 | 
|  | 2111 | add $0x30, TKEYP | 
|  | 2112 | cmp $24, KLEN | 
|  | 2113 | jb .Ldec128 | 
|  | 2114 | lea 0x20(TKEYP), TKEYP | 
|  | 2115 | je .Ldec192 | 
|  | 2116 | add $0x20, TKEYP | 
|  | 2117 | movaps -0x60(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2118 | AESDEC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2119 | movaps -0x50(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2120 | AESDEC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2121 | .align 4 | 
|  | 2122 | .Ldec192: | 
|  | 2123 | movaps -0x40(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2124 | AESDEC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2125 | movaps -0x30(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2126 | AESDEC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2127 | .align 4 | 
|  | 2128 | .Ldec128: | 
|  | 2129 | movaps -0x20(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2130 | AESDEC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2131 | movaps -0x10(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2132 | AESDEC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2133 | movaps (TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2134 | AESDEC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2135 | movaps 0x10(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2136 | AESDEC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2137 | movaps 0x20(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2138 | AESDEC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2139 | movaps 0x30(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2140 | AESDEC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2141 | movaps 0x40(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2142 | AESDEC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2143 | movaps 0x50(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2144 | AESDEC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2145 | movaps 0x60(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2146 | AESDEC KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2147 | movaps 0x70(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2148 | AESDECLAST KEY STATE | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2149 | ret | 
|  | 2150 |  | 
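|  |  | /* | 
|  |  |  * Roughly equivalent C for _aesni_dec1 (illustration only; aesdec() and | 
|  |  |  * aesdeclast() stand for the AESDEC/AESDECLAST instructions): | 
|  |  |  * | 
|  |  |  *	state ^= rk[0];				// round 0: AddRoundKey | 
|  |  |  *	for (i = 1; i < nrounds; i++) | 
|  |  |  *		state = aesdec(state, rk[i]);	// InvShiftRows, InvSubBytes, | 
|  |  |  *						// InvMixColumns, AddRoundKey | 
|  |  |  *	state = aesdeclast(state, rk[nrounds]);	// last round, no InvMixColumns | 
|  |  |  * | 
|  |  |  * nrounds is 10/12/14 for 128/192/256-bit keys; rk[] is the inverse key | 
|  |  |  * schedule that the callers point KEYP at (ctx + 240, see aesni_dec). | 
|  |  |  */ | 
|  |  |  | 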
|  | 2151 | /* | 
|  | 2152 | * _aesni_dec4:	internal ABI | 
|  | 2153 | * input: | 
|  | 2154 | *	KEYP:		key struct pointer | 
|  | 2155 | *	KLEN:		key length | 
|  | 2156 | *	STATE1:		initial state (input) | 
|  | 2157 | *	STATE2 | 
|  | 2158 | *	STATE3 | 
|  | 2159 | *	STATE4 | 
|  | 2160 | * output: | 
|  | 2161 | *	STATE1:		final state (output) | 
|  | 2162 | *	STATE2 | 
|  | 2163 | *	STATE3 | 
|  | 2164 | *	STATE4 | 
|  | 2165 | * changed: | 
|  | 2166 | *	KEY | 
|  | 2167 | *	TKEYP (T1) | 
|  | 2168 | */ | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 2169 | .align 4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2170 | _aesni_dec4: | 
|  | 2171 | movaps (KEYP), KEY		# key | 
|  | 2172 | mov KEYP, TKEYP | 
|  | 2173 | pxor KEY, STATE1		# round 0 | 
|  | 2174 | pxor KEY, STATE2 | 
|  | 2175 | pxor KEY, STATE3 | 
|  | 2176 | pxor KEY, STATE4 | 
|  | 2177 | add $0x30, TKEYP | 
|  | 2178 | cmp $24, KLEN | 
|  | 2179 | jb .L4dec128 | 
|  | 2180 | lea 0x20(TKEYP), TKEYP | 
|  | 2181 | je .L4dec192 | 
|  | 2182 | add $0x20, TKEYP | 
|  | 2183 | movaps -0x60(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2184 | AESDEC KEY STATE1 | 
|  | 2185 | AESDEC KEY STATE2 | 
|  | 2186 | AESDEC KEY STATE3 | 
|  | 2187 | AESDEC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2188 | movaps -0x50(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2189 | AESDEC KEY STATE1 | 
|  | 2190 | AESDEC KEY STATE2 | 
|  | 2191 | AESDEC KEY STATE3 | 
|  | 2192 | AESDEC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2193 | .align 4 | 
|  | 2194 | .L4dec192: | 
|  | 2195 | movaps -0x40(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2196 | AESDEC KEY STATE1 | 
|  | 2197 | AESDEC KEY STATE2 | 
|  | 2198 | AESDEC KEY STATE3 | 
|  | 2199 | AESDEC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2200 | movaps -0x30(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2201 | AESDEC KEY STATE1 | 
|  | 2202 | AESDEC KEY STATE2 | 
|  | 2203 | AESDEC KEY STATE3 | 
|  | 2204 | AESDEC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2205 | .align 4 | 
|  | 2206 | .L4dec128: | 
|  | 2207 | movaps -0x20(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2208 | AESDEC KEY STATE1 | 
|  | 2209 | AESDEC KEY STATE2 | 
|  | 2210 | AESDEC KEY STATE3 | 
|  | 2211 | AESDEC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2212 | movaps -0x10(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2213 | AESDEC KEY STATE1 | 
|  | 2214 | AESDEC KEY STATE2 | 
|  | 2215 | AESDEC KEY STATE3 | 
|  | 2216 | AESDEC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2217 | movaps (TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2218 | AESDEC KEY STATE1 | 
|  | 2219 | AESDEC KEY STATE2 | 
|  | 2220 | AESDEC KEY STATE3 | 
|  | 2221 | AESDEC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2222 | movaps 0x10(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2223 | AESDEC KEY STATE1 | 
|  | 2224 | AESDEC KEY STATE2 | 
|  | 2225 | AESDEC KEY STATE3 | 
|  | 2226 | AESDEC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2227 | movaps 0x20(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2228 | AESDEC KEY STATE1 | 
|  | 2229 | AESDEC KEY STATE2 | 
|  | 2230 | AESDEC KEY STATE3 | 
|  | 2231 | AESDEC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2232 | movaps 0x30(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2233 | AESDEC KEY STATE1 | 
|  | 2234 | AESDEC KEY STATE2 | 
|  | 2235 | AESDEC KEY STATE3 | 
|  | 2236 | AESDEC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2237 | movaps 0x40(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2238 | AESDEC KEY STATE1 | 
|  | 2239 | AESDEC KEY STATE2 | 
|  | 2240 | AESDEC KEY STATE3 | 
|  | 2241 | AESDEC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2242 | movaps 0x50(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2243 | AESDEC KEY STATE1 | 
|  | 2244 | AESDEC KEY STATE2 | 
|  | 2245 | AESDEC KEY STATE3 | 
|  | 2246 | AESDEC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2247 | movaps 0x60(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2248 | AESDEC KEY STATE1 | 
|  | 2249 | AESDEC KEY STATE2 | 
|  | 2250 | AESDEC KEY STATE3 | 
|  | 2251 | AESDEC KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2252 | movaps 0x70(TKEYP), KEY | 
| Huang Ying | b369e52 | 2009-11-23 19:54:06 +0800 | [diff] [blame] | 2253 | AESDECLAST KEY STATE1		# last round | 
|  | 2254 | AESDECLAST KEY STATE2 | 
|  | 2255 | AESDECLAST KEY STATE3 | 
|  | 2256 | AESDECLAST KEY STATE4 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2257 | ret | 
|  | 2258 |  | 
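|  |  | /* | 
|  |  |  * _aesni_dec4 (like _aesni_enc4) runs four independent states through | 
|  |  |  * the same round keys.  AESDEC has a multi-cycle latency but is | 
|  |  |  * pipelined, so interleaving four blocks keeps the unit busy; the bulk | 
|  |  |  * loops below therefore take the 4-block path while >= 64 bytes remain. | 
|  |  |  */ | 
|  |  |  | 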
|  | 2259 | /* | 
|  | 2260 | * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src, | 
|  | 2261 | *		      size_t len) | 
|  | 2262 | */ | 
|  | 2263 | ENTRY(aesni_ecb_enc) | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 2264 | #ifndef __x86_64__ | 
|  | 2265 | pushl LEN | 
|  | 2266 | pushl KEYP | 
|  | 2267 | pushl KLEN | 
|  | 2268 | movl 16(%esp), KEYP | 
|  | 2269 | movl 20(%esp), OUTP | 
|  | 2270 | movl 24(%esp), INP | 
|  | 2271 | movl 28(%esp), LEN | 
|  | 2272 | #endif | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2273 | test LEN, LEN		# check length | 
|  | 2274 | jz .Lecb_enc_ret | 
|  | 2275 | mov 480(KEYP), KLEN | 
|  | 2276 | cmp $16, LEN | 
|  | 2277 | jb .Lecb_enc_ret | 
|  | 2278 | cmp $64, LEN | 
|  | 2279 | jb .Lecb_enc_loop1 | 
|  | 2280 | .align 4 | 
|  | 2281 | .Lecb_enc_loop4: | 
|  | 2282 | movups (INP), STATE1 | 
|  | 2283 | movups 0x10(INP), STATE2 | 
|  | 2284 | movups 0x20(INP), STATE3 | 
|  | 2285 | movups 0x30(INP), STATE4 | 
|  | 2286 | call _aesni_enc4 | 
|  | 2287 | movups STATE1, (OUTP) | 
|  | 2288 | movups STATE2, 0x10(OUTP) | 
|  | 2289 | movups STATE3, 0x20(OUTP) | 
|  | 2290 | movups STATE4, 0x30(OUTP) | 
|  | 2291 | sub $64, LEN | 
|  | 2292 | add $64, INP | 
|  | 2293 | add $64, OUTP | 
|  | 2294 | cmp $64, LEN | 
|  | 2295 | jge .Lecb_enc_loop4 | 
|  | 2296 | cmp $16, LEN | 
|  | 2297 | jb .Lecb_enc_ret | 
|  | 2298 | .align 4 | 
|  | 2299 | .Lecb_enc_loop1: | 
|  | 2300 | movups (INP), STATE1 | 
|  | 2301 | call _aesni_enc1 | 
|  | 2302 | movups STATE1, (OUTP) | 
|  | 2303 | sub $16, LEN | 
|  | 2304 | add $16, INP | 
|  | 2305 | add $16, OUTP | 
|  | 2306 | cmp $16, LEN | 
|  | 2307 | jge .Lecb_enc_loop1 | 
|  | 2308 | .Lecb_enc_ret: | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 2309 | #ifndef __x86_64__ | 
|  | 2310 | popl KLEN | 
|  | 2311 | popl KEYP | 
|  | 2312 | popl LEN | 
|  | 2313 | #endif | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2314 | ret | 
|  | 2315 |  | 
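|  |  | /* | 
|  |  |  * The ECB walkers have the shape below (a C sketch, illustration only; | 
|  |  |  * enc4()/enc1() stand for the _aesni_enc4/_aesni_enc1 calls, and any | 
|  |  |  * tail shorter than 16 bytes is left untouched): | 
|  |  |  * | 
|  |  |  *	while (len >= 64) {		// four blocks per iteration | 
|  |  |  *		enc4(out, in); | 
|  |  |  *		in += 64; out += 64; len -= 64; | 
|  |  |  *	} | 
|  |  |  *	while (len >= 16) {		// remaining whole blocks | 
|  |  |  *		enc1(out, in); | 
|  |  |  *		in += 16; out += 16; len -= 16; | 
|  |  |  *	} | 
|  |  |  */ | 
|  |  |  | 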
|  | 2316 | /* | 
|  | 2317 | * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src, | 
|  | 2318 | *		      size_t len); | 
|  | 2319 | */ | 
|  | 2320 | ENTRY(aesni_ecb_dec) | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 2321 | #ifndef __x86_64__ | 
|  | 2322 | pushl LEN | 
|  | 2323 | pushl KEYP | 
|  | 2324 | pushl KLEN | 
|  | 2325 | movl 16(%esp), KEYP | 
|  | 2326 | movl 20(%esp), OUTP | 
|  | 2327 | movl 24(%esp), INP | 
|  | 2328 | movl 28(%esp), LEN | 
|  | 2329 | #endif | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2330 | test LEN, LEN | 
|  | 2331 | jz .Lecb_dec_ret | 
|  | 2332 | mov 480(KEYP), KLEN | 
|  | 2333 | add $240, KEYP | 
|  | 2334 | cmp $16, LEN | 
|  | 2335 | jb .Lecb_dec_ret | 
|  | 2336 | cmp $64, LEN | 
|  | 2337 | jb .Lecb_dec_loop1 | 
|  | 2338 | .align 4 | 
|  | 2339 | .Lecb_dec_loop4: | 
|  | 2340 | movups (INP), STATE1 | 
|  | 2341 | movups 0x10(INP), STATE2 | 
|  | 2342 | movups 0x20(INP), STATE3 | 
|  | 2343 | movups 0x30(INP), STATE4 | 
|  | 2344 | call _aesni_dec4 | 
|  | 2345 | movups STATE1, (OUTP) | 
|  | 2346 | movups STATE2, 0x10(OUTP) | 
|  | 2347 | movups STATE3, 0x20(OUTP) | 
|  | 2348 | movups STATE4, 0x30(OUTP) | 
|  | 2349 | sub $64, LEN | 
|  | 2350 | add $64, INP | 
|  | 2351 | add $64, OUTP | 
|  | 2352 | cmp $64, LEN | 
|  | 2353 | jge .Lecb_dec_loop4 | 
|  | 2354 | cmp $16, LEN | 
|  | 2355 | jb .Lecb_dec_ret | 
|  | 2356 | .align 4 | 
|  | 2357 | .Lecb_dec_loop1: | 
|  | 2358 | movups (INP), STATE1 | 
|  | 2359 | call _aesni_dec1 | 
|  | 2360 | movups STATE1, (OUTP) | 
|  | 2361 | sub $16, LEN | 
|  | 2362 | add $16, INP | 
|  | 2363 | add $16, OUTP | 
|  | 2364 | cmp $16, LEN | 
|  | 2365 | jge .Lecb_dec_loop1 | 
|  | 2366 | .Lecb_dec_ret: | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 2367 | #ifndef __x86_64__ | 
|  | 2368 | popl KLEN | 
|  | 2369 | popl KEYP | 
|  | 2370 | popl LEN | 
|  | 2371 | #endif | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2372 | ret | 
|  | 2373 |  | 
|  | 2374 | /* | 
|  | 2375 | * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src, | 
|  | 2376 | *		      size_t len, u8 *iv) | 
|  | 2377 | */ | 
|  | 2378 | ENTRY(aesni_cbc_enc) | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 2379 | #ifndef __x86_64__ | 
|  | 2380 | pushl IVP | 
|  | 2381 | pushl LEN | 
|  | 2382 | pushl KEYP | 
|  | 2383 | pushl KLEN | 
|  | 2384 | movl 20(%esp), KEYP | 
|  | 2385 | movl 24(%esp), OUTP | 
|  | 2386 | movl 28(%esp), INP | 
|  | 2387 | movl 32(%esp), LEN | 
|  | 2388 | movl 36(%esp), IVP | 
|  | 2389 | #endif | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2390 | cmp $16, LEN | 
|  | 2391 | jb .Lcbc_enc_ret | 
|  | 2392 | mov 480(KEYP), KLEN | 
|  | 2393 | movups (IVP), STATE	# load iv as initial state | 
|  | 2394 | .align 4 | 
|  | 2395 | .Lcbc_enc_loop: | 
|  | 2396 | movups (INP), IN	# load input | 
|  | 2397 | pxor IN, STATE | 
|  | 2398 | call _aesni_enc1 | 
|  | 2399 | movups STATE, (OUTP)	# store output | 
|  | 2400 | sub $16, LEN | 
|  | 2401 | add $16, INP | 
|  | 2402 | add $16, OUTP | 
|  | 2403 | cmp $16, LEN | 
|  | 2404 | jge .Lcbc_enc_loop | 
|  | 2405 | movups STATE, (IVP) | 
|  | 2406 | .Lcbc_enc_ret: | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 2407 | #ifndef __x86_64__ | 
|  | 2408 | popl KLEN | 
|  | 2409 | popl KEYP | 
|  | 2410 | popl LEN | 
|  | 2411 | popl IVP | 
|  | 2412 | #endif | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2413 | ret | 
|  | 2414 |  | 
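|  |  | /* | 
|  |  |  * CBC encryption is serial: with C_0 = IV, each block is | 
|  |  |  * C_i = E_K(P_i ^ C_{i-1}), so STATE always carries the previous | 
|  |  |  * ciphertext block and only the one-block path is used.  A sketch: | 
|  |  |  * | 
|  |  |  *	state = iv; | 
|  |  |  *	while (len >= 16) { | 
|  |  |  *		state = enc1(state ^ load(in)); | 
|  |  |  *		store(out, state); | 
|  |  |  *		in += 16; out += 16; len -= 16; | 
|  |  |  *	} | 
|  |  |  *	store(iv, state);		// chain into the next call | 
|  |  |  */ | 
|  |  |  | 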
|  | 2415 | /* | 
|  | 2416 | * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src, | 
|  | 2417 | *		      size_t len, u8 *iv) | 
|  | 2418 | */ | 
|  | 2419 | ENTRY(aesni_cbc_dec) | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 2420 | #ifndef __x86_64__ | 
|  | 2421 | pushl IVP | 
|  | 2422 | pushl LEN | 
|  | 2423 | pushl KEYP | 
|  | 2424 | pushl KLEN | 
|  | 2425 | movl 20(%esp), KEYP | 
|  | 2426 | movl 24(%esp), OUTP | 
|  | 2427 | movl 28(%esp), INP | 
|  | 2428 | movl 32(%esp), LEN | 
|  | 2429 | movl 36(%esp), IVP | 
|  | 2430 | #endif | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2431 | cmp $16, LEN | 
| Huang Ying | e6efaa0 | 2009-06-18 19:33:57 +0800 | [diff] [blame] | 2432 | jb .Lcbc_dec_just_ret | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2433 | mov 480(KEYP), KLEN | 
|  | 2434 | add $240, KEYP | 
|  | 2435 | movups (IVP), IV | 
|  | 2436 | cmp $64, LEN | 
|  | 2437 | jb .Lcbc_dec_loop1 | 
|  | 2438 | .align 4 | 
|  | 2439 | .Lcbc_dec_loop4: | 
|  | 2440 | movups (INP), IN1 | 
|  | 2441 | movaps IN1, STATE1 | 
|  | 2442 | movups 0x10(INP), IN2 | 
|  | 2443 | movaps IN2, STATE2 | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 2444 | #ifdef __x86_64__ | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2445 | movups 0x20(INP), IN3 | 
|  | 2446 | movaps IN3, STATE3 | 
|  | 2447 | movups 0x30(INP), IN4 | 
|  | 2448 | movaps IN4, STATE4 | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 2449 | #else | 
|  | 2450 | movups 0x20(INP), IN1 | 
|  | 2451 | movaps IN1, STATE3 | 
|  | 2452 | movups 0x30(INP), IN2 | 
|  | 2453 | movaps IN2, STATE4 | 
|  | 2454 | #endif | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2455 | call _aesni_dec4 | 
|  | 2456 | pxor IV, STATE1 | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 2457 | #ifdef __x86_64__ | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2458 | pxor IN1, STATE2 | 
|  | 2459 | pxor IN2, STATE3 | 
|  | 2460 | pxor IN3, STATE4 | 
|  | 2461 | movaps IN4, IV | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 2462 | #else | 
|  | 2463 | pxor (INP), STATE2 | 
|  | 2464 | pxor 0x10(INP), STATE3 | 
|  | 2465 | pxor IN1, STATE4 | 
|  | 2466 | movaps IN2, IV | 
|  | 2467 | #endif | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2468 | movups STATE1, (OUTP) | 
|  | 2469 | movups STATE2, 0x10(OUTP) | 
|  | 2470 | movups STATE3, 0x20(OUTP) | 
|  | 2471 | movups STATE4, 0x30(OUTP) | 
|  | 2472 | sub $64, LEN | 
|  | 2473 | add $64, INP | 
|  | 2474 | add $64, OUTP | 
|  | 2475 | cmp $64, LEN | 
|  | 2476 | jge .Lcbc_dec_loop4 | 
|  | 2477 | cmp $16, LEN | 
|  | 2478 | jb .Lcbc_dec_ret | 
|  | 2479 | .align 4 | 
|  | 2480 | .Lcbc_dec_loop1: | 
|  | 2481 | movups (INP), IN | 
|  | 2482 | movaps IN, STATE | 
|  | 2483 | call _aesni_dec1 | 
|  | 2484 | pxor IV, STATE | 
|  | 2485 | movups STATE, (OUTP) | 
|  | 2486 | movaps IN, IV | 
|  | 2487 | sub $16, LEN | 
|  | 2488 | add $16, INP | 
|  | 2489 | add $16, OUTP | 
|  | 2490 | cmp $16, LEN | 
|  | 2491 | jge .Lcbc_dec_loop1 | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2492 | .Lcbc_dec_ret: | 
| Huang Ying | e6efaa0 | 2009-06-18 19:33:57 +0800 | [diff] [blame] | 2493 | movups IV, (IVP) | 
|  | 2494 | .Lcbc_dec_just_ret: | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 2495 | #ifndef __x86_64__ | 
|  | 2496 | popl KLEN | 
|  | 2497 | popl KEYP | 
|  | 2498 | popl LEN | 
|  | 2499 | popl IVP | 
|  | 2500 | #endif | 
| Huang Ying | 54b6a1b | 2009-01-18 16:28:34 +1100 | [diff] [blame] | 2501 | ret | 
| Huang Ying | 12387a4 | 2010-03-10 18:28:55 +0800 | [diff] [blame] | 2502 |  | 
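|  |  | /* | 
|  |  |  * CBC decryption parallelises because P_i = D_K(C_i) ^ C_{i-1} only | 
|  |  |  * needs the previous *ciphertext* block: four blocks go through | 
|  |  |  * _aesni_dec4 at once and are XORed with their predecessors afterwards. | 
|  |  |  * On i386 only eight XMM registers are available, so the 32-bit path | 
|  |  |  * re-reads the first two predecessor blocks from (INP)/0x10(INP) | 
|  |  |  * instead of keeping all four input blocks in registers. | 
|  |  |  */ | 
|  |  |  | 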
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 2503 | #ifdef __x86_64__ | 
| Huang Ying | 12387a4 | 2010-03-10 18:28:55 +0800 | [diff] [blame] | 2504 | .align 16 | 
|  | 2505 | .Lbswap_mask: | 
|  | 2506 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | 
|  | 2507 |  | 
|  | 2508 | /* | 
|  | 2509 | * _aesni_inc_init:	internal ABI | 
|  | 2510 | *	set up the registers used by _aesni_inc | 
|  | 2511 | * input: | 
|  | 2512 | *	IV | 
|  | 2513 | * output: | 
|  | 2514 | *	CTR:	== IV, in little endian | 
|  | 2515 | *	TCTR_LOW: == lower qword of CTR | 
|  | 2516 | *	INC:	== 1, in little endian | 
|  | 2517 | *	BSWAP_MASK == endian swapping mask | 
|  | 2518 | */ | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 2519 | .align 4 | 
| Huang Ying | 12387a4 | 2010-03-10 18:28:55 +0800 | [diff] [blame] | 2520 | _aesni_inc_init: | 
|  | 2521 | movaps .Lbswap_mask, BSWAP_MASK | 
|  | 2522 | movaps IV, CTR | 
|  | 2523 | PSHUFB_XMM BSWAP_MASK CTR | 
|  | 2524 | mov $1, TCTR_LOW | 
| Huang Ying | 32cbd7d | 2010-03-13 16:28:42 +0800 | [diff] [blame] | 2525 | MOVQ_R64_XMM TCTR_LOW INC | 
|  | 2526 | MOVQ_R64_XMM CTR TCTR_LOW | 
| Huang Ying | 12387a4 | 2010-03-10 18:28:55 +0800 | [diff] [blame] | 2527 | ret | 
|  | 2528 |  | 
|  | 2529 | /* | 
|  | 2530 | * _aesni_inc:		internal ABI | 
|  | 2531 | *	Increase IV by 1; IV is big-endian | 
|  | 2532 | * input: | 
|  | 2533 | *	IV | 
|  | 2534 | *	CTR:	== IV, in little endian | 
|  | 2535 | *	TCTR_LOW: == lower qword of CTR | 
|  | 2536 | *	INC:	== 1, in little endian | 
|  | 2537 | *	BSWAP_MASK == endian swapping mask | 
|  | 2538 | * output: | 
|  | 2539 | *	IV:	increased by 1 | 
|  | 2540 | * changed: | 
|  | 2541 | *	CTR:	== output IV, in little endian | 
|  | 2542 | *	TCTR_LOW: == lower qword of CTR | 
|  | 2543 | */ | 
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 2544 | .align 4 | 
| Huang Ying | 12387a4 | 2010-03-10 18:28:55 +0800 | [diff] [blame] | 2545 | _aesni_inc: | 
|  | 2546 | paddq INC, CTR | 
|  | 2547 | add $1, TCTR_LOW | 
|  | 2548 | jnc .Linc_low | 
|  | 2549 | pslldq $8, INC | 
|  | 2550 | paddq INC, CTR | 
|  | 2551 | psrldq $8, INC | 
|  | 2552 | .Linc_low: | 
|  | 2553 | movaps CTR, IV | 
|  | 2554 | PSHUFB_XMM BSWAP_MASK IV | 
|  | 2555 | ret | 
|  | 2556 |  | 
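|  |  | /* | 
|  |  |  * Roughly equivalent C for _aesni_inc (illustration only; bswap128() | 
|  |  |  * stands for the PSHUFB byte reversal): the counter lives little-endian | 
|  |  |  * in CTR with its low quadword mirrored in TCTR_LOW, so a plain 64-bit | 
|  |  |  * add detects the carry into the high half: | 
|  |  |  * | 
|  |  |  *	ctr.lo++; | 
|  |  |  *	if (ctr.lo == 0)		// carry out of the low quadword | 
|  |  |  *		ctr.hi++; | 
|  |  |  *	iv = bswap128(ctr);		// hand a big-endian IV back | 
|  |  |  */ | 
|  |  |  | 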
|  | 2557 | /* | 
|  | 2558 | * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src, | 
|  | 2559 | *		      size_t len, u8 *iv) | 
|  | 2560 | */ | 
|  | 2561 | ENTRY(aesni_ctr_enc) | 
|  | 2562 | cmp $16, LEN | 
|  | 2563 | jb .Lctr_enc_just_ret | 
|  | 2564 | mov 480(KEYP), KLEN | 
|  | 2565 | movups (IVP), IV | 
|  | 2566 | call _aesni_inc_init | 
|  | 2567 | cmp $64, LEN | 
|  | 2568 | jb .Lctr_enc_loop1 | 
|  | 2569 | .align 4 | 
|  | 2570 | .Lctr_enc_loop4: | 
|  | 2571 | movaps IV, STATE1 | 
|  | 2572 | call _aesni_inc | 
|  | 2573 | movups (INP), IN1 | 
|  | 2574 | movaps IV, STATE2 | 
|  | 2575 | call _aesni_inc | 
|  | 2576 | movups 0x10(INP), IN2 | 
|  | 2577 | movaps IV, STATE3 | 
|  | 2578 | call _aesni_inc | 
|  | 2579 | movups 0x20(INP), IN3 | 
|  | 2580 | movaps IV, STATE4 | 
|  | 2581 | call _aesni_inc | 
|  | 2582 | movups 0x30(INP), IN4 | 
|  | 2583 | call _aesni_enc4 | 
|  | 2584 | pxor IN1, STATE1 | 
|  | 2585 | movups STATE1, (OUTP) | 
|  | 2586 | pxor IN2, STATE2 | 
|  | 2587 | movups STATE2, 0x10(OUTP) | 
|  | 2588 | pxor IN3, STATE3 | 
|  | 2589 | movups STATE3, 0x20(OUTP) | 
|  | 2590 | pxor IN4, STATE4 | 
|  | 2591 | movups STATE4, 0x30(OUTP) | 
|  | 2592 | sub $64, LEN | 
|  | 2593 | add $64, INP | 
|  | 2594 | add $64, OUTP | 
|  | 2595 | cmp $64, LEN | 
|  | 2596 | jge .Lctr_enc_loop4 | 
|  | 2597 | cmp $16, LEN | 
|  | 2598 | jb .Lctr_enc_ret | 
|  | 2599 | .align 4 | 
|  | 2600 | .Lctr_enc_loop1: | 
|  | 2601 | movaps IV, STATE | 
|  | 2602 | call _aesni_inc | 
|  | 2603 | movups (INP), IN | 
|  | 2604 | call _aesni_enc1 | 
|  | 2605 | pxor IN, STATE | 
|  | 2606 | movups STATE, (OUTP) | 
|  | 2607 | sub $16, LEN | 
|  | 2608 | add $16, INP | 
|  | 2609 | add $16, OUTP | 
|  | 2610 | cmp $16, LEN | 
|  | 2611 | jge .Lctr_enc_loop1 | 
|  | 2612 | .Lctr_enc_ret: | 
|  | 2613 | movups IV, (IVP) | 
|  | 2614 | .Lctr_enc_just_ret: | 
|  | 2615 | ret | 
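|  |  |  | 
|  |  | /* | 
|  |  |  * CTR mode only ever runs the cipher forward: each 16-byte block is | 
|  |  |  * XORed with the encrypted counter block, and the counter is bumped by | 
|  |  |  * one (big endian) per block.  A sketch, with E_K() standing for the | 
|  |  |  * _aesni_enc1/_aesni_enc4 calls: | 
|  |  |  * | 
|  |  |  *	while (len >= 16) { | 
|  |  |  *		store(out, load(in) ^ E_K(ctr)); | 
|  |  |  *		ctr++;			// via _aesni_inc | 
|  |  |  *		in += 16; out += 16; len -= 16; | 
|  |  |  *	} | 
|  |  |  *	store(ivp, ctr);		// updated counter handed back | 
|  |  |  */ | 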
| Mathias Krause | 0d258ef | 2010-11-27 16:34:46 +0800 | [diff] [blame] | 2616 | #endif |