| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1 | // ------------------------------------------------------------------------- | 
 | 2 | // Copyright (c) 2001, Dr Brian Gladman <                 >, Worcester, UK. | 
 | 3 | // All rights reserved. | 
 | 4 | // | 
 | 5 | // LICENSE TERMS | 
 | 6 | // | 
 | 7 | // The free distribution and use of this software in both source and binary  | 
 | 8 | // form is allowed (with or without changes) provided that: | 
 | 9 | // | 
 | 10 | //   1. distributions of this source code include the above copyright  | 
 | 11 | //      notice, this list of conditions and the following disclaimer// | 
 | 12 | // | 
 | 13 | //   2. distributions in binary form include the above copyright | 
 | 14 | //      notice, this list of conditions and the following disclaimer | 
 | 15 | //      in the documentation and/or other associated materials// | 
 | 16 | // | 
 | 17 | //   3. the copyright holder's name is not used to endorse products  | 
 | 18 | //      built using this software without specific written permission. | 
 | 19 | // | 
 | 20 | // | 
 | 21 | // ALTERNATIVELY, provided that this notice is retained in full, this product | 
 | 22 | // may be distributed under the terms of the GNU General Public License (GPL), | 
 | 23 | // in which case the provisions of the GPL apply INSTEAD OF those given above. | 
 | 24 | // | 
 | 25 | // Copyright (c) 2004 Linus Torvalds <torvalds@osdl.org> | 
 | 26 | // Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> | 
 | 27 |  | 
 | 28 | // DISCLAIMER | 
 | 29 | // | 
 | 30 | // This software is provided 'as is' with no explicit or implied warranties | 
 | 31 | // in respect of its properties including, but not limited to, correctness  | 
 | 32 | // and fitness for purpose. | 
 | 33 | // ------------------------------------------------------------------------- | 
 | 34 | // Issue Date: 29/07/2002 | 
 | 35 |  | 
 .file "aes-i586-asm.S"
.text

/*
 * Entry points implemented in this file:
 *
 *   aes_rval aes_enc_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1]);
 *   aes_rval aes_dec_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1]);
 */

/*
 * Size in bytes of one lookup table (256 entries of 32 bits).  The
 * round macros address four such tables at table, table+tlen,
 * table+2*tlen and table+3*tlen.
 */
#define tlen	1024

/*
 * Stack offsets of the three arguments, valid while exactly one
 * register has been pushed on top of the return address.
 */
#define in_blk	8	/* address of the input byte array */
#define out_blk	12	/* address of the output byte array */
#define ctx	16	/* address of the AES context structure */

/* Byte offsets of the fields read from the AES context structure. */
#define ekey	0	/* base of the encryption key schedule */
#define nrnd	256	/* number of rounds */
#define dkey	260	/* base of the decryption key schedule */
 | 55 |  | 
 /* Logical register names used by the encrypt and decrypt round macros. */
#define r0	eax
#define r1	ebx
#define r2	ecx
#define r3	edx
#define r4	esi
#define r5	edi

/*
 * Byte sub-register spellings: a 32-bit register name with "l" or "h"
 * appended maps to its low or high 8-bit register.  Only eax, ebx, ecx
 * and edx (r0..r3) have entries here, so l() and h() work on those
 * four registers only.
 */
#define eaxl	al
#define ebxl	bl
#define ecxl	cl
#define edxl	dl

#define eaxh	ah
#define ebxh	bh
#define ecxh	ch
#define edxh	dh

/*
 * l(reg) / h(reg): low / high byte register of reg.  The extra level
 * of indirection lets the preprocessor expand the argument first
 * (r0 -> eax) and only then paste the suffix (eax -> eaxl -> al).
 */
#define _l(r)	r##l
#define l(r)	_l(r)

#define _h(r)	r##h
#define h(r)	_h(r)
 | 79 |  | 
 // do_col: fold one 32-bit state column into four accumulators.
//
// The four bytes of col, least significant first, each index one of
// four consecutive tables of 256 32-bit words (tab, tab+tlen,
// tab+2*tlen, tab+3*tlen).  The word selected by byte n is xored
// into xn.
//
// Parameters:
//   tab          base address of the first table
//   x0,x1,x2,x3  registers that accumulate the entries for bytes 0..3
//   col          register holding the column; it is addressed through
//                l()/h(), so it has to be one of r0..r3 (destroyed)
//   scr          scratch register (clobbered)

#define do_col(tab, x0,x1,x2,x3, col, scr)	\
	movzbl  %l(col),%scr;			\
	xorl    tab(,%scr,4),%x0;		\
	movzbl  %h(col),%scr;			\
	shrl    $16,%col;			\
	xorl    tab+tlen(,%scr,4),%x1;		\
	movzbl  %l(col),%scr;			\
	movzbl  %h(col),%col;			\
	xorl    tab+2*tlen(,%scr,4),%x2;	\
	xorl    tab+3*tlen(,%col,4),%x3;
 | 105 |  | 
 // do_fcol: first column step of a forward round.
//
// Same table lookups as do_col, but each accumulator is first loaded
// from the round key instead of being xored into:
//   x0 = word at rk+0  ^ entry for byte 0 of col
//   x1 = word at rk+12 ^ entry for byte 1 of col
//   x2 = word at rk+8  ^ entry for byte 2 of col
//   x3 = word at rk+4  ^ entry for byte 3 of col
//
// rk is pasted textually after a constant ("0 rk", "12 rk", ...), so it
// must be written as an optional signed displacement followed by a
// base register, e.g. -16(%ebp), (%ebp) or +16(%ebp).
//
// NB1: the value x2 held on entry is copied to col before x2 is loaded
// NB2: the values x0, x1 and x3 held on entry are discarded
// scr is clobbered.
#define do_fcol(tab, x0,x1,x2,x3, col, scr, rk) \
	movl    0 rk,%x0;			\
	movzbl  %l(col),%scr;			\
	movl    12 rk,%x1;			\
	xorl    tab(,%scr,4),%x0;		\
	movl    4 rk,%x3;			\
	movzbl  %h(col),%scr;			\
	shrl    $16,%col;			\
	xorl    tab+tlen(,%scr,4),%x1;		\
	movzbl  %l(col),%scr;			\
	movzbl  %h(col),%col;			\
	xorl    tab+3*tlen(,%col,4),%x3;	\
	movl    %x2,%col;			\
	movl    8 rk,%x2;			\
	xorl    tab+2*tlen(,%scr,4),%x2;
 | 124 |  | 
 // do_icol: first column step of an inverse round.
//
// Identical to do_fcol except that the round key words at +4 and +12
// are assigned to the opposite accumulators:
//   x0 = word at rk+0  ^ entry for byte 0 of col
//   x1 = word at rk+4  ^ entry for byte 1 of col
//   x2 = word at rk+8  ^ entry for byte 2 of col
//   x3 = word at rk+12 ^ entry for byte 3 of col
//
// rk is pasted textually after a constant ("0 rk", "4 rk", ...), so it
// must be written as an optional signed displacement followed by a
// base register, e.g. +16(%ebp), (%ebp) or -16(%ebp).
//
// NB1: the value x2 held on entry is copied to col before x2 is loaded
// NB2: the values x0, x1 and x3 held on entry are discarded
// scr is clobbered.
#define do_icol(tab, x0,x1,x2,x3, col, scr, rk) \
	movl    0 rk,%x0;			\
	movzbl  %l(col),%scr;			\
	movl    4 rk,%x1;			\
	xorl    tab(,%scr,4),%x0;		\
	movl    12 rk,%x3;			\
	movzbl  %h(col),%scr;			\
	shrl    $16,%col;			\
	xorl    tab+tlen(,%scr,4),%x1;		\
	movzbl  %l(col),%scr;			\
	movzbl  %h(col),%col;			\
	xorl    tab+3*tlen(,%col,4),%x3;	\
	movl    %x2,%col;			\
	movl    8 rk,%x2;			\
	xorl    tab+2*tlen(,%scr,4),%x2;
 | 143 |  | 
 | 144 |  | 
 // save(slot, reg): store register reg in 32-bit stack slot number slot,
// i.e. at 4*slot(%esp).  (Gladman's original could conditionally keep
// these values in MMX registers instead.)
#define save(slot, reg)		\
	movl    %reg,4*slot(%esp)
 | 148 |  | 
 // restore(reg, slot): load register reg from 32-bit stack slot number
// slot, i.e. from 4*slot(%esp).  Note the argument order is the reverse
// of save().
#define restore(reg, slot)	\
	movl    4*slot(%esp),%reg
 | 151 |  | 
 // Forward (encryption) round macros.
//
// Each macro performs one table-driven round.  Two stack slots at
// (%esp) and 4(%esp) are used as temporary storage, so the caller must
// have reserved 8 bytes there.
//
// fwd_rnd1(rk, tab)
//   rk   round key operand, e.g. -16(%ebp)
//   tab  base address of the lookup tables
//
// round column values
// on entry: r0,r1,r4,r5
// on exit:  r2,r1,r4,r5
// r0 and r3 are clobbered.
#define fwd_rnd1(rk, tab)						\
	/* r1 and r5 are overwritten before they are consumed */	\
	save(0, r1);							\
	save(1, r5);							\
									\
	/* one step per previous-round column */			\
	do_fcol(tab, r2,r5,r4,r1, r0,r3, rk);	/* uses old r0 */	\
	do_col (tab, r4,r1,r2,r5, r0,r3);	/* uses old r4 */	\
	restore(r0, 0);							\
	do_col (tab, r1,r2,r5,r4, r0,r3);	/* uses old r1 */	\
	restore(r0, 1);							\
	do_col (tab, r5,r4,r1,r2, r0,r3);	/* uses old r5 */
 | 171 |  | 
 // fwd_rnd2(rk, tab): one forward round, the counterpart of fwd_rnd1
// with the roles of r0 and r2 exchanged.
//   rk   round key operand, e.g. -16(%ebp)
//   tab  base address of the lookup tables
//
// round column values
// on entry: r2,r1,r4,r5
// on exit:  r0,r1,r4,r5
// r2 and r3 are clobbered; stack slots 0 and 1 at (%esp) are used.
#define fwd_rnd2(rk, tab)						\
	/* r1 and r5 are overwritten before they are consumed */	\
	save(0, r1);							\
	save(1, r5);							\
									\
	/* one step per previous-round column */			\
	do_fcol(tab, r0,r5,r4,r1, r2,r3, rk);	/* uses old r2 */	\
	do_col (tab, r4,r1,r0,r5, r2,r3);	/* uses old r4 */	\
	restore(r2, 0);							\
	do_col (tab, r1,r0,r5,r4, r2,r3);	/* uses old r1 */	\
	restore(r2, 1);							\
	do_col (tab, r5,r4,r1,r0, r2,r3);	/* uses old r5 */
 | 186 |  | 
 // Inverse (decryption) round macros.
//
// Each macro performs one table-driven inverse round.  Two stack slots
// at (%esp) and 4(%esp) are used as temporary storage, so the caller
// must have reserved 8 bytes there.
//
// inv_rnd1(rk, tab)
//   rk   round key operand, e.g. +16(%ebp)
//   tab  base address of the lookup tables
//
// round column values
// on entry: r0,r1,r4,r5
// on exit:  r2,r1,r4,r5
// r0 and r3 are clobbered.
#define inv_rnd1(rk, tab)						\
	/* r1 and r5 are overwritten before they are consumed */	\
	save(0, r1);							\
	save(1, r5);							\
									\
	/* one step per previous-round column */			\
	do_icol(tab, r2,r1,r4,r5, r0,r3, rk);	/* uses old r0 */	\
	do_col (tab, r4,r5,r2,r1, r0,r3);	/* uses old r4 */	\
	restore(r0, 0);							\
	do_col (tab, r1,r4,r5,r2, r0,r3);	/* uses old r1 */	\
	restore(r0, 1);							\
	do_col (tab, r5,r2,r1,r4, r0,r3);	/* uses old r5 */
 | 206 |  | 
 // inv_rnd2(rk, tab): one inverse round, the counterpart of inv_rnd1
// with the roles of r0 and r2 exchanged.
//   rk   round key operand, e.g. +16(%ebp)
//   tab  base address of the lookup tables
//
// round column values
// on entry: r2,r1,r4,r5
// on exit:  r0,r1,r4,r5
// r2 and r3 are clobbered; stack slots 0 and 1 at (%esp) are used.
#define inv_rnd2(rk, tab)						\
	/* r1 and r5 are overwritten before they are consumed */	\
	save(0, r1);							\
	save(1, r5);							\
									\
	/* one step per previous-round column */			\
	do_icol(tab, r0,r1,r4,r5, r2,r3, rk);	/* uses old r2 */	\
	do_col (tab, r4,r5,r0,r1, r2,r3);	/* uses old r4 */	\
	restore(r2, 0);							\
	do_col (tab, r1,r4,r5,r0, r2,r3);	/* uses old r1 */	\
	restore(r2, 1);							\
	do_col (tab, r5,r0,r1,r4, r2,r3);	/* uses old r5 */
 | 221 |  | 
 // AES (Rijndael) Encryption Subroutine
//
// aes_rval aes_enc_blk(const unsigned char in_blk[], unsigned char out_blk[],
//                      const aes_ctx cx[1]);
//
// In:     in_blk, out_blk and cx are read from the stack.
// Out:    16 bytes are stored at out_blk; %eax = 1.
// Saved:  %ebp, %ebx, %esi and %edi are pushed on entry, popped on exit.
// Clobb:  %ecx, %edx, flags.
// Stack:  four saved registers plus 8 bytes of scratch for the round
//         macros.
//
// The round count is read from nrnd in the context: 10 enters at label
// 4, 12 enters at label 3, and every other value falls through into
// the 14-round path at label 2.

.global  aes_enc_blk
.type    aes_enc_blk,@function

.extern  ft_tab
.extern  fl_tab

.align 4

aes_enc_blk:
	pushl   %ebp
	movl    ctx(%esp),%ebp		// pointer to context

// CAUTION: the order and the values used in these assigns
// rely on the register mappings

	pushl   %ebx
	movl    in_blk+4(%esp),%r2	// +4: two registers are pushed by now
	pushl   %esi
	movl    nrnd(%ebp),%r3		// number of rounds
	pushl   %edi
#if ekey != 0
	leal    ekey(%ebp),%ebp		// key pointer
#endif

// input four columns and xor in first round key

	movl    (%r2),%r0
	movl    4(%r2),%r1
	movl    8(%r2),%r4
	movl    12(%r2),%r5
	xorl    (%ebp),%r0
	xorl    4(%ebp),%r1
	xorl    8(%ebp),%r4
	xorl    12(%ebp),%r5

	subl    $8,%esp			// space for register saves on stack
	addl    $16,%ebp		// advance to the next round key
	subl    $10,%r3
	je      4f			// 10 rounds for 128-bit key
	addl    $32,%ebp		// two extra rounds: bias key pointer
	subl    $2,%r3
	je      3f			// 12 rounds for 192-bit key
	addl    $32,%ebp		// two more extra rounds

2:	fwd_rnd1( -64(%ebp) ,ft_tab)	// 14 rounds for 256-bit key
	fwd_rnd2( -48(%ebp) ,ft_tab)
3:	fwd_rnd1( -32(%ebp) ,ft_tab)	// 12 rounds for 192-bit key
	fwd_rnd2( -16(%ebp) ,ft_tab)
4:	fwd_rnd1(    (%ebp) ,ft_tab)	// 10 rounds for 128-bit key
	fwd_rnd2( +16(%ebp) ,ft_tab)
	fwd_rnd1( +32(%ebp) ,ft_tab)
	fwd_rnd2( +48(%ebp) ,ft_tab)
	fwd_rnd1( +64(%ebp) ,ft_tab)
	fwd_rnd2( +80(%ebp) ,ft_tab)
	fwd_rnd1( +96(%ebp) ,ft_tab)
	fwd_rnd2(+112(%ebp) ,ft_tab)
	fwd_rnd1(+128(%ebp) ,ft_tab)
	fwd_rnd2(+144(%ebp) ,fl_tab)	// last round uses a different table

// move final values to the output array.  CAUTION: the
// order of these assigns rely on the register mappings

	addl    $8,%esp			// drop the scratch slots
	movl    out_blk+12(%esp),%ebp	// +12: four registers are still pushed
	movl    %r5,12(%ebp)
	popl    %edi
	movl    %r4,8(%ebp)
	popl    %esi
	movl    %r1,4(%ebp)
	popl    %ebx
	movl    %r0,(%ebp)
	popl    %ebp
	movl    $1,%eax			// return value
	ret

.size    aes_enc_blk,.-aes_enc_blk
 | 297 |  | 
 // AES (Rijndael) Decryption Subroutine
//
// aes_rval aes_dec_blk(const unsigned char in_blk[], unsigned char out_blk[],
//                      const aes_ctx cx[1]);
//
// In:     in_blk, out_blk and cx are read from the stack.
// Out:    16 bytes are stored at out_blk; %eax = 1.
// Saved:  %ebp, %ebx, %esi and %edi are pushed on entry, popped on exit.
// Clobb:  %ecx, %edx, flags.
// Stack:  four saved registers plus 8 bytes of scratch for the round
//         macros.
//
// The decryption key schedule is walked from its last round key
// (dkey + 16*nrnd) down towards dkey.  The round count is read from
// nrnd in the context: 10 enters at label 4, 12 enters at label 3, and
// every other value falls through into the 14-round path at label 2.

.global  aes_dec_blk
.type    aes_dec_blk,@function

.extern  it_tab
.extern  il_tab

.align 4

aes_dec_blk:
	pushl   %ebp
	movl    ctx(%esp),%ebp		// pointer to context

// CAUTION: the order and the values used in these assigns
// rely on the register mappings

	pushl   %ebx
	movl    in_blk+4(%esp),%r2	// +4: two registers are pushed by now
	pushl   %esi
	movl    nrnd(%ebp),%r3		// number of rounds
	pushl   %edi
#if dkey != 0
	leal    dkey(%ebp),%ebp		// key pointer
#endif
	movl    %r3,%r0
	shll    $4,%r0			// 16 bytes per round key
	addl    %r0,%ebp		// point at the last round key

// input four columns and xor in first round key

	movl    (%r2),%r0
	movl    4(%r2),%r1
	movl    8(%r2),%r4
	movl    12(%r2),%r5
	xorl    (%ebp),%r0
	xorl    4(%ebp),%r1
	xorl    8(%ebp),%r4
	xorl    12(%ebp),%r5

	subl    $8,%esp			// space for register saves on stack
	subl    $16,%ebp		// step down to the next round key
	subl    $10,%r3
	je      4f			// 10 rounds for 128-bit key
	subl    $32,%ebp		// two extra rounds: bias key pointer
	subl    $2,%r3
	je      3f			// 12 rounds for 192-bit key
	subl    $32,%ebp		// two more extra rounds

2:	inv_rnd1( +64(%ebp), it_tab)	// 14 rounds for 256-bit key
	inv_rnd2( +48(%ebp), it_tab)
3:	inv_rnd1( +32(%ebp), it_tab)	// 12 rounds for 192-bit key
	inv_rnd2( +16(%ebp), it_tab)
4:	inv_rnd1(    (%ebp), it_tab)	// 10 rounds for 128-bit key
	inv_rnd2( -16(%ebp), it_tab)
	inv_rnd1( -32(%ebp), it_tab)
	inv_rnd2( -48(%ebp), it_tab)
	inv_rnd1( -64(%ebp), it_tab)
	inv_rnd2( -80(%ebp), it_tab)
	inv_rnd1( -96(%ebp), it_tab)
	inv_rnd2(-112(%ebp), it_tab)
	inv_rnd1(-128(%ebp), it_tab)
	inv_rnd2(-144(%ebp), il_tab)	// last round uses a different table

// move final values to the output array.  CAUTION: the
// order of these assigns rely on the register mappings

	addl    $8,%esp			// drop the scratch slots
	movl    out_blk+12(%esp),%ebp	// +12: four registers are still pushed
	movl    %r5,12(%ebp)
	popl    %edi
	movl    %r4,8(%ebp)
	popl    %esi
	movl    %r1,4(%ebp)
	popl    %ebx
	movl    %r0,(%ebp)
	popl    %ebp
	movl    $1,%eax			// return value
	ret

.size    aes_dec_blk,.-aes_dec_blk
 | 376 |  |