| Joachim Fritschi | b9f535f | 2006-06-20 20:59:16 +1000 | [diff] [blame] | 1 | /*************************************************************************** | 
|  | 2 | *   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        * | 
|  | 3 | *                                                                         * | 
|  | 4 | *   This program is free software; you can redistribute it and/or modify  * | 
|  | 5 | *   it under the terms of the GNU General Public License as published by  * | 
|  | 6 | *   the Free Software Foundation; either version 2 of the License, or     * | 
|  | 7 | *   (at your option) any later version.                                   * | 
|  | 8 | *                                                                         * | 
|  | 9 | *   This program is distributed in the hope that it will be useful,       * | 
|  | 10 | *   but WITHOUT ANY WARRANTY; without even the implied warranty of        * | 
|  | 11 | *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         * | 
|  | 12 | *   GNU General Public License for more details.                          * | 
|  | 13 | *                                                                         * | 
|  | 14 | *   You should have received a copy of the GNU General Public License     * | 
|  | 15 | *   along with this program; if not, write to the                         * | 
|  | 16 | *   Free Software Foundation, Inc.,                                       * | 
|  | 17 | *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             * | 
|  | 18 | ***************************************************************************/ | 
|  | 19 |  | 
|  | 20 | .file "twofish-i586-asm.S" | 
|  | 21 | .text | 
|  | 22 |  | 
|  | 23 | #include <asm/asm-offsets.h> | 
|  | 24 |  | 
/*
 * Stack offsets of the three arguments, relative to %esp on function
 * entry; the return address is at offset 0.
 */
#define tfm		4	/* Twofish context structure */
#define out_blk		8	/* output byte array address parameter */
#define in_blk		12	/* input byte array address parameter */

/* Byte offsets of the four 32-bit words a, b, c and d within a block */
#define a_offset	0
#define b_offset	4
#define c_offset	8
#define d_offset	12
|  | 35 |  | 
/*
 * Layout of the crypto context, as byte offsets from its start:
 * four S-box arrays of 256 words (1024 bytes) each, then the
 * whitening keys, then the round keys.
 */
#define s0	0	/* S0 array */
#define s1	1024	/* S1 array */
#define s2	2048	/* S2 array */
#define s3	3072	/* S3 array */
#define w	4096	/* 8 whitening keys (word) */
#define k	4128	/* key 1-32 (word) */
|  | 44 |  | 
/*
 * Register aliases that allow macro substitution: the round macros
 * take R0..R3 as arguments and paste a suffix onto them to pick the
 * operand width.
 */

/* D: the full 32-bit register */
#define R0D    %eax
#define R1D    %ebx
#define R2D    %ecx
#define R3D    %edx

/* B: the lowest byte */
#define R0B    %al
#define R1B    %bl
#define R2B    %cl
#define R3B    %dl

/* H: the second-lowest byte */
#define R0H    %ah
#define R1H    %bh
#define R2H    %ch
#define R3H    %dh
|  | 62 |  | 
|  | 63 |  | 
/*
 * Input whitening: xor the whitening key word found at byte offset
 * "offset" from the start of the w array in "context" into src.
 */
#define input_whitening(src,context,offset)\
	xor	w+offset(context),	src;
|  | 67 |  | 
/*
 * Output whitening: xor a word from the second group of four
 * whitening keys (16 bytes into the w array of "context") into src.
 * "offset" selects the word within that group.
 */
#define output_whitening(src,context,offset)\
	xor	w+16+offset(context),	src;
|  | 71 |  | 
/*
 * Body shared by encrypt_round and encrypt_last_round; the two differ
 * only in the rotates passed as b_rot and c_rot.
 *
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * round byte offset of this round's two key words within k
 * b_rot rotate instruction (mnemonic and count) applied to b once all
 *       four of its bytes have been used as table indices
 * c_rot rotate instruction (mnemonic and count) applied to c after the
 *       round value has been xored into it
 *
 * %ebp must point to the crypto context.  %esi and %edi are scratch.
 * d is parked on the stack while its register collects the table
 * lookups indexed by the bytes of b; %esi collects those for a.
 * operations on a and b are interleaved to increase performance
 */
#define encrypt_round_common(a,b,c,d,round,b_rot,c_rot)\
	push	d ## D;				/* park d */\
	movzx	b ## B,		%edi;\
	mov	s1(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	mov	s2(%ebp,%edi,4),%esi;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;		/* expose upper half of b */\
	xor	s2(%ebp,%edi,4),d ## D;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;		/* undo the rotation of a */\
	xor	s3(%ebp,%edi,4),%esi;\
	movzx	b ## B,		%edi;\
	xor	s3(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	xor	(%ebp,%edi,4),	%esi;		/* s0 is at offset 0 */\
	movzx	b ## H,		%edi;\
	b_rot,		b ## D;\
	xor	(%ebp,%edi,4),	d ## D;		/* s0 is at offset 0 */\
	movzx	a ## H,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	pop	%edi;				/* edi = parked d */\
	add	d ## D,		%esi;		/* esi = a-sum + b-sum */\
	add	%esi,		d ## D;		/* d = a-sum + 2 * b-sum */\
	add	k+round(%ebp),	%esi;\
	xor	%esi,		c ## D;\
	c_rot,		c ## D;\
	add	k+4+round(%ebp),d ## D;\
	xor	%edi,		d ## D;

/*
 * Regular encryption round (arguments as for encrypt_round_common).
 * b ends up rotated by 16 + 15 = 31 to the right, i.e. rol $1, and c is
 * rotated left by 15, i.e. ror $1 followed by a rotate by 16.  The
 * callers swap the register pairs on every round, so b and c become
 * the d (already rol $1) and a (rotated 16) of the next round.
 */
#define encrypt_round(a,b,c,d,round)\
	encrypt_round_common(a,b,c,d,round,ror $15,rol $15)

/*
 * Final encryption round (arguments as for encrypt_round_common).
 * last round has different rotations for the output preparation:
 * b is rotated back by a second ror $16 and c only gets ror $1, so
 * neither register is left pre-rotated for a following round.
 */
#define encrypt_last_round(a,b,c,d,round)\
	encrypt_round_common(a,b,c,d,round,ror $16,ror $1)
|  | 146 |  | 
/*
 * Body shared by decrypt_round and decrypt_last_round; the two differ
 * only in the rotates passed as a_rot and d_rot.
 *
 * a input register containing a
 * b input register containing b (rotated 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 * round byte offset of this round's two key words within k
 * a_rot rotate instruction (mnemonic and count) applied to a once all
 *       four of its bytes have been used as table indices
 * d_rot rotate instruction (mnemonic and count) applied to d after the
 *       round value has been xored into it
 *
 * %ebp must point to the crypto context.  %esi and %edi are scratch.
 * c is parked on the stack while its register collects the table
 * lookups indexed by the bytes of a; %esi collects those for b.
 * operations on a and b are interleaved to increase performance
 */
#define decrypt_round_common(a,b,c,d,round,a_rot,d_rot)\
	push	c ## D;				/* park c */\
	movzx	a ## B,		%edi;\
	mov	(%ebp,%edi,4),	c ## D;		/* s0 is at offset 0 */\
	movzx	b ## B,		%edi;\
	mov	s3(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;		/* expose upper half of a */\
	xor	s1(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;		/* undo the rotation of b */\
	xor	(%ebp,%edi,4),	%esi;		/* s0 is at offset 0 */\
	movzx	a ## B,		%edi;\
	xor	s2(%ebp,%edi,4),c ## D;\
	movzx	b ## B,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	a_rot,		a ## D;\
	xor	s3(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	xor	s2(%ebp,%edi,4),%esi;\
	pop	%edi;				/* edi = parked c */\
	add	%esi,		c ## D;		/* c = a-sum + b-sum */\
	add	c ## D,		%esi;		/* esi = a-sum + 2 * b-sum */\
	add	k+round(%ebp),	c ## D;\
	xor	%edi,		c ## D;\
	add	k+4+round(%ebp),%esi;\
	xor	%esi,		d ## D;\
	d_rot,		d ## D;

/*
 * Regular decryption round (arguments as for decrypt_round_common).
 * a ends up rotated by 16 + 15 = 31 to the right, i.e. rol $1, and d is
 * rotated left by 15, i.e. ror $1 followed by a rotate by 16.  The
 * callers swap the register pairs on every round, so a and d become
 * the c (already rol $1) and b (rotated 16) of the next round.
 */
#define decrypt_round(a,b,c,d,round)\
	decrypt_round_common(a,b,c,d,round,ror $15,rol $15)

/*
 * Final decryption round (arguments as for decrypt_round_common).
 * last round has different rotations for the output preparation:
 * a is rotated back by a second ror $16 and d only gets ror $1, so
 * neither register is left pre-rotated for a following round.
 */
#define decrypt_last_round(a,b,c,d,round)\
	decrypt_round_common(a,b,c,d,round,ror $16,ror $1)
|  | 221 |  | 
/* Export both entry points and align the code that follows. */
.global twofish_dec_blk
.global twofish_enc_blk
.align 4
|  | 225 |  | 
/*
 * twofish_enc_blk: encrypt one block.
 *
 * All three arguments are read from the stack, at offsets tfm, out_blk
 * and in_blk from %esp on entry.  Four 32-bit words are loaded from
 * in_blk, passed through input whitening, 16 rounds and output
 * whitening, and stored to out_blk.
 *
 * %ebp, %ebx, %esi and %edi are saved and restored; %ecx and %edx are
 * clobbered; %eax is 1 on return.
 */
	.type	twofish_enc_blk, @function
twofish_enc_blk:
	push	%ebp			/* save registers according to calling convention */
	push	%ebx
	push	%esi
	push	%edi

	/* the four pushes above moved the arguments 16 bytes further up */
	mov	tfm + 16(%esp),	%ebp	/* abuse the base pointer: set new base pointer to the crypto tfm */
	add	$crypto_tfm_ctx_offset, %ebp	/* ctx address */
	mov	in_blk+16(%esp),%edi	/* input address in edi */

	mov	(%edi),		%eax
	mov	b_offset(%edi),	%ebx
	mov	c_offset(%edi),	%ecx
	mov	d_offset(%edi),	%edx
	input_whitening(%eax,%ebp,a_offset)
	ror	$16,	%eax		/* the round macro expects a rotated 16 */
	input_whitening(%ebx,%ebp,b_offset)
	input_whitening(%ecx,%ebp,c_offset)
	input_whitening(%edx,%ebp,d_offset)
	rol	$1,	%edx		/* the round macro expects d already rol $1 */

	/* the register pairs swap roles on every round */
	encrypt_round(R0,R1,R2,R3,0);
	encrypt_round(R2,R3,R0,R1,8);
	encrypt_round(R0,R1,R2,R3,2*8);
	encrypt_round(R2,R3,R0,R1,3*8);
	encrypt_round(R0,R1,R2,R3,4*8);
	encrypt_round(R2,R3,R0,R1,5*8);
	encrypt_round(R0,R1,R2,R3,6*8);
	encrypt_round(R2,R3,R0,R1,7*8);
	encrypt_round(R0,R1,R2,R3,8*8);
	encrypt_round(R2,R3,R0,R1,9*8);
	encrypt_round(R0,R1,R2,R3,10*8);
	encrypt_round(R2,R3,R0,R1,11*8);
	encrypt_round(R0,R1,R2,R3,12*8);
	encrypt_round(R2,R3,R0,R1,13*8);
	encrypt_round(R0,R1,R2,R3,14*8);
	encrypt_last_round(R2,R3,R0,R1,15*8);

	/* eax/ebx are whitened and stored as words c/d, ecx/edx as a/b */
	output_whitening(%eax,%ebp,c_offset)
	output_whitening(%ebx,%ebp,d_offset)
	output_whitening(%ecx,%ebp,a_offset)
	output_whitening(%edx,%ebp,b_offset)
	mov	out_blk+16(%esp),%edi
	mov	%eax,		c_offset(%edi)
	mov	%ebx,		d_offset(%edi)
	mov	%ecx,		(%edi)
	mov	%edx,		b_offset(%edi)

	pop	%edi
	pop	%esi
	pop	%ebx
	pop	%ebp
	mov	$1,	%eax
	ret
	.size	twofish_enc_blk, .-twofish_enc_blk
|  | 280 |  | 
/*
 * twofish_dec_blk: decrypt one block.
 *
 * All three arguments are read from the stack, at offsets tfm, out_blk
 * and in_blk from %esp on entry.  Four 32-bit words are loaded from
 * in_blk, xored with the output whitening keys, passed through the 16
 * rounds in reverse key order, xored with the input whitening keys and
 * stored to out_blk.
 *
 * %ebp, %ebx, %esi and %edi are saved and restored; %ecx and %edx are
 * clobbered; %eax is 1 on return.
 */
	.align	4
	.type	twofish_dec_blk, @function
twofish_dec_blk:
	push	%ebp			/* save registers according to calling convention */
	push	%ebx
	push	%esi
	push	%edi

	/* the four pushes above moved the arguments 16 bytes further up */
	mov	tfm + 16(%esp),	%ebp	/* abuse the base pointer: set new base pointer to the crypto tfm */
	add	$crypto_tfm_ctx_offset, %ebp	/* ctx address */
	mov	in_blk+16(%esp),%edi	/* input address in edi */

	mov	(%edi),		%eax
	mov	b_offset(%edi),	%ebx
	mov	c_offset(%edi),	%ecx
	mov	d_offset(%edi),	%edx
	output_whitening(%eax,%ebp,a_offset)
	output_whitening(%ebx,%ebp,b_offset)
	ror	$16,	%ebx		/* the round macro expects b rotated 16 */
	output_whitening(%ecx,%ebp,c_offset)
	output_whitening(%edx,%ebp,d_offset)
	rol	$1,	%ecx		/* the round macro expects c already rol $1 */

	/* the register pairs swap roles on every round */
	decrypt_round(R0,R1,R2,R3,15*8);
	decrypt_round(R2,R3,R0,R1,14*8);
	decrypt_round(R0,R1,R2,R3,13*8);
	decrypt_round(R2,R3,R0,R1,12*8);
	decrypt_round(R0,R1,R2,R3,11*8);
	decrypt_round(R2,R3,R0,R1,10*8);
	decrypt_round(R0,R1,R2,R3,9*8);
	decrypt_round(R2,R3,R0,R1,8*8);
	decrypt_round(R0,R1,R2,R3,7*8);
	decrypt_round(R2,R3,R0,R1,6*8);
	decrypt_round(R0,R1,R2,R3,5*8);
	decrypt_round(R2,R3,R0,R1,4*8);
	decrypt_round(R0,R1,R2,R3,3*8);
	decrypt_round(R2,R3,R0,R1,2*8);
	decrypt_round(R0,R1,R2,R3,1*8);
	decrypt_last_round(R2,R3,R0,R1,0);

	/* eax/ebx are whitened and stored as words c/d, ecx/edx as a/b */
	input_whitening(%eax,%ebp,c_offset)
	input_whitening(%ebx,%ebp,d_offset)
	input_whitening(%ecx,%ebp,a_offset)
	input_whitening(%edx,%ebp,b_offset)
	mov	out_blk+16(%esp),%edi
	mov	%eax,		c_offset(%edi)
	mov	%ebx,		d_offset(%edi)
	mov	%ecx,		(%edi)
	mov	%edx,		b_offset(%edi)

	pop	%edi
	pop	%esi
	pop	%ebx
	pop	%ebp
	mov	$1,	%eax
	ret
	.size	twofish_dec_blk, .-twofish_dec_blk