/*
 * Shared glue code for 128bit block ciphers, AVX assembler macros
 *
 * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 */

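/* Load eight consecutive 16-byte blocks from src into x0..x7 (unaligned loads). */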
#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu (0*16)(src), x0; \
	vmovdqu (1*16)(src), x1; \
	vmovdqu (2*16)(src), x2; \
	vmovdqu (3*16)(src), x3; \
	vmovdqu (4*16)(src), x4; \
	vmovdqu (5*16)(src), x5; \
	vmovdqu (6*16)(src), x6; \
	vmovdqu (7*16)(src), x7;

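/* Store x0..x7 to eight consecutive 16-byte blocks at dst (unaligned stores). */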
#define store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu x0, (0*16)(dst); \
	vmovdqu x1, (1*16)(dst); \
	vmovdqu x2, (2*16)(dst); \
	vmovdqu x3, (3*16)(dst); \
	vmovdqu x4, (4*16)(dst); \
	vmovdqu x5, (5*16)(dst); \
	vmovdqu x6, (6*16)(dst); \
	vmovdqu x7, (7*16)(dst);

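/*
 * CBC decryption output: XOR each decrypted block x1..x7 with the preceding
 * ciphertext block read from src, then store all eight blocks to dst.  The
 * first block x0 is stored as-is; chaining it with the IV is left to the
 * caller.
 */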
#define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*16)(src), x1, x1; \
	vpxor (1*16)(src), x2, x2; \
	vpxor (2*16)(src), x3, x3; \
	vpxor (3*16)(src), x4, x4; \
	vpxor (4*16)(src), x5, x5; \
	vpxor (5*16)(src), x6, x6; \
	vpxor (6*16)(src), x7, x7; \
	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

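/*
 * Increment the 128-bit little-endian counter in x by one.  minus_one must
 * hold { low: -1, high: 0 }.  vpcmpeqq flags a low qword equal to -1 (about
 * to wrap), vpsubq adds one to the low qword, and the compare mask shifted
 * into the high lane then carries one into the high qword.
 */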
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

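/*
 * Load the little-endian 128-bit counter from (iv) and expand it into eight
 * consecutive counter blocks: each value is byte-swapped through the bswap
 * shuffle mask into x0..x7, and the counter advanced by eight is written
 * back to (iv).  t0, t1 and t2 are clobbered as temporaries.
 */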
#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \
	vpcmpeqd t0, t0, t0; \
	vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \
	vmovdqa bswap, t1; \
	\
	/* load IV and byteswap */ \
	vmovdqu (iv), x7; \
	vpshufb t1, x7, x0; \
	\
	/* construct IVs */ \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x1; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x2; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x3; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x4; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x5; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x6; \
	inc_le128(x7, t0, t2); \
	vmovdqa x7, t2; \
	vpshufb t1, x7, x7; \
	inc_le128(t2, t0, t1); \
	vmovdqu t2, (iv);

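/*
 * CTR combine step: XOR the eight encrypted counter blocks x0..x7 with the
 * corresponding source blocks and store the result to dst.
 */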
#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*16)(src), x0, x0; \
	vpxor (1*16)(src), x1, x1; \
	vpxor (2*16)(src), x2, x2; \
	vpxor (3*16)(src), x3, x3; \
	vpxor (4*16)(src), x4, x4; \
	vpxor (5*16)(src), x5, x5; \
	vpxor (6*16)(src), x6, x6; \
	vpxor (7*16)(src), x7, x7; \
	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
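
/*
 * Usage sketch (illustrative only).  The routine name, register aliases,
 * the __cipher_enc_blk8 helper and the .Lbswap128_mask constant below are
 * hypothetical stand-ins for what a cipher implementation built on these
 * macros would define; they are not part of this file.  Calling convention
 * assumed: %rdi ctx, %rsi dst, %rdx src, %rcx little-endian 128-bit counter.
 *
 *	ENTRY(cipher_ctr_8way_avx)
 *		load_ctr_8way(%rcx, .Lbswap128_mask, X0, X1, X2, X3,
 *			      X4, X5, X6, X7, T0, T1, T2);
 *		call __cipher_enc_blk8;
 *		store_ctr_8way(%rdx, %rsi, X0, X1, X2, X3, X4, X5, X6, X7);
 *		ret;
 *	ENDPROC(cipher_ctr_8way_avx)
 */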