blob: a7ce819ff71cab664a3871b6ddea3d6c1b10c042 [file] [log] [blame]
Bruce Beare8ff1a272010-03-04 11:03:37 -08001/*
2Copyright (c) 2010, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8 * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10
11 * Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14
15 * Neither the name of Intel Corporation nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
/* Symbol name under which the routine is assembled.  Defining MEMCMP before
   this point emits the same code under a different name.  */
31#ifndef MEMCMP
32# define MEMCMP ssse3_memcmp3_new
33#endif
34
/* L(label) pastes ".L" in front of the label name, giving every internal
   branch target a ".L"-prefixed (non-exported) symbol.  */
35#ifndef L
36# define L(label) .L##label
37#endif
38
/* ALIGN(n) expands to ".p2align n": the argument is a power of two, so
   ALIGN (4) asks for 16-byte alignment.  */
39#ifndef ALIGN
40# define ALIGN(n) .p2align n
41#endif
42
/* Function-style wrappers around the assembler's .cfi_* unwind directives.
   Each one is only defined if nothing defined it earlier, so a build can
   substitute its own versions.  */
43#ifndef cfi_startproc
44# define cfi_startproc .cfi_startproc
45#endif
46
47#ifndef cfi_endproc
48# define cfi_endproc .cfi_endproc
49#endif
50
51#ifndef cfi_rel_offset
52# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
53#endif
54
55#ifndef cfi_restore
56# define cfi_restore(reg) .cfi_restore (reg)
57#endif
58
59#ifndef cfi_adjust_cfa_offset
60# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
61#endif
62
/* ENTRY(name): marks name as a global function symbol, aligns it with
   .p2align 4, emits the label and opens the CFI region (cfi_startproc).  */
63#ifndef ENTRY
64# define ENTRY(name) \
65 .type name, @function; \
66 .globl name; \
67 .p2align 4; \
68name: \
69 cfi_startproc
70#endif
71
/* END(name): closes the CFI region and records the symbol size as the
   distance from name to the current location.  */
72#ifndef END
73# define END(name) \
74 cfi_endproc; \
75 .size name, .-name
76#endif
77
/* Unwind bookkeeping for one 4-byte stack slot, without touching the stack.
   CFI_PUSH: the CFA offset grows by 4 and REG is recorded as saved at
   relative offset 0.  CFI_POP: the CFA offset shrinks by 4 and REG is marked
   as restored.  They are used on their own after a jmp/ret to re-state the
   frame layout that holds at the next label.  */
78#define CFI_PUSH(REG) \
79 cfi_adjust_cfa_offset (4); \
80 cfi_rel_offset (REG, 0)
81
82#define CFI_POP(REG) \
83 cfi_adjust_cfa_offset (-4); \
84 cfi_restore (REG)
85
/* Real pushl/popl, each followed by the matching CFI annotation so the
   unwind information stays in step with %esp.  */
86#define PUSH(REG) pushl REG; CFI_PUSH (REG)
87#define POP(REG) popl REG; CFI_POP (REG)
88
/* Displacements of the three arguments from %esp, valid only while no
   register has been pushed (they are read at the very start of the
   function).  PARMS = 4 skips the first 4 bytes on the stack (the return
   address on i386).  BLK2 and LEN expand to unparenthesised sums
   (4+4, 4+4+4) that are used directly as displacements.  */
89#define PARMS 4
90#define BLK1 PARMS
91#define BLK2 BLK1+4
92#define LEN BLK2+4
/* RETURN_END: restore the three saved registers in reverse push order
   (the frame is pushed as ebx, esi, edi) and return.
   RETURN: the same, followed by annotation-only CFI_PUSHes that re-declare
   the three-register frame for the code that follows the ret in the file.
   The re-declaration must list the registers in the real push order
   (ebx, esi, edi) so that each register is recorded in the slot that
   actually holds it; this matches the "CFI_PUSH (%esi); CFI_PUSH (%edi)"
   pairs used after every POP/POP/jmp sequence in the function body.  */
93#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
94#define RETURN RETURN_END; CFI_PUSH (%ebx); CFI_PUSH (%esi); \
95 CFI_PUSH (%edi)
96
/* Entry.  Loads the arguments (ecx = LEN, eax = BLK1, edx = BLK2) from the
   stack and dispatches on the length:
     LEN >= 48 -> L(48bytesormore)
     LEN <= 1  -> L(less1bytes)
     otherwise -> %ebx is saved, eax and edx are advanced by ecx so that both
                  point just past their buffers, then L(less48bytes).
   The movl between "cmp $48" and jae does not modify the flags.  */
97 .section .text.ssse3,"ax",@progbits
98ENTRY (MEMCMP)
99 movl LEN(%esp), %ecx
100 movl BLK1(%esp), %eax
101 cmp $48, %ecx
102 movl BLK2(%esp), %edx
103 jae L(48bytesormore)
104 cmp $1, %ecx
105 jbe L(less1bytes)
106 PUSH (%ebx)
107 add %ecx, %edx
108 add %ecx, %eax
109 jmp L(less48bytes)
110
/* Lengths 0 and 1.  Nothing is pushed on this path, so CFI_POP cancels the
   PUSH (%ebx) annotation left by the code just above.  The flags on entry
   are still those of "cmp $1, %ecx": below means length 0 -> L(zero).
   For length 1 the two bytes are compared as unsigned values; equal ->
   L(zero), otherwise the result is 1 when the blk1 byte is larger (ja,
   evaluated on the cmp flags because mov does not change them) and -1 when
   it is smaller.  */
111 CFI_POP (%ebx)
112 ALIGN (4)
113L(less1bytes):
114 jb L(zero)
115 movb (%eax), %cl
116 cmp (%edx), %cl
117 je L(zero)
118 mov $1, %eax
119 ja L(1bytesend)
120 neg %eax
121L(1bytesend):
122 ret
123
/* Return 0 (length 0, or the single compared byte was equal).  Reached only
   from L(less1bytes), where nothing has been pushed, so a bare ret is
   correct.  xor is used to clear %eax, as elsewhere in this file; the flags
   it changes are not needed before the ret.  */
124 ALIGN (4)
125L(zero):
126 xor %eax, %eax
127 ret
128
/* Length >= 48.  Saves ebx, esi and edi (in that order), then:
   1. Compares the first 16 bytes with unaligned loads.  pmovmskb yields
      0xffff when all 16 bytes are equal, so "sub $0xffff, %edx" is non-zero
      on a mismatch; the lea that follows does not change the flags.  On a
      mismatch control goes to L(less16bytes) with edi/esi 16 bytes past the
      start of each buffer.
   2. Otherwise rounds edi (blk1 + 16) down to a 16-byte boundary, moves esi
      back by the same amount and adds that amount to ecx.
   3. Takes the low four bits of esi as the shift.  Shift 0 goes straight to
      L(shr_0); for any other shift esi is rounded down as well and the
      shift, left in edx, selects L(shr_1)..L(shr_7) here or
      L(shr_8)..L(shr_15) via L(next_unaligned_table).
   Because the zero shift has already left through "jz L(shr_0)", edx is in
   the range 1..7 in the compare chain below and no test for 0 is needed.  */
129 ALIGN (4)
130L(48bytesormore):
131 PUSH (%ebx)
132 PUSH (%esi)
133 PUSH (%edi)
134 movdqu (%eax), %xmm3
135 movdqu (%edx), %xmm0
136 movl %eax, %edi
137 movl %edx, %esi
138 pcmpeqb %xmm0, %xmm3
139 pmovmskb %xmm3, %edx
140 lea 16(%edi), %edi
141
142 sub $0xffff, %edx
143 lea 16(%esi), %esi
144 jnz L(less16bytes)
145 mov %edi, %edx
146 and $0xf, %edx
147 xor %edx, %edi
148 sub %edx, %esi
149 add %edx, %ecx
150 mov %esi, %edx
151 and $0xf, %edx
152 jz L(shr_0)
153 xor %edx, %esi
154
155 cmp $8, %edx
156 jae L(next_unaligned_table)
159 cmp $1, %edx
160 je L(shr_1)
161 cmp $2, %edx
162 je L(shr_2)
163 cmp $3, %edx
164 je L(shr_3)
165 cmp $4, %edx
166 je L(shr_4)
167 cmp $5, %edx
168 je L(shr_5)
169 cmp $6, %edx
170 je L(shr_6)
171 jmp L(shr_7)
172
/* Second half of the shift dispatch, entered from L(48bytesormore) when
   edx >= 8.  edx holds the low four bits of blk2's address, so after the
   tests for 8..14 the only remaining value is 15.  */
173 ALIGN (4)
174L(next_unaligned_table):
175 cmp $8, %edx
176 je L(shr_8)
177 cmp $9, %edx
178 je L(shr_9)
179 cmp $10, %edx
180 je L(shr_10)
181 cmp $11, %edx
182 je L(shr_11)
183 cmp $12, %edx
184 je L(shr_12)
185 cmp $13, %edx
186 je L(shr_13)
187 cmp $14, %edx
188 je L(shr_14)
189 jmp L(shr_15)
190
/* L(shr_0): esi and edi are both 16-byte aligned (shift 0), so plain aligned
   loads are used.  ecx >= 80 takes the loop in L(shr_0_gobble).  Otherwise
   one 32-byte round is compared; %eax is cleared because L(exit) adds %eax
   (the shift) to %esi.  xmm1 keeps the result for the first 16 bytes, which
   L(exit) inspects.  With no mismatch, %edi and %esi are popped and
   L(less48bytes) receives %eax = %edi + %ecx and %edx = %esi + %ecx.  */
191 ALIGN (4)
192L(shr_0):
193 cmp $80, %ecx
194 jae L(shr_0_gobble)
195 lea -48(%ecx), %ecx
196 xor %eax, %eax
197 movaps (%esi), %xmm1
198 pcmpeqb (%edi), %xmm1
199 movaps 16(%esi), %xmm2
200 pcmpeqb 16(%edi), %xmm2
201 pand %xmm1, %xmm2
202 pmovmskb %xmm2, %edx
203 add $32, %edi
204 add $32, %esi
205 sub $0xffff, %edx
206 jnz L(exit)
207
208 lea (%ecx, %edi,1), %eax
209 lea (%ecx, %esi,1), %edx
210 POP (%edi)
211 POP (%esi)
212 jmp L(less48bytes)
213
/* L(shr_0_gobble): 32 bytes per iteration with both pointers aligned.  The
   two CFI_PUSH lines re-declare esi/edi as saved after the POPs above.
   Inside the loop the compares for the next round are issued while the mask
   of the current round is evaluated.  "sub $32, %ecx" leaves its borrow in
   CF; the SSE instructions and lea that follow do not write the flags, so
   "sbb $0xffff, %edx" is zero only when all 32 bytes matched and %ecx did
   not drop below zero, and jz loops on exactly that result.
   After the loop a negative %ecx is corrected by +32 and the borrow is
   taken back out of %edx (inc); a non-zero %edx then means a mismatch and
   goes to L(exit).  The round prepared last is checked the same way before
   the tail goes to L(less48bytes) as %eax = %edi + %ecx,
   %edx = %esi + %ecx.  %eax is zero (shift 0) whenever L(exit) is reached.  */
214 CFI_PUSH (%esi)
215 CFI_PUSH (%edi)
216 ALIGN (4)
217L(shr_0_gobble):
218 lea -48(%ecx), %ecx
219 movdqa (%esi), %xmm0
220 xor %eax, %eax
221 pcmpeqb (%edi), %xmm0
222 sub $32, %ecx
223 movdqa 16(%esi), %xmm2
224 pcmpeqb 16(%edi), %xmm2
225L(shr_0_gobble_loop):
226 pand %xmm0, %xmm2
227 sub $32, %ecx
228 pmovmskb %xmm2, %edx
229 movdqa %xmm0, %xmm1
230 movdqa 32(%esi), %xmm0
231 movdqa 48(%esi), %xmm2
232 sbb $0xffff, %edx
233 pcmpeqb 32(%edi), %xmm0
234 pcmpeqb 48(%edi), %xmm2
235 lea 32(%edi), %edi
236 lea 32(%esi), %esi
237 jz L(shr_0_gobble_loop)
238
239 pand %xmm0, %xmm2
240 cmp $0, %ecx
241 jge L(shr_0_gobble_loop_next)
242 inc %edx
243 add $32, %ecx
244L(shr_0_gobble_loop_next):
245 test %edx, %edx
246 jnz L(exit)
247
248 pmovmskb %xmm2, %edx
249 movdqa %xmm0, %xmm1
250 lea 32(%edi), %edi
251 lea 32(%esi), %esi
252 sub $0xffff, %edx
253 jnz L(exit)
254 lea (%ecx, %edi,1), %eax
255 lea (%ecx, %esi,1), %edx
256 POP (%edi)
257 POP (%esi)
258 jmp L(less48bytes)
259
/* L(shr_1): blk2 lies 1 byte past a 16-byte boundary (entered with edx == 1
   and %esi already rounded down).  The CFI_PUSH pair re-declares esi/edi as
   saved.  The shift is copied to %eax, which L(exit) adds back to %esi.
   lea and mov leave the flags alone, so jae still tests "cmp $80, %ecx".
   Short case: one 32-byte round in which palignr $1 joins adjacent aligned
   loads from %esi into the 16 bytes that pair with each aligned chunk at
   %edi.  Mismatch -> L(exit); otherwise L(less48bytes) receives
   %eax = %edi + %ecx and %edx = %esi + 1 + %ecx.  */
260 CFI_PUSH (%esi)
261 CFI_PUSH (%edi)
262 ALIGN (4)
263L(shr_1):
264 cmp $80, %ecx
265 lea -48(%ecx), %ecx
266 mov %edx, %eax
267 jae L(shr_1_gobble)
268
269 movdqa 16(%esi), %xmm1
270 movdqa %xmm1, %xmm2
271 palignr $1,(%esi), %xmm1
272 pcmpeqb (%edi), %xmm1
273
274 movdqa 32(%esi), %xmm3
275 palignr $1,%xmm2, %xmm3
276 pcmpeqb 16(%edi), %xmm3
277
278 pand %xmm1, %xmm3
279 pmovmskb %xmm3, %edx
280 lea 32(%edi), %edi
281 lea 32(%esi), %esi
282 sub $0xffff, %edx
283 jnz L(exit)
284 lea (%ecx, %edi,1), %eax
285 lea 1(%ecx, %esi,1), %edx
286 POP (%edi)
287 POP (%esi)
288 jmp L(less48bytes)
289
/* L(shr_1_gobble): 32 bytes per iteration for shift 1.  The next round's
   loads/palignr/pcmpeqb are issued while the current mask is evaluated.
   "sub $32, %ecx" leaves its borrow in CF; the SSE instructions and lea in
   between do not write the flags, so "sbb $0xffff, %edx" is zero only when
   all 32 bytes matched and %ecx did not drop below zero; jz loops on that.
   After the loop a negative %ecx is corrected by +32 and the borrow removed
   from %edx (inc); non-zero %edx -> L(exit).  The round prepared last is
   then checked before the tail goes to L(less48bytes) as
   %eax = %edi + %ecx, %edx = %esi + 1 + %ecx.  */
290 CFI_PUSH (%esi)
291 CFI_PUSH (%edi)
292 ALIGN (4)
293L(shr_1_gobble):
294 sub $32, %ecx
295 movdqa 16(%esi), %xmm0
296 palignr $1,(%esi), %xmm0
297 pcmpeqb (%edi), %xmm0
298
299 movdqa 32(%esi), %xmm3
300 palignr $1,16(%esi), %xmm3
301 pcmpeqb 16(%edi), %xmm3
302
303L(shr_1_gobble_loop):
304 pand %xmm0, %xmm3
305 sub $32, %ecx
306 pmovmskb %xmm3, %edx
307 movdqa %xmm0, %xmm1
308
309 movdqa 64(%esi), %xmm3
310 palignr $1,48(%esi), %xmm3
311 sbb $0xffff, %edx
312 movdqa 48(%esi), %xmm0
313 palignr $1,32(%esi), %xmm0
314 pcmpeqb 32(%edi), %xmm0
315 lea 32(%esi), %esi
316 pcmpeqb 48(%edi), %xmm3
317
318 lea 32(%edi), %edi
319 jz L(shr_1_gobble_loop)
320 pand %xmm0, %xmm3
321
322 cmp $0, %ecx
323 jge L(shr_1_gobble_next)
324 inc %edx
325 add $32, %ecx
326L(shr_1_gobble_next):
327 test %edx, %edx
328 jnz L(exit)
329
330 pmovmskb %xmm3, %edx
331 movdqa %xmm0, %xmm1
332 lea 32(%edi), %edi
333 lea 32(%esi), %esi
334 sub $0xffff, %edx
335 jnz L(exit)
336
337 lea (%ecx, %edi,1), %eax
338 lea 1(%ecx, %esi,1), %edx
339 POP (%edi)
340 POP (%esi)
341 jmp L(less48bytes)
342
/* L(shr_2): blk2 lies 2 bytes past a 16-byte boundary (entered with
   edx == 2 and %esi already rounded down).  The shift is copied to %eax for
   L(exit); lea and mov keep the flags of "cmp $80, %ecx" for jae.
   Short case: one 32-byte round, palignr $2 rebuilding the blk2 bytes that
   pair with each aligned chunk at %edi.  Mismatch -> L(exit); otherwise
   L(less48bytes) receives %eax = %edi + %ecx, %edx = %esi + 2 + %ecx.  */
343 CFI_PUSH (%esi)
344 CFI_PUSH (%edi)
345 ALIGN (4)
346L(shr_2):
347 cmp $80, %ecx
348 lea -48(%ecx), %ecx
349 mov %edx, %eax
350 jae L(shr_2_gobble)
351
352 movdqa 16(%esi), %xmm1
353 movdqa %xmm1, %xmm2
354 palignr $2,(%esi), %xmm1
355 pcmpeqb (%edi), %xmm1
356
357 movdqa 32(%esi), %xmm3
358 palignr $2,%xmm2, %xmm3
359 pcmpeqb 16(%edi), %xmm3
360
361 pand %xmm1, %xmm3
362 pmovmskb %xmm3, %edx
363 lea 32(%edi), %edi
364 lea 32(%esi), %esi
365 sub $0xffff, %edx
366 jnz L(exit)
367 lea (%ecx, %edi,1), %eax
368 lea 2(%ecx, %esi,1), %edx
369 POP (%edi)
370 POP (%esi)
371 jmp L(less48bytes)
372
/* L(shr_2_gobble): 32 bytes per iteration for shift 2.  The borrow of
   "sub $32, %ecx" survives in CF across the flag-neutral SSE instructions
   and lea, so "sbb $0xffff, %edx" is zero only when all 32 bytes matched
   and %ecx did not drop below zero; jz loops on that.  After the loop a
   negative %ecx is corrected by +32 and the borrow removed from %edx (inc);
   non-zero %edx -> L(exit).  The round prepared last is then checked before
   the tail goes to L(less48bytes) as %eax = %edi + %ecx,
   %edx = %esi + 2 + %ecx.  */
373 CFI_PUSH (%esi)
374 CFI_PUSH (%edi)
375 ALIGN (4)
376L(shr_2_gobble):
377 sub $32, %ecx
378 movdqa 16(%esi), %xmm0
379 palignr $2,(%esi), %xmm0
380 pcmpeqb (%edi), %xmm0
381
382 movdqa 32(%esi), %xmm3
383 palignr $2,16(%esi), %xmm3
384 pcmpeqb 16(%edi), %xmm3
385
386L(shr_2_gobble_loop):
387 pand %xmm0, %xmm3
388 sub $32, %ecx
389 pmovmskb %xmm3, %edx
390 movdqa %xmm0, %xmm1
391
392 movdqa 64(%esi), %xmm3
393 palignr $2,48(%esi), %xmm3
394 sbb $0xffff, %edx
395 movdqa 48(%esi), %xmm0
396 palignr $2,32(%esi), %xmm0
397 pcmpeqb 32(%edi), %xmm0
398 lea 32(%esi), %esi
399 pcmpeqb 48(%edi), %xmm3
400
401 lea 32(%edi), %edi
402 jz L(shr_2_gobble_loop)
403 pand %xmm0, %xmm3
404
405 cmp $0, %ecx
406 jge L(shr_2_gobble_next)
407 inc %edx
408 add $32, %ecx
409L(shr_2_gobble_next):
410 test %edx, %edx
411 jnz L(exit)
412
413 pmovmskb %xmm3, %edx
414 movdqa %xmm0, %xmm1
415 lea 32(%edi), %edi
416 lea 32(%esi), %esi
417 sub $0xffff, %edx
418 jnz L(exit)
419
420 lea (%ecx, %edi,1), %eax
421 lea 2(%ecx, %esi,1), %edx
422 POP (%edi)
423 POP (%esi)
424 jmp L(less48bytes)
425
/* L(shr_3): blk2 lies 3 bytes past a 16-byte boundary (entered with
   edx == 3 and %esi already rounded down).  The shift is copied to %eax for
   L(exit); lea and mov keep the flags of "cmp $80, %ecx" for jae.
   Short case: one 32-byte round, palignr $3 rebuilding the blk2 bytes that
   pair with each aligned chunk at %edi.  Mismatch -> L(exit); otherwise
   L(less48bytes) receives %eax = %edi + %ecx, %edx = %esi + 3 + %ecx.  */
426 CFI_PUSH (%esi)
427 CFI_PUSH (%edi)
428 ALIGN (4)
429L(shr_3):
430 cmp $80, %ecx
431 lea -48(%ecx), %ecx
432 mov %edx, %eax
433 jae L(shr_3_gobble)
434
435 movdqa 16(%esi), %xmm1
436 movdqa %xmm1, %xmm2
437 palignr $3,(%esi), %xmm1
438 pcmpeqb (%edi), %xmm1
439
440 movdqa 32(%esi), %xmm3
441 palignr $3,%xmm2, %xmm3
442 pcmpeqb 16(%edi), %xmm3
443
444 pand %xmm1, %xmm3
445 pmovmskb %xmm3, %edx
446 lea 32(%edi), %edi
447 lea 32(%esi), %esi
448 sub $0xffff, %edx
449 jnz L(exit)
450 lea (%ecx, %edi,1), %eax
451 lea 3(%ecx, %esi,1), %edx
452 POP (%edi)
453 POP (%esi)
454 jmp L(less48bytes)
455
/* L(shr_3_gobble): 32 bytes per iteration for shift 3.  The borrow of
   "sub $32, %ecx" survives in CF across the flag-neutral SSE instructions
   and lea, so "sbb $0xffff, %edx" is zero only when all 32 bytes matched
   and %ecx did not drop below zero; jz loops on that.  After the loop a
   negative %ecx is corrected by +32 and the borrow removed from %edx (inc);
   non-zero %edx -> L(exit).  The round prepared last is then checked before
   the tail goes to L(less48bytes) as %eax = %edi + %ecx,
   %edx = %esi + 3 + %ecx.  */
456 CFI_PUSH (%esi)
457 CFI_PUSH (%edi)
458 ALIGN (4)
459L(shr_3_gobble):
460 sub $32, %ecx
461 movdqa 16(%esi), %xmm0
462 palignr $3,(%esi), %xmm0
463 pcmpeqb (%edi), %xmm0
464
465 movdqa 32(%esi), %xmm3
466 palignr $3,16(%esi), %xmm3
467 pcmpeqb 16(%edi), %xmm3
468
469L(shr_3_gobble_loop):
470 pand %xmm0, %xmm3
471 sub $32, %ecx
472 pmovmskb %xmm3, %edx
473 movdqa %xmm0, %xmm1
474
475 movdqa 64(%esi), %xmm3
476 palignr $3,48(%esi), %xmm3
477 sbb $0xffff, %edx
478 movdqa 48(%esi), %xmm0
479 palignr $3,32(%esi), %xmm0
480 pcmpeqb 32(%edi), %xmm0
481 lea 32(%esi), %esi
482 pcmpeqb 48(%edi), %xmm3
483
484 lea 32(%edi), %edi
485 jz L(shr_3_gobble_loop)
486 pand %xmm0, %xmm3
487
488 cmp $0, %ecx
489 jge L(shr_3_gobble_next)
490 inc %edx
491 add $32, %ecx
492L(shr_3_gobble_next):
493 test %edx, %edx
494 jnz L(exit)
495
496 pmovmskb %xmm3, %edx
497 movdqa %xmm0, %xmm1
498 lea 32(%edi), %edi
499 lea 32(%esi), %esi
500 sub $0xffff, %edx
501 jnz L(exit)
502
503 lea (%ecx, %edi,1), %eax
504 lea 3(%ecx, %esi,1), %edx
505 POP (%edi)
506 POP (%esi)
507 jmp L(less48bytes)
508
/* L(shr_4): blk2 lies 4 bytes past a 16-byte boundary (entered with
   edx == 4 and %esi already rounded down).  The shift is copied to %eax for
   L(exit); lea and mov keep the flags of "cmp $80, %ecx" for jae.
   Short case: one 32-byte round, palignr $4 rebuilding the blk2 bytes that
   pair with each aligned chunk at %edi.  Mismatch -> L(exit); otherwise
   L(less48bytes) receives %eax = %edi + %ecx, %edx = %esi + 4 + %ecx.  */
509 CFI_PUSH (%esi)
510 CFI_PUSH (%edi)
511 ALIGN (4)
512L(shr_4):
513 cmp $80, %ecx
514 lea -48(%ecx), %ecx
515 mov %edx, %eax
516 jae L(shr_4_gobble)
517
518 movdqa 16(%esi), %xmm1
519 movdqa %xmm1, %xmm2
520 palignr $4,(%esi), %xmm1
521 pcmpeqb (%edi), %xmm1
522
523 movdqa 32(%esi), %xmm3
524 palignr $4,%xmm2, %xmm3
525 pcmpeqb 16(%edi), %xmm3
526
527 pand %xmm1, %xmm3
528 pmovmskb %xmm3, %edx
529 lea 32(%edi), %edi
530 lea 32(%esi), %esi
531 sub $0xffff, %edx
532 jnz L(exit)
533 lea (%ecx, %edi,1), %eax
534 lea 4(%ecx, %esi,1), %edx
535 POP (%edi)
536 POP (%esi)
537 jmp L(less48bytes)
538
/* L(shr_4_gobble): 32 bytes per iteration for shift 4.  The borrow of
   "sub $32, %ecx" survives in CF across the flag-neutral SSE instructions
   and lea, so "sbb $0xffff, %edx" is zero only when all 32 bytes matched
   and %ecx did not drop below zero; jz loops on that.  After the loop a
   negative %ecx is corrected by +32 and the borrow removed from %edx (inc);
   non-zero %edx -> L(exit).  The round prepared last is then checked before
   the tail goes to L(less48bytes) as %eax = %edi + %ecx,
   %edx = %esi + 4 + %ecx.  */
539 CFI_PUSH (%esi)
540 CFI_PUSH (%edi)
541 ALIGN (4)
542L(shr_4_gobble):
543 sub $32, %ecx
544 movdqa 16(%esi), %xmm0
545 palignr $4,(%esi), %xmm0
546 pcmpeqb (%edi), %xmm0
547
548 movdqa 32(%esi), %xmm3
549 palignr $4,16(%esi), %xmm3
550 pcmpeqb 16(%edi), %xmm3
551
552L(shr_4_gobble_loop):
553 pand %xmm0, %xmm3
554 sub $32, %ecx
555 pmovmskb %xmm3, %edx
556 movdqa %xmm0, %xmm1
557
558 movdqa 64(%esi), %xmm3
559 palignr $4,48(%esi), %xmm3
560 sbb $0xffff, %edx
561 movdqa 48(%esi), %xmm0
562 palignr $4,32(%esi), %xmm0
563 pcmpeqb 32(%edi), %xmm0
564 lea 32(%esi), %esi
565 pcmpeqb 48(%edi), %xmm3
566
567 lea 32(%edi), %edi
568 jz L(shr_4_gobble_loop)
569 pand %xmm0, %xmm3
570
571 cmp $0, %ecx
572 jge L(shr_4_gobble_next)
573 inc %edx
574 add $32, %ecx
575L(shr_4_gobble_next):
576 test %edx, %edx
577 jnz L(exit)
578
579 pmovmskb %xmm3, %edx
580 movdqa %xmm0, %xmm1
581 lea 32(%edi), %edi
582 lea 32(%esi), %esi
583 sub $0xffff, %edx
584 jnz L(exit)
585
586 lea (%ecx, %edi,1), %eax
587 lea 4(%ecx, %esi,1), %edx
588 POP (%edi)
589 POP (%esi)
590 jmp L(less48bytes)
591
/* L(shr_5): blk2 lies 5 bytes past a 16-byte boundary (entered with
   edx == 5 and %esi already rounded down).  The shift is copied to %eax for
   L(exit); lea and mov keep the flags of "cmp $80, %ecx" for jae.
   Short case: one 32-byte round, palignr $5 rebuilding the blk2 bytes that
   pair with each aligned chunk at %edi.  Mismatch -> L(exit); otherwise
   L(less48bytes) receives %eax = %edi + %ecx, %edx = %esi + 5 + %ecx.  */
592 CFI_PUSH (%esi)
593 CFI_PUSH (%edi)
594 ALIGN (4)
595L(shr_5):
596 cmp $80, %ecx
597 lea -48(%ecx), %ecx
598 mov %edx, %eax
599 jae L(shr_5_gobble)
600
601 movdqa 16(%esi), %xmm1
602 movdqa %xmm1, %xmm2
603 palignr $5,(%esi), %xmm1
604 pcmpeqb (%edi), %xmm1
605
606 movdqa 32(%esi), %xmm3
607 palignr $5,%xmm2, %xmm3
608 pcmpeqb 16(%edi), %xmm3
609
610 pand %xmm1, %xmm3
611 pmovmskb %xmm3, %edx
612 lea 32(%edi), %edi
613 lea 32(%esi), %esi
614 sub $0xffff, %edx
615 jnz L(exit)
616 lea (%ecx, %edi,1), %eax
617 lea 5(%ecx, %esi,1), %edx
618 POP (%edi)
619 POP (%esi)
620 jmp L(less48bytes)
621
/* L(shr_5_gobble): 32 bytes per iteration for shift 5.  The borrow of
   "sub $32, %ecx" survives in CF across the flag-neutral SSE instructions
   and lea, so "sbb $0xffff, %edx" is zero only when all 32 bytes matched
   and %ecx did not drop below zero; jz loops on that.  After the loop a
   negative %ecx is corrected by +32 and the borrow removed from %edx (inc);
   non-zero %edx -> L(exit).  The round prepared last is then checked before
   the tail goes to L(less48bytes) as %eax = %edi + %ecx,
   %edx = %esi + 5 + %ecx.  */
622 CFI_PUSH (%esi)
623 CFI_PUSH (%edi)
624 ALIGN (4)
625L(shr_5_gobble):
626 sub $32, %ecx
627 movdqa 16(%esi), %xmm0
628 palignr $5,(%esi), %xmm0
629 pcmpeqb (%edi), %xmm0
630
631 movdqa 32(%esi), %xmm3
632 palignr $5,16(%esi), %xmm3
633 pcmpeqb 16(%edi), %xmm3
634
635L(shr_5_gobble_loop):
636 pand %xmm0, %xmm3
637 sub $32, %ecx
638 pmovmskb %xmm3, %edx
639 movdqa %xmm0, %xmm1
640
641 movdqa 64(%esi), %xmm3
642 palignr $5,48(%esi), %xmm3
643 sbb $0xffff, %edx
644 movdqa 48(%esi), %xmm0
645 palignr $5,32(%esi), %xmm0
646 pcmpeqb 32(%edi), %xmm0
647 lea 32(%esi), %esi
648 pcmpeqb 48(%edi), %xmm3
649
650 lea 32(%edi), %edi
651 jz L(shr_5_gobble_loop)
652 pand %xmm0, %xmm3
653
654 cmp $0, %ecx
655 jge L(shr_5_gobble_next)
656 inc %edx
657 add $32, %ecx
658L(shr_5_gobble_next):
659 test %edx, %edx
660 jnz L(exit)
661
662 pmovmskb %xmm3, %edx
663 movdqa %xmm0, %xmm1
664 lea 32(%edi), %edi
665 lea 32(%esi), %esi
666 sub $0xffff, %edx
667 jnz L(exit)
668
669 lea (%ecx, %edi,1), %eax
670 lea 5(%ecx, %esi,1), %edx
671 POP (%edi)
672 POP (%esi)
673 jmp L(less48bytes)
674
/* L(shr_6): blk2 lies 6 bytes past a 16-byte boundary (entered with
   edx == 6 and %esi already rounded down).  The shift is copied to %eax for
   L(exit); lea and mov keep the flags of "cmp $80, %ecx" for jae.
   Short case: one 32-byte round, palignr $6 rebuilding the blk2 bytes that
   pair with each aligned chunk at %edi.  Mismatch -> L(exit); otherwise
   L(less48bytes) receives %eax = %edi + %ecx, %edx = %esi + 6 + %ecx.  */
675 CFI_PUSH (%esi)
676 CFI_PUSH (%edi)
677 ALIGN (4)
678L(shr_6):
679 cmp $80, %ecx
680 lea -48(%ecx), %ecx
681 mov %edx, %eax
682 jae L(shr_6_gobble)
683
684 movdqa 16(%esi), %xmm1
685 movdqa %xmm1, %xmm2
686 palignr $6,(%esi), %xmm1
687 pcmpeqb (%edi), %xmm1
688
689 movdqa 32(%esi), %xmm3
690 palignr $6,%xmm2, %xmm3
691 pcmpeqb 16(%edi), %xmm3
692
693 pand %xmm1, %xmm3
694 pmovmskb %xmm3, %edx
695 lea 32(%edi), %edi
696 lea 32(%esi), %esi
697 sub $0xffff, %edx
698 jnz L(exit)
699 lea (%ecx, %edi,1), %eax
700 lea 6(%ecx, %esi,1), %edx
701 POP (%edi)
702 POP (%esi)
703 jmp L(less48bytes)
704
/* L(shr_6_gobble): 32 bytes per iteration for shift 6.  The borrow of
   "sub $32, %ecx" survives in CF across the flag-neutral SSE instructions
   and lea, so "sbb $0xffff, %edx" is zero only when all 32 bytes matched
   and %ecx did not drop below zero; jz loops on that.  After the loop a
   negative %ecx is corrected by +32 and the borrow removed from %edx (inc);
   non-zero %edx -> L(exit).  The round prepared last is then checked before
   the tail goes to L(less48bytes) as %eax = %edi + %ecx,
   %edx = %esi + 6 + %ecx.  */
705 CFI_PUSH (%esi)
706 CFI_PUSH (%edi)
707 ALIGN (4)
708L(shr_6_gobble):
709 sub $32, %ecx
710 movdqa 16(%esi), %xmm0
711 palignr $6,(%esi), %xmm0
712 pcmpeqb (%edi), %xmm0
713
714 movdqa 32(%esi), %xmm3
715 palignr $6,16(%esi), %xmm3
716 pcmpeqb 16(%edi), %xmm3
717
718L(shr_6_gobble_loop):
719 pand %xmm0, %xmm3
720 sub $32, %ecx
721 pmovmskb %xmm3, %edx
722 movdqa %xmm0, %xmm1
723
724 movdqa 64(%esi), %xmm3
725 palignr $6,48(%esi), %xmm3
726 sbb $0xffff, %edx
727 movdqa 48(%esi), %xmm0
728 palignr $6,32(%esi), %xmm0
729 pcmpeqb 32(%edi), %xmm0
730 lea 32(%esi), %esi
731 pcmpeqb 48(%edi), %xmm3
732
733 lea 32(%edi), %edi
734 jz L(shr_6_gobble_loop)
735 pand %xmm0, %xmm3
736
737 cmp $0, %ecx
738 jge L(shr_6_gobble_next)
739 inc %edx
740 add $32, %ecx
741L(shr_6_gobble_next):
742 test %edx, %edx
743 jnz L(exit)
744
745 pmovmskb %xmm3, %edx
746 movdqa %xmm0, %xmm1
747 lea 32(%edi), %edi
748 lea 32(%esi), %esi
749 sub $0xffff, %edx
750 jnz L(exit)
751
752 lea (%ecx, %edi,1), %eax
753 lea 6(%ecx, %esi,1), %edx
754 POP (%edi)
755 POP (%esi)
756 jmp L(less48bytes)
757
/* L(shr_7): blk2 lies 7 bytes past a 16-byte boundary (the fall-through
   case of the first dispatch chain; %esi is already rounded down).  The
   shift is copied to %eax for L(exit); lea and mov keep the flags of
   "cmp $80, %ecx" for jae.  Short case: one 32-byte round, palignr $7
   rebuilding the blk2 bytes that pair with each aligned chunk at %edi.
   Mismatch -> L(exit); otherwise L(less48bytes) receives
   %eax = %edi + %ecx, %edx = %esi + 7 + %ecx.  */
758 CFI_PUSH (%esi)
759 CFI_PUSH (%edi)
760 ALIGN (4)
761L(shr_7):
762 cmp $80, %ecx
763 lea -48(%ecx), %ecx
764 mov %edx, %eax
765 jae L(shr_7_gobble)
766
767 movdqa 16(%esi), %xmm1
768 movdqa %xmm1, %xmm2
769 palignr $7,(%esi), %xmm1
770 pcmpeqb (%edi), %xmm1
771
772 movdqa 32(%esi), %xmm3
773 palignr $7,%xmm2, %xmm3
774 pcmpeqb 16(%edi), %xmm3
775
776 pand %xmm1, %xmm3
777 pmovmskb %xmm3, %edx
778 lea 32(%edi), %edi
779 lea 32(%esi), %esi
780 sub $0xffff, %edx
781 jnz L(exit)
782 lea (%ecx, %edi,1), %eax
783 lea 7(%ecx, %esi,1), %edx
784 POP (%edi)
785 POP (%esi)
786 jmp L(less48bytes)
787
/* L(shr_7_gobble): 32 bytes per iteration for shift 7.  The borrow of
   "sub $32, %ecx" survives in CF across the flag-neutral SSE instructions
   and lea, so "sbb $0xffff, %edx" is zero only when all 32 bytes matched
   and %ecx did not drop below zero; jz loops on that.  After the loop a
   negative %ecx is corrected by +32 and the borrow removed from %edx (inc);
   non-zero %edx -> L(exit).  The round prepared last is then checked before
   the tail goes to L(less48bytes) as %eax = %edi + %ecx,
   %edx = %esi + 7 + %ecx.  */
788 CFI_PUSH (%esi)
789 CFI_PUSH (%edi)
790 ALIGN (4)
791L(shr_7_gobble):
792 sub $32, %ecx
793 movdqa 16(%esi), %xmm0
794 palignr $7,(%esi), %xmm0
795 pcmpeqb (%edi), %xmm0
796
797 movdqa 32(%esi), %xmm3
798 palignr $7,16(%esi), %xmm3
799 pcmpeqb 16(%edi), %xmm3
800
801L(shr_7_gobble_loop):
802 pand %xmm0, %xmm3
803 sub $32, %ecx
804 pmovmskb %xmm3, %edx
805 movdqa %xmm0, %xmm1
806
807 movdqa 64(%esi), %xmm3
808 palignr $7,48(%esi), %xmm3
809 sbb $0xffff, %edx
810 movdqa 48(%esi), %xmm0
811 palignr $7,32(%esi), %xmm0
812 pcmpeqb 32(%edi), %xmm0
813 lea 32(%esi), %esi
814 pcmpeqb 48(%edi), %xmm3
815
816 lea 32(%edi), %edi
817 jz L(shr_7_gobble_loop)
818 pand %xmm0, %xmm3
819
820 cmp $0, %ecx
821 jge L(shr_7_gobble_next)
822 inc %edx
823 add $32, %ecx
824L(shr_7_gobble_next):
825 test %edx, %edx
826 jnz L(exit)
827
828 pmovmskb %xmm3, %edx
829 movdqa %xmm0, %xmm1
830 lea 32(%edi), %edi
831 lea 32(%esi), %esi
832 sub $0xffff, %edx
833 jnz L(exit)
834
835 lea (%ecx, %edi,1), %eax
836 lea 7(%ecx, %esi,1), %edx
837 POP (%edi)
838 POP (%esi)
839 jmp L(less48bytes)
840
/* L(shr_8): blk2 lies 8 bytes past a 16-byte boundary (entered with
   edx == 8 and %esi already rounded down).  The shift is copied to %eax for
   L(exit); lea and mov keep the flags of "cmp $80, %ecx" for jae.
   Short case: one 32-byte round, palignr $8 rebuilding the blk2 bytes that
   pair with each aligned chunk at %edi.  Mismatch -> L(exit); otherwise
   L(less48bytes) receives %eax = %edi + %ecx, %edx = %esi + 8 + %ecx.  */
841 CFI_PUSH (%esi)
842 CFI_PUSH (%edi)
843 ALIGN (4)
844L(shr_8):
845 cmp $80, %ecx
846 lea -48(%ecx), %ecx
847 mov %edx, %eax
848 jae L(shr_8_gobble)
849
850 movdqa 16(%esi), %xmm1
851 movdqa %xmm1, %xmm2
852 palignr $8,(%esi), %xmm1
853 pcmpeqb (%edi), %xmm1
854
855 movdqa 32(%esi), %xmm3
856 palignr $8,%xmm2, %xmm3
857 pcmpeqb 16(%edi), %xmm3
858
859 pand %xmm1, %xmm3
860 pmovmskb %xmm3, %edx
861 lea 32(%edi), %edi
862 lea 32(%esi), %esi
863 sub $0xffff, %edx
864 jnz L(exit)
865 lea (%ecx, %edi,1), %eax
866 lea 8(%ecx, %esi,1), %edx
867 POP (%edi)
868 POP (%esi)
869 jmp L(less48bytes)
870
/* L(shr_8_gobble): 32 bytes per iteration for shift 8.  The borrow of
   "sub $32, %ecx" survives in CF across the flag-neutral SSE instructions
   and lea, so "sbb $0xffff, %edx" is zero only when all 32 bytes matched
   and %ecx did not drop below zero; jz loops on that.  After the loop a
   negative %ecx is corrected by +32 and the borrow removed from %edx (inc);
   non-zero %edx -> L(exit).  The round prepared last is then checked before
   the tail goes to L(less48bytes) as %eax = %edi + %ecx,
   %edx = %esi + 8 + %ecx.  */
871 CFI_PUSH (%esi)
872 CFI_PUSH (%edi)
873 ALIGN (4)
874L(shr_8_gobble):
875 sub $32, %ecx
876 movdqa 16(%esi), %xmm0
877 palignr $8,(%esi), %xmm0
878 pcmpeqb (%edi), %xmm0
879
880 movdqa 32(%esi), %xmm3
881 palignr $8,16(%esi), %xmm3
882 pcmpeqb 16(%edi), %xmm3
883
884L(shr_8_gobble_loop):
885 pand %xmm0, %xmm3
886 sub $32, %ecx
887 pmovmskb %xmm3, %edx
888 movdqa %xmm0, %xmm1
889
890 movdqa 64(%esi), %xmm3
891 palignr $8,48(%esi), %xmm3
892 sbb $0xffff, %edx
893 movdqa 48(%esi), %xmm0
894 palignr $8,32(%esi), %xmm0
895 pcmpeqb 32(%edi), %xmm0
896 lea 32(%esi), %esi
897 pcmpeqb 48(%edi), %xmm3
898
899 lea 32(%edi), %edi
900 jz L(shr_8_gobble_loop)
901 pand %xmm0, %xmm3
902
903 cmp $0, %ecx
904 jge L(shr_8_gobble_next)
905 inc %edx
906 add $32, %ecx
907L(shr_8_gobble_next):
908 test %edx, %edx
909 jnz L(exit)
910
911 pmovmskb %xmm3, %edx
912 movdqa %xmm0, %xmm1
913 lea 32(%edi), %edi
914 lea 32(%esi), %esi
915 sub $0xffff, %edx
916 jnz L(exit)
917
918 lea (%ecx, %edi,1), %eax
919 lea 8(%ecx, %esi,1), %edx
920 POP (%edi)
921 POP (%esi)
922 jmp L(less48bytes)
923
/* L(shr_9): blk2 lies 9 bytes past a 16-byte boundary (entered with
   edx == 9 and %esi already rounded down).  The shift is copied to %eax for
   L(exit); lea and mov keep the flags of "cmp $80, %ecx" for jae.
   Short case: one 32-byte round, palignr $9 rebuilding the blk2 bytes that
   pair with each aligned chunk at %edi.  Mismatch -> L(exit); otherwise
   L(less48bytes) receives %eax = %edi + %ecx, %edx = %esi + 9 + %ecx.  */
924 CFI_PUSH (%esi)
925 CFI_PUSH (%edi)
926 ALIGN (4)
927L(shr_9):
928 cmp $80, %ecx
929 lea -48(%ecx), %ecx
930 mov %edx, %eax
931 jae L(shr_9_gobble)
932
933 movdqa 16(%esi), %xmm1
934 movdqa %xmm1, %xmm2
935 palignr $9,(%esi), %xmm1
936 pcmpeqb (%edi), %xmm1
937
938 movdqa 32(%esi), %xmm3
939 palignr $9,%xmm2, %xmm3
940 pcmpeqb 16(%edi), %xmm3
941
942 pand %xmm1, %xmm3
943 pmovmskb %xmm3, %edx
944 lea 32(%edi), %edi
945 lea 32(%esi), %esi
946 sub $0xffff, %edx
947 jnz L(exit)
948 lea (%ecx, %edi,1), %eax
949 lea 9(%ecx, %esi,1), %edx
950 POP (%edi)
951 POP (%esi)
952 jmp L(less48bytes)
953
/* L(shr_9_gobble): 32 bytes per iteration for shift 9.  The borrow of
   "sub $32, %ecx" survives in CF across the flag-neutral SSE instructions
   and lea, so "sbb $0xffff, %edx" is zero only when all 32 bytes matched
   and %ecx did not drop below zero; jz loops on that.  After the loop a
   negative %ecx is corrected by +32 and the borrow removed from %edx (inc);
   non-zero %edx -> L(exit).  The round prepared last is then checked before
   the tail goes to L(less48bytes) as %eax = %edi + %ecx,
   %edx = %esi + 9 + %ecx.  */
954 CFI_PUSH (%esi)
955 CFI_PUSH (%edi)
956 ALIGN (4)
957L(shr_9_gobble):
958 sub $32, %ecx
959 movdqa 16(%esi), %xmm0
960 palignr $9,(%esi), %xmm0
961 pcmpeqb (%edi), %xmm0
962
963 movdqa 32(%esi), %xmm3
964 palignr $9,16(%esi), %xmm3
965 pcmpeqb 16(%edi), %xmm3
966
967L(shr_9_gobble_loop):
968 pand %xmm0, %xmm3
969 sub $32, %ecx
970 pmovmskb %xmm3, %edx
971 movdqa %xmm0, %xmm1
972
973 movdqa 64(%esi), %xmm3
974 palignr $9,48(%esi), %xmm3
975 sbb $0xffff, %edx
976 movdqa 48(%esi), %xmm0
977 palignr $9,32(%esi), %xmm0
978 pcmpeqb 32(%edi), %xmm0
979 lea 32(%esi), %esi
980 pcmpeqb 48(%edi), %xmm3
981
982 lea 32(%edi), %edi
983 jz L(shr_9_gobble_loop)
984 pand %xmm0, %xmm3
985
986 cmp $0, %ecx
987 jge L(shr_9_gobble_next)
988 inc %edx
989 add $32, %ecx
990L(shr_9_gobble_next):
991 test %edx, %edx
992 jnz L(exit)
993
994 pmovmskb %xmm3, %edx
995 movdqa %xmm0, %xmm1
996 lea 32(%edi), %edi
997 lea 32(%esi), %esi
998 sub $0xffff, %edx
999 jnz L(exit)
1000
1001 lea (%ecx, %edi,1), %eax
1002 lea 9(%ecx, %esi,1), %edx
1003 POP (%edi)
1004 POP (%esi)
1005 jmp L(less48bytes)
1006
/* L(shr_10): blk2 lies 10 bytes past a 16-byte boundary (entered with
   edx == 10 and %esi already rounded down).  The shift is copied to %eax
   for L(exit); lea and mov keep the flags of "cmp $80, %ecx" for jae.
   Short case: one 32-byte round, palignr $10 rebuilding the blk2 bytes that
   pair with each aligned chunk at %edi.  Mismatch -> L(exit); otherwise
   L(less48bytes) receives %eax = %edi + %ecx, %edx = %esi + 10 + %ecx.  */
1007 CFI_PUSH (%esi)
1008 CFI_PUSH (%edi)
1009 ALIGN (4)
1010L(shr_10):
1011 cmp $80, %ecx
1012 lea -48(%ecx), %ecx
1013 mov %edx, %eax
1014 jae L(shr_10_gobble)
1015
1016 movdqa 16(%esi), %xmm1
1017 movdqa %xmm1, %xmm2
1018 palignr $10, (%esi), %xmm1
1019 pcmpeqb (%edi), %xmm1
1020
1021 movdqa 32(%esi), %xmm3
1022 palignr $10,%xmm2, %xmm3
1023 pcmpeqb 16(%edi), %xmm3
1024
1025 pand %xmm1, %xmm3
1026 pmovmskb %xmm3, %edx
1027 lea 32(%edi), %edi
1028 lea 32(%esi), %esi
1029 sub $0xffff, %edx
1030 jnz L(exit)
1031 lea (%ecx, %edi,1), %eax
1032 lea 10(%ecx, %esi,1), %edx
1033 POP (%edi)
1034 POP (%esi)
1035 jmp L(less48bytes)
1036
/* L(shr_10_gobble): 32 bytes per iteration for shift 10.  The borrow of
   "sub $32, %ecx" survives in CF across the flag-neutral SSE instructions
   and lea, so "sbb $0xffff, %edx" is zero only when all 32 bytes matched
   and %ecx did not drop below zero; jz loops on that.  After the loop a
   negative %ecx is corrected by +32 and the borrow removed from %edx (inc);
   non-zero %edx -> L(exit).  The round prepared last is then checked before
   the tail goes to L(less48bytes) as %eax = %edi + %ecx,
   %edx = %esi + 10 + %ecx.  */
1037 CFI_PUSH (%esi)
1038 CFI_PUSH (%edi)
1039 ALIGN (4)
1040L(shr_10_gobble):
1041 sub $32, %ecx
1042 movdqa 16(%esi), %xmm0
1043 palignr $10, (%esi), %xmm0
1044 pcmpeqb (%edi), %xmm0
1045
1046 movdqa 32(%esi), %xmm3
1047 palignr $10, 16(%esi), %xmm3
1048 pcmpeqb 16(%edi), %xmm3
1049
1050L(shr_10_gobble_loop):
1051 pand %xmm0, %xmm3
1052 sub $32, %ecx
1053 pmovmskb %xmm3, %edx
1054 movdqa %xmm0, %xmm1
1055
1056 movdqa 64(%esi), %xmm3
1057 palignr $10,48(%esi), %xmm3
1058 sbb $0xffff, %edx
1059 movdqa 48(%esi), %xmm0
1060 palignr $10,32(%esi), %xmm0
1061 pcmpeqb 32(%edi), %xmm0
1062 lea 32(%esi), %esi
1063 pcmpeqb 48(%edi), %xmm3
1064
1065 lea 32(%edi), %edi
1066 jz L(shr_10_gobble_loop)
1067 pand %xmm0, %xmm3
1068
1069 cmp $0, %ecx
1070 jge L(shr_10_gobble_next)
1071 inc %edx
1072 add $32, %ecx
1073L(shr_10_gobble_next):
1074 test %edx, %edx
1075 jnz L(exit)
1076
1077 pmovmskb %xmm3, %edx
1078 movdqa %xmm0, %xmm1
1079 lea 32(%edi), %edi
1080 lea 32(%esi), %esi
1081 sub $0xffff, %edx
1082 jnz L(exit)
1083
1084 lea (%ecx, %edi,1), %eax
1085 lea 10(%ecx, %esi,1), %edx
1086 POP (%edi)
1087 POP (%esi)
1088 jmp L(less48bytes)
1089
/* L(shr_11): blk2 lies 11 bytes past a 16-byte boundary (entered with
   edx == 11 and %esi already rounded down).  The shift is copied to %eax
   for L(exit); lea and mov keep the flags of "cmp $80, %ecx" for jae.
   Short case: one 32-byte round, palignr $11 rebuilding the blk2 bytes that
   pair with each aligned chunk at %edi.  Mismatch -> L(exit); otherwise
   L(less48bytes) receives %eax = %edi + %ecx, %edx = %esi + 11 + %ecx.  */
1090 CFI_PUSH (%esi)
1091 CFI_PUSH (%edi)
1092 ALIGN (4)
1093L(shr_11):
1094 cmp $80, %ecx
1095 lea -48(%ecx), %ecx
1096 mov %edx, %eax
1097 jae L(shr_11_gobble)
1098
1099 movdqa 16(%esi), %xmm1
1100 movdqa %xmm1, %xmm2
1101 palignr $11, (%esi), %xmm1
1102 pcmpeqb (%edi), %xmm1
1103
1104 movdqa 32(%esi), %xmm3
1105 palignr $11, %xmm2, %xmm3
1106 pcmpeqb 16(%edi), %xmm3
1107
1108 pand %xmm1, %xmm3
1109 pmovmskb %xmm3, %edx
1110 lea 32(%edi), %edi
1111 lea 32(%esi), %esi
1112 sub $0xffff, %edx
1113 jnz L(exit)
1114 lea (%ecx, %edi,1), %eax
1115 lea 11(%ecx, %esi,1), %edx
1116 POP (%edi)
1117 POP (%esi)
1118 jmp L(less48bytes)
1119
/* L(shr_11_gobble): 32 bytes per iteration for shift 11.  The borrow of
   "sub $32, %ecx" survives in CF across the flag-neutral SSE instructions
   and lea, so "sbb $0xffff, %edx" is zero only when all 32 bytes matched
   and %ecx did not drop below zero; jz loops on that.  After the loop a
   negative %ecx is corrected by +32 and the borrow removed from %edx (inc);
   non-zero %edx -> L(exit).  The round prepared last is then checked before
   the tail goes to L(less48bytes) as %eax = %edi + %ecx,
   %edx = %esi + 11 + %ecx.  */
1120 CFI_PUSH (%esi)
1121 CFI_PUSH (%edi)
1122 ALIGN (4)
1123L(shr_11_gobble):
1124 sub $32, %ecx
1125 movdqa 16(%esi), %xmm0
1126 palignr $11, (%esi), %xmm0
1127 pcmpeqb (%edi), %xmm0
1128
1129 movdqa 32(%esi), %xmm3
1130 palignr $11, 16(%esi), %xmm3
1131 pcmpeqb 16(%edi), %xmm3
1132
1133L(shr_11_gobble_loop):
1134 pand %xmm0, %xmm3
1135 sub $32, %ecx
1136 pmovmskb %xmm3, %edx
1137 movdqa %xmm0, %xmm1
1138
1139 movdqa 64(%esi), %xmm3
1140 palignr $11,48(%esi), %xmm3
1141 sbb $0xffff, %edx
1142 movdqa 48(%esi), %xmm0
1143 palignr $11,32(%esi), %xmm0
1144 pcmpeqb 32(%edi), %xmm0
1145 lea 32(%esi), %esi
1146 pcmpeqb 48(%edi), %xmm3
1147
1148 lea 32(%edi), %edi
1149 jz L(shr_11_gobble_loop)
1150 pand %xmm0, %xmm3
1151
1152 cmp $0, %ecx
1153 jge L(shr_11_gobble_next)
1154 inc %edx
1155 add $32, %ecx
1156L(shr_11_gobble_next):
1157 test %edx, %edx
1158 jnz L(exit)
1159
1160 pmovmskb %xmm3, %edx
1161 movdqa %xmm0, %xmm1
1162 lea 32(%edi), %edi
1163 lea 32(%esi), %esi
1164 sub $0xffff, %edx
1165 jnz L(exit)
1166
1167 lea (%ecx, %edi,1), %eax
1168 lea 11(%ecx, %esi,1), %edx
1169 POP (%edi)
1170 POP (%esi)
1171 jmp L(less48bytes)
1172
/* L(shr_12): blk2 lies 12 bytes past a 16-byte boundary (entered with
   edx == 12 and %esi already rounded down).  The shift is copied to %eax
   for L(exit); lea and mov keep the flags of "cmp $80, %ecx" for jae.
   Short case: one 32-byte round, palignr $12 rebuilding the blk2 bytes that
   pair with each aligned chunk at %edi.  Mismatch -> L(exit); otherwise
   L(less48bytes) receives %eax = %edi + %ecx, %edx = %esi + 12 + %ecx.  */
1173 CFI_PUSH (%esi)
1174 CFI_PUSH (%edi)
1175 ALIGN (4)
1176L(shr_12):
1177 cmp $80, %ecx
1178 lea -48(%ecx), %ecx
1179 mov %edx, %eax
1180 jae L(shr_12_gobble)
1181
1182 movdqa 16(%esi), %xmm1
1183 movdqa %xmm1, %xmm2
1184 palignr $12, (%esi), %xmm1
1185 pcmpeqb (%edi), %xmm1
1186
1187 movdqa 32(%esi), %xmm3
1188 palignr $12, %xmm2, %xmm3
1189 pcmpeqb 16(%edi), %xmm3
1190
1191 pand %xmm1, %xmm3
1192 pmovmskb %xmm3, %edx
1193 lea 32(%edi), %edi
1194 lea 32(%esi), %esi
1195 sub $0xffff, %edx
1196 jnz L(exit)
1197 lea (%ecx, %edi,1), %eax
1198 lea 12(%ecx, %esi,1), %edx
1199 POP (%edi)
1200 POP (%esi)
1201 jmp L(less48bytes)
1202
/* L(shr_12_gobble): 32 bytes per iteration for shift 12.  The borrow of
   "sub $32, %ecx" survives in CF across the flag-neutral SSE instructions
   and lea, so "sbb $0xffff, %edx" is zero only when all 32 bytes matched
   and %ecx did not drop below zero; jz loops on that.  After the loop a
   negative %ecx is corrected by +32 and the borrow removed from %edx (inc);
   non-zero %edx -> L(exit).  The round prepared last is then checked before
   the tail goes to L(less48bytes) as %eax = %edi + %ecx,
   %edx = %esi + 12 + %ecx.  */
1203 CFI_PUSH (%esi)
1204 CFI_PUSH (%edi)
1205 ALIGN (4)
1206L(shr_12_gobble):
1207 sub $32, %ecx
1208 movdqa 16(%esi), %xmm0
1209 palignr $12, (%esi), %xmm0
1210 pcmpeqb (%edi), %xmm0
1211
1212 movdqa 32(%esi), %xmm3
1213 palignr $12, 16(%esi), %xmm3
1214 pcmpeqb 16(%edi), %xmm3
1215
1216L(shr_12_gobble_loop):
1217 pand %xmm0, %xmm3
1218 sub $32, %ecx
1219 pmovmskb %xmm3, %edx
1220 movdqa %xmm0, %xmm1
1221
1222 movdqa 64(%esi), %xmm3
1223 palignr $12,48(%esi), %xmm3
1224 sbb $0xffff, %edx
1225 movdqa 48(%esi), %xmm0
1226 palignr $12,32(%esi), %xmm0
1227 pcmpeqb 32(%edi), %xmm0
1228 lea 32(%esi), %esi
1229 pcmpeqb 48(%edi), %xmm3
1230
1231 lea 32(%edi), %edi
1232 jz L(shr_12_gobble_loop)
1233 pand %xmm0, %xmm3
1234
1235 cmp $0, %ecx
1236 jge L(shr_12_gobble_next)
1237 inc %edx
1238 add $32, %ecx
1239L(shr_12_gobble_next):
1240 test %edx, %edx
1241 jnz L(exit)
1242
1243 pmovmskb %xmm3, %edx
1244 movdqa %xmm0, %xmm1
1245 lea 32(%edi), %edi
1246 lea 32(%esi), %esi
1247 sub $0xffff, %edx
1248 jnz L(exit)
1249
1250 lea (%ecx, %edi,1), %eax
1251 lea 12(%ecx, %esi,1), %edx
1252 POP (%edi)
1253 POP (%esi)
1254 jmp L(less48bytes)
1255
/* L(shr_13): blk2 lies 13 bytes past a 16-byte boundary (entered with
   edx == 13 and %esi already rounded down).  The shift is copied to %eax
   for L(exit); lea and mov keep the flags of "cmp $80, %ecx" for jae.
   Short case: one 32-byte round, palignr $13 rebuilding the blk2 bytes that
   pair with each aligned chunk at %edi.  Mismatch -> L(exit); otherwise
   L(less48bytes) receives %eax = %edi + %ecx, %edx = %esi + 13 + %ecx.  */
1256 CFI_PUSH (%esi)
1257 CFI_PUSH (%edi)
1258 ALIGN (4)
1259L(shr_13):
1260 cmp $80, %ecx
1261 lea -48(%ecx), %ecx
1262 mov %edx, %eax
1263 jae L(shr_13_gobble)
1264
1265 movdqa 16(%esi), %xmm1
1266 movdqa %xmm1, %xmm2
1267 palignr $13, (%esi), %xmm1
1268 pcmpeqb (%edi), %xmm1
1269
1270 movdqa 32(%esi), %xmm3
1271 palignr $13, %xmm2, %xmm3
1272 pcmpeqb 16(%edi), %xmm3
1273
1274 pand %xmm1, %xmm3
1275 pmovmskb %xmm3, %edx
1276 lea 32(%edi), %edi
1277 lea 32(%esi), %esi
1278 sub $0xffff, %edx
1279 jnz L(exit)
1280 lea (%ecx, %edi,1), %eax
1281 lea 13(%ecx, %esi,1), %edx
1282 POP (%edi)
1283 POP (%esi)
1284 jmp L(less48bytes)
1285
/* L(shr_13_gobble): 32 bytes per iteration for shift 13.  The borrow of
   "sub $32, %ecx" survives in CF across the flag-neutral SSE instructions
   and lea, so "sbb $0xffff, %edx" is zero only when all 32 bytes matched
   and %ecx did not drop below zero; jz loops on that.  After the loop a
   negative %ecx is corrected by +32 and the borrow removed from %edx (inc);
   non-zero %edx -> L(exit).  The round prepared last is then checked before
   the tail goes to L(less48bytes) as %eax = %edi + %ecx,
   %edx = %esi + 13 + %ecx.  */
1286 CFI_PUSH (%esi)
1287 CFI_PUSH (%edi)
1288 ALIGN (4)
1289L(shr_13_gobble):
1290 sub $32, %ecx
1291 movdqa 16(%esi), %xmm0
1292 palignr $13, (%esi), %xmm0
1293 pcmpeqb (%edi), %xmm0
1294
1295 movdqa 32(%esi), %xmm3
1296 palignr $13, 16(%esi), %xmm3
1297 pcmpeqb 16(%edi), %xmm3
1298
1299L(shr_13_gobble_loop):
1300 pand %xmm0, %xmm3
1301 sub $32, %ecx
1302 pmovmskb %xmm3, %edx
1303 movdqa %xmm0, %xmm1
1304
1305 movdqa 64(%esi), %xmm3
1306 palignr $13,48(%esi), %xmm3
1307 sbb $0xffff, %edx
1308 movdqa 48(%esi), %xmm0
1309 palignr $13,32(%esi), %xmm0
1310 pcmpeqb 32(%edi), %xmm0
1311 lea 32(%esi), %esi
1312 pcmpeqb 48(%edi), %xmm3
1313
1314 lea 32(%edi), %edi
1315 jz L(shr_13_gobble_loop)
1316 pand %xmm0, %xmm3
1317
1318 cmp $0, %ecx
1319 jge L(shr_13_gobble_next)
1320 inc %edx
1321 add $32, %ecx
1322L(shr_13_gobble_next):
1323 test %edx, %edx
1324 jnz L(exit)
1325
1326 pmovmskb %xmm3, %edx
1327 movdqa %xmm0, %xmm1
1328 lea 32(%edi), %edi
1329 lea 32(%esi), %esi
1330 sub $0xffff, %edx
1331 jnz L(exit)
1332
1333 lea (%ecx, %edi,1), %eax
1334 lea 13(%ecx, %esi,1), %edx
1335 POP (%edi)
1336 POP (%esi)
1337 jmp L(less48bytes)
1338
/* L(shr_14): blk2 lies 14 bytes past a 16-byte boundary (entered with
   edx == 14 and %esi already rounded down).  The shift is copied to %eax
   for L(exit); lea and mov keep the flags of "cmp $80, %ecx" for jae.
   Short case: one 32-byte round, palignr $14 rebuilding the blk2 bytes that
   pair with each aligned chunk at %edi.  Mismatch -> L(exit); otherwise
   L(less48bytes) receives %eax = %edi + %ecx, %edx = %esi + 14 + %ecx.  */
1339 CFI_PUSH (%esi)
1340 CFI_PUSH (%edi)
1341 ALIGN (4)
1342L(shr_14):
1343 cmp $80, %ecx
1344 lea -48(%ecx), %ecx
1345 mov %edx, %eax
1346 jae L(shr_14_gobble)
1347
1348 movdqa 16(%esi), %xmm1
1349 movdqa %xmm1, %xmm2
1350 palignr $14, (%esi), %xmm1
1351 pcmpeqb (%edi), %xmm1
1352
1353 movdqa 32(%esi), %xmm3
1354 palignr $14, %xmm2, %xmm3
1355 pcmpeqb 16(%edi), %xmm3
1356
1357 pand %xmm1, %xmm3
1358 pmovmskb %xmm3, %edx
1359 lea 32(%edi), %edi
1360 lea 32(%esi), %esi
1361 sub $0xffff, %edx
1362 jnz L(exit)
1363 lea (%ecx, %edi,1), %eax
1364 lea 14(%ecx, %esi,1), %edx
1365 POP (%edi)
1366 POP (%esi)
1367 jmp L(less48bytes)
1368
/* L(shr_14_gobble): 32 bytes per iteration for shift 14.  The borrow of
   "sub $32, %ecx" survives in CF across the flag-neutral SSE instructions
   and lea, so "sbb $0xffff, %edx" is zero only when all 32 bytes matched
   and %ecx did not drop below zero; jz loops on that.  After the loop a
   negative %ecx is corrected by +32 and the borrow removed from %edx (inc);
   non-zero %edx -> L(exit).  The round prepared last is then checked before
   the tail goes to L(less48bytes) as %eax = %edi + %ecx,
   %edx = %esi + 14 + %ecx.  */
1369 CFI_PUSH (%esi)
1370 CFI_PUSH (%edi)
1371 ALIGN (4)
1372L(shr_14_gobble):
1373 sub $32, %ecx
1374 movdqa 16(%esi), %xmm0
1375 palignr $14, (%esi), %xmm0
1376 pcmpeqb (%edi), %xmm0
1377
1378 movdqa 32(%esi), %xmm3
1379 palignr $14, 16(%esi), %xmm3
1380 pcmpeqb 16(%edi), %xmm3
1381
1382L(shr_14_gobble_loop):
1383 pand %xmm0, %xmm3
1384 sub $32, %ecx
1385 pmovmskb %xmm3, %edx
1386 movdqa %xmm0, %xmm1
1387
1388 movdqa 64(%esi), %xmm3
1389 palignr $14,48(%esi), %xmm3
1390 sbb $0xffff, %edx
1391 movdqa 48(%esi), %xmm0
1392 palignr $14,32(%esi), %xmm0
1393 pcmpeqb 32(%edi), %xmm0
1394 lea 32(%esi), %esi
1395 pcmpeqb 48(%edi), %xmm3
1396
1397 lea 32(%edi), %edi
1398 jz L(shr_14_gobble_loop)
1399 pand %xmm0, %xmm3
1400
1401 cmp $0, %ecx
1402 jge L(shr_14_gobble_next)
1403 inc %edx
1404 add $32, %ecx
1405L(shr_14_gobble_next):
1406 test %edx, %edx
1407 jnz L(exit)
1408
1409 pmovmskb %xmm3, %edx
1410 movdqa %xmm0, %xmm1
1411 lea 32(%edi), %edi
1412 lea 32(%esi), %esi
1413 sub $0xffff, %edx
1414 jnz L(exit)
1415
1416 lea (%ecx, %edi,1), %eax
1417 lea 14(%ecx, %esi,1), %edx
1418 POP (%edi)
1419 POP (%esi)
1420 jmp L(less48bytes)
1421
/* L(shr_15): blk2 lies 15 bytes past a 16-byte boundary (the fall-through
   case of L(next_unaligned_table); %esi is already rounded down).  The
   shift is copied to %eax for L(exit); lea and mov keep the flags of
   "cmp $80, %ecx" for jae.  Short case: one 32-byte round, palignr $15
   rebuilding the blk2 bytes that pair with each aligned chunk at %edi.
   Mismatch -> L(exit); otherwise L(less48bytes) receives
   %eax = %edi + %ecx, %edx = %esi + 15 + %ecx.  */
1422 CFI_PUSH (%esi)
1423 CFI_PUSH (%edi)
1424 ALIGN (4)
1425L(shr_15):
1426 cmp $80, %ecx
1427 lea -48(%ecx), %ecx
1428 mov %edx, %eax
1429 jae L(shr_15_gobble)
1430
1431 movdqa 16(%esi), %xmm1
1432 movdqa %xmm1, %xmm2
1433 palignr $15, (%esi), %xmm1
1434 pcmpeqb (%edi), %xmm1
1435
1436 movdqa 32(%esi), %xmm3
1437 palignr $15, %xmm2, %xmm3
1438 pcmpeqb 16(%edi), %xmm3
1439
1440 pand %xmm1, %xmm3
1441 pmovmskb %xmm3, %edx
1442 lea 32(%edi), %edi
1443 lea 32(%esi), %esi
1444 sub $0xffff, %edx
1445 jnz L(exit)
1446 lea (%ecx, %edi,1), %eax
1447 lea 15(%ecx, %esi,1), %edx
1448 POP (%edi)
1449 POP (%esi)
1450 jmp L(less48bytes)
1451
/* L(shr_15_gobble): 32 bytes per iteration for shift 15.  The borrow of
   "sub $32, %ecx" survives in CF across the flag-neutral SSE instructions
   and lea, so "sbb $0xffff, %edx" is zero only when all 32 bytes matched
   and %ecx did not drop below zero; jz loops on that.  After the loop a
   negative %ecx is corrected by +32 and the borrow removed from %edx (inc);
   non-zero %edx -> L(exit).  The round prepared last is then checked before
   the tail goes to L(less48bytes) as %eax = %edi + %ecx,
   %edx = %esi + 15 + %ecx.  */
1452 CFI_PUSH (%esi)
1453 CFI_PUSH (%edi)
1454 ALIGN (4)
1455L(shr_15_gobble):
1456 sub $32, %ecx
1457 movdqa 16(%esi), %xmm0
1458 palignr $15, (%esi), %xmm0
1459 pcmpeqb (%edi), %xmm0
1460
1461 movdqa 32(%esi), %xmm3
1462 palignr $15, 16(%esi), %xmm3
1463 pcmpeqb 16(%edi), %xmm3
1464
1465L(shr_15_gobble_loop):
1466 pand %xmm0, %xmm3
1467 sub $32, %ecx
1468 pmovmskb %xmm3, %edx
1469 movdqa %xmm0, %xmm1
1470
1471 movdqa 64(%esi), %xmm3
1472 palignr $15,48(%esi), %xmm3
1473 sbb $0xffff, %edx
1474 movdqa 48(%esi), %xmm0
1475 palignr $15,32(%esi), %xmm0
1476 pcmpeqb 32(%edi), %xmm0
1477 lea 32(%esi), %esi
1478 pcmpeqb 48(%edi), %xmm3
1479
1480 lea 32(%edi), %edi
1481 jz L(shr_15_gobble_loop)
1482 pand %xmm0, %xmm3
1483
1484 cmp $0, %ecx
1485 jge L(shr_15_gobble_next)
1486 inc %edx
1487 add $32, %ecx
1488L(shr_15_gobble_next):
1489 test %edx, %edx
1490 jnz L(exit)
1491
1492 pmovmskb %xmm3, %edx
1493 movdqa %xmm0, %xmm1
1494 lea 32(%edi), %edi
1495 lea 32(%esi), %esi
1496 sub $0xffff, %edx
1497 jnz L(exit)
1498
1499 lea (%ecx, %edi,1), %eax
1500 lea 15(%ecx, %esi,1), %edx
1501 POP (%edi)
1502 POP (%esi)
1503 jmp L(less48bytes)
1504
/* L(exit): a 32-byte round contained a mismatch.  On entry xmm1 holds the
   byte-equality result for the first 16 bytes of that round, %edx the
   combined mask minus 0xffff, %eax the blk2 shift, and edi/esi have already
   been advanced past the round.  %ebx was saved on entry to the long path
   and serves as scratch here.
   If the first 16 bytes differ, both pointers step back by 16 and that
   half's mask replaces %edx; otherwise the mismatch is in the second half
   and %edx is kept.  L(first16bytes) adds the shift back to %esi.
   L(less16bytes) is also entered directly from L(48bytesormore).  The
   lowest set bit of %edx marks the first differing byte; the pointers are
   16 bytes past the start of the chunk, so bit k maps to offset k - 16.
   Bits 0..6 of %dl select L(Byte16)..L(Byte22), bit 7 falls through to
   L(Byte23); an all-zero %dl continues in L(next_24_bytes).
   The result is the difference of the two bytes, zero-extended first.  */
1505 CFI_PUSH (%esi)
1506 CFI_PUSH (%edi)
1507 ALIGN (4)
1508L(exit):
1509 pmovmskb %xmm1, %ebx
1510 sub $0xffff, %ebx
1511 jz L(first16bytes)
1512 lea -16(%esi), %esi
1513 lea -16(%edi), %edi
1514 mov %ebx, %edx
1515L(first16bytes):
1516 add %eax, %esi
1517L(less16bytes):
1518 test %dl, %dl
1519 jz L(next_24_bytes)
1520
1521 test $0x01, %dl
1522 jnz L(Byte16)
1523
1524 test $0x02, %dl
1525 jnz L(Byte17)
1526
1527 test $0x04, %dl
1528 jnz L(Byte18)
1529
1530 test $0x08, %dl
1531 jnz L(Byte19)
1532
1533 test $0x10, %dl
1534 jnz L(Byte20)
1535
1536 test $0x20, %dl
1537 jnz L(Byte21)
1538
1539 test $0x40, %dl
1540 jnz L(Byte22)
1541L(Byte23):
1542 movzbl -9(%edi), %eax
1543 movzbl -9(%esi), %edx
1544 sub %edx, %eax
1545 RETURN
1546
/*
 * L(Byte16) .. L(Byte22): return handlers for a mismatch at a known byte.
 * Each one zero-extends the byte at a fixed negative offset from %edi and
 * from %esi and leaves (edi-side byte) - (esi-side byte) in %eax.
 * Handler Byte(16+k) reads offset -16+k. L(next_24_bytes) reuses the same
 * handlers after advancing both pointers by 8.
 *
 * NOTE(review): RETURN is a macro defined outside this excerpt; presumably
 * it restores the saved registers and returns - confirm.
 */
1547 ALIGN (4)
1548L(Byte16):
1549 movzbl -16(%edi), %eax
1550 movzbl -16(%esi), %edx
1551 sub %edx, %eax
1552 RETURN
1553
1554 ALIGN (4)
1555L(Byte17):
1556 movzbl -15(%edi), %eax
1557 movzbl -15(%esi), %edx
1558 sub %edx, %eax
1559 RETURN
1560
1561 ALIGN (4)
1562L(Byte18):
1563 movzbl -14(%edi), %eax
1564 movzbl -14(%esi), %edx
1565 sub %edx, %eax
1566 RETURN
1567
1568 ALIGN (4)
1569L(Byte19):
1570 movzbl -13(%edi), %eax
1571 movzbl -13(%esi), %edx
1572 sub %edx, %eax
1573 RETURN
1574
1575 ALIGN (4)
1576L(Byte20):
1577 movzbl -12(%edi), %eax
1578 movzbl -12(%esi), %edx
1579 sub %edx, %eax
1580 RETURN
1581
1582 ALIGN (4)
1583L(Byte21):
1584 movzbl -11(%edi), %eax
1585 movzbl -11(%esi), %edx
1586 sub %edx, %eax
1587 RETURN
1588
1589 ALIGN (4)
1590L(Byte22):
1591 movzbl -10(%edi), %eax
1592 movzbl -10(%esi), %edx
1593 sub %edx, %eax
1594 RETURN
1595
/*
 * L(next_24_bytes): continuation of L(less16bytes) for the case where the
 * low byte of %edx is zero, i.e. the first differing byte has index 8..15.
 * Both pointers are advanced by 8 so that the Byte16..Byte22 handlers
 * (offsets -16..-10) land on bytes 8..14 of the 16-byte piece.
 *
 * In:   %edx = adjusted compare mask with %dl == 0, %edi/%esi = pointers
 *       just past the 16-byte piece.
 * Out:  %eax = (edi-side byte) - (esi-side byte).
 *
 * NOTE(review): RETURN_END and CFI_PUSH are macros defined outside this
 * excerpt; the trailing CFI_PUSH (%ebx) presumably re-declares unwind state
 * for the code that follows - confirm.
 */
1596 ALIGN (4)
1597L(next_24_bytes):
1598 lea 8(%edi), %edi
1599 lea 8(%esi), %esi
/* Bits 8..14 of the mask live in %dh bits 0..6. */
1600 test $0x01, %dh
1601 jnz L(Byte16)
1602
1603 test $0x02, %dh
1604 jnz L(Byte17)
1605
1606 test $0x04, %dh
1607 jnz L(Byte18)
1608
1609 test $0x08, %dh
1610 jnz L(Byte19)
1611
1612 test $0x10, %dh
1613 jnz L(Byte20)
1614
1615 test $0x20, %dh
1616 jnz L(Byte21)
1617
1618 test $0x40, %dh
1619 jnz L(Byte22)
1620
/* Fall-through: none of bits 8..14 is set, so the code treats byte 15 (offset -9 after the +8 above) as the mismatch. */
1621 ALIGN (4)
1622L(Byte31):
1623 movzbl -9(%edi), %eax
1624 movzbl -9(%esi), %edx
1625 sub %edx, %eax
1626 RETURN_END
1627 CFI_PUSH (%ebx)
1628
/*
 * L(more8bytes) .. L(more40bytes): second level of the tail dispatch
 * started in L(less48bytes). %ecx is the number of bytes still to compare;
 * each group first branches on the next multiple of 8 and then selects the
 * exact L(<n>bytes) entry with a chain of cmp/je. The last value of every
 * group is reached by an unconditional jmp, so it is not compared
 * explicitly. The dispatch itself only reads %ecx.
 */
1629 ALIGN (4)
1630L(more8bytes):
/* 8 <= %ecx here (caller tested jae 8); 16 and above go to the next group. */
1631 cmp $16, %ecx
1632 jae L(more16bytes)
1633 cmp $8, %ecx
1634 je L(8bytes)
1635 cmp $9, %ecx
1636 je L(9bytes)
1637 cmp $10, %ecx
1638 je L(10bytes)
1639 cmp $11, %ecx
1640 je L(11bytes)
1641 cmp $12, %ecx
1642 je L(12bytes)
1643 cmp $13, %ecx
1644 je L(13bytes)
1645 cmp $14, %ecx
1646 je L(14bytes)
1647 jmp L(15bytes)
1648
1649 ALIGN (4)
1650L(more16bytes):
/* 16 <= %ecx; 24 and above go to the next group. */
1651 cmp $24, %ecx
1652 jae L(more24bytes)
1653 cmp $16, %ecx
1654 je L(16bytes)
1655 cmp $17, %ecx
1656 je L(17bytes)
1657 cmp $18, %ecx
1658 je L(18bytes)
1659 cmp $19, %ecx
1660 je L(19bytes)
1661 cmp $20, %ecx
1662 je L(20bytes)
1663 cmp $21, %ecx
1664 je L(21bytes)
1665 cmp $22, %ecx
1666 je L(22bytes)
1667 jmp L(23bytes)
1668
1669 ALIGN (4)
1670L(more24bytes):
/* 24 <= %ecx; 32 and above go to the next group. */
1671 cmp $32, %ecx
1672 jae L(more32bytes)
1673 cmp $24, %ecx
1674 je L(24bytes)
1675 cmp $25, %ecx
1676 je L(25bytes)
1677 cmp $26, %ecx
1678 je L(26bytes)
1679 cmp $27, %ecx
1680 je L(27bytes)
1681 cmp $28, %ecx
1682 je L(28bytes)
1683 cmp $29, %ecx
1684 je L(29bytes)
1685 cmp $30, %ecx
1686 je L(30bytes)
1687 jmp L(31bytes)
1688
1689 ALIGN (4)
1690L(more32bytes):
/* 32 <= %ecx; 40 and above go to the last group. */
1691 cmp $40, %ecx
1692 jae L(more40bytes)
1693 cmp $32, %ecx
1694 je L(32bytes)
1695 cmp $33, %ecx
1696 je L(33bytes)
1697 cmp $34, %ecx
1698 je L(34bytes)
1699 cmp $35, %ecx
1700 je L(35bytes)
1701 cmp $36, %ecx
1702 je L(36bytes)
1703 cmp $37, %ecx
1704 je L(37bytes)
1705 cmp $38, %ecx
1706 je L(38bytes)
1707 jmp L(39bytes)
1708
1709 ALIGN (4)
1710L(more40bytes):
/* 40 <= %ecx; there is no upper bound check, so every value not equal to 40..46 ends in L(47bytes). */
1711 cmp $40, %ecx
1712 je L(40bytes)
1713 cmp $41, %ecx
1714 je L(41bytes)
1715 cmp $42, %ecx
1716 je L(42bytes)
1717 cmp $43, %ecx
1718 je L(43bytes)
1719 cmp $44, %ecx
1720 je L(44bytes)
1721 cmp $45, %ecx
1722 je L(45bytes)
1723 cmp $46, %ecx
1724 je L(46bytes)
1725 jmp L(47bytes)
/*
 * L(less48bytes): entry of the tail dispatch.
 *
 * In:   %ecx = number of bytes still to compare,
 *       %eax / %edx = pointers just past the end of the two regions (the
 *       L(<n>bytes) handlers read at negative offsets from them).
 *
 * Counts of 8 or more are forwarded to L(more8bytes); 2..6 are matched
 * explicitly and everything else falls through to L(7bytes).
 *
 * NOTE(review): counts 0 and 1 are not tested and therefore also reach
 * L(7bytes), which reads the 7 bytes before the end pointers. That is only
 * correct if those earlier bytes exist and have already compared equal -
 * confirm that every caller guarantees this.
 */
1727 ALIGN (4)
1728L(less48bytes):
1729 cmp $8, %ecx
1730 jae L(more8bytes)
1731 cmp $2, %ecx
1732 je L(2bytes)
1733 cmp $3, %ecx
1734 je L(3bytes)
1735 cmp $4, %ecx
1736 je L(4bytes)
1737 cmp $5, %ecx
1738 je L(5bytes)
1739 cmp $6, %ecx
1740 je L(6bytes)
1741 jmp L(7bytes)
1742
1743
/*
 * L(44bytes) .. L(4bytes): tail compare for lengths that are a multiple of
 * four. %eax and %edx point just past the end of the two regions; each step
 * loads one dword at the same negative offset from both, jumps to
 * L(find_diff) when they differ, and otherwise falls through to the next
 * lower label. Entering at L(<n>bytes) therefore compares exactly the last
 * n bytes. Returns 0 in %eax when everything matched.
 *
 * Clobbers %ecx and %ebx (%ebx is restored by POP before returning).
 *
 * NOTE(review): POP and CFI_PUSH are macros defined outside this excerpt;
 * the trailing CFI_PUSH (%ebx) presumably restores unwind state for the
 * following code - confirm.
 */
1744 ALIGN (4)
1745L(44bytes):
1746 mov -44(%eax), %ecx
1747 mov -44(%edx), %ebx
1748 cmp %ebx, %ecx
1749 jne L(find_diff)
1750L(40bytes):
1751 mov -40(%eax), %ecx
1752 mov -40(%edx), %ebx
1753 cmp %ebx, %ecx
1754 jne L(find_diff)
1755L(36bytes):
1756 mov -36(%eax), %ecx
1757 mov -36(%edx), %ebx
1758 cmp %ebx, %ecx
1759 jne L(find_diff)
1760L(32bytes):
1761 mov -32(%eax), %ecx
1762 mov -32(%edx), %ebx
1763 cmp %ebx, %ecx
1764 jne L(find_diff)
1765L(28bytes):
1766 mov -28(%eax), %ecx
1767 mov -28(%edx), %ebx
1768 cmp %ebx, %ecx
1769 jne L(find_diff)
1770L(24bytes):
1771 mov -24(%eax), %ecx
1772 mov -24(%edx), %ebx
1773 cmp %ebx, %ecx
1774 jne L(find_diff)
1775L(20bytes):
1776 mov -20(%eax), %ecx
1777 mov -20(%edx), %ebx
1778 cmp %ebx, %ecx
1779 jne L(find_diff)
1780L(16bytes):
1781 mov -16(%eax), %ecx
1782 mov -16(%edx), %ebx
1783 cmp %ebx, %ecx
1784 jne L(find_diff)
1785L(12bytes):
1786 mov -12(%eax), %ecx
1787 mov -12(%edx), %ebx
1788 cmp %ebx, %ecx
1789 jne L(find_diff)
1790L(8bytes):
1791 mov -8(%eax), %ecx
1792 mov -8(%edx), %ebx
1793 cmp %ebx, %ecx
1794 jne L(find_diff)
1795L(4bytes):
1796 mov -4(%eax), %ecx
1797 mov -4(%edx), %ebx
1798 cmp %ebx, %ecx
/* Pre-load the "equal" result; mov leaves the flags of the cmp above intact (do not turn this into xor). */
1799 mov $0, %eax
1800 jne L(find_diff)
1801 POP (%ebx)
1802 ret
1803 CFI_PUSH (%ebx)
1804
/*
 * L(45bytes) .. L(5bytes): tail compare for lengths of the form 4k + 1.
 * %eax and %edx point just past the end of the two regions. Dwords are
 * compared from the given offset downwards (mismatch -> L(find_diff)),
 * falling through label by label; the final single byte at offset -1 is
 * compared separately. Returns 0 in %eax when everything matched.
 *
 * Clobbers %ecx and %ebx (%ebx is restored by POP before returning).
 *
 * NOTE(review): POP and CFI_PUSH are macros defined outside this excerpt;
 * presumably stack pop plus unwind bookkeeping - confirm.
 */
1805 ALIGN (4)
1806L(45bytes):
1807 mov -45(%eax), %ecx
1808 mov -45(%edx), %ebx
1809 cmp %ebx, %ecx
1810 jne L(find_diff)
1811L(41bytes):
1812 mov -41(%eax), %ecx
1813 mov -41(%edx), %ebx
1814 cmp %ebx, %ecx
1815 jne L(find_diff)
1816L(37bytes):
1817 mov -37(%eax), %ecx
1818 mov -37(%edx), %ebx
1819 cmp %ebx, %ecx
1820 jne L(find_diff)
1821L(33bytes):
1822 mov -33(%eax), %ecx
1823 mov -33(%edx), %ebx
1824 cmp %ebx, %ecx
1825 jne L(find_diff)
1826L(29bytes):
1827 mov -29(%eax), %ecx
1828 mov -29(%edx), %ebx
1829 cmp %ebx, %ecx
1830 jne L(find_diff)
1831L(25bytes):
1832 mov -25(%eax), %ecx
1833 mov -25(%edx), %ebx
1834 cmp %ebx, %ecx
1835 jne L(find_diff)
1836L(21bytes):
1837 mov -21(%eax), %ecx
1838 mov -21(%edx), %ebx
1839 cmp %ebx, %ecx
1840 jne L(find_diff)
1841L(17bytes):
1842 mov -17(%eax), %ecx
1843 mov -17(%edx), %ebx
1844 cmp %ebx, %ecx
1845 jne L(find_diff)
1846L(13bytes):
1847 mov -13(%eax), %ecx
1848 mov -13(%edx), %ebx
1849 cmp %ebx, %ecx
1850 jne L(find_diff)
1851L(9bytes):
1852 mov -9(%eax), %ecx
1853 mov -9(%edx), %ebx
1854 cmp %ebx, %ecx
1855 jne L(find_diff)
1856L(5bytes):
1857 mov -5(%eax), %ecx
1858 mov -5(%edx), %ebx
1859 cmp %ebx, %ecx
1860 jne L(find_diff)
/* Last byte: compare it directly; L(end) derives the sign from these flags. */
1861 movzbl -1(%eax), %ecx
1862 cmp -1(%edx), %cl
/* Pre-load the "equal" result; mov leaves the flags of the cmp above intact (do not turn this into xor). */
1863 mov $0, %eax
1864 jne L(end)
1865 POP (%ebx)
1866 ret
1867 CFI_PUSH (%ebx)
1868
/*
 * L(46bytes) .. L(2bytes): tail compare for lengths of the form 4k + 2.
 * %eax and %edx point just past the end of the two regions. Dwords are
 * compared from the given offset downwards (mismatch -> L(find_diff)),
 * falling through label by label; the final two bytes are loaded as one
 * word and compared byte by byte in address order. Returns 0 in %eax when
 * everything matched.
 *
 * Clobbers %ecx and %ebx (%ebx is restored by POP before returning).
 *
 * NOTE(review): POP and CFI_PUSH are macros defined outside this excerpt;
 * presumably stack pop plus unwind bookkeeping - confirm.
 */
1869 ALIGN (4)
1870L(46bytes):
1871 mov -46(%eax), %ecx
1872 mov -46(%edx), %ebx
1873 cmp %ebx, %ecx
1874 jne L(find_diff)
1875L(42bytes):
1876 mov -42(%eax), %ecx
1877 mov -42(%edx), %ebx
1878 cmp %ebx, %ecx
1879 jne L(find_diff)
1880L(38bytes):
1881 mov -38(%eax), %ecx
1882 mov -38(%edx), %ebx
1883 cmp %ebx, %ecx
1884 jne L(find_diff)
1885L(34bytes):
1886 mov -34(%eax), %ecx
1887 mov -34(%edx), %ebx
1888 cmp %ebx, %ecx
1889 jne L(find_diff)
1890L(30bytes):
1891 mov -30(%eax), %ecx
1892 mov -30(%edx), %ebx
1893 cmp %ebx, %ecx
1894 jne L(find_diff)
1895L(26bytes):
1896 mov -26(%eax), %ecx
1897 mov -26(%edx), %ebx
1898 cmp %ebx, %ecx
1899 jne L(find_diff)
1900L(22bytes):
1901 mov -22(%eax), %ecx
1902 mov -22(%edx), %ebx
1903 cmp %ebx, %ecx
1904 jne L(find_diff)
1905L(18bytes):
1906 mov -18(%eax), %ecx
1907 mov -18(%edx), %ebx
1908 cmp %ebx, %ecx
1909 jne L(find_diff)
1910L(14bytes):
1911 mov -14(%eax), %ecx
1912 mov -14(%edx), %ebx
1913 cmp %ebx, %ecx
1914 jne L(find_diff)
1915L(10bytes):
1916 mov -10(%eax), %ecx
1917 mov -10(%edx), %ebx
1918 cmp %ebx, %ecx
1919 jne L(find_diff)
1920L(6bytes):
1921 mov -6(%eax), %ecx
1922 mov -6(%edx), %ebx
1923 cmp %ebx, %ecx
1924 jne L(find_diff)
1925L(2bytes):
/* Low byte of each word is the byte at offset -2, high byte the one at offset -1. */
1926 movzwl -2(%eax), %ecx
1927 movzwl -2(%edx), %ebx
1928 cmp %bl, %cl
1929 jne L(end)
1930 cmp %bh, %ch
/* Pre-load the "equal" result; mov leaves the flags of the cmp above intact (do not turn this into xor). */
1931 mov $0, %eax
1932 jne L(end)
1933 POP (%ebx)
1934 ret
1935 CFI_PUSH (%ebx)
1936
/*
 * L(47bytes) .. L(3bytes): tail compare for lengths of the form 4k + 3.
 * %eax and %edx point just past the end of the two regions. Dwords are
 * compared from the given offset downwards (mismatch -> L(find_diff)),
 * falling through label by label; the final three bytes are handled as a
 * word at offset -3 followed by the byte at offset -1. Returns 0 in %eax
 * when everything matched.
 *
 * Clobbers %ecx and %ebx (%ebx is restored by POP before returning).
 *
 * NOTE(review): POP and CFI_PUSH are macros defined outside this excerpt;
 * presumably stack pop plus unwind bookkeeping - confirm.
 */
1937 ALIGN (4)
1938L(47bytes):
1939 movl -47(%eax), %ecx
1940 movl -47(%edx), %ebx
1941 cmp %ebx, %ecx
1942 jne L(find_diff)
1943L(43bytes):
1944 movl -43(%eax), %ecx
1945 movl -43(%edx), %ebx
1946 cmp %ebx, %ecx
1947 jne L(find_diff)
1948L(39bytes):
1949 movl -39(%eax), %ecx
1950 movl -39(%edx), %ebx
1951 cmp %ebx, %ecx
1952 jne L(find_diff)
1953L(35bytes):
1954 movl -35(%eax), %ecx
1955 movl -35(%edx), %ebx
1956 cmp %ebx, %ecx
1957 jne L(find_diff)
1958L(31bytes):
1959 movl -31(%eax), %ecx
1960 movl -31(%edx), %ebx
1961 cmp %ebx, %ecx
1962 jne L(find_diff)
1963L(27bytes):
1964 movl -27(%eax), %ecx
1965 movl -27(%edx), %ebx
1966 cmp %ebx, %ecx
1967 jne L(find_diff)
1968L(23bytes):
1969 movl -23(%eax), %ecx
1970 movl -23(%edx), %ebx
1971 cmp %ebx, %ecx
1972 jne L(find_diff)
1973L(19bytes):
1974 movl -19(%eax), %ecx
1975 movl -19(%edx), %ebx
1976 cmp %ebx, %ecx
1977 jne L(find_diff)
1978L(15bytes):
1979 movl -15(%eax), %ecx
1980 movl -15(%edx), %ebx
1981 cmp %ebx, %ecx
1982 jne L(find_diff)
1983L(11bytes):
1984 movl -11(%eax), %ecx
1985 movl -11(%edx), %ebx
1986 cmp %ebx, %ecx
1987 jne L(find_diff)
1988L(7bytes):
1989 movl -7(%eax), %ecx
1990 movl -7(%edx), %ebx
1991 cmp %ebx, %ecx
1992 jne L(find_diff)
1993L(3bytes):
/* Bytes at offsets -3 and -2: compare the low byte first; if it matches, the 16-bit compare is decided by the high byte. */
1994 movzwl -3(%eax), %ecx
1995 movzwl -3(%edx), %ebx
1996 cmpb %bl, %cl
1997 jne L(end)
1998 cmp %bx, %cx
1999 jne L(end)
/* Last byte at offset -1; %eax is no longer needed as a pointer after this load. */
2000 movzbl -1(%eax), %eax
2001 cmpb -1(%edx), %al
/* Pre-load the "equal" result; mov leaves the flags of the cmp above intact (do not turn this into xor). */
2002 mov $0, %eax
2003 jne L(end)
2004 POP (%ebx)
2005 ret
2006 CFI_PUSH (%ebx)
2007
/*
 * L(find_diff): two dwords are known to differ; find the first differing
 * byte in address order and return +1 or -1.
 *
 * In:   %ecx = dword loaded relative to %eax (first region),
 *       %ebx = dword loaded relative to %edx (second region).
 * Out:  %eax = 1 if the first region's byte is greater (unsigned),
 *       otherwise -1.
 *
 * x86 is little-endian, so the lowest register byte is the lowest address.
 * Each 16-bit compare is reached only after the low bytes matched, so its
 * outcome is decided by the higher byte alone.
 *
 * L(end) may also be entered directly with flags set by a byte compare of
 * (first-region byte) against (second-region byte).
 *
 * NOTE(review): POP is a macro defined outside this excerpt; L(end) relies
 * on it (and on the following mov) not modifying the flags - confirm.
 */
2008 ALIGN (4)
2009L(find_diff):
2010 cmpb %bl, %cl
2011 jne L(end)
2012 cmp %bx, %cx
2013 jne L(end)
/* Lower half matched: move the upper two bytes down and repeat. */
2014 shr $16,%ecx
2015 shr $16,%ebx
2016 cmp %bl, %cl
2017 jne L(end)
/* No jne needed: the flags of this compare fall straight into L(end). */
2018 cmp %bx, %cx
2019L(end):
2020 POP (%ebx)
2021 mov $1, %eax
/* ja = unsigned "above": the first-region value was larger, so keep +1; otherwise negate to -1. */
2022 ja L(bigger)
2023 neg %eax
2024L(bigger):
2025 ret
2026
2027END (MEMCMP)