| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1 | /* | 
|  | 2 | Copyright 2003 Richard Curnow, SuperH (UK) Ltd. | 
|  | 3 |  | 
|  | 4 | This file is subject to the terms and conditions of the GNU General Public | 
|  | 5 | License.  See the file "COPYING" in the main directory of this archive | 
|  | 6 | for more details. | 
|  | 7 |  | 
|  | 8 | Tight version of memcpy for the case of just copying a page. | 
|  | 9 | Prefetch strategy empirically optimised against RTL simulations | 
|  | 10 | of SH5-101 cut2 eval chip with Cayman board DDR memory. | 
|  | 11 |  | 
|  | 12 | Parameters: | 
| Paul Mundt | 379a95d | 2007-11-20 16:51:28 +0900 | [diff] [blame] | 13 | r2 : destination effective address (start of page) | 
|  | 14 | r3 : source effective address (start of page) | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 15 |  | 
|  | 16 | Always copies 4096 bytes. | 
|  | 17 |  | 
|  | 18 | Points to review. | 
|  | 19 | * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead. | 
|  | 20 | It seems like the prefetch needs to be at least 4 lines ahead to get | 
|  | 21 | the data into the cache in time, and the allocos contend with outstanding | 
|  | 22 | prefetches for the same cache set, so it's better to have the numbers | 
|  | 23 | different. | 
|  | 24 | */ | 
|  | 25 |  | 
|  | 26 | .section .text..SHmedia32,"ax" | 
|  | 27 | .little | 
|  | 28 |  | 
|  | 29 | .balign 8 | 
/*
 * copy_page - copy one 4096-byte page from r3 to r2.
 *
 * In:     r2  = destination address (start of page)
 *         r3  = source address (start of page)
 *         r18 = return address (moved into tr0, used by the final blink)
 * Out:    no result register is set; r2 is left pointing one page past
 *         the original destination.
 * Writes: r2, r6-r8, r22, r23, r36-r39, r60-r62, tr0-tr3.
 *         r63 is named only as a throw-away destination (disabled
 *         prefetch loads and the final blink) -- presumably the
 *         architectural discard register; confirm against the SH-5 manual.
 */
| Paul Mundt | 379a95d | 2007-11-20 16:51:28 +0900 | [diff] [blame] | 30 | .global copy_page | 
|  | 31 | copy_page: | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 32 |  | 
| Paul Mundt | 379a95d | 2007-11-20 16:51:28 +0900 | [diff] [blame] | 33 | /* Copy 4096 bytes worth of data from r3 to r2. | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 34 | Do prefetches 4 lines ahead. | 
|  | 35 | Do alloco 2 lines ahead */ | 
|  | 36 |  | 
/* NOTE: both prefetch sequences below are compiled out (#if 0, tagged
   TAKum03020), so only the alloco look-ahead is active in this build.
   As a result tr2 and r22 are set up but never used, and r6 only serves
   as the base from which r7 is derived. */
/* Load the loop labels 1/2/3 into branch-target registers tr1/tr2/tr3
   and the return address from r18 into tr0. */
|  | 37 | pta 1f, tr1 | 
|  | 38 | pta 2f, tr2 | 
|  | 39 | pta 3f, tr3 | 
|  | 40 | ptabs r18, tr0 | 
|  | 41 |  | 
|  | 42 | #if 0 | 
|  | 43 | /* TAKum03020 */ | 
| Paul Mundt | 379a95d | 2007-11-20 16:51:28 +0900 | [diff] [blame] | 44 | ld.q r3, 0x00, r63 | 
|  | 45 | ld.q r3, 0x20, r63 | 
|  | 46 | ld.q r3, 0x40, r63 | 
|  | 47 | ld.q r3, 0x60, r63 | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 48 | #endif | 
/* Allocate the first two destination lines (offsets 0x00 and 0x20) up
   front; the loop then allocates the line at offset 0x40, i.e. two
   32-byte lines ahead of the one being written.  Every alloco is
   followed by a synco tagged TAKum03020 -- presumably a hardware-erratum
   workaround; confirm before removing or reordering. */
| Paul Mundt | 379a95d | 2007-11-20 16:51:28 +0900 | [diff] [blame] | 49 | alloco r2, 0x00 | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 50 | synco		! TAKum03020 | 
| Paul Mundt | 379a95d | 2007-11-20 16:51:28 +0900 | [diff] [blame] | 51 | alloco r2, 0x20 | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 52 | synco		! TAKum03020 | 
|  | 53 |  | 
/* Loop limits, all relative to the destination start held in r2:
     r6 = dst + 3968 (4096 - 128)  prefetch cut-off (last 4 lines)
     r7 = dst + 4032 (4096 - 64)   alloco cut-off (last 2 lines)
     r8 = dst + 4096               end of page, loop terminator
   Source offsets, chosen so that r2 + rN addresses the source data that
   corresponds to the current destination pointer:
     r60 = src - dst
     r61 = r60 + 8, r62 = r60 + 16, r23 = r60 + 24
     r22 = r60 + 0x80              prefetch distance (4 lines) */
|  | 54 | movi 3968, r6 | 
| Paul Mundt | 379a95d | 2007-11-20 16:51:28 +0900 | [diff] [blame] | 55 | add  r2, r6, r6 | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 56 | addi r6, 64, r7 | 
|  | 57 | addi r7, 64, r8 | 
| Paul Mundt | 379a95d | 2007-11-20 16:51:28 +0900 | [diff] [blame] | 58 | sub r3, r2, r60 | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 59 | addi r60, 8, r61 | 
|  | 60 | addi r61, 8, r62 | 
|  | 61 | addi r62, 8, r23 | 
|  | 62 | addi r60, 0x80, r22 | 
|  | 63 |  | 
|  | 64 | /* Minimal code size.  The extra branches inside the loop don't cost much | 
|  | 65 | because they overlap with the time spent waiting for prefetches to | 
|  | 66 | complete. */ | 
|  | 67 | 1: | 
|  | 68 | #if 0 | 
|  | 69 | /* TAKum03020 */ | 
| Paul Mundt | 379a95d | 2007-11-20 16:51:28 +0900 | [diff] [blame] | 70 | bge/u r2, r6, tr2  ! skip prefetch for last 4 lines | 
|  | 71 | ldx.q r2, r22, r63 ! prefetch 4 lines hence | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 72 | #endif | 
|  | 73 | 2: | 
| Paul Mundt | 379a95d | 2007-11-20 16:51:28 +0900 | [diff] [blame] | 74 | bge/u r2, r7, tr3  ! skip alloco for last 2 lines | 
|  | 75 | alloco r2, 0x40    ! alloc destination line 2 lines ahead | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 76 | synco		! TAKum03020 | 
|  | 77 | 3: | 
/* Copy one 32-byte line: four 8-byte indexed loads from the source
   (r2 + r60/r61/r62/r23), then four 8-byte stores to r2 + 0/8/16/24.
   All loads are issued before the first store. */
| Paul Mundt | 379a95d | 2007-11-20 16:51:28 +0900 | [diff] [blame] | 78 | ldx.q r2, r60, r36 | 
|  | 79 | ldx.q r2, r61, r37 | 
|  | 80 | ldx.q r2, r62, r38 | 
|  | 81 | ldx.q r2, r23, r39 | 
|  | 82 | st.q  r2,   0, r36 | 
|  | 83 | st.q  r2,   8, r37 | 
|  | 84 | st.q  r2,  16, r38 | 
|  | 85 | st.q  r2,  24, r39 | 
/* Advance the destination pointer by one line and branch back to label 1
   (tr1) while the end of the page (r8) is still above r2; 4096 / 32 gives
   128 iterations in total. */
|  | 86 | addi r2, 32, r2 | 
|  | 87 | bgt/l r8, r2, tr1 | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 88 |  | 
|  | 89 | blink tr0, r63	   ! return |