mirror of
https://github.com/adulau/aha.git
synced 2025-01-04 07:03:38 +00:00
9895f9429c
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
89 lines
2.1 KiB
ArmAsm
89 lines
2.1 KiB
ArmAsm
/*
|
|
Copyright 2003 Richard Curnow, SuperH (UK) Ltd.
|
|
|
|
This file is subject to the terms and conditions of the GNU General Public
|
|
License. See the file "COPYING" in the main directory of this archive
|
|
for more details.
|
|
|
|
Tight version of memcpy for the case of just copying a page.
|
|
Prefetch strategy empirically optimised against RTL simulations
|
|
of SH5-101 cut2 eval chip with Cayman board DDR memory.
|
|
|
|
Parameters:
|
|
r2 : destination effective address (start of page)
|
|
r3 : source effective address (start of page)
|
|
|
|
Always copies 4096 bytes.
|
|
|
|
Points to review.
|
|
* Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead.
|
|
It seems like the prefetch needs to be at least 4 lines ahead to get
|
|
the data into the cache in time, and the allocos contend with outstanding
|
|
prefetches for the same cache set, so it's better to have the numbers
|
|
different.
|
|
*/
|
|
|
|
/* Place the following code in the .text..SHmedia32 section; the ax flags
   mark the section as allocatable and executable. */
.section .text..SHmedia32,"ax"
|
|
/* Assemble the code that follows as little-endian. */
.little
|
|
|
|
/* Align the entry point below to an 8-byte boundary. */
.balign 8
|
|
/* Export copy_page so that other objects can link against it. */
.global copy_page
|
|
/*
 * copy_page: copy one 4096-byte page from the address in r3 to the
 * address in r2, 32 bytes per loop iteration.
 *
 * In:     r2 = destination (start of page)
 *         r3 = source (start of page)
 * Return: through tr0, which is loaded from r18 on entry.
 * Writes: r2 (left at destination + 4096), r6-r8, r22, r23, r36-r39,
 *         r60-r62 and tr0-tr3.
 *
 * NOTE(review): whether every register written here is caller-saved
 * under the SH-5 ABI cannot be seen from this file; confirm.
 * NOTE(review): both source prefetch sequences are compiled out with
 * "#if 0" (tagged TAKum03020), so only the alloco part of the strategy
 * described in the comments is active.
 */
copy_page:
|
|
|
|
/* Copy 4096 bytes worth of data from r3 to r2.
|
|
Do prefetches 4 lines ahead.
|
|
Do alloco 2 lines ahead */
|
|
|
|
/* Set up the branch-target registers: tr1, tr2 and tr3 point at the
   local labels 1, 2 and 3 below; tr0 is taken from r18 and is the
   target of the final blink (the return). */
pta 1f, tr1
|
|
pta 2f, tr2
|
|
pta 3f, tr3
|
|
ptabs r18, tr0
|
|
|
|
/* Disabled: loads of the first four source lines into r63, whose
   result is unused; presumably the initial prefetches. */
#if 0
|
|
/* TAKum03020 */
|
|
ld.q r3, 0x00, r63
|
|
ld.q r3, 0x20, r63
|
|
ld.q r3, 0x40, r63
|
|
ld.q r3, 0x60, r63
|
|
#endif
|
|
/* Allocate the first two destination lines (offsets 0x00 and 0x20) up
   front; the loop takes over from offset 0x40.  Every alloco in this
   routine is followed by a synco tagged TAKum03020. */
alloco r2, 0x00
|
|
synco ! TAKum03020
|
|
alloco r2, 0x20
|
|
synco ! TAKum03020
|
|
|
|
/* Loop limits, computed from the destination pointer on entry:
     r6 = r2 + 3968   (page end - 128: start of the last 4 lines)
     r7 = r6 + 64     (page end - 64:  start of the last 2 lines)
     r8 = r7 + 64     (page end) */
movi 3968, r6
|
|
add r2, r6, r6
|
|
addi r6, 64, r7
|
|
addi r7, 64, r8
|
|
/* r60 = source - destination, so r2 + r60 is the source address that
   matches the current destination pointer r2.  r61, r62 and r23 hold
   the same offset plus 8, 16 and 24.  r22 = r60 + 0x80 is referenced
   only by the disabled prefetch inside the loop. */
sub r3, r2, r60
|
|
addi r60, 8, r61
|
|
addi r61, 8, r62
|
|
addi r62, 8, r23
|
|
addi r60, 0x80, r22
|
|
|
|
/* Minimal code size. The extra branches inside the loop don't cost much
|
|
because they overlap with the time spent waiting for prefetches to
|
|
complete. */
|
|
/* Top of the loop; r2 is the running destination pointer. */
1:
|
|
#if 0
|
|
/* TAKum03020 */
|
|
bge/u r2, r6, tr2 ! skip prefetch for last 4 lines
|
|
ldx.q r2, r22, r63 ! prefetch 4 lines hence
|
|
#endif
|
|
2:
|
|
/* Once r2 has reached r7, r2 + 0x40 would lie at or beyond the end of
   the destination page, so the allocation is skipped. */
bge/u r2, r7, tr3 ! skip alloco for last 2 lines
|
|
alloco r2, 0x40 ! alloc destination line 2 lines ahead
|
|
synco ! TAKum03020
|
|
3:
|
|
/* Copy 32 bytes: four 8-byte loads from the source (r2 plus the
   precomputed offsets) into r36-r39, then four 8-byte stores to the
   destination at r2 + 0, 8, 16 and 24. */
ldx.q r2, r60, r36
|
|
ldx.q r2, r61, r37
|
|
ldx.q r2, r62, r38
|
|
ldx.q r2, r23, r39
|
|
st.q r2, 0, r36
|
|
st.q r2, 8, r37
|
|
st.q r2, 16, r38
|
|
st.q r2, 24, r39
|
|
/* Advance the destination pointer by 32 bytes and branch back to
   label 1 (tr1) while r8, the end of the page, is still greater
   than r2. */
addi r2, 32, r2
|
|
bgt/l r8, r2, tr1
|
|
|
|
blink tr0, r63 ! return
|