mirror of
https://github.com/adulau/aha.git
synced 2024-12-30 20:56:23 +00:00
185f3d3890
Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu>
133 lines
2.3 KiB
ArmAsm
133 lines
2.3 KiB
ArmAsm
/* Copyright 2002 Andi Kleen, SuSE Labs */
|
|
|
|
#include <linux/linkage.h>
|
|
#include <asm/dwarf2.h>
|
|
|
|
/*
 * ISO C memset - set a memory block to a byte value.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */

/*
 * memset_c - variant built on the string-store instructions.
 *
 * Stores count/8 quadwords with "rep stosq", then the remaining
 * count%8 bytes with "rep stosb".
 *
 * The quadword count is computed in the full 64-bit rcx so that counts
 * of 4 GiB and above are honoured; a 32-bit move/shift here would
 * silently drop the upper half of rdx.
 *
 * Modifies: rax, rcx, rdx, rsi, rdi, r8, r9, flags.
 * NOTE(review): assumes the direction flag is clear on entry so that
 * rep stos walks upwards - confirm against the calling convention.
 */
	ALIGN
memset_c:
	CFI_STARTPROC
	movq %rdi,%r9		/* r9 = original destination (return value) */
	movl %edx,%r8d
	andl $7,%r8d		/* r8d = count % 8, the byte tail */
	movq %rdx,%rcx
	shrq $3,%rcx		/* rcx = count / 8, all 64 bits kept */
	/* expand byte value */
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	mulq %rsi		/* with rax, clobbers rdx */
	rep stosq		/* store rcx quadwords of the pattern */
	movl %r8d,%ecx		/* 32-bit write zero-extends into rcx */
	rep stosb		/* store the remaining 0..7 bytes */
	movq %r9,%rax
	ret
	CFI_ENDPROC
ENDPROC(memset_c)
|
|
|
|
/*
 * memset / __memset - open-coded variant using plain stores.
 *
 * In:   rdi = destination, sil = fill byte, rdx = count in bytes
 * Out:  rax = original destination
 * Modifies: rcx, rdx, rdi, r8, r9, r10, r11, flags.
 *
 * Register roles inside the function:
 *   rax  fill byte replicated into all eight bytes
 *   r10  original destination, handed back in rax
 *   r11  bytes still to be written (rdx is destroyed by the mul)
 *   r9d  destination & 7, i.e. the misalignment on entry
 *
 * The 64-byte block counter lives in the full 64-bit rcx so that counts
 * of 4 GiB and above are handled; a 32-bit counter would truncate them.
 * The tail paths only look at the low six bits of the count, so 32-bit
 * arithmetic is sufficient there.
 */
ENTRY(memset)
ENTRY(__memset)
	CFI_STARTPROC
	movq %rdi,%r10
	movq %rdx,%r11

	/* expand byte value */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	mul %rcx		/* with rax, clobbers rdx */

	/* align dst */
	movl %edi,%r9d
	andl $7,%r9d
	jnz .Lbad_alignment
	CFI_REMEMBER_STATE
.Lafter_bad_alignment:

	movq %r11,%rcx
	shrq $6,%rcx		/* rcx = number of whole 64-byte blocks */
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	decq %rcx		/* sets ZF for the jnz at the bottom */
	movq %rax,(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)
	movq %rax,24(%rdi)
	movq %rax,32(%rdi)
	movq %rax,40(%rdi)
	movq %rax,48(%rdi)
	movq %rax,56(%rdi)
	leaq 64(%rdi),%rdi	/* lea/mov leave the flags from decq intact */
	jnz .Lloop_64

	/* Handle the tail in loops. Loops should be faster than
	   hard-to-predict jump tables. */
	.p2align 4
.Lhandle_tail:
	movl %r11d,%ecx
	andl $63&(~7),%ecx	/* bytes left over in whole quadwords: 0,8,..,56 */
	jz .Lhandle_7
	shrl $3,%ecx		/* convert to a quadword count */
	.p2align 4
.Lloop_8:
	decl %ecx
	movq %rax,(%rdi)
	leaq 8(%rdi),%rdi
	jnz .Lloop_8

.Lhandle_7:
	movl %r11d,%ecx
	andl $7,%ecx		/* final 0..7 single bytes */
	jz .Lende
	.p2align 4
.Lloop_1:
	decl %ecx
	movb %al,(%rdi)
	leaq 1(%rdi),%rdi
	jnz .Lloop_1

.Lende:
	movq %r10,%rax
	ret

	CFI_RESTORE_STATE
.Lbad_alignment:
	/* Fewer than 8 bytes in total: the byte loop handles it all. */
	cmpq $7,%r11
	jbe .Lhandle_7
	/*
	 * At least 8 bytes: cover the misaligned head with one unaligned
	 * quadword store, then step dst forward by 8 - (dst & 7) to the
	 * next 8-byte boundary. The bytes of that store beyond the boundary
	 * are simply written again by the aligned stores that follow.
	 */
	movq %rax,(%rdi)	/* unaligned store */
	movq $8,%r8
	subq %r9,%r8		/* r8 = 8 - (dst & 7) */
	addq %r8,%rdi
	subq %r8,%r11
	jmp .Lafter_bad_alignment
	/* End-of-function marker; referenced from outside this function. */
.Lfinal:
	CFI_ENDPROC
ENDPROC(memset)
ENDPROC(__memset)
|
|
|
|
/* Some CPUs run faster using the string instructions, and that version
   (memset_c) is also a lot simpler. Use it whenever the CPU advertises
   X86_FEATURE_REP_GOOD. */
|
|
|
|
#include <asm/cpufeature.h>
|
|
|
|
/*
 * Alternative-instruction data for memset.
 *
 * The replacement is a two-byte short jump which, once copied over the
 * start of memset, transfers control to memset_c. The entry emitted into
 * .altinstructions lists, in order: the address of memset, the address
 * of the replacement bytes, the feature X86_FEATURE_REP_GOOD, the size
 * of memset (.Lfinal - memset) and the size of the replacement.
 * NOTE(review): this layout presumably mirrors the kernel's alt_instr
 * structure - confirm against its definition.
 */
	.pushsection .altinstr_replacement,"ax"
.Lmemset_c_jmp:
	/*
	 * 0xeb = jmp <disp8>. The displacement is relative to the end of
	 * the jump as it will sit at memset, hence the distance
	 * memset_c - memset minus the length of the jump itself.
	 */
	.byte 0xeb, (memset_c - memset) - (.Lmemset_c_jmp_end - .Lmemset_c_jmp)
.Lmemset_c_jmp_end:
	.popsection

	.pushsection .altinstructions,"a"
	.align 8
	.quad memset
	.quad .Lmemset_c_jmp
	.byte X86_FEATURE_REP_GOOD
	.byte .Lfinal - memset
	.byte .Lmemset_c_jmp_end - .Lmemset_c_jmp
	.popsection
|