However, if you use -gen gcc they may be a bit faster, though not necessarily: it depends on the -O level. By the way, I think that if they were written in FB rather than asm, gcc could optimize them even more.
Note: the first two functions are the originals and the bottom two are the tweaked versions; a benchmark follows them.
Code: Select all
Function _ASM_Sin6th(fX As Double) As Double
'By Eukalyptus
' Original (non-naked) version: approximates the sine of fX (radians) using
' 32-bit integer arithmetic and returns the result as a Double.
'
' Outline:
'   1. Convert fX into a 32-bit phase in which 2^32 corresponds to one full
'      period (2*pi), so wrap-around of the register performs the range
'      reduction.
'   2. Evaluate a fixed-point expression built from the square of a shifted
'      copy of the phase (three imul steps, keeping only the high 32 bits
'      of each product). Presumably the "6th" in the name refers to the
'      order of the underlying polynomial - the coefficients are not
'      derived here.
'   3. Convert the integer result back to a Double and scale it.
'
' Registers written: eax, ebx, ecx, edx, xmm0.
' NOTE(review): ebx is modified without an explicit save inside this block;
' this relies on how the compiler wraps Asm blocks - confirm for the backend
' in use.
Asm
' Jump over the constants that are embedded in the instruction stream.
jmp 0f
' 1: equals 2^32 / (2*pi) - radians to 32-bit phase.
1: .Double 683565275.57643158
' 2: final scale factor from the integer result to the returned Double
'    (negative, so it also flips the sign of the integer result).
2: .Double -0.0000000061763971109087229
' 3: equals 1.5 * 2^52 - adding it leaves the value, rounded to an integer,
'    in the low bits of the Double's mantissa.
3: .Double 6755399441055744.0
0:
movq xmm0, [fX]
' xmm0 = fX * 2^32/(2*pi)
mulsd xmm0, [1b]
' Round to integer via the 1.5*2^52 trick.
addsd xmm0, [3b]
' ebx = low 32 bits = phase (wraps modulo 2^32).
movd ebx, xmm0
' eax = (2*phase + 0x80000000) >> 2 (arithmetic shift).
lea eax, [ebx*2+0x80000000]
sar eax, 2
' edx:eax = eax * eax (signed); only the high half (edx) is used below.
imul eax
' ebx = 0 or -1 depending on the sign bit of the phase.
sar ebx, 31
' eax = 2*edx - 0x70000000
lea eax, [edx*2-0x70000000]
' ecx = 9*edx - 0x24000000
lea ecx, [edx*8+edx-0x24000000]
' edx:eax = eax * edx; high half kept in edx.
imul edx
' Bitwise-invert ecx when ebx = -1 (applies the sign selected above).
Xor ecx, ebx
' eax = 9*edx + 0x44A00000
lea eax, [edx*8+edx+0x44A00000]
' edx:eax = eax * ecx; high half (edx) is the integer result.
imul ecx
' Convert the integer result to Double and apply the scale factor.
cvtsi2sd xmm0, edx
mulsd xmm0, [2b]
' Store into the function result.
movq [Function], xmm0
End Asm
End Function
Function _ASM_Cos6th(fX As Double) As Double
'By Eukalyptus
' Original (non-naked) version: approximates the cosine of fX (radians)
' using 32-bit integer arithmetic and returns the result as a Double.
'
' The instruction sequence is the same as in _ASM_Sin6th except for one
' extra step: 0x40000000 (a quarter of 2^32, i.e. a quarter period) is
' added to the phase before evaluation, turning the sine into a cosine.
'
' Registers written: eax, ebx, ecx, edx, xmm0.
' NOTE(review): ebx is modified without an explicit save inside this block;
' this relies on how the compiler wraps Asm blocks - confirm for the backend
' in use.
Asm
' Jump over the constants that are embedded in the instruction stream.
jmp 0f
' 1: equals 2^32 / (2*pi) - radians to 32-bit phase.
1: .Double 683565275.57643158
' 2: final scale factor from the integer result to the returned Double
'    (negative, so it also flips the sign of the integer result).
2: .Double -0.0000000061763971109087229
' 3: equals 1.5 * 2^52 - adding it leaves the value, rounded to an integer,
'    in the low bits of the Double's mantissa.
3: .Double 6755399441055744.0
0:
movq xmm0, [fX]
' xmm0 = fX * 2^32/(2*pi)
mulsd xmm0, [1b]
' Round to integer via the 1.5*2^52 trick.
addsd xmm0, [3b]
' ebx = low 32 bits = phase (wraps modulo 2^32).
movd ebx, xmm0
' Advance the phase by a quarter period (0x40000000 = 2^32 / 4).
Add ebx, 0x40000000 'SinToCos
' eax = (2*phase + 0x80000000) >> 2 (arithmetic shift).
lea eax, [ebx*2+0x80000000]
sar eax, 2
' edx:eax = eax * eax (signed); only the high half (edx) is used below.
imul eax
' ebx = 0 or -1 depending on the sign bit of the phase.
sar ebx, 31
' eax = 2*edx - 0x70000000
lea eax, [edx*2-0x70000000]
' ecx = 9*edx - 0x24000000
lea ecx, [edx*8+edx-0x24000000]
' edx:eax = eax * edx; high half kept in edx.
imul edx
' Bitwise-invert ecx when ebx = -1 (applies the sign selected above).
Xor ecx, ebx
' eax = 9*edx + 0x44A00000
lea eax, [edx*8+edx+0x44A00000]
' edx:eax = eax * ecx; high half (edx) is the integer result.
imul ecx
' Convert the integer result to Double and apply the scale factor.
cvtsi2sd xmm0, edx
mulsd xmm0, [2b]
' Store into the function result.
movq [Function], xmm0
End Asm
End Function
Function ASM_Sin6th naked cdecl(byval fX As Double) As Double
'By Eukalyptus
' Naked (no compiler prologue/epilogue) fast sine approximation.
'
' Input : fX in radians.
'         32-bit cdecl : on the stack at [esp+4] on entry.
'         64-bit       : in xmm0.
' Output: approximate sin(fX).
'         32-bit cdecl : in st(0).
'         64-bit       : in xmm0.
' Writes: eax, ecx, edx, xmm0 (all caller-saved); ebx/rbx is used as well
'         and is therefore saved and restored. ecx/rcx is caller-saved
'         under cdecl, SysV AMD64 and Win64, so it is not preserved.
'
' Method: fX is scaled so that one period (2*pi) maps to 2^32, rounded to a
' 32-bit integer phase (register wrap-around does the range reduction), then
' a fixed-point expression in the squared, shifted phase is evaluated with
' three imul steps (only the high 32 bits of each product are kept).
' Presumably the "6th" in the name is the order of the polynomial - the
' coefficients are not derived here.
'
' NOTE(review): in 64-bit builds [1f]/[2f]/[3f] assemble as absolute
' addresses; if the link fails for position-independent or high-image-base
' targets, switch these operands to [rip+1f] etc. - confirm with the
' toolchain in use.
Asm
#ifndef __FB_64BIT__
    push ebx                        ' callee-saved under cdecl
    movq xmm0, [esp+8]              ' fX sits above saved ebx + return address
#else
    push rbx                        ' callee-saved; fX is already in xmm0
#endif
    mulsd xmm0, [1f]                ' xmm0 = fX * 2^32/(2*pi)
    addsd xmm0, [3f]                ' round to integer (1.5*2^52 trick)
    movd ebx, xmm0                  ' ebx = 32-bit phase, wraps modulo 2^32
    lea eax, [ebx*2+0x80000000]
    sar eax, 2                      ' eax = (2*phase + 0x80000000) >> 2
    imul eax                        ' edx = high 32 bits of eax*eax
    sar ebx, 31                     ' ebx = 0 or -1 from the phase sign bit
    lea eax, [edx*2-0x70000000]     ' eax = 2*edx - 0x70000000
    lea ecx, [edx*8+edx-0x24000000] ' ecx = 9*edx - 0x24000000
    imul edx                        ' edx = high 32 bits of eax*edx
    xor ecx, ebx                    ' invert ecx when ebx = -1 (sign select)
    lea eax, [edx*8+edx+0x44A00000] ' eax = 9*edx + 0x44A00000
    imul ecx                        ' edx = high 32 bits of eax*ecx = result
    cvtsi2sd xmm0, edx              ' integer result -> Double
    mulsd xmm0, [2f]                ' apply final scale factor
#ifndef __FB_64BIT__
    pop ebx
    ' 32-bit cdecl returns Doubles in st(0). Bounce the value through the
    ' incoming argument slot, which belongs to this function, instead of
    ' writing below esp: there is no red zone on 32-bit x86, so memory
    ' under the stack pointer may be overwritten asynchronously.
    movq [esp+4], xmm0
    fld qword ptr [esp+4]
#else
    pop rbx
#endif
    ret
' Constants live after the ret so they are never executed.
1: .Double 683565275.57643158               ' 2^32 / (2*pi)
2: .Double -0.0000000061763971109087229     ' integer result -> Double scale
3: .Double 6755399441055744.0               ' 1.5 * 2^52 (rounding constant)
End Asm
End Function
Function ASM_Cos6th naked cdecl(byval fX As Double) As Double
'By Eukalyptus
' Naked (no compiler prologue/epilogue) fast cosine approximation.
'
' Input : fX in radians.
'         32-bit cdecl : on the stack at [esp+4] on entry.
'         64-bit       : in xmm0.
' Output: approximate cos(fX).
'         32-bit cdecl : in st(0).
'         64-bit       : in xmm0.
' Writes: eax, ecx, edx, xmm0 (all caller-saved); ebx/rbx is used as well
'         and is therefore saved and restored. ecx/rcx is caller-saved
'         under cdecl, SysV AMD64 and Win64, so it is not preserved.
'
' Method: fX is scaled so that one period (2*pi) maps to 2^32 and rounded to
' a 32-bit integer phase (register wrap-around does the range reduction).
' A quarter period (0x40000000 = 2^32 / 4) is added to turn the sine
' evaluation into a cosine, then a fixed-point expression in the squared,
' shifted phase is evaluated with three imul steps (only the high 32 bits
' of each product are kept). Presumably the "6th" in the name is the order
' of the polynomial - the coefficients are not derived here.
'
' NOTE(review): in 64-bit builds [1f]/[2f]/[3f] assemble as absolute
' addresses; if the link fails for position-independent or high-image-base
' targets, switch these operands to [rip+1f] etc. - confirm with the
' toolchain in use.
Asm
#ifndef __FB_64BIT__
    push ebx                        ' callee-saved under cdecl
    movq xmm0, [esp+8]              ' fX sits above saved ebx + return address
#else
    push rbx                        ' callee-saved; fX is already in xmm0
#endif
    mulsd xmm0, [1f]                ' xmm0 = fX * 2^32/(2*pi)
    addsd xmm0, [3f]                ' round to integer (1.5*2^52 trick)
    movd ebx, xmm0                  ' ebx = 32-bit phase, wraps modulo 2^32
    add ebx, 0x40000000             ' SinToCos: advance by a quarter period
    lea eax, [ebx*2+0x80000000]
    sar eax, 2                      ' eax = (2*phase + 0x80000000) >> 2
    imul eax                        ' edx = high 32 bits of eax*eax
    sar ebx, 31                     ' ebx = 0 or -1 from the phase sign bit
    lea eax, [edx*2-0x70000000]     ' eax = 2*edx - 0x70000000
    lea ecx, [edx*8+edx-0x24000000] ' ecx = 9*edx - 0x24000000
    imul edx                        ' edx = high 32 bits of eax*edx
    xor ecx, ebx                    ' invert ecx when ebx = -1 (sign select)
    lea eax, [edx*8+edx+0x44A00000] ' eax = 9*edx + 0x44A00000
    imul ecx                        ' edx = high 32 bits of eax*ecx = result
    cvtsi2sd xmm0, edx              ' integer result -> Double
    mulsd xmm0, [2f]                ' apply final scale factor
#ifndef __FB_64BIT__
    pop ebx
    ' 32-bit cdecl returns Doubles in st(0). Bounce the value through the
    ' incoming argument slot, which belongs to this function, instead of
    ' writing below esp: there is no red zone on 32-bit x86, so memory
    ' under the stack pointer may be overwritten asynchronously.
    movq [esp+4], xmm0
    fld qword ptr [esp+4]
#else
    pop rbx
#endif
    ret
' Constants live after the ret so they are never executed.
1: .Double 683565275.57643158               ' 2^32 / (2*pi)
2: .Double -0.0000000061763971109087229     ' integer result -> Double scale
3: .Double 6755399441055744.0               ' 1.5 * 2^52 (rounding constant)
End Asm
End Function
' Benchmark driver: times 100 million calls of each approximation and prints
' elapsed seconds, the accumulated sum, and the reference sum as text.
' The tweaked (naked) versions run first, then the originals.

' Expands to one timed loop over trigFn; uses the module-level s, t, x, k.
#macro TimeApproximation(trigFn, reference)
    s = 0
    t = timer
    for k = 1 to 100000000
        x = k
        s += trigFn(x)
    next
    t = timer - t
    Print t, s, reference
#endmacro

Print "just now starting"

dim as double s, t
dim as double x
dim as integer k

' Tweaked naked versions.
TimeApproximation(ASM_Sin6th, "29.19713395039346")
TimeApproximation(ASM_Cos6th, "9.347239003491984")

Print "------------------------"

' Original versions.
TimeApproximation(_ASM_Sin6th, "29.19713395039346")
TimeApproximation(_ASM_Cos6th, "9.347239003491984")