This code tests two methods of speeding up floating-point division; see Agner Fog’s optimizing_assembly.pdf (available here) for the details.
That document also covers integer division. Note that this code requires SSE2 support.
Code: Select all
''=============================================================================
'' Encodings of the precision-control (PC) field of the x87 FPU control word.
'' Each value is already shifted into position (bits 8 and 9 of the word), so
'' FPC_64BITS doubles as the mask that isolates the field.
''-------------------------------------------------------------------------
#define FPC_24BITS &b0000000000
#define FPC_53BITS &b1000000000
#define FPC_64BITS &b1100000000
''-------------------------------------------------------------------------
'' FSETPC(pc)
''
'' Rewrites the PC field of the FPU control word with <pc>, which should be
'' one of the FPC_* values above. All other bits of the control word are
'' read back and reloaded unchanged.
''
'' Sequence: <pc> is placed in AX, the current control word is stored to a
'' scratch variable, the two PC bits are cleared there, AX is OR-ed in, and
'' the result is loaded back into the FPU.
''
'' Side effects: clobbers AX, and declares the scratch variable __fpu__cw__
'' in the invoking scope the first time the macro is expanded there.
''
'' Author's notes: the precision-control setting determines the precision
'' the FPU maintains internally. An initialized FPU uses 64 bits; Windows is
'' said to select 53 bits when it launches an application. A lower setting
'' lets FPU divides and square roots complete in fewer cycles.
''-------------------------------------------------------------------------
#macro FSETPC(pc)
    #ifndef __fpu__cw__
        dim as ushort __fpu__cw__
    #endif
    asm
        mov ax, pc
        fstcw [__fpu__cw__]
        and WORD PTR [__fpu__cw__], ~ FPC_64BITS
        or WORD PTR [__fpu__cw__], ax
        fldcw [__fpu__cw__]
    end asm
#endmacro
''=============================================================================
sub ShowPC
    '' Prints the FPU's current precision-control setting as one line of the
    '' form "PC = <setting>".
    ''
    '' The control word is stored to a local, masked down to the two
    '' precision-control bits with FPC_64BITS, and matched against the three
    '' FPC_* encodings. The one remaining bit pattern (&b0100000000) is not
    '' among the FPC_* values; it is reported as "reserved" so that the line
    '' started by the "PC = " prefix is always terminated.
    dim as ushort fpucw

    asm
        fstcw [fpucw]
    end asm

    '' Keep only the precision-control field.
    fpucw and= FPC_64BITS

    print "PC = ";
    select case fpucw
    case FPC_24BITS
        print "24 bits"
    case FPC_53BITS
        print "53 bits"
    case FPC_64BITS
        print "64 bits"
    case else
        print "reserved"
    end select
end sub
''=============================================================================
'' Demonstration and benchmark driver.
''
'' Part 1 prints the quotient 5.4321 / 1.2345 computed
''   - by an FPU divide at the startup, 53-bit and 24-bit precision settings,
''   - by multiplying with the RCPSS reciprocal approximation,
''   - by multiplying with that reciprocal after one Newton-Raphson step
''     (x1 = 2*x0 - d*x0*x0).
'' Part 2 times ITERATIONS repetitions of each of the same five methods.
''
'' The FPU precision is lowered only around the FPU divides being shown or
'' timed. Every TIMER read, every subtraction of timer values and every
'' PRINT runs with the FPU back at 64-bit precision: with the precision left
'' at 24 bits the timer arithmetic is rounded to a 24-bit significand, which
'' quantizes the reported times (e.g. to multiples of 1/128 second). The SSE
'' sections are likewise run with the FPU restored to 64-bit precision.
dim as double rd,nd=5.4321,dd=1.2345,t,elapsed
dim as single r,n=5.4321,d=1.2345
const ITERATIONS = 10000000

''-------------------------------------------------------------------------
'' Part 1: show the results.
''-------------------------------------------------------------------------

'' Precision setting in effect at program start.
ShowPC
rd = nd/dd
print using "#.###############";rd
print

FSETPC(FPC_53BITS)
ShowPC
rd = nd/dd
print using "#.###############";rd
print

FSETPC(FPC_24BITS)
ShowPC
rd = nd/dd
'' Restore full precision before formatting the result and before the
'' SSE demonstrations and the pause that follow.
FSETPC(FPC_64BITS)
print using "#.###############";rd
print

print "Using multiply with 12-bit precision reciprocal:"
'' r = n * rcp(d)
asm
    movd xmm1, [d]
    rcpss xmm0, xmm1
    mulss xmm0, [n]
    movd [r], xmm0
end asm
print using "#.###############";r
print

print "Using multiply with reciprocal extended to 23-bit precision:"
'' x0 = rcp(d); x1 = 2*x0 - d*x0*x0; r = n * x1
asm
    movd xmm1, [d]
    rcpss xmm0, xmm1
    mulss xmm1, xmm0
    mulss xmm1, xmm0
    addss xmm0, xmm0
    subss xmm0, xmm1
    mulss xmm0, [n]
    movd [r], xmm0
end asm
print using "#.###############";r
print
print

sleep 5000

''-------------------------------------------------------------------------
'' Part 2: timings.
'' Each FPU test follows the same pattern so the (tiny) cost of switching
'' the precision is identical in all three measurements:
''   select precision, report it, switch to 64 bits to read the timer,
''   select the precision under test, run the loop, switch back to 64 bits,
''   read the timer again and print.
''-------------------------------------------------------------------------

FSETPC(FPC_64BITS)
ShowPC
FSETPC(FPC_64BITS)
t = timer
FSETPC(FPC_64BITS)
for i as integer = 1 to ITERATIONS
    rd = nd/dd
next
FSETPC(FPC_64BITS)
elapsed = timer-t
print elapsed;" seconds"
print

FSETPC(FPC_53BITS)
ShowPC
FSETPC(FPC_64BITS)
t = timer
FSETPC(FPC_53BITS)
for i as integer = 1 to ITERATIONS
    rd = nd/dd
next
FSETPC(FPC_64BITS)
elapsed = timer-t
print elapsed;" seconds"
print

FSETPC(FPC_24BITS)
ShowPC
FSETPC(FPC_64BITS)
t = timer
FSETPC(FPC_24BITS)
for i as integer = 1 to ITERATIONS
    rd = nd/dd
next
FSETPC(FPC_64BITS)
elapsed = timer-t
print elapsed;" seconds"
print

'' The FPU is at 64-bit precision from here on; the SSE loops below do not
'' change it.
print "Using multiply with 12-bit precision reciprocal:"
t = timer
for i as integer = 1 to ITERATIONS
    asm
        movd xmm1, [d]
        rcpss xmm0, xmm1
        mulss xmm0, [n]
        movd [r], xmm0
    end asm
next
elapsed = timer-t
print elapsed;" seconds"
print

print "Using multiply with reciprocal extended to 23-bit precision:"
t = timer
for i as integer = 1 to ITERATIONS
    asm
        movd xmm1, [d]
        rcpss xmm0, xmm1
        mulss xmm1, xmm0
        mulss xmm1, xmm0
        addss xmm0, xmm0
        subss xmm0, xmm1
        mulss xmm0, [n]
        movd [r], xmm0
    end asm
next
elapsed = timer-t
print elapsed;" seconds"

'' Wait for a key press before the console closes.
sleep
Running on a P4 Northwood:
Code: Select all
PC = 64 bits
4.400243013365736
PC = 53 bits
4.400243013365736
PC = 24 bits
4.400242805480957
Using multiply with 12-bit precision reciprocal:
4.399655818939209
Using multiply with reciprocal extended to 23-bit precision:
4.400242805480957
PC = 64 bits
0.1631717336153855 seconds
PC = 53 bits
0.1302028878562851 seconds
PC = 24 bits
0.078125 seconds
Using multiply with 12-bit precision reciprocal:
0.03125 seconds
Using multiply with reciprocal extended to 23-bit precision:
0.0859375 seconds