My _hrgb_ code above is wrong, and for the hrgb_ code I could not find any instruction sequence that avoided partial-register accesses and seemed likely to be significantly faster than the original. The code below is my attempt to verify that the original code produces the correct result.
Code: Select all
''=======================================================================================
''
'' hrgb_( red, green, blue, rgb16 )
''
'' Packs three 8-bit colour channels into one 16-bit 5-6-5 value and stores it
'' in rgb16. Only the low byte of each of red/green/blue is read (mov al, [..]),
'' and only the low 16 bits of EAX are written back (mov [rgb16], ax).
''
'' How the sequence works (bit numbers refer to EAX):
''   mov al,[red]   / shl eax,5  -> red occupies bits 5..12
''   mov al,[green]              -> overwrites bits 0..7, dropping red's 3 low bits
''   shl eax,6                   -> red's top 5 bits at 14..18, green at 6..13
''   mov al,[blue]               -> overwrites bits 0..7, dropping green's 2 low bits
''   shr eax,3                   -> blue top 5 bits at 0..4, green top 6 at 5..10,
''                                  red top 5 at 11..15
''
'' EAX is overwritten. Bits above 15 of EAX are leftovers and are not stored.
'' Every "mov al" is a partial-register write followed by a full-width shift.
''
#MACRO hrgb_(red,green,blue,rgb16)
ASM
mov al, [red]
Shl eax, 5
mov al, [green]
Shl eax, 6
mov al, [blue]
Shr eax, 3
mov [rgb16], ax
End ASM
#ENDMACRO
''=======================================================================================
''
'' rgb16( red, green, blue ) as ushort
''
'' Naked (no compiler prologue/epilogue) version of the 8-8-8 to 5-6-5 packing.
'' The three arguments are read as single bytes from the stack slots at
'' [esp+4], [esp+8] and [esp+12]; "ret 12" removes those 12 bytes of arguments
'' on return. The packed value is left in AX; bits 16 and up of EAX are
'' leftovers from the shifts and are not cleared.
''
'' NOTE(review): the slot-to-parameter mapping (red = [esp+4]) and the
'' callee-cleans-stack return assume the 32-bit stdcall convention - confirm
'' for the target platform.
''
function rgb16 naked( byval red as uinteger, _
byval green as uinteger, _
byval blue as uinteger ) as ushort
asm
mov al, [esp+4] '' AL = low byte of first argument slot
shl eax, 5 '' first channel now in bits 5..12
mov al, [esp+8] '' AL = second slot; overwrites the first channel's 3 low bits
shl eax, 6 '' first channel top 5 bits at 14..18, second channel at 6..13
mov al, [esp+12] '' AL = third slot; overwrites the second channel's 2 low bits
shr eax, 3 '' AX = 5 bits : 6 bits : 5 bits
ret 12 '' return and pop the three 4-byte arguments
end asm
end function
''=======================================================================================
'' Channel extractors for a 32-bit colour value laid out as &hxxRRGGBB.
'' Each one converts its argument with CUInt, shifts the wanted channel down
'' to bit 0 and masks it to 8 bits.
#define RGB_R( c ) ( ( CUInt( c ) Shr 16 ) And &hFF )
#define RGB_G( c ) ( ( CUInt( c ) Shr 8 ) And &hFF )
#define RGB_B( c ) ( CUInt( c ) And &hFF )
''=======================================================================================
''--------------------------------------------------------------------------------------
'' This ported from code I found here:
'' http://crpppc19.epfl.ch/doc/ffmpeg-doc/html/rgb2rgb__template_8c_source.html#l00188
''--------------------------------------------------------------------------------------
'' RGB32to16( rgb32 )
''
'' Converts a 32-bit &hxxRRGGBB colour to a 16-bit 5-6-5 colour:
''   blue  : bits 3..7   -> bits 0..4
''   green : bits 10..15 -> bits 5..10
''   red   : bits 19..23 -> bits 11..15
''
'' Every use of the argument is parenthesised so that an expression argument
'' (for example "a or b") is masked as a whole instead of being split by the
'' higher precedence of AND. The three fields do not overlap, so they are
'' combined with OR. The argument is evaluated three times; avoid passing an
'' expression with side effects.
#define RGB32to16(rgb32) ( ( ( (rgb32) and &hff ) shr 3 ) or _
                           ( ( (rgb32) and &hfc00 ) shr 5 ) or _
                           ( ( (rgb32) and &hf80000 ) shr 8 ) )
''=======================================================================================
''
'' Verification harness.
''
'' Builds one 32-bit source image filled with RGB(127,127,127), converts it to
'' 16-bit pixels three ways (hrgb_ macro, rgb16 function, ImageConvertRow),
'' asserts that the two asm versions give &b0111101111101111 for every pixel,
'' prints the intermediate values and shows the three results side by side.
''
dim as any ptr i1, i2, i3, i4       '' image buffers: source + three results
dim as ushort c16                   '' packed 16-bit result
dim as uinteger c, r, g, b          '' source pixel and its channels
dim as uinteger ptr p1              '' pixel data of i1 (32-bit pixels)
dim as ushort ptr p2, p3, p4        '' pixel data of i2..i4 (16-bit pixels)

screenres 640,480,16
width 80,30

i1 = imagecreate(200,200,rgb(127,127,127),32)
i2 = imagecreate(200,200,0)
i3 = imagecreate(200,200,0)
i4 = imagecreate(200,200,0)

imageinfo(i1,,,,,p1)
imageinfo(i2,,,,,p2)
imageinfo(i3,,,,,p3)
imageinfo(i4,,,,,p4)

'' Single-pixel check: print the reference conversion, then both asm versions.
c = *p1
print bin(RGB32to16(c),16)
r = RGB_R(c)
g = RGB_G(c)
b = RGB_B(c)
hrgb_( r, g, b, c16 )
print bin(c16,16)
c16 = rgb16( r, g, b )
print bin(c16,16)
print
print bin(r,8),bin(g,8),bin(b,8)
print

'' Whole-image check.
'' NOTE(review): the y*200+x indexing assumes rows are exactly 200 pixels
'' apart, i.e. that the pitch reported by ImageInfo has no padding - confirm.
for y as integer = 0 to 199
    for x as integer = 0 to 199
        c = p1[y*200+x]
        r = RGB_R(c)
        assert( r = &b01111111 )
        g = RGB_G(c)
        assert( g = &b01111111 )
        b = RGB_B(c)
        assert( b = &b01111111 )

        hrgb_(r,g,b,c16)
        assert( c16 = &b0111101111101111 )
        p2[y*200+x] = c16

        c16 = rgb16( r, g, b)
        assert( c16 = &b0111101111101111 )
        p3[y*200+x] = c16
    next
next

'' Reference conversion done by the graphics library, one row at a time.
for y as integer = 0 to 199
    ImageConvertRow( @p1[y*200], 32, @p4[y*200], 16, 200 )
next

print
print bin(*p4,16)

put(10,250),i2
put(220,250),i3
put(430,250),i4

sleep

'' Release the image buffers obtained from ImageCreate.
imagedestroy( i1 )
imagedestroy( i2 )
imagedestroy( i3 )
imagedestroy( i4 )
This cycle count code shows that passing ubytes instead of uintegers slows the code down significantly.
Code: Select all
''===================================================================================
#include "counter.bas"
''===================================================================================
''
'' The newer cycle count macros are available here:
''
'' http://www.freebasic.net/forum/viewtopic.php?f=7&t=20003
''
''===================================================================================
''
'' hrgb_( red, green, blue, rgb16 )
''
'' Inline 8-8-8 to 5-6-5 packing; this is the sequence timed in the second
'' counter_begin/counter_end pair of the benchmark loop in this listing.
'' Reads the low byte of red, green and blue, builds the packed value in EAX
'' and stores AX to rgb16:
''   after "shl eax,5" + "mov al,[green]" red's 3 low bits are overwritten,
''   after "shl eax,6" + "mov al,[blue]"  green's 2 low bits are overwritten,
''   after "shr eax,3" AX = red(5 bits) : green(6 bits) : blue(5 bits).
'' EAX is overwritten; its bits above 15 are leftovers and are not stored.
''
#MACRO hrgb_(red,green,blue,rgb16)
ASM
mov al, [red]
Shl eax, 5
mov al, [green]
Shl eax, 6
mov al, [blue]
Shr eax, 3
mov [rgb16], ax
End ASM
#ENDMACRO
''===================================================================================
''
'' rgb16ub( red, green, blue ) as ushort
''
'' Same packing sequence as rgb16, but declared with ubyte parameters so that
'' the cost of the caller's argument passing can be compared. The body still
'' reads one byte from each of the 4-byte-spaced stack slots [esp+4], [esp+8],
'' [esp+12] and pops 12 bytes of arguments with "ret 12". The packed 5-6-5
'' value is left in AX; higher bits of EAX are not cleared.
''
'' NOTE(review): slot order and callee stack cleanup assume 32-bit stdcall -
'' confirm for the target platform.
''
function rgb16ub naked( byval red as ubyte, _
byval green as ubyte, _
byval blue as ubyte ) as ushort
asm
mov al, [esp+4] '' AL = low byte of first argument slot
shl eax, 5
mov al, [esp+8] '' overwrites the first channel's 3 low bits
shl eax, 6
mov al, [esp+12] '' overwrites the second channel's 2 low bits
shr eax, 3 '' AX = 5 bits : 6 bits : 5 bits
ret 12 '' return and pop the three argument slots
end asm
end function
''
'' rgb16( red, green, blue ) as ushort
''
'' Naked 8-8-8 to 5-6-5 packing with uinteger parameters. Only the low byte of
'' each stack slot ([esp+4], [esp+8], [esp+12]) is read. "ret 12" pops the
'' three arguments. The packed value is left in AX; higher bits of EAX are
'' not cleared.
''
'' NOTE(review): slot order and callee stack cleanup assume 32-bit stdcall -
'' confirm for the target platform.
''
function rgb16 naked( byval red as uinteger, _
byval green as uinteger, _
byval blue as uinteger ) as ushort
asm
mov al, [esp+4] '' AL = low byte of first argument slot
shl eax, 5
mov al, [esp+8] '' overwrites the first channel's 3 low bits
shl eax, 6
mov al, [esp+12] '' overwrites the second channel's 2 low bits
shr eax, 3 '' AX = 5 bits : 6 bits : 5 bits
ret 12 '' return and pop the three argument slots
end asm
end function
''===================================================================================
''
'' Cycle-count benchmark.
''
'' Each pass of the outer loop times five regions and prints one cycle count
'' per region, in this order:
''   1. empty region
''   2. hrgb_ macro (inline asm)
''   3. rgb16ub called by the compiler (ubyte parameters)
''   4. rgb16ub called by hand-written asm that loads each argument with movzx
''   5. rgb16 called by the compiler (uinteger parameters)
''
'' NOTE(review): counter_begin / counter_end / counter_cycles come from
'' counter.bas; the first argument is presumably the iteration count - confirm.
''
dim as uinteger red, green, blue
dim as ushort c

SetProcessAffinityMask( GetCurrentProcess(), 1)
sleep 5000

for i as integer = 1 to 4

    '' 1. empty region
    counter_begin( 10000000, REALTIME_PRIORITY_CLASS, THREAD_PRIORITY_TIME_CRITICAL )
    counter_end()
    print counter_cycles;" cycles"

    '' 2. inline macro
    counter_begin( 10000000, REALTIME_PRIORITY_CLASS, THREAD_PRIORITY_TIME_CRITICAL )
        hrgb_( red, green, blue, c )
    counter_end()
    print counter_cycles;" cycles"

    '' 3. compiler-generated call, ubyte parameters
    counter_begin( 10000000, REALTIME_PRIORITY_CLASS, THREAD_PRIORITY_TIME_CRITICAL )
        c = rgb16ub( red, green, blue )
    counter_end()
    print counter_cycles;" cycles"

    '' 4. hand-written call: each argument byte is zero-extended into EAX with
    ''    movzx before the push, so the full register is written every time.
    ''    Locals are referenced by name instead of fixed [ebp-N] offsets so the
    ''    block keeps reading the right variables if the frame layout changes.
    ''    Arguments are pushed last-to-first (blue, green, red).
    counter_begin( 10000000, REALTIME_PRIORITY_CLASS, THREAD_PRIORITY_TIME_CRITICAL )
        asm
            movzx eax, byte ptr [blue]
            push eax
            movzx eax, byte ptr [green]
            push eax
            movzx eax, byte ptr [red]
            push eax
            call _RGB16UB@12
            mov word ptr [c], ax
        end asm
    counter_end()
    print counter_cycles;" cycles"

    '' 5. compiler-generated call, uinteger parameters
    counter_begin( 10000000, REALTIME_PRIORITY_CLASS, THREAD_PRIORITY_TIME_CRITICAL )
        c = rgb16( red, green, blue )
    counter_end()
    print counter_cycles;" cycles"

    print
next

sleep
The problem appears to be the three additional partial register accesses, but I have no idea why they have such a large effect:
Code: Select all
mov al, byte ptr [ebp-16]
push eax
mov al, byte ptr [ebp-12]
push eax
mov al, byte ptr [ebp-8]
push eax
call _RGB16UB@12
Edit:
After taking a break I do have some idea of why the effect is so large: the partial register accesses change only the lower 8 bits of EAX. According to Agner Fog’s optimization manuals, on a P3 each instance causes a partial register stall, with a delay of 5-6 clock cycles before the whole register can be pushed. On a P4 the nature of the problem is different, but it still involves a delay.
So to test this I did as the microarchitecture manual recommends and substituted a movzx instruction, which zeros the upper 24 bits of the register; see the cycle-count code above and the results below. And before someone points it out: there is no way 15 instructions can execute in 0 cycles, so there must be some sort of non-obvious effect acting here. Fortunately, this and similar P4 anomalies are no longer worth working around.
This optimization apparently never made it into the asm emitter, but AFAICT it should be just a matter of recognizing the problem areas and changing the instruction mnemonic.
Running on a P3:
Code: Select all
0 cycles
23 cycles
54 cycles
32 cycles
34 cycles
0 cycles
23 cycles
54 cycles
32 cycles
34 cycles
0 cycles
23 cycles
54 cycles
32 cycles
34 cycles
0 cycles
23 cycles
54 cycles
32 cycles
34 cycles
Running on a P4 (Northwood):
Code: Select all
0 cycles
5 cycles
21 cycles
3 cycles
5 cycles
0 cycles
3 cycles
22 cycles
0 cycles
6 cycles
0 cycles
3 cycles
22 cycles
0 cycles
5 cycles
0 cycles
3 cycles
21 cycles
0 cycles
5 cycles
I corrected the problem in the test code by passing uintegers.