Was MOVDQA any faster than MOVAPS?1000101 wrote:I couldn't help playing with the asm code in this thread.
ASM assistance, if you please.
Not really, but I'm on AMD and the results may (will) differ on an Intel. I've added console output to the code for easier viewing, as well as a definition to skip the demo and just compare the routines (NO_DEMO). MichaelW wrote: Was MOVDQA any faster than MOVAPS? 1000101 wrote: I couldn't help playing with the asm code in this thread.
Code: Select all
#Include Once "counter.bas"
#Include Once "vbcompat.bi"
/'
Window size to use
'/
#Define SCREEN_WIDTH 512
#Define SCREEN_HEIGHT 512
/'
Allow/Disallow scrollers
'/
#Define USE_ASM_386
#Define USE_ASM_MMX
#Define USE_ASM_SSE
#Define USE_ASM_SSE2
/'
Allow/Disallow AMD optimizations
'/
#Define IS_AMD
/'
Just benchmark the code, don't demo it
'/
#Define NO_DEMO
'=========================================================================
'' scrollup0: pure-FreeBASIC reference scroller (the benchmark baseline).
''   pfb - 32-bit pixel buffer, SCREEN_WIDTH x SCREEN_HEIGHT pixels, row-major
''   prb - scratch buffer holding at least SCREEN_WIDTH pixels
'' Moves every row up by one and wraps the old top row to the bottom.
'' The per-pixel "row * SCREEN_WIDTH" multiplies are kept on purpose: the
'' assembly variants are compared against exactly this cost.
Sub scrollup0 ( ByVal pfb As UInteger Ptr, ByVal prb As UInteger Ptr )
    Const LAST_COL = SCREEN_WIDTH - 1
    Const LAST_ROW = SCREEN_HEIGHT - 1

    '' Save the top row before it is overwritten.
    For col As Integer = 0 To LAST_COL
        prb[col] = pfb[col]
    Next col

    '' Pull each row from the row below it, top to bottom.
    For row As Integer = 0 To LAST_ROW - 1
        For col As Integer = 0 To LAST_COL
            pfb[col+row*SCREEN_WIDTH] = pfb[col+(row+1)*SCREEN_WIDTH]
        Next col
    Next row

    '' The saved top row becomes the new bottom row.
    For col As Integer = 0 To LAST_COL
        pfb[col+LAST_ROW*SCREEN_WIDTH] = prb[col]
    Next col
End Sub
'=========================================================================
'' scrollup1: inline-assembly (plain 386 integer instructions) version of
'' scrollup0. Scrolls the 32-bit pixel buffer at pfb up by one row and wraps
'' the old top row to the bottom, using prb as a one-row scratch buffer.
''   pfb - pixel buffer, SCREEN_WIDTH x SCREEN_HEIGHT dwords
''   prb - scratch buffer, at least SCREEN_WIDTH dwords
'' Registers written: eax, ebx, ecx, edx, esi, edi (ebp is saved/restored
'' explicitly below).
'' NOTE(review): relies on fbc saving ebx/esi/edi for procedures that
'' contain an Asm block - confirm against the FreeBASIC Asm documentation.
Sub scrollup1 ( ByVal pfb As UInteger Ptr, ByVal prb As UInteger Ptr )
asm
'' Phase 1: prb[i] = pfb[i], i counting down from SCREEN_WIDTH-1 to 0.
'' dec sets the sign flag once ecx goes below zero, which ends the loop.
mov esi, [pfb]
mov edi, [prb]
mov ecx, ( SCREEN_WIDTH - 1 )
0:
mov eax, [esi+ecx*4]
mov [edi+ecx*4], eax
dec ecx
jns 0b
'---------------------------------------------------------------
'' Compared to the corresponding code in scrollup0, most of the
'' speed advantage for this code is from the elimination of two
'' imul eax, SCREEN_WIDTH instructions in the inner loop.
''
'' Doing this efficiently required one more register than is
'' normally available, so the code uses EBP. Note that EBP must
'' be preserved around this use because it is otherwise needed
'' to access the procedure parameters and variables allocated
'' from the stack, and to restore the entry value of ESP in the
'' epilogue code.
'---------------------------------------------------------------
'' Phase 2: copy rows 1..SCREEN_HEIGHT-1 onto rows 0..SCREEN_HEIGHT-2.
'' edi = dword index of the first pixel of the source row,
'' ebp = column, ebx = edi + ebp = source pixel index; the destination
'' index is the source index minus SCREEN_WIDTH (one row up).
'' No [pfb]/[prb] access may happen between push ebp and pop ebp.
push ebp
mov edx, ( SCREEN_HEIGHT - 1 ) '' y counter
Xor edi, edi '' y component of address
0:
add edi, SCREEN_WIDTH
mov ecx, SCREEN_WIDTH '' x counter
Xor ebp, ebp '' x component of address
1:
mov ebx, ebp
add ebx, edi
mov eax, [esi+ebx*4]
Sub ebx, SCREEN_WIDTH
inc ebp
mov [esi+ebx*4], eax
dec ecx
jnz 1b
dec edx
jnz 0b
pop ebp
'' Phase 3: write the saved top row into the last row:
'' pfb[(SCREEN_HEIGHT-1)*SCREEN_WIDTH + i] = prb[i].
mov esi, [prb]
mov edi, [pfb]
mov ecx, ( SCREEN_WIDTH - 1 )
0:
mov eax, [esi+ecx*4]
mov ebx, ecx
add ebx, ( SCREEN_HEIGHT - 1 )*SCREEN_WIDTH
mov [edi+ebx*4], eax
dec ecx
jns 0b
End asm
End Sub
'=========================================================================
'' scrollupmmx: same scroll as scrollup1, but the row-to-row copy moves
'' 8 bytes (two 32-bit pixels) per iteration through MMX register mm0.
''   pfb - pixel buffer, SCREEN_WIDTH x SCREEN_HEIGHT dwords
''   prb - scratch buffer, at least SCREEN_WIDTH dwords
'' Registers written: eax, ebx, ecx, edx, esi, edi, mm0 (ebp is saved and
'' restored explicitly). The MMX state is emptied before returning.
Sub scrollupmmx ( ByVal pfb As UInteger Ptr, ByVal prb As UInteger Ptr )
asm
'' Phase 1: save the top row, prb[i] = pfb[i], one dword at a time.
mov esi, [pfb]
mov edi, [prb]
mov ecx, ( SCREEN_WIDTH - 1 )
0:
mov eax, [esi+ecx*4]
mov [edi+ecx*4], eax
dec ecx
jns 0b
'' Phase 2: move every row up by one. All indices here are in 8-byte
'' units, so one row is ( SCREEN_WIDTH Shr 1 ) units long.
'' edi = unit index of the source row start, ebp = unit column,
'' ebx = source unit index, then source minus one row = destination.
'' No [pfb]/[prb] access may happen between push ebp and pop ebp.
push ebp
mov edx, ( SCREEN_HEIGHT - 1 ) '' y counter
Xor edi, edi '' y component of address
0:
add edi, ( SCREEN_WIDTH Shr 1 )
mov ecx, ( SCREEN_WIDTH Shr 1 ) '' x counter
Xor ebp, ebp '' x component of address
1:
mov ebx, ebp
add ebx, edi
movq mm0, [esi+ebx*8]
Sub ebx, ( SCREEN_WIDTH Shr 1 )
inc ebp
movq [esi+ebx*8], mm0
dec ecx
jnz 1b
dec edx
jnz 0b
pop ebp
'' Phase 3: write the saved top row into the last row, dword by dword.
mov esi, [prb]
mov edi, [pfb]
mov ecx, ( SCREEN_WIDTH - 1 )
0:
mov eax, [esi+ecx*4]
mov ebx, ecx
add ebx, ( SCREEN_HEIGHT - 1 )*SCREEN_WIDTH
mov [edi+ebx*4], eax
dec ecx
jns 0b
'----------------------------------------------------------------
'' Empty the MMX state to avoid interfering with FPU operations.
'----------------------------------------------------------------
'' NOTE(review): femms belongs to AMD's 3DNow! extension; with IS_AMD
'' defined this presumably faults on CPUs without 3DNow! - confirm the
'' target CPU or undefine IS_AMD to use the generic emms.
#Ifdef IS_AMD
femms
#Else
emms
#EndIf
End Asm
End Sub
'=========================================================================
'' scrollupsse: same scroll as scrollup1, but the row-to-row copy moves
'' 16 bytes (four 32-bit pixels) per iteration with SSE movaps via xmm0.
''   pfb - pixel buffer, SCREEN_WIDTH x SCREEN_HEIGHT dwords
''   prb - scratch buffer, at least SCREEN_WIDTH dwords
'' Registers written: eax, ebx, ecx, edx, esi, edi, xmm0 (ebp is saved and
'' restored explicitly).
'' NOTE(review): movaps requires 16-byte aligned memory operands, so this
'' assumes pfb (and each row, i.e. SCREEN_WIDTH*4 bytes) is 16-byte
'' aligned - TODO confirm for the pointer returned by ScreenPtr.
Sub scrollupsse ( ByVal pfb As UInteger Ptr, ByVal prb As UInteger Ptr )
asm
'' Phase 1: save the top row, prb[i] = pfb[i], one dword at a time.
mov esi, [pfb]
mov edi, [prb]
mov ecx, ( SCREEN_WIDTH - 1 )
0:
mov eax, [esi+ecx*4]
mov [edi+ecx*4], eax
dec ecx
jns 0b
'' Phase 2: move every row up by one. edi and ebp count in 16-byte
'' units, so one row is ( SCREEN_WIDTH Shr 2 ) units long.
'' No [pfb]/[prb] access may happen between push ebp and pop ebp.
push ebp
mov edx, ( SCREEN_HEIGHT - 1 ) '' y counter
Xor edi, edi '' y component of address
0:
add edi, ( SCREEN_WIDTH Shr 2 )
mov ecx, ( SCREEN_WIDTH Shr 2 ) '' x counter
Xor ebp, ebp '' x component of address
1:
mov ebx, ebp
add ebx, edi
'------------------------------------------------------
'' Scale factors are limited to 2,4,and 8, so scaling
'' EBX by 16 must be done with a separate instruction.
'------------------------------------------------------
Shl ebx, 4
movaps xmm0, [esi+ebx] '' SSE1
inc ebp
'' ebx is now a byte offset; one row up is SCREEN_WIDTH*4 bytes lower.
movaps [esi+ebx-( SCREEN_WIDTH Shl 2 )], xmm0 '' SSE1
dec ecx
jnz 1b
dec edx
jnz 0b
pop ebp
'' Phase 3: write the saved top row into the last row, dword by dword.
mov esi, [prb]
mov edi, [pfb]
mov ecx, ( SCREEN_WIDTH - 1 )
0:
mov eax, [esi+ecx*4]
mov ebx, ecx
add ebx, ( SCREEN_HEIGHT - 1 )*SCREEN_WIDTH
mov [edi+ebx*4], eax
dec ecx
jns 0b
End asm
End Sub
'=========================================================================
'' scrollupsse2: identical to scrollupsse except that the 16-byte moves use
'' the SSE2 integer instruction movdqa instead of movaps.
''   pfb - pixel buffer, SCREEN_WIDTH x SCREEN_HEIGHT dwords
''   prb - scratch buffer, at least SCREEN_WIDTH dwords
'' Registers written: eax, ebx, ecx, edx, esi, edi, xmm0 (ebp is saved and
'' restored explicitly).
'' NOTE(review): movdqa requires 16-byte aligned memory operands, so this
'' assumes pfb (and each row, i.e. SCREEN_WIDTH*4 bytes) is 16-byte
'' aligned - TODO confirm for the pointer returned by ScreenPtr.
Sub scrollupsse2 ( ByVal pfb As UInteger Ptr, ByVal prb As UInteger Ptr )
asm
'' Phase 1: save the top row, prb[i] = pfb[i], one dword at a time.
mov esi, [pfb]
mov edi, [prb]
mov ecx, ( SCREEN_WIDTH - 1 )
0:
mov eax, [esi+ecx*4]
mov [edi+ecx*4], eax
dec ecx
jns 0b
'' Phase 2: move every row up by one. edi and ebp count in 16-byte
'' units, so one row is ( SCREEN_WIDTH Shr 2 ) units long.
'' No [pfb]/[prb] access may happen between push ebp and pop ebp.
push ebp
mov edx, ( SCREEN_HEIGHT - 1 ) '' y counter
Xor edi, edi '' y component of address
0:
add edi, ( SCREEN_WIDTH Shr 2 )
mov ecx, ( SCREEN_WIDTH Shr 2 ) '' x counter
Xor ebp, ebp '' x component of address
1:
mov ebx, ebp
add ebx, edi
'------------------------------------------------------
'' Scale factors are limited to 2,4,and 8, so scaling
'' EBX by 16 must be done with a separate instruction.
'------------------------------------------------------
Shl ebx, 4
movdqa xmm0, [esi+ebx] '' SSE2
inc ebp
'' ebx is now a byte offset; one row up is SCREEN_WIDTH*4 bytes lower.
movdqa [esi+ebx-( SCREEN_WIDTH Shl 2 )], xmm0 '' SSE2
dec ecx
jnz 1b
dec edx
jnz 0b
pop ebp
'' Phase 3: write the saved top row into the last row, dword by dword.
mov esi, [prb]
mov edi, [pfb]
mov ecx, ( SCREEN_WIDTH - 1 )
0:
mov eax, [esi+ecx*4]
mov ebx, ecx
add ebx, ( SCREEN_HEIGHT - 1 )*SCREEN_WIDTH
mov [edi+ebx*4], eax
dec ecx
jns 0b
End asm
End Sub
'=========================================================================
/'
Naked means there is no prologue or epilogue code.
Although we should maintain all used registers, fbc assumes all
registers are destroyed anyway, so we won't bother.
The exception to the above is ebp. ebp always has to be preserved.
'/
'' nakedup1: scrollup1 rewritten as a Naked procedure (fbc emits no
'' prologue/epilogue, so the arguments are addressed relative to esp).
''   pfb - pixel buffer, SCREEN_WIDTH x SCREEN_HEIGHT dwords
''   prb - scratch buffer, at least SCREEN_WIDTH dwords
'' Stack layout after the initial push ebp:
''   [esp] = saved ebp, [esp+4] = return address,
''   [esp+8] = pfb, [esp+12] = prb
'' Registers written and NOT restored: eax, ebx, ecx, edx, esi, edi.
'' NOTE(review): the plain "ret" leaves both arguments on the stack, which
'' is only right if the caller removes them (cdecl). Presumably this needs
'' "ret 8" where the default calling convention is stdcall - verify for
'' the build target.
Sub nakedup1 naked ( ByVal pfb As UInteger Ptr, ByVal prb As UInteger Ptr )
#define pfb_ esp+8
#define prb_ esp+12
Asm
'' We still need to save ebp for the caller local pointer
push ebp
'' Phase 1: save the top row, prb[i] = pfb[i], i from SCREEN_WIDTH-1 to 0.
mov esi, [pfb_]
mov edi, [prb_]
mov ecx, ( SCREEN_WIDTH - 1 )
0:
mov eax, [esi+ecx*4]
mov [edi+ecx*4], eax
dec ecx
jns 0b
'---------------------------------------------------------------
'' Compared to the corresponding code in scrollup0, most of the
'' speed advantage for this code is from the elimination of two
'' imul eax, SCREEN_WIDTH instructions in the inner loop.
''
'' Notice how converting this to being a naked function auto-
'' matically gives us ebp as a general purpose register.
'---------------------------------------------------------------
'' Phase 2: copy rows 1..SCREEN_HEIGHT-1 onto rows 0..SCREEN_HEIGHT-2.
'' edi = dword index of the source row start, ebp = column,
'' ebx = source index, then source minus SCREEN_WIDTH = destination.
mov edx, ( SCREEN_HEIGHT - 1 ) '' y counter
Xor edi, edi '' y component of address
0:
add edi, SCREEN_WIDTH
mov ecx, SCREEN_WIDTH '' x counter
Xor ebp, ebp '' x component of address
1:
mov ebx, ebp
add ebx, edi
mov eax, [esi+ebx*4]
Sub ebx, SCREEN_WIDTH
inc ebp
mov [esi+ebx*4], eax
dec ecx
jnz 1b
dec edx
jnz 0b
'' Phase 3: write the saved top row into the last row. esp has not moved
'' since the push, so the esp-relative argument offsets are still valid.
mov esi, [prb_]
mov edi, [pfb_]
mov ecx, ( SCREEN_WIDTH - 1 )
0:
mov eax, [esi+ecx*4]
mov ebx, ecx
add ebx, ( SCREEN_HEIGHT - 1 )*SCREEN_WIDTH
mov [edi+ebx*4], eax
dec ecx
jns 0b
'' Restore ebp and return
pop ebp
ret
End asm
#undef prb_
#undef pfb_
End Sub
'=========================================================================
'' nakedupmmx: scrollupmmx as a Naked procedure (no compiler prologue or
'' epilogue; arguments are addressed relative to esp).
''   pfb - pixel buffer, SCREEN_WIDTH x SCREEN_HEIGHT dwords
''   prb - scratch buffer, at least SCREEN_WIDTH dwords
''
'' ebp is used as a loop register and must be handed back intact. It is
'' saved with push/pop rather than parked at [esp-8]: memory below esp is
'' not protected on 32-bit x86 (there is no red zone), so anything that
'' uses the stack asynchronously may overwrite a value stored there.
''
'' Stack layout after push ebp:
''   [esp] = saved ebp, [esp+4] = return address,
''   [esp+8] = pfb, [esp+12] = prb
'' Registers written and NOT restored: eax, ebx, ecx, edx, esi, edi, mm0.
'' NOTE(review): the plain "ret" leaves both arguments on the stack, which
'' is only right if the caller removes them (cdecl). Presumably this needs
'' "ret 8" where the default calling convention is stdcall - verify for
'' the build target.
Sub nakedupmmx naked ( ByVal pfb As UInteger Ptr, ByVal prb As UInteger Ptr )
#define pfb_ esp+8
#define prb_ esp+12
    Asm
        push ebp                            '' caller's frame pointer

        '' Phase 1: save the top row, prb[i] = pfb[i].
        mov esi, [pfb_]
        mov edi, [prb_]
        mov ecx, ( SCREEN_WIDTH - 1 )
    0:
        mov eax, [esi+ecx*4]
        mov [edi+ecx*4], eax
        dec ecx
        jns 0b

        '' Phase 2: move every row up by one, 8 bytes per iteration.
        '' Indices are in 8-byte units; a row is ( SCREEN_WIDTH Shr 1 ) units.
        mov edx, ( SCREEN_HEIGHT - 1 )      '' y counter
        xor edi, edi                        '' y component of address
    0:
        add edi, ( SCREEN_WIDTH Shr 1 )
        mov ecx, ( SCREEN_WIDTH Shr 1 )     '' x counter
        xor ebp, ebp                        '' x component of address
    1:
        mov ebx, ebp
        add ebx, edi                        '' ebx = source unit index
        movq mm0, [esi+ebx*8]
        sub ebx, ( SCREEN_WIDTH Shr 1 )     '' one row up = destination
        inc ebp
        movq [esi+ebx*8], mm0
        dec ecx
        jnz 1b
        dec edx
        jnz 0b

        '' Phase 3: write the saved top row into the last row.
        mov esi, [prb_]
        mov edi, [pfb_]
        mov ecx, ( SCREEN_WIDTH - 1 )
    0:
        mov eax, [esi+ecx*4]
        mov ebx, ecx
        add ebx, ( SCREEN_HEIGHT - 1 )*SCREEN_WIDTH
        mov [edi+ebx*4], eax
        dec ecx
        jns 0b

        '----------------------------------------------------------------
        '' Empty the MMX state to avoid interfering with FPU operations.
        '----------------------------------------------------------------
#Ifdef IS_AMD
        femms
#Else
        emms
#EndIf

        '' Restore ebp and return
        pop ebp
        ret
    End Asm
#undef prb_
#undef pfb_
End Sub
'=========================================================================
'' nakedupsse: scrollupsse as a Naked procedure (no compiler prologue or
'' epilogue; arguments are addressed relative to esp).
''   pfb - pixel buffer, SCREEN_WIDTH x SCREEN_HEIGHT dwords
''   prb - scratch buffer, at least SCREEN_WIDTH dwords
''
'' ebp is used as a loop register and must be handed back intact. It is
'' saved with push/pop rather than parked at [esp-8]: memory below esp is
'' not protected on 32-bit x86 (there is no red zone), so anything that
'' uses the stack asynchronously may overwrite a value stored there.
''
'' Stack layout after push ebp:
''   [esp] = saved ebp, [esp+4] = return address,
''   [esp+8] = pfb, [esp+12] = prb
'' Registers written and NOT restored: eax, ebx, ecx, edx, esi, edi, xmm0.
'' NOTE(review): movaps requires 16-byte aligned memory operands, so this
'' assumes pfb and every row start are 16-byte aligned - TODO confirm.
'' NOTE(review): the plain "ret" leaves both arguments on the stack, which
'' is only right if the caller removes them (cdecl). Presumably this needs
'' "ret 8" where the default calling convention is stdcall - verify for
'' the build target.
Sub nakedupsse naked ( ByVal pfb As UInteger Ptr, ByVal prb As UInteger Ptr )
#define pfb_ esp+8
#define prb_ esp+12
    Asm
        push ebp                            '' caller's frame pointer

        '' Phase 1: save the top row, prb[i] = pfb[i].
        mov esi, [pfb_]
        mov edi, [prb_]
        mov ecx, ( SCREEN_WIDTH - 1 )
    0:
        mov eax, [esi+ecx*4]
        mov [edi+ecx*4], eax
        dec ecx
        jns 0b

        '' Phase 2: move every row up by one, 16 bytes per iteration.
        '' edi/ebp count 16-byte units; a row is ( SCREEN_WIDTH Shr 2 ) units.
        mov edx, ( SCREEN_HEIGHT - 1 )      '' y counter
        xor edi, edi                        '' y component of address
    0:
        add edi, ( SCREEN_WIDTH Shr 2 )
        mov ecx, ( SCREEN_WIDTH Shr 2 )     '' x counter
        xor ebp, ebp                        '' x component of address
    1:
        mov ebx, ebp
        add ebx, edi
        '------------------------------------------------------
        '' Scale factors are limited to 2,4,and 8, so scaling
        '' EBX by 16 must be done with a separate instruction.
        '------------------------------------------------------
        shl ebx, 4
        movaps xmm0, [esi+ebx]              '' SSE1
        inc ebp
        '' One row up is SCREEN_WIDTH*4 bytes below the source offset.
        movaps [esi+ebx-( SCREEN_WIDTH Shl 2 )], xmm0 '' SSE1
        dec ecx
        jnz 1b
        dec edx
        jnz 0b

        '' Phase 3: write the saved top row into the last row.
        mov esi, [prb_]
        mov edi, [pfb_]
        mov ecx, ( SCREEN_WIDTH - 1 )
    0:
        mov eax, [esi+ecx*4]
        mov ebx, ecx
        add ebx, ( SCREEN_HEIGHT - 1 )*SCREEN_WIDTH
        mov [edi+ebx*4], eax
        dec ecx
        jns 0b

        '' Restore ebp and return
        pop ebp
        ret
    End Asm
#undef prb_
#undef pfb_
End Sub
'=========================================================================
'' nakedupsse2: scrollupsse2 as a Naked procedure (no compiler prologue or
'' epilogue; arguments are addressed relative to esp).
''   pfb - pixel buffer, SCREEN_WIDTH x SCREEN_HEIGHT dwords
''   prb - scratch buffer, at least SCREEN_WIDTH dwords
''
'' ebp is used as a loop register and must be handed back intact. It is
'' saved with push/pop rather than parked at [esp-8]: memory below esp is
'' not protected on 32-bit x86 (there is no red zone), so anything that
'' uses the stack asynchronously may overwrite a value stored there.
''
'' Stack layout after push ebp:
''   [esp] = saved ebp, [esp+4] = return address,
''   [esp+8] = pfb, [esp+12] = prb
'' Registers written and NOT restored: eax, ebx, ecx, edx, esi, edi, xmm0.
'' NOTE(review): movdqa requires 16-byte aligned memory operands, so this
'' assumes pfb and every row start are 16-byte aligned - TODO confirm.
'' NOTE(review): the plain "ret" leaves both arguments on the stack, which
'' is only right if the caller removes them (cdecl). Presumably this needs
'' "ret 8" where the default calling convention is stdcall - verify for
'' the build target.
Sub nakedupsse2 naked ( ByVal pfb As UInteger Ptr, ByVal prb As UInteger Ptr )
#define pfb_ esp+8
#define prb_ esp+12
    Asm
        push ebp                            '' caller's frame pointer

        '' Phase 1: save the top row, prb[i] = pfb[i].
        mov esi, [pfb_]
        mov edi, [prb_]
        mov ecx, ( SCREEN_WIDTH - 1 )
    0:
        mov eax, [esi+ecx*4]
        mov [edi+ecx*4], eax
        dec ecx
        jns 0b

        '' Phase 2: move every row up by one, 16 bytes per iteration.
        '' edi/ebp count 16-byte units; a row is ( SCREEN_WIDTH Shr 2 ) units.
        mov edx, ( SCREEN_HEIGHT - 1 )      '' y counter
        xor edi, edi                        '' y component of address
    0:
        add edi, ( SCREEN_WIDTH Shr 2 )
        mov ecx, ( SCREEN_WIDTH Shr 2 )     '' x counter
        xor ebp, ebp                        '' x component of address
    1:
        mov ebx, ebp
        add ebx, edi
        '------------------------------------------------------
        '' Scale factors are limited to 2,4,and 8, so scaling
        '' EBX by 16 must be done with a separate instruction.
        '------------------------------------------------------
        shl ebx, 4
        movdqa xmm0, [esi+ebx]              '' SSE2
        inc ebp
        '' One row up is SCREEN_WIDTH*4 bytes below the source offset.
        movdqa [esi+ebx-( SCREEN_WIDTH Shl 2 )], xmm0 '' SSE2
        dec ecx
        jnz 1b
        dec edx
        jnz 0b

        '' Phase 3: write the saved top row into the last row.
        mov esi, [prb_]
        mov edi, [pfb_]
        mov ecx, ( SCREEN_WIDTH - 1 )
    0:
        mov eax, [esi+ecx*4]
        mov ebx, ecx
        add ebx, ( SCREEN_HEIGHT - 1 )*SCREEN_WIDTH
        mov [edi+ebx*4], eax
        dec ecx
        jns 0b

        '' Restore ebp and return
        pop ebp
        ret
    End Asm
#undef prb_
#undef pfb_
End Sub
'=========================================================================
'' Image buffers used by scrollup2: horzsave holds one screen row, frame
'' holds the rest of the screen. Both are assigned by the main program.
Dim Shared As Any Ptr horzsave, frame

'' scrollup2: scrolls the current screen up one row using only the
'' built-in Get/Put graphics statements. pfb and prb are accepted so the
'' signature matches the other scrollers, but they are not used here.
Sub scrollup2 ( ByVal pfb As UInteger Ptr, ByVal prb As UInteger Ptr )
    Const RIGHT_EDGE = SCREEN_WIDTH - 1
    Const BOTTOM_EDGE = SCREEN_HEIGHT - 1

    '' Grab the top row, then everything below it.
    Get ( 0, 0 )-( RIGHT_EDGE, 0 ), horzsave
    Get ( 0, 1 )-( RIGHT_EDGE, BOTTOM_EDGE ), frame

    '' Put the lower part back one row higher and the old top row last.
    Put ( 0, 0 ), frame, Pset
    Put ( 0, BOTTOM_EDGE ), horzsave, Pset
End Sub
'=========================================================================
'' shadowtext( x, y, text ): draws text at (x, y) in the current colour
'' over a colour-0 copy drawn at each of the four diagonal neighbours, then
'' appends the same text to the console, indented by x / 8 spaces and
'' without a trailing newline. Each argument is expanded several times.
#macro shadowtext( _x_, _y_, _text_ )
Scope
For shadow_dx__ As Integer = -1 To 1 Step 2
For shadow_dy__ As Integer = -1 To 1 Step 2
Draw String ( ( _x_ ) + shadow_dx__, ( _y_ ) + shadow_dy__ ), _text_, 0
Next shadow_dy__
Next shadow_dx__
Draw String ( _x_, _y_ ), _text_
Open Cons For Append As #123
Print #123, Space( ( _x_ ) / 8 ); _text_;
Close #123
End Scope
#EndMacro
'=========================================================================
'' rollup(): scrolls the screen up by eight pixel rows (one scrollup1 call
'' per synchronised frame) and then ends the current console line.
'' Uses the pfb and rb variables of the scope it is expanded in.
#Macro rollup()
Scope
Dim As Integer rowsLeft__ = 8
While rowsLeft__ > 0
ScreenSync
ScreenLock
pfb = ScreenPtr
scrollup1( pfb, @rb( 0 ) )
ScreenUnlock
rowsLeft__ -= 1
Wend
End Scope
Open Cons For Append As #123
Print #123,
Close #123
#EndMacro
'=========================================================================
'' screengarbage(): fills every pixel of the locked screen with a random
'' colour so that scrolling is visible. Uses the pfb variable of the scope
'' it is expanded in.
#Macro screengarbage()
Scope
Const PIXEL_COUNT__ = SCREEN_WIDTH * SCREEN_HEIGHT
ScreenLock
pfb = ScreenPtr
Dim As Integer px__ = 0
Do While px__ < PIXEL_COUNT__
pfb[px__] = rgb( Rnd * 255, Rnd * 255, Rnd * 255 )
px__ += 1
Loop
ScreenUnLock
End Scope
#endmacro
'=========================================================================
'' compare( func ): times one scroller and prints the result.
''
'' Release builds ( __FB_DEBUG__ = 0 ):
''   - the call to func sits between COUNTER_BEGIN and COUNTER_END (from
''     counter.bas) and between two Timer readings t1 and t2;
''   - the elapsed time is scaled by 1000000 / INTTERATIONS and shown via
''     shadowtext at x = 129 (presumably microseconds per call, with
''     COUNTER_BEGIN repeating the enclosed call INTTERATIONS times -
''     confirm against counter.bas).
'' Debug builds:
''   - an int3 breakpoint is raised just before a single call to func and
''     the text "Debug" is shown instead of a time.
''
'' The screen stays locked from before the measurement until after the
'' text is drawn; rollup() then scrolls the result line out of the way.
'' Uses the pfb and rb variables of the scope it is expanded in.
'' "INTTERATIONS" (sic) is the constant's actual spelling.
#Macro compare( _func_ )
Scope
Const As Integer INTTERATIONS = SCREEN_HEIGHT
#If __FB_DEBUG__ = 0
Dim As Double t2 = Timer
Dim As Double t1 = Timer
#EndIf
ScreenSync
ScreenLock
pfb = ScreenPtr
#If __FB_DEBUG__ = 0
t1 = Timer
COUNTER_BEGIN( INTTERATIONS, REALTIME_PRIORITY_CLASS )
#Else
Asm int3
#EndIf
_func_( pfb, @rb( 0 ) )
#If __FB_DEBUG__ = 0
COUNTER_END
t2 = Timer
#EndIf
#If __FB_DEBUG__ = 0
var text = Format( ( t2 - t1 ) * ( 1000000 / INTTERATIONS ), !"#####\230s" )
shadowtext( 128 + 1, 1, text )
#Else
shadowtext( 128 + 1, 1, "Debug" )
#EndIf
ScreenUnLock
rollup()
End Scope
#endmacro
'=========================================================================
'' scroll( func ): labels the run with the scroller's name, optionally
'' demonstrates it for SCREEN_HEIGHT frames (skipped when NO_DEMO is
'' defined), and then benchmarks it with compare().
'' Uses the pfb and rb variables of the scope it is expanded in.
#Macro scroll( _func_ )
Scope
shadowtext( 1, 1, #_func_ )
#Ifndef NO_DEMO
Dim As Integer demoFrame__ = 1
While demoFrame__ <= SCREEN_HEIGHT
ScreenSync
ScreenLock
pfb = ScreenPtr
_func_( pfb, @rb( 0 ) )
ScreenUnLock
demoFrame__ += 1
Wend
#EndIf
End Scope
compare( _func_ )
#EndMacro
'=========================================================================
'=========================================================================
'' Main program: set up the screen and work buffers, run every enabled
'' scroller through scroll() (demo + benchmark), then wait for a key.
'=========================================================================
'' One-row scratch buffer handed to the scrollers as prb.
Dim As Uinteger rb( SCREEN_WIDTH )
'' Frame buffer pointer; refreshed from ScreenPtr inside the macros.
Dim As UInteger Ptr pfb = Any
ScreenRes SCREEN_WIDTH,SCREEN_HEIGHT,32
'' Every scroller writes straight through ScreenPtr, so stop here if the
'' graphics mode could not be set instead of writing through a null pointer.
If ScreenPtr = 0 Then
    Print "Unable to set the graphics mode."
    End 1
End If
horzsave = ImageCreate(SCREEN_WIDTH,1)
frame = imagecreate(SCREEN_WIDTH,SCREEN_HEIGHT)
'' scrollup2 needs both image buffers.
If horzsave = 0 OrElse frame = 0 Then
    If horzsave <> 0 Then ImageDestroy( horzsave )
    If frame <> 0 Then ImageDestroy( frame )
    Print "Unable to allocate the image buffers."
    End 1
End If
'=========================================================================
screengarbage()
scroll( scrollup0 )
rollup()
#IfDef USE_ASM_386
scroll( scrollup1 )
scroll( nakedup1 )
rollup()
#EndIf
#IfDef USE_ASM_MMX
scroll( scrollupmmx )
scroll( nakedupmmx )
rollup()
#EndIf
#IfDef USE_ASM_SSE
scroll( scrollupsse )
scroll( nakedupsse )
rollup()
#EndIf
#IfDef USE_ASM_SSE2
scroll( scrollupsse2 )
scroll( nakedupsse2 )
rollup()
#EndIf
scroll( scrollup2 )
rollup()
shadowtext( 1, 1, "fin" )
rollup()
'=========================================================================
'' Keep the window open until a key is pressed, then release the buffers.
Sleep
ImageDestroy( horzsave )
ImageDestroy( frame )
Output from my machine (512x512, IS_AMD, NO_DEMO):
Code: Select all
scrollup0 2256µs
scrollup1 737µs
nakedup1 681µs
scrollupmmx 734µs
nakedupmmx 695µs
scrollupsse 634µs
nakedupsse 609µs
scrollupsse2 634µs
nakedupsse2 606µs
scrollup2 1717µs
fin
-
- Posts: 3954
- Joined: Jan 01, 2009 7:03
- Location: Australia
kiyotewolf wrote:
I used to love assembler on the old machines but 10 years ago when I tried to update to using it on a modern machine I found out it was nothing but calling Win32 API functions I had no direct access to anything. So until I found FreeBasic I dabbled in other languages.
Using tricks like dithering to increase the apparent number of colors is just a complication. Yes a palette of colors is nice and possible with rgb colors but its also nice to have the simplification of rgb colors when you need them.
I know that if you are an expert doing your calculations with an abacus or a slide rule you might show resistence to using one of those new fangled electronic calculators, and certainly those who are the guardians of the old ways are of historical value, but I would much prefer to have had the brains and time to keep up with the latest programming techniques and modern technology.
However I make a distinction between a professor of English and someone who can write a good novel. Some people are experts on the language while others actually use it for something interesting. Both kinds are required. For example someone might be able to write a great graphics program but have no artistic ability to actually use it. Others might be great on the math involved in 2D or 3D graphic programs and yet not be able to come up with a fun game.
It seems to me that if you want to write some of the older kind of games then FreeBasic has everything you need without resorting to asm or needing the hardware sprites etc required in the older machines.
JohnC
But if you are not using a 6502 what can you do about that? Forget about it I would suggest. Let the assembler generate the hexadecimal values.One of the stumbling blocks for me is, in 6502, you had all the hexadecimal values for every op-code.
I've never seen a complete mapping of each usage of each ASM command in 8088+ language, turned into its exact hex opcode equivalents.
Yes, IF, IF you are programming in qbasic. But if you are not using qbasic in a DOS machine then it is a different ball game. And if your program is a computer game you are going to want to make use of fast graphics and that means using DirectX.What's that good for? Well, it's very good to know, if you're coding inline asm in Qbasic, and you need, actual, numbers, in hex, to plug into an array or string to then CALL ABSOLUTE to.
I used to love assembler on the old machines but 10 years ago when I tried to update to using it on a modern machine I found out it was nothing but calling Win32 API functions I had no direct access to anything. So until I found FreeBasic I dabbled in other languages.
Using tricks like dithering to increase the apparent number of colors is just a complication. Yes a palette of colors is nice and possible with rgb colors but its also nice to have the simplification of rgb colors when you need them.
I know that if you are an expert doing your calculations with an abacus or a slide rule you might show resistence to using one of those new fangled electronic calculators, and certainly those who are the guardians of the old ways are of historical value, but I would much prefer to have had the brains and time to keep up with the latest programming techniques and modern technology.
However I make a distinction between a professor of English and someone who can write a good novel. Some people are experts on the language while others actually use it for something interesting. Both kinds are required. For example someone might be able to write a great graphics program but have no artistic ability to actually use it. Others might be great on the math involved in 2D or 3D graphic programs and yet not be able to come up with a fun game.
It seems to me that if you want to write some of the older kind of games then FreeBasic has everything you need without resorting to asm or needing the hardware sprites etc required in the older machines.
JohnC
-
- Posts: 1009
- Joined: Oct 11, 2008 7:42
- Location: ABQ, NM
- Contact:
-
- Posts: 3954
- Joined: Jan 01, 2009 7:03
- Location: Australia
kiyotewolf wrote:I guess you might call me a "Steampunk Programmer"..
Resorting to old steam methods of powering my game engines.
~Kiyote!
Yes I guess so :)
Nostalgia is a pleasant experience but to live in the past is to lose your present and thus your future past and your potential for growth. Still we must accept our limits thus I am using FreeBasic instead of C++.
"Game programmers have resisted the Windows platform since the beginning of time, but like the Borg say, "Resistence is futile..." I tend to agree."
Tricks of the Windows game programming gurus - Andre LaMothe
JohnC
-
- Posts: 1009
- Joined: Oct 11, 2008 7:42
- Location: ABQ, NM
- Contact:
When I used to go to Barnes & Noble, way back when, I found books on Commodore stuff, and books on DOS.
Then, these strange new language books started coming out, including web based stuff, and I couldn't do anything but scratch my head and watch as the books I understood slowly drifted from the shelves forever more.
I know what I do can be archaic at times, but I'm still playing catch-up, to what happened over 15+ years ago, when things went way beyond my control.
I'm going to be using OOP very soon, cause someone's most recent demo of something else, included very rudimentary OOP implementation which I ACTUALLY UNDERSTOOD.
So, I'm working on moving into the new century of coding.
Srsly. I'm trying.
~Kiyote!
Then, these strange new language books started coming out, including web based stuff, and I couldn't do anything but scratch my head and watch as the books I understood slowly drifted from the shelves forever more.
I know what I do can be archaic at times, but I'm still playing catch-up, to what happened over 15+ years ago, when things went way beyond my control.
I'm going to be using OOP very soon, cause someone's most recent demo of something else, included very rudimentary OOP implementation which I ACTUALLY UNDERSTOOD.
So, I'm working on moving into the new century of coding.
Srsly. I'm trying.
~Kiyote!
-
- Posts: 3954
- Joined: Jan 01, 2009 7:03
- Location: Australia
Yes my tale is much the same. I had some angst over my hard earned knowledge and skills in my hobby becoming dated and had to make a conscious effort to forget about the old machines and try to move on ...When I used to go to Barnes & Noble, way back when, I found books on Commodore stuff, and books on DOS.
Then, these strange new language books started coming out, including web based stuff, and I couldn't do anything but scratch my head and watch as the books I understood slowly drifted from the shelves forever more.
I know what I do can be archaic at times, but I'm still playing catch-up, to what happened over 15+ years ago, when things went way beyond my control.
I'm going to be using OOP very soon, cause someone's most recent demo of something else, included very rudimentary OOP implementation which I ACTUALLY UNDERSTOOD.
So, I'm working on moving into the new century of coding.
Srsly. I'm trying.
~Kiyote!
I did some OOP with C++ but I haven't the time to learn FB OO nomenclature. Most of DJPeter's code for example I simply cannot read. I was happy to go back in time to FB as an updated QB but without the need to add stuff such as a mouse driver and with the bonus of plenty of memory, speed and great graphics. Apart from that FB for me is QB and I just try to keep my programs modular.
I would have stuck with DevC++ and SDL for graphics if I hadn't found FreeBasic and DJPeter's FB code for using .dlls for accessing a webcam and the k8085 i/o board.
If you want a scrolly game you will have to use PUT if you want the same kind of display speeds you could get on the old machines.
JohnC
-
- Posts: 1009
- Joined: Oct 11, 2008 7:42
- Location: ABQ, NM
- Contact:
Did you guys find any real answer to this as an optimization method, vs the original FB code that I posted, for BasicCoder2 to use for the conversion to ASM?
<.< I find now, that here, as I'm needing it here very shortly, that I can only rely on my code, because I found out BasicCoder2's ASM version missed some, on his conversion.
The ASM is incomplete, on at least one of the scroll functions. I know that for sure, but did not research exactly what was missing / wrong with the conversion on a whole.
@BasicCoder2 ~ (If you want me to point out where the ASM is broken, so you can re-do it, I will, but if you're tired of helping me with this, I can understand, and you don't have to.)
~~~
Did anyone figure out, is my original FB code, as efficient, as BasicCoder2's ASM version?
Even though his version has bugs, it is still 99% correct and complete. He did exactly what I wanted, so testing for speed is legal between my FB code and his ASM versions.
~Kiyote!
I just wanna put this to rest, so I can either ask BasicCoder2 for more help (pretty please, if you're still willing), or just stick to my FB version.
<.< I find now, that here, as I'm needing it here very shortly, that I can only rely on my code, because I found out BasicCoder2's ASM version missed some, on his conversion.
The ASM is incomplete, on at least one of the scroll functions. I know that for sure, but did not research exactly what was missing / wrong with the conversion on a whole.
@BasicCoder2 ~ (If you want me to point out where the ASM is broken, so you can re-do it, I will, but if you're tired of helping me with this, I can understand, and you don't have to.)
~~~
Did anyone figure out, is my original FB code, as efficient, as BasicCoder2's ASM version?
Even though his version has bugs, it is still 99% correct and complete. He did exactly what I wanted, so testing for speed is legal between my FB code and his ASM versions.
~Kiyote!
I just wanna put this to rest, so I can either ask BasicCoder2 for more help (pretty please, if you're still willing), or just stick to my FB version.
The Intel (and AMD) architecture manuals are freely downloadable and have all the info you need. kiyotewolf wrote: Honestly, I don't know why learning the 80386 set is so hard for me.
One of the stumbling blocks for me is, in 6502, you had all the hexadecimal values for every op-code.
The trouble is that it is a lot larger, and not as one dimensional as it was on the 6502.
Anyway, better move directly to x86_64 and forget all this old i386 stuff.
-
- Posts: 1009
- Joined: Oct 11, 2008 7:42
- Location: ABQ, NM
- Contact:
If you can't understand it, consider not diving into assembler. It is not as worthwhile as in 6502 times anyway.kiyotewolf wrote:But I could understand the one dimensional 6502!
Start with forgetting everything before pentium M. Preferably also P4, since when you finally have mastered that, they are near extinct. No need to dwell on the past.<.< it's 8088+ .. (and so on and so on)..
-
- Posts: 3954
- Joined: Jan 01, 2009 7:03
- Location: Australia
Sure point out where the ASM is broken.kiyotewolf wrote: @BasicCoder2 ~ (If you want me to point out where the ASM is broken, so you can re-do it, I will, but if you're tired of helping me with this, I can understand, and you don't have to.
If you want to scroll fast you will use the FreeBasic methods instead just as I demonstrated with the bigPic example which scolls as fast as you move the mouse by whatever amount and direction you like.
JohnC
-
- Posts: 1009
- Joined: Oct 11, 2008 7:42
- Location: ABQ, NM
- Contact:
some thoughts
some thoughts
looking at all the robotics kits out there, there are cpu's on small pcb boards that plug in via usb
It would be interesting if some ASM was written for these chips.
As the architecture of these chips will be well known & documented. What is written for one is written for all
What I'm getting at is this: if you had a couple of CPUs hanging off your USB ports, this would provide some parallel processing, which would be handy if you were writing a game, vision application or other app which was bogging down your computer's CPU
apart from the read write delay of the of the usb mounted chips , (reading data off your pcs ram , and writes the results back ) the only other drain on you pc is a few milliamps of power being sucked out your usb port
Bonus is you can just unplug the chip/s from one machine and stick into another....where ever u need that little boost
I think you will agree this is an intriguing concept , harking back to the days when a fully modded spectrum had all sorts of bits chaining out of it
chess engine tree type searches
sorting large lists
and other mathematically demanding tasks could be offloaded
it wouldn't take long before i nice little library was built up
#include cpu's (1 to n'usb ports)
would be a groovy addition to the free basic environment
looking at all the robotics kits out there, there are cpu's on small pcb boards that plug in via usb
It would be interesting if some ASM was written for these chips.
As the architecture of these chips will be well known & documented. What is written for one is written for all
What im getting at is this, if you had a coupe of cpus hanging off your usb ports this would provide some parallel processing, which would be handy if you where writing a game , vision application or other app which was bogging down your computers PC
apart from the read write delay of the of the usb mounted chips , (reading data off your pcs ram , and writes the results back ) the only other drain on you pc is a few milliamps of power being sucked out your usb port
Bonus is you can just unplug the chip/s from one machine and stick into another....where ever u need that little boost
I think you will agree this is an intriguing concept , harking back to the days when a fully modded spectrum had all sorts of bits chaining out of it
chess engine tree type searches
sorting large lists
and other mathematically demanding tasks could be offloaded
it wouldn't take long before i nice little library was built up
#include cpu's (1 to n'usb ports)
would be a groovy addition to the free basic environment
veering off the original topic
What would be better is FPGAs as addon boards.
Back on topic
I'm not sure if this is what you are looking for, but here's my go at making them faster
I think they are a pretty sped up, as on my system 10000 calls takes between 0.125 seconds and 0.531 seconds
due to cache hits and other strange cpu phenomenon that I don't understand terribly well, scrolling up is the fastest at 0.125 and right is the slowest at 0.531. Left and down sit at a nice 0.250 and 0.202, respectively.
For comparison, 10000 calls of the FB version took around 4 seconds to run.
quick note: This is poorly optimized for the specific size of the buffer, if the buffer changes, you get to change the "magic numbers". It's also using the global
If you're looking for a hex->instruction reference, do 2 things.
1. Prepare your mind for the confusion you're about to throw at it. Easiest way? Percussive maintenance with a freaking hammer. I mean seriously, the x86 opcodes are the freaking craziest thing ever. Just wait till you see the horror that is an opcode followed by a ModR/M byte followed by a SIB byte. Oh wait, it was prefixed with a size override modifier and it's followed by a displacement and an immediate. That's right, a single instruction could take up to 4 bytes for the opcode itself and another 8 of data.
2. goto http://ref.x86asm.net. They have a chart of all the possible opcodes. It's about 7~8 pages, one line per opcode. It is rather nice to have though, and I refer to it constantly for my JIT assembler I'm writing(give it asm, and it builds up a function at run time that you can call.)
It'd be interesting to see something like that, but you do realize a 8Mhz cpu can't compare to a 3Ghz one, right?What im getting at is this, if you had a coupe of cpus hanging off your usb ports this would provide some parallel processing, which would be handy if you where writing a game , vision application or other app which was bogging down your computers PC
What would be better is FPGAs as addon boards.
Back on topic
I'm not sure if this is what you are looking for, but here's my go at making them faster
Code: Select all
'' ScrollUp: scrolls a 320-byte-wide, 200-row byte buffer up by one row,
'' wrapping the old top row to the bottom.
''   RRam2 - byte array; indices 0..63999 are used
'' The block move copies 15920 dwords (199 rows) from offset 320 to
'' offset 0 with rep movsd.
'' NOTE(review): the data address is taken from the first dword of the
'' array descriptor and the direction flag is assumed clear on entry -
'' confirm both for the compiler version in use.
sub ScrollUp (RRam2() as ubyte)
    dim topRow(320) as ubyte
    dim col as integer

    '' Keep a copy of row 0; the block move overwrites it.
    for col = 0 to 319
        topRow(col) = RRam2(col)
    next col

    asm
        mov edi, [RRam2]        '' edi -> array descriptor
        mov edi, [edi]          '' edi = address of RRam2(0)
        lea esi, [edi+320]      '' esi = address of row 1
        mov ecx, 15920          '' 199 rows * 320 bytes / 4
        rep movsd
    end asm
    'for z = 0 to 63679
    ' RRam(z) = RRam2(z+320)
    'next z

    '' The saved row becomes row 199.
    for col = 0 to 319
        RRam2(col+320*199) = topRow(col)
    next col
end sub
'' ScrollDown: scrolls a 320-byte-wide, 200-row byte buffer down by one
'' row, wrapping the old bottom row to the top.
''   RRam2 - byte array; indices 0..63999 are used
'' Source and destination overlap with the destination higher in memory,
'' so the block move runs backwards (std) from the last dword of row 198
'' into the last dword of row 199, and the direction flag is cleared
'' again afterwards.
'' NOTE(review): the data address is taken from the first dword of the
'' array descriptor - confirm for the compiler version in use.
sub ScrollDown (RRam2() as ubyte)
    dim bottomRow(320) as ubyte
    dim col as integer

    '' Keep a copy of row 199; the block move overwrites it.
    for col = 0 to 319
        bottomRow(col) = RRam2(col + 320*199)
    next col

    asm
        mov esi, [RRam2]        '' esi -> array descriptor
        mov esi, [esi]          '' esi = address of RRam2(0)
        add esi, 63676          '' last dword of row 198
        lea edi, [esi+320]      '' last dword of row 199
        mov ecx, 15920          '' 199 rows * 320 bytes / 4
        std
        rep movsd
        cld
    end asm
    'for z = 63679 to 0 step -1
    ' RRam2(z+320) = RRam2(z)
    'next z

    '' The saved row becomes row 0.
    for col = 0 to 319
        RRam2(col) = bottomRow(col)
    next col
end sub
'' ScrollLeft: scrolls a 320-byte-wide, 200-row byte buffer left by one
'' pixel, wrapping each row's first pixel to that row's last column.
''   RRam2 - byte array; indices 0..63999 are used
''
'' Step 1 shifts the whole buffer down by one byte as a flat block:
''   RRam2(z-1) = RRam2(z) for z = 1..63999, i.e. 63999 bytes, moved as
''   15999 dwords followed by 3 single bytes. (Moving 16000 dwords plus
''   3 bytes would read and write past the end of the 64000-byte buffer.)
'' Step 2 repairs column 319: after the flat shift it holds the NEXT row's
''   first pixel, so each value is moved one row down, bottom row first.
'' Step 3 stores the original first pixel of row 0 in RRam2(319). The
''   store goes to the parameter RRam2, not to a global array.
'' NOTE(review): the data address is taken from the first dword of the
'' array descriptor and the direction flag is assumed clear on entry -
'' confirm both for the compiler version in use.
sub ScrollLeft (RRam2() as ubyte)
    dim clr as ubyte, z as integer

    clr = RRam2(0)

    asm
        mov EAX, [RRam2]        '' EAX -> array descriptor
        mov EAX, [EAX]          '' EAX = address of RRam2(0)
        mov EDI, EAX            '' destination starts at byte 0
        lea ESI, [EAX+1]        '' source starts at byte 1
        mov ECX, 15999          '' 63996 bytes as dwords
        rep MOVSD
        mov ECX, 3              '' remaining 3 bytes: 63997..63999
        rep MOVSB
    end asm
    'for z = 1 to 63999
    ' RRam2(z-1) = RRam2(z)
    'next z

    For z = 199 To 1 Step -1
        RRam2(320 * z+319) = RRam2(320 * (z-1) + 319)
    Next z
    RRam2(319) = clr
end sub
'' ScrollRight: scrolls a 320-byte-wide, 200-row byte buffer right by one
'' pixel, wrapping each row's last pixel to that row's first column.
''   RRam2 - byte array; indices 0..63999 are used
''
'' Step 1 shifts the whole buffer up by one byte as a flat block:
''   RRam2(z+1) = RRam2(z) for z = 63998 down to 0, i.e. 63999 bytes.
''   The destination is above the source, so the copy runs backwards
''   (std). With the direction flag set, ESI/EDI name the LOWEST byte of
''   each dword, so the first dword moved is bytes 63995..63998 ->
''   63996..63999. 15999 dwords cover source bytes 3..63998; the pointers
''   are then advanced by 3 so the byte copy moves 2->3, 1->2, 0->1.
''   (Starting at 63993/63994 and moving 16000 dwords would leave the last
''   bytes unmoved and read and write in front of the array.)
'' Step 2 repairs column 0: after the flat shift it holds the PREVIOUS
''   row's last pixel, so each value is moved one row up, top row first.
'' Step 3 stores the original last pixel of row 199 in that row's column 0.
'' NOTE(review): the data address is taken from the first dword of the
'' array descriptor - confirm for the compiler version in use.
sub ScrollRight (RRam2() as ubyte)
    dim clr as ubyte, z as integer

    clr = RRam2(63999)

    asm
        mov EAX, [RRam2]        '' EAX -> array descriptor
        mov EAX, [EAX]          '' EAX = address of RRam2(0)
        lea ESI, [EAX+63995]    '' highest source dword: bytes 63995..63998
        lea EDI, [EAX+63996]    '' highest destination dword: 63996..63999
        mov ECX, 15999          '' 63996 bytes as dwords
        std
        rep MOVSD
        add ESI, 3              '' from "lowest byte of next dword" ...
        add EDI, 3              '' ... to "next single byte" (2 -> 3)
        mov ECX, 3              '' remaining bytes 2, 1, 0
        rep MOVSB
        cld                     '' restore the direction flag to forward
    end asm
    'for z = 63998 to 0 step -1
    ' RRam2(z+1) = RRam2(z)
    'next z

    For z = 0 To 198
        RRam2(z*320) = RRam2((z+1) * 320)
    Next z
    RRam2(199 * 320) = clr
end sub
due to cache hits and other strange cpu phenomenon that I don't understand terribly well, scrolling up is the fastest at 0.125 and right is the slowest at 0.531. Left and down sit at a nice 0.250 and 0.202, respectively.
For comparison, 10000 calls of the FB version took around 4 seconds to run.
quick note: This is poorly optimized for the specific size of the buffer, if the buffer changes, you get to change the "magic numbers". It's also using the global
If your looking for a hex->instruction reference, do 2 things.
1. prepare your mind for the confusion your about to throw at it. Easiest way? percussive maintenance with a freaking hammer. I mean seriously, the x86 opcodes are the freaking craziest thing ever. Just wait till you see the horror that is an opcode followed by a modr/m byte followed by a SIB byte. Oh wait, it was prefixed with a size override modifier and it's followed by a displacement and an immediate. That's right, a single instruction could take up to 4 bytes for the opcode itself and another 8 of data.
2. goto http://ref.x86asm.net. They have a chart of all the possible opcodes. It's about 7~8 pages, one line per opcode. It is rather nice to have though, and I refer to it constantly for my JIT assembler I'm writing(give it asm, and it builds up a function at run time that you can call.)