ASM assistance, if you please.

Game development specific discussions.
MichaelW
Posts: 3500
Joined: May 16, 2006 22:34
Location: USA

Post by MichaelW »

1000101 wrote:I couldn't help playing with the asm code in this thread.
Was MOVDQA any faster than MOVAPS?
1000101
Posts: 2556
Joined: Jun 13, 2005 23:14
Location: SK, Canada

Post by 1000101 »

MichaelW wrote:
1000101 wrote:I couldn't help playing with the asm code in this thread.
Was MOVDQA any faster than MOVAPS?
Not really, but I'm on AMD and the results may (will) differ on an Intel CPU. I've added console output to the code for easier viewing, as well as a definition (NO_DEMO) to skip the demo and just compare the routines.

Code: Select all

#Include Once "counter.bas"
#Include Once "vbcompat.bi"


/'
	Window size to use
'/
#Define	SCREEN_WIDTH		512
#Define	SCREEN_HEIGHT		512


/'
	Allow/Disallow scrollers
'/
#Define	USE_ASM_386
#Define	USE_ASM_MMX
#Define	USE_ASM_SSE
#Define	USE_ASM_SSE2

/'
	Allow/Disallow AMD optimizations
'/
#Define	IS_AMD


/'
	Just benchmark the code, don't demo it
'/
#Define	NO_DEMO


'=========================================================================

Sub scrollup0		( ByVal pfb As UInteger Ptr, ByVal prb As UInteger Ptr )
	
	'' Plain FreeBASIC reference scroller: rotates the frame buffer up by
	'' one scan line (the top line wraps around to the bottom).
	''
	''   pfb - first pixel of a SCREEN_WIDTH x SCREEN_HEIGHT buffer,
	''         one UInteger per pixel, rows stored back to back
	''   prb - scratch row with room for at least SCREEN_WIDTH pixels
	''
	'' The index arithmetic is deliberately left un-hoisted; this routine
	'' is the naive baseline the assembly versions are compared against.
	
	Const LAST_COL	= SCREEN_WIDTH - 1
	Const LAST_ROW	= SCREEN_HEIGHT - 1
	
	Dim As Integer	col, row
	
	'' 1) park the top line in the scratch row
	For col = 0 To LAST_COL
		prb[col] = pfb[col]
	Next col
	
	'' 2) pull every remaining line up by one, top to bottom
	For row = 1 To LAST_ROW
		For col = 0 To LAST_COL
			pfb[col + ( row - 1 ) * SCREEN_WIDTH] = pfb[col + row * SCREEN_WIDTH]
		Next col
	Next row
	
	'' 3) drop the parked line into the bottom row
	For col = 0 To LAST_COL
		pfb[col + LAST_ROW * SCREEN_WIDTH] = prb[col]
	Next col
	
End Sub

'=========================================================================

Sub scrollup1		( ByVal pfb As UInteger Ptr, ByVal prb As UInteger Ptr )
	
	'' 386 integer version of scrollup0: rotates the frame buffer up by one
	'' scan line, moving one 32-bit pixel per instruction pair.
	''
	''   pfb - start of the SCREEN_WIDTH x SCREEN_HEIGHT dword buffer
	''   prb - scratch row of at least SCREEN_WIDTH dwords
	''
	'' Uses eax, ebx, ecx, edx, esi, edi and (temporarily) ebp.
	'' NOTE(review): ebx/esi/edi are not saved here; this presumably relies
	'' on fbc preserving them for procedures containing Asm blocks - confirm.
	asm
		'' Phase 1: copy the top row to prb, walking ecx from the last
		'' column down to 0 (jns keeps looping while ecx is non-negative).
		mov	esi, [pfb]
		mov	edi, [prb]
		mov	ecx, ( SCREEN_WIDTH - 1 )
		0:
		mov	eax, [esi+ecx*4]
		mov	[edi+ecx*4], eax
		dec	ecx
		jns	0b
		
		'---------------------------------------------------------------
		'' Compared to the corresponding code in scrollup0, most of the
		'' speed advantage for this code is from the elimination of two
		'' imul eax, SCREEN_WIDTH instructions in the inner loop.
		''
		'' Doing this efficiently required one more register than is
		'' normally available, so the code uses EBP. Note that EBP must
		'' be preserved around this use because it is otherwise needed
		'' to access the procedure parameters and variables allocated
		'' from the stack, and to restore the entry value of ESP in the
		'' epilogue code.
		'---------------------------------------------------------------
		
		'' Phase 2: for each of the SCREEN_HEIGHT - 1 source rows, copy
		'' row (y+1) onto row y.  edi holds the pixel index of the start
		'' of the source row, ebp the column; no parameter may be
		'' referenced by name until ebp is popped again.
		push	ebp
		mov	edx, ( SCREEN_HEIGHT - 1 )	'' y counter
		Xor	edi, edi	'' y component of address
		0:
		add	edi, SCREEN_WIDTH
		mov	ecx, SCREEN_WIDTH	'' x counter
		Xor	ebp, ebp	'' x component of address
		1:
		mov	ebx, ebp
		add	ebx, edi
		mov	eax, [esi+ebx*4]
		Sub	ebx, SCREEN_WIDTH
		inc	ebp
		mov	[esi+ebx*4], eax
		dec	ecx
		jnz	1b
		dec	edx
		jnz	0b
		pop	ebp
		
		'' Phase 3: copy the saved row from prb into the bottom row.
		mov	esi, [prb]
		mov	edi, [pfb]
		mov	ecx, ( SCREEN_WIDTH - 1 )
		0:
		mov	eax, [esi+ecx*4]
		mov	ebx, ecx
		add	ebx, ( SCREEN_HEIGHT - 1 )*SCREEN_WIDTH
		mov	[edi+ebx*4], eax
		dec	ecx
		jns	0b
	End asm
	
End Sub

'=========================================================================

Sub scrollupmmx		( ByVal pfb As UInteger Ptr, ByVal prb As UInteger Ptr )
	
	'' MMX version: same three phases as scrollup1, but the row-to-row
	'' copy moves two pixels (8 bytes) per movq, so every count and index
	'' in the middle phase is expressed in qwords (SCREEN_WIDTH Shr 1 per
	'' row).  Requires SCREEN_WIDTH to be even.
	''
	''   pfb - start of the SCREEN_WIDTH x SCREEN_HEIGHT dword buffer
	''   prb - scratch row of at least SCREEN_WIDTH dwords
	asm
		'' Phase 1: save the top row to prb, one dword at a time.
		mov	esi, [pfb]
		mov	edi, [prb]
		mov	ecx, ( SCREEN_WIDTH - 1 )
		0:
		mov	eax, [esi+ecx*4]
		mov	[edi+ecx*4], eax
		dec	ecx
		jns	0b
		
		'' Phase 2: copy row (y+1) onto row y, one qword per iteration.
		'' ebp is borrowed as the column index and restored afterwards.
		push	ebp
		mov	edx, ( SCREEN_HEIGHT - 1 )          '' y counter
		Xor	edi, edi          '' y component of address
		0:
		add	edi, ( SCREEN_WIDTH Shr 1 )
		mov	ecx, ( SCREEN_WIDTH Shr 1 )          '' x counter
		Xor	ebp, ebp          '' x component of address
		1:
		mov	ebx, ebp
		add	ebx, edi
		movq	mm0, [esi+ebx*8]
		Sub	ebx, ( SCREEN_WIDTH Shr 1 )
		inc	ebp
		movq	[esi+ebx*8], mm0
		dec	ecx
		jnz	1b
		dec	edx
		jnz	0b
		pop	ebp
		
		'' Phase 3: write the saved row into the bottom row.
		mov	esi, [prb]
		mov	edi, [pfb]
		mov	ecx, ( SCREEN_WIDTH - 1 )
		0:
		mov	eax, [esi+ecx*4]
		mov	ebx, ecx
		add	ebx, ( SCREEN_HEIGHT - 1 )*SCREEN_WIDTH
		mov 	[edi+ebx*4], eax
		dec	ecx
		jns	0b
		
		
		'----------------------------------------------------------------
		'' Empty the MMX state to avoid interfering with FPU operations.
		'----------------------------------------------------------------
		'' NOTE(review): femms is a 3DNow! instruction; AMD CPUs without
		'' 3DNow! may fault on it - confirm before defining IS_AMD.
		#Ifdef	IS_AMD
			femms
		#Else
			emms
		#EndIf
		
	End Asm
	
	
End Sub

'=========================================================================

Sub scrollupsse		( ByVal pfb As UInteger Ptr, ByVal prb As UInteger Ptr )
	
	'' SSE version: same three phases as scrollup1, but the row-to-row copy
	'' moves four pixels (16 bytes) per movaps, so the middle phase counts
	'' in 16-byte units (SCREEN_WIDTH Shr 2 per row).
	''
	''   pfb - start of the SCREEN_WIDTH x SCREEN_HEIGHT dword buffer
	''   prb - scratch row of at least SCREEN_WIDTH dwords
	''
	'' movaps requires 16-byte aligned addresses.
	'' NOTE(review): assumes pfb is 16-byte aligned and SCREEN_WIDTH is a
	'' multiple of 4 - confirm for the frame buffer actually passed in.
	asm
		'' Phase 1: save the top row to prb, one dword at a time.
		mov	esi, [pfb]
		mov	edi, [prb]
		mov	ecx, ( SCREEN_WIDTH - 1 )
		0:
		mov	eax, [esi+ecx*4]
		mov	[edi+ecx*4], eax
		dec	ecx
		jns	0b
		
		'' Phase 2: copy row (y+1) onto row y, 16 bytes per iteration.
		'' ebp is borrowed as the column index and restored afterwards.
		push	ebp
		mov	edx, ( SCREEN_HEIGHT - 1 )	'' y counter
		Xor	edi, edi	'' y component of address
		0:
		add	edi, ( SCREEN_WIDTH Shr 2 )
		mov	ecx, ( SCREEN_WIDTH Shr 2 )		'' x counter
		Xor	ebp, ebp	'' x component of address
		1:
		mov	ebx, ebp
		add	ebx, edi
		'------------------------------------------------------
		'' Scale factors are limited to 2,4,and 8, so scaling
		'' EBX by 16 must be done with a separate instruction.
		'------------------------------------------------------
		Shl	ebx, 4
		movaps	xmm0, [esi+ebx]	'' SSE1
		inc	ebp
		'' The destination is one row (SCREEN_WIDTH * 4 bytes) above the source.
		movaps	[esi+ebx-( SCREEN_WIDTH Shl 2 )], xmm0	'' SSE1
		dec	ecx
		jnz	1b
		dec	edx
		jnz	0b
		pop	ebp
		
		'' Phase 3: write the saved row into the bottom row.
		mov	esi, [prb]
		mov	edi, [pfb]
		mov	ecx, ( SCREEN_WIDTH - 1 )
		0:
		mov	eax, [esi+ecx*4]
		mov	ebx, ecx
		add	ebx, ( SCREEN_HEIGHT - 1 )*SCREEN_WIDTH
		mov	[edi+ebx*4], eax
		dec	ecx
		jns	0b
	End asm
	
	
End Sub

'=========================================================================

Sub scrollupsse2	( ByVal pfb As UInteger Ptr, ByVal prb As UInteger Ptr )
	
	'' SSE2 version: identical in structure to scrollupsse, except that the
	'' 16-byte moves use the integer-domain movdqa instead of movaps.
	''
	''   pfb - start of the SCREEN_WIDTH x SCREEN_HEIGHT dword buffer
	''   prb - scratch row of at least SCREEN_WIDTH dwords
	''
	'' movdqa requires 16-byte aligned addresses.
	'' NOTE(review): assumes pfb is 16-byte aligned and SCREEN_WIDTH is a
	'' multiple of 4 - confirm for the frame buffer actually passed in.
	asm
		'' Phase 1: save the top row to prb, one dword at a time.
		mov	esi, [pfb]
		mov	edi, [prb]
		mov	ecx, ( SCREEN_WIDTH - 1 )
		0:
		mov	eax, [esi+ecx*4]
		mov	[edi+ecx*4], eax
		dec	ecx
		jns	0b
		
		'' Phase 2: copy row (y+1) onto row y, 16 bytes per iteration.
		'' ebp is borrowed as the column index and restored afterwards.
		push	ebp
		mov	edx, ( SCREEN_HEIGHT - 1 )	'' y counter
		Xor	edi, edi	'' y component of address
		0:
		add	edi, ( SCREEN_WIDTH Shr 2 )
		mov	ecx, ( SCREEN_WIDTH Shr 2 )		'' x counter
		Xor	ebp, ebp	'' x component of address
		1:
		mov	ebx, ebp
		add	ebx, edi
		'------------------------------------------------------
		'' Scale factors are limited to 2,4,and 8, so scaling
		'' EBX by 16 must be done with a separate instruction.
		'------------------------------------------------------
		Shl	ebx, 4
		movdqa	xmm0, [esi+ebx]	'' SSE2
		inc	ebp
		'' The destination is one row (SCREEN_WIDTH * 4 bytes) above the source.
		movdqa	[esi+ebx-( SCREEN_WIDTH Shl 2 )], xmm0	'' SSE2
		dec	ecx
		jnz	1b
		dec	edx
		jnz	0b
		pop	ebp
		
		'' Phase 3: write the saved row into the bottom row.
		mov	esi, [prb]
		mov	edi, [pfb]
		mov	ecx, ( SCREEN_WIDTH - 1 )
		0:
		mov	eax, [esi+ecx*4]
		mov	ebx, ecx
		add	ebx, ( SCREEN_HEIGHT - 1 )*SCREEN_WIDTH
		mov	[edi+ebx*4], eax
		dec	ecx
		jns	0b
	End asm
	
	
End Sub

'=========================================================================

/'
	Naked means there is no prologue or epilogue code: the parameters
	have to be addressed relative to esp and the routine must issue its
	own ret.
	
	ebx, esi, edi and ebp are callee-saved in the 32-bit x86 calling
	conventions.  fbc's own code generator may not depend on the first
	three surviving a call, but other callers (callbacks, code built with
	a different backend) can, so all four are saved and restored here.
	
	NOTE(review): the routine returns with a plain ret, i.e. the caller is
	expected to remove the arguments (cdecl).  If this Sub is compiled
	with a callee-cleanup default convention (stdcall), every call leaves
	8 bytes on the stack - confirm and consider declaring it Cdecl.
'/
Sub nakedup1		naked	( ByVal pfb As UInteger Ptr, ByVal prb As UInteger Ptr )
	
	'' Stack after the four pushes below:
	''   [esp+0] edi  [esp+4] esi  [esp+8] ebx  [esp+12] ebp
	''   [esp+16] return address  [esp+20] pfb  [esp+24] prb
	#define		pfb_	esp+20
	#define		prb_	esp+24
	
	Asm
		'' Preserve the callee-saved registers this routine overwrites
		push	ebp
		push	ebx
		push	esi
		push	edi
		
		'' Phase 1: save the top row to prb, last column first
		mov	esi, [pfb_]
		mov	edi, [prb_]
		mov	ecx, ( SCREEN_WIDTH - 1 )
		0:
		mov	eax, [esi+ecx*4]
		mov	[edi+ecx*4], eax
		dec	ecx
		jns	0b
		
		'---------------------------------------------------------------
		'' Compared to the corresponding code in scrollup0, most of the
		'' speed advantage for this code is from the elimination of two
		'' imul eax, SCREEN_WIDTH instructions in the inner loop.
		''
		'' Notice how converting this to being a naked function auto-
		'' matically gives us ebp as a general purpose register.
		'---------------------------------------------------------------
		
		'' Phase 2: copy row (y+1) onto row y for every row but the last
		mov	edx, ( SCREEN_HEIGHT - 1 )	'' y counter
		Xor	edi, edi	'' y component of address
		0:
		add	edi, SCREEN_WIDTH
		mov	ecx, SCREEN_WIDTH	'' x counter
		Xor	ebp, ebp	'' x component of address
		1:
		mov	ebx, ebp
		add	ebx, edi
		mov	eax, [esi+ebx*4]
		Sub	ebx, SCREEN_WIDTH
		inc	ebp
		mov	[esi+ebx*4], eax
		dec	ecx
		jnz	1b
		dec	edx
		jnz	0b
		
		'' Phase 3: write the saved row into the bottom row
		mov	esi, [prb_]
		mov	edi, [pfb_]
		mov	ecx, ( SCREEN_WIDTH - 1 )
		0:
		mov	eax, [esi+ecx*4]
		mov	ebx, ecx
		add	ebx, ( SCREEN_HEIGHT - 1 )*SCREEN_WIDTH
		mov	[edi+ebx*4], eax
		dec	ecx
		jns	0b
		
		
		'' Restore the saved registers (reverse push order) and return
		pop	edi
		pop	esi
		pop	ebx
		pop	ebp
		ret
	End asm
	
	#undef		prb_
	#undef		pfb_
	
End Sub

'=========================================================================

Sub nakedupmmx		naked	( ByVal pfb As UInteger Ptr, ByVal prb As UInteger Ptr )
	
	'' Naked MMX scroller: rotates the frame buffer up by one scan line,
	'' moving two pixels (8 bytes) per movq in the row-to-row copy.
	''
	''   pfb - start of the SCREEN_WIDTH x SCREEN_HEIGHT dword buffer
	''   prb - scratch row of at least SCREEN_WIDTH dwords
	''
	'' ebp used to be parked at [esp-8], i.e. below the stack pointer, where
	'' anything that runs on this stack asynchronously may overwrite it.  It
	'' is now pushed, together with the other callee-saved registers that
	'' the routine overwrites (ebx, esi, edi).
	''
	'' Stack after the four pushes below:
	''   [esp+0] edi  [esp+4] esi  [esp+8] ebx  [esp+12] ebp
	''   [esp+16] return address  [esp+20] pfb  [esp+24] prb
	''
	'' NOTE(review): returns with a plain ret (caller removes the
	'' arguments); confirm the Sub is not compiled as stdcall.
	#define		pfb_	esp+20
	#define		prb_	esp+24
	
	Asm
		'' Preserve the callee-saved registers this routine overwrites
		push	ebp
		push	ebx
		push	esi
		push	edi
		
		'' Phase 1: save the top row to prb, one dword at a time
		mov	esi, [pfb_]
		mov	edi, [prb_]
		mov	ecx, ( SCREEN_WIDTH - 1 )
		0:
		mov	eax, [esi+ecx*4]
		mov	[edi+ecx*4], eax
		dec	ecx
		jns	0b
		
		'' Phase 2: copy row (y+1) onto row y, one qword per iteration
		mov	edx, ( SCREEN_HEIGHT - 1 )	'' y counter
		Xor	edi, edi	'' y component of address
		0:
		add	edi, ( SCREEN_WIDTH Shr 1 )
		mov	ecx, ( SCREEN_WIDTH Shr 1 )	'' x counter
		Xor	ebp, ebp	'' x component of address
		1:
		mov	ebx, ebp
		add	ebx, edi
		movq	mm0, [esi+ebx*8]
		Sub	ebx, ( SCREEN_WIDTH Shr 1 )
		inc	ebp
		movq	[esi+ebx*8], mm0
		dec	ecx
		jnz	1b
		dec	edx
		jnz	0b
		
		'' Phase 3: write the saved row into the bottom row
		mov	esi, [prb_]
		mov	edi, [pfb_]
		mov	ecx, ( SCREEN_WIDTH - 1 )
		0:
		mov	eax, [esi+ecx*4]
		mov	ebx, ecx
		add	ebx, ( SCREEN_HEIGHT - 1 )*SCREEN_WIDTH
		mov	[edi+ebx*4], eax
		dec	ecx
		jns	0b
		
		
		'----------------------------------------------------------------
		'' Empty the MMX state to avoid interfering with FPU operations.
		'' NOTE(review): femms is a 3DNow! instruction; AMD CPUs without
		'' 3DNow! may fault on it - confirm before defining IS_AMD.
		'----------------------------------------------------------------
		#Ifdef	IS_AMD
			femms
		#Else
			emms
		#EndIf
		
		'' Restore the saved registers (reverse push order) and return
		pop	edi
		pop	esi
		pop	ebx
		pop	ebp
		ret
	End Asm
	
	#undef		prb_
	#undef		pfb_
	
End Sub

'=========================================================================

Sub nakedupsse		naked	( ByVal pfb As UInteger Ptr, ByVal prb As UInteger Ptr )
	
	'' Naked SSE scroller: rotates the frame buffer up by one scan line,
	'' moving four pixels (16 bytes) per movaps in the row-to-row copy.
	''
	''   pfb - start of the SCREEN_WIDTH x SCREEN_HEIGHT dword buffer;
	''         movaps needs it 16-byte aligned (NOTE(review): confirm)
	''   prb - scratch row of at least SCREEN_WIDTH dwords
	''
	'' ebp used to be parked at [esp-8], i.e. below the stack pointer, where
	'' anything that runs on this stack asynchronously may overwrite it.  It
	'' is now pushed, together with the other callee-saved registers that
	'' the routine overwrites (ebx, esi, edi).
	''
	'' Stack after the four pushes below:
	''   [esp+0] edi  [esp+4] esi  [esp+8] ebx  [esp+12] ebp
	''   [esp+16] return address  [esp+20] pfb  [esp+24] prb
	''
	'' NOTE(review): returns with a plain ret (caller removes the
	'' arguments); confirm the Sub is not compiled as stdcall.
	#define		pfb_	esp+20
	#define		prb_	esp+24
	
	Asm
		'' Preserve the callee-saved registers this routine overwrites
		push	ebp
		push	ebx
		push	esi
		push	edi
		
		'' Phase 1: save the top row to prb, one dword at a time
		mov	esi, [pfb_]
		mov	edi, [prb_]
		mov	ecx, ( SCREEN_WIDTH - 1 )
		0:
		mov	eax, [esi+ecx*4]
		mov	[edi+ecx*4], eax
		dec	ecx
		jns	0b
		
		'' Phase 2: copy row (y+1) onto row y, 16 bytes per iteration
		mov	edx, ( SCREEN_HEIGHT - 1 )	'' y counter
		Xor	edi, edi	'' y component of address
		0:
		add	edi, ( SCREEN_WIDTH Shr 2 )
		mov	ecx, ( SCREEN_WIDTH Shr 2 )		'' x counter
		Xor	ebp, ebp	'' x component of address
		1:
		mov	ebx, ebp
		add	ebx, edi
		'------------------------------------------------------
		'' Scale factors are limited to 2,4,and 8, so scaling
		'' EBX by 16 must be done with a separate instruction.
		'------------------------------------------------------
		Shl	ebx, 4
		movaps	xmm0, [esi+ebx]	'' SSE1
		inc	ebp
		'' Destination is one row (SCREEN_WIDTH * 4 bytes) above the source
		movaps	[esi+ebx-( SCREEN_WIDTH Shl 2 )], xmm0	'' SSE1
		dec	ecx
		jnz	1b
		dec	edx
		jnz	0b
		
		'' Phase 3: write the saved row into the bottom row
		mov	esi, [prb_]
		mov	edi, [pfb_]
		mov	ecx, ( SCREEN_WIDTH - 1 )
		0:
		mov	eax, [esi+ecx*4]
		mov	ebx, ecx
		add	ebx, ( SCREEN_HEIGHT - 1 )*SCREEN_WIDTH
		mov	[edi+ebx*4], eax
		dec	ecx
		jns	0b
		
		'' Restore the saved registers (reverse push order) and return
		pop	edi
		pop	esi
		pop	ebx
		pop	ebp
		ret
	End Asm
	
	#undef		prb_
	#undef		pfb_
	
End Sub

'=========================================================================

Sub nakedupsse2		naked	( ByVal pfb As UInteger Ptr, ByVal prb As UInteger Ptr )
	
	'' Naked SSE2 scroller: rotates the frame buffer up by one scan line,
	'' moving four pixels (16 bytes) per movdqa in the row-to-row copy.
	''
	''   pfb - start of the SCREEN_WIDTH x SCREEN_HEIGHT dword buffer;
	''         movdqa needs it 16-byte aligned (NOTE(review): confirm)
	''   prb - scratch row of at least SCREEN_WIDTH dwords
	''
	'' ebp used to be parked at [esp-8], i.e. below the stack pointer, where
	'' anything that runs on this stack asynchronously may overwrite it.  It
	'' is now pushed, together with the other callee-saved registers that
	'' the routine overwrites (ebx, esi, edi).
	''
	'' Stack after the four pushes below:
	''   [esp+0] edi  [esp+4] esi  [esp+8] ebx  [esp+12] ebp
	''   [esp+16] return address  [esp+20] pfb  [esp+24] prb
	''
	'' NOTE(review): returns with a plain ret (caller removes the
	'' arguments); confirm the Sub is not compiled as stdcall.
	#define		pfb_	esp+20
	#define		prb_	esp+24
	
	Asm
		'' Preserve the callee-saved registers this routine overwrites
		push	ebp
		push	ebx
		push	esi
		push	edi
		
		'' Phase 1: save the top row to prb, one dword at a time
		mov	esi, [pfb_]
		mov	edi, [prb_]
		mov	ecx, ( SCREEN_WIDTH - 1 )
		0:
		mov	eax, [esi+ecx*4]
		mov	[edi+ecx*4], eax
		dec	ecx
		jns	0b
		
		'' Phase 2: copy row (y+1) onto row y, 16 bytes per iteration
		mov	edx, ( SCREEN_HEIGHT - 1 )	'' y counter
		Xor	edi, edi	'' y component of address
		0:
		add	edi, ( SCREEN_WIDTH Shr 2 )
		mov	ecx, ( SCREEN_WIDTH Shr 2 )		'' x counter
		Xor	ebp, ebp	'' x component of address
		1:
		mov	ebx, ebp
		add	ebx, edi
		'------------------------------------------------------
		'' Scale factors are limited to 2,4,and 8, so scaling
		'' EBX by 16 must be done with a separate instruction.
		'------------------------------------------------------
		Shl	ebx, 4
		movdqa	xmm0, [esi+ebx]	'' SSE2
		inc	ebp
		'' Destination is one row (SCREEN_WIDTH * 4 bytes) above the source
		movdqa	[esi+ebx-( SCREEN_WIDTH Shl 2 )], xmm0	'' SSE2
		dec	ecx
		jnz	1b
		dec	edx
		jnz	0b
		
		'' Phase 3: write the saved row into the bottom row
		mov	esi, [prb_]
		mov	edi, [pfb_]
		mov	ecx, ( SCREEN_WIDTH - 1 )
		0:
		mov	eax, [esi+ecx*4]
		mov	ebx, ecx
		add	ebx, ( SCREEN_HEIGHT - 1 )*SCREEN_WIDTH
		mov	[edi+ebx*4], eax
		dec	ecx
		jns	0b
		
		'' Restore the saved registers (reverse push order) and return
		pop	edi
		pop	esi
		pop	ebx
		pop	ebp
		ret
	End Asm
	
	#undef		prb_
	#undef		pfb_
	
End Sub

'=========================================================================
'' Scratch images used by scrollup2; both are allocated by the main program.
Dim Shared As Any Ptr horzsave	'' holds the top scan line
Dim Shared As Any Ptr frame	'' holds every scan line below the top one

'' Scrolls the current graphics screen up by one line using the built-in
'' Get/Put statements.  pfb and prb are accepted only so the routine has the
'' same signature as the other scrollers; neither is used.
Sub scrollup2		( ByVal pfb As UInteger Ptr, ByVal prb As UInteger Ptr )
	
	Const RIGHT_EDGE	= SCREEN_WIDTH - 1
	Const BOTTOM_EDGE	= SCREEN_HEIGHT - 1
	
	'' Grab the top line and everything beneath it ...
	Get ( 0, 0 )-( RIGHT_EDGE, 0 ), horzsave
	Get ( 0, 1 )-( RIGHT_EDGE, BOTTOM_EDGE ), frame
	
	'' ... then paste both back one line higher, wrapping the top line
	Put ( 0, 0 ), frame, PSet
	Put ( 0, BOTTOM_EDGE ), horzsave, PSet
	
End Sub

'=========================================================================


'' shadowtext( x, y, text )
''   Draws text at pixel position (x, y) on the graphics screen with a
''   colour-0 outline (four copies offset diagonally by one pixel), then
''   appends the same text to the console, indented by x / 8 spaces, without
''   a trailing newline.
''   Every use of a macro argument is parenthesised so that expressions
''   containing lower-precedence operators can be passed safely.
''   File number 123 is opened and closed on each call.
#macro	shadowtext( _x_, _y_, _text_ )
	Draw String ( ( _x_ ) - 1, ( _y_ ) - 1 ), _text_, 0
	Draw String ( ( _x_ ) - 1, ( _y_ ) + 1 ), _text_, 0
	Draw String ( ( _x_ ) + 1, ( _y_ ) - 1 ), _text_, 0
	Draw String ( ( _x_ ) + 1, ( _y_ ) + 1 ), _text_, 0
	Draw String ( ( _x_ ), ( _y_ ) ), _text_
	Open Cons For Append As #123
	Print #123, Space( ( _x_ ) / 8 ); _text_;
	Close #123
#EndMacro


'=========================================================================

'' rollup()
''   Scrolls the screen up by eight lines (one line per vertical sync, using
''   scrollup1) and then ends the current console line.
''   Expects pfb and rb() to be visible where the macro is expanded.
#Macro	rollup()
	Scope
		Dim As Integer	linesLeft = 8
		
		Do While linesLeft > 0
			ScreenSync
			ScreenLock
			pfb		= ScreenPtr
			
			scrollup1( pfb, @rb( 0 ) )
			
			ScreenUnlock
			linesLeft -= 1
		Loop
	End Scope
	
	Open Cons For Append As #123
	Print #123,
	Close #123
#EndMacro


'=========================================================================

'' screengarbage()
''   Fills the whole frame buffer with random colours so that the scrolling
''   is visible.  Expects pfb to be visible where the macro is expanded.
#Macro	screengarbage()
	Scope
		ScreenLock
		pfb		= ScreenPtr
		
		Dim As UInteger Ptr	pixel	= pfb
		Dim As UInteger Ptr	pastEnd	= pfb + ( SCREEN_WIDTH * SCREEN_HEIGHT )
		
		Do While pixel < pastEnd
			*pixel	= rgb( Rnd * 255, Rnd * 255, Rnd * 255 )
			pixel	+= 1
		Loop
		
		ScreenUnLock
	End Scope
#endmacro


'=========================================================================

'' compare( func )
''   Times the scroll routine func and prints the result next to its name.
''
''   Release builds (__FB_DEBUG__ = 0): func is invoked between
''   COUNTER_BEGIN and COUNTER_END (macros expected from counter.bas) with
''   the screen locked; Timer is sampled immediately before and after, and
''   the elapsed time is scaled by 1000000 / INTTERATIONS before being
''   formatted.  NOTE(review): this presumes COUNTER_BEGIN/COUNTER_END run
''   func INTTERATIONS times, giving microseconds per call - confirm against
''   counter.bas.  The \230 in the format string is a character code that is
''   presumably the micro sign in the console code page.
''
''   Debug builds: func is called once, preceded by an int3 breakpoint, and
''   the text "Debug" is shown instead of a time.
''
''   Afterwards rollup() scrolls the result line away.
''   Expects pfb and rb() to be visible where the macro is expanded.
#Macro  compare( _func_ )
	Scope
		Const As Integer	INTTERATIONS	= SCREEN_HEIGHT
		
		#If __FB_DEBUG__ = 0
			
			Dim As Double   t2      	= Timer
			Dim As Double   t1		= Timer
			
		#EndIf
		
		
		ScreenSync
		ScreenLock
		pfb		= ScreenPtr
		
		
		#If __FB_DEBUG__ = 0
			t1      = Timer
			COUNTER_BEGIN( INTTERATIONS, REALTIME_PRIORITY_CLASS )
		#Else
			Asm	int3
		#EndIf
		
		
		_func_( pfb, @rb( 0 ) )
		
		
		#If __FB_DEBUG__ = 0
			COUNTER_END
			t2      = Timer
		#EndIf
		
		
		#If __FB_DEBUG__ = 0
			var	text	= Format( ( t2 - t1 ) * ( 1000000 / INTTERATIONS ), !"#####\230s" )
			shadowtext( 128 + 1, 1, text )
		#Else
			shadowtext( 128 + 1, 1, "Debug" )
		#EndIf
		
		
		ScreenUnLock
		
		
		rollup()
		
		
	End     Scope
#endmacro


'=========================================================================

'' scroll( func )
''   Prints the name of the scroll routine func, optionally demonstrates it
''   by scrolling the screen through one full revolution (skipped when
''   NO_DEMO is defined), and then benchmarks it with compare().
''   Expects pfb and rb() to be visible where the macro is expanded.
#Macro	scroll( _func_ )
	Scope
		
		shadowtext( 1, 1, #_func_ )
		
		#IfNDef	NO_DEMO
			Dim As Integer	passesLeft = SCREEN_HEIGHT
			
			Do While passesLeft > 0
				ScreenSync
				ScreenLock
				pfb		= ScreenPtr
				
				_func_( pfb, @rb( 0 ) )
				
				ScreenUnLock
				passesLeft -= 1
			Loop
		#EndIf
	End Scope
	compare( _func_		)
#EndMacro


'=========================================================================

'' Benchmark driver: opens the graphics screen, allocates the scratch
'' buffers, fills the screen with random pixels and runs every enabled
'' scroller through scroll(), separating the groups with rollup().

Dim As Uinteger		rb( SCREEN_WIDTH )		'' scratch row handed to the scrollers
Dim As UInteger Ptr	pfb		= Any		'' set from ScreenPtr by the macros


ScreenRes SCREEN_WIDTH,SCREEN_HEIGHT,32

'' ScreenPtr is 0 when no graphics mode is active; the scrollers would then
'' dereference a null frame buffer, so stop here instead.
If ScreenPtr = 0 Then
	Print "Unable to open a 32-bit graphics screen"
	End 1
End If

horzsave	= ImageCreate(SCREEN_WIDTH,1)
frame		= imagecreate(SCREEN_WIDTH,SCREEN_HEIGHT)

'' ImageCreate returns 0 on failure; scrollup2 needs both images.
If ( horzsave = 0 ) Or ( frame = 0 ) Then
	If horzsave <> 0 Then ImageDestroy( horzsave )
	If frame <> 0 Then ImageDestroy( frame )
	Screen 0
	Print "Unable to allocate the scratch images used by scrollup2"
	End 1
End If


'=========================================================================

screengarbage()


scroll( scrollup0	)
rollup()


#IfDef	USE_ASM_386
	scroll( scrollup1	)
	scroll( nakedup1	)
	rollup()
#EndIf


#IfDef	USE_ASM_MMX
	scroll( scrollupmmx	)
	scroll( nakedupmmx	)
	rollup()
#EndIf


#IfDef	USE_ASM_SSE
	scroll( scrollupsse	)
	scroll( nakedupsse	)
	rollup()
#EndIf


#IfDef	USE_ASM_SSE2
	scroll( scrollupsse2	)
	scroll( nakedupsse2	)
	rollup()
#EndIf


scroll( scrollup2	)
rollup()


shadowtext( 1, 1, "fin" )
rollup()


'=========================================================================

'' Wait for a key press, then release the scratch images.
Sleep
ImageDestroy( horzsave )
ImageDestroy( frame )

Output from my machine (512x512, IS_AMD, NO_DEMO):

Code: Select all

scrollup0                       2256µs

scrollup1                       737µs
nakedup1                        681µs

scrollupmmx                     734µs
nakedupmmx                      695µs

scrollupsse                     634µs
nakedupsse                      609µs

scrollupsse2                    634µs
nakedupsse2                     606µs

scrollup2                       1717µs

fin
BasicCoder2
Posts: 3954
Joined: Jan 01, 2009 7:03
Location: Australia

Post by BasicCoder2 »

kiyotewolf wrote:
One of the stumbling blocks for me is, in 6502, you had all the hexadecimal values for every op-code.

I've never seen a complete mapping of each usage of each ASM command in 8088+ language, turned into it's exact hex opcode equivilants.
But if you are not using a 6502 what can you do about that? Forget about it I would suggest. Let the assembler generate the hexadecimal values.
What's that good for? Well, it's very good to know, if you're coding inline asm in Qbasic, and you need, actual, numbers, in hex, to plug into an array or string to then CALL ABSOLUTE to.
Yes, IF, IF you are programming in qbasic. But if you are not using qbasic in a DOS machine then it is a different ball game. And if your program is a computer game you are going to want to make use of fast graphics and that means using DirectX.

I used to love assembler on the old machines but 10 years ago when I tried to update to using it on a modern machine I found out it was nothing but calling Win32 API functions I had no direct access to anything. So until I found FreeBasic I dabbled in other languages.

Using tricks like dithering to increase the apparent number of colors is just a complication. Yes a palette of colors is nice and possible with rgb colors but its also nice to have the simplification of rgb colors when you need them.

I know that if you are an expert doing your calculations with an abacus or a slide rule you might show resistence to using one of those new fangled electronic calculators, and certainly those who are the guardians of the old ways are of historical value, but I would much prefer to have had the brains and time to keep up with the latest programming techniques and modern technology.

However I make a distinction between a professor of English and someone who can write a good novel. Some people are experts on the language while others actually use it for something interesting. Both kinds are required. For example someone might be able to write a great graphics program but have no artistic ability to actually use it. Others might be great on the math involved in 2D or 3D graphic programs and yet not be able to come up with a fun game.

It seems to me that if you want to write some of the older kind of games then FreeBasic has everything you need without resorting to asm or needing the hardware sprites etc required in the older machines.

JohnC
kiyotewolf
Posts: 1009
Joined: Oct 11, 2008 7:42
Location: ABQ, NM
Contact:

Post by kiyotewolf »

I guess you might call me a "Steampunk Programmer"..

Resorting to old steam methods of powering my game engines.



~Kiyote!
BasicCoder2
Posts: 3954
Joined: Jan 01, 2009 7:03
Location: Australia

Post by BasicCoder2 »

kiyotewolf wrote:I guess you might call me a "Steampunk Programmer"..

Resorting to old steam methods of powering my game engines.

~Kiyote!

Yes I guess so :)

Nostalgia is a pleasant experience but to live in the past is to lose your present and thus your future past and your potential for growth. Still we must accept our limits thus I am using FreeBasic instead of C++.

"Game programmers have resisted the Windows platform since the beginning of time, but like the Borg say, "Resistence is futile..." I tend to agree."
Tricks of the Windows game programming gurus - Andre LaMothe

JohnC
kiyotewolf
Posts: 1009
Joined: Oct 11, 2008 7:42
Location: ABQ, NM
Contact:

Post by kiyotewolf »

When I used to go to Barnes & Noble, way back when, I found books on Commodore stuff, and books on DOS.

Then, these strange new language books started coming out, including web based stuff, and I couldn't do anything but scratch my head and watch as the books I understood slowly drifted from the shelves forever more.

I know what I do can be archaic at times, but I'm still playing catch-up, to what happened over 15+ years ago, when things went way beyond my control.

I'm going to be using OOP very soon, cause someone's most recent demo of something else, included very rudimentary OOP implementation which I ACTUALLY UNDERSTOOD.

So, I'm working on moving into the new century of coding.

Srsly. I'm trying.



~Kiyote!
BasicCoder2
Posts: 3954
Joined: Jan 01, 2009 7:03
Location: Australia

Post by BasicCoder2 »

When I used to go to Barnes & Noble, way back when, I found books on Commodore stuff, and books on DOS.

Then, these strange new language books started coming out, including web based stuff, and I couldn't do anything but scratch my head and watch as the books I understood slowly drifted from the shelves forever more.

I know what I do can be archaic at times, but I'm still playing catch-up, to what happened over 15+ years ago, when things went way beyond my control.

I'm going to be using OOP very soon, cause someone's most recent demo of something else, included very rudimentary OOP implementation which I ACTUALLY UNDERSTOOD.

So, I'm working on moving into the new century of coding.

Srsly. I'm trying.

~Kiyote!
Yes my tale is much the same. I had some angst over my hard earned knowledge and skills in my hobby becoming dated and had to make a conscious effort to forget about the old machines and try to move on ...

I did some OOP with C++ but I haven't the time to learn FB OO nomenclature. Most of DJPeter's code for example I simply cannot read. I was happy to go back in time to FB as an updated QB but without the need to add stuff such as a mouse driver and with the bonus of plenty of memory, speed and great graphics. Apart from that FB for me is QB and I just try to keep my programs modular.

I would have stuck with DevC++ and SDL for graphics if I hadn't found FreeBasic and DJPeter's FB code for using .dlls for accessing a webcam and the k8085 i/o board.

If you want a scrolly game you will have to use PUT if you want the same kind of display speeds you could get on the old machines.

JohnC
kiyotewolf
Posts: 1009
Joined: Oct 11, 2008 7:42
Location: ABQ, NM
Contact:

Post by kiyotewolf »

Did you guys find any real answer to this as an optimization method, vs the original FB code that I posted, for BasicCoder2 to use for the conversion to ASM?

<.< I find now, that here, as I'm needing it here very shortly, that I can only rely on my code, because I found out BasicCoder2's ASM version missed some, on his conversion.

The ASM is incomplete, on at least one of the scroll functions. I know that for sure, but did not research exactly what was missing / wrong with the conversion on a whole.

@BasicCoder2 ~ (If you want me to point out where the ASM is broken, so you can re-do it, I will, but if you're tired of helping me with this, I can understand, and you don't have to.)

~~~

Did anyone figure out, is my original FB code, as efficient, as BasicCoder2's ASM version?

Even though his version has bugs, it is still 99% correct and complete. He did exactly what I wanted, so testing for speed is legal between my FB code and his ASM versions.



~Kiyote!

I just wanna put this to rest, so I can either ask BasicCoder2 for more help (pretty please, if you're still willing), or just stick to my FB version.
marcov
Posts: 3503
Joined: Jun 16, 2005 9:45
Location: Netherlands
Contact:

Post by marcov »

kiyotewolf wrote:Honestly, I don't know why learning the 80386 set is so hard for me.

One of the stumbling blocks for me is, in 6502, you had all the hexadecimal values for every op-code.
The Intel (and AMD) architecture manuals are freely downloadable and have all the info you need.

The trouble is that it is a lot larger, and not as one dimensional as it was on the 6502.

Anyway, better move directly to x86_64 and forget all this old i386 stuff.
kiyotewolf
Posts: 1009
Joined: Oct 11, 2008 7:42
Location: ABQ, NM
Contact:

Post by kiyotewolf »

But I could understand the one dimensional 6502!

<.< it's 8088+ .. (and so on and so on)..

that confuses the living cupcakes out of me..........

O.o and I have no real clear idea as to why either!



~Kiyote!
marcov
Posts: 3503
Joined: Jun 16, 2005 9:45
Location: Netherlands
Contact:

Post by marcov »

kiyotewolf wrote:But I could understand the one dimensional 6502!
If you can't understand it, consider not diving into assembler. It is not as worthwhile as in 6502 times anyway.
<.< it's 8088+ .. (and so on and so on)..
Start with forgetting everything before pentium M. Preferably also P4, since when you finally have mastered that, they are near extinct. No need to dwell on the past.
BasicCoder2
Posts: 3954
Joined: Jan 01, 2009 7:03
Location: Australia

Post by BasicCoder2 »

kiyotewolf wrote: @BasicCoder2 ~ (If you want me to point out where the ASM is broken, so you can re-do it, I will, but if you're tired of helping me with this, I can understand, and you don't have to.
Sure point out where the ASM is broken.

If you want to scroll fast, you will use the FreeBasic methods instead, just as I demonstrated with the bigPic example, which scrolls as fast as you move the mouse by whatever amount and in whatever direction you like.

JohnC
kiyotewolf
Posts: 1009
Joined: Oct 11, 2008 7:42
Location: ABQ, NM
Contact:

Post by kiyotewolf »

@BasicCoder2

Sorry for the delay.

I'll get to the pointing out of the bad code here soonish.



~Kiyote!

Way too many irons in the fire, like usual.
TESLACOIL
Posts: 1769
Joined: Jun 20, 2010 16:04
Location: UK
Contact:

some thoughts

Post by TESLACOIL »

some thoughts

looking at all the robotics kits out there, there are cpu's on small pcb boards that plug in via usb

It would be interesting if some ASM was written for these chips.

As the architecture of these chips will be well known & documented. What is written for one is written for all

What I'm getting at is this: if you had a couple of CPUs hanging off your USB ports, this would provide some parallel processing, which would be handy if you were writing a game, vision application or other app which was bogging down your computer's CPU.

apart from the read write delay of the of the usb mounted chips , (reading data off your pcs ram , and writes the results back ) the only other drain on you pc is a few milliamps of power being sucked out your usb port



Bonus is you can just unplug the chip/s from one machine and stick into another....where ever u need that little boost

I think you will agree this is an intriguing concept , harking back to the days when a fully modded spectrum had all sorts of bits chaining out of it


chess engine tree type searches
sorting large lists
and other mathematically demanding tasks could be offloaded

it wouldn't take long before a nice little library was built up

#include cpu's (1 to n'usb ports)



would be a groovy addition to the free basic environment
Vendan
Posts: 48
Joined: Sep 18, 2006 0:25

Post by Vendan »

veering off the original topic
What im getting at is this, if you had a coupe of cpus hanging off your usb ports this would provide some parallel processing, which would be handy if you where writing a game , vision application or other app which was bogging down your computers PC
It'd be interesting to see something like that, but you do realize a 8Mhz cpu can't compare to a 3Ghz one, right?

What would be better is FPGAs as addon boards.

Back on topic

I'm not sure if this is what you are looking for, but here's my go at making them faster

Code: Select all


sub ScrollUp (RRam2() as ubyte)

'' Rotates a 320x200, one-byte-per-pixel buffer up by one row: rows 1..199
'' move to rows 0..198 and the old top row reappears at the bottom.
'' All sizes are hard-coded for a 64000-byte buffer.

dim topRow(320) as ubyte
dim col as integer

'' Keep a copy of the top row; the block move below overwrites it.
col = 0
do while col <= 319
  topRow(col) = RRam2(col)
  col += 1
loop

asm

	'' [RRam2] is the address of the array descriptor; its first field is
	'' used as the address of the pixel data.
	mov edi, [RRam2]
	mov edi, [edi]
	lea esi, [edi+320]
	'' 199 rows * 320 bytes = 63680 bytes = 15920 dwords, copied forwards
	mov ecx, 15920
rep	movsd

end asm

'' Put the saved row into row 199.
col = 0
do while col <= 319
  RRam2(320*199 + col) = topRow(col)
  col += 1
loop


end sub
sub ScrollDown (RRam2() as ubyte)

'' Rotates a 320x200, one-byte-per-pixel buffer down by one row: rows 0..198
'' move to rows 1..199 and the old bottom row reappears at the top.
'' All sizes are hard-coded for a 64000-byte buffer.

dim bottomRow(320) as ubyte
dim col as integer

'' Keep a copy of the bottom row; the block move below overwrites it.
col = 0
do while col <= 319
  bottomRow(col) = RRam2(320*199 + col)
  col += 1
loop

asm

	'' [RRam2] is the address of the array descriptor; its first field is
	'' used as the address of the pixel data.
	mov esi, [RRam2]
	mov esi, [esi]
	'' Source and destination overlap with the destination higher, so the
	'' copy runs backwards: start at the last dword of row 198 (offset
	'' 63676) and write it one row (320 bytes) further on.
	add esi, 63676
	lea edi, [esi+320]
	mov ecx, 15920
	std
rep	movsd
	'' The direction flag must be clear again before returning.
	cld

end asm

'' Put the saved row into row 0.
col = 0
do while col <= 319
  RRam2(col) = bottomRow(col)
  col += 1
loop

end sub
sub ScrollLeft (RRam2() as ubyte)

'' Rotates every row of a 320x200, one-byte-per-pixel buffer left by one
'' pixel; the pixel that leaves a row on the left re-enters the same row on
'' the right.  All sizes are hard-coded for a 64000-byte buffer.
''
'' Strategy: shift the whole buffer left by one byte as a single linear block
'' (63999 bytes), then repair column 319, which at that point holds the first
'' pixel of the following row.

dim clr as ubyte, z as integer

'' Old pixel (row 0, column 0); it ends up at (row 0, column 319).
clr = RRam2(0)

asm

	'' 63999 bytes have to move: 15999 dwords (63996 bytes) plus 3 single
	'' bytes.  Copying 16000 dwords and then 3 more bytes would read and
	'' write past the end of the 64000-byte buffer.
	mov ECX, 15999
	'' [RRam2] is the address of the array descriptor; its first field is
	'' used as the address of the pixel data.
	mov EAX, [RRam2]
	mov EAX, [EAX]
	mov EDI, EAX
	add EAX, 1
	mov ESI, EAX
rep	MOVSD
	'' ESI/EDI now point at the remaining 3 bytes (source 63997..63999)
	mov ECX, 3
rep	MOVSB

end asm
'for z = 1 to 63999
'  RRam2(z-1) = RRam2(z)
'next z

'' After the linear shift, column 319 of row z-1 holds the old first pixel of
'' row z.  Move each of those down one row, working from the bottom up so no
'' value is overwritten before it has been read.
For z = 199 To 1 Step -1
  RRam2(320 * z+319) = RRam2(320 * (z-1) + 319)
Next z

'' Write to the buffer that was passed in, not to a global array.
RRam2(319) = clr

end sub
sub ScrollRight (RRam2() as ubyte)

'' Rotates every row of a 320x200, one-byte-per-pixel buffer right by one
'' pixel; the pixel that leaves a row on the right re-enters the same row on
'' the left.  All sizes are hard-coded for a 64000-byte buffer.
''
'' Strategy: shift the whole buffer right by one byte as a single linear
'' block (63999 bytes, copied backwards because the destination lies above
'' the source), then repair column 0, which at that point holds the last
'' pixel of the preceding row.

dim clr as ubyte, z as integer

'' Old pixel (row 199, column 319); it ends up at (row 199, column 0).
clr = RRam2(63999)

asm

	'' 63999 bytes have to move: 15999 dwords plus 3 single bytes.
	mov ECX, 15999
	'' [RRam2] is the address of the array descriptor; its first field is
	'' used as the address of the pixel data.
	mov EAX, [RRam2]
	mov EAX, [EAX]
	'' The first dword moved is source bytes 63995..63998, which lands on
	'' bytes 63996..63999.
	add EAX, 63995
	mov ESI, EAX
	add EAX, 1
	mov EDI, EAX
	std
rep	MOVSD
	'' After the dword pass ESI = base-1 and EDI = base.  The three bytes
	'' still to move are source 2,1,0 -> destination 3,2,1, so both
	'' pointers are advanced by 3 before the backwards byte copy.
	add ESI, 3
	add EDI, 3
	mov ECX, 3
rep	MOVSB
	'' The direction flag must be clear again before returning.
	cld
end asm

'for z = 63998 to 0 step -1
'  RRam2(z+1) = RRam2(z)
'next z

'' After the linear shift, column 0 of row z+1 holds the old last pixel of
'' row z.  Move each of those up one row, working from the top down so no
'' value is overwritten before it has been read.
For z = 0 To 198
  RRam2(z*320) = RRam2((z+1) * 320)
Next z

RRam2(199 * 320) = clr

end sub

I think they are pretty well sped up, as on my system 10000 calls take between 0.125 seconds and 0.531 seconds.

due to cache hits and other strange cpu phenomenon that I don't understand terribly well, scrolling up is the fastest at 0.125 and right is the slowest at 0.531. Left and down sit at a nice 0.250 and 0.202, respectively.

For comparison, 10000 calls of the FB version took around 4 seconds to run.

quick note: This is poorly optimized for the specific size of the buffer, if the buffer changes, you get to change the "magic numbers". It's also using the global

If you're looking for a hex->instruction reference, do 2 things.
1. Prepare your mind for the confusion you're about to throw at it. Easiest way? Percussive maintenance with a freaking hammer. I mean seriously, the x86 opcodes are the freaking craziest thing ever. Just wait till you see the horror that is an opcode followed by a ModR/M byte followed by a SIB byte. Oh wait, it was prefixed with a size override modifier and it's followed by a displacement and an immediate. That's right, a single instruction could take up to 4 bytes for the opcode itself and another 8 of data.
2. goto http://ref.x86asm.net. They have a chart of all the possible opcodes. It's about 7~8 pages, one line per opcode. It is rather nice to have though, and I refer to it constantly for my JIT assembler I'm writing(give it asm, and it builds up a function at run time that you can call.)
Post Reply