CountInString()

Post your FreeBASIC source, examples, tips and tricks here. Please don’t post code without including an explanation.
D.J.Peters
Posts: 8586
Joined: May 28, 2005 3:28
Contact:

CountInString()

Post by D.J.Peters »

Adding delimiters is your home work :-)

Joshy

Code: Select all

' count how often the string "search" is in string "source" 
function CountInString overload (byref source as string, _
                                 byref search as string, _
                                 byval case_sensitive as boolean=true) as integer
  var sPos=0,count=0,nSource=len(source), nSearch=len(search)
  if nSearch<1 then return 0
  if nSource<nSearch then return 0
  while sPos+nSearch <= nSource
    var nSame=0,sStart=sPos
    if case_sensitive then
      for i as integer=0 to nSearch-1
        if source[sStart]<>search[i] then exit for
        sStart+=1:nSame+=1
      next
    else
      for i as integer=0 to nSearch-1
        var a=source[sStart],b=search[i]
        if a>64 andalso a<91 andalso b>96 andalso b<123 then
          ' if a in range of "A"-"Z" and b in range "a"-"z"
          b-=32 ' change b
        elseif b>64 andalso b<91 andalso a>96 andalso a<123 then
          ' if a in range of "a"-"z" and b in range "A"-"Z"
          a-=32 ' change a
        end if
        if a<>b then exit for
        sStart+=1:nSame+=1
      next 
    end if  
    if nSame=nSearch then
      count+=1 : sPos+=nSearch
    else
      sPos+=1
    end if  
  wend  
  return count
end function
' does the same but returns a redimed array of the positions 
function CountInString(byref source as string, _
                       byref search as string, _
                       positions() as integer, _
                       byval case_sensitive as boolean=true) as integer
  var sPos=0,count=0,nSource=len(source), nSearch=len(search)
  if nSearch<1 then return 0
  if nSource<nSearch then return 0
  while sPos+nSearch <= nSource
    var nSame=0,sStart=sPos
    if case_sensitive then
      for i as integer=0 to nSearch-1
        if source[sStart]<>search[i] then exit for
        sStart+=1:nSame+=1
      next
    else
      for i as integer=0 to nSearch-1
        var a=source[sStart],b=search[i]
        if a>64 andalso a<91 andalso b>96 andalso b<123 then
          ' if a in range of "A"-"Z" and b in range "a"-"z"
          b-=32 ' change b
        elseif b>64 andalso b<91 andalso a>96 andalso a<123 then
          ' if a in range of "a"-"z" and b in range "A"-"Z"
          a-=32 ' change a
        end if
        if a<>b then exit for
        sStart+=1:nSame+=1
      next 
    end if  
    if nSame=nSearch then
      if ubound(positions)<count then redim preserve positions(iif(count=0,0,count shl 1))
      positions(count)=sPos+1 : count+=1 : sPos+=nSearch
    else
      sPos+=1
    end if
  wend  
    
  return count
end function
var test = "az Az aZ"
print test
print "aZ",CountInString(test,"aZ")
print "Az",CountInString(test,"Az")
print "aZ",CountInString(test,"aZ")
print "AZ",CountInString(test,"AZ")
print "AZ",CountInString(test,"AZ",false)
print

redim as integer p()
test = "print Print PRINT PrInT"
print test
var count = CountInString(test,"print",p())
print "print " & count : if count then for i as integer=0 to count-1 : print p(i), : next : print
count = CountInString(test,"PRINT",p(),false)
print "PRINT " & count : if count then for i as integer=0 to count-1 : print p(i), : next : print 
print
sleep
jj2007
Posts: 2326
Joined: Oct 23, 2016 15:28
Location: Roma, Italia
Contact:

Re: CountInString()

Post by jj2007 »

Looks almost like Count(). Might be interesting to test the speed difference ;-)
grindstone
Posts: 862
Joined: May 05, 2015 5:35
Location: Germany

Re: CountInString()

Post by grindstone »

Looks quite complicated to me.

Code: Select all

' count how often the string "search" is in string "source"
Function CountInString OverLoad (ByRef source As String, _
                                 ByRef search As String, _
                                 ByVal case_sensitive As boolean=TRUE) As Integer
  
  Dim As Integer count, begin
  
  If Not case_sensitive Then
  	source = LCase(source)
  	search = LCase(search)
  EndIf
  
  Do
  	begin = InStr(begin + 1, source, search)
  	If begin Then
  		count += 1
  		begin += Len(search)
  	Else
  		Return count
  	EndIf
  Loop
    
End Function
' does the same but returns a redimed array of the positions
Function CountInString(ByRef source As String, _
                       ByRef search As String, _
                       positions() As Integer, _
                       ByVal case_sensitive As boolean=TRUE) As Integer
  
  Dim As Integer count, begin
  
  If Not case_sensitive Then
  	source = LCase(source)
  	search = LCase(search)
  EndIf
  
  Do
  	begin = InStr(begin + 1, source, search)
  	If begin Then
  		ReDim Preserve positions(count)
  		positions(count) = begin
  		begin += Len(search)
  		count += 1
  	Else
  		Return count
  	EndIf
  Loop
   
End Function
Var test = "az Az aZ"
Print test
Print "aZ",CountInString(test,"aZ")
Print "Az",CountInString(test,"Az")
Print "aZ",CountInString(test,"aZ")
Print "AZ",CountInString(test,"AZ")
Print "AZ",CountInString(test,"AZ",FALSE)
Print

ReDim As Integer p()
test = "print Print PRINT PrInT"
Print test
Var count = CountInString(test,"print",p())
Print "print " & count : If count Then For i As Integer=0 To count-1 : Print p(i), : Next : Print
count = CountInString(test,"PRINT",p(),FALSE)
Print "PRINT " & count : If count Then For i As Integer=0 To count-1 : Print p(i), : Next : Print
Print
Sleep

But maybe your way is faster than using InStr.
D.J.Peters
Posts: 8586
Joined: May 28, 2005 3:28
Contact:

Re: CountInString()

Post by D.J.Peters »

@grindstone I see stuff like a compiler or assembler and every call outside the current scope is not fast.

Imagine the source string is from a file and may be 1 MB in size.

If you call 10,00 times InStr() the current stack frame or used registers must be pushed on the stack

register or stack must be prepared for the call to InStr()

if InStr() return's the old registers or stack frame must be restored.

The other point of view are:
Other forum readers or beginners can learn how stuff like InStr() works under the hood.

bla bla bla :-)

Joshy
jj2007
Posts: 2326
Joined: Oct 23, 2016 15:28
Location: Roma, Italia
Contact:

Re: CountInString()

Post by jj2007 »

D.J.Peters wrote:If you call 10,00 times InStr() the current stack frame or used registers must be pushed on the stack
The overhead is negligible; InStr() is probably much faster. My assembly version is over a factor 3 faster than this one (not to mention the case-insensitive case...).
D.J.Peters
Posts: 8586
Joined: May 28, 2005 3:28
Contact:

Re: CountInString()

Post by D.J.Peters »

@jj2007 Beginners can learn how simple a kind of InStr() function can be written by your self in BASIC.

By the way If you are hot for a completion win the first price from the FreeBASIC Love Letter challenge :-)

Joshy
StringEpsilon
Posts: 42
Joined: Apr 09, 2015 20:49

Re: CountInString()

Post by StringEpsilon »

The below is faster in the average case. It gets closer to D.J. Peters performance with more percent of the source string being a hit.

countInstr("1234567890a", "a") will be consierably faster

countInstr("aaaaaaaaaaa", "a") will be as fast or slightly slower.

Code: Select all

#include "crt.bi"

#ifndef strcasestr
extern "c"
declare function strcasestr (byval as const zstring ptr, byval as const zstring ptr) as zstring ptr
end extern
#endif

type fbString
    dim as byte ptr stringData
    dim as integer length
    dim as integer size
end type

function countInstr(byref target as string, byref query as string, caseSensitive as boolean = true) as integer
	dim as fbstring ptr targetPtr = cast(fbstring ptr, @target)
	dim as fbstring ptr queryPtr = cast(fbstring ptr, @query)
	if ( targetPtr->length = 0 ) then return 0
	if ( queryPtr->length = 0 ) then return 0
	
	dim searcher as function(as zstring ptr, as zstring ptr) as zstring ptr = iif(caseSensitive, @strstr, @strcasestr)
	dim count as integer
	dim position as any ptr = targetPtr->stringData
	do
		position = searcher( position, queryPtr->stringData )
		
		if (position > 0) then 
			position += queryPtr->length
			count+=1
		else
			return count
		end if
	loop
end function
Edit: Optimized how the function chooses strstr() vs. strcasestr(). That helped a lot. Now the above mentioned worst case is still faster.
dodicat
Posts: 7976
Joined: Jan 10, 2006 20:30
Location: Scotland

Re: CountInString()

Post by dodicat »

String epsilon
I can't find strcasestr anywhere, I tried, looked in msvcrt.def, not there.
So I couldn't include your functions.

My tally is really very basic, fast at some searches, slow at others.
Here are three comparisons for finding a number in a 10 million string of numbers.
all different results.

Code: Select all

Function TALLY overload(SomeString As String,PartString As String) As Long
    Dim As Long LenP=Len(PartString),count
    Dim As Long position=Instr(SomeString,PartString)
    If position=0 Then Return 0
    While position>0
        count+=1
        position=Instr(position+LenP,SomeString,PartString)
    Wend
    Return count
End Function

Function TALLY(SomeString As String,PartString As String,a() as integer) As Long
    Dim As Long LenP=Len(PartString),count
    Dim As Long position=Instr(SomeString,PartString)
    If position=0 Then Return 0
    While position>0
        count+=1
        position=Instr(position+LenP,SomeString,PartString)
    Wend
    redim a(1 to count)
    var u=count
    position=Instr(SomeString,PartString)
    a(1)=position
    count=1
     While count < u
        count+=1
        position=Instr(position+LenP,SomeString,PartString)
        a(count)=position
    Wend
    Return u
End Function


Function CountInStringG OverLoad (ByRef source As String, _
                                 ByRef search As String, _
                                 ByVal case_sensitive As boolean=TRUE) As Integer
  
  Dim As Integer count, begin
  
  If Not case_sensitive Then
     source = LCase(source)
     search = LCase(search)
  EndIf
  
  Do
     begin = InStr(begin + 1, source, search)
     If begin Then
        count += 1
        begin += Len(search)
     Else
        Return count
     EndIf
  Loop
    
End Function
' does the same but returns a redimed array of the positions
Function CountInStringG(ByRef source As String, _
                       ByRef search As String, _
                       positions() As Integer, _
                       ByVal case_sensitive As boolean=TRUE) As Integer
  
  Dim As Integer count, begin
  
  If Not case_sensitive Then
     source = LCase(source)
     search = LCase(search)
  EndIf
  
  Do
     begin = InStr(begin + 1, source, search)
     If begin Then
        ReDim Preserve positions(count)
        positions(count) = begin
        begin += Len(search)
        count += 1
     Else
        Return count
     EndIf
  Loop
   
End Function


function CountInString overload (byref source as string, _
                                 byref search as string, _
                                 byval case_sensitive as boolean=true) as integer
  var sPos=0,count=0,nSource=len(source), nSearch=len(search)
  if nSearch<1 then return 0
  if nSource<nSearch then return 0
  while sPos+nSearch <= nSource
    var nSame=0,sStart=sPos
    if case_sensitive then
      for i as integer=0 to nSearch-1
        if source[sStart]<>search[i] then exit for
        sStart+=1:nSame+=1
      next
    else
      for i as integer=0 to nSearch-1
        var a=source[sStart],b=search[i]
        if a>64 andalso a<91 andalso b>96 andalso b<123 then
          ' if a in range of "A"-"Z" and b in range "a"-"z"
          b-=32 ' change b
        elseif b>64 andalso b<91 andalso a>96 andalso a<123 then
          ' if a in range of "a"-"z" and b in range "A"-"Z"
          a-=32 ' change a
        end if
        if a<>b then exit for
        sStart+=1:nSame+=1
      next 
    end if  
    if nSame=nSearch then
      count+=1 : sPos+=nSearch
    else
      sPos+=1
    end if  
  wend  
  return count
end function
' does the same but returns a redimed array of the positions 
function CountInString(byref source as string, _
                       byref search as string, _
                       positions() as integer, _
                       byval case_sensitive as boolean=true) as integer
  var sPos=0,count=0,nSource=len(source), nSearch=len(search)
  if nSearch<1 then return 0
  if nSource<nSearch then return 0
  while sPos+nSearch <= nSource
    var nSame=0,sStart=sPos
    if case_sensitive then
      for i as integer=0 to nSearch-1
        if source[sStart]<>search[i] then exit for
        sStart+=1:nSame+=1
      next
    else
      for i as integer=0 to nSearch-1
        var a=source[sStart],b=search[i]
        if a>64 andalso a<91 andalso b>96 andalso b<123 then
          ' if a in range of "A"-"Z" and b in range "a"-"z"
          b-=32 ' change b
        elseif b>64 andalso b<91 andalso a>96 andalso a<123 then
          ' if a in range of "a"-"z" and b in range "A"-"Z"
          a-=32 ' change a
        end if
        if a<>b then exit for
        sStart+=1:nSame+=1
      next 
    end if  
    if nSame=nSearch then
      if ubound(positions)<count then redim preserve positions(iif(count=0,0,count shl 1))
      positions(count)=sPos+1 : count+=1 : sPos+=nSearch
    else
      sPos+=1
    end if
  wend  
    
  return count
end function

#define range(f,l) Int(Rnd*((l+1)-(f))+(f))

#macro create(g)
       g[0]=range(49,57)
       For n As Long=1 To Len(g)-1
           g[n]=range(48,57)
       Next
#endmacro

Dim As String n1=String(10000000,0) 'make a big number

create(n1)
dim as string n2=n1  'for checking


redim as integer a(),b(),c() ' one for each method

dim as string d="9"
dim as double t
for z as long=1 to 2 'run twice
t=timer
print tally(n1,d)
print tally(n1,d,a())

print lbound(a);" to ";ubound(a);" ";timer-t,"dodicat      find number  ";d,cbool(n1=n2)

print

t=timer
print countinstring(n1,d)
print countinstring(n1,d,b())

print lbound(b);" to ";ubound(b);" ";timer-t,"d.j.peters   find number  ";d,cbool(n1=n2)
print

t=timer
print countinstringg(n1,d)
print countinstringg(n1,d,c())

print lbound(c);" to ";ubound(c);" ";timer-t,"grindstone   find number  ";d,cbool(n1=n2)
print "--------------------------"
print
next z
sleep


  
MrSwiss
Posts: 3910
Joined: Jun 02, 2013 9:27
Location: Switzerland

Re: CountInString()

Post by MrSwiss »

dodicat,

I think it's somewhere in the c-runtime, have you noticed:
#Include "crt.bi" ? Probably, crt/string.bi.
StringEpsilon
Posts: 42
Joined: Apr 09, 2015 20:49

Re: CountInString()

Post by StringEpsilon »

strstr() is available on windows (crt.bi), but strcasestr isn't. Don't know if windows has a substitute.

I'm sorry, I really should have mentioned that in my above post.

Edit: Toying with that benchmark, I noticed that when I drop case-insensitivity support, I gain a smidge performance. Didnt expect that a function pointer makes a noticeable difference like that.

Also, grindstones implementation mutates the source and search string in insensitive mode.
jj2007
Posts: 2326
Joined: Oct 23, 2016 15:28
Location: Roma, Italia
Contact:

Re: CountInString()

Post by jj2007 »

For benchmarking, you could use e.g. http://www.gutenberg.org/ebooks/10 - download the Plain Text UTF-8 version, then read it into a string and count common words like hell. There are about 50 occurrences of "hell", depending on whether you choose case-insensitive, full word search etc, and counting takes less than 10 ms on an Intel Core i5 at 2.5GHz.

If you want more fun, download the HTML version instead and benchmark the 24,541 occurrences of <p> ;-)
The strcasestr() function is a non-standard extension
https://stackoverflow.com/questions/273 ... lower-case
dodicat
Posts: 7976
Joined: Jan 10, 2006 20:30
Location: Scotland

Re: CountInString()

Post by dodicat »

results for hell

Code: Select all


 50
 50
 1 to  50  0.01733574143145233            dodicat      find number  hell            true

 50
 50
 0 to  62  0.07592501089675352            d.j.peters   find number  hell            true

 50
 50
 0 to  49  0.01150925422552973            grindstone   find number  hell            true
--------------------------

 50
 50
 1 to  50  0.01723137317458168            dodicat      find number  hell            true

 50
 50
 0 to  62  0.07709975278703496            d.j.peters   find number  hell            true

 50
 50
 0 to  49  0.01197189651429653            grindstone   find number  hell            true
--------------------------

  
results for <p> in the hmtl file

Code: Select all

 16093
 16093
 1 to  16093  0.03585854370612651         dodicat      find number  <p>             true

 16093
 16093
 0 to  16382  0.04322146880440414         d.j.peters   find number  <p>             true

 16093
 16093
 0 to  16092  0.02793887373991311         grindstone   find number  <p>             true
--------------------------

 16093
 16093
 1 to  16093  0.03499451122479513         dodicat      find number  <p>             true

 16093
 16093
 0 to  16382  0.04290425765793771         d.j.peters   find number  <p>             true

 16093
 16093
 0 to  16092  0.02695678541203961         grindstone   find number  <p>             true
--------------------------

 
I cannot find 24541 occurrences, tried case insensitive (entered a lcase(string) as the parametrer)
jj2007
Posts: 2326
Joined: Oct 23, 2016 15:28
Location: Roma, Italia
Contact:

Re: CountInString()

Post by jj2007 »

dodicat wrote:I cannot find 24541 occurrences, tried case insensitive (entered a lcase(string) as the parametrer)
Filename is 10-h.htm, 4,639,250 bytes. Definitely 24541 occurrences (and they are all lowercase <p>).
dodicat
Posts: 7976
Joined: Jan 10, 2006 20:30
Location: Scotland

Re: CountInString()

Post by dodicat »

I have it now jj2007, 24541 in quick succession.
Now I have two copies of the king James bible.
Is this some sort of sign jj2007 -- searching for "hell"?
What sort of ecumenical sign is <p>?
I am not even a member of the Church of England, my ancestors were Covenanters.
jj2007
Posts: 2326
Joined: Oct 23, 2016 15:28
Location: Roma, Italia
Contact:

Re: CountInString()

Post by jj2007 »

You can search for "Jesus", too ;-)

The question is really what's the purpose of a Count(string, match, options). The case of "hell" is few matches in a big buffer, the case of "<p>" is lots of matches. Speed-wise, they differ only marginally. What is the real world application here? I have no idea, actually.
Post Reply