Search for Text and Replace

New to FreeBASIC? Post your questions here.
Alexa
Posts: 56
Joined: May 01, 2007 20:22

Postby Alexa » May 23, 2007 12:43

Thank you Tigra, i'll happy if you share it with us
Tigra
Posts: 155
Joined: Jan 07, 2007 17:21

Postby Tigra » May 24, 2007 0:09

Alexa wrote:Thank you Tigra, i'll happy if you share it with us


The new version does not use the regex library but rather a variation of the search code based on the Rabin-Karp algorithm Antoni suggested.

In place of a rolling hash value it maintains eight of them, to represent the eight possible hash values that an unsigned integer can hold.

It accepts the same parameters, and I strongly recommend that for large files that the buffer size parameter be used, otherwise it will first load the entire file into memory.

I'm trying as I type this to compare the old program and the new program, with a large file and a long list of words - the old one has not competed yet and it has not completed in about 10 minutes, the new program completed in 1¼ minutes.

Let me know. I did not implement the shortcut code that would kick in if all of the search words are eight or more characters. Let me know if you'd like that implemented.

Tigra

Code: Select all

'   search.bas
'   last date: 2007-05-23 19:46:08

#include once "crt.bi"
#include once "sdl/sdl_timer.bi"

Declare Function main( ByVal argc As Integer, ByVal argv As ZString Ptr Ptr ) As Integer
End main( __FB_ARGC__, __FB_ARGV__ )

#ifndef EXIT_SUCCESS

#define EXIT_SUCCESS 0
#define EXIT_FAILURE 1

#endif

Const TRUE = Not 0
Const FALSE = 0
Const HashMaxLength = 8

Type Matches
   As String Search
   As Integer lengthOfSearch
   As String Replace
   As ULongInt hash
   As Integer lengthOfHash
End Type

Function Hash OVERLOAD (Source() As UByte, ByVal Length As Integer) As ULongInt
   Dim As ULongInt result = 0ull

   If len(Source) > HashMaxLength Then
      length = HashMaxLength
   End If

   For index As Integer = 0 to length
      result = result shl 8 + Source(index)
   Next

   Return result
End Function

Function Hash OVERLOAD (ByVal Source As String) As ULongInt
   Dim As ULongInt result = 0ull
   Dim As Integer length

   If len(Source) > HashMaxLength Then
      length = HashMaxLength
   Else
      length = len(Source)
   End If

   For index As Integer = 0 to length-1
      result = result shl 8 + Source[index]
   Next

   Return result
End Function

Function SearchAndReplace(ByVal Source As String, ByVal Destination As String, ByVal BufferSize As Integer, Searches() As Matches) As Integer
   FUNCTION = EXIT_FAILURE

   Dim As Integer hSource
   Dim As Integer hDestination
   Dim As UByte Buffer()
   Dim As Integer longestSearch
   Dim As Integer UBoundSearches = UBound(Searches)
   Dim As Integer RemainingBufferSize
   Dim As Integer IsUCS_little_endian = 0
   Dim As Integer IsUCS_big_endian = 0
   Dim As Integer IsUTF8 = 0
   Dim As ULongInt hashOfBuffer(0 to 8)
   Dim As Integer idxHashOfBuffer
   Dim As Integer idxSearches

   hSource = FreeFile()
   Open Source For Binary Access Read As #hSource

   ReDim Buffer(0 To 3)
   Get #hSource, , Buffer()
   If Buffer(0) = &hFF And Buffer(1) = &hFE Then
      IsUCS_little_endian = TRUE
      Seek hSource, 3
   ElseIf Buffer(0) = &hFE And Buffer(1) = &hFF Then
      IsUCS_big_endian = TRUE
      Seek hSource, 3
   ElseIf Buffer(0) = &hEF And Buffer(1) = &hBB And Buffer(2) = &hBF Then
      IsUTF8 = TRUE
   Else
      Seek hSource, 1
   End If

   If IsUCS_little_endian Or IsUCS_big_endian Then
      If IsUCS_little_endian Then
         print "UCS-2 little endian files cannot be processed"
      ElseIf IsUCS_big_endian Then
         print "UCS-2 big endian files cannot be processed"
      End If
      Close #hSource
      Exit Function
   End If

   hDestination = FreeFile()
   Open Destination For Binary Access Write As #hDestination

   ' in case the program can handle these file types in the future...
   If IsUTF8 Or IsUCS_little_endian Or IsUCS_big_endian Then
      Put #hDestination, , @Buffer(0), Seek(hSource)-1
   End If

   longestSearch = -1
   For idxSearches = 0 to UBoundSearches
      If len(Searches(idxSearches).Search) > longestSearch Then longestSearch = len(Searches(idxSearches).Search)
   Next

   If BufferSize <= 0 Then
      BufferSize = LOF(hSource)
      longestSearch = 0
   ElseIf BufferSize < longestSearch Then
      BufferSize = longestSearch * 1.5
   End If

   ''~ cls

   Print "Processing "; Source; " to "; Destination; " in "; (LOF(hSource) \ BufferSize + sgn(LOF(hSource) Mod BufferSize)); " blocks of "; BufferSize

   Dim As UInt32 uiOverall = SDL_GetTicks()
   Dim As UInt32 uiNow = uiOverall
   Dim As UInt32 uiThen
   Dim As UInt32 uiTimes(10)

   For idxHashOfBuffer = 0 to 8
      hashOfBuffer(idxHashOfBuffer) = 0
   Next

   print
   print
   dim as integer ln = CsrLin()-2

   Do While Seek(hSource) < LOF(hSource)
      ''~ locate ln, 0, 0: print using "###.###"; ((seek(hSource)-1)*100)/lof(hSource)
      If Seek(hSource) + BufferSize-1 > LOF(hSource) Then BufferSize = LOF(hSource) - Seek(hSource) + 1
      ReDim Buffer(0 To BufferSize-1): RemainingBufferSize = BufferSize
      Get #hSource, , Buffer()
      uiThen = SDL_GetTicks(): uiTimes(0) = uiThen - uiNow: uiNow = uiThen

      Dim As UByte ptr pBuffer = @Buffer(0)
      Dim As UByte ptr pLastOutput = pBuffer
      Dim As UByte ptr pBufferEnd = @Buffer(BufferSize - 1 - longestSearch-1)

      Do While pBuffer <= pBufferEnd
         ''~ locate ln, 20, 0: print using "###.###"; ((pBuffer - @Buffer(0))*100)/(BufferSize - 1 - longestSearch-1)

         For idxHashOfBuffer = 1 to 8
            hashOfBuffer(idxHashOfBuffer) = hashOfBuffer(idxHashOfBuffer-1) shl 8 + pBuffer[idxHashOfBuffer-1]
         Next

         Dim As Integer idxMatch = -1
         For idxSearches = 0 to UBoundSearches
            If Searches(idxSearches).hash = hashOfBuffer(Searches(idxSearches).lengthOfHash) Then
               If Searches(idxSearches).lengthOfSearch <= 8 Then
                  idxMatch = idxSearches
                  Exit For
               ElseIf strncmp(Searches(idxSearches).Search, pBuffer, len(Searches(idxSearches).Search)) = 0 Then
                  idxMatch = idxSearches
                  Exit For
               End If
            End If
         Next
         If idxMatch > -1 Then
            If pBuffer <> pLastOutput Then Put #hDestination, , *pLastOutput, pBuffer - pLastOutput
            Put #hDestination, , Searches(idxSearches).Replace
            pBuffer += len(Searches(idxSearches).Search)
            pLastOutput = pBuffer
         End If

         pBuffer += 1
      Loop
      ''~ locate ln, 20, 0: print using "###.###"; ((pBuffer - @Buffer(0))*100)/(BufferSize - 1 - longestSearch-1)

      If pBuffer <> pLastOutput Then Put #hDestination, , *pLastOutput, pBuffer - pLastOutput

      If pLastOutput = @Buffer(0) Then
         uiTimes(3) += 1
      Else
         uiTimes(4) += 1
      End If
      uiThen = SDL_GetTicks(): uiTimes(1) = uiThen - uiNow: uiNow = uiThen

      ''~ If longestSearch > 0 And RemainingBufferSize >= longestSearch And Seek(hSource) < LOF(hSource) Then
         ''~ uiTimes(5) += 1
         ''~ Put #hDestination, , pBuffer, len(*pBuffer) - longestSearch
         ''~ Seek #hSource, Seek(hSource) - longestSearch
      ''~ Else
         ''~ uiTimes(6) += 1
         ''~ Put #hDestination, , *pBuffer
      ''~ End If
      uiThen = SDL_GetTicks(): uiTimes(2) = uiThen - uiNow: uiNow = uiThen

      ''~ locate 2, 0: print using "Load ####, process ####, output ####, (###### / ######), (###### / ######)"; uiTimes(0), uiTimes(1), uiTimes(2), uiTimes(3), uiTimes(4), uiTimes(5), uiTimes(6)
    Loop

   ''~ locate ln, 0, 0: print using "###.###"; ((seek(hSource)-1)*100)/lof(hSource)

   print
   print

   uiThen = SDL_GetTicks(): print "To process the file " & ( uiThen - uiOverall )

   locate CsrLin(), Pos(), 1

   Close #hSource
   Close #hDestination

   FUNCTION = EXIT_SUCCESS
End Function

Function main( ByVal argc As Integer, ByVal argv As ZString Ptr Ptr ) As Integer
   Dim As String Source
   Dim As String Destination
   Dim As String Strings
   Dim As Integer BufferSize

   If argc < 4 Then
      print *argv[0] & " Source Destination Strings [buffer]"
      print "Source is the file to search"
      print "Destination is the output file"
      print "Strings is the file with search-replace pairs"
      print "buffer is the size of the buffer to use; if omitted or zero or less then the entire file is read into memory"
   Else
      Dim As Matches Searches()
      Dim As Integer idxPairs = 0
      Dim As Integer hPairs

      Source = *argv[1]
      Destination = *argv[2]
      Strings = *argv[3]

      hPairs = FreeFile()
      Open Strings For Input As #hPairs
      Do While Not EOF(hPairs)
         ReDim Preserve Searches(0 To idxPairs)
         Input #hPairs, Searches(idxPairs).Search, Searches(idxPairs).Replace
         Searches(idxPairs).lengthOfSearch = len(Searches(idxPairs).Search)
         Searches(idxPairs).hash = Hash(Searches(idxPairs).Search)
         Searches(idxPairs).lengthOfHash = IIF(Searches(idxPairs).lengthOfSearch > HashMaxLength, HashMaxLength, Searches(idxPairs).lengthOfSearch)
         ''~ print Searches(idxPairs).Search, Searches(idxPairs).Replace, Searches(idxPairs).hash, Searches(idxPairs).lengthOfHash
         idxPairs += 1
      Loop
      Close #hPairs
      print

      If argc >= 5 Then BufferSize = val(*argv[4]) Else BufferSize = 0
      Return SearchAndReplace(Source, Destination, BufferSize, Searches())
   End If
   Return EXIT_SUCCESS
End Function

Return to “Beginners”

Who is online

Users browsing this forum: No registered users and 2 guests