http://www.w3.org/International/questio ... orms-utf-8
UTF-8 is a variable-length, multi-byte character encoding for Unicode. A UTF-8 byte stream represents a stream of Unicode code points, and each code point in the stream is encoded as 1, 2, 3 or 4 bytes.
But not all combinations of bytes are 'valid': there are rules that a UTF-8 encoded stream has to adhere to. These rules are strict enough that it is relatively painless to check a UTF-8 byte stream for illegal byte sequences (overlong encodings, surrogates, out-of-range or truncated code points).
To check the validity of an UTF-8 encoded byte stream you could use the following function. There is an example at the end of the code that shows how to call the function.
check_utf8 returns 0 if utf8_string is a 'legal' UTF-8 encoded byte stream and -1 if an error was found in the byte stream (it also returns -1 when utf8_string is a null pointer or len_ is not positive).
(len_ must equal the length of utf8_string, measured in bytes.)
Code: Select all
' check_utf8 - validate a UTF-8 encoded byte stream.
'
' Parameters:
'   utf8_string - pointer to the first byte of the stream
'   len_        - length of the stream in bytes
'
' Returns:
'   0  if the whole stream is well-formed UTF-8
'   -1 if utf8_string is null, len_ <= 0, or an ill-formed or truncated
'      sequence is found
'
' Accepted sequences (lead byte, then continuation bytes):
'   00..7F
'   C2..DF  80..BF
'   E0      A0..BF  80..BF            (A0 lower bound excludes overlongs)
'   E1..EC  80..BF  80..BF
'   ED      80..9F  80..BF            (9F upper bound excludes surrogates)
'   EE..EF  80..BF  80..BF
'   F0      90..BF  80..BF  80..BF    (90 lower bound excludes overlongs)
'   F1..F3  80..BF  80..BF  80..BF
'   F4      80..8F  80..BF  80..BF    (8F upper bound caps at U+10FFFF)
function check_utf8(byval utf8_string as ubyte ptr,byval len_ as integer) as integer
    if (utf8_string = 0) then
        return -1
    end if
    if (len_ <= 0) then
        return -1
    end if

    dim as integer ct = 0            ' index of the current lead byte
    dim as integer tail_count = 0    ' continuation bytes required by the lead byte
    dim as integer first_lo = &h80   ' allowed range of the FIRST continuation byte
    dim as integer first_hi = &hBF
    dim as integer b = 0

    while (ct < len_)
        ' Only the first continuation byte ever has a restricted range;
        ' reset it to the generic range for every new sequence.
        first_lo = &h80
        first_hi = &hBF

        select case as const utf8_string[ct]
        'ASCII range
        case &h00 to &h7F
            tail_count = 0
        'non-overlong 2 byte
        case &hC2 to &hDF
            tail_count = 1
        '3 byte, excluding overlongs
        case &hE0
            tail_count = 2
            first_lo = &hA0
        '3 byte
        case &hE1 to &hEC,&hEE,&hEF
            tail_count = 2
        '3 byte, excluding surrogates (U+D800..U+DFFF)
        case &hED
            tail_count = 2
            first_hi = &h9F
        '4 byte, excluding overlongs
        case &hF0
            tail_count = 3
            first_lo = &h90
        '4 byte
        case &hF1 to &hF3
            tail_count = 3
        '4 byte, nothing above U+10FFFF
        case &hF4
            tail_count = 3
            first_hi = &h8F
        'illegal lead byte (stray continuation, C0, C1, F5..FF)
        case else
            return -1
        end select

        ' The sequence must fit completely inside the buffer.
        if (ct + tail_count >= len_) then
            return -1
        end if

        if (tail_count > 0) then
            ' First continuation byte: range depends on the lead byte.
            b = utf8_string[ct + 1]
            if (b < first_lo) orelse (b > first_hi) then
                return -1
            end if
            ' Remaining continuation bytes: always 80..BF.
            for i as integer = 2 to tail_count
                b = utf8_string[ct + i]
                if (b < &h80) orelse (b > &hBF) then
                    return -1
                end if
            next
        end if

        ct += tail_count + 1
    wend

    return 0
end function
' Example 1: C0 AE is an overlong encoding, so the stream is rejected.
' Expected output: "illegal utf8 content"
dim tst(0 to ...) as ubyte => {&hC0,&hAE}
if (check_utf8(@tst(0),ubound(tst) - lbound(tst) + 1) = 0) then
    print "legal utf8 content"
else
    print "illegal utf8 content"
end if

' Example 2: five well-formed 2-byte sequences ("hello" in the Armenian language).
' Expected output: "legal utf8 content"
dim tst2(0 to ...) as ubyte => {&hd5,&ha2,&hd5,&ha1,&hd5,&hab,&hd5,&ha5,&hd6,&h82}
if (check_utf8(@tst2(0),ubound(tst2) - lbound(tst2) + 1) = 0) then
    print "legal utf8 content"
else
    print "illegal utf8 content"
end if