Fully support UTF-8 decoding

`Utf8ToAnsi` now parses all UTF-8 code points; chars not in
`utf8_decode` are represented by a `&#u<hex>;` sequence.

Note: `&#x...;` and `&#u...;` have different meanings. The former
represents a char with an individual PNG glyph, and the latter a char
without one (which has to fall back to Unifont).
This commit is contained in:
syimyuzya 2021-12-05 19:16:21 +08:00
parent 2da37c48c0
commit 2ba26cc9cc
1 changed file with 46 additions and 4 deletions

View File

@@ -259,9 +259,13 @@ function AnsiToUtf8(s)
end
function Utf8ToAnsi(s)
local a, j, r, b, scope = 0, 0, ""
local r, b = ""
local scope
local j, l, u
for i = 1, s and s:len() or 0 do
b = s:byte(i)
-- legacy parser
if b == 0x26 then
r = r .. "&#x26;"
elseif b < 128 then
@@ -275,15 +279,53 @@ function Utf8ToAnsi(s)
scope = scope[b]
if "string" == type(scope) then
r, scope = r .. scope
j = -1 -- supress general UTF-8 parser
end
else
r, scope = r .. "_"
scope = nil
end
elseif utf8_decode[b] then
scope = utf8_decode[b]
else
r = r .. "_"
end
-- general UTF-8 parser
if j == -1 then -- supressed by legacy parser
j, l, u = nil
elseif b < 0x80 then
if j then
r = r .. "&#ufffd;"
j, l, u = nil
end
-- ASCII handled by legacy parser
elseif b >= 0xc0 then
if j then
r = r .. "&#ufffd;"
end
j = i
if b >= 0xf8 then
r = r .. "&#ufffd;"
j, l, u = nil
elseif b >= 0xf0 then
l, u = 4, b % (2 ^ 3)
elseif b >= 0xe0 then
l, u = 3, b % (2 ^ 4)
else
l, u = 2, b % (2 ^ 5)
end
else
if j then
u = u * (2 ^ 6) + b % (2 ^ 6)
if i == j + l - 1 then
r = r .. string.format("&#u%x;", u)
j, l, u = nil
end
else
r = r .. "&#ufffd;"
end
end
end
if j then
r = r .. "&#ufffd;"
end
return r
end