diff --git a/encoding.lua b/encoding.lua index 32abd60..820aa46 100644 --- a/encoding.lua +++ b/encoding.lua @@ -259,9 +259,13 @@ function AnsiToUtf8(s) end function Utf8ToAnsi(s) - local a, j, r, b, scope = 0, 0, "" + local r, b = "" + local scope + local j, l, u for i = 1, s and s:len() or 0 do b = s:byte(i) + + -- legacy parser if b == 0x26 then r = r .. "&" elseif b < 128 then @@ -275,15 +279,53 @@ function Utf8ToAnsi(s) scope = scope[b] if "string" == type(scope) then r, scope = r .. scope + j = -1 -- suppress general UTF-8 parser end else - r, scope = r .. "_" + scope = nil end elseif utf8_decode[b] then scope = utf8_decode[b] - else - r = r .. "_" end + + -- general UTF-8 parser + if j == -1 then -- suppressed by legacy parser + j, l, u = nil + elseif b < 0x80 then + if j then + r = r .. "&#ufffd;" + j, l, u = nil + end + -- ASCII handled by legacy parser + elseif b >= 0xc0 then + if j then + r = r .. "&#ufffd;" + end + j = i + if b >= 0xf8 then + r = r .. "&#ufffd;" + j, l, u = nil + elseif b >= 0xf0 then + l, u = 4, b % (2 ^ 3) + elseif b >= 0xe0 then + l, u = 3, b % (2 ^ 4) + else + l, u = 2, b % (2 ^ 5) + end + else + if j then + u = u * (2 ^ 6) + b % (2 ^ 6) + if i == j + l - 1 then + r = r .. string.format("&#u%x;", u) + j, l, u = nil + end + else + r = r .. "&#ufffd;" + end + end + end + if j then + r = r .. "&#ufffd;" end return r end