--
--  File ctex-zhconv.lua
--
--     Copyright (C) 2020 by Qing Lee <[email protected]>
--------------------------------------------------------------------------
--
--     This work may be distributed and/or modified under the
--     conditions of the LaTeX Project Public License, either
--     version 1.3c of this license or (at your option) any later
--     version. This version of this license is in
--        http://www.latex-project.org/lppl/lppl-1-3c.txt
--     and the latest version of this license is in
--        http://www.latex-project.org/lppl.txt
--     and version 1.3 or later is part of all distributions of
--     LaTeX version 2005/12/01 or later.
--
--     This work has the LPPL maintenance status "maintained".
--
--     The Current Maintainer of this work is Qing Lee.
--
--     This work consists of the files ctex-zhconv.lua, ctex-zhconv-make.lua
--               and the derived file  ctex-zhconv-index.lua.
--
--------------------------------------------------------------------------
--

-- Select "luatex" as the kpathsea program name unless status.kpse_used
-- already reports 1.
-- NOTE(review): presumably this is what lets the require calls further down
-- find their files when the module is loaded from standalone texlua -- confirm.
if not (status.kpse_used == 1) then
 kpse.set_program_name("luatex")
end

-- The module table; its module field carries descriptive metadata.
local zhconv = {
 module = {
   name        = "ctex-zhconv",
   version     = "0.2",
   date        = "2020/05/02",
   description = "GBK/GB18030/Big5 encoder",
   author      = "Qing Lee",
   copyright   = "Qing Lee",
   license     = "LPPL v1.3c",
 },
}

local utf = require("unicode").utf8
local ubyte, ugsub = utf.byte, utf.gsub

local floor = math.floor
local unpack, insert, sort = table.unpack, table.insert, table.sort
local char, format = string.char, string.format

zhconv.index = require("ctex-zhconv-index")
local index = zhconv.index

-- One table per supported encoding. Each is keyed by Unicode code point and
-- holds the encoded byte string; all three start out empty and are reachable
-- by callers as zhconv.mapping.gbk / .gb18030 / .big5.
local gbk, gb18030, big5 = { }, { }, { }
local mapping = { big5 = big5, gbk = gbk, gb18030 = gb18030 }
zhconv.mapping = mapping

-- Namespace for the functions that turn an index pointer into bytes.
local bytes = { }
zhconv.bytes = bytes

-- Turn a pointer into the gb18030 index into its two-byte GBK sequence.
--   pointer: non-negative integer index pointer
--   returns: string of two bytes (lead, trail)
-- Every lead byte, counted from 0x81, covers 190 consecutive pointers.
-- Trail bytes start at 0x40 and leave out 0x7F, hence the extra shift by one
-- for positions 0x3F and above.
function bytes.gbk (pointer)
 local row, cell = floor(pointer / 190), pointer % 190
 local trail
 if cell < 0x3F then
   trail = cell + 0x40
 else
   trail = cell + 0x41
 end
 return format("%c%c", row + 0x81, trail)
end

-- Turn a pointer into the Big5 index into its two-byte sequence.
--   pointer: non-negative integer index pointer
--   returns: string of two bytes (lead, trail)
-- Every lead byte, counted from 0x81, covers 157 consecutive pointers.
-- The first 0x3F positions of a row use trail bytes 0x40..0x7E, the remaining
-- ones continue at 0xA1.
-- No lower bound on the lead byte is enforced here; a caller that must reject
-- leads below 0xA1 has to filter the pointer itself.
function bytes.big5 (pointer)
 local row, cell = floor(pointer / 157), pointer % 157
 local trail
 if cell < 0x3F then
   trail = cell + 0x40
 else
   trail = cell + 0x62
 end
 return format("%c%c", row + 0x81, trail)
end

-- Turn a GB18030 ranges pointer into its four-byte sequence.
--   pointer: non-negative integer pointer
--   returns: string of four bytes
-- The pointer is split like a mixed-radix number with digit sizes
-- (…, 10, 126, 10) from most to least significant:
--   byte1 = pointer / 12600           + 0x81
--   byte2 = (pointer % 12600) / 1260  + 0x30
--   byte3 = (pointer % 1260) / 10     + 0x81
--   byte4 = pointer % 10              + 0x30
function bytes.gb18030 (pointer)
 local first  = floor(pointer / 12600)
 local rest   = pointer % 12600
 local second = floor(rest / 1260)
 rest         = rest % 1260
 local third, fourth = floor(rest / 10), rest % 10
 return format("%c%c%c%c",
               first + 0x81, second + 0x30, third + 0x81, fourth + 0x30)
end

-- Lazy lookup for code points that have no stored entry in the gb18030 table.
-- Indexing gb18030 with such a numeric key runs __index below, which
--   1. for keys below U+10000, returns the string stored in gbk if any;
--   2. otherwise takes an entry { pointer, code point } from
--      index["gb18030_ranges"] and encodes pointer + (key - code point) as
--      four bytes: the final entry for keys of U+10000 and above, else the
--      first entry found, scanning downwards from the one before the final
--      entry, whose code point is not greater than the key.
--      (assumes the list is sorted by ascending code point -- TODO confirm
--      against ctex-zhconv-index.lua)
-- Results are not cached. Non-numeric keys, and keys that lie below every
-- range, yield nil.
do
 local encode, ranges = bytes.gb18030, index["gb18030_ranges"]
 local metatable = { }
 function metatable.__index (t, key)
   if type(key) ~= "number" then return nil end
   local range = ranges[#ranges]
   if key < 0x10000 then
     local s = gbk[key]
     if s then return s end
     -- The Encoding Standard's "index gb18030 ranges pointer" algorithm
     -- returns pointer 7457 for U+E7C7 before doing any range search.
     if key == 0xE7C7 then return encode(7457) end
     local n = #ranges
     repeat
       n = n - 1
       range = ranges[n]
     until range == nil or range[2] <= key
     -- Ran off the front of the list: nothing encodes this key.
     if range == nil then return nil end
   end
   return encode(range[1] + key - range[2])
 end
 gb18030 = setmetatable(gb18030, metatable)
end

-- Set of code points for which the Big5 encoder has to use the last pointer
-- listed for them in the index instead of the first one:
-- U+2550, U+255E, U+2561, U+256A, U+5341 and U+5345.
-- (Independently of this set, only pointers from (0xA1 - 0x81) * 157 upwards
-- are eligible for encoding.)
local big5_last = { }
for _, code_point in ipairs { 0x2550, 0x255E, 0x2561, 0x256A, 0x5341, 0x5345 } do
 big5_last[code_point] = true
end

-- Fill the big5 and gbk tables (code point -> byte string) from the indexes.
-- Both indexes are walked in ascending pointer order so that "first pointer"
-- and "last pointer" are well defined when a code point is listed repeatedly.
do

-- Iterator over t in ascending key order; yields key, value.
-- The keys must be mutually comparable with "<".
local function spairs (t)
 local entries = { }
 for k, v in pairs(t) do insert(entries, {k, v}) end
 sort(entries, function (a, b) return a[1] < b[1] end)
 local i = 0
 return function ()
   i = i + 1
   local entry = entries[i]
   if entry then
     return entry[1], entry[2]
   end
 end
end

-- Big5: ignore pointers whose lead byte would be below 0xA1. A code point
-- keeps its first eligible pointer, except for the members of big5_last,
-- which end up with their last one.
local to_big5 = bytes.big5
local first_pointer = (0xA1 - 0x81) * 157
for pointer, code_point in spairs(index.big5) do
 if pointer >= first_pointer then
   if big5[code_point] == nil or big5_last[code_point] then
     big5[code_point] = to_big5(pointer)
   end
 end
end

-- GBK: a code point keeps its first pointer, which is what the Encoding
-- Standard calls the "index pointer". Plain pairs() gives no ordering
-- guarantee, so a repeated code point would otherwise get an arbitrary one.
local to_gbk = bytes.gbk
for pointer, code_point in spairs(index.gb18030) do
 if gbk[code_point] == nil then
   gbk[code_point] = to_gbk(pointer)
 end
end

end

-- U+20AC (euro sign): in gbk it is encoded as the single byte 0x80. The
-- string that gbk held for it until now is moved into gb18030 as a stored
-- entry, so gb18030 lookups of U+20AC keep returning that earlier value.
do
 local euro = 0x20AC
 local previous = gbk[euro]
 gb18030[euro] = previous
 gbk[euro]     = char(0x80)
end

local io_open = io.open
local encode_error = "Encoding %q not available!"
local file_error   = "Open file %q failed!"

-- Convert UTF-8 text to one of the encodings in zhconv.mapping.
--
--   encoding: name of the target encoding; it is lower-cased and must be a
--             key of zhconv.mapping ("gbk", "gb18030" or "big5").
--   input:    with output given, the name of the file to read;
--             otherwise the UTF-8 string to convert.
--   output:   optional name of the file to write.
--
-- Returns the converted string when output is absent, nothing otherwise.
-- Raises an error if the encoding is unknown, if either file cannot be
-- opened, or if writing the result fails.
--
-- Characters up to U+007F are never looked up. For those, and for characters
-- without a mapping entry, the callback returns false/nil, which is expected
-- to leave the matched text as it is (string.gsub semantics -- assumed to
-- hold for unicode.utf8.gsub as well).
-- In file mode a leading UTF-8 byte order mark is dropped before conversion.
function zhconv.conv (encoding, input, output)
 local name = encoding:lower()
 local table_for = assert(mapping[name], encode_error:format(name))
 local function encoder (s)
   local code_point = ubyte(s)
   return code_point > 0x7F and table_for[code_point]
 end
 if not output then
   -- Parentheses drop the substitution count returned by gsub.
   return (ugsub(input, ".", encoder))
 end
 local handle = assert(io_open(input, "rb"), file_error:format(input))
 local stream = handle:read("*all")
 handle:close()
 stream = stream:gsub("^\xEF\xBB\xBF", "")
 -- Convert before touching the output file: should the conversion raise,
 -- no truncated file and no open handle are left behind.
 stream = ugsub(stream, ".", encoder)
 handle = assert(io_open(output, "wb"), file_error:format(output))
 local ok, err = handle:write(stream)
 handle:close()
 -- file:write returns nil plus a message on failure; do not lose it.
 assert(ok, err)
end

return zhconv