--
-- File ctex-zhconv.lua
--
-- Copyright (C) 2020 by Qing Lee <[email protected]>
--------------------------------------------------------------------------
--
-- This work may be distributed and/or modified under the
-- conditions of the LaTeX Project Public License, either
-- version 1.3c of this license or (at your option) any later
-- version. This version of this license is in
--    http://www.latex-project.org/lppl/lppl-1-3c.txt
-- and the latest version of this license is in
--    http://www.latex-project.org/lppl.txt
-- and version 1.3 or later is part of all distributions of
-- LaTeX version 2005/12/01 or later.
--
-- This work has the LPPL maintenance status "maintained".
--
-- The Current Maintainer of this work is Qing Lee.
--
-- This work consists of the files ctex-zhconv.lua, ctex-zhconv-make.lua
-- and the derived file ctex-zhconv-index.lua.
--
--------------------------------------------------------------------------
--
-- Initialise kpathsea under the program name "luatex" unless the LuaTeX
-- status table reports that it is already in use.
-- NOTE(review): presumably needed so that the require() calls below can
-- locate their files when this script is run standalone -- confirm.
do
  local kpse_in_use = (status.kpse_used == 1)
  if not kpse_in_use then
    kpse.set_program_name("luatex")
  end
end
-- Module table returned at the end of the file.
local zhconv = {
  module = {
    name        = "ctex-zhconv",
    version     = "0.2",
    date        = "2020/05/02",
    description = "GBK/GB18030/Big5 encoder",
    author      = "Qing Lee",
    copyright   = "Qing Lee",
    license     = "LPPL v1.3c",
  },
}

-- Local aliases for the library functions used throughout the file.
local utf = require("unicode").utf8
local ubyte = utf.byte
local ugsub = utf.gsub
local floor = math.floor
local unpack = table.unpack
local insert = table.insert
local sort = table.sort
local char = string.char
local format = string.format

-- Index data loaded from the companion file ctex-zhconv-index.lua.
local index = require("ctex-zhconv-index")
zhconv.index = index

-- Per-encoding tables mapping a code point to its encoded byte string.
local big5, gbk, gb18030 = { }, { }, { }
local mapping = { big5 = big5, gbk = gbk, gb18030 = gb18030 }
zhconv.mapping = mapping

-- Per-encoding functions turning an index pointer into a byte string.
local bytes = { }
zhconv.bytes = bytes
-- Turn a GBK index pointer into its two-byte sequence.
--   lead byte  = pointer / 190 (integer part) + 0x81
--   trail byte = pointer % 190, shifted by 0x40 when the remainder is
--                below 0x3F and by 0x41 otherwise (this skips 0x7F).
-- Returns a two-byte string.
function bytes.gbk (pointer)
  local row, cell = floor(pointer / 190), pointer % 190
  local second
  if cell < 0x3F then
    second = cell + 0x40
  else
    second = cell + 0x41
  end
  return format("%c%c", row + 0x81, second)
end
-- Turn a Big5 index pointer into its two-byte sequence.
--   lead byte  = pointer / 157 (integer part) + 0x81
--   trail byte = pointer % 157, shifted by 0x40 when the remainder is
--                below 0x3F and by 0x62 otherwise.
-- This function does not itself reject lead bytes below 0xA1; it encodes
-- whatever pointer it is handed.
-- Returns a two-byte string.
function bytes.big5 (pointer)
  local row, cell = floor(pointer / 157), pointer % 157
  local second
  if cell < 0x3F then
    second = cell + 0x40
  else
    second = cell + 0x62
  end
  return format("%c%c", row + 0x81, second)
end
-- Turn a GB18030 ranges pointer into its four-byte sequence.
-- The pointer is split into four mixed-radix digits:
--   first  = pointer / (10 * 126 * 10)
--   second = remainder / (10 * 126)
--   third  = remainder / 10
--   fourth = remainder
-- and the bytes are first + 0x81, second + 0x30, third + 0x81, fourth + 0x30.
-- Returns a four-byte string.
function bytes.gb18030 (pointer)
  local first = floor(pointer / 12600)
  local rest = pointer % 12600
  local second = floor(rest / 1260)
  rest = rest % 1260
  local third = floor(rest / 10)
  local fourth = rest % 10
  return format("%c%c%c%c",
    first + 0x81, second + 0x30, third + 0x81, fourth + 0x30)
end
-- Lazy lookup for the gb18030 mapping table.
--
-- A numeric key (code point) that is not stored directly in the table is
-- resolved as follows:
--   * below U+10000, the two-byte gbk mapping is used when it exists;
--   * otherwise the entry of index["gb18030_ranges"] with the last code
--     point that is equal to or less than the key is selected (entries are
--     {pointer, code point} pairs), and the four-byte form of
--     pointer + key - code point is returned.
-- For keys of U+10000 and above the last ranges entry is used directly.
-- Non-numeric keys, and numeric keys that lie below every ranges entry
-- (for instance ASCII code points), yield nil like an ordinary table miss
-- instead of raising an error.
do
  local metatable = { }
  local encode, ranges = bytes.gb18030, index["gb18030_ranges"]
  function metatable.__index (t, key)
    if type(key) ~= "number" then return nil end
    local n = #ranges
    if key < 0x10000 then
      local s = gbk[key]
      if s then return s end
      -- Walk down from the entry before the last one; stop as well when
      -- the table is exhausted so that a key below every range cannot
      -- index a nil entry.
      repeat
        n = n - 1
      until ranges[n] == nil or ranges[n][2] <= key
    end
    local range = ranges[n]
    if range == nil then return nil end
    local pointer, offset = unpack(range)
    return encode(pointer + key - offset)
  end
  gb18030 = setmetatable(gb18030, metatable)
end
-- Code points for which the Big5 index pointer is the LAST pointer that
-- maps to them (U+2550, U+255E, U+2561, U+256A, U+5341 and U+5345).
-- Stored as a set: big5_last[code_point] is true for exactly these six.
local big5_last = { }
for _, code_point in ipairs {
  0x2550, 0x255E, 0x2561, 0x256A, 0x5341, 0x5345,
} do
  big5_last[code_point] = true
end
-- Fill the big5 and gbk code point -> byte string tables from the index.
do
  -- Big5: visit the pointers in ascending order so that "first" and
  -- "last" pointer for a code point are well defined.
  local big5_index = index.big5
  local pointers = { }
  for pointer in pairs(big5_index) do
    insert(pointers, pointer)
  end
  sort(pointers, function (a, b) return a < b end)

  local encode_big5 = bytes.big5
  -- Entries whose pointer is below (0xA1 - 0x81) * 157 are excluded.
  local first_pointer = (0xA1 - 0x81) * 157
  for k = 1, #pointers do
    local pointer = pointers[k]
    if pointer >= first_pointer then
      local code_point = big5_index[pointer]
      -- Keep the first pointer seen for a code point, except for the
      -- code points in big5_last, where a later pointer replaces it.
      if not big5[code_point] or big5_last[code_point] then
        big5[code_point] = encode_big5(pointer)
      end
    end
  end

  -- GBK: every entry of the gb18030 index gives a two-byte sequence.
  local encode_gbk = bytes.gbk
  for pointer, code_point in pairs(index.gb18030) do
    gbk[code_point] = encode_gbk(pointer)
  end
end
-- Euro sign (U+20AC): gbk encodes it as the single byte 0x80, while
-- gb18030 keeps the two-byte sequence that came from the index.  The
-- two-byte form is stored directly in gb18030 so that its lookup does not
-- fall through to the gbk table and pick up the single byte.
do
  local euro = 0x20AC
  local two_byte_euro = gbk[euro]
  gb18030[euro] = two_byte_euro
  gbk[euro] = char(0x80)
end
local io_open = io.open
local encode_error = "Encoding %q not available!"
local file_error = "Open file %q failed!"
-- Convert UTF-8 text to one of the encodings in zhconv.mapping.
--
-- Parameters:
--   encoding  name of the target encoding, matched case-insensitively
--             against the keys of zhconv.mapping ("gbk", "gb18030", "big5").
--   input     when output is given: path of the UTF-8 file to read;
--             otherwise: the UTF-8 string to convert.
--   output    optional path of the file to write the converted bytes to.
--
-- Returns the converted string when output is omitted, and nothing when
-- the result is written to a file.
--
-- Characters up to U+007F, and characters without an entry in the chosen
-- mapping, are copied through unchanged.  In file mode a leading UTF-8
-- byte order mark is removed before conversion.
--
-- Raises an error when the encoding is unknown, when either file cannot be
-- opened, or when reading the input or writing the output fails.
function zhconv.conv (encoding, input, output)
  local name = encoding:lower()
  local encode_map = assert(mapping[name], encode_error:format(name))
  -- Replacement callback: false/nil makes gsub keep the original character.
  local function encoder (s)
    local code_point = ubyte(s)
    return code_point > 0x7F and encode_map[code_point]
  end
  if not output then
    -- Parentheses drop the substitution count returned by gsub.
    return (ugsub(input, ".", encoder))
  end
  local handle = assert(io_open(input, "rb"), file_error:format(input))
  local stream, read_err = handle:read("*all")
  handle:close()
  -- A failed read (e.g. input names a directory) must stop here, before
  -- the output file is touched.
  assert(stream, read_err)
  stream = stream:gsub("^\xEF\xBB\xBF", "")
  stream = ugsub(stream, ".", encoder)
  -- Open (and thereby truncate) the output only once the converted data
  -- is ready, so a failure above cannot leave an empty output file behind.
  handle = assert(io_open(output, "wb"), file_error:format(output))
  local ok, write_err = handle:write(stream)
  handle:close()
  assert(ok, write_err)
end
return zhconv