local M = {}
local log = logging.new "indexing"
-- Handle accented characters in files created with \usepackage[utf]{inputenc}
-- this code was originally part of
https://github.com/michal-h21/iec2utf/
local enc = {}
local licrs = {}
local codepoint2utf = unicode.utf8.char
local used_encodings = {}
-- load inputenc encoding file
local function load_encfiles(f)
local file= io.open(f,"r")
local encodings = file:read("*all")
file:close()
for codepoint, licr in encodings:gmatch('DeclareUnicodeCharacter(%b{})(%b{})') do
local codepoint = codepoint2utf(tonumber(codepoint:sub(2,-2),16))
local licr= licr:sub(2,-2):gsub('@tabacckludge','')
licrs[licr] = codepoint
end
end
local function sanitize_licr(l)
return l:gsub(" (.)",function(s) if s:match("[%a]") then return " "..s else return s end end):sub(2,-2)
end
local load_enc = function(enc)
-- use default encodings if used doesn't provide one
enc = enc or {"T1","T2A","T2B","T2C","T3","T5", "LGR"}
for _,e in pairs(enc) do
local filename = e:lower() .. "enc.dfu"
-- don't process an enc file multiple times
if not used_encodings[filename] then
local dfufile = kpse.find_file(filename)
if dfufile then
load_encfiles(dfufile)
end
end
used_encodings[filename] = true
end
end
local cache = {}
local get_utf8 = function(input)
local output = input:gsub('\\IeC[%s]*(%b{})',function(iec)
-- remove \protect commands
local iec = iec:gsub("\\protect%s*", "")
local code = cache[iec] or licrs[sanitize_licr(iec)] or '\\IeC '..iec
-- print(iec, code)
cache[iec] = code
return code
end)
return output
end
-- parse the idx file produced by tex4ht
-- it replaces the document page numbers by index entry number
-- each index entry can then link to place in the HTML file where the
-- \index command had been used
local parse_idx = function(content)
-- index entry number
local current_entry = 0
-- map between index entry number and corresponding HTML file and destination
local map = {}
local buffer = {}
for line in content:gmatch("([^\n]+)") do
if line:match("^\\beforeentry") then
-- increment index entry number
current_entry = current_entry + 1
local file, dest, locator = line:match("\\beforeentry%s*{(.-)}{(.-)}{(.-)}")
-- if the third argument to \beforeentry is not empty,
-- use it as a index entry locator instead of the index counter
if locator and locator == "" then locator = nil end
map[current_entry] = {file = file, dest = dest, locator = locator}
elseif line:match("^\\indexentry") then
-- replace the page number with the current
-- index entry number
local result = line:gsub("%b{}$", "{"..current_entry .."}")
buffer[#buffer+1] = get_utf8(result)
else
buffer[#buffer+1] = line
end
end
-- return table with page to dest map and updated idx file
return {map = map, idx = table.concat(buffer, "\n")}
end
local previous
-- replace numbers in .ind file with links back to text
local function replace_index_pages(rest, entries)
-- keep track of the previous page number
local count = 0
local delete_coma = false
return rest:gsub("(%s*%-*%s*)(,?%s*)(%{?)(%[?)(%d+)(%]?)(%}?)", function(dash, coma, lbrace, lbracket, page, rbracket, rbrace)
if lbracket == "[" and rbracket == "]" then
-- don't process numbers in brackets, they are not page numbers
return nil
end
local entry = entries[tonumber(page)]
count = count + 1
if entry then
page = entry.locator or page
if delete_coma then
-- if the coma was marked for deletion, remove it. this may happen after line breaks in the index
coma = ""
end
-- if the page number is the same as the previous one, don't create a link
-- this can happen when we use section numbers as locators. for example,
-- we could get 1.1 -- 1.1, 1.1, so we want to keep only the first one
if page == previous then
previous = page
-- if the first page number on a line is the same as the previous one, we need to delete the coma,
-- otherwise the coma will be left in the output
if count == 1 then
delete_coma = true
end
return ""
else
previous = page
-- don't forget to reset the delete_coma flag after page change
delete_coma = false
-- construct link to the index entry
return dash .. coma.. lbrace .. "\\Link[" .. entry.file .."]{".. entry.dest .."}{}" .. page .."\\EndLink{}" .. rbrace
end
else
return dash .. coma .. lbrace .. lbracket .. page .. rbracket .. rbrace
end
end)
end
local function fix_subitems(start, rest)
-- in xindex, subentries start with a comma, so if the subentry itself is number, it would be mistaken for the page number
-- the start should contain just \subitem -\
if start:match("%s*\\subitem %-\\$") then
-- the keyword in this case is the first item in the rest
local keyword, newrest = rest:match("(,?[^,]+,)(.+)")
if keyword and newrest then
-- join the extracted keyword with the start, newrest should contain only actual page numbers
return start .. keyword, newrest
end
end
return start, rest
end
-- replace page numbers in the ind file with hyperlinks
local fix_idx_pages = function(content, idxobj)
local buffer = {}
local entries = idxobj.map
for line in content:gmatch("([^\n]+)") do
local line, count = line:gsub("(%s*\\%a+[^%[^,]+)(.+)$", function(start,rest)
-- reset the previous page number
previous = nil
start, rest = fix_subitems(start, rest)
-- there is a problem when index term itself contains numbers, like Bible verses (1:2),
-- because they will be detected as page numbers too. I cannot find a good solution
-- that wouldn't break something else.
-- There can be also commands with numbers in braces. These numbers in braces will be ignored,
-- as they may be not page numbers
return start .. replace_index_pages(rest, entries) end)
-- longer index entries may be broken over several lines, in that case, we need to process only numbers
if count == 0 then
line = line:gsub("(%s*%d+.+)", function(rest)
return replace_index_pages(rest, entries)
end)
end
buffer[#buffer+1] = line
end
return table.concat(buffer, "\n")
end
-- prepare the .idx file produced by tex4ht
-- for use with Xindy or Makeindex
local prepare_idx = function(filename)
local f = io.open(filename, "r")
if not f then return nil, "Cannot open file :".. tostring(filename) end
local content = f:read("*all")
local idx = parse_idx(content)
local idxname = os.tmpname()
local f = io.open(idxname, "w")
f:write(idx.idx)
f:close()
-- return the object with mapping between dummy page numbers
-- and link destinations in the files, and the temporary .idx file
-- these can be used for the processing with the index processor
return idx, idxname
end
-- add links to a index file
local process_index = function(indname, idx)
local f = io.open(indname, "r")
if not f then return nil, "Cannot open .ind file: " .. tostring(indname) end
local content = f:read("*all")
f:close()
local newcontent = fix_idx_pages(content, idx)
local f = io.open(indname,"w")
f:write(newcontent)
f:close()
return true
end
local get_idxname = function(par)
return par.idxfile or par.input .. ".idx"
end
local prepare_tmp_idx = function(par)
par.idxfile = mkutils.file_in_builddir(get_idxname(par), par)
if not par.idxfile or not mkutils.file_exists(par.idxfile) then return nil, "Cannot load idx file " .. (par.idxfile or "''") end
-- construct the .ind name, based on the .idx name
par.indfile = par.indfile or par.idxfile:gsub("idx$", "ind")
load_enc()
-- save hyperlinks and clean the .idx file
local idxdata, newidxfile = prepare_idx(par.idxfile)
if not idxdata then
-- if the prepare_idx function returns nil, the second reuturned value contains error msg
return nil, newidxfile
end
return newidxfile, idxdata
end
local splitindex = function(par)
local files = {}
local idxfiles = {}
local buffer
local idxfile = get_idxname(par)
if not idxfile or not mkutils.file_exists(idxfile) then return nil, "Cannot load idx file " .. (idxfile or "''") end
for line in io.lines(idxfile) do
local file = line:match("indexentry%[(.-)%]")
if file then
-- generate idx name for the current output file
file = par.input .. "-" ..file .. ".idx"
local current = files[file] or {}
-- remove file name from the index entry
local indexentry = line:gsub("indexentry%[.-%]", "indexentry")
-- save the index entry and preseding line to the current buffer
table.insert(current, buffer)
table.insert(current, indexentry)
files[file] = current
end
--
buffer = line
end
-- save idx files
for filename, contents in pairs(files) do
log:info("Saving split index file: " .. filename)
idxfiles[#idxfiles+1] = filename
local f = io.open(filename, "w")
f:write(table.concat(contents, "\n"))
f:close()
end
return idxfiles
end
local function run_indexing_command (command, par)
-- detect command name from the command. It will be the first word
local cmd_name = command:match("^[%a]+") or "indexing"
local xindylog = logging.new(cmd_name)
-- support split index
local subindexes = splitindex(par) or {}
if #subindexes > 0 then
-- call the command again on all files produced by splitindex
for _, subindex in ipairs(subindexes) do
-- make copy of the parameters
local t = {}
for k,v in pairs(par) do t[k] = v end
t.idxfile = subindex
run_indexing_command(command, t)
end
return nil
end
local newidxfile, idxdata = prepare_tmp_idx(par)
if not newidxfile then
-- the idxdata will contain error message in the case of error
xindylog:warning(idxdata)
return false
end
par.newidxfile = newidxfile
xindylog:debug("Prepared temporary idx file: ", newidxfile)
-- prepare modules
local xindy_call = command % par
xindylog:info(xindy_call)
local status = mkutils.execute(xindy_call)
-- insert correct links to the index
local status, msg = process_index(par.indfile, idxdata)
if not status then xindylog:warning(msg) end
-- remove the temporary idx file
os.remove(newidxfile)
-- null the indfile, it is necessary in order to support
-- multiple indices
par.indfile = nil
end
M.get_utf8 = get_utf8
M.load_enc = load_enc
M.parse_idx = parse_idx
M.fix_idx_pages = fix_idx_pages
M.prepare_idx = prepare_idx
M.process_index = process_index
M.prepare_tmp_idx = prepare_tmp_idx
M.run_indexing_command = run_indexing_command
return M