-- The lexical analysis step of static analysis converts expl3 parts of the input files into TeX tokens.

local get_option = require("explcheck-config").get_option
local ranges = require("explcheck-ranges")
local obsolete = require("explcheck-latex3").obsolete
local parsers = require("explcheck-parsers")

local new_range = ranges.new_range
local range_flags = ranges.range_flags

local EXCLUSIVE = range_flags.EXCLUSIVE
local INCLUSIVE = range_flags.INCLUSIVE

local lpeg = require("lpeg")

-- The types of tokens that appear in the results of the analysis.
local CONTROL_SEQUENCE = "control sequence"
local CHARACTER = "character"
-- An argument corresponds to zero or more tokens inserted by a function call; it is never produced by lexical analysis.
local ARGUMENT = "argument"

-- The same token types, grouped in a table that is exported from this module.
local token_types = {
  ARGUMENT = ARGUMENT,
  CHARACTER = CHARACTER,
  CONTROL_SEQUENCE = CONTROL_SEQUENCE,
}

-- The set of category codes whose characters count as "simple text"; category codes are keys and all values are `true`.
local simple_text_catcodes = {}
for _, catcode in ipairs({
  3,   -- math shift
  4,   -- alignment tab
  5,   -- end of line
  7,   -- superscript
  8,   -- subscript
  9,   -- ignored character
  10,  -- space
  11,  -- letter
  12,  -- other
}) do
  simple_text_catcodes[catcode] = true
end

-- Decide whether a token is "simple text" [1, p. 383], i.e. text with no expected side effects.
--
-- Character tokens are simple when their category code is listed in `simple_text_catcodes`.
-- Control sequences and arguments are never simple. Any other token type raises an error.
--
--  [1]: Donald Ervin Knuth. 1986. TeX: The Program. Addison-Wesley, USA.
--
local function is_token_simple(token)
  local token_type = token.type
  if token_type == CHARACTER then
    return simple_text_catcodes[token.catcode] ~= nil
  end
  if token_type == CONTROL_SEQUENCE or token_type == ARGUMENT then
    return false
  end
  error('Unexpected token type "' .. token.type .. '"')
end

-- Create an accessor for the byte ranges of tokens.
--
-- Given a list of tokens, returns a function that takes a token number (an index into the list)
-- and returns the `byte_range` field of the corresponding token.
local function get_token_byte_range(tokens)
  local function byte_range_of(token_number)
    return tokens[token_number].byte_range
  end
  return byte_range_of
end

-- Tokenize the content and register any issues.
local function lexical_analysis(pathname, content, issues, results, options)

-- Process bytes within a given range similarly to TeX's input processor (TeX's "eyes" [1]) and produce lines.
--
-- This function is meant to run inside a coroutine: for every line that `parsers.tex_lines` finds in the range,
-- it yields two values:
--
--  1. the text of the line followed by `parsers.expl3_endlinechar`, and
--  2. a function `map_back(index)` that maps a 1-based index into the yielded text back to a byte index in `content`.
--     Indices that fall into the appended \endlinechar map to the position computed from the end of the line text,
--     clamped to be at least 1.
--
-- See also:
-- - Section 31 on page 16 and Section 362 on page 142 of Knuth (1986) [1]
-- - Section 7 on page 36 and Section 8 on page 42 of Knuth (1986) [2]
-- - Section 1.2 on page 12 of Olsak (2001) [3]
--
--  [1]: Donald Ervin Knuth. 1986. TeX: The Program. Addison-Wesley, USA.
--  [2]: Donald Ervin Knuth. 1986. The TeXbook. Addison-Wesley, USA.
--  [3]: Petr Olsak. 2001. TeXbook naruby. Konvoj, Brno.
--       https://petr.olsak.net/ftp/olsak/tbn/tbn.pdf
--
local function get_lines(range)
  local range_content = content:sub(range:start(), range:stop())
  for _, line in ipairs(lpeg.match(parsers.tex_lines, range_content)) do
    local line_start, line_text, line_end = table.unpack(line)
    local line_range = new_range(line_start, line_end, EXCLUSIVE, #content)
    -- The locals above are fresh in every iteration, so the closure below can capture them directly.
    local function map_back(index)
      assert(index > 0)
      assert(index <= #line_text + #parsers.expl3_endlinechar)
      if index <= #line_text then  -- a line character
        local mapped_index = range:start() + line_range:start() + index - 2
        -- Strings cannot be indexed with integers in Lua (the result is always nil), so we compare the bytes using `sub`.
        -- NOTE(review): This assumes that `parsers.tex_lines` keeps the line text a verbatim prefix of the source line.
        assert(line_text:sub(index, index) == content:sub(mapped_index, mapped_index))
        return mapped_index
      else  -- an \endlinechar
        return math.max(1, range:start() + line_range:start() + #line_text - 2)
      end
    end
    coroutine.yield(line_text .. parsers.expl3_endlinechar, map_back)
  end
end

-- Process lines similarly to TeX's token processor (TeX's "mouth" [1]) and produce tokens and a tree of apparent TeX groupings.
--
-- Parameters:
-- - lines: an iterator that produces a line text (including the trailing \endlinechar) together with a function
--   that maps indices in the line text back to byte indices in the content, and nil after the last line
--
-- Returns:
-- - tokens: a list of tables with fields `type`, `payload`, `catcode`, and `byte_range`
-- - groupings: a table that maps the token number of every begin-grouping character to a grouping table with
--   the fields `start` and, if the grouping was closed, `stop`; every grouping table also maps the token numbers
--   of directly nested begin-grouping characters to their grouping tables
--
-- As a side effect, the issues s204, e208, and e209 are registered in `issues`.
--
-- The state of the token processor is "N" (new line), "M" (middle of line), or "S" (skipping spaces). As in TeX,
-- a space character only produces a token in state "M" and then switches the state to "S", so that a run of spaces
-- (or a space followed by an end of line) produces a single space token.
--
-- See also:
-- - Section 303 on page 122 of Knuth (1986) [1]
-- - Section 7 on page 36 and Section 8 on page 42 of Knuth (1986) [2]
-- - Section 1.3 on page 19 of Olsak (2001) [3]
--
--  [1]: Donald Ervin Knuth. 1986. TeX: The Program. Addison-Wesley, USA.
--  [2]: Donald Ervin Knuth. 1986. The TeXbook. Addison-Wesley, USA.
--  [3]: Petr Olsak. 2001. TeXbook naruby. Konvoj, Brno.
--       https://petr.olsak.net/ftp/olsak/tbn/tbn.pdf
--
local function get_tokens(lines)
  local tokens = {}

  -- The top-level table doubles as the root of the tree of groupings; it has no `parent` field.
  local groupings = {}
  local current_grouping = groupings
  local parent_grouping

  local state

  -- Determine the category code of the at sign ("@").
  local make_at_letter = get_option("make_at_letter", options, pathname)
  if make_at_letter == "auto" then
    make_at_letter = results.seems_like_latex_style_file
  end

  for line_text, map_back in lines do
    state = "N"
    local character_index = 1

    -- Determine the category code of a single character.
    local function determine_expl3_catcode(character)
      local catcode
      if character == "@" then
        if make_at_letter then
          catcode = 11  -- letter
        else
          catcode = 12  -- other
        end
      else
        catcode = lpeg.match(parsers.determine_expl3_catcode, character)
      end
      return catcode
    end

    -- Read the character at a given index of the current line.
    -- Returns the character, its category code, and the number of bytes that the character occupies in the line.
    local function get_character_and_catcode(index)
      assert(index <= #line_text)
      local character = line_text:sub(index, index)
      local catcode = determine_expl3_catcode(character)
      -- Process TeX's double circumflex convention (^^X and ^^XX).
      local actual_character, index_increment = lpeg.match(parsers.double_superscript_convention, line_text, index)
      if actual_character ~= nil then
        local actual_catcode = determine_expl3_catcode(actual_character)
        return actual_character, actual_catcode, index_increment  -- double circumflex convention
      else
        return character, catcode, 1  -- single character
      end
    end

    local previous_catcode, previous_csname = 9, nil
    while character_index <= #line_text do
      local character, catcode, character_index_increment = get_character_and_catcode(character_index)
      -- Cover all bytes of the character, which may be more than one with the double circumflex convention.
      local character_end_index = character_index + character_index_increment - 1
      local range = new_range(character_index, character_end_index, INCLUSIVE, #line_text, map_back, #content)
      if (
            catcode ~= 9 and catcode ~= 10  -- a potential missing stylistic whitespace
            and (
              previous_catcode == 0  -- right after a control sequence
              or previous_catcode == 1 or previous_catcode == 2  -- or a begin/end grouping
            )
          ) then
        if (previous_catcode == 0) then
          assert(previous_csname ~= nil)
        end
        if (
              catcode ~= 0 and catcode ~= 1 and catcode ~= 2  -- for a control sequence or begin/end grouping, we handle this elsewhere
              -- do not require whitespace after non-expl3 control sequences or control sequences with empty or one-character names
              and (previous_catcode ~= 0 or #previous_csname > 1 and lpeg.match(parsers.expl3like_csname, previous_csname) ~= nil)
              and (previous_catcode ~= 0 or character ~= ",")  -- allow a comma after a control sequence without whitespace in between
              and (previous_catcode ~= 1 or catcode ~= 6)  -- allow a parameter after begin grouping without whitespace in between
              and (previous_catcode ~= 2 or character ~= ",")  -- allow a comma after end grouping without whitespace in between
            ) then
          issues:add('s204', 'missing stylistic whitespaces', range)
        end
      end
      if catcode == 0 then  -- control sequence
        local csname_table = {}
        local csname_index = character_index + character_index_increment
        -- The index of the last byte of the control sequence seen so far; initially, this is just the escape character,
        -- so that the range stays within the line even if no name follows.
        local previous_csname_index = character_end_index
        if csname_index <= #line_text then
          local csname_index_increment
          character, catcode, csname_index_increment = get_character_and_catcode(csname_index)
          table.insert(csname_table, character)
          previous_csname_index = csname_index + csname_index_increment - 1
          csname_index = csname_index + csname_index_increment
          if catcode == 11 then  -- control word
            state = "S"
            while csname_index <= #line_text do
              character, catcode, csname_index_increment = get_character_and_catcode(csname_index)
              if catcode == 11 then
                table.insert(csname_table, character)
                previous_csname_index = csname_index + csname_index_increment - 1
                csname_index = csname_index + csname_index_increment
              else
                break
              end
            end
          elseif catcode == 10 then  -- escaped space
            state = "S"
          else  -- control symbol
            state = "M"
          end
        end
        local csname = table.concat(csname_table)
        range = new_range(character_index, previous_csname_index, INCLUSIVE, #line_text, map_back, #content)
        table.insert(tokens, {
          type = CONTROL_SEQUENCE,
          payload = csname,
          catcode = 0,
          byte_range = range,
        })
        if (
              previous_catcode ~= 9 and previous_catcode ~= 10  -- a potential missing stylistic whitespace
              -- do not require whitespace before non-expl3 control sequences or control sequences with empty or one-character names
              and #csname > 1 and lpeg.match(parsers.expl3like_csname, csname) ~= nil
            ) then
          issues:add('s204', 'missing stylistic whitespaces', range)
        end
        previous_catcode, previous_csname = 0, csname
        character_index = csname_index
      elseif catcode == 5 then  -- end of line
        if state == "N" then
          table.insert(tokens, {
            type = CONTROL_SEQUENCE,
            payload = "par",
            catcode = 0,
            byte_range = range,
          })
        elseif state == "M" then
          table.insert(tokens, {
            type = CHARACTER,
            payload = " ",
            catcode = 10,
            byte_range = range,
          })
        end
        character_index = character_index + character_index_increment
      elseif catcode == 9 then  -- ignored character
        previous_catcode = catcode
        character_index = character_index + character_index_increment
      elseif catcode == 10 then  -- space
        if state == "M" then
          table.insert(tokens, {
            type = CHARACTER,
            payload = " ",
            catcode = 10,
            byte_range = range,
          })
          state = "S"  -- skip any immediately following spaces, as TeX does after producing a space token
        end
        previous_catcode = catcode
        character_index = character_index + character_index_increment
      elseif catcode == 14 then  -- comment character
        character_index = #line_text + 1  -- ignore the rest of the line
      else
        if catcode == 15 then  -- invalid character
          issues:add('e209', 'invalid characters', range)
        end
        if catcode == 1 or catcode == 2 then  -- begin/end grouping
          if catcode == 1 then  -- begin grouping
            current_grouping = {parent = current_grouping, start = #tokens + 1}
            assert(groupings[current_grouping.start] == nil)
            assert(current_grouping.parent[current_grouping.start] == nil)
            groupings[current_grouping.start] = current_grouping  -- provide flat access to groupings
            current_grouping.parent[current_grouping.start] = current_grouping  -- provide recursive access to groupings
          elseif catcode == 2 then  -- end grouping
            if current_grouping.parent ~= nil then
              current_grouping.stop = #tokens + 1
              assert(current_grouping.start ~= nil and current_grouping.start < current_grouping.stop)
              parent_grouping = current_grouping.parent
              current_grouping.parent = nil  -- remove a circular reference for the current grouping
              current_grouping = parent_grouping
            else
              issues:add('e208', 'too many closing braces', range)
            end
          end
          if (
                previous_catcode ~= 9 and previous_catcode ~= 10  -- a potential missing stylistic whitespace
                -- do not require whitespace after non-expl3 control sequences or control sequences with empty or one-character names
                and (previous_catcode ~= 0 or #previous_csname > 1 and lpeg.match(parsers.expl3like_csname, previous_csname) ~= nil)
                and (previous_catcode ~= 1 or catcode ~= 2)  -- allow an end grouping immediately after begin grouping
                and (previous_catcode ~= 6 or catcode ~= 1 and catcode ~= 2)  -- allow a parameter immediately before grouping
              ) then
            issues:add('s204', 'missing stylistic whitespaces', range)
          end
          previous_catcode = catcode
        elseif (  -- maybe a parameter?
              previous_catcode == 6 and catcode == 12
              and lpeg.match(parsers.decimal_digit, character) ~= nil
            ) then
          previous_catcode = 6  -- treat the digit as a part of the parameter for the purposes of the whitespace checks
        else  -- some other character
          previous_catcode = catcode
        end
        table.insert(tokens, {
          type = CHARACTER,
          payload = character,
          catcode = catcode,
          byte_range = range,
        })
        state = "M"
        character_index = character_index + character_index_increment
      end
    end
  end
  -- Remove circular references for all unclosed groupings.
  while current_grouping.parent ~= nil do
    parent_grouping = current_grouping.parent
    current_grouping.parent = nil
    current_grouping = parent_grouping
  end
  return tokens, groupings
end

-- Tokenize the content.
--
-- For every expl3 range, `get_lines` runs in a coroutine that produces the lines of the range on demand
-- and `get_tokens` consumes them. The tokens and groupings of every range are stored as separate parts.
local tokens, groupings = {}, {}
for _, range in ipairs(results.expl_ranges) do
  -- Unlike a bare `coroutine.resume`, whose status would need checking, `coroutine.wrap` re-raises any error
  -- from `get_lines`, so that an error message is never mistaken for the text of a line.
  -- After the last line, the iterator returns no values, which ends the loop in `get_tokens`.
  local lines = coroutine.wrap(function()
    get_lines(range)
  end)
  local part_tokens, part_groupings = get_tokens(lines)
  table.insert(tokens, part_tokens)
  table.insert(groupings, part_groupings)
end

-- Record issues that are apparent after the lexical analysis.
--
-- Every control sequence is checked on its own (argument specifiers, deprecation) and then together with
-- the control sequence token that immediately follows it, if any (malformed names in definitions and uses).
local function matches(parser, text)
  return lpeg.match(parser, text) ~= nil
end

for _, part_tokens in ipairs(tokens) do
  for token_index = 1, #part_tokens do
    local token = part_tokens[token_index]
    if token.type == CONTROL_SEQUENCE then
      local csname = token.payload

      -- Check the argument specifiers, i.e. the text between the first colon and the next colon or the end of the name.
      local argument_specifiers = csname:match(":([^:]*)")
      if argument_specifiers ~= nil then
        if lpeg.match(parsers.do_not_use_argument_specifiers, argument_specifiers) then
          issues:add('w200', '"do not use" argument specifiers', token.byte_range)
          issues:ignore('s206', token.byte_range)
          -- TODO: Add a configuration option that would allow us to express that w200 silences s206,
          --       so that we don't need to do this manually.
        end
        if not matches(parsers.argument_specifiers, argument_specifiers) then
          issues:add('e201', 'unknown argument specifiers', token.byte_range)
        end
      end

      -- Check for deprecated control sequences.
      if matches(obsolete.deprecated_csname, csname) then
        issues:add('w202', 'deprecated control sequences', token.byte_range)
      end

      -- Check the name of the control sequence that immediately follows.
      local next_token = part_tokens[token_index + 1]
      if next_token ~= nil and next_token.type == CONTROL_SEQUENCE then
        local next_csname = next_token.payload
        local next_range = next_token.byte_range

        local malformed_function_name = (
          matches(parsers.expl3_function_definition_csname, csname)
          and matches(parsers.expl3like_csname, next_csname)
          and not matches(parsers.expl3_expansion_csname, next_csname)
          and not matches(parsers.expl3_function_csname, next_csname)
        )
        if malformed_function_name then
          issues:add('s205', 'malformed function name', next_range)
        end

        local malformed_variable_or_constant_name = (
          matches(parsers.expl3_variable_or_constant_use_csname, csname)
          and matches(parsers.expl3like_csname, next_csname)
          and not matches(parsers.expl3_expansion_csname, next_csname)
          and not matches(parsers.expl3_scratch_variable_csname, next_csname)
          and not matches(parsers.expl3_variable_or_constant_csname, next_csname)
        )
        if malformed_variable_or_constant_name then
          issues:add('s206', 'malformed variable or constant name', next_range)
        end

        local malformed_quark_or_scan_mark_name = (
          matches(parsers.expl3_quark_or_scan_mark_definition_csname, csname)
          and not matches(parsers.expl3_quark_or_scan_mark_csname, next_csname)
          and not matches(parsers.expl3_expansion_csname, next_csname)
        )
        if malformed_quark_or_scan_mark_name then
          issues:add('s207', 'malformed quark or scan mark name', next_range)
        end
      end
    end
  end
end

-- Store the intermediate results of the analysis.
results.tokens, results.groupings = tokens, groupings
end

-- The public interface of the module; the lexical analysis step itself is exported under the name `process`.
return {
  process = lexical_analysis,
  token_types = token_types,
  is_token_simple = is_token_simple,
  get_token_byte_range = get_token_byte_range,
}