-- Copyright (c) 2021-2025 Zeping Lee

--
-- Copyright (c) 2021-2025 Zeping Lee
-- Released under the MIT license.
-- Repository: https://github.com/zepinglee/citeproc-lua
--

-- Module table; it is returned at the end of the file.
local bibtex2csl = {}

-- Dependencies. `journal_data` and `latex_parser` are only declared here;
-- they are required lazily further down, at the point where they are needed.
local uni_utf8
local bibtex_parser
local bibtex_data
local journal_data
local latex_parser
local unicode
local util
-- A successful `require("kpse")` is taken to mean that the code runs under
-- LuaTeX. In that case the flat `citeproc-xxx` module names are used,
-- otherwise the dotted `citeproc.xxx` names together with `lua-utf8`.
local using_luatex, _ = pcall(require, "kpse")
if using_luatex then
uni_utf8 = require("unicode").utf8
bibtex_parser = require("citeproc-bibtex-parser")
bibtex_data = require("citeproc-bibtex-data")
unicode = require("citeproc-unicode")
util = require("citeproc-util")
else
uni_utf8 = require("lua-utf8")
bibtex_parser = require("citeproc.bibtex-parser")
bibtex_data = require("citeproc.bibtex-data")
unicode = require("citeproc.unicode")
util = require("citeproc.util")
end

---@alias CslItem ItemData
---@alias CslData CslItem[]

---Parse a BibTeX database string and convert its entries to CSL-JSON items.
---@param str string BibTeX source text.
---@param keep_unknown_commands boolean? Keep unknown LaTeX markup in <code>.
---@param case_protection boolean? Add case-protection to braces as in BibTeX.
---@param sentence_case_title boolean? Convert `title` and `booktitle` to sentence case.
---@param check_sentence_case boolean? Check titles that are already sentence cased and do not convert them.
---@return CslData? csl_data `nil` when the parser returns no data.
---@return Exception[] exceptions Second result of `bibtex_parser.parse`.
function bibtex2csl.parse_bibtex_to_csl(str, keep_unknown_commands, case_protection, sentence_case_title,
                                        check_sentence_case)
  -- Hand the macro values from `bibtex_data.macros` to the parser as its
  -- initial string table.
  local predefined_strings = {}
  for macro_name, macro_data in pairs(bibtex_data.macros) do
    predefined_strings[macro_name] = macro_data.value
  end

  local parsed, exceptions = bibtex_parser.parse(str, predefined_strings)
  if not parsed then
    return nil, exceptions
  end

  -- TODO: Ideally we should load all .bib files and then resolve crossrefs and related
  bibtex_parser.resolve_crossrefs(parsed.entries, bibtex_data.entries_by_id)
  local csl_items = bibtex2csl.convert_to_csl_data(parsed, keep_unknown_commands, case_protection,
    sentence_case_title, check_sentence_case)
  return csl_items, exceptions
end

---Convert every entry of a parsed BibTeX database to a CSL item and then
---resolve the `related` links between the converted items.
---@param bib BibtexData Parsed database; `bib.entries` is read in order.
---@param keep_unknown_commands boolean?
---@param case_protection boolean?
---@param sentence_case_title boolean?
---@param check_sentence_case boolean?
---@return CslData items One CSL item per entry, in entry order.
function bibtex2csl.convert_to_csl_data(bib, keep_unknown_commands, case_protection, sentence_case_title,
                                        check_sentence_case)
  local items = {}
  -- BibTeX looks for crossref in a case-insensitive manner, so entries are
  -- indexed by their case-folded key; items are indexed by their id.
  local entries_by_folded_key = {}
  local items_by_id = {}

  for _, bib_entry in ipairs(bib.entries) do
    entries_by_folded_key[unicode.casefold(bib_entry.key)] = bib_entry

    local csl_item = bibtex2csl.convert_to_csl_item(bib_entry, keep_unknown_commands, case_protection,
      sentence_case_title, check_sentence_case)
    items[#items + 1] = csl_item
    items_by_id[csl_item.id] = csl_item
  end

  bibtex2csl.resolve_related(items_by_id, entries_by_folded_key)

  return items
end

---Convert one BibTeX entry to a CSL item.
---
---The item starts as `{id = entry.key, type = "document"}`. Special fields are
---pre-processed, then the fields listed in `bibtex_data.primary_fields` are
---converted, then all entry fields in sorted order, and finally the special
---fields are post-processed. A CSL variable that is already set is never
---overwritten by a later field.
---@param entry BibtexEntry
---@param keep_unknown_commands boolean?
---@param case_protection boolean?
---@param sentence_case_title boolean?
---@param check_sentence_case boolean?
---@param disable_journal_abbreviation boolean? Forwarded to `post_process_special_fields`.
---@return CslItem
function bibtex2csl.convert_to_csl_item(entry, keep_unknown_commands, case_protection, sentence_case_title,
                                        check_sentence_case, disable_journal_abbreviation)
  ---@type CslItem
  local item = { id = entry.key, type = "document" }

  -- Convert the current value of one BibTeX field. `item.language` is read at
  -- call time because it may be filled in while the fields are converted.
  local function convert(bib_field)
    return bibtex2csl.convert_field(bib_field, entry.fields[bib_field], keep_unknown_commands,
      case_protection, sentence_case_title, item.language, check_sentence_case)
  end

  -- Store a converted value unless the CSL variable is already set.
  local function assign(csl_field, csl_value)
    if csl_field and csl_value and not item[csl_field] then
      item[csl_field] = csl_value
    end
  end

  bibtex2csl.pre_process_special_fields(item, entry)

  -- First convert primary fields; the target variable comes from
  -- `bibtex_data.primary_fields`, not from `convert_field`.
  for bib_field, target_field in pairs(bibtex_data.primary_fields) do
    if entry.fields[bib_field] then
      local _, csl_value = convert(bib_field)
      assign(target_field, csl_value)
    end
  end

  -- Convert the fields in a fixed order
  local sorted_fields = {}
  for bib_field in pairs(entry.fields) do
    sorted_fields[#sorted_fields + 1] = bib_field
  end
  table.sort(sorted_fields)

  for index = 1, #sorted_fields do
    assign(convert(sorted_fields[index]))
  end

  bibtex2csl.post_process_special_fields(item, entry, disable_journal_abbreviation)
  return item
end

---Set the CSL variables that have to be known before the generic field
---conversion runs: type, edition, language, merged titles, and the
---status/genre derived from `howpublished` and `pubstate`.
---@param item CslItem Item being built; modified in place.
---@param entry BibtexEntry Source entry; its titles are merged in place.
function bibtex2csl.pre_process_special_fields(item, entry)
  local fields = entry.fields

  -- CSL types: fall back to "webpage" for unmapped types that carry a URL.
  local type_data = bibtex_data.types[entry.type]
  if type_data and type_data.csl then
    item.type = type_data.csl
  elseif fields.url then
    item.type = "webpage"
  end

  -- BibTeX's `edition` is expected to be an ordinal.
  local edition = fields.edition
  if edition then
    item.edition = util.convert_ordinal_to_arabic(edition)
  end

  -- language: convert `babel` language to ISO 639-1 language code
  local babel_language = fields.langid or fields.language
  if babel_language then
    item.language = bibtex_data.language_code_map[unicode.casefold(babel_language)]
  end
  -- if not item.language then
  --   if util.has_cjk_char(item.title) then
  --     item.language = "zh"
  --   end
  -- end

  -- Merge title, maintitle, subtitle, titleaddon
  bibtex2csl.process_titles(entry)

  -- biblatex-apa: manuscript states given in `howpublished`.
  local manuscript_genres = {
    manunpub = "Unpublished manuscript",
    maninprep = "Manuscript in preparation",
    mansub = "Manuscript submitted for publication",
  }
  local howpublished = fields.howpublished
  if howpublished then
    howpublished = string.lower(howpublished)
    if howpublished == "advance online publication" then
      -- TODO: get_locale_term("advance online publication" )
      item.status = "Advance online publication"
    else
      local genre = manuscript_genres[howpublished]
      if genre then
        item.genre = genre
      end
    end
  end

  local pubstate = fields.pubstate
  if pubstate and string.lower(pubstate) == "inpress" then
    -- TODO: get_locale_term("advance online publication" )
    item.status = "in press"
  end
end

---Merge the title-related fields of an entry in place.
---
---Subtitles are appended to their titles with `util.join_title`, `maintitle`
---is moved to the position of the containing work, and the consumed subtitle
---fields are removed so that they are not converted a second time.
---@param entry BibtexEntry Entry whose `fields` table is modified in place.
function bibtex2csl.process_titles(entry)
  local fields = entry.fields

  -- title and subtitle
  if fields.subtitle then
    if fields.title then
      -- Keep the bare title as the short form *before* the subtitle is
      -- appended (the same is done for `booktitle` below); otherwise the
      -- short title would just duplicate the full title.
      if not fields.shorttitle then
        fields.shorttitle = fields.title
      end
      fields.title = util.join_title(fields.title, fields.subtitle)
    else
      fields.title = fields.subtitle
    end
    fields.subtitle = nil
  end

  -- booktitle and booksubtitle
  if fields.booksubtitle then
    if not fields["container-title-short"] then
      fields["container-title-short"] = fields.booktitle
    end
    if fields.booktitle then
      fields.booktitle = util.join_title(fields.booktitle, fields.booksubtitle)
    else
      fields.booktitle = fields.booksubtitle
    end
    fields.booksubtitle = nil
  end

  -- mainsubtitle
  -- NOTE(review): unlike the other subtitle fields, `mainsubtitle` is not
  -- cleared after merging; confirm whether that is intentional.
  if fields.mainsubtitle then
    if fields.maintitle then
      fields.maintitle = util.join_title(fields.maintitle, fields.mainsubtitle)
    else
      fields.maintitle = fields.mainsubtitle
    end
  end

  -- maintitle
  if fields.maintitle then
    if entry.type == "audio" or entry.type == "video" then
      -- maintitle is the container-title
      fields["container-title"] = fields.maintitle
    elseif fields.booktitle then
      -- maintitle is with booktitle: the booktitle becomes the volume title
      -- and the maintitle takes its place.
      if not fields["volume-title"] then
        fields["volume-title"] = fields.booktitle
        fields.booktitle = fields.maintitle
      end
    elseif fields.title then
      -- maintitle is with title: the title becomes the volume title and the
      -- maintitle takes its place.
      if not fields["volume-title"] then
        fields["volume-title"] = fields.title
        fields.title = fields.maintitle
      end
    else
      -- This is unlikely to happen.
      fields.title = fields.maintitle
    end
  end

  -- journaltitle/journal and journalsubtitle
  if fields.journalsubtitle then
    if fields.journaltitle then
      fields.journaltitle = util.join_title(fields.journaltitle, fields.journalsubtitle)
    elseif fields.journal then
      -- Join with the subtitle, not with the journal name itself.
      fields.journal = util.join_title(fields.journal, fields.journalsubtitle)
    end
    fields.journalsubtitle = nil
  end

  -- issuetitle and issuesubtitle
  if fields.issuesubtitle then
    if fields.issuetitle then
      fields.issuetitle = util.join_title(fields.issuetitle, fields.issuesubtitle)
    else
      fields.issuetitle = fields.issuesubtitle
    end
    fields.issuesubtitle = nil
  end
end

---Convert a single BibTeX field to its CSL variable and value.
---
---Returns `nil, nil` when `bibtex_data.fields` has no entry for the field or
---when that entry's `csl` target is not a string.
---@param bib_field string
---@param value string
---@param keep_unknown_commands boolean?
---@param case_protection boolean?
---@param sentence_case_title boolean?
---@param language string?
---@param check_sentence_case boolean?
---@return string? csl_field
---@return string | table | number? csl_value
function bibtex2csl.convert_field(bib_field, value, keep_unknown_commands, case_protection, sentence_case_title,
                                  language, check_sentence_case)
  local field_data = bibtex_data.fields[bib_field]
  local csl_field = field_data and field_data.csl
  if type(csl_field) ~= "string" then
    return nil, nil
  end

  -- The LaTeX parser is loaded on first use.
  if not latex_parser then
    latex_parser = require(using_luatex and "citeproc-latex-parser" or "citeproc.latex-parser")
  end

  -- BibTeX fields whose value may be converted to sentence case.
  local title_like = {
    title = true,
    shorttitle = true,
    booktitle = true,
    ["container-title-short"] = true,
  }

  local kind = field_data.type
  local csl_value

  if kind == "name" then
    -- 1. unicode 2. purify (remove LaTeX markups) 3. plain text 4. split name parts
    local plain = latex_parser.latex_to_unicode(value)
    local name_strings = bibtex_parser.split_names(plain)
    csl_value = {}
    for index, name_string in ipairs(name_strings) do
      local parts = bibtex_parser.split_name_parts(name_string)
      csl_value[index] = bibtex2csl.convert_to_csl_name(parts)
    end

  elseif kind == "date" then
    local date_text = value
    if string.find(value, "\\", 1, true) then
      -- "{\noopsort{1973c}}1981"
      date_text = latex_parser.latex_to_pseudo_html(value, false, false)
    end
    -- "-3000~" should not be converted to "-3000 "
    csl_value = util.parse_edtf(date_text)

  elseif title_like[bib_field] then
    -- 1. unicode 2. sentence case 3. html tag
    local is_english = not language or util.startswith(language, "en")
    if sentence_case_title and is_english then
      csl_value = latex_parser.latex_to_sentence_case_pseudo_html(value, keep_unknown_commands,
        case_protection, check_sentence_case)
    else
      csl_value = latex_parser.latex_to_pseudo_html(value, keep_unknown_commands, case_protection)
    end

  else
    -- 1. unicode 2. html tag
    csl_value = latex_parser.latex_to_pseudo_html(value, keep_unknown_commands, case_protection)
    if csl_field == "volume" or csl_field == "page" then
      -- Replace en dashes by plain hyphens in volume and page values.
      csl_value = string.gsub(csl_value, util.unicode["en dash"], "-")
    end
  end

  return csl_field, csl_value
end

---Remove every brace from a BibTeX name part.
---@param name_part string? Name part as produced by the name splitter.
---@return string? cleaned The part without `{` and `}`; `nil` when `name_part` is missing.
local function clean_name_part(name_part)
  if not name_part then
    return nil
  end
  -- The parentheses drop the second result of `string.gsub` (the number of
  -- substitutions) so that exactly one value is returned.
  return (string.gsub(name_part, "[{}]", ""))
end

---Convert split BibTeX name parts to a CSL name table.
---
---A name that consists only of a `last` part wrapped in one balanced pair of
---braces becomes a `literal` name (outer braces removed). Otherwise the parts
---are mapped to `family`, `non-dropping-particle`, `given` and `suffix` with
---all braces stripped.
---@param bibtex_name table Table with optional `first`, `von`, `last`, `jr` strings.
---@return table csl_name
function bibtex2csl.convert_to_csl_name(bibtex_name)
  local last = bibtex_name.last
  local has_other_parts = bibtex_name.first or bibtex_name.von or bibtex_name.jr

  if last and not has_other_parts and string.match(last, "^%b{}$") then
    return { literal = string.sub(last, 2, -2) }
  end

  return {
    given = clean_name_part(bibtex_name.first),
    ["non-dropping-particle"] = clean_name_part(bibtex_name.von),
    family = clean_name_part(last),
    suffix = clean_name_part(bibtex_name.jr),
  }
end

-- URL prefixes used to recognise arXiv, PubMed and PubMed Central links in an
-- item's `URL` and to strip the prefix off to obtain the identifier.
local arxiv_url_prefix = "https://arxiv.org/abs/"
local pubmed_url_prefix = "https://www.ncbi.nlm.nih.gov/pubmed/"
local pubmed_central_url_prefix = "https://www.ncbi.nlm.nih.gov/pmc/articles/"

---Finish a converted item with the adjustments that depend on several fields
---at once: type refinement from `entrysubtype` and eprint data, date
---completion from `month`, `number`/`organization` placement, journal
---abbreviations, and DOI/PMID/PMCID/arXiv identifiers.
---@param item CslItem Item to finish; modified in place.
---@param entry BibtexEntry Entry the item was converted from.
---@param disable_journal_abbreviation boolean? When truthy, the journal abbreviation lookup is skipped.
function bibtex2csl.post_process_special_fields(item, entry, disable_journal_abbreviation)
  local bib_type = entry.type
  local bib_fields = entry.fields
  local subtype = bib_fields.entrysubtype

  -- biblatex-apa
  if item.type == "article-journal" and subtype == "nonacademic" then
    item.type = "article-magazine"
    item.genre = nil

  elseif item.type == "motion_picture" then
    if subtype == "tvseries" then
      item.type = "broadcast"
      if item.genre == "tvseries" then
        item.genre = "TV series"
      end
    elseif subtype == "tvepisode" then
      item.type = "broadcast"
      if item.genre == "tvepisode" then
        item.genre = "TV series episode"
      end
    end

  elseif item.type == "song" then
    if subtype == "podcast" then
      item.type = "broadcast"
      if item.genre == "podcast" then
        item.genre = "Audio podcast"
      end
    elseif subtype == "podcastepisode" then
      item.type = "broadcast"
      if item.genre == "podcastepisode" then
        item.genre = "Audio podcast episode"
      end
    elseif subtype == "interview" then
      item.type = "interview"
    elseif subtype == "speech" then
      item.type = "speech"
    end

  elseif item.type == "graphic" then
    item["archive-place"] = item["publisher-place"]

    if subtype == "map" then
      item.type = "map"
    end

  elseif item.type == "webpage" then

    -- eprinttype is mapped to
    -- - `archive` for Google books;
    -- - `container-title` for twitter post;
    -- - `publisher` for arXiv preprint;

    local eprint_type_map = {
      facebook = "post",
      instagram = "post",
      reddit = "post",
      twitter = "post",
      arxiv = "preprint",
      psyarxiv = "preprint",
      pubmed = "preprint",
      ["pubmed central"] = "preprint",
    }

    -- Biblatex's `online` type can be mapped to `post`, `preprint`
    -- local eprinttype = entry.fields.eprinttype or entry.fields.archiveprefix
    if item.archive then
      local eprint_type = eprint_type_map[string.lower(item.archive)]
      if eprint_type then
        item.type = eprint_type
      elseif item.number then
        item.type = "preprint"
      elseif bib_fields.eprint then
        item.type = "preprint"
        -- The eprint identifier is the preprint number.
        item.number = bib_fields.eprint
      elseif item.DOI then
        item.type = "preprint"
      elseif item.URL then
        if util.startswith(item.URL, arxiv_url_prefix)
            or util.startswith(item.URL, pubmed_url_prefix)
            or util.startswith(item.URL, pubmed_central_url_prefix) then
          item.type = "preprint"
        end
      end

      if item.type == "preprint" then
        -- For preprints the archive acts as the publisher.
        if not item.publisher then
          item.publisher = item.archive
          item.archive = nil
        end
        if not item.number then
          item.number = bib_fields.eprint
        end
      else
        -- Otherwise (e.g. social media posts) it acts as the container.
        if not item["container-title"] then
          item["container-title"] = item.archive
          item.archive = nil
        end
      end
    end

  end

  -- event-date
  if item["event-date"] and not item.issued then
    item.issued = util.deep_copy(item["event-date"])
  end

  -- event-title: for compatibility with CSL v1.0.1 and earlier versions
  if item["event-title"] then
    item.event = item["event-title"]
  end

  -- Journal abbreviations
  if not disable_journal_abbreviation then
    if item.type == "article-journal" or item.type == "article-magazine"
        or item.type == "article-newspaper" then
      bibtex2csl.check_journal_abbreviations(item)
    end
  end

  if not item.genre then
    if bib_type == "phdthesis" then
      -- from APA
      item.genre = "Doctoral dissertation"
    elseif bib_type == "mastersthesis" then
      item.genre = "Master’s thesis"
    end
  end

  -- month: accepts "<month> <day>", "<day> <month>" or "<month>" and only
  -- fills the date parts when `issued` has a year but no month yet.
  local month_text = bib_fields.month
  if month_text then
    -- The LaTeX parser is normally loaded by `convert_field`; load it here as
    -- well so that this function does not depend on a prior conversion.
    if not latex_parser then
      if using_luatex then
        latex_parser = require("citeproc-latex-parser")
      else
        latex_parser = require("citeproc.latex-parser")
      end
    end
    month_text = latex_parser.latex_to_pseudo_html(month_text, false, false)
    local month, day = uni_utf8.match(month_text, "^(%a+)%.?,?%s+(%d+)%a*$")
    if not month then
      day, month = uni_utf8.match(month_text, "^(%d+)%a*%s+(%a+)%.?$")
    end
    if not month then
      -- Same matcher as the two patterns above, for consistent handling of
      -- letters.
      month = uni_utf8.match(month_text, "^(%a+)%.?$")
    end
    if month then
      month = bibtex_data.months[unicode.casefold(month)]
    end
    if month and item.issued and item.issued["date-parts"] and
        item.issued["date-parts"][1] and
        item.issued["date-parts"][1][2] == nil then
      item.issued["date-parts"][1][2] = tonumber(month)
      if day then
        item.issued["date-parts"][1][3] = tonumber(day)
      end
    end
  end

  -- number: the issue of a periodical, the number within a collection, or
  -- the item's own number.
  if bib_fields.number then
    if item.type == "article-journal" or item.type == "article-magazine" or
        item.type == "article-newspaper" or item.type == "periodical" then
      if not item.issue then
        item.issue = bib_fields.number
      end
    elseif item["collection-title"] and not item["collection-number"] then
      item["collection-number"] = bib_fields.number
    elseif not item.number then
      -- A single hyphen between two non-hyphen characters is escaped with a
      -- backslash (presumably so that it is not read as a number range).
      item.number = (string.gsub(bib_fields.number, "([^-])%-([^-])", "%1\\-%2"))
    end
  end

  -- organization: the `organizer` that sponsors a conference or a `publisher` that publishes a `@manual` or `@online`.
  if bib_fields.organization then
    if item.publisher or bib_type == "inproceedings" or bib_type == "proceedings" then
      if not item.organizer then
        item.organizer = {
          literal = bib_fields.organization,
        }
      end
    else
      -- No publisher is set when this branch is reached.
      item.publisher = bib_fields.organization
    end
  end

  -- DOI: store the bare identifier without a resolver prefix.
  if type(item.DOI) == "string" then
    item.DOI = util.remove_prefix(item.DOI, "https://doi.org/")
    item.DOI = util.remove_prefix(item.DOI, "doi.org/")
  end

  -- PMID
  if bib_fields.eprint and type(bib_fields.eprinttype) == "string" and
      string.lower(bib_fields.eprinttype) == "pubmed" and not item.PMID then
    item.PMID = bib_fields.eprint
  end

  if item.URL then
    if item.type == "preprint" and util.startswith(item.URL, arxiv_url_prefix) then
      if not item.publisher then
        item.publisher = "arXiv"
        item.archive = nil
      end
      if not item.number then
        -- The arXiv identifier is what follows the arXiv URL prefix.
        item.number = util.remove_prefix(item.URL, arxiv_url_prefix)
      end
    end

    if util.startswith(item.URL, pubmed_url_prefix) then
      if not item.publisher then
        item.publisher = "PubMed"
        item.archive = nil
      end
      if not item.PMID then
        item.PMID = util.remove_prefix(item.URL, pubmed_url_prefix)
      end
    end

    if util.startswith(item.URL, pubmed_central_url_prefix) then
      if not item.publisher then
        item.publisher = "PubMed Central"
        item.archive = nil
      end
      if not item.PMCID then
        item.PMCID = util.remove_prefix(item.URL, pubmed_central_url_prefix)
      end
    end
  end

  -- `APA Education [@APAEducation], (2018, June 29). College students are forming menta/-health c/ubs-and they're making a difference @washingtonpost [Thumbnail with link attached]`
  -- The `[Thumbnail with link attached]` should not be italicized.
  -- if bib_fields.titleaddon and item.genre and item.genre ~= bib_fields.titleaddon then
  --   if item.title then
  --     item.title = string.format("%s [%s]", item.title, bib_fields.titleaddon)
  --   end
  -- end

end

---Fill in `container-title-short` from the journal abbreviation data.
---
---Does nothing unless the item has a `container-title` and no
---`container-title-short`. The lookup key is the container title with periods
---removed and case-folded. If the key is a known full title, the abbreviation
---is stored as the short title. If it is a known abbreviation, the given
---title becomes the short title and the full title replaces it.
---@param item CslItem Item to update in place.
function bibtex2csl.check_journal_abbreviations(item)
  local container_title = item["container-title"]
  if not container_title or item["container-title-short"] then
    return
  end

  -- The journal data is loaded on first use.
  if not journal_data then
    if using_luatex then
      journal_data = require("citeproc-journal-data")
    else
      journal_data = require("citeproc.journal-data")
    end
  end

  -- The inner parentheses keep only the string result of `string.gsub`, so
  -- the substitution count is not passed on to `casefold` as a second argument.
  local key = unicode.casefold((string.gsub(container_title, "%.", "")))

  local abbr = journal_data.abbrevs[key]
  if abbr then
    item["container-title-short"] = abbr
    return
  end

  local full = journal_data.unabbrevs[key]
  if full then
    item["container-title-short"] = container_title
    item["container-title"] = full
  end
end

-- Maps a variable of the related item to the `original-*` variable of the
-- referring item that receives a copy of it (used by `resolve_related` for
-- reprint and translation relations).
local original_field_dict = {
author = "original-author",
issued = "original-date",
publisher = "original-publisher",
["publisher-place"] = "original-publisher-place",
title = "original-title",
}

-- Maps a variable of the related item to the `reviewed-*` variable of the
-- referring item (used by `resolve_related` for the `reviewof` relation).
local reviewed_field_dict = {
author = "reviewed-author",
genre = "reviewed-genre",
title = "reviewed-title",
}

---Copy variables from related entries into the items that refer to them.
---
---For every entry with a `related` field, the related entry is looked up by
---its case-folded key. Depending on `relatedtype`, selected variables of the
---related item are deep-copied into the `original-*` or `reviewed-*` variables
---of the referring item, unless those are already set. A warning is issued
---when the related entry cannot be found.
---@param csl_item_dict table<string, CslItem> Items indexed by id.
---@param bib_entry_dict table<string, BibtexEntry> Entries indexed by case-folded key.
function bibtex2csl.resolve_related(csl_item_dict, bib_entry_dict)
  -- Which field mapping applies to which relation type.
  local mapping_by_relation = {
    reprintof = original_field_dict,
    reprintfrom = original_field_dict,
    translationof = original_field_dict,
    translationfrom = original_field_dict,
    reviewof = reviewed_field_dict,
  }

  for _, bib_entry in pairs(bib_entry_dict) do
    local related_key = bib_entry.fields.related
    if related_key then
      local related_entry = bib_entry_dict[unicode.casefold(related_key)]
      if not related_entry then
        util.warning(string.format('Related entry "%s" of "%s" not found.', related_key, bib_entry.key))
      else
        local relation = bib_entry.fields.relatedtype
        local field_mapping = relation and mapping_by_relation[relation]
        if field_mapping then
          local target_item = csl_item_dict[bib_entry.key]
          local source_item = csl_item_dict[related_entry.key]
          for source_field, target_field in pairs(field_mapping) do
            if not target_item[target_field] and source_item[source_field] then
              target_item[target_field] = util.deep_copy(source_item[source_field])
            end
          end
        end
      end
    end
  end
end

return bibtex2csl