#!/usr/bin/env texlua
-- extractbb-lua
--
-- https://github.com/gucci-on-fleek/extractbb
-- SPDX-License-Identifier: MPL-2.0+
-- SPDX-FileCopyrightText: 2024--2025 Max Chernoff
--
-- Inclusion Methods
-- =================
--
-- This script can use two different methods to extract bounding boxes from
-- images: the "img" module and the "pdfe" module. The "img" module will be
-- automatically selected in most cases and supports all image types that are
-- supported by the original "extractbb" program. If and only if the "img"
-- module fails to load, the "pdfe" module will be used as a fallback. However,
-- the "pdfe" module only supports PDF files. Both modules are built into the
-- LuaTeX binaries; however, due to some technical issues, the "img" module may
-- fail to load on some more exotic platforms.
--
--
-- Compatibility
-- =============
--
-- Based off of my testing, this Lua script is 100% compatible with the original
-- C-based "extractbb" program, with the following exceptions:
--
-- * When running in "img" mode, the PDF version is always reported as "1.5".
--
-- * When running in "img" mode, if the requested bounding box is not found,
-- the script will fall back to the Crop box or the Media box, instead of
-- following the original fallback order. (In practice, almost all PDFs set
-- all their bounding boxes equal to each other, and even if the boxes are
-- set to different values, the script will still return the requested box,
-- provided that it is set in the PDF.)
--
-- * When running in "pdfe" mode, only PDF files are supported.
--
-- All of these issues are very unlikely to affect any real-world documents.
--
--
-- Security
-- ========
--
-- This script is designed to be safely run from restricted shell escape. A few
-- security features:
--
-- * The majority of this script runs inside a sandboxed Lua environment,
-- which only exposes a very restricted set of functions.
--
-- * All file-related functions available inside the sandbox first check with
-- kpathsea to ensure that the file is allowed to be opened.
--
-- * In the event of any errors, the script immediately exits.
--
-- * This script does not run (fork/exec) any external programs.
--
-- * This script is written entirely in Lua, so overflow/use-after-free
-- vulnerabilities are not possible.
--
-- Some potential security concerns:
--
-- * This script has not been audited or reviewed by anyone other than myself.
--
-- * The underlying LuaTeX modules may themselves have security
-- vulnerabilities, which would be inherited by this script.
----------------------
--- Initialization ---
----------------------
-- Pre-sandbox variables/constants
-- These are plain locals declared before the sandbox is installed, so the
-- sandboxed part of the script reaches them as upvalues and not through the
-- restricted environment table.
-- Whether the error handler prints a message and traceback; the "-q" and "-v"
-- options switch it off and on.
local show_errors = true
-- Optional timestamp read from the environment; it is later handed to os.date
-- when the "CreationDate" line is produced. nil when the variable is unset or
-- is not a number.
local SOURCE_DATE_EPOCH = tonumber(os.getenv("SOURCE_DATE_EPOCH"))
-- Printed by "--version" and written to the "Creator" line of the output.
local version = "extractbb.lua v1.1.0 (2025-02-11)" --%%version %%dashdate
-- Required for any kpathsea calls to work.
kpse.set_program_name("texlua", "extractbb")
-- Required to use the "img" module from texlua, but only works for LuaTeX
-- versions >= 1.21.0.
-- NOTE(review): 7661 is presumably the development id that corresponds to
-- LuaTeX 1.21.0 -- confirm. The sandbox is not installed yet, so the "error"
-- below is still the stock Lua function.
if not (status.development_id >= 7661) then
error("LuaTeX version is too old, cannot proceed.")
end
texconfig.texlua_img = true
-- We need to set \outputmode to PDF to be able to use most of the "img" module
-- functions, but to set \outputmode, we need to initialize the TeX interpreter.
-- The order of the following statements matters: the interpreter is
-- initialized first, then the "tex" table is published as a global, and only
-- then are its fields assigned.
tex.initialize()
_G.tex = package.loaded.tex
tex.enableprimitives("", tex.extraprimitives())
tex.outputmode = 1
-- NOTE(review): 0 is presumably batch mode (no terminal interaction) -- confirm.
tex.interactionmode = 0
-- "pdf" module
_G.pdf = package.loaded.pdf
pdf.setignoreunknownimages(1)
-- NOTE(review): presumably selects PDF 2.0 as the engine's output version so
-- that newer input PDFs are not rejected -- confirm against the LuaTeX manual.
pdf.setmajorversion(2)
pdf.setminorversion(0)
------------------
--- Sandboxing ---
------------------
-- Prepare the sandbox for the rest of the script.
-- "env" is the whitelist that later becomes the global environment. Plain
-- functions and whole library tables are shared by reference, whereas "io" and
-- "os" are replaced by fresh tables that hold only the members listed here.
local env = {}

-- Functions copied over unchanged.
env.ipairs   = ipairs
env.pairs    = pairs
env.print    = print
env.select   = select
env.tonumber = tonumber
env.type     = type

-- Values and library tables copied over unchanged.
env.arg   = arg
env.math  = math
env.pdfe  = pdfe
env.table = table

-- Cut-down libraries.
env.io = {
    stdout = io.stdout,
}
env.os = {
    date = os.date,
    exit = os.exit,
}
do
-- Saved global functions
-- Everything the sandbox helpers below need from the outside world is captured
-- here as a local, so that it stays reachable as an upvalue after the global
-- environment has been replaced.

-- Base library
local select, tostring = select, tostring
local debug_traceback = debug.traceback

-- Files and process
local io_open, io_stderr = io.open, io.stderr
local os_exit, os_setenv = os.exit, os.setenv
local lfs_attributes = lfs.attributes

-- kpathsea
local find_file = kpse.find_file
local kpse_in_name_ok, kpse_out_name_ok = kpse.in_name_ok, kpse.out_name_ok
local kpse_var_value = kpse.var_value

-- Image and PDF back ends
local img_scan = img.scan
local pdfe_open = pdfe.open
-- Error messages
-- Replacement for the stock "error" that is also exported into the sandbox.
-- Unless errors have been silenced, it writes a header, every argument
-- (converted with "tostring" and followed by a space), an empty line, and a
-- traceback to stderr. In every case it then flushes stderr and exits the
-- process with status 1.
local function error(...)
    if show_errors then
        local count = select("#", ...)
        local parts = { ... }

        -- Header
        io_stderr:write("! extractbb ERROR: ")

        -- Message; "count" is used instead of "#parts" so that nil arguments
        -- are still printed.
        for index = 1, count do
            io_stderr:write(tostring(parts[index]), " ")
        end

        -- Traceback
        io_stderr:write("\n", "\n")
        io_stderr:write(debug_traceback(nil, 2), "\n")
    end

    -- Flush and exit
    io_stderr:flush()
    os_exit(1)
end
env.error = error
-- Make sure that "openin_any" is at least "restricted", and that
-- "openout_any" is at least "paranoid".
local initial_openin = kpse_var_value("openin_any")
local initial_openout = kpse_var_value("openout_any")

-- Both "r" (restricted) and "p" (paranoid) already satisfy the requirement for
-- input files. Only tighten the setting when it is anything else, so that an
-- already-paranoid configuration is never relaxed to "restricted", and so that
-- the decision depends on the input setting alone.
if (initial_openin ~= "r") and (initial_openin ~= "p") then
    os_setenv("openin_any", "r")
end

-- For output files only "p" is acceptable.
if initial_openout ~= "p" then
    os_setenv("openout_any", "p")
end
-- Check the input paths.
-- Looks the name up with kpathsea (format "graphic/figure") and returns the
-- resolved path. Stops the script through "error" when the file cannot be
-- found, when kpathsea does not allow it to be read, or when it is not a
-- regular file.
local function resolve_input_name(file_name)
    local resolved = find_file(file_name, "graphic/figure", true)
    if not resolved then
        error("Cannot find input file:", file_name)
    end

    if not kpse_in_name_ok(resolved) then
        error("Input file is not allowed:", resolved)
    end

    if lfs_attributes(resolved, "mode") ~= "file" then
        error("Input file is not a regular file:", resolved)
    end

    return resolved
end
-- Check the output paths.
-- Returns "file_name" unchanged once it has passed every check: kpathsea must
-- allow writing to it, it must consist of a non-empty stem and a non-empty
-- extension, the extension must be "xbb" or "bb", and the stem must not contain
-- control characters or any of the characters % > < * |. Any failed check
-- stops the script through "error".
local function resolve_output_name(file_name)
    if not kpse_out_name_ok(file_name) then
        error("Output file is not allowed:", file_name)
    end

    -- Split at the last dot.
    local stem, suffix = file_name:match("(.+)%.([^.]-)$")
    local has_both_parts = stem and suffix and (stem ~= "") and (suffix ~= "")
    if not has_both_parts then
        error("Output file has no extension:", file_name)
    end

    local is_known_suffix = (suffix == "xbb") or (suffix == "bb")
    if not is_known_suffix then
        error("Output file has an invalid extension:", file_name)
    end

    -- We shouldn't allow files with weird characters in their names.
    if stem:match("[%c%%\t\r\n><*|]") then
        error("Output file has an invalid name:", file_name)
    end

    return file_name
end
-- Opens a file.
-- "read_write" must be "read" or "write" and selects both the path check
-- (input or output rules) and the base mode; "binary_text" must be "binary" or
-- "text" and decides whether "b" is appended to the mode. Returns the open
-- file handle; every failure stops the script through "error".
function env.open_file(file_name, read_write, binary_text)
    -- The direction is validated first, so that path problems are reported
    -- before a bad binary/text flag.
    local path, flags
    if read_write == "write" then
        path, flags = resolve_output_name(file_name), "w"
    elseif read_write == "read" then
        path, flags = resolve_input_name(file_name), "r"
    else
        error("Invalid read/write mode:", read_write)
    end

    if binary_text == "text" then
        -- Text mode needs no extra flag.
        flags = flags .. ""
    elseif binary_text == "binary" then
        flags = flags .. "b"
    else
        error("Invalid binary/text mode:", binary_text)
    end

    local handle, reason = io_open(path, flags)
    if not handle then
        error("Cannot open file:", path, reason)
    end
    return handle
end
-- Open a PDF file.
-- Replaces the "open" entry of the "pdfe" table exposed to the sandbox with a
-- wrapper that runs the name through the input path checks before handing the
-- resolved path to the saved original function.
function env.pdfe.open(file_name)
    return pdfe_open(resolve_input_name(file_name))
end
-- Open an image file.
-- When the "img" scanner is available, "open_image" checks the input path and
-- scans the file for the given page and page box, returning whatever the
-- scanner returns. Otherwise "open_image" is set to false, which the rest of
-- the script uses to select the "pdfe" fallback.
if img_scan then
    function env.open_image(file_name, page, box)
        local resolved = resolve_input_name(file_name)
        local request = {
            filename = resolved,
            filepath = resolved,
            page = page,
            pagebox = box,
        }
        return img_scan(request)
    end
else
    env.open_image = false
end
end
-- Prevent trying to change the environment.
-- Reading or assigning any name that is not already stored in "env" ends the
-- script through the sandbox error handler. The first argument of both
-- metamethods is the table itself, so it is dropped and only the key (and, for
-- assignments, the value) is reported.
local function bad_index(_, ...)
    env.error("Attempt to access an undefined index:", ...)
end

local env_guard = {
    __index = bad_index,
    __newindex = bad_index,
    -- A "__metatable" field makes getmetatable return this value instead of
    -- the real metatable.
    __metatable = false,
}
setmetatable(env, env_guard)

-- Set the environment.
_ENV = env
-----------------------------------
--- Post-Sandbox Initialization ---
-----------------------------------
-- Constants
-- BP_TO_SP divides the dimensions reported for PDF images, and IN_TO_BP
-- multiplies pixel counts that have been divided by a resolution; DATE_FORMAT
-- is the layout of the "CreationDate" line.
local BP_TO_SP, IN_TO_BP = 65781.76, 72
local DATE_FORMAT = "%a %b %d %H:%M:%S %Y" -- "%c"

-- Save often-used globals for a slight speed boost.
local floor = math.floor
local insert, remove, unpack = table.insert, table.remove, table.unpack
local script_arguments = arg

-- General-purpose functions
-- Rounds to the nearest integer; exact halves are rounded towards positive
-- infinity.
local function round(number)
    local shifted = number + 0.5
    return floor(shifted)
end
-------------------------
--- Argument Handling ---
-------------------------
-- Define the argument handling functions.
-- Every handler is stored under the option text that follows the first "-"
-- (so "--help" is stored as "-help") and is called with the list of arguments
-- that have not been consumed yet. Handlers that take a value remove it from
-- the front of that list.
local process_arguments = {}

-- > Specify a PDF pagebox for bounding box
-- > pagebox=cropbox, mediabox, artbox, trimbox, bleedbox
local bbox_option = "auto"
process_arguments.B = function(pending)
    bbox_option = remove(pending, 1)
end

-- > Show this help message and exit
process_arguments.h = function(_)
    print [[
Usage: extractbb [-B pagebox] [-p page] [-q|-v] [-O] [-m|-x] FILE...
extractbb --help|--version
Extract bounding box from PDF, PNG, JPEG, JP2, or BMP file; default output below.
Options:
-B pagebox Specify a PDF pagebox for bounding box
pagebox=cropbox, mediabox, artbox, trimbox, bleedbox
-h | --help Show this help message and exit
--version Output version information and exit
-p page Specify a PDF page to extract bounding box
-q Be quiet
-v Be verbose
-O Write output to stdout
-m Output .bb file used in DVIPDFM (default)
-x Output .xbb file used in DVIPDFMx
]]
    os.exit(0)
end
process_arguments["-help"] = process_arguments.h

-- > Output version information and exit
process_arguments.V = function(_)
    print(version)
    os.exit(0)
end
process_arguments["-version"] = process_arguments.V

-- > Specify a PDF page to extract bounding box
-- The value is converted with "tonumber", so it becomes nil when it is missing
-- or not numeric.
local page_number = 1
process_arguments.p = function(pending)
    page_number = tonumber(remove(pending, 1))
end

-- > Be quiet
process_arguments.q = function(_)
    show_errors = false
end

-- > Be verbose
process_arguments.v = function(_)
    show_errors = true
end

-- > Write output to stdout
-- Stays nil unless "-O" is given.
local output_file
process_arguments.O = function(_)
    output_file = io.stdout
end

-- Output format
-- Defaults to "bb" when the script name contains "ebb" and to "xbb" otherwise.
local output_format = script_arguments[0]:match("ebb") and "bb" or "xbb"

-- > Output .bb file used in DVIPDFM (default)
process_arguments.m = function(_)
    output_format = "bb"
end

-- > Output .xbb file used in DVIPDFMx
process_arguments.x = function(_)
    output_format = "xbb"
end

-- Get the input file name.
local input_name
process_arguments.i = function(pending)
    input_name = remove(pending, 1)
end
process_arguments["-input-name"] = process_arguments.i
-- Clear the interpreter and script names.
script_arguments[0] = nil
script_arguments[-1] = nil

-- Process the arguments.
-- Arguments are consumed from the front of the list until it is empty. Each
-- one is either expanded into several single-letter options, or dispatched to
-- its handler.
while script_arguments[1] do
    -- Get the next argument and strip one leading "-".
    local current = remove(script_arguments, 1)
    local option = current:match("^%-(.*)$")

    -- Default to "--input-name" if no command is given: the bare word is put
    -- back so that the handler can pick it up as its value.
    if not option then
        insert(script_arguments, 1, current)
        option = "-input-name"
    end

    if (#option >= 2) and (not option:match("^%-")) then
        -- Handle multi-character arguments: "-qx" is re-queued as "-q", "-x"
        -- at the front of the list, in the same order.
        for position = 1, #option do
            insert(
                script_arguments, position,
                "-" .. option:sub(position, position)
            )
        end
    else
        -- Get the function to process the argument and run it.
        local handler = process_arguments[option]
        if not handler then
            error("Invalid argument:", current)
        end
        handler(script_arguments)
    end
end
-- Validate the arguments.
-- The "-p" handler stores the result of "tonumber", which is nil when the
-- value is missing or not numeric. The type name has to be compared with "~=":
-- "not type(x) == "number"" negates the type name first and then compares a
-- boolean with a string, so it can never be true.
if type(page_number) ~= "number" then
    error("Invalid page number:", page_number)
end

-- At least one input file must have been given.
if not input_name then
    error("No input file specified.")
end
-- Validate the bounding box type. We need this rather crazy fallback scheme
-- to match the behaviour of "extractbb".
-- Every entry of "bbox_orders" is a list of boxes to try in order; each box
-- carries the name used by the "img" back end and the name used by the "pdfe"
-- back end.

-- Returns a new list that starts with "first" and continues with every entry
-- of "fallbacks", in order.
local function with_fallbacks(first, fallbacks)
    local order = { first }
    for _, entry in ipairs(fallbacks) do
        order[#order + 1] = entry
    end
    return order
end

local bbox_orders = {}
bbox_orders.mediabox = { { img = "media", pdfe = "MediaBox" } }
bbox_orders.cropbox = with_fallbacks(
    { img = "crop", pdfe = "CropBox" }, bbox_orders.mediabox
)
bbox_orders.artbox = with_fallbacks(
    { img = "art", pdfe = "ArtBox" }, bbox_orders.cropbox
)
bbox_orders.trimbox = with_fallbacks(
    { img = "trim", pdfe = "TrimBox" }, bbox_orders.artbox
)
bbox_orders.bleedbox = with_fallbacks(
    { img = "bleed", pdfe = "BleedBox" }, bbox_orders.trimbox
)

-- The default order reuses the leading entry of each list above.
bbox_orders.auto = {
    bbox_orders.cropbox[1],
    bbox_orders.artbox[1],
    bbox_orders.trimbox[1],
    bbox_orders.bleedbox[1],
    bbox_orders.mediabox[1],
}

-- Select the list that belongs to the "-B" option (or to the default).
local bbox_order = bbox_orders[bbox_option]
if not bbox_order then
    error("Invalid PDF box type:", bbox_option)
end
-- Set the default pixel resolution.
-- It replaces a missing or zero resolution reported for an image: 72 for the
-- "xbb" format and 100 for the "bb" format. Any other format is an error.
local default_dpi = ({ xbb = 72, bb = 100 })[output_format]
if not default_dpi then
    error("Invalid output format:", output_format)
end
-- Open the output file.
-- Nothing to do when "-O" already selected stdout. Otherwise the output name
-- is the input name with everything after its last dot replaced by the output
-- format; an input name without such a dot is used whole.
if not output_file then
    local stem = input_name:match("(.+)%.([^.]-)$")
    if not stem then
        stem = input_name
    end
    output_file = open_file(stem .. "." .. output_format, "write", "text")
end
------------------------
--- Image Processing ---
------------------------
-- Results shared by both back ends: the four box coordinates, the page count,
-- the image type, and the PDF version that is reported in the output.
local x_min, y_min, x_max, y_max
local num_pages, image_type
local pdf_major_version, pdf_minor_version

if open_image then
    -- "img" back end ("open_image" is false when the scanner is unavailable).

    -- Check the number of pages. This first scan passes no page or box.
    local image = open_image(input_name)
    -- Report a failed scan through the script's own error handler (which
    -- honours "-q") instead of failing on the field access below.
    if not image then
        error("Cannot open image:", input_name)
    end

    num_pages = image.pages
    -- Same sanity check as in the "pdfe" branch, so that the comparison below
    -- never sees a non-number.
    if type(num_pages) ~= "number" then
        error("Invalid number of pages:", num_pages)
    end
    if page_number > num_pages then
        error("Invalid page number:", page_number)
    end

    -- Open the image to the specified page and bounding box. If the requested
    -- bounding box is not available, LuaTeX will fall back to the crop box
    -- or the media box.
    image = open_image(input_name, page_number, bbox_order[1].img)
    if not image then
        error("Cannot open image:", input_name)
    end

    -- Get the image metadata.
    image_type = image.imagetype
    local bounding_box = image.bbox
    if not bounding_box then
        error("Cannot get bounding box:", page_number)
    end

    -- A missing or zero resolution is replaced by the format's default.
    local x_resolution = image.xres
    local y_resolution = image.yres
    if (x_resolution or 0) == 0 then
        x_resolution = default_dpi
    end
    if (y_resolution or 0) == 0 then
        y_resolution = default_dpi
    end

    -- Convert the bounding box to PostScript points. PDF dimensions are
    -- divided by BP_TO_SP; for every other image type the odd entries (1, 3)
    -- are scaled by the horizontal resolution and the even entries (2, 4) by
    -- the vertical resolution. Only existing entries are overwritten, which is
    -- allowed while iterating.
    for i, dimen in ipairs(bounding_box) do
        if image_type == "pdf" then
            dimen = dimen / BP_TO_SP
        else
            if i % 2 == 1 then
                dimen = dimen / x_resolution * IN_TO_BP
            else
                dimen = dimen / y_resolution * IN_TO_BP
            end
        end
        bounding_box[i] = dimen
    end

    -- Save the bounding box.
    x_min, y_min, x_max, y_max = unpack(bounding_box)

    -- We can't get the PDF version with the "img" library, so we'll just
    -- pretend that it's v1.5 (which supports most features).
    pdf_major_version = 1
    pdf_minor_version = 5
else
    -- Fallback to PDFs only.
    image_type = "pdf"
    local document = pdfe.open(input_name)
    if pdfe.getstatus(document) ~= 0 then
        error("Cannot open PDF file:", input_name)
    end

    -- Check the number of pages.
    num_pages = pdfe.getnofpages(document)
    if type(num_pages) ~= "number" then
        error("Invalid number of pages:", num_pages)
    end
    if page_number > num_pages then
        error("Invalid page number:", page_number)
    end

    -- Get the page.
    local page = pdfe.getpage(document, page_number)
    if not page then
        error("Cannot get page:", page_number)
    end

    -- Get the bounding box. Here, we check the boxes in the exact same order
    -- that "extractbb" does, and keep the first one that the page provides.
    local bounding_box
    for _, bbox in ipairs(bbox_order) do
        bounding_box = pdfe.getbox(page, bbox.pdfe)
        if bounding_box then
            break
        end
    end
    if not bounding_box then
        error("Cannot get bounding box:", page_number)
    end

    -- Save the bounding box.
    x_min, y_min, x_max, y_max = unpack(bounding_box)

    -- Get the PDF version.
    pdf_major_version, pdf_minor_version = pdfe.getversion(document)
end
-- Validate the bounding box.
-- All four coordinates must be numbers. The list is walked by explicit index
-- because "ipairs" stops at the first nil entry, which would let a missing
-- coordinate (and everything after it) escape the check.
local corners = { x_min, y_min, x_max, y_max }
for index = 1, 4 do
    if type(corners[index]) ~= "number" then
        error("Invalid bounding box:", x_min, y_min, x_max, y_max)
    end
end
--------------
--- Output ---
--------------
-- Get the output fields and values.
local lines = {}

-- Formats one field and appends it to "lines". The string method is used
-- because the sandbox does not expose the "string" table.
local function add_field(template, ...)
    insert(lines, template:format(...))
end

add_field("Title: %s", input_name)
add_field("Creator: %s", version)
add_field(
    "BoundingBox: %d %d %d %d",
    round(x_min), round(y_min), round(x_max), round(y_max)
)

-- The high-resolution box is written for the "xbb" format only, and the PDF
-- version and page count only when the image is a PDF.
if output_format == "xbb" then
    add_field(
        "HiResBoundingBox: %0.6f %0.6f %0.6f %0.6f",
        x_min, y_min, x_max, y_max
    )
    if image_type == "pdf" then
        add_field("PDFVersion: %d.%d", pdf_major_version, pdf_minor_version)
        add_field("Pages: %d", num_pages)
    end
end

add_field("CreationDate: %s", os.date(DATE_FORMAT, SOURCE_DATE_EPOCH))

-- Create the output text: every field is prefixed with two percent signs and
-- ends with a newline, and one empty line closes the block.
local pieces = {}
for index, line in ipairs(lines) do
    pieces[index] = "%%" .. line .. "\n"
end
local text = table.concat(pieces) .. "\n"

-- Write the output text.
output_file:write(text)
output_file:close()

-- Everything is done, so now we can exit.
os.exit(0)