#!/usr/bin/env python
# escapeTeXt.py
# Convert text to something LaTeX is less likely to choke on.
# 2006-Aug-24 Jim Hefferon Written. Based on stuff from Tristan Miller and
# Allin Cottrell
# Like txt2latex but beyond being callable from the command line
# ./escapeTeXt.py < file1.txt > file1.ltx
# it is callable from a program as a function.
# import escapeTeXt
# result=escapeTeXt.escapetext(string).output()
# TODO
# * at sign?
# * copyright, trademark?
# * allow conversion of "_Great Gatsby_" to "\textit{Great Gatsby}"
# * allow conversion of "I *really* do" to "I \emph{really} do"
# * add refinement of recognizing fractions "2/3" ==> "$2/3$"?
import sys, re
from getopt import getopt, GetoptError
DEBUG=False # currently unused

# StandardError exists only on Python 2; fall back to Exception elsewhere so
# that importing this module does not fail with a NameError.
try:
    _escapetextErrorBase=StandardError
except NameError:
    _escapetextErrorBase=Exception

class escapetextError(_escapetextErrorBase): # currently unused
    """Exception type reserved for errors raised by this module."""
    pass
class escapetext(object):
    """Turn plain text into strings that have a chance of making it through
    LaTeX.

    Text is collected through the constructor and feed(); output() converts
    everything collected so far, returns it, and empties the buffer.  The
    open/closed state of single and double quotes is kept on the instance,
    so it carries over from one fed piece (and one output() call) to the
    next unless output(restartCounts=True) is used.
    """

    # Patterns used only when refinements are switched on.  Raw strings keep
    # the regex escapes away from Python's own string-escape processing.
    pageNumberPattern=r"(\s)(p\.)(\s)(\d)" # like ' p. 23'; the dot is literal
    pageNumberRE=re.compile(pageNumberPattern)
    enDashPattern=r"(\d)-(\d)" # like '7-9'
    enDashRE=re.compile(enDashPattern)
    emDashPattern=r"((\D)-{1,2}(\D))" # like 'but-no'
    emDashRE=re.compile(emDashPattern)

    # Ordered (old,new) replacements.  Order matters: the backslash must be
    # handled before the braces, and the fourth entry undoes what the brace
    # escapes did to the braces introduced by the first entry.  Entries whose
    # replacement contains braces must come after the brace entries.
    _simpleSubstitutions=(
        (u"\\",u"{\\textbackslash}"), # backslashes
        (u"{",u"\\{"), # open curly braces
        (u"}",u"\\}"), # close curly braces
        (u"\\{\\textbackslash\\}",u"{\\textbackslash}"), # repair first entry's result
        (u"$",u"\\$"), # dollar signs
        (u"%",u"\\%"), # percent signs
        (u"_",u"\\_"), # underscores
        (u"&",u"\\&"), # ampersands
        (u"#",u"\\#"), # sharp signs
        (u"~",u"{\\textasciitilde}"), # tildes (otherwise a non-breaking space)
        (u"^",u"{\\textasciicircum}"), # carets (otherwise an error in text mode)
        (u"<",u"{\\textless}"), # less than
        (u">",u"{\\textgreater}"), # greater than
        )

    def __init__(self,txt='',countQuotes=True,refinements=False):
        """Initialize instance.  Build up the text in parts (say, as lines
        from the input source) with this initializing, and using the feed()
        routine.  Then we dump it with output().
          txt=''  Text to start with
          countQuotes=True  Should we try to match open with closed quotes?
          refinements=False  Should we try to get fancy?  (Try to fix
            en dashes, em dashes, the 'p. 2' construct.)
        """
        self.countQuotes=countQuotes
        self.refinements=refinements
        self.singleQuoteCount=0 # number of ' seen so far; even means next one opens
        self.doubleQuoteCount=0 # number of " seen so far; even means next one opens
        self.strings=[txt] # array of strings that we have been fed

    def feed(self,txt):
        """Add more text to the pile.
          txt  Text to add
        """
        self.strings.append(txt)

    def _balanceQuotes(self,s):
        """Return s with straight quotes turned into LaTeX open/close quotes.

        Every ' and every " is counted on the instance: even-numbered
        occurrences (counting from zero) become opening quotes, odd-numbered
        ones closing quotes.  A thin space is inserted between directly
        adjacent double and single quotes.  Note that every apostrophe is
        counted as a single quote.
          s  String whose simple substitutions have already been done.
        """
        pieces=[]
        priorC=None # previous input character, for the thin-space rule
        for c in s:
            if c=="'":
                if (self.singleQuoteCount % 2)==0:
                    if priorC=='"': # open double quote followed by open single quote
                        pieces.append(u"\\,") # add a thinspace
                    pieces.append(u"`") # open single quote
                else:
                    pieces.append(u"'") # close single quote
                self.singleQuoteCount+=1
            elif c=='"':
                if (self.doubleQuoteCount % 2)==0:
                    pieces.append(u"``") # open double quotes
                else:
                    if priorC=="'": # single close quote followed by double close quote
                        pieces.append(u"\\,") # add a thinspace
                    pieces.append(u"''") # close double quotes
                self.doubleQuoteCount+=1
            else:
                pieces.append(c)
            priorC=c
        return u"".join(pieces)

    def texifyString(self,s):
        """Convert a string to a form more acceptable to LaTeX.
          s  Plain text string.
        Returns the converted string.  When countQuotes is on this updates
        the instance's quote counters.
        """
        # Do simple substitutions
        for (old,new) in self._simpleSubstitutions:
            s=s.replace(old,new)
        if self.countQuotes: # make even-numbered instances open, odd closed
            s=self._balanceQuotes(s)
        else:
            # The backslash is doubled on purpose: a bare "\v" in a string
            # literal is a vertical tab, not the start of "\verb".
            # NOTE(review): the first replacement searches for a backtick but
            # emits an apostrophe; confirm whether the search key was meant
            # to be "'" instead.
            s=s.replace(u'`',u"\\verb!'!") # maybe u"\textquotesingle" if you \usepackage{textcomp}?
            s=s.replace(u'"',u'\\verb!"!') # maybe u"$^\\textquotestraightdblbase$" if you \usepackage{textcomp}?
        if self.refinements:
            s=escapetext.pageNumberRE.sub(r'\1\2\,\4',s) # replace " p. 2" with " p.\,2"
            s=escapetext.enDashRE.sub(r'\1--\2',s) # replace "2-3" with "2--3"
            s=escapetext.emDashRE.sub(r'\2---\3',s) # replace "but -- no" with "but --- no"
        return s

    def output(self,restartCounts=False):
        """Return a string that has the parts escaped.  That clears the
        internal buffer.
          restartCounts=False  Reset the counters for whether a single or
             double quote is opening or closed
        """
        r=[self.texifyString(part) for part in self.strings]
        self.strings=[]
        if restartCounts:
            self.singleQuoteCount=0
            self.doubleQuoteCount=0
        return "".join(r)
#............... script main body
if __name__=='__main__':
    # Command-line entry point: read plain text, write LaTeX-safe text.
    # Defaults for the options.
    inputFilename=None
    outputFilename=None
    countQuotes=True
    refinements=True
    latexWrap=False
    verbose=False # currently unused
    # Minimal document wrapper written around the converted text by -l.
    latexHead="\\documentclass{article}\n\\begin{document}\n"
    latexFoot="\\end{document}\n"
    usage="""%s: Convert plain text so it may make it through LaTeX
%s [options]
where the options are
-f filename (default %s) file to read from; if None then stdin is used
-o filename (default %s) file to write to; if None then stdout is used
-c Turn off the attempt to balance open and closed quotes
-r Turn off the refinements (em dashes, en dashes ..)
-l Add simple LaTeX header and footer
-v (default %s) sets verbose output
--help or -? Give this usage statement""" % (sys.argv[0],sys.argv[0],repr(inputFilename),repr(outputFilename),repr(verbose))
    shortOptions='f:o:lcr?v'
    longOptions=['help']
    # parse args
    try:
        (opts,args_proper)=getopt(sys.argv[1:],shortOptions,longOptions)
    except GetoptError as err:
        # Diagnostics go to stderr: stdout is the default destination of the
        # converted text and must not be mixed with error messages.
        sys.stderr.write("ERROR: Unable to parse the command line arguments: %s\n" % (err,))
        sys.stderr.write(usage+"\n")
        sys.exit(1)
    for (option,parameter) in opts:
        if option=='-f':
            inputFilename=parameter
        elif option=='-o':
            outputFilename=parameter
        elif option=='-c':
            countQuotes=False
        elif option=='-r':
            refinements=False
        elif option=='-l':
            latexWrap=True
        elif option=='-v':
            verbose=True
        elif option in ('-?','--help'):
            sys.stdout.write(usage+"\n")
            sys.exit(0)
        else:
            sys.stderr.write("Unknown option: %s\n" % (option,))
            sys.exit(2)
    # Done getting options; now the logic
    inFile=sys.stdin
    outFile=sys.stdout
    try:
        if inputFilename is not None:
            inFile=open(inputFilename,'r')
        if outputFilename is not None:
            outFile=open(outputFilename,'w')
        et=escapetext(countQuotes=countQuotes,refinements=refinements)
        if latexWrap:
            outFile.write(latexHead)
        # readline() rather than file iteration keeps the line-at-a-time
        # behaviour when reading from an interactive stdin.
        for line in iter(inFile.readline,''):
            et.feed(line)
            outFile.write(et.output())
        if latexWrap:
            outFile.write(latexFoot)
    finally:
        # Close only what was opened here; leave the standard streams alone.
        if inFile is not sys.stdin:
            inFile.close()
        if outFile is not sys.stdout:
            outFile.close()