#!/usr/bin/env python
# escapeTeXt.py
#  Convert text to something LaTeX is less likely to choke on.
# 2006-Aug-24 Jim Hefferon  Written.  Based on stuff from Tristan Miller and
#  Allin Cottrell

# Like txt2latex, this can be run from the command line
#  ./escapeTeXt.py < file1.txt > file1.ltx
# but it can also be called as a function from another program:
#  import escapeTeXt
#  result=escapeTeXt.escapetext(string).output()

# TODO
# * at sign?
# * copyright, trademark?
# * allow conversion of "_Great Gatsby_" to "\textit{Great Gatsby}"
# * allow conversion of "I *really* do" to "I \emph{really} do"
# * add refinement of recognizing fractions "2/3" ==> "$2/3$"?

import sys, re
from getopt import getopt, GetoptError

DEBUG=False  # debug switch; currently unused (nothing in this file reads it)

# StandardError exists only in Python 2.  Fall back to Exception elsewhere so
# that merely defining the class does not raise NameError, while keeping the
# original base class (and hence the original 'except StandardError'
# behaviour) where it is available.
try:
    _escapetextErrorBase=StandardError
except NameError:
    _escapetextErrorBase=Exception

class escapetextError(_escapetextErrorBase): # currently unused
    """Error type reserved for this module; nothing raises it yet."""
    pass

class escapetext(object):
    """Turn plain text into strings that have a chance of making it through
    LaTeX.

    Text is collected by the constructor and by feed(), and handed back in
    escaped form by output().  The open/closed state of single and double
    quotes is kept on the instance, so a quotation may start in one fed
    piece and end in a later one.
    """

    # Literal replacements, applied in this order.  Order matters: the
    # backslash must go first (every later replacement introduces
    # backslashes), and the fourth entry repairs the braces of
    # '{\textbackslash}' that the second and third entries have just escaped.
    _simpleSubstitutions=(
        (u"\\",u"{\\textbackslash}"),  # backslashes
        (u"{",u"\\{"),                 # open curly braces
        (u"}",u"\\}"),                 # close curly braces
        (u"\\{\\textbackslash\\}",u"{\\textbackslash}"),  # undo the above
        (u"$",u"\\$"),                 # dollar signs
        (u"%",u"\\%"),                 # percent signs
        (u"_",u"\\_"),                 # underscores
        (u"&",u"\\&"),                 # ampersands
        (u"#",u"\\#"),                 # sharp signs
        (u"<",u"{\\textless}"),        # less than
        (u">",u"{\\textgreater}"),     # greater than
        )

    # Patterns for the optional refinements.  They are raw strings so that
    # \s and \d reach the regex engine instead of being treated as (invalid)
    # Python string escapes.
    pageNumberPattern=r"(\s)(p\.)(\s)(\d)"  # like ' p. 23'; the dot is literal
    pageNumberRE=re.compile(pageNumberPattern)
    enDashPattern=r"(\d)-(\d)"  # like '7-9'
    enDashRE=re.compile(enDashPattern)
    # One or two hyphens between two characters that are neither digits nor
    # hyphens, like 'but-no' or 'but -- no'.  Excluding hyphens from the
    # neighbours leaves an existing '---' alone instead of lengthening it.
    emDashPattern=r"(([^\d-])-{1,2}([^\d-]))"
    emDashRE=re.compile(emDashPattern)

    def __init__(self,txt='',countQuotes=True,refinements=False):
        """Initialize instance.  Build up the text in parts (say, as lines
        from the input source) with this initializing, and using the feed()
        routine.  Then we dump it with output().
          txt=''  Text to start with
          countQuotes=True  Should we try to match open with closed quotes?
          refinements=False  Should we try to get fancy?  (Try to fix
            en dashes, em dashes, the 'p. 2' construct.)
        """
        self.countQuotes=countQuotes
        self.refinements=refinements
        # Number of single/double quote characters seen so far; an even
        # count means the next one opens a quotation, odd means it closes.
        self.singleQuoteCount=0
        self.doubleQuoteCount=0
        self.strings=[txt]  # array of strings that we have been fed

    def feed(self,txt):
        """Add more text to the pile.
          txt   Text to add
        """
        self.strings.append(txt)

    def _balanceQuotes(self,s):
        """Replace straight quotes with LaTeX opening and closing quotes.
          s  String in which the simple substitutions are already done.
        Returns the converted string.  Updates singleQuoteCount and
        doubleQuoteCount, which decide whether a quote opens or closes.
        """
        parts=[]  # collected pieces; joined once at the end
        priorC=None
        for c in s:
            if c=="'":
                if (self.singleQuoteCount % 2)==0:
                    if priorC=='"':  # open double quote followed by open single quote
                        parts.append(u"\\,")  # add a thinspace
                    parts.append(u"`")  # open single quote
                else:
                    parts.append(u"'")  # close single quote
                self.singleQuoteCount+=1
            elif c=='"':
                if (self.doubleQuoteCount % 2)==0:
                    parts.append(u"``")  # open double quotes
                else:
                    if priorC=="'":  # single close quote followed by double close quote
                        parts.append(u"\\,")  # add a thinspace
                    parts.append(u"''")  # close double quotes
                self.doubleQuoteCount+=1
            else:
                parts.append(c)
            priorC=c
        return "".join(parts)

    def texifyString(self,s):
        """Convert a string to a form more acceptable to LaTeX.
          s  Plain text string.
        Returns the converted string.  When countQuotes is set this also
        advances the instance's quote counters.
        """
        # Do simple substitutions
        for (old,new) in self._simpleSubstitutions:
            s=s.replace(old,new)
        if self.countQuotes:  # make even-numbered instances open, odd closed
            s=self._balanceQuotes(s)
        else:
            # The backslash is written doubled: a bare "\v" in a Python string
            # literal is a vertical tab, which would corrupt the \verb command.
            # NOTE(review): the first replacement targets the backtick, not the
            # apostrophe; the textquotesingle hint suggests the apostrophe may
            # have been intended -- confirm before changing.
            s=s.replace(u'`',u"\\verb!'!") #  maybe u"\\textquotesingle" if you \usepackage{textcomp}?
            s=s.replace(u'"',u'\\verb!"!') # maybe u"$^\\textquotestraightdblbase$" if you \usepackage{textcomp}?
        if self.refinements:
            # Matches are non-overlapping, so in a chain such as '1-2-3' only
            # the first hyphen is converted.
            s=self.pageNumberRE.sub(r'\1\2\\,\4',s)  # replace " p. 2" with " p.\,2"
            s=self.enDashRE.sub(r'\1--\2',s)  # replace "2-3" with "2--3"
            s=self.emDashRE.sub(r'\2---\3',s)  # replace "but -- no" with "but --- no"
        return s

    def output(self,restartCounts=False):
        """Return a string that has the parts escaped.  That clears the
        internal buffer.
          restartCounts=False  Reset the counters for whether a single or
            double quote is opening or closed
        """
        # Build a real list first so every piece is converted (and the quote
        # counters advanced) before the counters are possibly reset below.
        pieces=[self.texifyString(part) for part in self.strings]
        self.strings=[]
        if restartCounts:
            self.singleQuoteCount=0
            self.doubleQuoteCount=0
        return "".join(pieces)

# Minimal LaTeX preamble and closing line, written around the converted
# text when the script is run with -l.
latexHead=(
    "\\documentclass{article}\n"
    "\\begin{document}\n"
)
latexFoot="\\end{document}"

#............... script main body
if __name__=='__main__':
    # Command line driver: read plain text from a file or stdin, escape it
    # line by line, and write the result to a file or stdout.

    # Defaults for the command line options
    inputFilename=None
    outputFilename=None
    countQuotes=True
    refinements=True
    latexWrap=False
    verbose=False

    usage="""%s: Convert plain text so it may make it through LaTeX
 %s [options]
where the options are
 -f filename (default %s) file to read from; if None then stdin is used
 -o filename (default %s) file to write to; if None then stdout is used
 -c Turn off the attempt to balance open and closed quotes
 -r Turn off the refinements (em dashes, en dashes ..)
 -l Add simple LaTeX header and footer
 -v (default %s) sets verbose output
 --help or -?  Give this usage statement""" % (sys.argv[0],sys.argv[0],repr(inputFilename),repr(outputFilename),repr(verbose))

    shortOptions='f:o:lcr?v'
    longOptions=['help']
    try:
        # Positional arguments (args_proper) are accepted but ignored.
        (opts,args_proper)=getopt(sys.argv[1:],shortOptions,longOptions)
    except GetoptError as err:
        # Diagnostics go to stderr so they never mix with converted text.
        sys.stderr.write("ERROR: Unable to parse the command line arguments: %s\n" % (err,))
        sys.stderr.write(usage+"\n")
        sys.exit(1)
    for (option,parameter) in opts:
        if option=='-f':
            inputFilename=parameter
        elif option=='-o':
            outputFilename=parameter
        elif option=='-c':
            countQuotes=False
        elif option=='-r':
            refinements=False
        elif option=='-l':
            latexWrap=True
        elif option=='-v':
            verbose=True
        elif option in ('-?','--help'):
            sys.stdout.write(usage+"\n")
            sys.exit(0)
        else:
            sys.stderr.write("Unknown option:  %s\n" % (option,))
            sys.exit(2)

    # Done getting options; now the logic
    inFile=sys.stdin
    outFile=sys.stdout
    try:
        if inputFilename is not None:
            inFile=open(inputFilename,'r')
        if outputFilename is not None:
            outFile=open(outputFilename,'w')

        et=escapetext(countQuotes=countQuotes,refinements=refinements)
        if latexWrap:
            outFile.write(latexHead)
        # readline() returns '' only at end of file; reading one line at a
        # time keeps the quote counters running across the whole input while
        # emitting output as soon as each line is available.
        for line in iter(inFile.readline,''):
            et.feed(line)
            outFile.write(et.output())
        if latexWrap:
            outFile.write(latexFoot)
    finally:
        # Close only what was opened here.  The standard streams are left
        # open so that later writes (such as the verbose message) still work.
        if inFile is not sys.stdin:
            inFile.close()
        if outFile is not sys.stdout:
            outFile.close()
        else:
            outFile.flush()

    if verbose:
        sys.stderr.write("%s: done\n" % (sys.argv[0],))