#!/usr/bin/awk -f
# xml2tsv.awk version 8 by Ben Collver <[email protected]>
# reads XML from stdin and writes TSV to stdout

# Add a text fragment to the global "text" buffer.
#   str - fragment to add; an empty fragment is ignored
# Fragments after the first are joined to the buffer with the global
# "eolstr" separator.
function append(str) {
    if (length(str) == 0) {
        return
    }
    text = (length(text) == 0) ? str : (text eolstr str)
}

# Decode a string of two-character hex pairs and print the decoded
# characters to stdout, without a trailing newline.
#   str - hex digits, consumed two characters at a time
#   d   - local: the current pair, upper-cased
# TAB (09) and LF (0A) are written as the global "tabstr" / "eolstr"
# escape sequences instead of the raw characters.
# Hex digits are accepted in either case; the lookup table
# hex_decode_ord is keyed by upper-case pairs, so each pair is
# upper-cased before the lookup.
# NOTE(review): some awk implementations may treat a numeric %c argument
# above 127 as a code point rather than a byte in multibyte locales;
# confirm whether byte-exact output is required.
function hex_decode_print(str,     d) {
    while (length(str) > 0) {
        d = toupper(substr(str, 1, 2))
        str = substr(str, 3)
        if (d == "09") {
            printf "%s", tabstr
        } else if (d == "0A") {
            printf "%s", eolstr
        } else if (d in hex_decode_ord) {
            printf "%c", hex_decode_ord[d]
        }
        # Pairs that are not in the table (non-hex characters, or a
        # trailing odd digit) are skipped: looking them up would create
        # an empty array element and print an undefined value.
    }
    return
}

# Populate the global lookup table hex_decode_ord, mapping every
# two-digit upper-case hex string ("00" .. "FF") to its numeric value.
#   code, key - locals
function hex_init(     code, key) {
    code = 0
    while (code <= 255) {
        key = sprintf("%02X", code)
        hex_decode_ord[key] = code
        code++
    }
}

# Prepare a chunk of element text for a TSV field.
#   str    - raw text
#   retval - local: working copy
# Returns str with leading and trailing newlines, carriage returns and
# spaces removed, every remaining line break (CRLF, LF or lone CR)
# replaced by the global "eolstr" escape, and every TAB replaced by the
# global "tabstr" escape.  Leading and trailing TABs are kept and
# escaped, not trimmed.
function normalize(str,     retval) {
    retval = str
    sub(/^[\n\r ][\n\r ]*/, "", retval)
    sub(/[\n\r ]*[\n\r ]$/, "", retval)
    # CRLF must be handled before the single-character forms so that it
    # becomes one escape rather than two.
    gsub(/\r\n/, eolstr, retval)
    gsub(/\n/, eolstr, retval)
    # A lone CR is a line break too; leaving it raw would put a control
    # character into the output field.
    gsub(/\r/, eolstr, retval)
    gsub(/\t/, tabstr, retval)
    return retval
}

# Return str with leading whitespace removed.
#   str    - input string
#   retval - local: working copy
# Space, newline, carriage return and TAB are all trimmed, so tags that
# span lines in CRLF-formatted or TAB-indented input are handled the
# same way as space/LF-formatted input.
function trimleft(str,     retval) {
    retval = str
    sub(/^[\n\r\t ][\n\r\t ]*/, "", retval)
    return retval
}

BEGIN {
    # Records end at ">", so each record is "text<tag-body".
    RS = ">"

    # Escape sequences written in place of raw newline / tab characters.
    eolstr = "\\n"
    tabstr = "\\t"

    # Parser state: current element path, pending element text, and the
    # hex-encoded CDATA flag.
    path = ""
    text = ""
    is_hex = 0

    # Build the hex-pair lookup table.
    hex_init()
}

# Skip a record that starts with the XML declaration ("<?xml").
$0 ~ /^<\?xml/ {
    next
}

# Skip the record holding the awk namespace declaration element.
$0 ~ /<awk:ok xmlns:awk=/ {
    next
}

/<\// {
    # close tag
    #
    # The record is "text</name" (RS strips the ">").  Any text before
    # the tag is added to the pending text, the pending text is emitted
    # as "<path>/text()", the element is popped from the path, and a
    # "<path><TAB>" line is written for the path as it was on entry.
    closed_path = path
    count = split($0, tokens, /[<]/)
    if (count != 2) {
        print "Error: Apparently malformed close tag: " $0 ">"
        exit(1)
    }
    str = normalize(tokens[1])
    append(str)
    # drop the leading "/" of "/name"
    name = substr(tokens[2], 2)
    # XML allows whitespace between the name and ">" in an end tag
    sub(/[\n\r\t ][\n\r\t ]*$/, "", name)
    if (name == "awk:cdata") {
        # the pending text is hex-encoded; decode it while printing
        if (length(text) > 0) {
            printf "%s/text()\t", path
            hex_decode_print(text)
            print ""
        }
        is_hex = 0
    } else {
        namelen = length(name)
        pathlen = length(path)
        # The path must end in "/" name.  Comparing the separator as
        # well stops "</bar>" from being accepted as the close of
        # "<foobar>", which would also corrupt the path below.
        if (pathlen <= namelen ||
            substr(path, pathlen - namelen) != "/" name) {
            # report the innermost open element as the expected name
            expected = path
            sub(/^.*\//, "", expected)
            printf "Error: Expected \"%s\" closing tag, got \"%s\"\n", expected, name
            exit(1)
        }
        if (length(text) > 0) {
            printf "%s/text()\t%s\n", path, text
        }
        # pop "/" name from the path
        path = substr(path, 1, pathlen - namelen - 1)
    }
    printf "%s\t\n", closed_path
    text = ""
    next
}

/</ {
    # open tag (also CDATA sections, DOCTYPE and self-closing tags)
    #
    # The record is "text<tag-body" (RS strips the ">").  Any text
    # before the tag is added to the pending text, the element name is
    # pushed onto the path, and one "<path>[@attr]<TAB>value" line is
    # written per attribute.
    is_open = 1
    count = split($0, tokens, /</)
    if (count != 2) {
        print "Error: Apparently malformed open tag: " $0 ">"
        exit(1)
    }
    str = normalize(tokens[1])
    append(str)
    str = tokens[2]
    if (str ~ /^!\[CDATA\[/) {
        # keep what lies between the 8-character "![CDATA[" prefix and
        # the 2-character "]]" suffix
        str = normalize(substr(str, 9, length(str) - 10))
        append(str)
        next
    } else if (str ~ /^!DOCTYPE /) {
        # ignore DOCTYPE
        next
    }
    if (match(str, /[\n ]*\/$/)) {
        # discard slash from self-closing tag
        str = substr(str, 1, length(str) - RLENGTH)
        is_open = 0
    }
    if (match(str, /^awk:cdata type="awk:hexBinary"/)) {
        # content up to the matching close tag is hex-encoded
        is_hex = 1
        next
    }
    # element name: everything up to the first space or newline
    match(str, /^[^\n ][^\n ]*/)
    name = substr(str, 1, RLENGTH)
    # Trim before the loop so that a tag with only whitespace after its
    # name ("<a >") is not reported as a malformed attribute.
    str = trimleft(substr(str, RLENGTH+1))

    oldpath = path
    path = path "/" name
    while (length(str) > 0) {
        if (match(str, /^[^=]*="[^"]*"/) || match(str, /^[^=]*='[^']*'/)) {
            # value in double or single quotes
            quoted = 1
        } else if (match(str, /^[^=]*=[^\n ]*/)) {
            # unquoted value, ends at the next space or newline
            quoted = 0
        } else {
            printf "Error: Apparently malformed attribute: \"%s\"\n", str
            exit(1)
        }
        pair = substr(str, 1, RLENGTH)
        str = trimleft(substr(str, RLENGTH+1))
        # Split at the FIRST "=" only, so values that contain "="
        # themselves (e.g. URLs with a query string) stay intact.
        eq = index(pair, "=")
        attr = substr(pair, 1, eq - 1)
        value = substr(pair, eq + 1)
        if (quoted) {
            # strip the surrounding quote characters
            value = substr(value, 2, length(value) - 2)
        }
        printf "%s[@%s]\t%s\n", path, attr, value
    }
    if (!is_open) {
        # self-closing tag: emit the close line and pop immediately
        printf "%s\t\n", path
        path = oldpath
    }
    next
}

# Any record not handled by an earlier rule contains no tag: treat it
# as plain text and add it to the pending text buffer.
{
    append(str = normalize($0))
}