#!/usr/bin/awk -f
# xml2tsv.awk version 8 by Ben Collver <[email protected]>
# reads XML from stdin and writes TSV to stdout
# append(str): add one text fragment to the global accumulator `text`.
# Empty fragments are ignored.  The first fragment is stored as-is; every
# later one is joined to what is already there with `eolstr` in between.
function append(str) {
    if (length(str) == 0) {
        return
    }
    text = (length(text) == 0) ? str : (text eolstr str)
}
# hex_decode_print(str): decode a string of two-digit hexadecimal pairs and
# write the decoded characters to stdout (no trailing newline is added).
#
#   str  hex digits, consumed two at a time from the left
#   d    local scratch: the current pair, folded to upper case
#
# Pair 09 (tab) is written as `tabstr` and pair 0A (line feed) as `eolstr`
# instead of the raw character.  Any other pair is looked up in
# hex_decode_ord[], which hex_init() fills with upper-case keys 00..FF.
function hex_decode_print(str,    d) {
    while (length(str) > 0) {
        # Lower-case a-f are valid hex digits too, but the table is keyed in
        # upper case, so fold the pair before comparing or looking it up.
        d = toupper(substr(str, 1, 2))
        str = substr(str, 3)
        if (d == "09") {
            printf "%s", tabstr
        } else if (d == "0A") {
            printf "%s", eolstr
        } else if (d in hex_decode_ord) {
            printf "%c", hex_decode_ord[d]
        }
        # A pair that is not in the table (not valid hex, or a dangling
        # single digit) is skipped; testing with `in` also avoids creating
        # an empty table entry for it.
    }
    return
}
# hex_init(): fill the global table hex_decode_ord[] so that every two-digit
# upper-case hex string "00".."FF" maps to its numeric value 0..255.
#
#   code, key  local scratch variables
function hex_init(    code, key) {
    code = 0
    while (code <= 255) {
        key = sprintf("%02X", code)
        hex_decode_ord[key] = code
        code++
    }
}
# normalize(str): return a copy of str made safe for a single TSV cell.
#
#   - leading and trailing runs of space, CR and LF are removed
#   - every remaining line break (CRLF, lone CR, or LF) becomes `eolstr`
#   - every tab becomes `tabstr`
#
# awk passes scalars by value, so str is edited in place here without
# affecting the caller and no global scratch variable is needed.
function normalize(str) {
    gsub(/^[\n\r ][\n\r ]*/, "", str)
    gsub(/[\n\r ]*[\n\r ]$/, "", str)
    # Handle CRLF first so that it yields one eolstr rather than two, then
    # pick up any CR or LF that stands alone.
    gsub(/\r\n/, eolstr, str)
    gsub(/[\r\n]/, eolstr, str)
    gsub(/\t/, tabstr, str)
    return str
}
# trimleft(str): return str without its leading white space.
#
# All four XML white-space characters (space, tab, CR, LF) are stripped, so
# attributes separated by tabs or by CRLF line breaks are handled the same
# way as ones separated by spaces.  str is a by-value copy, so it is edited
# directly and no global scratch variable is touched.
function trimleft(str) {
    sub(/^[ \t\r\n][ \t\r\n]*/, "", str)
    return str
}
# Set up the parser state before any input is read.
BEGIN {
    # Input records end at ">", i.e. wherever a tag ends.
    RS = ">"

    # Escape sequences written to the TSV in place of a real tab / line break.
    tabstr = "\\t"
    eolstr = "\\n"

    # Parser state: text collected for the current element, the path of the
    # currently open elements, and the awk:cdata hexBinary flag.
    text = ""
    path = ""
    is_hex = 0

    hex_init()
}
# A record that starts with "<?xml" is the XML declaration: skip it.
$0 ~ /^<\?xml/ { next }
# A record holding the awk namespace declaration element produces no output.
$0 ~ /<awk:ok xmlns:awk=/ { next }
# Close tag.  RS has already removed the ">", so the record looks like
# "preceding text</name".  The rule flushes the text collected for the
# element as a "path/text()" row, pops the element from `path`, and writes a
# "path<TAB>" row for the path that was current when the tag was seen.
# Diagnostics go to stderr so they never mix with the TSV on stdout; the
# script then exits with status 1.
/<\// {
    # Remember the path before it is popped; the final row below uses it.
    closed_path = path
    count = split($0, tokens, /[<]/)
    if (count != 2) {
        print("Error: Apparently malformed close tag: " $0 ">") > "/dev/stderr"
        exit(1)
    }
    str = normalize(tokens[1])
    append(str)
    # Drop the leading "/" and any white space XML permits before the ">".
    name = substr(tokens[2], 2)
    sub(/[ \t\r\n][ \t\r\n]*$/, "", name)
    if (name == "awk:cdata") {
        # Text inside awk:cdata is hex encoded: decode it while printing.
        # The surrounding element stays open, so `path` is left alone.
        if (length(text) > 0) {
            printf "%s/text()\t", path
            hex_decode_print(text)
            print ""
        }
        is_hex = 0
    } else {
        namelen = length(name)
        pathlen = length(path)
        # The innermost open element is whatever follows the last "/" in
        # path.  Comparing only the trailing namelen characters would accept
        # "</bar>" as the close of "<foobar>" and then cut path mid-name.
        expected = path
        sub(/^.*\//, "", expected)
        if (name != expected) {
            printf("Error: Expected \"%s\" closing tag, got \"%s\"\n",
                   expected, name) > "/dev/stderr"
            exit(1)
        }
        if (length(text) > 0) {
            printf "%s/text()\t%s\n", path, text
        }
        # Pop "/name" from the end of the path.
        path = substr(path, 1, pathlen - namelen - 1)
    }
    # NOTE(review): this row is also written after a closing awk:cdata, where
    # closed_path is the still-open parent -- presumably what the consumer of
    # the TSV expects; confirm.
    printf "%s\t\n", closed_path
    text = ""
    next
}
# Open (or self-closing) tag.  RS has already removed the ">", so the record
# looks like "preceding text<name attr="value" ...".  The rule pushes the
# element name onto `path`, writes one "path[@attr]<TAB>value" row per
# attribute and, for a self-closing tag, writes the "path<TAB>" row and pops
# the element again.  CDATA sections are added to the collected text, DOCTYPE
# is ignored, and an awk:cdata hexBinary tag only sets is_hex.
# Diagnostics go to stderr and the script exits with status 1.
/</ {
    is_open = 1
    count = split($0, tokens, /</)
    if (count != 2) {
        print("Error: Apparently malformed open tag: " $0 ">") > "/dev/stderr"
        exit(1)
    }
    # Text in front of the tag belongs to the element that is already open.
    str = normalize(tokens[1])
    append(str)
    str = tokens[2]
    if (str ~ /^!\[CDATA\[/) {
        # Skip the 8 characters of "![CDATA[" and drop the trailing "]]".
        str = normalize(substr(str, 9, length(str) - 10))
        append(str)
        next
    } else if (str ~ /^!DOCTYPE /) {
        # ignore DOCTYPE
        next
    }
    if (match(str, /[ \t\r\n]*\/$/)) {
        # discard slash (and white space before it) from self-closing tag
        str = substr(str, 1, length(str) - RLENGTH)
        is_open = 0
    }
    if (match(str, /^awk:cdata type="awk:hexBinary"/)) {
        is_hex = 1
        next
    }
    # The element name runs up to the first XML white-space character.
    if (!match(str, /^[^ \t\r\n][^ \t\r\n]*/)) {
        print("Error: Apparently malformed open tag: " $0 ">") > "/dev/stderr"
        exit(1)
    }
    name = substr(str, 1, RLENGTH)
    str = substr(str, RLENGTH+1)
    # Trim before testing the loop condition so that "<a >" is an element
    # without attributes rather than a malformed attribute.
    sub(/^[ \t\r\n][ \t\r\n]*/, "", str)
    oldpath = path
    path = path "/" name
    while (length(str) > 0) {
        if (match(str, /^[^=]*="[^"]*"/) || match(str, /^[^=]*='[^']*'/)) {
            # Quoted value; XML allows either quote character.
            quoted = 1
        } else if (match(str, /^[^=]*=[^ \t\r\n]*/)) {
            # Unquoted value: runs up to the next white space.
            quoted = 0
        } else {
            printf("Error: Apparently malformed attribute: \"%s\"\n", str) > "/dev/stderr"
            exit(1)
        }
        pair = substr(str, 1, RLENGTH)
        str = substr(str, RLENGTH+1)
        sub(/^[ \t\r\n][ \t\r\n]*/, "", str)
        # Split at the FIRST "=" only, so a value such as "a?b=c" stays whole.
        eq = index(pair, "=")
        attr = substr(pair, 1, eq - 1)
        value = substr(pair, eq + 1)
        if (quoted) {
            # Strip the surrounding quote characters.
            value = substr(value, 2, length(value) - 2)
        }
        printf "%s[@%s]\t%s\n", path, attr, value
    }
    if (!is_open) {
        # Self-closing: write the end-of-element row and pop immediately.
        printf "%s\t\n", path
        path = oldpath
    }
    next
}
# Every record that reaches this point contains no "<" (the rules above all
# end with `next`), so it is plain text: normalize it and add it to the
# text collected for the current element.
{
    append(str = normalize($0))
}