news.utdallas.edu!wupost!howland.reston.ans.net!agate!doc.ic.ac.uk!uknet!mcsun!sun4nl!mhres!pronto!news Tue Mar 9 09:38:59 CST 1993
Article: 1468 of comp.lang.perl
Xref: feenix.metronet.com comp.lang.perl:1468
Path: feenix.metronet.com!news.utdallas.edu!wupost!howland.reston.ans.net!agate!doc.ic.ac.uk!uknet!mcsun!sun4nl!mhres!pronto!news
From:
[email protected] (Johan Vromans)
Newsgroups: comp.lang.perl
#Subject: WordPerfect document scanner
Message-ID: <
[email protected]>
Date: 9 Mar 93 13:18:08 GMT
Sender:
[email protected] (USEnet News System)
Followup-To: alt.sources.d
Organization: Multihouse Automation, the Netherlands
Lines: 288
X-Md4-Signature: d90d6f205746a8854b154ba6d0177648
Nntp-Posting-Host: largo:(jv)
This program reads a WordPerfect document and outputs its contents in
a way suitable to study and analyze it.
It can be used as a driver to perform your own WP to whatever
conversions.
Disclaimer: this program is not supported, use at your own risk.
Johan
---- Cut Here and feed the following to sh ----
#!/bin/sh
# This is a shell archive (produced by shar 3.49)
# To extract the files from this archive, save it to a file, remove
# everything above the "!/bin/sh" line above, and type "sh file_name".
#
# made 03/01/1993 15:47 UTC by jv@largo
# Source directory /a/pronto/mozart/users/jv
#
# existing files will NOT be overwritten unless -c is specified
#
# This shar contains:
# length mode name
# ------ ---------- ------------------------------------------
# 6374 -rw-r--r-- wpscan.pl
#
# ============= wpscan.pl ==============
if test -f 'wpscan.pl' -a X"$1" != X"-c"; then
echo 'x - skipping wpscan.pl (File already exists)'
else
echo 'x - extracting wpscan.pl (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'wpscan.pl' &&
X#!/usr/local/bin/perl
X# wp2txt.pl -- convert WP document to MH-Doc format
X# SCCS Status : %Z%@ %M% %I%
X# Author : Johan Vromans
X# Created On : Mon Jul 13 21:23:19 1992
X# Last Modified By: Johan Vromans
X# Last Modified On: Mon Mar 1 16:43:01 1993
X# Update Count : 315
X# Status : Unknown, Use with caution!
X
X################ Common stuff ################
X
X# $LIBDIR = $ENV{"MH_DOCLIB"} || "/usr/local/lib/mh_doc";
X# unshift (@INC, $LIBDIR);
X# require "common.pl";
X$my_package = "PRONTO/MH-Doc V1.02";
X$my_version = "%I%";
X$my_name = "%M%";
X
X################ Program parameters ################
X
X$opt_start = $opt_end = 0;
X$opt_outline = 0;
X$opt_analyze = 1;
X$opt_short = 0;
X$opt_attr = 0;
X$opt_header = 1;
X$opt_verbose = 0;
X&options if $ARGV[0] =~ /^-/;
X
X################ Presets ################
X
X@ARGV = ("-") unless @ARGV;
Xopen (WPD, @ARGV[0]) || die ("$ARGV[0]: $!\n");
X&wpd_check;
X
Xif ( $opt_analyze ) {
X &analyze;
X}
Xelse {
X &convert;
X}
X
Xexit (0);
X
X################ Subroutines ################
X
X# Extract a two-byte (little endian) word from the input.
Xsub opc_word {
X shift (@opc) | (shift (@opc) << 8);
X}
X
X# Skip a number of bytes.
Xsub opc_skip {
X local ($dis) = @_;
X splice (@opc, 0, $dis);
X}
X
X# Verify validity of WP document.
Xsub wpd_check {
X
X local ($hdr) = '';
X
X sysread (WPD, $hdr, 16);
X local ($id, $std, $filetype) = unpack ("a4Vv", $hdr);
X
X if ($id eq "\xffWPC" &&
X $filetype == 0x0A01 ) {
X
X # Seek to start of document, and slurp it.
X $opt_start = $std unless $opt_start;
X seek (WPD, $opt_start, 0);
X local ($/) = undef;
X $wpd = <WPD>;
X $wpdptr = $opt_start;
X
X # Truncate if requested.
X $end = $wpdptr + length ($wpd) - 1;
X if ( $opt_end > $opt_start ) {
X substr ($wpd, $opt_end-$wpdptr) = '';
X }
X else {
X $opt_end = $end;
X }
X
X # Feedback
X if ( $opt_debug || $opt_analyze ) {
X printf STDERR ("Startdoc @ 0x%x (%d)%s\n", $std, $std,
X ($opt_start != $std || $opt_end != $end) ?
X sprintf (", range is 0x%x (%d) thru 0x%x (%d)",
X $opt_start, $opt_start,
X $opt_end-1, $opt_end-1) : '');
X }
X }
X else {
X die ("Not a WP document\n");
X }
X}
X
X# The main processors --
X
Xsub analyze {
X
X local ($opc); # opcode under examination
X local ($opc_type); # type (0 = string, 1 = bytes)
X local ($opc_len); # total length op opcode sequence
X local ($opc_char); # text, if $opc_type == 0
X local (@opc); # bytes, if $opc_type == 1
X
X while ( &fetch >= 0 ) { ; } # &fetch will do all
X}
X
Xsub options {
X
X require "newgetopt.pl";
X local ($opt_noheader) = 0;
X
X $opt_help = $opt_ident = 0;
X
X if ( ! &NGetOpt
X ("start=s", "end=s",
X "analyze", "short", "outline",
X "attr", "header", "noheader",
X "verbose", "ident", "quiet", "help", "debug")
X || $opt_help ) {
X print STDERR <<EndOfUsage;
XThis is $my_package [$my_name $my_version]
X
XUsage: $0 [options] file...
X
X options are:
X -output file destination
X -start XXX start conversion at address XXX
X -end XXX end conversion before address XXX
X -ident print program name and version
X -verbose verbose info
X -attr include traces of attributes
X -noheader suppress MH-Doc header
X -analyze analyze input
X -outline make outline
X -short with -analyze: shorten long opcodes
X -help this message
XEndOfUsage
X exit (1);
X }
X
X $opt_start = oct ($opt_start) if $opt_start =~ /^0/;
X $opt_end = oct ($opt_end) if $opt_end =~ /^0/;
X
X if ( defined $opt_output && $opt_output ne '' ) {
X open (STDOUT, ">$opt_output");
X }
X
X if ( $opt_analyze ) {
X $opt_analyze = 2 unless $opt_short;
X open (STDERR, ">&STDOUT");
X if ( -t STDERR ) {
X select (STDERR);
X $| = 1;
X }
X }
X
X $opt_header = !$opt_noheader;
X print STDERR ("This is $my_package [$my_name $my_version]") if $opt_ident;
X}
X
Xsub fetch {
X
X # Fetch the next opcode from the input.
X #
X # $opc_type will designate the type of opcode:
X # 0 -> sequence of ASCII characters, delivered in $opc_char.
X # 1 -> sequence of bytes, delevired in @opc.
X # $opc_len will hold the number of bytes involved.
X #
X # Return value is $opc_type, or -1 if exhausted.
X
X return -1 if $wpd eq '';
X
X if ( $wpd =~ /^[ -~]+/ ) {
X $opc_char = $&;
X $wpd = $';
X $opc_len = length ($opc_char);
X $opc_type = 0;
X printf STDERR ("%04x: %s\n", $wpdptr, $opc_char)
X if $opt_debug || $opt_analyze;
X $wpdptr += $opc_len;
X return $opc_type;
X }
X
X $opc = ord ($wpd);
X $opc_type = 1;
X $opc_len = -1;
X if ( $opc <= 037 || $opc == 0x7f ) { $opc_len = 1; }
X elsif ( $opc >= 0x80 && $opc <= 0xbf ) { $opc_len = 1; }
X elsif ( $opc == 0xc0 ) { $opc_len = 4; }
X elsif ( $opc == 0xc1 ) { $opc_len = 9; }
X elsif ( $opc == 0xc2 ) { $opc_len = 11; }
X elsif ( $opc == 0xc3 || $opc == 0xc4 ) { $opc_len = 3; }
X elsif ( $opc == 0xc5 ) { $opc_len = 5; }
X elsif ( $opc == 0xc6 ) { $opc_len = 6; }
X elsif ( $opc == 0xc7 ) { $opc_len = 7; }
X elsif ( $opc >= 0xc8 && $opc <= 0xcf ) {
X printf STDERR ("%04x: [Reserved opcode %02x]\n", $wpdptr, $opc);
X $opc_len = 1;
X }
X
X if ( $opc_len > 0 ) {
X @opc = unpack ("C*", substr ($wpd, 0, $opc_len));
X substr ($wpd, 0, $opc_len) = '';
X if ( $opt_debug || $opt_analyze ) {
X printf STDERR ("%04x: [" . join(" ", ("%02x") x $opc_len) . "]\n",
X $wpdptr, @opc);
X }
X die ("*** PHASE ERROR ***\n") unless $opc == $opc[$#opc];
X $wpdptr += $opc_len;
X return $opc_type;
X }
X
X $opc_len = unpack ("v", substr ($wpd, 2, 2));
X @opc = unpack ("C*", substr ($wpd, 0, $opc_len+4));
X substr ($wpd, 0, $opc_len+4) = '';
X if ( $opt_debug || $opt_analyze ) {
X local (@opcb) = @opc;
X local (@finals) = splice (@opcb, $#opcb-3);
X printf STDERR ("%04x: [%02x %02x %02x%02x (%d)", $wpdptr,
X splice (@opcb, 0, 4), $opc_len);
X if ( $opt_analyze == 1 && @opcb > 6 ) {
X printf STDERR (" %02x %02x ... %02x %02x",
X $opcb[0], $opcb[1], $opcb[$#opcb-1], $opcb[$#opcb]);
X }
X else {
X foreach $b ( @opcb ) {
X printf STDERR (" %02x", $b);
X }
X }
X printf STDERR (" %02x%02x %02x %02x]\n", @finals);
X }
X die ("*** OPCODE LENGHT ERROR ***\n")
X unless ($opc[$#opc-3] | ($opc[$#opc-2] << 8)) == $opc_len;
X $wpdptr += $opc_len += 4;
X die ("*** OPCODE SUBFUN ERROR ***\n") unless $opc[1] == $opc[$#opc-1];
X die ("*** OPCODE ERROR ***\n") unless $opc == $opc[$#opc];
X return $opc_type;
X}
SHAR_EOF
chmod 0644 wpscan.pl ||
echo 'restore of wpscan.pl failed'
Wc_c="`wc -c < 'wpscan.pl'`"
test 6374 -eq "$Wc_c" ||
echo 'wpscan.pl: original size 6374, current size' "$Wc_c"
fi
exit 0
--
Johan Vromans
[email protected] via internet backbones
Multihouse Automatisering bv uucp:..!{uunet,sun4nl}!mh.nl!jv
Doesburgweg 7, 2803 PL Gouda, The Netherlands phone/fax: +31 1820 62911/62500
------------------------ "Arms are made for hugging" -------------------------