news.utdallas.edu!wupost!howland.reston.ans.net!agate!doc.ic.ac.uk!uknet!mcsun!sun4nl!mhres!pronto!news Tue Mar  9 09:38:59 CST 1993
Article: 1468 of comp.lang.perl
Xref: feenix.metronet.com comp.lang.perl:1468
Path: feenix.metronet.com!news.utdallas.edu!wupost!howland.reston.ans.net!agate!doc.ic.ac.uk!uknet!mcsun!sun4nl!mhres!pronto!news
From: [email protected] (Johan Vromans)
Newsgroups: comp.lang.perl
#Subject: WordPerfect document scanner
Message-ID: <[email protected]>
Date: 9 Mar 93 13:18:08 GMT
Sender: [email protected] (USEnet News System)
Followup-To: alt.sources.d
Organization: Multihouse Automation, the Netherlands
Lines: 288
X-Md4-Signature: d90d6f205746a8854b154ba6d0177648
Nntp-Posting-Host: largo:(jv)

This program reads a WordPerfect document and outputs its contents in
a way suitable to study and analyze it.
It can be used as a driver to perform your own WP to whatever
conversions.
Disclaimer: this program is not supported, use at your own risk.

       Johan

---- Cut Here and feed the following to sh ----
#!/bin/sh
# This is a shell archive (produced by shar 3.49)
# To extract the files from this archive, save it to a file, remove
# everything above the "!/bin/sh" line above, and type "sh file_name".
#
# made 03/01/1993 15:47 UTC by jv@largo
# Source directory /a/pronto/mozart/users/jv
#
# existing files will NOT be overwritten unless -c is specified
#
# This shar contains:
# length  mode       name
# ------ ---------- ------------------------------------------
#   6374 -rw-r--r-- wpscan.pl
#
# ============= wpscan.pl ==============
if test -f 'wpscan.pl' -a X"$1" != X"-c"; then
       echo 'x - skipping wpscan.pl (File already exists)'
else
echo 'x - extracting wpscan.pl (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'wpscan.pl' &&
X#!/usr/local/bin/perl
X# wp2txt.pl -- convert WP document to MH-Doc format
X# SCCS Status     : %Z%@ %M%   %I%
X# Author          : Johan Vromans
X# Created On      : Mon Jul 13 21:23:19 1992
X# Last Modified By: Johan Vromans
X# Last Modified On: Mon Mar  1 16:43:01 1993
X# Update Count    : 315
X# Status          : Unknown, Use with caution!
X
X################ Common stuff ################
X
X# $LIBDIR = $ENV{"MH_DOCLIB"} || "/usr/local/lib/mh_doc";
X# unshift (@INC, $LIBDIR);
X# require "common.pl";
X$my_package = "PRONTO/MH-Doc V1.02";
X$my_version = "%I%";
X$my_name = "%M%";
X
X################ Program parameters ################
X
X$opt_start = $opt_end = 0;
X$opt_outline = 0;
X$opt_analyze = 1;
X$opt_short = 0;
X$opt_attr = 0;
X$opt_header = 1;
X$opt_verbose = 0;
X&options if $ARGV[0] =~ /^-/;
X
X################ Presets ################
X
X@ARGV = ("-") unless @ARGV;
Xopen (WPD, @ARGV[0]) || die ("$ARGV[0]: $!\n");
X&wpd_check;
X
Xif ( $opt_analyze ) {
X    &analyze;
X}
Xelse {
X    &convert;
X}
X
Xexit (0);
X
X################ Subroutines ################
X
X# Extract a two-byte (little endian) word from the input.
Xsub opc_word {
X    shift (@opc) | (shift (@opc) << 8);
X}
X
X# Skip a number of bytes.
Xsub opc_skip {
X    local ($dis) = @_;
X    splice (@opc, 0, $dis);
X}
X
X# Verify validity of WP document.
Xsub wpd_check {
X
X    local ($hdr) = '';
X
X    sysread (WPD, $hdr, 16);
X    local ($id, $std, $filetype) = unpack ("a4Vv", $hdr);
X
X    if ($id eq "\xffWPC" &&
X       $filetype == 0x0A01 ) {
X
X       # Seek to start of document, and slurp it.
X       $opt_start = $std unless $opt_start;
X       seek (WPD, $opt_start, 0);
X       local ($/) = undef;
X       $wpd = <WPD>;
X       $wpdptr = $opt_start;
X
X       # Truncate if requested.
X       $end = $wpdptr + length ($wpd) - 1;
X       if ( $opt_end > $opt_start ) {
X           substr ($wpd, $opt_end-$wpdptr) = '';
X       }
X       else {
X           $opt_end = $end;
X       }
X
X       # Feedback
X       if ( $opt_debug || $opt_analyze ) {
X           printf STDERR ("Startdoc @ 0x%x (%d)%s\n", $std, $std,
X                          ($opt_start != $std || $opt_end != $end) ?
X                          sprintf (", range is 0x%x (%d) thru 0x%x (%d)",
X                                   $opt_start, $opt_start,
X                                   $opt_end-1, $opt_end-1) : '');
X       }
X    }
X    else {
X       die ("Not a WP document\n");
X    }
X}
X
X# The main processors --
X
Xsub analyze {
X
X    local ($opc);              # opcode under examination
X    local ($opc_type);         # type (0 = string, 1 = bytes)
X    local ($opc_len);          # total length op opcode sequence
X    local ($opc_char);         # text, if $opc_type == 0
X    local (@opc);              # bytes, if $opc_type == 1
X
X    while ( &fetch >= 0 ) { ; }        # &fetch will do all
X}
X
Xsub options {
X
X    require "newgetopt.pl";
X    local ($opt_noheader) = 0;
X
X    $opt_help = $opt_ident = 0;
X
X    if ( ! &NGetOpt
X       ("start=s", "end=s",
X        "analyze", "short", "outline",
X        "attr", "header", "noheader",
X        "verbose", "ident", "quiet", "help", "debug")
X           || $opt_help ) {
X       print STDERR <<EndOfUsage;
XThis is $my_package [$my_name $my_version]
X
XUsage: $0 [options] file...
X
X  options are:
X    -output file       destination
X    -start XXX         start conversion at address XXX
X    -end XXX           end conversion before address XXX
X    -ident             print program name and version
X    -verbose           verbose info
X    -attr              include traces of attributes
X    -noheader          suppress MH-Doc header
X    -analyze           analyze input
X    -outline           make outline
X    -short             with -analyze: shorten long opcodes
X    -help              this message
XEndOfUsage
X       exit (1);
X    }
X
X    $opt_start = oct ($opt_start) if $opt_start =~ /^0/;
X    $opt_end = oct ($opt_end) if $opt_end =~ /^0/;
X
X    if ( defined $opt_output && $opt_output ne '' ) {
X       open (STDOUT, ">$opt_output");
X    }
X
X    if ( $opt_analyze ) {
X       $opt_analyze = 2 unless $opt_short;
X       open (STDERR, ">&STDOUT");
X       if ( -t STDERR ) {
X           select (STDERR);
X           $| = 1;
X       }
X    }
X
X    $opt_header = !$opt_noheader;
X    print STDERR ("This is $my_package [$my_name $my_version]") if $opt_ident;
X}
X
Xsub fetch {
X
X    # Fetch the next opcode from the input.
X    #
X    # $opc_type will designate the type of opcode:
X    #   0 -> sequence of ASCII characters, delivered in $opc_char.
X    #   1 -> sequence of bytes, delevired in @opc.
X    # $opc_len will hold the number of bytes involved.
X    #
X    # Return value is $opc_type, or -1 if exhausted.
X
X    return -1 if $wpd eq '';
X
X    if ( $wpd =~ /^[ -~]+/ ) {
X       $opc_char = $&;
X       $wpd = $';
X       $opc_len = length ($opc_char);
X       $opc_type = 0;
X       printf STDERR ("%04x: %s\n", $wpdptr, $opc_char)
X           if $opt_debug || $opt_analyze;
X       $wpdptr += $opc_len;
X       return $opc_type;
X    }
X
X    $opc = ord ($wpd);
X    $opc_type = 1;
X    $opc_len = -1;
X       if ( $opc <= 037 || $opc == 0x7f )      { $opc_len = 1; }
X    elsif ( $opc >= 0x80 && $opc <= 0xbf )     { $opc_len = 1; }
X    elsif ( $opc == 0xc0 )                     { $opc_len = 4; }
X    elsif ( $opc == 0xc1 )                     { $opc_len = 9; }
X    elsif ( $opc == 0xc2 )                     { $opc_len = 11; }
X    elsif ( $opc == 0xc3 || $opc == 0xc4 )     { $opc_len = 3; }
X    elsif ( $opc == 0xc5 )                     { $opc_len = 5; }
X    elsif ( $opc == 0xc6 )                     { $opc_len = 6; }
X    elsif ( $opc == 0xc7 )                     { $opc_len = 7; }
X    elsif ( $opc >= 0xc8 && $opc <= 0xcf ) {
X       printf STDERR ("%04x: [Reserved opcode %02x]\n", $wpdptr, $opc);
X       $opc_len = 1;
X    }
X
X    if ( $opc_len > 0 ) {
X       @opc = unpack ("C*", substr ($wpd, 0, $opc_len));
X       substr ($wpd, 0, $opc_len) = '';
X       if ( $opt_debug || $opt_analyze ) {
X           printf STDERR ("%04x: [" . join(" ", ("%02x") x $opc_len) . "]\n",
X                          $wpdptr, @opc);
X       }
X       die ("*** PHASE ERROR ***\n") unless $opc == $opc[$#opc];
X       $wpdptr += $opc_len;
X       return $opc_type;
X    }
X
X    $opc_len = unpack ("v", substr ($wpd, 2, 2));
X    @opc = unpack ("C*", substr ($wpd, 0, $opc_len+4));
X    substr ($wpd, 0, $opc_len+4) = '';
X    if ( $opt_debug || $opt_analyze ) {
X       local (@opcb) = @opc;
X       local (@finals) = splice (@opcb, $#opcb-3);
X       printf STDERR ("%04x: [%02x %02x %02x%02x (%d)", $wpdptr,
X                      splice (@opcb, 0, 4), $opc_len);
X       if ( $opt_analyze == 1 && @opcb > 6 ) {
X           printf STDERR (" %02x %02x ... %02x %02x",
X                          $opcb[0], $opcb[1], $opcb[$#opcb-1], $opcb[$#opcb]);
X       }
X       else {
X           foreach $b ( @opcb ) {
X               printf STDERR (" %02x", $b);
X           }
X       }
X       printf STDERR (" %02x%02x %02x %02x]\n", @finals);
X    }
X    die ("*** OPCODE LENGHT ERROR ***\n")
X       unless ($opc[$#opc-3] | ($opc[$#opc-2] << 8)) == $opc_len;
X    $wpdptr += $opc_len += 4;
X    die ("*** OPCODE SUBFUN ERROR ***\n") unless $opc[1] == $opc[$#opc-1];
X    die ("*** OPCODE ERROR ***\n") unless $opc == $opc[$#opc];
X    return $opc_type;
X}
SHAR_EOF
chmod 0644 wpscan.pl ||
echo 'restore of wpscan.pl failed'
Wc_c="`wc -c < 'wpscan.pl'`"
test 6374 -eq "$Wc_c" ||
       echo 'wpscan.pl: original size 6374, current size' "$Wc_c"
fi
exit 0
--
Johan Vromans                                  [email protected] via internet backbones
Multihouse Automatisering bv                   uucp:..!{uunet,sun4nl}!mh.nl!jv
Doesburgweg 7, 2803 PL Gouda, The Netherlands  phone/fax: +31 1820 62911/62500
------------------------ "Arms are made for hugging" -------------------------