Article: 4225 of comp.lang.perl
Xref: feenix.metronet.com comp.lang.perl:4225
Newsgroups: comp.lang.perl
Path: feenix.metronet.com!news.utdallas.edu!wupost!howland.reston.ans.net!agate!ames!olivea!apple.com!voder!berlioz.nsc.com!jedi!arielf
From: [email protected] (Ariel Faigon)
Subject: ckbal - check balanced tokens in C source (perl script)
Message-ID: <[email protected]>
Followup-To: comp.lang.perl
Keywords: Reiser-cpp tokens balance C-language-tool
Sender: [email protected] (UseNet News account)
Reply-To: [email protected]
Organization: National Semiconductor Corp.
Date: Wed, 14 Jul 1993 20:26:05 GMT
Lines: 345

Have you ever been frustrated by the Reiser-cpp or pcc insufficient data
in error messages? Have you ever got a message like "1073: missing #endif"
without a reference to the line where the opening #if appears. Have you ever
left out a closing comment and searched your source for the error in the wrong
place? If so, 'ckbal' can help you.
I wrote this about a year ago in an evening of such frustration, and just thought
it might be a good idea to post this for the benefit of all.

Sorry, the man page is not wrapped, and it doesn't handle C++ style comments.
yet it saved me and my colleagues many long hours in the past year or so. Enjoy.

#!/usr/local/bin/perl
# On Messy-DOS systems start with the following 3 line prologue:
@REM=(qq!
@perl  %0.bat %1 %2 %3 %4 %5 %6 %7 %8 %9
@goto end !) if 0 ;

#
# ckbal - check balanced tokens in C source.
#
# Usage:
#       ckbal [-d] [-w] C-files...
#
#       checked token-pairs include
#               ( )
#               [ ]
#               { }
#               /* */
#               " "
#               ' '
#               #ifxxx [#els...] #endif
#
# Errors detected:
#       o EOF hit with unclosed token (opened on line X)
#       o Closing token on line X has no matching open token
#
# Options:
#       -w      Adds warnings about potential errors:
#               Unmatched token pairs within strings or comments
#               (e.g. Open comment within a comment)
#
#       -d      Adds debugging printouts
#
# States during parsing:
#       o In comment
#       o In string
#       o In char constant
#       o In code
#
# TODO:
#       o Make it simpler - more general, more table driven
#         but for a first shot it serves its purpose...
#       o Add C++, // style comments
# BUGS:
#       o A character (single quoted) string is arbitrary long
#       o There are cases where only '-w' will spot the problem
#
# Author: Ariel Faigon, ([email protected]), May 13, 1992
# Donated to the public domain - please leave author data intact. Enjoy.
#

# I use eval because it may not work on my DOS, your mileage may vary.
eval "require 'getopts.pl'";
unless ($@) {
       &Getopts('dw');
}

# -- Opening -> closing token mapping
%pairof = (
       '(', ')',
       '[', ']',
       '{', '}',
       '/*', '*/',
       '"', '"',
       "'", "'",
       '#if',     '#endif',
       '#ifdef',  '#endif',
       '#ifndef', '#endif',

);

# -- State transitions: state + token -> new-state
%next_state = ( "code$;/*", 'comment',
               "code$;\"", 'string',   # " close quote to keep 4dos happy
               "code$;'", 'char',
               "comment$;*/", 'code',
               "string$;\"", 'code',   # " close quote to keep 4dos happy
               "char$;'", 'code'
);

# -- single-char-codes -> real-tokens
# text is 'canonicalized' using single char codes for convenience + efficiency
%realtok_of = (
       "\201", '/*',
       "\202", '*/',

       "\211", '#if',
       "\212", '#ifdef',
       "\213", '#ifndef',
       "\214", '#else',
       "\215", '#elsif',
       "\216", '#endif'
);

# Given current state and token, determine if a token is "opening"
# Should probably be another assoc table...
sub opening {
   local($state, $tok) = @_;
   ($tok =~ m,[\[({]|/\*, || $state eq 'code' && $tok =~ /["']/ ||
    $tok =~ /^#if/)
           ? 1 : 0;
}

# Given current state and token determine if a token is "closing"
# Should probably be another assoc table...
sub closing {
   local($state, $tok) = @_;
   ($tok =~ m,[])}]|\*/, || $tok eq "'" && $state eq 'char' ||
    $tok eq '"' && $state eq 'string' || $tok eq '#endif')
       ? 1 : 0;
}

# -- pop and cleanup all remaining 'unclosed' tokens from stack
sub cleanup_pop {
   local(*stack) = @_;
   local($tok, $line, $prefix);

   $prefix = '';
   unless ($state eq 'code' || $at_eof) {
       $prefix = "warning (in $state): ";
   }
   while (@stack) {
       $tok = pop(@stack);
       $line = pop(@stack);
       if ($opt_w || !$prefix) {
           print STDERR "$0: $prefix$file, $line: '$tok': open without close\n";
       }
   }
}

$0 =~ s,.*/,,;                  # Trim pathname for error messages
unless (@ARGV) {
   print STDERR "ckbal: check balanced tokens in C source

Usage:
       ckbal [-d] [-w] C-files...

Checked token-pairs include:
               ( ), [ ], { }, /* */, \" \", ' '
               #ifxxx [#els...] #endif
Options:
       -w      Adds warnings about potential errors:
               Unmatched token pairs within strings or comments
               (e.g. Open comment within a comment)

       -d      Adds debugging printouts
";
       exit(1);
}

# -- main program
foreach $file (@ARGV) {
   &dofile;
}

# -- process one file
sub dofile {
   local(*stack);

   &canonicalize($file);
   if ($opt_d) {
       print "after canonicalize: '@tokens'\n";
   }


   $line = 1;
   $state = 'code';
   $at_eof = 0;
   while ($tok = shift(@tokens)) {
       if ($tok eq "\n") {
           $line++;
           next;
       }
       # Move to a new state if a transition is needed
       if ($next_state{$state, $tok}) {
           $new_state = $next_state{$state, $tok};
       } else {
           $new_state = $state;                        # default - no change
       }
       if (&opening($state, $tok)) {
           # --- 'opening' tokens
           # special case warning:
           if ($opt_w && $state eq 'comment' && $tok eq '/*') {
              print STDERR "$0: warning: $file, $line: open-comment in comment\n";
           }
           $state = $new_state;
           *stack = $state;
           $realstack = $state;
           if ($tok =~ /^#/) {
               push(@cpp, ($line, $tok));
               $realstack = 'cpp';
           } else {
               push(@stack, ($line, $tok));
           }
           if ($opt_d) {
               print STDERR "$0: line $line: pushed $tok on $realstack stack\n";
           }
       } elsif (&closing($state, $tok)) {
           # --- 'closing' tokens
           if ($tok =~ /^#/) {
               if (@cpp) {
                   $pop = pop(@cpp);
                   pop(@cpp);          # get rid of line-no too
                   if ($opt_d) {
                   print STDERR "$0: line $line: popped $pop from cpp stack\n";
                   }
               } else {
                   print STDERR "$0: $file, $line: '$tok': close without open\n";
               }
               next;
           }

           $prev_tok = $stack[$#stack];
           $prev_line = $stack[$#stack-1];
           if ($pairof{$prev_tok} ne $tok) {
               if ($state eq 'string' && $tok eq '"' ||
                   $state eq 'char' && $tok eq "'" ||
                   $state eq 'comment' && $tok eq '*/') {
                   shift(@stack); shift(@stack);       # get rid of opening
                   &cleanup_pop(*stack, $state);
                   $state = $new_state;
                   *stack = $state;
                   next;
               } elsif ($state eq 'code') {
                   print STDERR "$0: $file, $line: '$tok': close without open\n";
                   @string = ();
                   @char = ();
                   @comment = ();
               } else {                # not in code: comment, string or char
                   $prefix = "warning (in $state): ";
                   if ($opt_w) {
                       print STDERR "$0: $prefix$file, $line: '$tok': close without open\n";
                   }
                   if ($state eq 'string' && $tok eq '"') {
                       @string = ();
                   } elsif ($state eq 'comment' && $tok eq '*/') {
                       @comment = ();
                   } elsif ($state eq 'char' && $tok eq "'") {
                       @char = ();
                   }
               }
           } else {     # match found: pop matching token (and line) from stack
               pop(@stack);
               pop(@stack);
               if ($opt_d) {
                   print STDERR "$0: line $line: popped $tok from $state stack\n";
               }
               $state = $new_state;
               *stack = $state;
           }                           # found matching open to close token
       }                       # open or close token
       elsif ($tok =~ /#els/) {
           unless (@cpp) {
               print STDERR "$0: $file, $line: '#if' less '$tok'\n";
           }
       }
   }                   # while tokens in input
   $at_eof = 1;
   $eof_header = 0;
   for $st ('code', 'comment', 'string', 'char', 'cpp') {
       eval "*stack = *$st";
       if (@stack) {
           unless ($eof_header) {
               print STDERR "$0: $file", ': EOF with unclosed tokens:',"\n";
               $eof_header = 1;
           }
           &cleanup_pop(*stack, $st);
       }
   }
}

#
# canonicalize:
#       Convert input file to a list of tokens while stripping all
#       "uninteresting" tokens from input.
#
#       o Delete every \x to ease string and char constant parsing.
#       o Replace every two-chars interesting tokens (like /* and */)
#         by a single-char code
#       o Leave paired-tokens (like {}[]) and newlines (to count lines)
#         deleting all the rest
#
sub canonicalize {
   local($file) = @_;
   local($_, $tok) = ('', '');

#    undef $/;                # read all input into one long string
#    $* = 1;                # Allow multi-line pattern matching

   @tokens = ();
   open(FH, $file) || die "$0: cannot open '$file' - $!\n";

   while (<FH>) {
       @Ltokens = ();
       s/\\[^\n]//g; # Get rid of escape-sequences to simplify string parsing

       # normalize to one-char tokens to simplify parsing
       s,/\*,\201,g;
       s,\*/,\202,g;

       s,^\s*#\s*if,\211,;
       s,^\s*#\s*ifdef,\212,;
       s,^\s*#\s*ifndef,\213,;
       s,^\s*#\s*else,\214,;
       s,^\s*#\s*elsif,\215,;
       s,^\s*#\s*endif,\216,;

       # Get rid of all chars except 'interesting' chars
       y/()[]{}"'\n\201-\230//cd;

       @Ltokens = split(//, $_);       # convert line to list of tokens
       while ($tok = shift(@Ltokens)) {# translate back to real-tokens
           if ($realtok_of{$tok}) {
               push(@tokens, $realtok_of{$tok});
           } else {
               push(@tokens, $tok);
           }
       }
   }
   close(FH);
   # @tokens;          # return interesting list of tokens for whole file
}

#
# Messy-DOS epilogue:
#
@REM=(qq!
:end !) if 0 ;



---
Peace, Ariel
[email protected]