#!/usr/bin/perl -w
# see usage string below

use strict 'subs';

# Print the command-line synopsis and a short description of the
# generated files.  The text goes to the currently selected output
# handle; $0 is interpolated as the program name.  Returns whatever
# print() returns.
sub usage {
  my $helpText = <<"EOF";
usage: $0 [-o outname] file.tok [extension.tok [...]]

This script processes a master token description and produces several files:
  - a .h file with the enumeration listing all the tokens
  - a .cc file with a table of spellings, and table of flags
  - a .ids file with grammar token names, ids, and aliases

The filenames are named with the same base as the input .tok file,
or with whatever base is specified with the -o option.
EOF
  print($helpText);
}

# ---- command-line processing ----
#
# Globals set here and used by the rest of the script:
#   $baseName  - base path of the three output files (.h, .cc, .ids)
#   $myCommand - the invocation as typed, for the "generated by" banners
#   $fname     - the first input file name

$baseName = "";

# capture the full command line before any options are shifted off
$myCommand = "$0 " . join(' ', @ARGV);

# consume leading options; the first non-option argument ends the scan
while (@ARGV != 0 && $ARGV[0] =~ m/^-/) {
  my $arg = shift @ARGV;
  if ($arg eq "-o") {
    # -o needs a value; without this check a trailing "-o" would be
    # indistinguishable from "no arguments" and exit with status 0
    if (@ARGV == 0) {
      print("option -o requires an argument\n");
      usage();
      exit(2);
    }
    $baseName = shift @ARGV;
  }
  else {
    print("unknown option: $arg\n");
    usage();
    exit(2);
  }
}

# at least one input file is required
if (@ARGV < 1) {
  usage();
  exit(0);
}

$fname = $ARGV[0];

# No (or empty) -o value: derive the base name from the first input
# file.  Compare against "" rather than testing truth so that a base
# name of "0" is honored.
if ($baseName eq "") {
  $baseName = $fname;
  $baseName =~ s|\.[^.]*$||;    # strip extension
}

# open the output files
#
# The three-argument form of open keeps the mode separate from the
# path, so whatever characters $baseName contains (leading '>' or '&',
# surrounding whitespace, ...) are taken literally as part of the
# file name instead of being interpreted as part of the mode.
open(H, '>', "$baseName.h") or die("cannot open $baseName.h: $!\n");
open(CC, '>', "$baseName.cc") or die("cannot open $baseName.cc: $!\n");
open(IDS, '>', "$baseName.ids") or die("cannot open $baseName.ids: $!\n");


# write the preambles

# Turn a header file name into a preprocessor include-guard name:
# ASCII lowercase letters are uppercased and every character that is
# not a letter, digit or underscore ('.', '/', '-', space, ...) becomes
# '_', so the result contains only characters valid in a macro name.
sub includeGuardName {
  my ($headerName) = @_;
  my $guard = $headerName;
  $guard =~ tr|a-z|A-Z|;
  $guard =~ s|[^A-Z0-9_]|_|g;
  return $guard;
}

# $latch is the include guard for the generated header; it is also
# interpolated into the "#endif" line of the header's epilogue
$latch = includeGuardName("$baseName.h");

# header: banner, include guard, the TokenFlag enum, and the opening
# of the TokenType enum (its enumerators are emitted per input line)
print H (<<"EOF");
// $baseName.h
// do not edit; this file automatically generated by
//   $myCommand

#ifndef $latch
#define $latch

// token flags
enum TokenFlag {
  TF_MULTISPELL   = 0x01,     // token has multiple spellings
  TF_NONSEPARATOR = 0x02,     // token is a nonseparator
  TF_CPLUSPLUS    = 0x04,     // token is a keyword in C++, but an identifier in C
  ALL_TOKEN_FLAGS = 0x07      // bitwise OR of above
};

enum TokenType {
EOF


# implementation file: banner, include of the generated header, and
# the opening of the spelling table
print CC (<<"EOF");
// $baseName.cc
// do not edit; this file automatically generated by
//   $myCommand

#include "$baseName.h"     // this module; defines TokenFlag

char const * const tokenNameTable[] = {
EOF

# I'll have to accumulate the flags in a big list and then
# emit them after I close the 'tokenNames' array
@flagsList = ();


# ids file: banner and a description of the line format
print IDS (<<"EOF");
// $baseName.ids
// do not edit; this file automatically generated by
//   $myCommand

// form:
//   <code> : <name> [<alias>] ;

EOF


# process the input file(s), effectively a concatenation of all the
# files supplied on the command line

# Split one token-description line of the form
#   <enumerator>, "<spelling>", : <flags>
# into (enumerator, spelling including its double quotes, flags text).
# Must be called in list context; returns the empty list when the
# line does not have that shape.
sub parseTokenLine {
  my ($text) = @_;
  return ($text =~ m|^\s*([a-zA-Z_0-9]+),\s*(\"[^\"]*\")\s*,\s*:(.*)$|);
}

# token ids are assigned sequentially across all input files
$nextId = 0;
while (@ARGV > 0) {
  $fname = shift @ARGV;
  $lineNum = 0;

  # open the input file; three-argument open so that the file name is
  # never interpreted as a mode or a command
  open(IN, '<', $fname) or die("cannot open $fname: $!\n");

  # process it
  while (defined($line = <IN>)) {
    $lineNum++;

    # blank lines and comment lines are copied to the output verbatim,
    # once we've seen the first line which is neither
    if ($line =~ m|^\s*$| or
        $line =~ m|^\s*//|) {
      if ($nextId) {
        # indent unindented comments
        if ($line =~ m|^//|) {
          $line = "  " . $line;
        }

        # the flags table is emitted later, so its copy of the
        # comment is queued alongside the flag entries
        print H ($line);
        print CC ($line);
        push @flagsList, $line;
        print IDS ($line);
      }
      next;
    }

    # parse the line
    chomp($line);
    my ($enumerator, $spelling, $flags) = parseTokenLine($line);

    if (!defined($flags)) {
      die("$fname:$lineNum: malformed line\n");
    }

    # parse the flags: each is a single letter appearing anywhere in
    # the text after the colon
    $multiSpell = ($flags =~ m|m|);
    $nonsep = ($flags =~ m|n|);
    $cpp = ($flags =~ m|p|);

    # emit to each file
    print H ("  $enumerator,\n");
    printf CC ("  %-40s // $enumerator\n", "$spelling,");

    # flag table entry: "0" OR-ed with whichever TF_* names apply
    my @f = ("0");
    if ($multiSpell) { push @f, "TF_MULTISPELL"; }
    if ($nonsep)     { push @f, "TF_NONSEPARATOR"; }
    if ($cpp)        { push @f, "TF_CPLUSPLUS"; }
    push @flagsList, sprintf("  %-40s // $enumerator\n",
                             join(' | ', @f) . ",");

    # the spelling is used as the alias, except for tokens that have
    # multiple spellings, which get no alias
    printf IDS ("  %3d : %-30s %s;\n",
                $nextId,
                $enumerator,
                ($multiSpell? "" : $spelling));

    $nextId++;
  }

  close(IN) or die("error closing $fname: $!\n");
}


# print the epilogues

# header: terminate the TokenType enum with a count enumerator,
# declare the tables defined in the .cc file, and close the include
# guard named by $latch
print H (<<"EOF");
  NUM_TOKEN_TYPES

};  // enum TokenType

// map TokenType to its spelling or description
extern char const * const tokenNameTable[];
extern int const tokenNameTableSize;

// map TokenType to a bitwise OR of TokenFlags
extern unsigned char tokenFlagTable[];

#endif // $latch
EOF



# implementation file: close the spelling table, then emit the flag
# table from the entries (and copied comment lines) accumulated in
# @flagsList while the input was read
$flagsList = join('', @flagsList);
print CC (<<"EOF");
};  // tokenNameTable[]

// this is provided to allow a consistency check between the generated
// .h file and generated .cc file
int const tokenNameTableSize =
  sizeof(tokenNameTable) / sizeof(tokenNameTable[0]);


unsigned char tokenFlagTable[] = {
$flagsList
};
EOF


# the IDS file has no epilogue


# close the files
#
# Buffered write errors (e.g. a full disk) are only reported when the
# handle is closed, so say which file failed and why.
close(H) or die("error closing $baseName.h: $!\n");
close(CC) or die("error closing $baseName.cc: $!\n");
close(IDS) or die("error closing $baseName.ids: $!\n");

exit(0);
