// cykp.cpp  CYK Parser  input grammar file, strings to parse, accept or reject
//                       for use with Context Free Grammars, CFG's
//
// Grammar definition of a CFG  G = ( V, T, P, S )
//    where V is set of variables
//          T is the set of terminal symbols
//          P is the productions of the grammar
//          S is the starting variable, S is in V
//
// Input Conventions:
//
// the file name extension for the grammar is typically  .g
// free form plain text input file
// typical execution is   cykp  your-grammar.g < strings.dat > your_output.out
//                   or   cykp  < grammar-and-input-strings > your-output.out
//
// A comment line has "// " as the first three characters
// A line may end with a comment using whitespace "//"
// Anything after whitespace after last expected input is a comment
//
// There are only a few keywords:
//                 start. variable, terminal, trace and verbose
// any line not starting with these keywords is a production
//
// all input must end with a comment "//" or a semicolon ";"
//
// Terminal symbols must be entered, else execution stops.
//
// Variables do not need to be entered, they are collected from
// the initial production variable, V from    V ->
//
// A production is of the form:
// V -> alpha   // where V is a variable, alpha is space separated
//              // variables and/or terminals
//              // the standard " | " is allowed in productions
//              // productions must end with " //" or " ;"
//
// special input-symbol #b for the blank character,
//                      #e for epsilon, the null string
//
// at least one production is required,
//
// variable, terminal, trace and productions may continue across end-of-line
// and thus must be terminated by a comment, "//", or a semicolon, ";"
//
// Use  #b  to input a blank (space) character in the string
//
// Method: read input grammar
//         eliminate useless variables that can not become terminals
//         eliminate useless variables that can not be reached from S
//         eliminate epsilon productions, A -> #e
//         eliminate unit productions,  A -> B
//         make productions Chomsky Normal Form
//         make productions Greibach Normal Form
//
// This is one of a series of automata simulators and formal language programs
// dfa  deterministic finite automata (with epsilon transitions and halt)
// nfa  nondeterministic finite automata
// tm   Turing machine, deterministic (recognizer and function computation)
// ntm  nondeterministic Turing machine (language recognizer only)
//
// compile with  GNU g++  2.8.1 or later    or   Microsoft 6.0 or later
//                                          or   cl /GX /ML cykp.cpp ...
// g++ -o cykp cykp.cpp cyk_eliminate.cpp cyk_chomsky.cpp \
//                      cyk_greibach.cpp  cyk_parse.cpp
// ANSI/ISO C++
// release JSS 2/15/00 v0.8
// update  JSS 3/27/00 v0.9
// update  JSS 4/18/00 v1.0 Greibach working
// update  JSS 4/2/01  v1.1 Accept "|" in productions
// update  JSS 3/16/04 v1.2 Greibach not working, add input "greibach"

#include "cykp.h" // all C++ and cykp type definitions

int verbose = 0;          // verbosity,0=minimum, 1=some, 2=each step, 3>=debug
bool accept_null = false; // if null string in language
                          // (set in epsilon elim, test in parse)
                          
int main(int argc, char *argv[]) //  cykp grammar.g < input-file > output-file
{                                //  cykp < grammar-and-input > output-file
  // primary objects
  var_type   V;      // V = all named variables
  ter_type   T;      // T = all possible terminals
  variable   S;      // S = starting variable
  prod_type  P;      // P = productions of the grammar
  var_type   trace;  // variables to trace during parsing or "all"
  production p1(""); // a production, temp variable
  var_type   VT;     // temp for testing V intersect T is phi
  prod_type  PT;     // temp for collecting valid productions
  
  // primary iterators  
  var_type::const_iterator  v_iter;  // for V and VT
  ter_type::const_iterator  t_iter;  // for T
  prod_type::const_iterator p_iter;  // for P and PT
  alpha::const_iterator     a_iter;  // for w

  // control variables
  bool accepted = false;    // input string parsed OK
  bool bad_production;      // ill formed production
  bool do_greibach = false; // until bug fixed

  // input variables
  string keyword;          // for inputting candidate keyword
  variable from_var;       // for production variable
  string     item;         // from V union T, an element of alpha
  variable v_item;         // for inputting a variable
  terminal t_item;         // for inputting a terminal
  string input_string;     // temp input string
  variable trace_var;      // trace variable or "all"
  
  // keywords and special symbols
  string key_start  = "start";     // followed by a state
  string key_var    = "variable";  // followed by a string for variable name
  string key_ter    = "terminal";  // followed by a terminal symbol, #b
  string key_enddef = "enddef";    // end of grammar, input strings may follow
  string key_trace  = "trace";     // variables to trace
  string key_verbos = "verbose";   // amount of printout
  string key_greibach = "greibach";  // do greibach
  string comm       = "//";        // start comment
  string empty      = "";          // empty string

  cout << "cykp v1.2 starting" << endl;

  // open grammar file (if no file, read from standard input)
  // to be implemented in v1.1

  while(!cin.eof()) // main input loop
  {
    cin >> keyword;
    if(cin.eof()) break;
    if(keyword==key_start)
    {
      if(S!=empty) cout << "Over writing a previous start." << endl;
      cin >> S;
      V.insert(S);
      if(S=="//") cout << "start looks like a comment?" << endl;
      cin.ignore(255,'\n');
      continue;
    }
    else if(keyword==key_var)
    {
      while(!cin.eof())
      {
        cin >> v_item;
        if(cin.eof()) break;    // make 3 lines a function, check ends also
        if(v_item=="//") break;
        if(v_item==";") break;
        V.insert(v_item);
      }
      cin.ignore(255,'\n');
      continue;
    }
    else if(keyword==key_ter)
    {
      while(!cin.eof())
      {
        cin >> t_item;
        if(cin.eof()) break;
        if(t_item=="//") break;
        if(t_item==";") break;
        if(t_item=="#b") t_item = " "; // blank special
        T.insert(t_item);
      }
      cin.ignore(255,'\n');
      continue;
    }
    else if(keyword==key_trace)
    {
      while(!cin.eof())
      {
        cin >> trace_var;
        if(cin.eof()) break;
        if(trace_var=="//") break;
        if(trace_var==";") break;
        trace.insert(trace_var);
      }
      cin.ignore(255,'\n');
      continue;
    }
    else if(keyword==key_verbos)
    {
      cin >> verbose;
      cin.ignore(255,'\n');
      continue;
    }
    else if(keyword==key_greibach)
    {
      do_greibach = true;
      cin.ignore(255,'\n');
      continue;
    }
    else if(keyword==key_enddef) // stop inputting
    {                            // now may only read input strings
      cin.ignore(255,'\n');
      break;
    }
    else if(keyword=="//") // skip comment "// "
    {
      cin.ignore(255,'\n');
      continue;
    }
    else if(keyword==";") // stray ";"
    {
      cin.ignore(255,'\n');
      continue;
    }
    else // should be a production
    {
      from_var = keyword;
      V.insert(from_var);
      cin >> item; if(cin.eof()) break;
      if(!(item == "->" || item == ":=")) // not a well formed production
      {
        cout << from_var << ' ' << item << " is not V -> or V :="
             << "  Thus, ignoring line" << endl;
        cin.ignore(255,'\n');
        continue;
      } 
      p1 = production(from_var);
      while(!cin.eof())
      {
        cin >> item;     if(cin.eof()) break;
        if(cin.eof()) break;
        if(item=="//") break;
        if(item==";") break;
        if(item=="|")
	{
          if(verbose>=3) cout << "adding production " << p1 << endl;
          P.push_back(p1);
          p1.w.clear(); // clear right hand side of production
          continue;
	}
        p1.w.push_back(item);
      }
      cin.ignore(255,'\n'); // delete trailing comments
      if(verbose>=3) cout << "adding production " << p1 << endl;
      P.push_back(p1);
    } // end 'if' check for key words
    
  } // end main input loop

  if(verbose>0)cout << "reading input finished." << endl;

  // print primary data objects, check for errors and warnings
  if(P.size()==0)
  {
    cout << "No productions input prevent parsing. Stopping." << endl;
    return 1;
  }

  if(verbose>0)
  {
    cout << "Start symbol: " << S << endl;
    cout << "Variables: ";
    for(v_iter=V.begin(); v_iter!=V.end(); v_iter++) cout << *v_iter << " ";
    cout << endl;
  }
  
  if(T.size()==0)
  {
    cout << "No terminal symbols input prevent parsing. Stopping." << endl;
    return 1;
  }
  
  set_intersection(V.begin(), V.end(), T.begin(), T.end(),
                   inserter(VT, VT.begin()));
  if(VT.size()!=0) // V intersect T must be empty
  {
    cout << "V intersect T non empty. Stopping." << endl;
    return 1;
  }
  
  if(verbose>0)
  {
    cout << "Terminals: ";
    for(t_iter=T.begin(); t_iter!=T.end(); t_iter++) cout << *t_iter << " ";
    cout << endl;
  }
  // sort and print productions

  p1 = production("");
  sort(P.begin(), P.end(), production::smaller );
  if(verbose>0) cout << "Sorted productions:" << endl;
  for(p_iter=P.begin(); p_iter!=P.end(); p_iter++)
  {
    // test for duplicate or bad, eliminate
    bad_production = false;
    if(verbose>0) cout << *p_iter << endl;
    if(p1 == *p_iter)
    {
      if(verbose>0)
        cout << "                             duplicate production" << endl;
      continue; // skips saving production
    }
    p1 = *p_iter;
    for(a_iter=p1.w.begin(); a_iter!=p1.w.end(); a_iter++)
    {
      if(T.find(*a_iter)==T.end() && V.find(*a_iter)==V.end() &&
         (!(*a_iter=="#e")))
      {
        if(verbose>0)cout << "            " << *a_iter
             << " item in production is not a terminal or variable" << endl;
        bad_production = true;
      }
    }
    if(!bad_production) PT.push_back(p1);
  }

  if(!trace.empty())
  {
    if(trace.find("all")!=trace.end()) trace = V;
    if(verbose>0)
    {
      cout << "Trace:";
      for(v_iter=trace.begin(); v_iter!=trace.end(); v_iter++) cout << *v_iter << " ";
      cout << endl;
    }
  }
  
  if(PT.size()==0)
  {
    cout << "no good productions left, stopping." << endl;
    return 1;
  }
  
  cyk_eliminate(V, T, PT, S);
  cyk_chomsky  (V, T, PT, S);
  if(do_greibach) cyk_greibach (V, T, PT, S);

  // main parsing for each input string
  while(!cin.eof()) //read input strings to parse, one per line
  {
    cin >> input_string;
    if(cin.eof()) return 0;
    if(input_string=="//") // comment
    {
      cin.ignore(255,'\n');
      continue;
    }
    if(input_string=="#e") input_string = ""; // null string
    // parse input string with processed grammar
    cout << endl << "about to parse " << input_string << endl;
    accepted = cyk_parse(V, T, PT, S, input_string);
  }

  return 0;
} // end of main

// for outputting objects of class  production

ostream &operator<<(ostream &stream, production p)
{
  alpha::const_iterator     a_iter;  // for w

  stream << p.V  << " -> "; 
  for(a_iter=p.w.begin(); a_iter!=p.w.end(); a_iter++)
  {
    stream << *a_iter << '\t'; // output terminals and variables in rule 
  }
  return stream;
}

// end cykp.cpp
