// file: $isip/class/pr/LanguageModelABNF/lmabnf_05.cc
// version: $Id: lmabnf_05.cc 10495 2006-03-15 21:37:57Z may $
//

// isip include files
//
#include "LanguageModelABNF.h"

// method: getRuleModel
//
// arguments:
//  none
//
// return: a RuleModel object containing normalized BNF rules
//
// Normalized BNF rules can be in one of the following forms:
//
// (rule)->(non_terminal)
// (rule)->(terminal),(non_terminal)
// (rule)->(epsilon)
//
// Any graph can be represented by a combination of these three rule types.
//
// 1) Preprocessing/Renaming
//
// Each terminal symbol in the set of rules is considered unique but many might
// have the same name.  The algorithm will first loop through all the rules
// and find the terminal symbols, renaming each of them to something unique.
// This can be implemented with a simple integer that increments each time
// a terminal symbol is encountered.  As each symbol is renamed, the original
// name and new name will be added to a vector of pair objects so that the
// original names can be restored after the conversion.  Conveniently, we can
// name the rules that reference each of these unique symbols based on whatever
// their unique name is.  For example, the symbols 1, 2, 3 could be referenced
// by the rules R1, R2, R3, respectively.
//
// This preprocessing step will only take place in the LanguageModelABNF
// and will never be seen by the user.  Although it seems a bit messy, it's 
// necessary since it eliminates naming restrictions when creating graphs.
//
// 2) Create set of normalized BNF rules by iteratively searching for connected
// symbols (this means the same thing as two nodes connected by an arc in IHD),
// kleene star tokens, and kleene plus tokens.
//
// Starting with the first rule, search for all concatenation, kleene star, 
// and kleene plus tokens.  As the algorithm steps over each token, it will 
// keep up with its nesting level. For example:
//
// 0(1(2((5)4(5))))
//
// The integers represent the nesting level.
//
// Each time a concatenation token is encountered, a list of left and right 
// symbols must be found.  Once this list is found, a set of normalized BNF 
// rules is created to connect each left symbol with each right symbol.  
// The number of rules created for each concatenation symbol will be n*m 
// where n is the number of left symbols, and m is the number of right symbols.
//
// Each time a kleene star or a kleene plus is encountered, the last symbols of 
// the rule segment following the token must be connected to the first symbols 
// of the segment.  
//
// (A,*(B,C,D),E)
//
// An arc will be drawn from symbol D to symbol B.  Also,
//
// (A,+(B|C|D),E)
//
// The symbols B, C, and D will all have self loops.
//
// At this point in the algorithm, kleene star and kleene plus tokens are
// treated the same. To find which symbols connect to which, we find the 
// first and last token of the rule segment following the kleene token.  In
// the example above, these tokens would be:
//
// (A,+(B|C|D),E)
//     ^     ^   
// From these tokens, we find all right symbols starting from the 
// left token, and all left symbols starting from the right token.  The 
// same method will be used to find right and left symbols as was used
// for handling concatenation.  
//
// The following procedure will be used to find right and left symbols:
//
// At the highest level, there will be two methods: findLeftSymbols() and
// findRightSymbols().  Each will return a vector of symbols.
//
// 2-a) findLeftSymbols().
//
// This method will start by checking the current symbol. At the top level 
// of recursion, this will be the token to the left of the concatenation token.
// The token will be one of the following types: (i)terminal symbol, 
// (ii)non terminal symbol, or (iii)closing parenthesis. If any token other 
// than these three is encountered, then this isn't a valid rule and the 
// algorithm returns an error.  Since ABNF rules use prefix notation for 
// kleene star and kleene plus tokens, we won't encounter these to the left
// of a concatenation. 
//
// 2-a-i)  terminal symbol 
//
// This is the base case of this recursive method.  If a terminal symbol 
// is encountered, the findLeftSymbols simply adds this symbol to the symbol 
// vector and returns the vector.
//
// 2-a-ii) non_terminal symbol 
//
// If a non terminal symbol is encountered, findLeftSymbols is called
// recursively on the last token of the rule referenced by the non_terminal.
// The resulting vector of findLeftSymbols is concatenated with the current
// vector and returned.
//
// 2-a-iii) closing parenthesis
//
// This is the most complicated of the three cases since parenthesis may
// provide crucial nesting and order-of-operations information, or they
// may provide no information at all (they could be redundant).  For example:
//
// (A,((A,B,C)|((D|E|F),G)))
//
// ((A,((A),(B),(C))|((((D)|(E)|(F)),(G)))))
//
// These two rules are equivalent, but the second contains lots of
// redundant parentheses. 
//
// When a closing parenthesis is encountered, the algorithm must look for
// alternation at the nesting level closed by the parenthesis. For example:
//
// (A,((A,B,C)|((D|E|F),G)))
//                    ^
// At the nesting level closed by this parenthesis,  there are 3 alternatives:
//
// (A,((A,B,C)|((D|E|F),G))) 
//               ^ ^ ^
// For each of these tokens, the findLeftSymbols method will be called 
// recursively on the tokens to the left of the alternation token, and also
// on the token to the left of the closing parenthesis.  The results of each 
// call will be added to the symbols vector.  In this example, each token 
// happens to be a terminal symbol, and the recursion reaches the base case.
//
// 2-b) findRightSymbols()
//
// This method will check the token to the right of the current token.
// At the top level of recursion, this will be the token to the right of the 
// concatenation token. The token will be one of the following types: 
// (i) terminal symbol, (ii) non terminal symbol, (iii) open parenthesis, 
// (iv) kleene star, (v) kleene plus, (vi) epsilon. If any token other than 
// these six is encountered, then this isn't a valid rule and the algorithm 
// returns an error.
//
// 2-b-i) terminal symbol
//
// This is the base case of this recursive method. If a terminal symbol 
// is encountered, the findRightSymbols simply adds this symbol to the symbol 
// vector and returns the vector.
//
// 2-b-ii) non-terminal symbol
//
// If a non-terminal symbol is encountered, the findRightSymbols is called
// recursively on the first token of the rule referenced by the non-terminal
// symbol.  The result of this call to findRightSymbols is added to the
// symbol vector.
//
// 2-b-iii) open parenthesis
//
// When an open parenthesis is encountered, the algorithm looks for alternation
// at the nesting level opened by the parenthesis.  The findRightSymbols method
// is called recursively on the token to the right of each alternation symbol,
// and also on the token to the right of the opening parenthesis. If no 
// alternation is found, the method is only called recursively on the
// token to the right of the opening parenthesis.
//
// 2-b-iv) kleene star
//
// Kleene star indicates that the following rule segment is repeated zero or 
// more times.  First, the method will recursively call findRightSymbols on 
// the token following the Kleene star, the results of which are added to the
// symbol vector.  Next, we need to find a set of right symbols that 
// skip over the rule block following the kleene star.  The method will 
// search for a concatenation symbol at the same nesting level as the
// original concatenation symbol, and recursively call findRightSymbols
// on the token after this concatenation.  Any kleene star operator within
// the nesting level of the rule segment following the original kleene star
// is considered redundant.  No action is taken if these are encountered. For
// example:
//
// (A,*(B|*(C,D,E)|F),G)
//
// is equivalent to:
//
// (A,*(B|(C,D,E)|F),G)
//
// 2-b-v) kleene plus
//
// Kleene plus indicates that the following rule segment is repeated one
// or more times. We use a procedure similar to how we treated the kleene star,
// but we don't need to find a set of right symbols that skips over the
// rule segment following the kleene plus.  We just recursively call 
// findRightSymbols on the token following the Kleene plus, the results of 
// which are added to the symbol vector.
//
// 2-b-vi) epsilon 
//
// This is another base case of the findRightSymbols method.  If an epsilon 
// token is encountered, we simply add the default end rule name to the
// symbol vector.  This rule name is defined by the constant TERM_RULE_NAME
// in the ProductionRule class.
//
// 3) Creating Rules
//
// Once we get a set of right and left symbols, we create a set of rules
// that connects the left symbols to the right symbols.  For example if the set
// of left symbols contains {1} and the set of right symbols contains {2,3,4,RT}
// then the resulting rules would be:
//
// R1->1,R2
// R1->1,R3
// R1->1,R4
// R1->RT
//
// Where R2, R3, and R4 reference rules for the symbols 2,3, and 4,
// respectively. RT references the rule for an epsilon symbol:
//
// RT->(epsilon)
//
// If the rule under evaluation is a start rule, we call findRightSymbols on
// the first token of the rule, and reference the rule for each of the resulting
// symbols via a new start rule. For example, if the set of symbols returned is
// {1,2,3}, and the rule for each of these symbols is R1, R2, R3, respectively,
// then we would create a new set of start rules:
//
// RS->R1
// RS->R2
// RS->R3
//
// 4) Restore Original Terminal Symbol Names
//
// Finally, we iterate over the vector of pairs created in Step 1 and replace
// the temporary unique names with their original names.
//
RuleModel LanguageModelABNF::getRuleModel() {

  // overview: for every graph (i, j) stored in abnf_model_d.first() this
  // method (1) renames each terminal token to a unique integer name,
  // (2) walks every rule and collects left/right symbol sets around each
  // concatenation, kleene star and kleene plus token, (3) turns those
  // sets into normalized BNF rules, (4) adds the start rules, and
  // (5) maps the temporary names back to the original ones. the result
  // is assembled into a RuleModel together with abnf_model_d.second().
  //
  Vector< Vector<ProductionRuleSet> > bnf_rule_model;
  RuleModel bnf_model;

  // loop over levels
  //
  for (int32 i = 0; i < abnf_model_d.first().length(); i++) {

    Vector<ProductionRuleSet> bnf_graphs;

    // loop over graphs at level i
    //
    for (int32 j = 0; j < abnf_model_d.first()(i).length(); j++) {

      // preprocessing:  replace all symbols with a unique name.  the original
      // names will be restored when the BNF rules have been extracted.
      // the counter restarts at zero for every graph.
      //
      Long symbol = 0;
      ProductionRuleSet abnf_rules_orig = abnf_model_d.first()(i)(j);
      ProductionRuleSet abnf_rules;
      ProductionRuleSet bnf_rules;

      // each entry pairs (temporary unique name, original name)
      //
      Vector<Pair<SearchSymbol, SearchSymbol> > orig_symbol_chart;

      for (int32 k = 0; k < abnf_rules_orig.length(); k++) {

        // work on a copy of the rule so that abnf_rules_orig keeps the
        // original terminal names
        //
        ProductionRule current_rule = abnf_rules_orig(k);
        current_rule.gotoFirst();

        // loop over all tokens
        //
        do {

          if (current_rule.getType() == ProductionRuleTokenType::TERMINAL) {

            Pair<SearchSymbol, SearchSymbol> symbol_match;
            String new_name;
            new_name.concat(symbol++);

            // remember the mapping before overwriting the token value
            //
            symbol_match.assign(new_name, current_rule.getValue());
            orig_symbol_chart.concat(symbol_match);

            current_rule.setValue(new_name);
          }
        } while (current_rule.gotoNext());

        abnf_rules.concat(current_rule);
      }

      // temporarily store the renamed rules in the member model.
      // createStartRules below is handed abnf_model_d rather than
      // abnf_rules, so it presumably needs to see the renamed symbols
      // there. the original rules are put back once it has run.
      //
      abnf_model_d.first()(i)(j).assign(abnf_rules);

      // create normalized bnf rules
      //
      for (int32 k = 0; k < abnf_rules.length(); k++) {

        ProductionRule current_rule = abnf_rules(k);

        // position is the index of the current token within the rule;
        // nest_level counts the currently open parentheses
        //
        int32 position = 0, nest_level = 0;

        current_rule.gotoFirst();

        // loop over tokens in rule.  look for concatenation, kleene
        // star, and kleene plus
        //
        do {

          Vector<SearchSymbol> left_symbols;
          WeightedSymbols right_symbols;

          if (current_rule.getType() ==
              ProductionRuleTokenType::CONCATENATION) {

            // find left symbols
            //
            left_symbols = findLeftSymbols(abnf_rules, k,
                                           nest_level, position);

            if (debug_level_d > Integral::NONE) {
              left_symbols.debug(L"left_symbols");
            }

            // find right symbols
            //
            right_symbols = findRightSymbols(abnf_rules, k,
                                             nest_level, position,
                                             current_rule.getWeight());

            if (debug_level_d > Integral::NONE) {
              right_symbols.debug(L"right_symbols");
            }

          }
          else if (current_rule.getType() ==
                   ProductionRuleTokenType::KLEENE_STAR ||
                   current_rule.getType() ==
                   ProductionRuleTokenType::KLEENE_PLUS) {

            // find the first and last symbols in the rule segment following
            // the kleene star or plus. connecting the last symbols back to
            // the first symbols creates the loop.
            //
            right_symbols = findRightSymbols(abnf_rules, k,
                                             nest_level, position,
                                             current_rule.getWeight());

            int32 end_nest = findEndOfNestLevel(current_rule,
                                                position+1, RIGHT);

            left_symbols = findLeftSymbols(abnf_rules, k,
                                           nest_level, end_nest+1);

          }
          else if (current_rule.getType() ==
                   ProductionRuleTokenType::OPEN_PAREN) {
            nest_level++;
          }
          else if (current_rule.getType() ==
                   ProductionRuleTokenType::CLOSE_PAREN) {
            nest_level--;
          }

          // use the right and left symbols to build a set of
          // normalized BNF rules. for tokens other than concatenation and
          // kleene operators both symbol sets are still empty here.
          //
          buildNormBNFRules(left_symbols, right_symbols, bnf_rules);

          position++;
        } while (current_rule.gotoNext());

      }

      // create the BNF start rules
      //
      createStartRules(abnf_model_d, bnf_rules, i, j);

      // put the original ABNF rules back into the member model. without
      // this the stored model would keep the temporary unique names, and a
      // later call to this method would record those temporary names as
      // the "original" ones and lose the real terminal names.
      //
      abnf_model_d.first()(i)(j).assign(abnf_rules_orig);

      // restore the original symbol names
      //
      restoreSymbols(bnf_rules, orig_symbol_chart);

      bnf_graphs.concat(bnf_rules);
    }

    bnf_rule_model.concat(bnf_graphs);
  }

  bnf_model.assign(bnf_rule_model, abnf_model_d.second());

  if (debug_level_d > Integral::NONE) {
    bnf_model.debug(L"bnf_model");
  }

  return bnf_model;
}