// file: $isip/class/pr/LanguageModelABNF/lmabnf_05.cc // version: $Id: lmabnf_05.cc 10495 2006-03-15 21:37:57Z may $ // // isip include files // #include "LanguageModelABNF.h" // method: getRuleModel // // arguments: // none // // return: a RuleModel object containing normalized BNF rules // // Normalized BNF rules can be in one of the following forms: // // (rule)->(non_terminal) // (rule)->(terminal),(non_terminal) // (rule)->(epsilon) // // Any graph can be represented by a combination of these three rule types. // // 1) Preprocessing/Renaming // // Each terminal symbol in the set of rules is considered unique but many might // have the same name. The algorithm will first loop through all the rules // and find the terminal symbols, renaming each of them to something unique. // This can be implemented with a simple integer that increments each time // a terminal symbol is encountered. As each symbol is renamed, the original // name and new name will be added to a vector of pair objects so that the // original names can be restored after the conversion. Conveniently, we can // name the rules that reference each of these unique symbols based on whatever // their unique name is. For example, the symbols 1, 2, 3 could be referenced // by the rules R1, R2, R3, respectively. // // This preprocessing step will only take place in the LanguageModelABNF // and will never be seen by the user. Although it seems a bit messy, it's // necessary since it eliminates naming restrictions when creating graphs. // // 2) Create set of normalized BNF rules by iteratively searching for connected // symbols (this means the same thing as two nodes connected by an arc in IHD), // kleene star tokens, and kleene plus tokens. // // Starting with the first rule, search for all concatenation, kleene star, // and kleene plus tokens. As the algorithm steps over each token, it will // keep up with its nesting level.
For example: // // 0(1(2((5)4(5)))) // // The integers represent the nesting level. // // Each time a concatenation token is encountered, a list of left and right // symbols must be found. Once this list is found, a set of normalized BNF // rules is created to connect each left symbol with each right symbol. // The number of rules created for each concatenation symbol will be n*m // where n is the number of left symbols, and m is the number of right symbols. // // Each time a kleene star or a kleene plus is encountered, the last symbols of // the rule segment following the token must be connected to the first symbols // of the segment. // // (A,*(B,C,D),E) // // An arc will be drawn from symbol D to symbol B. Also, // // (A,+(B|C|D),E) // // The symbols B, C, and D will all have self loops. // // At this point in the algorithm, kleene star and kleene plus tokens are // treated the same. To find which symbols connect to which, we find the // first and last token of the rule segment following the kleene token. In // the example above, these tokens would be: // // (A,+(B|C|D),E) // ^ ^ // From these tokens, we find all right symbols starting from the // left token, and all left symbols starting from the right token. The // same method will be used to find right and left symbols as was used // for handling concatenation. // // The following procedure will be used to find right and left symbols: // // At the highest level, there will be two methods: findLeftSymbols() and // findRightSymbols(). Each will return a vector of symbols. // // 2-a) findLeftSymbols(). // // This method will start by checking the current symbol. At the top level // of recursion, this will be the token to the left of the concatenation token. // The token will be one of the following types: (i)terminal symbol, // (ii)non terminal symbol, or (iii)closing parenthesis. If any token other // than these three is encountered, then this isn't a valid rule and the // algorithm returns an error.
Since ABNF rules use prefix notation for // kleene star and kleene plus tokens, we won't encounter these to the left // of a concatenation. // // 2-a-i) terminal symbol // // This is the base case of this recursive method. If a terminal symbol // is encountered, the findLeftSymbols simply adds this symbol to the symbol // vector and returns the vector. // // 2-a-ii) non_terminal symbol // // If a non terminal symbol is encountered, findLeftSymbols is called // recursively on the last token of the rule referenced by the non_terminal. // The resulting vector of findLeftSymbols is concatenated with the current // vector and returned. // // 2-a-iii) closing parenthesis // // This is the most complicated of the three cases since parentheses may // provide crucial nesting and order-of-operations information, or they // may provide no information at all (they could be redundant). For example: // // (A,((A,B,C)|((D|E|F),G))) // // ((A,((A),(B),(C))|((((D)|(E)|(F)),(G))))) // // These two rules are equivalent, but the second contains lots of // redundant parentheses. // // When a closing parenthesis is encountered, the algorithm must look for // alternation at the nesting level closed by the parenthesis. For example: // // (A,((A,B,C)|((D|E|F),G))) // ^ // At the nesting level closed by this parenthesis, there are 3 alternatives: // // (A,((A,B,C)|((D|E|F),G))) // ^ ^ ^ // For each of these tokens, the findLeftSymbols method will be called // recursively on the tokens to the left of the alternation token, and also // on the token to the left of the closing parenthesis. The results of each // call will be added to the symbols vector. In this example, each token // happens to be a terminal symbol, and the recursion reaches the base case. // // 2-b) findRightSymbols() // // This method will check the token to the right of the current token. // At the top level of recursion, this will be the token to the right of the // concatenation token.
The token will be one of the following types: // (i)terminal symbol, (ii) non terminal symbol, (iii) open parenthesis, // (iv) kleene star, (v) kleene plus, (vi) epsilon. If any token other than // these six is encountered, then this isn't a valid rule and the algorithm // returns an error. // // 2-b-i) terminal symbol // // This is the base case of this recursive method. If a terminal symbol // is encountered, the findRightSymbols simply adds this symbol to the symbol // vector and returns the vector. // // 2-b-ii) non-terminal symbol // // If a non-terminal symbol is encountered, the findRightSymbols is called // recursively on the first token of the rule referenced by the non-terminal // symbol. The result of this call to findRightSymbols is added to the // symbol vector. // // 2-b-iii) open parenthesis // // When an open parenthesis is encountered, the algorithm looks for alternation // at the nesting level opened by the parenthesis. The findRightSymbols method // is called recursively on the token to the right of each alternation symbol, // and also on the token to the right of the opening parenthesis. If no // alternation is found, the method is only called recursively on the // token to the right of the opening parenthesis. // // 2-b-iv) kleene star // // Kleene star indicates that the following rule segment is repeated zero or // more times. First, the method will recursively call findRightSymbols on // the token following the Kleene star, the results of which are added to the // symbol vector. Next, we need to find a set of right symbols that // skip over the rule block following the kleene star. The method will // search for a concatenation symbol at the same nesting level as the // original concatenation symbol, and recursively call findRightSymbols // on the token after this concatenation. Any kleene star operator within // the nesting level of the rule segment following the original kleene star // is considered redundant.
No action is taken if these are encountered. For // example: // // (A,*(B|*(C,D,E)|F),G) // // is equivalent to: // // (A,*(B|(C,D,E)|F),G) // // 2-b-v) kleene plus // // Kleene plus indicates that the following rule segment is repeated one // or more times. We use a procedure similar to how we treated the kleene star, // but we don't need to find a set of right symbols that skips over the // rule segment following the kleene plus. We just recursively call // findRightSymbols on the token following the Kleene plus, the results of // which are added to the symbol vector. // // 2-b-vi) epsilon // // This is another base case of the findRightSymbols method. If an epsilon // token is encountered, we simply add the default end rule name to the // symbol vector. This rule name is defined by the constant TERM_RULE_NAME // in the ProductionRule class. // // 3) Creating Rules // // Once we get a set of right and left symbols, we create a set of rules // that connects the left symbols to the right symbols. For example if the set // of left symbols contains {1} and the set of right symbols contains {2,3,4,RT} // then the resulting rules would be: // // R1->1,R2 // R1->1,R3 // R1->1,R4 // R1->RT // // Where R2, R3, and R4 reference rules for the symbols 2,3, and 4, // respectively. RT references the rule for an epsilon symbol: // // RT->(epsilon) // // If the rule under evaluation is a start rule, we call findRightSymbols on // the first token of the rule, and reference the rule for each of the resulting // symbols via a new start rule. For example, if the set of symbols returned is // {1,2,3}, and the rule for each of these symbols is R1, R2, R3, respectively, // then we would create a new set of start rules: // // RS->R1 // RS->R2 // RS->R3 // // 4) Restore Original Terminal Symbol Names // // Finally, we iterate over the vector of pairs created in Step 1 and replace // the temporary unique names with their original names.
// RuleModel LanguageModelABNF::getRuleModel() { Vector< Vector > bnf_rule_model; RuleModel bnf_model; // loop over levels // for (int32 i = 0; i < abnf_model_d.first().length(); i++) { Vector bnf_graphs; // loop over graphs at level i // for (int32 j = 0; j < abnf_model_d.first()(i).length(); j++) { // preprocessing: replace all symbols with a unique name. the original // names will be restored when the BNF rules have been extracted // Long symbol = 0; ProductionRuleSet abnf_rules_orig = abnf_model_d.first()(i)(j); ProductionRuleSet abnf_rules; ProductionRuleSet bnf_rules; Vector > orig_symbol_chart; for (int32 k=0; k < abnf_rules_orig.length(); k++) { ProductionRule current_rule = abnf_rules_orig(k); current_rule.gotoFirst(); // loop over all tokens // do { if (current_rule.getType() == ProductionRuleTokenType::TERMINAL) { Pair symbol_match; String new_name; new_name.concat(symbol++); symbol_match.assign(new_name, current_rule.getValue()); orig_symbol_chart.concat(symbol_match); current_rule.setValue(new_name); } } while (current_rule.gotoNext()); abnf_rules.concat(current_rule); } abnf_model_d.first()(i)(j).assign(abnf_rules); // create normalized bnf rules // for (int32 k=0; k < abnf_rules.length(); k++) { ProductionRule current_rule = abnf_rules(k); int32 position = 0, nest_level = 0; current_rule.gotoFirst(); // loop over tokens in rule. 
look for concatenation, kleene // star, and kleene plus // do { Vector left_symbols; WeightedSymbols right_symbols; if (current_rule.getType() == ProductionRuleTokenType::CONCATENATION) { // find left symbols // left_symbols = findLeftSymbols(abnf_rules, k, nest_level, position); if (debug_level_d > Integral::NONE) { left_symbols.debug(L"left_symbols"); } // find right symbols // right_symbols = findRightSymbols(abnf_rules, k, nest_level, position, current_rule.getWeight()); if (debug_level_d > Integral::NONE) { right_symbols.debug(L"right_symbols"); } } else if (current_rule.getType() == ProductionRuleTokenType::KLEENE_STAR || current_rule.getType() == ProductionRuleTokenType::KLEENE_PLUS) { // find the first and last symbols in the rule segment following // the kleene star or plus // right_symbols = findRightSymbols(abnf_rules, k, nest_level, position, current_rule.getWeight()); int32 end_nest = findEndOfNestLevel(current_rule, position+1, RIGHT); left_symbols = findLeftSymbols(abnf_rules, k, nest_level, end_nest+1); } else if (current_rule.getType() == ProductionRuleTokenType::OPEN_PAREN) { nest_level++; } else if (current_rule.getType() == ProductionRuleTokenType::CLOSE_PAREN) { nest_level--; } // use the right and left symbols to build a set of // normalized BNF rules // buildNormBNFRules(left_symbols, right_symbols, bnf_rules); position++; } while (current_rule.gotoNext()); } // create the BNF start rules // createStartRules(abnf_model_d, bnf_rules, i, j); // restore the original symbol names // restoreSymbols(bnf_rules, orig_symbol_chart); bnf_graphs.concat(bnf_rules); } bnf_rule_model.concat(bnf_graphs); } bnf_model.assign(bnf_rule_model, abnf_model_d.second()); if (debug_level_d > Integral::NONE) { bnf_model.debug(L"bnf_model"); } return bnf_model; }