// file: $isip/class/pr/LanguageModelJSGF/lmjsgf_09.cc // // system include files // #include // isip include files // #include "LanguageModelJSGF.h" // method: convertJSGFtoABNF // // arguments: // // return: // ProductionRuleSet LanguageModelJSGF::convertJSGFtoABNF(Vector grammar_a) { ProductionRuleSet abnf_grammar; String start_rule_name; int32 token_index = 0; // iterate over all tokens and construct rules // while(token_index < grammar_a.length()) { ProductionRule current_rule; // find the next rule_name token // while(token_index < grammar_a.length() && grammar_a(token_index).getTokenType() != JSGFToken::RULE_NAME) { token_index++; } if (token_index < grammar_a.length()) { current_rule.clear(); String rule_name(grammar_a(token_index).rulename_d); current_rule.setRuleName(rule_name); token_index++; } float32 weight = 0; // loop over tokens until a ';' operator is encountered // while(token_index < grammar_a.length() && grammar_a(token_index).operator_d(0).ne(L";") && current_rule.getRuleName().length() > 0) { if (grammar_a(token_index).getTokenType() == JSGFToken::WEIGHT) { weight = grammar_a(token_index).weight_d; token_index++; } else if (grammar_a(token_index).getTokenType() == JSGFToken::OPERATOR) { if (grammar_a(token_index).operator_d(0).eq(L"(")) { current_rule.append(ProductionRuleTokenType::OPEN_PAREN); token_index++; } else if (grammar_a(token_index).operator_d(0).eq(L")")) { current_rule.append(ProductionRuleTokenType::CLOSE_PAREN); token_index++; } else if (grammar_a(token_index).operator_d(0).eq(L"|")) { current_rule.append(ProductionRuleTokenType::ALTERNATION); token_index++; } else { token_index++; } } else if (grammar_a(token_index).getTokenType() == JSGFToken::TERMINAL) { if (grammar_a(token_index).terminal_d.eq(START_SYMBOL)) { current_rule.setRuleType(ProductionRule::START); } else if (grammar_a(token_index).terminal_d.eq(TERM_SYMBOL)) { current_rule.append(ProductionRuleTokenType::OPEN_PAREN, L"", weight); current_rule.append(ProductionRuleTokenType::NON_TERMINAL, grammar_a(token_index).terminal_d); current_rule.append(ProductionRuleTokenType::CLOSE_PAREN); ProductionRule epsilon_rule; epsilon_rule.setRuleName(grammar_a(token_index).terminal_d); epsilon_rule.append(ProductionRuleTokenType::EPSILON); // check to see if we already added an epsilon rule. if not // add it to the productionruleset // if (!abnf_grammar.contains(&epsilon_rule)) { abnf_grammar.concat(epsilon_rule); } } else { current_rule.append(ProductionRuleTokenType::TERMINAL, grammar_a(token_index).terminal_d); current_rule.append(ProductionRuleTokenType::CONCATENATION); } token_index++; } else if (grammar_a(token_index).getTokenType() == JSGFToken::RULE_NAME) { // don't do anything if this is the start rule reference. We've already // handled this by making this rule a START rule // if (grammar_a(token_index).rulename_d.ne(START_SYMBOL_REFERENCE)) { current_rule.append(ProductionRuleTokenType::OPEN_PAREN, L"", weight); current_rule.append(ProductionRuleTokenType::NON_TERMINAL, grammar_a(token_index).rulename_d); current_rule.append(ProductionRuleTokenType::CLOSE_PAREN); token_index++; } } else { token_index++; } } abnf_grammar.concat(current_rule); } return abnf_grammar; } // method: convertBNFtoJSGF // // arguments: // // return: // Vector LanguageModelJSGF::convertBNFtoJSGF(ProductionRuleSet rules_a) { Vector grammar; JSGFToken current_token; Vector graph_symbols; String term_rulename; // we first need to determine the unique nodes in this graph. this is accomplished // by finding unique pairs of rule_names and terminal tokens // Vector< Pair > nodes; // we also need to keep up with the nodes that have already been visited. // Vector< Pair > nodes_visited; // loop over all rules in this graph // for (int32 i = 0; i < rules_a.length(); i++) { rules_a(i).gotoFirst(); do { if (rules_a(i).length() > 0) { SearchSymbol symbol = rules_a(i).getValue(); // try to identify unique nodes by finding unique pairs of rule // names and terminal tokens. this is an attempt to minimize // the graph as much as possible. // Pair node(rules_a(i).getRuleName(), symbol); if (rules_a(i).getType() == ProductionRuleTokenType::TERMINAL && !nodes.contains(&node)) { nodes.concat(node); } } } while (rules_a(i).gotoNext()); } // we need to find which rule is the epsilon rule // for (int32 i = 0; i < rules_a.length(); i++) { rules_a(i).gotoFirst(); if (rules_a(i).length() == 1 && rules_a(i).getType() == ProductionRuleTokenType::EPSILON) { term_rulename.assign(rules_a(i).getRuleName()); } } // all rules will be converted to JSGFTokens and added to a single // vector of JSGFTokens. // // add the appropriate header information // - find start rule and set the grammar name to the name of the start rule // - add the appropriate keywords, and JSGF headers // // first token is the header token // current_token.setTokenType(JSGFToken::HEADER); current_token.setHeader(PARAM_JSGF_VERSION); grammar.concat(current_token); current_token.clear(); // next token is the grammar keyword and declaration // current_token.setTokenType(JSGFToken::GRAMMAR_NAME); current_token.setKeyword(KEYWORD_GRAMMAR); String grammar_definition(GRAMMAR_DEF_BASE); String grammar_name; // look for start rule and set grammar definition // int32 index = -1; do { index++; if (rules_a(index).getRuleType() == ProductionRule::START || rules_a(index).length() == 0) { grammar_name.assign(rules_a(index).getRuleName()); } } while(rules_a(index).getRuleType() != ProductionRule::START && index < rules_a.length()-1); grammar_definition.concat(grammar_name); current_token.setGrammarName(grammar_definition); grammar.concat(current_token); current_token.clear(); // close grammar definition with semicolon operator // current_token.setTokenType(JSGFToken::OPERATOR); current_token.operator_d(0).assign(OPERATOR_SEMICOLON); grammar.concat(current_token); current_token.clear(); // add the public keyword // current_token.setTokenType(JSGFToken::KEYWORD); current_token.setKeyword(KEYWORD_PUBLIC); grammar.concat(current_token); current_token.clear(); // set the rulename token // current_token.setTokenType(JSGFToken::RULE_NAME); current_token.setRuleName(grammar_name); current_token.operator_d(OPERATOR_OPEN_POS) = OPERATOR_OPENANGLEBRACKET; current_token.operator_d(OPERATOR_CLOSE_POS) = OPERATOR_CLOSEANGLEBRACKET; grammar.concat(current_token); current_token.clear(); // add the equal operator // current_token.setTokenType(JSGFToken::OPERATOR); current_token.operator_d(OPERATOR_OPEN_POS) = OPERATOR_EQUALS; grammar.concat(current_token); current_token.clear(); // we will first define the JSGF start rule if it's not a dummy // if (rules_a.length() > 1) { current_token.setTokenType(JSGFToken::TERMINAL); current_token.setTerminal(START_SYMBOL); grammar.concat(current_token); current_token.clear(); } Vector temp_rules; // the first JSGF rule following the public keyword needs to be the start rule // for (int32 i = 0; i < rules_a.length(); i++) { if (rules_a(i).getRuleType() == ProductionRule::START && rules_a(i).getRuleName().eq(grammar_name)) { temp_rules.concat(rules_a(i)); rules_a.deleteRange(i--,1); } } if (temp_rules.length() > 1) { current_token.setTokenType(JSGFToken::OPERATOR); current_token.operator_d(OPERATOR_OPEN_POS) = OPERATOR_OPENPARENS; grammar.concat(current_token); current_token.clear(); } for (int32 i = 0; i < temp_rules.length(); i++) { temp_rules(i).gotoFirst(); ProductionRule ref_rule; String rulename; float32 weight = 0; Pair node; current_token.setTokenType(JSGFToken::OPERATOR); current_token.operator_d(OPERATOR_OPEN_POS) = OPERATOR_OPENPARENS; grammar.concat(current_token); current_token.clear(); if (temp_rules(i).length() == 1 && temp_rules(i).getType() == ProductionRuleTokenType::NON_TERMINAL) { ref_rule = findRule(temp_rules(i).getValue(), rules_a); weight = temp_rules(i).getWeight(); } else { Error::handle(L"LanguageModelJSGF", L"convertJSGFtoABNF - invalid start rule format", Error::ERROR, __FILE__, __LINE__); } current_token.setTokenType(JSGFToken::WEIGHT); current_token.operator_d(OPERATOR_OPEN_POS) = OPERATOR_FWDSLASH; current_token.operator_d(OPERATOR_CLOSE_POS) = OPERATOR_FWDSLASH; current_token.setWeight(weight); grammar.concat(current_token); current_token.clear(); ref_rule.gotoFirst(); node.assign(ref_rule.getRuleName(), ref_rule.getValue()); rulename.assign(RULE_NAME_BASE); rulename.concat(nodes.first(node)); current_token.setTokenType(JSGFToken::RULE_NAME); current_token.operator_d(OPERATOR_OPEN_POS) = OPERATOR_OPENANGLEBRACKET; current_token.operator_d(OPERATOR_CLOSE_POS) = OPERATOR_CLOSEANGLEBRACKET; current_token.setRuleName(rulename); grammar.concat(current_token); current_token.clear(); current_token.setTokenType(JSGFToken::OPERATOR); current_token.operator_d(OPERATOR_OPEN_POS) = OPERATOR_CLOSEPARENS; grammar.concat(current_token); current_token.clear(); if (i < temp_rules.length()-1) { current_token.setTokenType(JSGFToken::OPERATOR); current_token.operator_d(OPERATOR_OPEN_POS) = OPERATOR_ALTERNATION; grammar.concat(current_token); current_token.clear(); } } if (temp_rules.length() > 1) { current_token.setTokenType(JSGFToken::OPERATOR); current_token.operator_d(OPERATOR_OPEN_POS) = OPERATOR_CLOSEPARENS; grammar.concat(current_token); current_token.clear(); } current_token.setTokenType(JSGFToken::OPERATOR); current_token.operator_d(0).assign(OPERATOR_SEMICOLON); grammar.concat(current_token); current_token.clear(); // iterate over rules. for each rule, check the remaining rules to find rules // with the same rulename and nonterminal. when these are found, and the // appropriate JSGF tokens have been added to the token vector, remove them // from the rules_a list. this will be implemented with an outerloop to loop // over rules, and an inner loop to loop over the remaining rules and find // matches. // for (int32 i = 0; i < rules_a.length(); i++) { temp_rules.clear(); ProductionRule current_rule = rules_a(i); Pair node; node.assign(current_rule.getRuleName(), current_rule.getValue()); if (current_rule.length() == 3 && nodes_visited.first(node) < 0) { nodes_visited.concat(node); String rulename(RULE_NAME_BASE); rulename.concat(nodes.first(node)); current_token.setTokenType(JSGFToken::RULE_NAME); current_token.operator_d(OPERATOR_OPEN_POS) = OPERATOR_OPENANGLEBRACKET; current_token.operator_d(OPERATOR_CLOSE_POS) = OPERATOR_CLOSEANGLEBRACKET; current_token.setRuleName(rulename); grammar.concat(current_token); current_token.clear(); current_token.setTokenType(JSGFToken::OPERATOR); current_token.operator_d(OPERATOR_OPEN_POS) = OPERATOR_EQUALS; grammar.concat(current_token); current_token.clear(); current_token.setTokenType(JSGFToken::TERMINAL); current_token.setTerminal(current_rule.getValue()); grammar.concat(current_token); current_token.clear(); // the first JSGF rule following the public keyword needs to be the start rule // for (int32 j = i; j < rules_a.length(); j++) { if (rules_a(j).getRuleName().eq(current_rule.getRuleName())) { temp_rules.concat(rules_a(j)); } } if (temp_rules.length() > 1) { current_token.setTokenType(JSGFToken::OPERATOR); current_token.operator_d(OPERATOR_OPEN_POS) = OPERATOR_OPENPARENS; grammar.concat(current_token); current_token.clear(); } for (int32 i = 0; i < temp_rules.length(); i++) { temp_rules(i).gotoFirst(); ProductionRule ref_rule; String rulename; float32 weight; Pair node; current_token.setTokenType(JSGFToken::OPERATOR); current_token.operator_d(OPERATOR_OPEN_POS) = OPERATOR_OPENPARENS; grammar.concat(current_token); current_token.clear(); if (temp_rules(i).length() == 3) { temp_rules(i).gotoNext(); weight = temp_rules(i).getWeight(); temp_rules(i).gotoNext(); if(temp_rules(i).getType() == ProductionRuleTokenType::NON_TERMINAL) { ref_rule = findRule(temp_rules(i).getValue(), rules_a); } else { Error::handle(L"LanguageModelJSGF", L"convertJSGFtoABNF - invalid rule format", Error::ERROR, __FILE__, __LINE__); } current_token.setTokenType(JSGFToken::WEIGHT); current_token.operator_d(OPERATOR_OPEN_POS) = OPERATOR_FWDSLASH; current_token.operator_d(OPERATOR_CLOSE_POS) = OPERATOR_FWDSLASH; current_token.setWeight(weight); grammar.concat(current_token); current_token.clear(); if (ref_rule.getRuleName().eq(term_rulename)) { rulename.assign(PARAM_JSGF_TERM_SYMBOL); } else { ref_rule.gotoFirst(); node.assign(ref_rule.getRuleName(), ref_rule.getValue()); rulename.assign(RULE_NAME_BASE); rulename.concat(nodes.first(node)); } current_token.setTokenType(JSGFToken::RULE_NAME); current_token.operator_d(OPERATOR_OPEN_POS) = OPERATOR_OPENANGLEBRACKET; current_token.operator_d(OPERATOR_CLOSE_POS) = OPERATOR_CLOSEANGLEBRACKET; current_token.setRuleName(rulename); current_token.setRuleName(rulename); grammar.concat(current_token); current_token.clear(); current_token.setTokenType(JSGFToken::OPERATOR); current_token.operator_d(OPERATOR_OPEN_POS) = OPERATOR_CLOSEPARENS; grammar.concat(current_token); current_token.clear(); if (i < temp_rules.length()-1) { current_token.setTokenType(JSGFToken::OPERATOR); current_token.operator_d(OPERATOR_OPEN_POS) = OPERATOR_ALTERNATION; grammar.concat(current_token); current_token.clear(); } else if (temp_rules.length() > 1) { current_token.setTokenType(JSGFToken::OPERATOR); current_token.operator_d(OPERATOR_OPEN_POS) = OPERATOR_CLOSEPARENS; grammar.concat(current_token); current_token.clear(); } } } current_token.setTokenType(JSGFToken::OPERATOR); current_token.operator_d(0).assign(OPERATOR_SEMICOLON); grammar.concat(current_token); current_token.clear(); } } return grammar; } // method: findRule // // arguments: // // return: // ProductionRule LanguageModelJSGF::findRule(String ruleref_a, ProductionRuleSet rules_a) { ProductionRule def; for (int32 i = 0; i < rules_a.length(); i++) { if (rules_a(i).getRuleName().eq(ruleref_a)) { rules_a(i).gotoFirst(); if (rules_a(i).getType() == ProductionRuleTokenType::TERMINAL || rules_a(i).getType() == ProductionRuleTokenType::EPSILON) { return rules_a(i); } else if (rules_a(i).getType() != ProductionRuleTokenType::EPSILON) { return findRule(rules_a(i).getValue(), rules_a); } } } return def; }