// file: $isip/class/pr/LanguageModelXML/lmxml_06.cc // version: $Id: lmxml_06.cc 10542 2006-04-04 18:35:20Z may $ // ISIP include files // #include "LanguageModelXML.h" #include #include // method: parseXMLGrammar // // arguments: // String grammar: (input) a String containing an XML format grammar // // return: a bool8 indicating status // // this method parses an XML grammar // bool8 LanguageModelXML::parseXMLGrammar(String grammar_a, Vector& sub_symbol_list_a, Vector& token_vector_a, String& grammar_name_a) { // this method requires the XMLParser, so check for expat // #if defined(HAVE_EXPAT) grammar_name_a.clear(); sub_symbol_list_a.clear(); token_vector_a.clear(); // declare a parser to retrieve the grammar from the file // XMLParser xp; // make sure the parser is operating at the same debug level // as LanguageModelXML // xp.setDebug(debug_level_d); // tell the parser which tokens LanguageModelXML will allow. // xp.setValidTokenValues(getHandledValues()); if(debug_level_d > Integral::BRIEF) { Console::put(L"Parsing the grammar..."); } // parse the xml grammar within the string // xp.parseXML(grammar_a); // get the grammar from the parser // token_vector_a.assign(xp.getTokenVector()); // find all tokens and add them to symbol list // for (int32 i=0; iABNF conversion // RuleModel LanguageModelXML::getABNFRuleModel() { // clear inherited rule model // RuleModel ret_model; if (debug_level_d>Integral::BRIEF) { Console::put(L"performing XML->ABNF conversion"); Console::increaseIndention(); } int32 i, j, k; // perform conversion for every grammar in every level // for (i=0; iIntegral::BRIEF) { String out_string(L"level "); out_string.concat((Long)i); Console::put(out_string); Console::increaseIndention(); } Vector< ProductionRuleSet > curr_level; // loop through each grammar in this level // for (j=0; jIntegral::BRIEF) { String out_string(L"graph "); out_string.concat((Long)j); Console::put(out_string); Console::increaseIndention(); } ProductionRuleSet curr_graph; // clear members having to do with XML rule processing // clearXMLGrammar(); // partition this particular grammar into rules // partitionGrammar(grammars_d(i)(j)); // figure out start production // if (!grammar_start_tag_d.getAttributeValue(ROOT).eq(L"")) { // if it's specified in the file, use that // ProductionRule start_rule; start_rule.setRuleType(ProductionRule::START); start_rule.setRuleName(L"S"); start_rule.append(ProductionRuleTokenType::NON_TERMINAL, grammar_start_tag_d.getAttributeValue(ROOT)); curr_graph.concat(start_rule); } else { // if not, use the first rule // String root_rule_name; for (k=0; kIntegral::BRIEF) { Console::put(L"preprocessing"); Console::increaseIndention(); } // preprocess every rule // for (k=0; kIntegral::BRIEF) { Console::decreaseIndention(); Console::put(L"performing conversion"); } // do conversion for every rule // for (k=0; kIntegral::BRIEF) { Console::put(L"ABNF equivalent:"); Console::increaseIndention(); Console::put(curr_graph(curr_graph.length()-1).getRule()); Console::decreaseIndention(); } } // add this graph to the level // curr_level.concat(curr_graph); if (debug_level_d>Integral::BRIEF) { Console::decreaseIndention(); } } // add this level to the rule model // ret_model.first().concat(curr_level); if (debug_level_d>Integral::BRIEF) { Console::decreaseIndention(); } } // don't forget the IHD information // ret_model.second().assign(hg_d); if (debug_level_d>Integral::BRIEF) { Console::decreaseIndention(); } return ret_model; } // method: getRuleModel // // arguments: // none // // return: a RuleModel corresponding to the XML grammar in memory // // this method initiates ABNF->BNF conversion // RuleModel LanguageModelXML::getRuleModel() { abnf_model_d.assign(getABNFRuleModel()); // now we have an all ABNF RuleModel. here's the // tricky part: we instantiate a LanguageModelABNF // object to perform the rest of the conversion to // BNF // LanguageModelABNF lm_abnf; lm_abnf.setABNFRuleModel(abnf_model_d); // since redundancies are introduced when converting // from an any format to xml, it is necessary to // run a minimization routine in order to remove // those redundancies. this accomplished via the // the minimize method of the LanguageModelBNF class // LanguageModelBNF lm_bnf; lm_bnf.setRuleModel(lm_abnf.getRuleModel()); lm_bnf.minimizeGraph(); // return the minimized rule model // return lm_bnf.getRuleModel(); } // method: convertXMLtoABNF // // arguments: // Vector token_vector_a: vector of XML tokens to convert // // return: a ProductionRule corresponding to the XML grammar in memory // // this recursive method performs XML->ABNF conversion // ProductionRule LanguageModelXML::convertXMLtoABNF(Vector token_vector_a) { ProductionRule temp; // recursive base case // if (token_vector_a.length()==0) { return temp; } // if it's a start tag, see what kind it is and process // it accordingly // if (token_vector_a(0).isA(XMLToken::START_TAG)) { String token_value(token_vector_a(0).getValue()); token_value.toLower(); // if it's a start rule tag, add the grammar name to // the ABNF production and recurse on what's inside // the rule tags // if (token_value.eq(RULE)) { String rule_name(token_vector_a(0).getAttributeValue(ID)); temp.setRuleName(rule_name); temp.append(ProductionRuleTokenType::OPEN_PAREN); appendProduction(temp, convertXMLtoABNF(getRange(1, findMatchingEndTag(token_vector_a, 0), token_vector_a))); temp.append(ProductionRuleTokenType::CLOSE_PAREN); } // if it's a start item tag, add some parenthesis, // recurse on what's inside the item tags, and add // the appropriate kleene closure if repeat options // are present // else if (token_value.eq(ITEM)) { // if it's an item, figure out if we're repeating // String repeat_prob(token_vector_a(0).getAttributeValue(REPEAT_PROB)); String repeat(token_vector_a(0).getAttributeValue(REPEAT)); // if necessary, add kleene closure // if (!repeat.eq(String::NULL_STRING)) { if (!repeat_prob.eq(String::NULL_STRING)) { float32 repeat_prob_float; repeat_prob.get(repeat_prob_float); temp.append(ProductionRuleTokenType::KLEENE_PLUS, L"", repeat_prob_float); } else { temp.append(ProductionRuleTokenType::KLEENE_PLUS); } } // append open paren // temp.append(ProductionRuleTokenType::OPEN_PAREN); // recurse on stuff inside item tags // appendProduction(temp, convertXMLtoABNF(getRange(1, findMatchingEndTag(token_vector_a, 0), token_vector_a))); // append close paren // temp.append(ProductionRuleTokenType::CLOSE_PAREN); // if that's not the end, recurse and figure out the rest of the // rule // if (findMatchingEndTag(token_vector_a, 0)<(token_vector_a.length()-1)) { temp.append(ProductionRuleTokenType::CONCATENATION); appendProduction(temp, convertXMLtoABNF(getRange(findMatchingEndTag(token_vector_a, 0)+1, token_vector_a.length(), token_vector_a))); } } // if it's a start one-of tag, it's tricky. in this case, we // have to start an open parenthesis, find and recurse on all // alternatives, and make sure the appropriate weights are on // each of the alternative '|' symbols // else if (token_value.eq(ONE_OF)) { int32 next_item=findNextItem(0, token_vector_a); String next_weight(token_vector_a(next_item).getAttributeValue(WEIGHT)); String repeat_prob(token_vector_a(next_item).getAttributeValue(REPEAT_PROB)); String repeat(token_vector_a(next_item).getAttributeValue(REPEAT)); // if there is only one alternative, add kleene star now // if necessary // if (findNextItem(next_item, token_vector_a)==-1) { // if necessary, add kleene closure // if (!repeat.eq(String::NULL_STRING)) { if (!repeat_prob.eq(String::NULL_STRING)) { float repeat_prob_float; repeat_prob.get(repeat_prob_float); temp.append(ProductionRuleTokenType::KLEENE_PLUS, L"", repeat_prob_float); } else { temp.append(ProductionRuleTokenType::KLEENE_PLUS); } } } // append open paren // if (next_weight.eq(String::NULL_STRING)) { temp.append(ProductionRuleTokenType::OPEN_PAREN); } else { float next_weight_float; next_weight.get(next_weight_float); temp.append(ProductionRuleTokenType::OPEN_PAREN, L"", next_weight_float); } // if there is only one alternative, don't add the extra layer of // parenthesis // if (findNextItem(next_item, token_vector_a)==-1) { appendProduction(temp, convertXMLtoABNF(getRange(next_item+1, findMatchingEndTag(token_vector_a, next_item), token_vector_a))); temp.append(ProductionRuleTokenType::CLOSE_PAREN); } // if there is more than one alternative, handle // it accordingly // else { // if necessary, add kleene closure // if (!repeat.eq(String::NULL_STRING)) { if (!repeat_prob.eq(String::NULL_STRING)) { float repeat_prob_float; repeat_prob.get(repeat_prob_float); temp.append(ProductionRuleTokenType::KLEENE_PLUS, L"", repeat_prob_float); } else { temp.append(ProductionRuleTokenType::KLEENE_PLUS); } } next_weight=token_vector_a(next_item).getAttributeValue(WEIGHT); float next_weight_float; next_weight.get(next_weight_float); // add first alternative // temp.append(ProductionRuleTokenType::OPEN_PAREN, L"", next_weight_float); appendProduction(temp, convertXMLtoABNF(getRange(next_item+1, findMatchingEndTag(token_vector_a, next_item), token_vector_a))); temp.append(ProductionRuleTokenType::CLOSE_PAREN); // loop through and add all alternatives // while (findNextItem(next_item, token_vector_a)!=-1) { next_item=findNextItem(next_item, token_vector_a); next_weight=token_vector_a(next_item).getAttributeValue(WEIGHT); repeat_prob=token_vector_a(next_item).getAttributeValue(REPEAT_PROB); repeat=token_vector_a(next_item).getAttributeValue(REPEAT); // append alternation // if (next_weight.eq(String::NULL_STRING)) { temp.append(ProductionRuleTokenType::ALTERNATION); } else { temp.append(ProductionRuleTokenType::ALTERNATION); } // if necessary, add kleene closure // if (!repeat.eq(String::NULL_STRING)) { if (!repeat_prob.eq(String::NULL_STRING)) { float repeat_prob_float; repeat_prob.get(repeat_prob_float); temp.append(ProductionRuleTokenType::KLEENE_PLUS, L"", repeat_prob_float); } else { temp.append(ProductionRuleTokenType::KLEENE_PLUS); } } float next_weight_float; next_weight.get(next_weight_float); temp.append(ProductionRuleTokenType::OPEN_PAREN, L"", next_weight_float); appendProduction(temp, convertXMLtoABNF(getRange(next_item+1, findMatchingEndTag(token_vector_a, next_item), token_vector_a))); temp.append(ProductionRuleTokenType::CLOSE_PAREN); } temp.append(ProductionRuleTokenType::CLOSE_PAREN); } // if that's not the end, recurse and figure out the rest of the // rule // if (findMatchingEndTag(token_vector_a, 0)<(token_vector_a.length()-1)) { temp.append(ProductionRuleTokenType::CONCATENATION); appendProduction(temp, convertXMLtoABNF(getRange(findMatchingEndTag(token_vector_a, 0)+1, token_vector_a.length(), token_vector_a))); } } else { Error::handle(name(), L"convertXMLtoABNF encountered unhandlable token", Error::NOT_IMPLEM, __FILE__, __LINE__); } } // if it's CDATA, add a terminal and get on with it // else if (token_vector_a(0).isA(XMLToken::CDATA)) { String token_value(token_vector_a(0).getValue()); temp.append(ProductionRuleTokenType::TERMINAL, token_value); // if that's not the end, recurse and figure out the rest of the // rule // if (token_vector_a.length()>1) { temp.append(ProductionRuleTokenType::CONCATENATION); appendProduction(temp, convertXMLtoABNF(getRange(1, token_vector_a.length(), token_vector_a))); } } // if it's a START_AND_END_TAG, it's probably a ruleref. // add a nonterminal and get on with it // else if (token_vector_a(0).isA(XMLToken::START_AND_END_TAG)) { String token_value(token_vector_a(0).getValue()); token_value.toLower(); if (token_value.eq(RULEREF)) { String ref_name(token_vector_a(0).getAttributeValue(URI)); String special(token_vector_a(0).getAttributeValue(SPECIAL)); special.toUpper(); // check to see if it's a ruleref to null. we treat those // as epsilon transitions // if (special.eq(SPECIAL_NULL)) { temp.append(ProductionRuleTokenType::EPSILON); } // otherwise, handle it as a normal ruleref // else { ref_name.trim(L"#"); temp.append(ProductionRuleTokenType::NON_TERMINAL, ref_name); } // if that's not the end, recurse and figure out the rest of the // rule // if (token_vector_a.length()>1) { temp.append(ProductionRuleTokenType::CONCATENATION); appendProduction(temp, convertXMLtoABNF(getRange(1, token_vector_a.length(), token_vector_a))); } } } else { Error::handle(name(), L"convertXMLtoABNF encountered unhandlable token", Error::NOT_IMPLEM, __FILE__, __LINE__); } return temp; } // method: appendProduction // // arguments: // ProductionRule& prod1_a : first production rule // ProductionRule& prod2_a : second production rule // // return: a bool8 value indicating status // // this method appends the second production rule to the // first and picks the non-null LHS. // bool8 LanguageModelXML::appendProduction(ProductionRule& prod1_a, ProductionRule prod2_a){ if (prod2_a.gotoFirst()) { prod1_a.append(prod2_a.getType(), prod2_a.getValue(), prod2_a.getWeight()); while (prod2_a.gotoNext()) { prod1_a.append(prod2_a.getType(), prod2_a.getValue(), prod2_a.getWeight()); } } if (prod1_a.getRuleName().eq(String::NULL_STRING)) { prod1_a.setRuleName(prod2_a.getRuleName()); } return true; } // method: getRange // // arguments: // int32 start_a : starting index // int32 end_a : ending index // Vector vector_a : working vector // // return: Vector // // this method returns a vector containing the range // from start_a to end_a // Vector LanguageModelXML::getRange(int32 start_a, int32 end_a, Vector vector_a) { Vector ret_vector; int32 i; for (i=start_a; i vector_a : working vector // // return: int32 indicating index of next starting item tag // // this method finds the next starting item tag in // a one-of structure // int32 LanguageModelXML::findNextItem(int32 start_a, Vector vector_a) { int32 i=start_a; if (start_a>=vector_a.length()) { return (int32)-1; } String value(vector_a(start_a).getValue()); value.toLower(); int32 nesting=1; if (vector_a(start_a).isA(XMLToken::START_TAG) && value.eq(ITEM)) { nesting=1; } else if (vector_a(start_a).isA(XMLToken::START_TAG) && value.eq(ONE_OF) && start_a+1=vector_a.length()) { return (int32)-1; } else { return i; } } // method: preProcessXMLGrammar // // arguments: // Vector token_vector: (input/output) // a vector of xml tokens to // be checked for elements which require processing before // they can be given to the conversion process. These // events include: // // return: a bool8 indicating status // // this method performs any modifications to the // XML grammar (while maintaining correct XML structure) // that are necessary before it can be handled by the conversion // to digraph algorithm (such as duplicating bits of graph that // are supposed to occur M-N times) // bool8 LanguageModelXML::preProcessXMLGrammar(Vector& token_vector_a) { // this method requires the XMLParser, so check for expat // #if defined(HAVE_EXPAT) if (debug_level_d > Integral::BRIEF) { Console::put(L"Handling repeat requests..."); } // transform repeat attribute occurrences into a form that // our software can handle. Note that this is all still done // in legal XML. special rule references are used for dummy nodes. // handleRepeatRequests(token_vector_a); if (debug_level_d > Integral::DETAILED) { Console::put(L"The modified grammar:"); Console::put(XMLParser::toXML(token_vector_a)); } // if expat is not present, this method will generate an error // #else Error::handle(name(), L"preProcessXMLParser requires Expat", Error::NOT_IMPLEM, __FILE__, __LINE__); #endif // indicate success // return true; } // method: validateXMLGrammar // // arguments: none // // return: a bool8 indicating status // // this method insures that a parsed grammmar contains grammar // start and end tags. Later this would be a good place to add // a proper validating state machine. // bool8 LanguageModelXML::validateXMLGrammar() { // create a flag which will be set to false if any errors are // found in the XML grammar document. // bool8 valid_flag = true; // check for the gramamr start tag. It may be an 'empty' (start and end) // tag. // if(grammar_start_tag_d.getType() != XMLToken::START_TAG && grammar_start_tag_d.getType() != XMLToken::START_AND_END_TAG) { valid_flag = Error::handle(name(), L"validateXMLGrammar", ERR_GRAMMAR_NO_START, __FILE__, __LINE__); } // check for the grammar end tag // if(grammar_end_tag_d.getType() != XMLToken::END_TAG && grammar_start_tag_d.getType() != XMLToken::START_AND_END_TAG) { valid_flag = Error::handle(name(), L"validateXMLGrammar", ERR_GRAMMAR_NO_END, __FILE__, __LINE__); } // return the flag indicating whether there was an error // return valid_flag; } // method: partitionGrammar // // arguments: // Vector token_vector: (input) a vector of xml tokens to // be divided into rules // // return: a bool8 indicating status // // this method takes the raw vector of XML tokens and organizes it into // a grammar with rules // bool8 LanguageModelXML::partitionGrammar(Vector& token_vector_a) { // loop over all tokens in the parsed grammar // for(int32 i=0; i < token_vector_a.length(); i++) { // handle the start tags // if(token_vector_a(i).isA(XMLToken::START_TAG)) { handleStartElement(token_vector_a(i)); } // handle the end tags // else if (token_vector_a(i).isA(XMLToken::END_TAG)) { handleEndElement(token_vector_a(i)); } // handle CDATA tags // else if (token_vector_a(i).isA(XMLToken::CDATA)) { handleCharacterData(token_vector_a(i)); } // handle "empty" tags // else if (token_vector_a(i).isA(XMLToken::START_AND_END_TAG)) { // here may be placed hooks for any particular empty tags // that are expected. Also, the "content" attribute may // be handled. // handleStartAndEndElement(token_vector_a(i)); } } // indicate sucess // return true; } // method: handleStartAndEndElement // // arguments: // XMLToken xml_token: (input) an "empty" from the xml parser // // return: a bool8 indicating status // // This method is called every time an XML start_and_end tag is // encountered in the vector handed up by the parser. It replaces // the emtpy tag with appropriate subsitutions as specified by // the SRGS v1.0. // bool8 LanguageModelXML::handleStartAndEndElement (XMLToken xml_token_a) { // empty grammar tags are allowed // if(xml_token_a.isA(GRAMMAR)) { handleStartElement(xml_token_a); } // empty rule tags are not allowed // else if(xml_token_a.isA(RULE)) { return Error::handle(name(), L"handleStartAndEndElement", ERR_RULE_EMPTY, __FILE__, __LINE__); } // empty one-of tags are not allowed // else if(xml_token_a.isA(ONE_OF)) { return Error::handle(name(), L"handleStartAndEndElement", ERR_ONE_OF_EMPTY, __FILE__, __LINE__); } // empty item tags become // else if(xml_token_a.isA(ITEM)) { XMLToken ruleref_void; ruleref_void.init(XMLToken::START_AND_END_TAG, RULEREF); ruleref_void.addAttribute(SPECIAL, SPECIAL_VOID); handleStartElement(ruleref_void); } // "empty" tags are handled as though they are // start tags. They are not expected to have content, only // attributes // else if(xml_token_a.isA(RULEREF)) { handleStartElement(xml_token_a); } // indicate success // return true; } // method: handleStartElement // // arguments: // XMLToken xml_token: (input) a start tag from the xml parser // // return: a bool8 indicating status // // This method is called every time an XML start tag is encountered while // parsing. // bool8 LanguageModelXML::handleStartElement (XMLToken xml_token_a) { // if the tag's value is "grammar", and we are at // a rule depth of 0, store this as the grammar_start_tag // if(xml_token_a.getValue().eq(GRAMMAR, false) && rule_nesting_level_d == 0) { grammar_start_tag_d.assign(xml_token_a); } // if it is a rule, and we are not yet parsing a rule, clear the // rule object. init the token and push it onto the rule. // // if it is not a rule, and we are parsing a rule, make an XMLToken // and push it onto the expansion. // // Otherwise, it is an unhandled tag type, // such as metadata; push it onto the expansion. // else if(xml_token_a.getValue().eq(RULE, false) && rule_nesting_level_d == 0) { // start a new rule, add the token to the expansion // temp_rule_d.clear(Integral::RESET); temp_rule_d.concat(xml_token_a); // increment the rule nesting level // rule_nesting_level_d++; } // not the first rule, but a tag within a rule // add the token to the rule's expansion. Increment // the rule_nesting_level_d if this is a nested rule start // tag // else if(rule_nesting_level_d > 0) { if(xml_token_a.getValue().eq(RULE, false)) { rule_nesting_level_d++; } temp_rule_d.concat(xml_token_a); } // unhandled type // else { // declare a string to store an error message // String error_msg(L"The token: "); error_msg.concat(xml_token_a.toXML()); error_msg.concat(L"is not recognized."); return Error::handle(name(), L"handleStartElement", ERR_TOKEN_UNHANDLED, __FILE__, __LINE__); } return true; } // method: handleEndElement // // arguments: // XMLToken xml_token: (input) an XMLToken containing an // end tag from the xml parser // // return: bool8 // bool8 LanguageModelXML::handleEndElement (XMLToken xml_token_a) { // store the grammar end tag // if(xml_token_a.getValue().eq(GRAMMAR, false) && rule_nesting_level_d == 0) { grammar_end_tag_d.assign(xml_token_a); } // if this is the end tag to the top-level rule being parsed, // then push the end tag onto the expansion, push the rule onto // the vector of completed rules, decrement the nesting level, // and clear the temp rule for reuse // else if(xml_token_a.getValue().eq(RULE, false) && rule_nesting_level_d == 1) { rule_nesting_level_d--; temp_rule_d.concat(xml_token_a); rules_d.concat(temp_rule_d); temp_rule_d.clear(Integral::RESET); } // if this is the end tag to a nested rule, push the tag onto the // expansion, and decrement the nesting level // else if(xml_token_a.getValue().eq(RULE, false) && rule_nesting_level_d > 1) { rule_nesting_level_d--; temp_rule_d.concat(xml_token_a); } // if this is any other end tag, and we ARE parsing a rule, then push // the tag onto the expansion // else if(rule_nesting_level_d >= 1) { temp_rule_d.concat(xml_token_a); } // otherwise, this is the end tag of an unhandled tag type, such as // metadata. set the type, and push it onto the expansion. // else { // declare a string to store an error message // String error_msg(L"The token: "); error_msg.concat(xml_token_a.toXML()); error_msg.concat(L"is not recognized."); return Error::handle(name(), L"handleStartElement", ERR_TOKEN_UNHANDLED, __FILE__, __LINE__); } // indicate success // return true; } // method: handleCharacterData // // arguments: // XMLToken xml_token: (input) an xml token to be added to the current rule // // return: none // // this method is called whenever parsing encounters a piece of // character data not within < > delimiters. It creates an XMLToken // to store the character data, and adds the token to the current rule // // bool8 LanguageModelXML::handleCharacterData (XMLToken xml_token_a) { temp_rule_d.concat(xml_token_a); return true; } // method: handleSpecialRules // // arguments: // Vector& token_vector: (input) an xml grammar possibly containing // the special rule NULL // // return: a bool8 indicating status // // this method checks for the presence of special rules GARBAGE, VOID, // and NULL. // bool8 LanguageModelXML::handleSpecialRules (Vector& token_vector_a) { // iterate through the entire vector looking for the tag // for(int32 i = 0; i < token_vector_a.length(); i++) { // if a ruleref is found, check for the "special" attribute // if(token_vector_a(i).isA(RULEREF)) { // retrieve the value of the "special" attribute // String special_value = token_vector_a(i).getAttributeValue(SPECIAL); // if this is a special rule reference, check for which type // if(!special_value.eq(String::EMPTY)) { if(special_value.eq(SPECIAL_NULL)) { // reinitialize the token as // a dummy symbol, and add the dummy symbol to the // dummy symbol list // // save the token's depth // token_vector_a(i).init(XMLToken::CDATA, getDummySymbol(), token_vector_a(i).getDepth()); // add the dummy symbol to the symbol list // updateSymbolTable(token_vector_a(i)); } else if (special_value.eq(SPECIAL_GARBAGE)) { Console::put(special_value); return Error::handle(name(), L"handleSpecialRules", ERR_RULE_SPECIAL_INV, __FILE__, __LINE__); } else if (special_value.eq(SPECIAL_VOID)) { Console::put(special_value); return Error::handle(name(), L"handleSpecialRules", ERR_RULE_SPECIAL_INV, __FILE__, __LINE__); } // all other special rules are not currently handled // else { Console::put(special_value); return Error::handle(name(), L"handleSpecialRules", ERR_RULE_SPECIAL_INV, __FILE__, __LINE__); } // end branching on special type } // end checking for special } // end checking for ruleref } // end looping over tokens // indicate success // return true; } // end handleSpecialRules // method: handleRepeatRequests // // arguments: // Vector& token_vector: (input) a vector of xml tokens // to be processed // // return: a bool8 indicating status // // This method finds instances of the repeat attribute, and // duplicates the section of XMLTokens within the bounds of // the repeat attribute as necessary to meet the requirements // of the value of the repeat attribute. // // Here's how it works: // The short version: // 1. traverse from the second tag until a repeat request is found. // 2. if a repeat request is found, grab the section of the grammar // contained within it, and recurse on it. // 3. exit the recursion when the input grammar is empty or has no repeat // requests // 4. while traveling back up the recursion, check the first tag of each // section for the repeat attribute, and duplicate the section as necessary // The overall result is to expand repeat requests from the deepest level of // nesting upward. // // The int32 version: // // Traverse the Vector, starting from index 1, not index zero. // We will intentionally leave the first token (index 0) until // last. As we tranverse, each token that is not within a repeat // attribute is copied to the Vector section1. // If no repeat attribute is encountered, we will check the // first tag for a repeat. If one is found duplicate // section1 as requested, assign section1 to token_vector_a, and return. // If a repeat attribute is encountered in a start tag, find the matching // end tag, copy the start tag, end tag, and all tokens inbetween // into a vector section2. Call handleRepeatRequests on // section2. When this returns, tack section2 onto the end of section1. // bool8 LanguageModelXML::handleRepeatRequests(Vector& token_vector_a) { // this method requires the XMLParser, so check for expat // #if defined(HAVE_EXPAT) // declare vectors to store the processed (section1) and // not-yet-processed(section2) portions of the grammar // Vector section1; Vector section2; // save the first token of the token_vector_a // if(isInBounds(token_vector_a, 0)) { section1.concat(token_vector_a(0)); } // if the vector is empty, we have no need to be here. // else { return true; } // Note again, we are skipping the first element. // Iterate over the rest of the vector. // for(int32 i = 1; i < token_vector_a.length(); i++) { // Any start item tag may have a repeat attribute. // If one is found, copy the section of tokens bounded // by the pair of item tags, and recurse // if(token_vector_a(i).isA(ITEM, XMLToken::START_TAG) && !token_vector_a(i).getAttributeValue(REPEAT).eq(String::EMPTY)) { // the lower bound is i, the start tag // // the upper bound is the end tag matching that start tag // int32 upper_bound = findMatchingEndTag(token_vector_a, i); // make new vector out of the tags between the lower bound // and the upper bound, inclusive. // // here, this section2 variable has just been declared. // therefore, there should be absolutely no reason to clear it. // however, this variable inexplicably tends to keep // information from earlier // levels of recursion somehow, so it must be cleared explicitly // to insure against this. // section2.clear(Integral::RESET); // loop from the lower bound until the upper bound, // copying tokens // for(int32 j = i; j <= upper_bound; j++) { section2.concat(token_vector_a(j)); } // recurse on the sub-section // handleRepeatRequests(section2); // upon exit from recursion, concat section2 onto section1. // section1.concat(section2); // continue PAST the upper bound, since the upper bound // has already been processed. note the for loop will // increment i AFTER the upper_bound is assigned to it. // i = upper_bound; } // if no repeat attribute is found, copy the token onto section1 // else { section1.concat(token_vector_a(i)); } } // once this for loop has been exited, we know that there are no // repeat attributes between section1(0) and section1.length() that // need to be preprocessed; they will already have been converted to // an amenable form. THIS DOES NOT MEAN THAT ALL REPEAT ATTRIBUTES ARE // GONE. Just that they are now of the form "1-", which may // be handled during the conversion routine. // // since there are no more nested attributes to deal with, we may process // the repeat attribute at index 0 // String repeat_value = section1(0).getAttributeValue(REPEAT); // if no repeat attribute is found in the entire vector, // just return section1 // if(repeat_value.eq(String::EMPTY)) { // make sure that the depths are // consistent (the insertion of // new tags can upset the depths. // the duplication has no effect. // XMLParser::setTokenDepths(token_vector_a); token_vector_a.assign(section1); return true; } // create a vector to store the final result // of the repeat processing // Vector final_vector; Vector duplicated_section; // declare item tags to be automatically inserted // XMLToken item_start_tag; XMLToken item_end_tag; // declare a token that will be inserted before // and after the processed repeat request to insure // that the entire section is still considered a single // "item" once modified. This is important because if the // item requesting a repeat is in a branch, we don't want // the modified item to create additional branches. // item_start_tag.init(XMLToken::START_TAG, ITEM); item_end_tag.init(XMLToken::END_TAG, ITEM); // parse the information from the repeat value // Triple repeat_info(tokenizeRepeatValue(section1(0))); // insert the item at the beggining // final_vector.concat(item_start_tag); // convert repeat occurrences to a form that can be handled // by the conversion to digraph routine. // the following situations may occur: // A. repeat = "M" // B. repeat = "M-" // C. repeat = "M-N" // in any of these three cases, M may be zero, which will // require that a dummy node be placed in parallel with each of the entire // string of repeated sections. // // do not duplicate the repeat attribute itself! // section1(0).removeAttribute(REPEAT); // if M > 0 // duplicate the entire section of XML within the repeat scope M times, // if((int32)repeat_info.first() > 0) { // copy the section to be duplicated // duplicated_section.assign(section1); // this multiplies section1 m times // addManditoryRepeatSections(duplicated_section, repeat_info.first()); // add the duplicated section directly // final_vector.concat(duplicated_section); // if there is an infinite loop requested, we want to add a // "repeat = '1-'" attribute on the final item tag inserted just above. if (repeat_info.third()==true) final_vector(findMatchingStartTag(final_vector, final_vector.length()-1)).addAttribute(REPEAT, REPEAT_LOOP_BACK); } // for the special case of 0-, we add one optional // grouping. This, in tandem with the loop added by // replacing the "1-" value on the outer item tag, // will accomplish zero or more iterations. // // case "0" here is NOT included, nor is M < 0 // else if ((int32)repeat_info.first() == (int32)0 && repeat_info.third()==true) { // add "repeat = '1-'" final_vector(0).addAttribute(REPEAT, REPEAT_LOOP_BACK); // reset the section to be duplicated // to its original value // duplicated_section.assign(section1); // duplicate the section n - m times // Long one(1); addOptionalRepeatSections(duplicated_section, one); // add the resultant section directly // final_vector.concat(duplicated_section); } // if N is present // duplicate the entire section of XML M times, and then again N-M times but // with a dummy node in parallel with each of the N-M'th sections, making them // optional.a dummy node must also be placed between each of the N-M'th sections, // to siplify the layout. // if(repeat_info.second() != Integral::NO_POS) { // reset the section to be duplicated // to its original value // duplicated_section.assign(section1); // if there was a repeat-prob, now it must become a weight // String weight = section1(0).getAttributeValue(REPEAT_PROB); // remove the repeat prob attribute // section1(0).removeAttribute(REPEAT_PROB); // replace it with a weight whose attribute value // is the same as the repeat_prob // section1(0).addAttribute(WEIGHT, weight); // duplicate the section n - m times // addOptionalRepeatSections(duplicated_section, repeat_info.second() - repeat_info.first()); // add the resultant section directly // final_vector.concat(duplicated_section); } // finish nesting this repeated section within a pair of item tags // final_vector.concat(item_end_tag); // the inserted item tags tamper with the depths of the tokens in the // vector. re-initialize the depths. // XMLParser::setTokenDepths(token_vector_a); // return the modified vector // token_vector_a.assign(final_vector); // if expat is not present, this method will generate an error // #else Error::handle(name(), L"handleRepeatRequests requires Expat", Error::NOT_IMPLEM, __FILE__, __LINE__); #endif // indicate success // return true; } // method: addOptionalRepeatSections // // arguments: // Vector& token_vector: (input/output) // the vector of tokens to be duplicated // // Long m: (input) the number of times to duplicate the vector // // return: a bool8 indicating status // // this method accepts argument M from the "M-N" // repeat attribute value, and a token vector // containing the section of xml tokens which were contained // within that repeat request. It then duplicates // the section M times, and returns the vector // bool8 LanguageModelXML::addOptionalRepeatSections (Vector& token_vector_a, Long m_a) { // declare a vector to hold the section of tokens which we will duplicate // Vector duplicate; // declare the new tokens which will be inserted // XMLToken one_of_start_tag; XMLToken one_of_end_tag; XMLToken dummy; XMLToken start_item_tag; XMLToken end_item_tag; // dummy nodes are represented as // dummy.init(XMLToken::START_AND_END_TAG, RULEREF); dummy.addAttribute(SPECIAL, SPECIAL_NULL); // initialize the branch tags // one_of_start_tag.init(XMLToken::START_TAG, ONE_OF); one_of_end_tag.init(XMLToken::END_TAG, ONE_OF); // initialize start and end item tags for dummy node // start_item_tag.init(XMLToken::START_TAG, ITEM); end_item_tag.init(XMLToken::END_TAG, ITEM); // build the section to be duplicated. Since this is an optional // input section, place it in a branch, and put it in parallel with // a dummy node // duplicate.concat(one_of_start_tag); duplicate.concat(start_item_tag); duplicate.concat(dummy); duplicate.concat(end_item_tag); duplicate.concat(token_vector_a); duplicate.concat(one_of_end_tag); // reset the token vector, and add the // duplicate section to it m_a times. // token_vector_a.clear(Integral::RESET); for(int32 i = 0; i < m_a; i++) { // add an individual section to the output vector // token_vector_a.concat(duplicate); } // indicate success // return true; } // method: addManditoryRepeatSections // // arguments: // Vector& token_vector: (input/output) // the vector of tokens to be duplicated // // Long m: (input) the number of times to duplicate the vector // // return: a bool8 indicating status // // this method accepts argument M from the "M-N" // repeat attribute value, and a token vector // containing the section of xml tokens which were contained // within that repeat request. It then duplicates // the section M times, and returns the vector // bool8 LanguageModelXML::addManditoryRepeatSections (Vector& token_vector_a, Long m_a) { // declare a vector to store the section of tokens we will duplicate // Vector duplicate; // save the input section of tokens // duplicate.assign(token_vector_a); // note that here, i starts from 1, because // token_vector_a already contains 1 occurrance // of itself // for(int32 i = 1; i < m_a; i++) { // add a copy of the section to the xml tokens // token_vector_a.concat(duplicate); } // indicate success // return true; }