// file: $isip/class/pr/LanguageModelXML/lmxml_03.cc // version: $Id: lmxml_03.cc 10356 2006-01-10 18:52:05Z wholland $ // ISIP include files // #include "LanguageModelXML.h" // method: read // // arguments: // Sof& sof: (input) sof file object // int32 tag: (input) sof object instance tag // const String& name: (input) sof object instance name // // return: a bool8 value indicating status // // this method has the object read itself from an Sof file // bool8 LanguageModelXML::read(Sof& sof_a, int32 tag_a, const String& name_a) { if (debug_level_d > Integral::BRIEF) { Console::put(L"reading model"); Console::increaseIndention(); } // read the instance of the object from the Sof file // if (!sof_a.find(name_a, tag_a)) { return false; } // read the actual data from the sof file // if (!readData(sof_a)) { return false; } if (debug_level_d > Integral::BRIEF) { Console::decreaseIndention(); } // exit gracefully // return true; } // method: readData // // arguments: // Sof& sof: (input) sof file object // const String& pname: (input) parameter name // int32 size: (input) size in bytes of object (or full_size) // bool8 param: (input) is the parameter name in the file // bool8 nested: (input) are we nested? // // return: a bool8 value indicating status // // this method has the object read itself from an Sof file. it assumes // that the Sof file is already positioned correctly. // bool8 LanguageModelXML::readData(Sof& sof_a, const String& pname_a, int32 size_a, bool8 param_a, bool8 nested_a) { DebugLevel temp = debug_level_d; clear(Integral::FREE); setDebug(temp); SofParser parser; parser.setDebug(Integral::NONE); // are we nested? // if (nested_a) { parser.setNest(); } // load the parse // if (!parser.load(sof_a, size_a)) { return Error::handle(name(), L"readData", Error::READ, __FILE__, __LINE__, Error::WARNING); } Vector temp_vector; // count the number of levels // int32 num_levels = 0; String level_tag(SearchLevel::PARAM_LEVEL_TAG); level_tag.concat(PARAM_UNDERSCORE); level_tag.concat((Long)num_levels); while (parser.isPresent(sof_a, level_tag)) { num_levels++; level_tag.assign(SearchLevel::PARAM_LEVEL_TAG); level_tag.concat(PARAM_UNDERSCORE); level_tag.concat((Long)num_levels); } if (num_levels == 0) { return Error::handle(name(), L"readData: incorrect format", Error::READ, __FILE__, __LINE__, Error::WARNING); } hg_d.setLength(num_levels); if (debug_level_d > Integral::BRIEF) { Console::put(L"reading levels"); Console::increaseIndention(); } // loop over levels and read grammar, context mapping, and symbol // types for each level // for (int level = 0; level < num_levels; level++) { // set the level index // hg_d(level).setLevelIndex(level); bool8 context = false; if (level > 0 && hg_d(level - 1).getContextMap().length() > 0) { context = true; } if (debug_level_d > Integral::BRIEF) { Console::put(L"reading grammars"); Console::increaseIndention(); } // read XML grammars // if (context || level == 0) { readXMLGrammars(sof_a, PARAM_GRAMMARS, level, context, NULL, parser); } else{ readXMLGrammars(sof_a, PARAM_GRAMMARS, level, context, &(hg_d(level - 1).getSymbolTable()), parser); } if (debug_level_d > Integral::BRIEF) { Console::decreaseIndention(); Console::put(L"reading level tag"); Console::increaseIndention(); } // read the level tag // Vector level_tag; readSymbolType(sof_a, level, SearchLevel::PARAM_LEVEL_TAG, level_tag, parser); hg_d(level).setLevelTag(level_tag(0)); if (debug_level_d > Integral::BRIEF) { Console::decreaseIndention(); Console::put(L"reading symbols"); Console::increaseIndention(); } // read the nonspeech boundary symbols // readSymbolType(sof_a, level, SearchLevel::PARAM_NONSPEECH_BOUNDARY_SYMBOL, hg_d(level).getNonSpeechBoundarySymbolTable(), parser); // read the nonspeech internal symbols // readSymbolType(sof_a, level, SearchLevel::PARAM_NONSPEECH_INTERNAL_SYMBOL, hg_d(level).getNonSpeechInternalSymbolTable(), parser); // read the dummy symbols // readSymbolType(sof_a, level, SearchLevel::PARAM_DUMMY_SYMBOL, hg_d(level).getDummySymbolTable(), parser); // IF any dummy symbols were created to replace the special rule // NULL, AND the default dummy symbol does not exist // in the dummy symbols list read from the file, then add it // SearchSymbol def_dummy_symbol = getDummySymbol(); if(getSymbolList().contains(&def_dummy_symbol) && !hg_d(level).getDummySymbolTable().contains(&def_dummy_symbol)) { hg_d(level).getDummySymbolTable().concat(def_dummy_symbol); } // read the exclude symbols // readSymbolType(sof_a, level, SearchLevel::PARAM_EXCLUDE_SYMBOL, hg_d(level).getExcludeSymbolTable(), parser); // read the nsymbol exclude symbols // readSymbolType(sof_a, level, SearchLevel::PARAM_NSYMBOL_EXCLUDE_SYMBOL, hg_d(level).getNSymbolExcludeSymbolTable(), parser); // read the spenalty exclude symbols // readSymbolType(sof_a, level, SearchLevel::PARAM_SPENALTY_EXCLUDE_SYMBOL, hg_d(level).getSPenaltyExcludeSymbolTable(), parser); // read the context less symbols // readSymbolType(sof_a, level, SearchLevel::PARAM_CONTEXTLESS_SYMBOL, hg_d(level).getContextLessSymbolTable(), parser); // read the skip symbols // readSymbolType(sof_a, level, SearchLevel::PARAM_SKIP_SYMBOL, hg_d(level).getSkipSymbolTable(), parser); // read the non adaptation symbols // readSymbolType(sof_a, level, SearchLevel::PARAM_NON_ADAPT_SYMBOL, hg_d(level).getNonAdaptSymbolTable(), parser); if (debug_level_d > Integral::BRIEF) { Console::decreaseIndention(); Console::put(L"reading context mapping"); Console::increaseIndention(); } // read the context mapping (if it exists) for this level // // readContextMapping(sof_a, SearchLevel::PARAM_CONTEXT_MAPPING, level, parser); if (debug_level_d > Integral::BRIEF) { Console::decreaseIndention(); } } if (debug_level_d > Integral::BRIEF) { Console::decreaseIndention(); } // indicate success // return true; } // method: readXMLGrammars // // arguments: // Sof& sof: (input) sof file object // int32 level: (input) language model level // bool8 use_context: (input) are we using context? // const String& name: (input) parameter name // Vector symbol_list: (output) list of symbols // // return: a bool8 value indicating status // // this method reads the grammar for a particular level // bool8 LanguageModelXML::readXMLGrammars(Sof& sof_a, const String& name_a, int32 level_a, bool8 use_context_a, Vector* symbol_table_a, SofParser& parser_a) { // symbol list for the given search level // Vector symbol_list; Vector< Vector > grammar_list; Vector grammar_name_list; Vector grammars; String param_name(name_a); param_name.concat(PARAM_UNDERSCORE); param_name.concat((Long)level_a); // read the grammar string vector // if (parser_a.isPresent(sof_a, param_name)) { if (!grammars.readData(sof_a, param_name, parser_a.getEntry(sof_a, param_name), false, false)) { return Error::handle(name(), L"readData: error reading grammars", Error::READ, __FILE__, __LINE__, Error::WARNING); } } for (int i = 0; i < grammars.length(); i++) { // declare variables // Vector sub_symbol_list; Vector tmp_symbol_list; Vector grammar; String grammar_name; // process the grammar string // parseXMLGrammar(grammars(i), tmp_symbol_list, grammar, grammar_name); // convert the symbol list of Strings to // the sub symbol list of SearchSymbols // for (int32 i = 0; i < tmp_symbol_list.length(); i++) { SearchSymbol tmp; tmp.assign(tmp_symbol_list(i)); sub_symbol_list.concat(tmp); } // check if any symbol in sub_symbol_list already exists in symbol_list // bool8 same = false; for(int32 i=0; i 0) { // get the symbol table of the upper level // Vector& symbol_table = *symbol_table_a; Vector graph_symbols; // do we have context mapping ? // if (use_context_a) { // set the symbol table // for (int32 j = 0; j < num_grammars; j++) { String symbol; Ulong index; index.assign(j); symbol.assign(CONTEXT_LABEL_PREFIX); symbol.concat(index); graph_symbols.concat(symbol); } } // test the symbol_table // if (symbol_table_a == NULL && !use_context_a) { return Error::handle(name(), L"no symbol table of upper level", ERR_CTXT_NO_SYM_TAB, __FILE__, __LINE__); } // align the graphs in the graph list // if (use_context_a) { alignGraphs(grammar_list, graph_symbols, grammar_name_list); } else { alignGraphs(grammar_list, symbol_table, grammar_name_list); } } // add grammars to xml model // grammars_d.concat(grammar_list); // indicate success // return true; } // method: readSymbolType // // arguments: // Sof& sof: (input) sof file object // int32 level: (input) language model level // const String& name: (input) parameter name // Vector symbol_list: (output) list of symbols // // return: a bool8 value indicating status // // this method read one rule from the sof file // bool8 LanguageModelXML::readSymbolType(Sof& sof_a, int32 level_a, const String& name_a, Vector& symbol_list_a, SofParser& parser_a) { // to read the string from the sof file // SofParser parser; // to store the grammar // String grammar; // name of the parameter to find // String param_name(name_a); if (level_a != DEF_LEVEL) { param_name.concat(PARAM_UNDERSCORE); param_name.concat((Long)level_a); } // read the grammar // if (parser_a.isPresent(sof_a, param_name)) { if (!grammar.readData(sof_a, param_name, parser_a.getEntry(sof_a, param_name))) { return Error::handle(name(), L"readData", ERR_READ_SYM_TYPE, __FILE__, __LINE__, Error::WARNING); } // declare a lmxml object to read the grammar // a separate object is used here purposely. for debugging purposes // the symbol types are less useful than the main grammar, so the main // grammar is left intact in this object, and the symbol grammars // are stored in new objects // LanguageModelXML lmxml; lmxml.setDebug(debug_level_d); Vector temp_string_vector; Vector token_vector; String grammar_name; // read a single grammar from that location // lmxml.parseXMLGrammar(grammar, temp_string_vector, token_vector, grammar_name); symbol_list_a.clear(); // convert strings to search symbols. // for (int i = 0; i < temp_string_vector.length(); i++) { symbol_list_a.concat((SearchSymbol)temp_string_vector(i)); } } // indicate success // return true; } // method: alignGraphs // // arguments: // Vector< Digraph >& graph_list: (output) vector of graphs // Vector& symbol_table: (input) symbol table // Vector& graph_name_list: (input) vector of grammar names // // return: a bool8 value indicating status // // this method change the order of the graphs in given vector to // match the order of search symbols in given symbol table // bool8 LanguageModelXML::alignGraphs(Vector< Vector >& grammar_list_a, Vector& symbol_table_a, Vector& grammar_name_list_a) { // make sure the three input vectors have the same length // if(grammar_list_a.length() != symbol_table_a.length() || grammar_list_a.length() != grammar_name_list_a.length() || grammar_name_list_a.length() != symbol_table_a.length()) { symbol_table_a.debug(L"symbol_table"); grammar_name_list_a.debug(L"graph_name"); return Error::handle(name(), L"alignGraphs", ERR_CTXT_INV_LENGTHS, __FILE__, __LINE__); } // declare a temp vector of grammars // Vector< Vector > tmp_grammar_list; for(int32 i=0; i, each ContextMap // of which contains the SearchSymbol representations of the symbols // belonging to a particular context, and the index to which this // particular context belongs. // 2. A context-hash which is a HashTable, which contains // the index of a context, and a Context which contains a // CircularDelayLine // of the indices belonging to the SearchSymbols which represent the // symbols belonging to a particular context. // // // Conceptually, the organization of this data could be viewed like this: // // Context Index // | | // (hg_d.context_map) (hg_d.context_hash) // | | // Vector Context // 'a' --> i = hg_d.getSymbolIndex('a') // | // 'b' --> j = hg_d.getSymbolIndex('b') // | // 'c' --> k = hg_d.getSymbolIndex('c') // // The index of the context can be used to link a symbol from that Context // with the index of that symbol within the HierarchicalDigraph. // declare variables to store the raw input contexts, the // context maps generated from them, and a LMXML object // with which to parser the raw contexts. // Vector input_context_strings; Vector context_maps_vector; LanguageModelXML grammar_parser; // set up the parameter name to match the current level // String param_name(name_a); if (level_a != DEF_LEVEL) { param_name.concat(PARAM_UNDERSCORE); param_name.concat((Long)level_a); } // read the all the contextmaps as vector of string from the given // level-index // if (parser_a.isPresent(sof_a, param_name)) { if (!input_context_strings.readData(sof_a, param_name, parser_a.getEntry(sof_a, param_name), false, false)) { return Error::handle(name(), L"readContextMapping", Error::READ, __FILE__, __LINE__, Error::WARNING); } } // when there is NO context mapping table at this level // else { // symbols will be mapped to the model with the same index at lower level // Context dummy(1); // retrieve the contexthash. note that this is not a copy, this is // a reference, and modificatiosn to this context_hash will modify // the corresponding context_hash in the hierarchical digraph. // HashTable& context_hash = hg_d(level_a).getContextHash(); // for each symbol in the level, insert a dummy context with the index // of that symbol // for (Ulong i = 0; i < (uint32)(hg_d(level_a).getSymbolTable()).length(); i++) { dummy.assignAndAdvance(i); context_hash.insert(dummy, &i); } if (debug_level_d >= Integral::ALL) { context_hash.debug(L"dummy context hash table:"); } return true; } // end if no context mapping // make sure the symbol table has been read // int32 num_symbols = (hg_d(level_a).getSymbolTable()).length(); if (num_symbols < 1) { return Error::handle(name(), L"readContextMapping", ERR_CTXT_NO_SYM_TAB, __FILE__, __LINE__); } // set the length of the context_maps_vector to the number of contexts // read // context_maps_vector.setLength(input_context_strings.length()); // loop over all the input context strings, and parse them as XML // grammars. These context "grammars" each consist of a single rule // whose name contains all of the symbols within the context in // symbol1-symbol2-symbol3 fashion, and whose sole symbol is // G_# indicating Graph #, where # is the index of this context. // for(int32 i=0; i < input_context_strings.length(); i++) { // declare a vector to store a single context's symbols // Vector context_symbols; // declare a vector to store a context symbol table // Vector temp_symbol_table; // declare a vector to store grammar // Vector temp_token_vector; // declare a String to store a single symbol as it is tokenized // from the grammar name // String symbol; // declare a string to store the grammar's name (which will be the // name of the only rule that is present in the grammar) // String grammar_name; // declare a string to store the index of the context, which will // be retrieved from the only symbol in the only rule // String context_index_string; // declare a variable to store the position within strings being // tokenized // int32 pos = 0; // declare a ulong to store the index of the context // uint32 context_index_ulong; // parse the grammar to obtain the grammar name which contains // the context symbols, and the symbol table which will contain // the single symbol indicating the graph number // grammar_parser.parseXMLGrammar(input_context_strings(i), temp_symbol_table, temp_token_vector, grammar_name); // retrieve all the symbols from the grammar name // pos = 0; while(grammar_name.tokenize(symbol, pos, CONTEXT_SYMBOL_DELIM)) { // store all of the symbols tokenized // context_symbols.concat(symbol); } // end looping over embedded symbols // save the vector of symbols into the appropriate ContextMap within // the vector of context maps // if(!context_maps_vector(i).setContext(context_symbols)) { return Error::handle(name(), L"readContextMapping", ERR_CTXT_SCV, __FILE__, __LINE__, Error::WARNING); } // retrieve the only symbol from the only rule in the grammar // if(temp_symbol_table.length() != 1) { return Error::handle(name(), L"readContextMapping", ERR_CTXT_INV_SYM_TAB, __FILE__, __LINE__, Error::WARNING); } // retrieve the context index from the single symbol read from the // grammar. note that context_index_string will contain the last // token following a "G_" within the symbol. So, if symbol = "G_9", // context_index_string will = "9". // pos = 0; symbol.tokenize(context_index_string, pos, CONTEXT_LABEL_PREFIX); // set the context index of the current context map // if(!context_index_string.get(context_index_ulong)) { String err_str(L"readContextMapping: "); err_str.concat(context_index_string); return Error::handle(name(),err_str, ERR_CTXT_INV_TOKEN_INDEX, __FILE__, __LINE__, Error::WARNING); } context_maps_vector(i).setContextIndex(context_index_ulong); } // end looping over input contexts // store the vector of context maps in the HierarchicalDigraph // if (!hg_d(level_a).setContextMap(context_maps_vector)) { return Error::handle(name(), L"readContextMapping", ERR_CTXT_ESHG, __FILE__, __LINE__, Error::WARNING); } // when there is a context mapping table at this level // if (context_maps_vector.length() > 0) { // add start and terminal search symbol to the symbol table // Vector& symbol_table = hg_d(level_a).getSymbolTable(); symbol_table.concat(SearchSymbol::NO_LEFT_CONTEXT); symbol_table.concat(SearchSymbol::NO_RIGHT_CONTEXT); // set up start and terminal search node for each subgraph // Vector >& sub_graphs = hg_d(level_a).getSubGraphs(); for (int32 i = 0; i < sub_graphs.length(); i++) { // start vertex // SearchNode* snode_p = new SearchNode(); snode_p->setSearchLevel(&hg_d(level_a)); snode_p->setSymbol(SearchSymbol::NO_LEFT_CONTEXT); sub_graphs(i).getStart()->setItem(snode_p); // terminal vertex // snode_p = new SearchNode(); snode_p->setSearchLevel(&hg_d(level_a)); snode_p->setSymbol(SearchSymbol::NO_RIGHT_CONTEXT); sub_graphs(i).getTerm()->setItem(snode_p); } // end looping over subgraphs // insert all context pairs into the context mapping hash table // int32 total_context_length = context_maps_vector(0).getContext().length(); Context context(total_context_length); // loop over all context pairs // for (int32 i = 0; i < context_maps_vector.length(); i++) { // loop over the symbols in the context // for (int32 j = 0; j < total_context_length; j++) { SearchSymbol ss(context_maps_vector(i).getContext()(j)); int32 symbol_id = hg_d(level_a).getSymbolIndex(ss); // check whether the symbol is valid // if (symbol_id == Integral::NO_POS) { ss.debug(L"symbol"); return Error::handle(name(), L"readContextMapping", ERR_CTXT_INV_SYM, __FILE__, __LINE__); } else { context.assignAndAdvance(symbol_id); } } // end looping over symbols in context // insert a context specification into the hash table // Ulong index = context_maps_vector(i).getContextIndex(); // check if the context is already in the table // HashTable& context_hash = hg_d(level_a).getContextHash(); Ulong* existing_index = context_hash.get(context); // if this context is not in the table yet, insert it // if (existing_index == NULL) { context_hash.insert(context, &index); } // if the context is already in the table, then check whether // the index of the model is the same as the one from table // else { // if indices are different, explain conflict and return error // if (!existing_index->eq(index)) { context_maps_vector(i).debug(L"Context:"); String out; out.concat(L" is already in context mapping table with the index: "); out.concat(*existing_index); out.concat(L"\n while new index is: "); out.concat(index); Console::put(out); return Error::handle(name(), L"readContextMapping", ERR_CTXT_EXISTS_DIF_INDEX, __FILE__, __LINE__); } // otherwise indices are not conflicting, just print a warning // else { context_maps_vector(i).debug(L"Warning: This context: "); String out; out.concat(L" is already in context mapping table with the index: "); out.concat(*existing_index); out.concat(SysChar::NEWLINE); Console::put(out); Error::handle(name(), L"readContextMapping", ERR_CTXT_EXISTS_SAME_INDEX, __FILE__, __LINE__, Error::WARNING); } } // output the debugging information // if (debug_level_d >= Integral::ALL) { // retrieve the context hash // HashTable& context_hash = hg_d(level_a).getContextHash(); // output the context hash information // context_hash.debug(L"context hash table:"); } // end debugging info } // end looping over all contexts } // end handling context mapping for level // indicate success // return true; }