// file: $isip/class/mmedia/NGramParser/ngmp_05.cc // version: $Id: ngmp_05.cc 8326 2002-07-10 16:23:24Z alphonso $ // // isip include files // #include "NGramParser.h" #include // method: load // // arguments: // Sof& sof: (input) input ngram source file // HashTable& gram_hash: (output) output gram hash table // const Vector& symbol_table: (input) symbol table mapping index // const int32& tag: (input) sof object instance name // const String& name: (input) sof object instance name // // return: a bool8 indicating status // // this method reads in the ngram from a file with various format // bool8 NGramParser::load(Sof& sof_a, HashTable& gram_hash_a, const Vector& symbol_table_a, const int32& tag_a, const String& name_a) { // define some local variables // String buffer, buf; Char chr; VectorLong gram_size(order_d); // the size of each gram VectorLong num_gram_read(order_d); // the size of each gram Vector symbol(1); // N-gram symbols const Long *symbol_index; // indices of N-gram symbols SingleLinkedList symbol_list; // the list of ngram nodes NGramNode ngrm_node; NGramNode* prefix_node = NULL; NGramNode* history_node = NULL; SysString delimiter(L" \t="); HashTable* curr_hash = &gram_hash_a; HashTable symbol_hash; int32 gram_num; int32 pos, index = 0; bool8 flag = true; float32 backoff = NGramNode::DEF_BACKOFF; float32 lmscore = NGramNode::DEF_LM_SCORE; int32 tmp_int32; int32 curr_order = -1; // current order of grammar reading: 0, 1, 2, .. // debug variables // String value, output, empty_str; // initialize numbers // num_gram_read.assign((int32)0); // get hash table which maps String to Long // int32 len = symbol_table_a.length(); Long symbol_id; symbol_hash.setCapacity(len); for (int32 i = 0; i < len; i++) { symbol_id = i; symbol_hash.insert(symbol_table_a(i), &symbol_id); } // print debugging information // if (debug_level_d >= Integral::DETAILED) { symbol_hash.debug(L"symbol_hash"); } // we keep track of the current ngram prefix. since the input is sorted, // this makes it easier to know when to trigger events in the read // process // VectorLong curr_prefix(order_d); bool8 prefix_changed = false; int32 prefix_items = 0; // read the instance of the object from the Sof file // if (!sof_a.find(name_a, tag_a)) { return false; } // read the first format name // sof_a.gets(buffer, Sof::BUFFER_SIZE); // print debugging information // if (debug_level_d >= Integral::DETAILED) { buffer.debug(L"buffer"); } // format: NGRAM_ARPA // if (buffer.eq(L"format = \"NGRAM_ARPA\";")) { // read the ngram information such as the number of unigrams, // bigrams etc // while (sof_a.gets(buffer, Sof::BUFFER_SIZE)) { // print debugging information // if (debug_level_d >= Integral::DETAILED) { buffer.debug(L"buffer"); } // get the first char // chr = buffer(0); // print debugging information // if (debug_level_d >= Integral::DETAILED) { chr.debug(L"chr"); } // ignore comment lines if the first char is '#' // if (chr.eq(L'#') || buffer.length() == 0) { // do nothing // continue; } // read "\data\" token // else if (chr.eq(L'\\')) { // read the ngram counts // if (buffer.compare(L"\\data\\") == Integral::EQUAL) { // read the ngram counts // for (int32 i = 0; i < order_d; i++) { sof_a.gets(buffer, Sof::BUFFER_SIZE); buffer.trimLeft(); // print debugging information // if (debug_level_d >= Integral::DETAILED) { buffer.debug(L"buffer"); } // get "ngram" token // pos = 0; buffer.tokenize(buf, pos, delimiter); // print debugging information // if (debug_level_d >= Integral::DETAILED) { buf.debug(L"buf"); } if (buf.ne(L"ngram")) { buf.debug(L"TAG(ngram)"); return Error::handle(name(), L"load", ERR_TAG, __FILE__, __LINE__); } // get the number of the gram // buffer.tokenize(buf, pos, delimiter); // print debugging information // if (debug_level_d >= Integral::DETAILED) { buf.debug(L"buf"); } buf.get(gram_num); if (gram_num > order_d) { value.assign(gram_num); output.debugStr(name(), empty_str, L"gram_num", value); return Error::handle(name(), L"load", ERR_ORDER, __FILE__, __LINE__); } index = gram_num - 1; // get the size of the gram // pos++; buffer.deleteRange(0, pos); buffer.get(tmp_int32); gram_size(index) = tmp_int32; } // initilize the hash table for unigram // gram_hash_a.setCapacity((int32)gram_size(0)); } // end if data // read end information // else if (buffer.compare(L"\\end\\")) { break; } // otherwise this is an ngram list: "n-gram" // else { // we may need to clean up after an ngram read // // see if the prefix has changed. if so, then dump the current ngram // data // // if there are items for this prefix then add them // if (prefix_items > 0) { // store the items in the list to hash table // num_gram_read(curr_order) += symbol_list.length(); storeToHash(gram_hash_a, symbol_list, prefix_node); } // reset variables // prefix_items = 0; // reset the prefix node // prefix_node = (NGramNode*)NULL; // clear the prefix // curr_prefix.setLength(gram_num); curr_prefix.clear(Integral::RETAIN); prefix_changed = true; curr_prefix.assign(-1); // print debugging information // if (debug_level_d >= Integral::DETAILED) { buffer.debug(L"buffer"); } // get the order of the current ngram list // curr_order++; buffer.deleteRange(0, 1); buffer.get(gram_num); if ((gram_num > order_d) && (gram_num == curr_order - 1)) { value.assign(gram_num); output.debugStr(name(), empty_str, L"gram_num", value); return Error::handle(name(), L"load", ERR_ORDER, __FILE__, __LINE__); } index = gram_num - 1; } } // end else if char is "\" // otherwise this is a data line // else { // break flag - if the particular ngram is not valid then we just skip // that line // bool8 break_flag = false; // reset flag // flag = true; // print debugging information // if (debug_level_d >= Integral::DETAILED) { buffer.debug(L"buffer"); } // set the lm score // pos = 0; buffer.tokenize(buf, pos, delimiter); // print debugging information // if (debug_level_d >= Integral::DETAILED) { buf.debug(L"buf"); } buf.get(lmscore); buffer.deleteRange(0, pos); buffer.trimLeft(); // as the score is in log10 form, convert it to log_e // lmscore *= Integral::LN10; // reset the current hash table pointer // curr_hash = &gram_hash_a; // check if there are any history words and read them // for (int32 i = 0; i < index; i++) { // print debugging information // if (debug_level_d >= Integral::DETAILED) { buffer.debug(L"buffer"); } // read a word // pos = 0; buffer.tokenize(buf, pos, delimiter); // print debugging information // if (debug_level_d >= Integral::DETAILED) { buf.debug(L"buf"); } buffer.deleteRange(0, pos); buffer.trimLeft(); // get the corresponding word from the lexicon // symbol_index = symbol_hash.get(buf); if (symbol_index == NULL) { flag = true; buf.debug(L"err_symbol"); return Error::handle(name(), L"load", ERR_SYMBOL, __FILE__, __LINE__, Error::WARNING); } // check the prefix // if (*symbol_index != curr_prefix(i)) { curr_prefix(i) = *symbol_index; prefix_changed = true; } // get the ngram node corresponding to this word at this // level // if (curr_hash != NULL) { history_node = curr_hash->get(*symbol_index); curr_hash = history_node->getNextGram(); } else { return Error::handle(name(), L"load", ERR_HASH, __FILE__, __LINE__); } // if the ngram lookup failed then break out and do not process this // ngram (it is invalid) // if (history_node == (NGramNode*)NULL) { break_flag = true; break; } } // end for loop // see if the prefix has changed. if so, then dump the current ngram // data // if (prefix_changed) { // if there are items for this prefix then add them // if (prefix_items > 0) { // store the items in the list to hash table // num_gram_read(curr_order) += symbol_list.length(); storeToHash(gram_hash_a, symbol_list, prefix_node); } // reset variables // prefix_items = 0; prefix_changed = false; // reset the prefix node // prefix_node = history_node; } if (!break_flag) { // print debugging information // if (debug_level_d >= Integral::DETAILED) { buffer.debug(L"buffer"); } // read the current word // pos = 0; buffer.tokenize(buf, pos, delimiter); // print debugging information // if (debug_level_d >= Integral::DETAILED) { buf.debug(L"buf"); } buffer.deleteRange(0, pos); buffer.trimLeft(); // get the corresponding word from the lexicon // symbol_index = symbol_hash.get(buf); if (symbol_index == NULL) { flag = false; buf.debug(L"err_symbol"); return Error::handle(name(), L"load", ERR_SYMBOL, __FILE__, __LINE__, Error::WARNING); } // read the backoff score if any // if (buffer.get(backoff)) { // as the score is in log10 form, convert it to log_e // backoff *= Integral::LN10; } else { backoff = 0; } // if the current word exists in the lexicon // if (flag) { // configure an n-gram node // ngrm_node.setIndex(*symbol_index); ngrm_node.setLmScore(lmscore); ngrm_node.setBackoff(backoff); // put the node in the list // symbol_list.insert(&ngrm_node); // increment the number of items for this prefix // prefix_items++; } } } // end else this is data line } // end while // we may need to clean up after an ngram read // // see if the prefix has changed. if so, then dump the current ngram // data // // if there are items for this prefix then add them // if (prefix_items > 0) { // store the items in the list to hash table // num_gram_read(curr_order) += symbol_list.length(); storeToHash(gram_hash_a, symbol_list, prefix_node); } // reset variables // prefix_items = 0; // reset the prefix node // prefix_node = (NGramNode*)NULL; // clear the prefix // prefix_changed = true; // check the number of ngram // for (int32 order = 0; order < order_d; order++) { if (gram_size(order) != num_gram_read(order)) { gram_size.debug(L"gram_size"); num_gram_read.debug(L"num_gram_read"); return Error::handle(name(), L"load", ERR_NUM_GRAM, __FILE__, __LINE__); } } } // end of NGRAM_ARPA format // other format // else { return Error::handle(name(), L"load", ERR_FORMAT, __FILE__, __LINE__); } // exit gracefully // return true; } // method: storeToHash // // arguments: // HashTable& gram_hash: (output) target gram hash table // SingleLinkedList& list: (input) ngram node list // NGramNode* prefix_node: (input) the upper order ngram node // // return: a logical_1 indicating status // // this method stores a list of ngram nodes into hash table // bool8 NGramParser::storeToHash(HashTable& gram_hash_a, SingleLinkedList& list_a, NGramNode* prefix_node_a) { // local variables // NGramNode* node; Long count; HashTable* hash; // allocate memory for hash pointer // if (prefix_node_a == NULL) { // point to the unigram hash table // hash = &gram_hash_a; } else { hash = new HashTable; prefix_node_a->setNextGram(hash); } // find the first ngram node from list // list_a.gotoFirst(); node = list_a.getFirst(); // store the nodes from list to hash table // while (node != NULL) { hash->insert(node->getIndex(), node); count++; node = list_a.getNext(); list_a.gotoNext(); } // clean list // list_a.clear(Integral::FREE); // exit gracefully // return true; } // method: store // // arguments: // Sof& sof: (input) output ngram source file // const HashTable& gram_hash: (input) input gram hash table // const Vector& symbol_table: (input) symbol table mapping index // const int32& tag: (input) sof object instance name // const String& name: (input) sof object instance name // // return: a bool8 indicating status // // this method stores ngram probabilities to a file with source format // bool8 NGramParser::store(Sof& sof_a, const HashTable& gram_hash_a, const Vector& symbol_table_a, const int32& tag_a, const String& name_a) const { // local variables // VectorLong gram_size(order_d); const HashTable* hash; Vector vec; const NGramNode* node = (NGramNode*)NULL; int32 len; Vector indices[order_d]; VectorLong keys; Vector tmp_keys; String symbol; // initialize values // gram_size.assign((int32)0); // read unigram // indices[0].setLength(1); // order = 1 gram_hash_a.keys(tmp_keys); // get all the unigram len = tmp_keys.length(); indices[0](0).setLength(len); for (int32 n = 0; n < len; n++) indices[0](0)(n) = tmp_keys(n); gram_size(0) = len; // read higher order gram // for (int32 order = 1; order < order_d; order++) { // initialize variables // int32 lower = order - 1; indices[order].setLength(order + 1); // set length to order // go through all the nodes in the lower order grams // for (int32 i = 0; i < gram_size(lower); i++) { // find the hash table for the gram // hash = &gram_hash_a; // initialize hash pointer for (int32 j = 0; j < order; j++) { node = hash->get(indices[lower](j)(i)); hash = node->getNextGram(); } // find the next gram for this node // if (hash == NULL) { continue; } hash->keys(tmp_keys); len = tmp_keys.length(); keys.setLength(len); gram_size(order) += len; for (int32 n = 0; n < len; n++) keys(n) = tmp_keys(n); int32 old_len = indices[order](0).length(); // assign the indices of lower gram // for (int32 k = 0; k < order; k++) { indices[order](k).setLength(old_len + len); indices[order](k).setRange(old_len, len, indices[lower](k)(i)); } // assign the index of new gram // indices[order](order).concat(keys); } } /* if (debug_level_d >= Integral::DETAILED) { gram_size.debug(L"gram_size"); } */ // declare local variables // int32 obj_size; if (sof_a.isText()) { obj_size = Sof::ANY_SIZE; } else { printf("no binary write\n"); exit(1); } // put the object into the sof file's index // if (!sof_a.put(name_a, tag_a, obj_size)) { return false; } // declare output variables // String output, value; // output source file format // output.assign(L"format = \"NGRAM_ARPA\";\n\n"); sof_a.puts(output); // output "\data\" // sof_a.puts(L"\\data\\\n"); // output ngram numbers // for (int32 i = 1; i <= order_d; i++) { output.assign(L"ngram "); value.assign(i); output.concat(value); output.concat(L"="); value.assign(gram_size(i-1)); output.concat(value); output.concat(L"\n"); sof_a.puts(output); } // output a blank line // sof_a.puts(L"\n"); // output ngram probabilities // for (int32 order = 0; order < order_d; order++) { // output "\n-grams:" // output.assign(L"\\"); value.assign(order + 1); output.concat(value); output.concat(L"-grams:\n"); sof_a.puts(output); // output probabilities from ngram nodes // len = indices[order](0).length(); for (int32 i = 0; i < len; i++) { // find the hash table for the gram // hash = &gram_hash_a; for (int32 j = 0; j <= order; j++) { node = hash->get(indices[order](j)(i)); hash = node->getNextGram(); } if (node == NULL) { return Error::handle(name(), L"store", ERR_SYMBOL, __FILE__, __LINE__); } // output probability // output.assign(node->getLmScore() / Integral::LN10); output.concat(L" \t"); // output N-symbols // for (int32 j = 0; j <= order; j++) { // search_level_a.getSymbol((SearchSymbol&)value, indices[order](j)(i)); symbol = symbol_table_a(indices[order](j)(i)); output.concat(symbol); output.concat(L" "); } // output backoff probability // if (node->getBackoff() != 0) { output.concat(L"\t"); value.assign(node->getBackoff() / Integral::LN10); output.concat(value); } // output to file // output.concat(L"\n"); sof_a.puts(output); } sof_a.puts(L"\n"); } // output "\end\" mark // sof_a.puts(L"\\end\\"); // exit gracefully // return true; }