// file: $isip/class/mmedia/TranscriptionDatabase/trans_06.cc // version: $Id: trans_06.cc 9421 2004-03-10 20:11:33Z parihar $ // // isip include files // #include "TranscriptionDatabase.h" // method: load // // arguments: // Filename& trans_file: (input) transcription file // String& name: (input) database name // String& level: (input) transcription level // // return: a bool8 flag indicating status // // this method load the transcription database // bool8 TranscriptionDatabase::load(Filename& trans_file_a, String& name_a, String& level_a) { // local variables // bool8 status = true; String line; int32 num_file = 0; File trans_file; // debugging information // if (debug_level_d >= Integral::BRIEF) { Console::increaseIndention(); String output; output.assign(L"\nloading input transcription file: "); output.concat(trans_file_a); Console::put(output); Console::decreaseIndention(); } // open the input transcription file in read mode // if (!trans_file.open(trans_file_a, File::READ_ONLY)) { String msg(L"Error: no input transcription file specified "); Console::put(msg); Error::handle(name(), L"load", Error::ERR, __FILE__, __LINE__); } // debugging information // if (debug_level_d >= Integral::BRIEF) { Console::increaseIndention(); String output; output.assign(L"database name: "); output.concat(name_a); output.concat(L"\ntranscription level: "); output.concat(level_a); Console::put(output); Console::decreaseIndention(); } // default type // String gtype(L"ORTHOGRAPHIC"); // default offset values for start and the stop times(0.0) // Float offset_start(AnnotationGraph::DEF_OFFSET); Float offset_stop(AnnotationGraph::DEF_OFFSET); // default feature name and unit for transcriptions // String fname(L"level"); String unit(L"seconds"); // read the transcription file line by line // while (trans_file.get(line)) { // get rid of blank spaces on both the sides of the line // line.trim(); // local variables // String id; String transcription; String first; String start_time; 
String stop_time; String channel_string; // default channel (0) // Long channel = Annotation::DEF_CHANNEL_INDEX; // skip any blank line // if (line.countTokens() == (int32)0) continue; // get the number of fields based on endlimiter ":" // int32 num_tokens = 0; num_tokens = line.countTokens(L":"); // if the format is: // : trans1 :trans2 ... // int32 tmp1 = 0; if (line.firstChr(L":", tmp1) == (int32)0) { // loop over all the transcriptions in this line // int32 pos = 0; while (line.tokenize(transcription, pos, L":")) { // get the fields // id.assign(num_file); transcription.trim(); // create the annotation graph // AnnotationGraph angr(name_a, gtype); Anchor* ancr_start = (Anchor*)NULL; Anchor* ancr_stop = (Anchor*)NULL; String newid_start = angr.createAnchor(name_a, unit); String newid_stop = angr.createAnchor(name_a, unit); ancr_start = angr.getAnchorById(newid_start); ancr_stop = angr.getAnchorById(newid_stop); String newid = angr.createAnnotation(name_a, ancr_start, ancr_stop, transcription, (int32)channel); if (!angr.setFeature(newid, fname, level_a)) { return Error::handle(name(), L"load", ERR, __FILE__, __LINE__); } // insert the record in the database (identifier and // annotation graph) // if (!insertRecord(id, angr)) { return Error::handle(name(), L"load", ERR, __FILE__, __LINE__); } angr.clear(); // increment the file count // num_file++; // debugging message // if (debug_level_d >= Integral::BRIEF) { Console::increaseIndention(); String output; output.assign(L"number of the file processed: "); output.concat((Long)num_file); output.concat(L"\nidentifier: "); output.concat(id); output.concat(L"\nstart_time: "); output.concat(start_time); output.concat(L"\nstop_time: "); output.concat(stop_time); output.concat(L"\ntranscription: "); output.concat(transcription); Console::put(output); Console::decreaseIndention(); } } } // else if the format is just the transcription: // transcription // else if (num_tokens == 1) { transcription.assign(line); 
id.assign(num_file); // create the annotation graph // AnnotationGraph angr(name_a, gtype); Anchor* ancr_start = (Anchor*)NULL; Anchor* ancr_stop = (Anchor*)NULL; String newid_start = angr.createAnchor(name_a, unit); String newid_stop = angr.createAnchor(name_a, unit); ancr_start = angr.getAnchorById(newid_start); ancr_stop = angr.getAnchorById(newid_stop); String newid = angr.createAnnotation(name_a, ancr_start, ancr_stop, transcription, (int32)channel); if (!angr.setFeature(newid, fname, level_a)) { return Error::handle(name(), L"load", ERR, __FILE__, __LINE__); } // insert the record in the database (identifier and annotation graph) // if (!insertRecord(id, angr)) { return Error::handle(name(), L"load", ERR, __FILE__, __LINE__); } angr.clear(); // increment the file count // num_file++; // debugging message // if (debug_level_d >= Integral::BRIEF) { Console::increaseIndention(); String output; output.assign(L"number of the file processed: "); output.concat((Long)num_file); output.concat(L"\nidentifier: "); output.concat(id); output.concat(L"\nstart_time: "); output.concat(start_time); output.concat(L"\nstop_time: "); output.concat(stop_time); output.concat(L"\ntranscription: "); output.concat(transcription); Console::put(output); Console::decreaseIndention(); } } // else if the formats is: // ident: transcription // ident start_time stop_time: transcription // ident start_time stop_time channel : transcription // else if (num_tokens == 2) { // get the number of fields based on space as an endlimiter // int32 pos = 0; line.tokenize(first, pos, L":"); int32 num_tokens_space = 0; num_tokens_space = first.countTokens(); first.trim(); line.tokenize(transcription, pos, L":"); transcription.trim(); // local variables // AnnotationGraph angr(name_a, gtype); String newid_start; String newid_stop; // if the number of tokens is 1, the format is: // ident: transcription // if (num_tokens_space == 1) { // get the fields // id.assign(first); // create anchors without timming 
information // newid_start = angr.createAnchor(name_a, unit); newid_stop = angr.createAnchor(name_a, unit); } // else if the number of tokens is 3, the format is: // ident start_time stop_time : transcription // else if (num_tokens_space == 3) { // get the fields // int32 pos = 0; first.tokenize(id, pos); first.tokenize(start_time, pos); first.tokenize(stop_time, pos); offset_start.assign(start_time); offset_stop.assign(stop_time); // create anchors with timming information // newid_start = angr.createAnchor(name_a, offset_start, unit); newid_stop = angr.createAnchor(name_a, offset_stop, unit); } // else if the number of tokens is 4, the format is: // ident start_time stop_time channel : transcription // else if (num_tokens_space == 4) { // get the fields // int32 pos = 0; first.tokenize(id, pos); first.tokenize(start_time, pos); first.tokenize(stop_time, pos); first.tokenize(channel_string, pos); offset_start.assign(start_time); offset_stop.assign(stop_time); channel.assign(channel_string); // create anchors with timming information // newid_start = angr.createAnchor(name_a, offset_start, unit); newid_stop = angr.createAnchor(name_a, offset_stop, unit); } // else error // else { String msg(L"Error: check the transcription file format:"); Console::put(msg); Error::handle(name(), L"load", Error::ERR, __FILE__, __LINE__); } // create the annotation graph // Anchor* ancr_start = (Anchor*)NULL; Anchor* ancr_stop = (Anchor*)NULL; ancr_start = angr.getAnchorById(newid_start); ancr_stop = angr.getAnchorById(newid_stop); String newid = angr.createAnnotation(name_a, ancr_start, ancr_stop, transcription, (int32)channel); if (!angr.setFeature(newid, fname, level_a)) { return Error::handle(name(), L"load", ERR, __FILE__, __LINE__); } // insert the record in the database (identifier and annotation graph) // if (!insertRecord(id, angr)) { return Error::handle(name(), L"load", ERR, __FILE__, __LINE__); } angr.clear(); // increment the file count // num_file++; // debugging 
message // if (debug_level_d >= Integral::BRIEF) { Console::increaseIndention(); String output; output.assign(L"number of the file processed: "); output.concat((Long)num_file); output.concat(L"\nidentifier: "); output.concat(id); output.concat(L"\nstart_time: "); output.concat(start_time); output.concat(L"\nstop_time: "); output.concat(stop_time); output.concat(L"\ntranscription: "); output.concat(transcription); Console::put(output); Console::decreaseIndention(); } } // else error // else { String msg(L"Error: check the transcription file format:"); Console::put(msg); Error::handle(name(), L"load", Error::ERR, __FILE__, __LINE__); } } // debugging message // if (debug_level_d >= Integral::NONE) { Console::increaseIndention(); String output; output.assign(L"total number of file processed: "); output.concat((Long)num_file); Console::put(output); Console::decreaseIndention(); } // close the input transcription file // trans_file.close(); // exit gracefully // return status; } // method: load // // arguments: // Sdb& sdb: (input) sdb file name list // Filename& trans_file: (input) transcription file // // return: logical error status // // this method load the data file to transcription database // bool8 TranscriptionDatabase::load(Sdb& sdb_a, Filename& trans_file_a) { // loop from start // if (!sdb_a.gotoFirst()) { String msg(L"Error: no input file specified "); Console::put(msg); Error::handle(name(), L"load", Error::NO_PARAM_FILE, __FILE__, __LINE__); } Filename trans_file; Sof transcription_file; String transcription; String substring; int32 num_file = 0; // declare a string vector to store the transcription information // Vector trans_vec; // open the input file in read mode // File read_trans_file; if (!read_trans_file.open(trans_file_a, File::READ_ONLY)) { Console::put(L"Error in opening transcription input file"); } // read the string lines // String input_line_01; while (read_trans_file.get(input_line_01)) { trans_vec.concat(input_line_01); } // close the 
input text file // read_trans_file.close(); String name_00(L"TIDIGITS"); // create the annotation graph // String gtype_00(L"ORTHOGRAPHIC"); String ident_00(L"id_00"); String ident_01(L"id_01"); String ident_02(L"id_02"); String ident_03(L"id_03"); String newid_00; String newid_01; String newid_02; Float offset_00(0.0); Float offset_01(0.0); Anchor* ancr_00 = (Anchor*)NULL; Anchor* ancr_01 = (Anchor*)NULL; String unit_00(L"seconds"); String feat_00(L"level"); String value_00(L"word"); String value_01(L"phoneme"); String channel_00; setDataBaseName(name_00); do { sdb_a.getName(trans_file); AnnotationGraph angr_00(name_00, gtype_00); // get the transcription // String transcription = trans_vec(num_file); // int32 trans_token = transcription.countTokens(); // int32 current_tran_token = 0; String atype_00; // tokenize the transcription and generate the graph // int32 pos = 0; // int32 syntac_num = 0; // int32 current_syn = 0; String syn_string; int32 skip_token = 0; // while (transcription.tokenize(atype_00, pos)) { atype_00.assign(transcription); Long x = transcription.firstStr(L":", 0); x.debug(L"x="); transcription.deleteRange(0, (int32)x+1); transcription.trim(); transcription.debug(L"TR"); atype_00.deleteRange(x, atype_00.length()-x); atype_00.debug(L"atype"); // get one token // atype_00.trim(); String time_00; while (atype_00.tokenize(time_00, pos)) { skip_token++; if (skip_token ==1) offset_00.assign(time_00); else if (skip_token == 2) offset_01.assign(time_00); else if (skip_token == 3) channel_00.assign(time_00); } int32 channel_x = 0; if ( channel_00.eq(L"A")) { channel_00.debug(L"channel A:"); channel_x = 0; } else if (channel_00.eq(L"B")) { channel_00.debug(L"channel B:"); channel_x = 1; } if (debug_level_d >= Integral::DETAILED) { atype_00.debug(L"word:"); } // insert the word to AG // newid_00 = angr_00.createAnchor(name_00, offset_00, unit_00); newid_01 = angr_00.createAnchor(name_00, offset_01, unit_00); ancr_00 = angr_00.getAnchorById(newid_00); 
ancr_01 = angr_00.getAnchorById(newid_01); newid_02 = angr_00.createAnnotation(name_00, ancr_00, ancr_01, transcription, channel_x); if (!angr_00.setFeature(newid_02, feat_00, value_00)) { return Error::handle(name(), L"load", ERR, __FILE__, __LINE__); } } // end of transcription tokenize // test the insert method // if (!insertRecord(trans_file, angr_00)) { return Error::handle(name(), L"load", ERR, __FILE__, __LINE__); } angr_00.clear(); // move one forward for file count // num_file++; if (debug_level_d >= Integral::DETAILED) { Long(num_file).debug(L"total number of file processed="); trans_file.debug(L"file name"); transcription.debug(L"transcription"); } // debug(L"upto now"); } while (sdb_a.gotoNext()); return true; } // method: load // // arguments: // Sdb& sdb: (input) sdb id list // Filename& trans_file: (input) transcription file // Filename& lexicon_file: (input) lexicon file // bool8 flag: (input) flag to indicate if the time information exist // // return: logical error status // // this method load data files to the transcription database // bool8 TranscriptionDatabase::load(Sdb& sdb_a, Filename& trans_file_a, Filename& lexicon_file_a, bool8 flag_a) { // loop from start // if (!sdb_a.gotoFirst()) { String msg(L"Error: no input file specified "); Console::put(msg); Error::handle(name(), L"load", Error::NO_PARAM_FILE, __FILE__, __LINE__); } Filename trans_file; Sof transcription_file; String transcription; int32 num_file = 0; // declare a string vector to store the transcription information // Vector trans_vec; // open the input file in read mode // File read_trans_file; if (!read_trans_file.open(trans_file_a, File::READ_ONLY)) { Console::put(L"Error in opening transcription input file"); } // read the string lines // String input_line_01; while (read_trans_file.get(input_line_01)) { trans_vec.concat(input_line_01); } // close the input text file // read_trans_file.close(); int32 length_01 = trans_vec.length(); Long(length_01).debug(L"total lines in 
transcription = "); // declare the hashtable for the word and its pronunciation // HashTable pronun_map_d; // open the input file in read lexicon // Vector lexicon_symbol_list; // open the input file in read mode // File read_lexicon_file; if (!read_lexicon_file.open(lexicon_file_a, File::READ_ONLY)) { Console::put(L"Error in opening lexicon input file"); } // declare variables // String str; Vector nonsp_def, pre_list, word_list, rule_list; // read each line // while (read_lexicon_file.get(str)) { str.debug(L"str"); // pre-process the input lexicon lines and merge the same lines // bool8 same = false; for (int32 i = 0; i < pre_list.length(); i++) { if (str.eq(pre_list(i))) same = true; } if (!same) { pre_list.concat(str); } } read_lexicon_file.close(); // process each lexicon line after pre-processing in the pre_list // for (int32 i = 0; i < pre_list.length(); i++) { String head_word, symbol, sequence; int32 pos(0); String delim(L" "); String lex_str(pre_list(i)); // get the first word in the lexicon line // lex_str.tokenize(head_word, pos); String key_word = head_word; int32 alt_index = 0; while (pronun_map_d.containsKey(key_word)) { key_word.assign(head_word); key_word.concat(L"."); key_word.concat(alt_index++); } String rest_string; lex_str.tokenize(rest_string, pos, lex_str.length() - pos); rest_string.trim(); rest_string.debug(L"lexicon"); pronun_map_d.insert(key_word, &rest_string); } if (debug_level_d >= Integral::DETAILED) { pronun_map_d.debug(L"lexicon"); } String name_00(L"SPINE"); // create the annotation graph // String gtype_00(L"ORTHOGRAPHIC"); String ident_00(L"id_00"); String ident_01(L"id_01"); String ident_02(L"id_02"); String ident_03(L"id_03"); String ident_04(L"id_04"); String newid_00; String newid_01; String newid_02; String newid_03; String newid_04; String synid_00; String synid_01; Float offset_00(0.0); Float offset_01(0.0); Float offset_02(0.0); Anchor* ancr_00 = (Anchor*)NULL; Anchor* ancr_01 = (Anchor*)NULL; String unit_00(L"seconds"); 
String feat_00(L"level"); String value_00(L"syntactic"); String value_01(L"word"); String value_02(L"phoneme"); String channel_00; setDataBaseName(name_00); do { sdb_a.getName(trans_file); AnnotationGraph angr_00(name_00, gtype_00); // get the transcription // String transcription = trans_vec(num_file); // pre-processing transcription // String atype_00; atype_00.assign(transcription); int32 pos = 0; int32 skip_token = 0; int32 channel_x = 0; if (flag_a) { int32 x = transcription.firstStr(L":", 0); Long(x).debug(L"':' position ="); transcription.deleteRange(0, (int32)x+1); transcription.trim(); transcription.debug(L"transcription"); atype_00.deleteRange(x, atype_00.length()-x); atype_00.debug(L"atype"); atype_00.trim(); // tokenize the transcription and generate the graph // String time_00; while (atype_00.tokenize(time_00, pos)) { skip_token++; if (skip_token ==1) offset_00.assign(time_00); else if (skip_token == 2) offset_01.assign(time_00); else if (skip_token == 3) channel_00.assign(time_00); } if ( channel_00.eq(L"A")) { channel_00.debug(L"channel A:"); channel_x = 0; } else if (channel_00.eq(L"B")) { channel_00.debug(L"channel B:"); channel_x = 1; } if (debug_level_d >= Integral::DETAILED) { atype_00.debug(L"word:"); } } // end of flag_a int32 trans_token = transcription.countTokens(); int32 current_tran_token = 0; int32 token_count = 0; if (flag_a) { newid_00 = angr_00.createAnchor(name_00, offset_00, unit_00); } else { newid_00 = angr_00.createAnchor(name_00, unit_00); } synid_00 = newid_00; pos = 0; while (transcription.tokenize(atype_00, pos)) { // get one token // atype_00.trim(); token_count++; atype_00.debug(L"word:"); if (debug_level_d >= Integral::DETAILED) { atype_00.debug(L"word:"); } // insert the word to AG // if (token_count == trans_token) { if (flag_a) { newid_01 = angr_00.createAnchor(name_00, offset_01, unit_00); } else { newid_01 = angr_00.createAnchor(name_00, unit_00); } } else { newid_01 = angr_00.createAnchor(name_00, unit_00); } 
ancr_00 = angr_00.getAnchorById(newid_00); ancr_01 = angr_00.getAnchorById(newid_01); ancr_00->debug(L"new_00"); ancr_01->debug(L"new_01"); newid_02 = angr_00.createAnnotation(name_00, ancr_00, ancr_01, atype_00, channel_x); ancr_00->debug(L"new_00A"); ancr_01->debug(L"new_01A"); if (!angr_00.setFeature(newid_02, feat_00, value_01)) { return Error::handle(name(), L"load", ERR, __FILE__, __LINE__); } ancr_00->debug(L"new_00AA"); ancr_01->debug(L"new_01AA"); // add phone level AG here // String key_phone = atype_00; String delim(L" "); int32 alt_index = 0; while (pronun_map_d.containsKey(key_phone)) { int32 pos(0); String symbol, sub_symbol; symbol.assign(*pronun_map_d.get(key_phone)); if (debug_level_d >= Integral::DETAILED) { symbol.debug(L"sub_symbol--------------------"); } int32 total_token = symbol.countTokens(); int32 token_number = 0; newid_03 = newid_00; while (symbol.tokenize(sub_symbol, pos, delim)) { if (token_number == total_token - 1) { newid_04 = newid_01; } else { newid_04 = angr_00.createAnchor(name_00, unit_00); } ancr_00 = angr_00.getAnchorById(newid_03); ancr_01 = angr_00.getAnchorById(newid_04); ancr_00->debug(L"new_00B"); ancr_01->debug(L"new_01B"); newid_02 = angr_00.createAnnotation(name_00, ancr_00, ancr_01, sub_symbol); ancr_00->debug(L"new_00BB"); ancr_01->debug(L"new_01BB"); if (!angr_00.setFeature(newid_02, feat_00, value_02)) { return Error::handle(name(), L"load", ERR, __FILE__, __LINE__); } ancr_00->debug(L"new_00BBB"); ancr_01->debug(L"new_01BBB"); newid_03 = newid_04; token_number++; } // end of while tokenize key_phone.assign(atype_00); key_phone.concat(L"."); key_phone.concat(alt_index++); if (debug_level_d >= Integral::DETAILED) { key_phone.debug(L"key_phone=============="); } } // end of while containsKey newid_00 = newid_01; current_tran_token++; if (current_tran_token == trans_token) { synid_01 = newid_01; ancr_00 = angr_00.getAnchorById(synid_00); ancr_01 = angr_00.getAnchorById(synid_01); //syn_string.trim(); 
ancr_00->debug(L"new_00C"); ancr_01->debug(L"new_01C"); newid_02 = angr_00.createAnnotation(name_00, ancr_00, ancr_01, transcription, channel_x); ancr_00->debug(L"new_00D"); ancr_01->debug(L"new_01D"); if (!angr_00.setFeature(newid_02, feat_00, value_00)) { return Error::handle(name(), L"load", ERR, __FILE__, __LINE__); } // syn_string.clear(); ancr_00->debug(L"new_00E"); ancr_01->debug(L"new_01E"); synid_00 = newid_00; } } // end of transcription tokenize // test the insert method // if (!insertRecord(trans_file, angr_00)) { return Error::handle(name(), L"load", ERR, __FILE__, __LINE__); } angr_00.clear(); // move one forward for file count // num_file++; if (debug_level_d >= Integral::DETAILED) { Long(num_file).debug(L"total number of file processed="); trans_file.debug(L"file name"); transcription.debug(L"transcription"); } } while (sdb_a.gotoNext()); return true; } // method: load // // arguments: // Sdb& sdb: (input) sdb id list // Filename& trans_file: (input) transcription file // Filename& lexicon_file: (input) lexicon file // Filename& syntactic_file: (input) syntactic file // // return: logical error status // // this method load data files to the transcription database // bool8 TranscriptionDatabase::load(Sdb& sdb_a, Filename& trans_file_a, Filename& lexicon_file_a, Filename& syntactic_file_a) { // loop from start // if (!sdb_a.gotoFirst()) { String msg(L"Error: no input file specified "); Console::put(msg); Error::handle(name(), L"load", Error::NO_PARAM_FILE, __FILE__, __LINE__); } String syntactic; // declare a string vector to store the syntactic information // Vector syntactic_vec; // open the input file in read mode // Sof syn_file; syn_file.open(syntactic_file_a); syntactic_vec.read(syn_file, 0); // close the input text file // syn_file.close(); int32 length = syntactic_vec.length(); if (debug_level_d >= Integral::DETAILED) { Long(length).debug(L"total lines="); syntactic_vec.debug(L"syn"); } Filename trans_file; Sof transcription_file; String 
transcription; int32 num_file = 0; // declare a string vector to store the transcription information // Vector trans_vec; // open the input file in read mode // File read_trans_file; if (!read_trans_file.open(trans_file_a, File::READ_ONLY)) { Console::put(L"Error in opening transcription input file"); } // read the string lines // String input_line_01; while (read_trans_file.get(input_line_01)) { trans_vec.concat(input_line_01); } // close the input text file // read_trans_file.close(); int32 length_01 = trans_vec.length(); Long(length_01).debug(L"total lines="); // declare the hashtable for the word and its pronunciation // HashTable pronun_map_d; // open the input file in read lexicon // Vector lexicon_symbol_list; // open the input file in read mode // File read_lexicon_file; if (!read_lexicon_file.open(lexicon_file_a, File::READ_ONLY)) { Console::put(L"Error in opening lexicon input file"); } // declare variables // String str; Vector nonsp_def, pre_list, word_list, rule_list; // read each line // while (read_lexicon_file.get(str)) { // pre-process the input lexicon lines and merge the same lines // bool8 same = false; for (int32 i = 0; i < pre_list.length(); i++) { if (str.eq(pre_list(i))) same = true; } if (!same) { pre_list.concat(str); } } read_lexicon_file.close(); // process each lexicon line after pre-processing in the pre_list // for (int32 i = 0; i < pre_list.length(); i++) { String head_word, symbol, sequence; int32 pos(0); String delim(L" "); String lex_str(pre_list(i)); // get the first word in the lexicon line // lex_str.tokenize(head_word, pos); String key_word = head_word; int32 alt_index = 0; while (pronun_map_d.containsKey(key_word)) { key_word.assign(head_word); key_word.concat(L"."); key_word.concat(alt_index++); } String rest_string; lex_str.tokenize(rest_string, pos, lex_str.length() - pos); rest_string.trim(); pronun_map_d.insert(key_word, &rest_string); } if (debug_level_d >= Integral::DETAILED) { pronun_map_d.debug(L"lexicon"); } String 
name_00(L"TIDIGITS"); // create the annotation graph // String gtype_00(L"ORTHOGRAPHIC"); String ident_00(L"id_00"); String ident_01(L"id_01"); String ident_02(L"id_02"); String ident_03(L"id_03"); String ident_04(L"id_04"); String newid_00; String newid_01; String newid_02; String newid_03; String newid_04; String synid_00; String synid_01; Float offset_00(0.0); Anchor* ancr_00 = (Anchor*)NULL; Anchor* ancr_01 = (Anchor*)NULL; String unit_00(L"seconds"); String feat_00(L"level"); String value_00(L"syntactic"); String value_01(L"word"); String value_02(L"phoneme"); setDataBaseName(name_00); do { sdb_a.getName(trans_file); AnnotationGraph angr_00(name_00, gtype_00); // get the transcription // String transcription = trans_vec(num_file); int32 trans_token = transcription.countTokens(); int32 current_tran_token = 0; String atype_00; // tokenize the transcription and generate the graph // int32 pos = 0; newid_00 = angr_00.createAnchor(name_00, offset_00, unit_00); synid_00 = newid_00; int32 syntac_num = 0; int32 current_syn = 0; String syn_string; int32 skip_token = 0; while (transcription.tokenize(atype_00, pos)) { // get one token // atype_00.trim(); skip_token++; atype_00.debug(L"word:"); if (skip_token < 4) continue; if (debug_level_d >= Integral::DETAILED) { atype_00.debug(L"word:"); } // insert the word to AG // newid_01 = angr_00.createAnchor(name_00, offset_00, unit_00); ancr_00 = angr_00.getAnchorById(newid_00); ancr_01 = angr_00.getAnchorById(newid_01); newid_02 = angr_00.createAnnotation(name_00, ancr_00, ancr_01, atype_00); if (!angr_00.setFeature(newid_02, feat_00, value_01)) { return Error::handle(name(), L"load", ERR, __FILE__, __LINE__); } // add phone level AG here // String key_phone = atype_00; String delim(L" "); int32 alt_index = 0; while (pronun_map_d.containsKey(key_phone)) { int32 pos(0); String symbol, sub_symbol; symbol.assign(*pronun_map_d.get(key_phone)); if (debug_level_d >= Integral::DETAILED) { 
symbol.debug(L"sub_symbol--------------------"); } int32 total_token = symbol.countTokens(); int32 token_number = 0; newid_03 = newid_00; while (symbol.tokenize(sub_symbol, pos, delim)) { if (token_number == total_token - 1) { newid_04 = newid_01; } else { newid_04 = angr_00.createAnchor(name_00, offset_00, unit_00); } ancr_00 = angr_00.getAnchorById(newid_03); ancr_01 = angr_00.getAnchorById(newid_04); newid_02 = angr_00.createAnnotation(name_00, ancr_00, ancr_01, sub_symbol); if (!angr_00.setFeature(newid_02, feat_00, value_02)) { return Error::handle(name(), L"load", ERR, __FILE__, __LINE__); } newid_03 = newid_04; token_number++; } // end of while tokenize key_phone.assign(atype_00); key_phone.concat(L"."); key_phone.concat(alt_index++); if (debug_level_d >= Integral::DETAILED) { key_phone.debug(L"key_phone=============="); } } // end of while containsKey newid_00 = newid_01; syntac_num++; syn_string.concat(atype_00); syn_string.concat(L" "); current_tran_token++; if (current_tran_token == trans_token || syntac_num == (int32)syntactic_vec(current_syn)) { synid_01 = newid_01; ancr_00 = angr_00.getAnchorById(synid_00); ancr_01 = angr_00.getAnchorById(synid_01); syn_string.trim(); newid_02 = angr_00.createAnnotation(name_00, ancr_00, ancr_01, syn_string); if (!angr_00.setFeature(newid_02, feat_00, value_00)) { return Error::handle(name(), L"load", ERR, __FILE__, __LINE__); } syn_string.clear(); synid_00 = newid_00; syntac_num = 0; current_syn++; } } // end of transcription tokenize // test the insert method // if (!insertRecord(trans_file, angr_00)) { return Error::handle(name(), L"load", ERR, __FILE__, __LINE__); } angr_00.clear(); // move one forward for file count // num_file++; if (debug_level_d >= Integral::DETAILED) { Long(num_file).debug(L"total number of file processed="); trans_file.debug(L"file name"); transcription.debug(L"transcription"); } } while (sdb_a.gotoNext()); return true; } // method: storePartial // // arguments: // Sof& sof_a: (input) 
database file name // int32 tag: (input) sof tag // AnnotationGraph& graph: (input) annotation graph // // return: logical error status // // this method stores the annotation graph to the database // bool8 TranscriptionDatabase::storePartial(Sof& sof_a, int32 tag_a, AnnotationGraph& graph_a) { // write the annotation graph to the database // graph_a.write(sof_a, tag_a); // exit gracefully // return true; } // method: storePartial // // arguments: // Sof& sof_a: (input) database file name // int32 tag: (input) sof tag // Vector& keys: (input) database identifiers // // return: logical error status // // this method stores the database header to the database // bool8 TranscriptionDatabase::storePartial(Sof& sof_a, int32 tag_a, Vector& keys_a) { // declare local variables // int32 obj_size = 0; // determine the size of the object // if (sof_a.isText()) { obj_size = Sof::ANY_SIZE; } else { obj_size = name_d.sofSize() + keys_a.sofSize(); } // put the object into the sof file's index // if (!sof_a.put(CLASS_NAME, tag_a, obj_size)) { return false; } // write the database name // name_d.writeData(sof_a, PARAM_NAME); // write the keys associated with the hash table // keys_a.writeData(sof_a, PARAM_KEYS); // exit gracefully // return true; } // method: storePartial // // arguments: // String& trans: (input) transcription // String& name: (input) database name // String& level: (input) transcription level // int32 num: (input) the number of transcription in db // Sof& db_sof: (input) db file name // int32 tag: (input) tag // // return: a bool8 flag indicating status // // this method stores one transcription into the // TranscriptionDatabase. 
//
bool8 TranscriptionDatabase::storePartial(String& trans_a, String& name_a,
					  String& level_a, int32 num_a,
					  Sof& db_sof_a, int32 tag_a) {

  // local variables
  //
  // NOTE(review): line, trans_file, offset_start and offset_stop are
  // declared but never used in this method
  //
  bool8 status = true;
  String line;
  File trans_file;

  // debugging information
  //
  if (debug_level_d > Integral::BRIEF) {
    Console::increaseIndention();
    String output;
    output.assign(L"database name: ");
    output.concat(name_a);
    output.concat(L"\ntranscription level: ");
    output.concat(level_a);
    Console::put(output);
    Console::decreaseIndention();
  }

  // default type
  //
  String gtype(L"ORTHOGRAPHIC");

  // default offset values for start and the stop times (0.0)
  //
  Float offset_start(AnnotationGraph::DEF_OFFSET);
  Float offset_stop(AnnotationGraph::DEF_OFFSET);

  // default feature name and unit for transcriptions
  //
  String fname(L"level");
  String unit(L"seconds");

  // default channel (0)
  //
  Long channel = Annotation::DEF_CHANNEL_INDEX;

  // create the annotation graph: a single annotation spanning two
  // untimed anchors holds the whole transcription string, tagged
  // with the transcription level as a feature
  //
  AnnotationGraph angr(name_a, gtype);
  Anchor* ancr_start = (Anchor*)NULL;
  Anchor* ancr_stop = (Anchor*)NULL;
  String newid_start = angr.createAnchor(name_a, unit);
  String newid_stop = angr.createAnchor(name_a, unit);
  ancr_start = angr.getAnchorById(newid_start);
  ancr_stop = angr.getAnchorById(newid_stop);
  String newid = angr.createAnnotation(name_a, ancr_start, ancr_stop,
				       trans_a, (int32)channel);
  if (!angr.setFeature(newid, fname, level_a)) {
    return Error::handle(name(), L"store", ERR, __FILE__, __LINE__);
  }

  // write the graph to file under tag num_a
  //
  // NOTE(review): num_a is passed by value, so the post-increment has
  // no effect visible to the caller -- confirm it is intentional
  //
  if (!storePartial(db_sof_a, num_a++, angr)) {
    return Error::handle(name(), L"store", ERR, __FILE__, __LINE__);
  }
  angr.clear();

  // exit gracefully
  //
  return status;
}

// method: store
//
// arguments:
//  Sof& db_sof: (input) db file name
//  int32 tag: (input) tag
//
// return: logical error status
//
// this method stores the transcription database (index plus all
// annotation graphs held in the hash table) to a file
//
bool8 TranscriptionDatabase::store(Sof& db_sof_a, int32 tag_a) {

  // write database index data
  //
  write(db_sof_a, tag_a);

  // get the values associated with the hash table
  //
  Vector values;
  if (!hash_d.values(values)) {
    return Error::handle(name(), L"store", Error::ARG,
			 __FILE__, __LINE__);
  }

  // write the objects for each annotationgraph, one sof tag per graph
  //
  int32 len = values.length();
  for (int32 i = 0; i < len; i++) {
    values(i).write(db_sof_a, i);
  }

  // exit gracefully
  //
  return true;
}

// method: store
//
// arguments:
//  Filename& trans_file: (input) transcription file
//  String& name: (input) database name
//  String& level: (input) transcription level
//  Sof& db_sof: (input) db file name
//  int32 tag: (input) tag
//
// return: a bool8 flag indicating status
//
// this method parses the transcriptions and then stores them as the
// TranscriptionDatabase. Note that unlike the load method, this
// method parses and then stores the transcriptions one at a time so
// that it requires a small amount of memory
//
bool8 TranscriptionDatabase::store(Filename& trans_file_a, String& name_a,
				   String& level_a, Sof& db_sof_a,
				   int32 tag_a) {

  // local variables
  //
  bool8 status = true;
  String line;
  int32 num_file = 0;
  File trans_file;
  Vector symbol_table;

  // debugging information
  //
  if (debug_level_d >= Integral::BRIEF) {
    Console::increaseIndention();
    String output;
    output.assign(L"\nloading input transcription file: ");
    output.concat(trans_file_a);
    Console::put(output);
    Console::decreaseIndention();
  }

  // open the input transcription file in read mode
  //
  // NOTE(review): the Error::handle result is not returned here, so
  // on open failure execution falls through to the parsing loop --
  // confirm Error::handle exits the process on Error::ERR
  //
  if (!trans_file.open(trans_file_a, File::READ_ONLY)) {
    String msg(L"Error: no input transcription file specified ");
    Console::put(msg);
    Error::handle(name(), L"store", Error::ERR, __FILE__, __LINE__);
  }

  // debugging information
  //
  if (debug_level_d >= Integral::BRIEF) {
    Console::increaseIndention();
    String output;
    output.assign(L"database name: ");
    output.concat(name_a);
    output.concat(L"\ntranscription level: ");
    output.concat(level_a);
    Console::put(output);
    Console::decreaseIndention();
  }

  // default type
  //
  String gtype(L"ORTHOGRAPHIC");

  // default offset values for start and the stop times (0.0)
  //
  Float offset_start(AnnotationGraph::DEF_OFFSET);
  Float offset_stop(AnnotationGraph::DEF_OFFSET);

  // default feature name and unit for transcriptions
  //
  String fname(L"level");
  String unit(L"seconds");

  // read the transcription file line by line
  //
  while (trans_file.get(line)) {

    // get rid of blank spaces on both the sides of the line
    //
    line.trim();

    // local variables
    //
    String id;
    String transcription;
    String first;
    String start_time;
    String stop_time;
    String channel_string;

    // default channel (0)
    //
    Long channel = Annotation::DEF_CHANNEL_INDEX;

    // skip any blank line
    //
    if (line.countTokens() == (int32)0)
      continue;

    // get the number of tokens based on the delimiter ":"
    //
    int32 num_tokens = 0;
    num_tokens = line.countTokens(L":");

    // get the number of delimiters
    //
    int32 num_delims = 0;
    num_delims = line.countDelimiters(L":");

    // if the format is (line starts with ":"):
    //  : trans1 :trans2 ...
    //
    int32 tmp1 = 0;
    if (line.firstChr(L":", tmp1) == (int32)0) {

      // loop over all the transcriptions in this line: each token
      // becomes its own record whose identifier is the running file
      // count
      //
      int32 pos = 0;
      while (line.tokenize(transcription, pos, L":")) {

	// get the fields
	//
	id.assign(num_file);
	transcription.trim();

	// create the annotation graph (anchors carry no timing
	// information in this format)
	//
	AnnotationGraph angr(name_a, gtype);
	Anchor* ancr_start = (Anchor*)NULL;
	Anchor* ancr_stop = (Anchor*)NULL;
	String newid_start = angr.createAnchor(name_a, unit);
	String newid_stop = angr.createAnchor(name_a, unit);
	ancr_start = angr.getAnchorById(newid_start);
	ancr_stop = angr.getAnchorById(newid_stop);
	String newid = angr.createAnnotation(name_a, ancr_start, ancr_stop,
					     transcription, (int32)channel);
	if (!angr.setFeature(newid, fname, level_a)) {
	  return Error::handle(name(), L"store", ERR, __FILE__, __LINE__);
	}

	// write the graph to file
	//
	if (!storePartial(db_sof_a, num_file++, angr)) {
	  return Error::handle(name(), L"store", ERR, __FILE__, __LINE__);
	}
	angr.clear();

	// save the identifier in the symbol-table
	//
	symbol_table.concat(id);

	// debugging message
	//
	if (debug_level_d >= Integral::BRIEF) {
	  Console::increaseIndention();
	  String output;
	  output.assign(L"number of the file processed: ");
	  output.concat((Long)num_file);
	  output.concat(L"\nidentifier: ");
	  output.concat(id);
	  output.concat(L"\nstart_time: ");
	  output.concat(start_time);
	  output.concat(L"\nstop_time: ");
	  output.concat(stop_time);
	  output.concat(L"\ntranscription: ");
	  output.concat(transcription);
	  Console::put(output);
	  Console::decreaseIndention();
	}
      }
    }

    // else if the format is just the transcription:
    //  transcription
    //
    else if ((num_tokens == (int32)1) && (num_delims == (int32)0)) {
      transcription.assign(line);
      id.assign(num_file);

      // create the annotation graph (anchors carry no timing
      // information in this format)
      //
      AnnotationGraph angr(name_a, gtype);
      Anchor* ancr_start = (Anchor*)NULL;
      Anchor* ancr_stop = (Anchor*)NULL;
      String newid_start = angr.createAnchor(name_a, unit);
      String newid_stop = angr.createAnchor(name_a, unit);
      ancr_start = angr.getAnchorById(newid_start);
      ancr_stop = angr.getAnchorById(newid_stop);
      String newid = angr.createAnnotation(name_a, ancr_start, ancr_stop,
					   transcription, (int32)channel);
      if (!angr.setFeature(newid, fname, level_a)) {
	return Error::handle(name(), L"store", ERR, __FILE__, __LINE__);
      }

      // write the graph to file
      //
      if (!storePartial(db_sof_a, num_file++, angr)) {
	return Error::handle(name(), L"store", ERR, __FILE__, __LINE__);
      }
      angr.clear();

      // save the identifier in the symbol-table
      //
      symbol_table.concat(id);

      // debugging message
      //
      if (debug_level_d >= Integral::BRIEF) {
	Console::increaseIndention();
	String output;
	output.assign(L"number of the file processed: ");
	output.concat((Long)num_file);
	output.concat(L"\nidentifier: ");
	output.concat(id);
	output.concat(L"\nstart_time: ");
	output.concat(start_time);
	output.concat(L"\nstop_time: ");
	output.concat(stop_time);
	output.concat(L"\ntranscription: ");
	output.concat(transcription);
	Console::put(output);
	Console::decreaseIndention();
      }
    }

    // else if the format is (exactly one ":" on the line):
    //  ident :
    //  ident: transcription
    //  ident start_time stop_time: transcription
    //  ident start_time stop_time channel : transcription
    //
    else if (num_delims == (int32)1) {

      // split the line at ":" into the header fields (first) and the
      // transcription text, and count the space-separated header
      // fields
      //
      int32 pos = 0;
      line.tokenize(first, pos, L":");
      int32 num_tokens_space = 0;
      num_tokens_space = first.countTokens();
      first.trim();
      line.tokenize(transcription, pos, L":");
      transcription.trim();

      // local variables
      //
      AnnotationGraph angr(name_a, gtype);
      String newid_start;
      String newid_stop;

      // if the format is an empty transcription with no symbols:
      //  ident :
      //
      if (num_tokens == 1) {

	// get the fields
	//
	id.assign(first);

	// create anchors without timing information
	//
	newid_start = angr.createAnchor(name_a, unit);
	newid_stop = angr.createAnchor(name_a, unit);
      }

      // else if the format is:
      //  ident: transcription
      //  ident start_time stop_time: transcription
      //  ident start_time stop_time channel : transcription
      //
      else if (num_tokens == 2) {

	// if the number of tokens is 1, the format is:
	//  ident: transcription
	//
	if (num_tokens_space == 1) {

	  // get the fields
	  //
	  id.assign(first);

	  // create anchors without timing information
	  //
	  newid_start = angr.createAnchor(name_a, unit);
	  newid_stop = angr.createAnchor(name_a, unit);
	}

	// else if the number of tokens is 3, the format is:
	//  ident start_time stop_time : transcription
	//
	else if ((num_tokens_space == 3) && (num_delims == (int32)1)) {

	  // get the fields
	  //
	  int32 pos = 0;
	  first.tokenize(id, pos);
	  first.tokenize(start_time, pos);
	  first.tokenize(stop_time, pos);
	  offset_start.assign(start_time);
	  offset_stop.assign(stop_time);

	  // create anchors with timing information
	  //
	  newid_start = angr.createAnchor(name_a, offset_start, unit);
	  newid_stop = angr.createAnchor(name_a, offset_stop, unit);
	}

	// else if the number of tokens is 4, the format is:
	//  ident start_time stop_time channel : transcription
	//
	else if ((num_tokens_space == 4) && (num_delims == (int32)1)) {

	  // get the fields, including the channel index
	  //
	  int32 pos = 0;
	  first.tokenize(id, pos);
	  first.tokenize(start_time, pos);
	  first.tokenize(stop_time, pos);
	  first.tokenize(channel_string, pos);
	  offset_start.assign(start_time);
	  offset_stop.assign(stop_time);
	  channel.assign(channel_string);

	  // create anchors with timing information
	  //
	  newid_start = angr.createAnchor(name_a, offset_start, unit);
	  newid_stop = angr.createAnchor(name_a, offset_stop, unit);
	}
      }

      // else error
      //
      // NOTE(review): the Error::handle result is not returned, so on
      // a format error execution would continue below with empty
      // anchor ids -- confirm Error::handle exits on Error::ERR
      //
      else {
	String msg(L"Error: check the transcription file format:");
	Console::put(msg);
	Error::handle(name(), L"store", Error::ERR, __FILE__, __LINE__);
      }

      // create the annotation graph from the anchors chosen above
      //
      Anchor* ancr_start = (Anchor*)NULL;
      Anchor* ancr_stop = (Anchor*)NULL;
      ancr_start = angr.getAnchorById(newid_start);
      ancr_stop = angr.getAnchorById(newid_stop);
      String newid = angr.createAnnotation(name_a, ancr_start, ancr_stop,
					   transcription, (int32)channel);
      if (!angr.setFeature(newid, fname, level_a)) {
	return Error::handle(name(), L"store", ERR, __FILE__, __LINE__);
      }

      // write the graph to file
      //
      if (!storePartial(db_sof_a, num_file++, angr)) {
	return Error::handle(name(), L"store", ERR, __FILE__, __LINE__);
      }
      angr.clear();

      // save the identifier in the symbol-table
      //
      symbol_table.concat(id);

      // debugging message
      //
      if (debug_level_d >= Integral::BRIEF) {
	Console::increaseIndention();
	String output;
	output.assign(L"number of the file processed: ");
	output.concat((Long)num_file);
	output.concat(L"\nidentifier: ");
	output.concat(id);
	output.concat(L"\nstart_time: ");
	output.concat(start_time);
	output.concat(L"\nstop_time: ");
	output.concat(stop_time);
	output.concat(L"\ntranscription: ");
	output.concat(transcription);
	Console::put(output);
	Console::decreaseIndention();
      }
    }

    // else error: more than one ":" delimiter is not a supported
    // format
    //
    else {
      String msg(L"Error: check the transcription file format:");
      Console::put(msg);
      Error::handle(name(), L"store", Error::ERR, __FILE__, __LINE__);
    }
  }

  // write the symbol table (record identifiers) under tag 0
  //
  storePartial(db_sof_a, (int32)0, symbol_table);

  // debugging message
  //
  if (debug_level_d >= Integral::NONE) {
    Console::increaseIndention();
    String output;
    output.assign(L"total number of file processed: ");
    output.concat((Long)num_file);
    Console::put(output);
    Console::decreaseIndention();
  }

  // close the input transcription file
  //
  trans_file.close();

  // exit gracefully
  //
  return status;
}