// file: $isip/class/pr/LanguageModelXML/LanguageModelXML.h // version: $Id: LanguageModelXML.h 10636 2007-01-26 22:18:09Z tm334 $ // make sure definitions are only made once // #ifndef ISIP_LANGUAGEMODELXML #define ISIP_LANGUAGEMODELXML //--------------------------------------------------------------------------- // // ISIP include files // //--------------------------------------------------------------------------- #if defined(HAVE_EXPAT) #ifndef ISIP_XMLPARSER #include #endif #endif #ifndef ISIP_XMLTOKEN #include #endif #ifndef ISIP_FILENAME #include #endif #ifndef ISIP_DIGRAPH #include #endif #ifndef ISIP_ULONG #include #endif #ifndef ISIP_HIERARCHICAL_DIGRAPH #include #endif #ifndef ISIP_SHORT #include #endif #ifndef ISIP_TRIPLE #include #endif #ifndef ISIP_PAIR #include #endif #ifndef ISIP_SDB #include #endif #ifndef ISIP_SOF #include #endif #ifndef ISIP_LANGUAGE_MODEL_BASE #include #endif // rule type definition // typedef Vector Rule; // LanguageModelXML: a class used for adding XML support to // LanguageModel. This class transparently loads XML grammars // and converts them into DiGraph format, and // performs the conversion in the other direction. // // GLOSSARY // // "structure": // An XML "structure" is any unit of xml code which is // properly nested. For example: // "" // is NOT an xml structure, because it has not close tag. // "" // is an XML structure, for it counts as both an open and close tag. // " content " // is an XML structure // "content" // is an XML structure // content // is NOT an XML structure. // // "nested": // A nested XML structure is any XMLToken or set of XMLTokens which // are properly nested within an open and close tag. For example: // " content " // Here, the structure "content" is nested within the structure // " content ", which is in turn nested within the // structure " content " // // "properly nested": // A structure is properly nested if every XML structure within it // is complete. 
//      That is, every XML open tag contained in this
//      structure must have a matching close tag. Consider every open
//      and close tag within this structure as a structure as well, and
//      apply the definition recursively.
//
// "circumscribing":
//      All structures in which another structure is nested circumscribe that
//      structure.
//
// "following":
//      A following tag or structure is one that occurs later in an XML
//      document when traversing the document from start to end.
//              ""
//      Here the metadata tag follows the item tag.
//
// "preceding":
//      A preceding tag or structure is one that occurs earlier in an XML
//      document when traversing the document from start to end.
//              ""
//      Here the item tag precedes the metadata tag.
//
// "depth":
//      The depth of an XML tag or structure is a measure of the number of
//      structures in which a tag or structure is nested. When traversing
//      the document from start to end, starting at 0, every start tag
//      increases the depth by one, every end tag reduces the depth by one,
//      and character data does not affect the depth.
//      See the example below:
//
//      NOTE(review): the XMLToken column of this table is empty in this
//      copy of the file except for the character data row; the tag text
//      appears to have been lost and should be restored from the original.
//
//      depth   XMLToken
//      0
//      1
//      2
//      2       character data
//      1
//      0
//
// "start/end/open/close tags/tokens":
//      "" is an XML tag or token (terms used interchangeably)
//      "" is a start or open tag (terms used interchangeably)
//      "" is an end or close tag (terms used interchangeably)
//
// class: LanguageModelXML
//
// publicly derives from LanguageModelBase. everything except the few
// inline accessors below is only declared here; the definitions live
// outside this header.
//
// NOTE(review): Vector, Pair and Triple appear without template arguments
// in many declarations below. that text appears to have been lost from
// this copy of the file -- confirm the element types against the original.
//
class LanguageModelXML : public LanguageModelBase {

  //---------------------------------------------------------------------------
  //
  // public constants
  //
  //---------------------------------------------------------------------------
public:

  // constant: class name
  //
  static const String CLASS_NAME;

  // constants: file parameters
  //
  // DEF_PARAM, DEF_TAG and CLASS_NAME are used as default arguments of the
  // read/write methods declared further down
  //
  static const String DEF_PARAM;
  static const int32 DEF_LEVEL;
  static const String PARAM_GRAMMARS;
  static const String PARAM_UNDERSCORE;
  static const int32 DEF_TAG;

  // constants: default attribute values
  //
  static const float32 DEF_WEIGHT;

  // constants: delimiters used when parsing contexts
  //
  static const String CONTEXT_LABEL_PREFIX;
  static const String CONTEXT_SYMBOL_DELIM;

  // constants: Special rule definitions
  //
  static const String SPECIAL_GARBAGE;
  static const String SPECIAL_VOID;
  static const String SPECIAL_NULL;

  // constants: handled XML token definitions
  //
  // NOTE(review): these names, and the attribute names below, look like
  // the element and attribute names of the W3C SRGS XML grammar format;
  // the actual string values are defined outside this header -- confirm.
  //
  static const String ONE_OF;
  static const String ITEM;
  static const String RULEREF;
  static const String RULE;
  static const String GRAMMAR;

  // constants: handled XML token attributes
  //
  static const String WEIGHT;
  static const String REPEAT;
  static const String REPEAT_PROB;
  static const String ROOT;
  static const String URI;
  static const String SPECIAL;
  static const String ID;

  // constants: handled XML attribute values
  //
  static const String RULEREF_URI_LOCAL_DELIM;
  static const String REPEAT_RANGE_DELIM;
  static const String REPEAT_LOOP_BACK;

  // constant: isip dummy tag
  //
  static const String ISIP_DUMMY_NODE;

  // constant: output constants
  //
  static const String XML_VERSION_TAG;
  static const String ROOT_ATTRIB;
  static const String ID_ATTRIB;
  static const String ALGORITHM;
  static const String XML;
  static const String IMPLEMENTATION;
  static const String IHD;
  static const int32 LOOP_BACK;
  static const int32 TERMINAL_INDEX;
  static const int32 START_INDEX;

  //---------------------------------------
  //
  // error codes
  //
  //---------------------------------------
  //
  // the codes occupy the contiguous range 100800 - 100826. judging by the
  // names, the prefix groups them by area (CTXT, DEBG, REPT, GRAMMAR,
  // RULEREF, ...); the conditions that raise each one are not visible in
  // this header.
  //
  static const int32 ERR = 100800;
  static const int32 ERR_CTXT_NO_SYM_TAB = 100801;
  static const int32 ERR_CTXT_INV_LENGTHS = 100802;
  static const int32 ERR_CTXT_INV_GRAPH_ALIGN = 100803;
  static const int32 ERR_CTXT_INV_SYM = 100804;
  static const int32 ERR_READ_SYM_TYPE = 100805;
  static const int32 ERR_CTXT_SCV = 100806;
  static const int32 ERR_CTXT_INV_SYM_TAB = 100807;
  static const int32 ERR_CTXT_INV_TOKEN_INDEX = 100808;
  static const int32 ERR_CTXT_ESHG = 100809;
  static const int32 ERR_CTXT_EXISTS_DIF_INDEX = 100810;
  static const int32 ERR_CTXT_EXISTS_SAME_INDEX = 100811;
  static const int32 ERR_DEBG_TEMP_FILE = 100812;
  static const int32 ERR_DEBG_DSM_NOGR = 100813;
  static const int32 ERR_DEBG_DTM_NOGR = 100814;
  static const int32 ERR_REPT_IAV = 100815;
  static const int32 ERR_GRAMMAR_NO_START = 100816;
  static const int32 ERR_GRAMMAR_NO_END = 100817;
  static const int32 ERR_GRAMMAR_EMPTY = 100818;
  static const int32 ERR_GRAMMAR_FORMAT = 100819;
  static const int32 ERR_RULEREF_NOT_FOUND = 100820;
  static const int32 ERR_RULEREF_NON_LOCAL = 100821;
  static const int32 ERR_ONE_OF_EMPTY= 100822;
  static const int32 ERR_RULE_EMPTY = 100823;
  static const int32 ERR_TOKEN_UNHANDLED = 100824;
  static const int32 ERR_RULE_SPECIAL_INV = 100825;
  static const int32 ERR_VERTEX_STORE = 100826;

  //---------------------------------------------------------------------------
  //
  // protected data
  //
  //---------------------------------------------------------------------------
protected:

  // these members hold an XML format grammar in memory
  //
  HierarchicalDigraph hg_d;
  Vector< Vector < Vector > > grammars_d;

  // variable to temporarily store the rule being parsed.
  // when the rule is complete, it will be added to the
  // vector of rules, rules_d.
  //
  Rule temp_rule_d;

  // vector to store all parsed rules.
  //
  Vector rules_d;

  // variables to store the start and end grammar tags
  // the start grammar tag will have many useful attributes.
  //
  XMLToken grammar_start_tag_d;
  XMLToken grammar_end_tag_d;

  // keeps track of nesting level of rules.
  //
  int32 rule_nesting_level_d;

  // create a vector of strings to store the symbol table if conversion
  // is requested.
  //
  Vector< String > symbol_table_d;

  // ABNF RuleModel
  //
  // written by setABNFRuleModel() below
  //
  RuleModel abnf_model_d;

  // debug level
  //
  // debug_level_d inherited from LanguageModelBase

  //---------------------------------------------------------------------------
  //
  // required public methods
  //
  //---------------------------------------------------------------------------
public:

  // method: name
  //
  // returns a reference to the static CLASS_NAME constant
  //
  static const String& name() {
    return CLASS_NAME;
  }

  // method: diagnose
  //
  static bool8 diagnose(Integral::DEBUG level);

  // method: debug
  //
  bool8 debug(const unichar* msg) const;

  // constructor
  //
  // default constructor; no other constructor or destructor is declared
  // in this class
  //
  LanguageModelXML ();

  // method: read
  //
  // tag defaults to DEF_TAG and cname defaults to CLASS_NAME
  //
  bool8 read(Sof& sof, int32 tag = LanguageModelXML::DEF_TAG,
	     const String& cname = CLASS_NAME);

  // method: readData
  //
  // pname defaults to DEF_PARAM, size to SofParser::FULL_OBJECT,
  // param to true and nested to false
  //
  bool8 readData(Sof& sof, const String& pname = DEF_PARAM,
		 int32 size = SofParser::FULL_OBJECT,
		 bool8 param = true, bool8 nested = false);

  // method: write
  //
  // unlike read, tag has no default here; cname defaults to CLASS_NAME
  //
  bool8 write(Sof& sof, int32 tag, const String& cname = CLASS_NAME) const;

  // method: writeData
  //
  // pname defaults to DEF_PARAM
  //
  bool8 writeData(Sof& sof, const String& pname = DEF_PARAM) const;

  // method: sofSize
  //
  // forwards to hg_d.sofSize(); no other data member contributes to the
  // reported size
  //
  int32 sofSize() const {
    return hg_d.sofSize();
  }

  // method: assign
  //
  bool8 assign(const LanguageModelXML& lmxml);

  // method: operator=
  //
  // returns bool8 rather than a reference to this object, so assignments
  // cannot be chained
  //
  bool8 operator=(const LanguageModelXML& lmxml);

  // method: eq
  //
  bool8 eq(const LanguageModelXML& lmxml) const;

  // method: operator==
  //
  bool8 operator==(const LanguageModelXML& lmxml) const;

  // method: clear
  //
  bool8 clear(Integral::CMODE ctype);

  //---------------------------------------------------------------------------
  //
  // class-specific public methods required by interface contract
  // defined in LanguageModelBase.h
  //
  //---------------------------------------------------------------------------

  // method: assign
  //
  // overload taking the base class type
  //
  bool8 assign(const LanguageModelBase& arg);

  // method: getRuleModel
  //
  // returns a RuleModel by value
  //
  RuleModel getRuleModel();

  // method: setRuleModel
  //
  bool8 setRuleModel(const RuleModel& rm_a);

  // method: getABNFRuleModel
  //
  // returns a RuleModel by value
  //
  RuleModel getABNFRuleModel();

  // method: setABNFRuleModel
  //
  // assigns rm_a to abnf_model_d and returns the result of that assign call
  //
  bool8 setABNFRuleModel(const RuleModel& rm_a) {
    return abnf_model_d.assign(rm_a);
  }

  // method: eq
  //
  // overload taking the base class type
  //
  bool8 eq(const LanguageModelBase& arg) const;

  // method: className
  //
  // non-static counterpart of name(); also returns CLASS_NAME
  //
  const String& className() const {
    return CLASS_NAME;
  }

  //---------------------------------------------------------------------------
  //
  // class-specific public methods
  //
  //---------------------------------------------------------------------------
public:

  //--------------------------------------------------------
  //
  // methods used to read/write the language model
  //
  //--------------------------------------------------------

  // method: readXMLGrammars
  //
  // symbol_table is passed as a pointer; whether a null pointer is
  // accepted cannot be told from this declaration
  //
  bool8 readXMLGrammars(Sof& sof, const String& gname,
			int32 level, bool8 use_context,
			Vector* symbol_table,
			SofParser& parser);

  // method: parseXMLGrammar
  //
  // grammar_a is taken by value; the remaining arguments are non-const
  // references (presumably outputs -- confirm against the definition)
  //
  bool8 parseXMLGrammar(String grammar_a,
			Vector& sub_symbol_list_a,
			Vector& token_vector_a,
			String& grammar_name_a);

  // method: alignGraphs
  //
  bool8 alignGraphs(Vector< Vector >& grammar_list_a,
		    Vector& symbol_table_a,
		    Vector& graph_name_list_a);

  // method: readSymbolType
  //
  bool8 readSymbolType(Sof& sof, int32 level,
		       const String& pname, Vector& symbol_list,
		       SofParser& parser);

  // method: readContextMapping
  //
  bool8 readContextMapping(Sof& sof, const String& pname,
			   int32 level, SofParser&parser);

  //---------------------------------------------------------------------------
  //
  // class-specific protected methods
  //
  //---------------------------------------------------------------------------
protected:

  //--------------------------------------------------------
  //
  // output methods
  //
  //--------------------------------------------------------

  // method: getXMLModel()
  //
  // returns a Pair by value
  //
  Pair< Vector< Vector< Rule > >, HierarchicalDigraph> getXMLModel();

  // method: setXMLModel()
  //
  // xml_model_a is taken by value, i.e. the whole Pair is copied at the
  // call site
  //
  bool8 setXMLModel(Pair< Vector< Vector< Rule > >, HierarchicalDigraph>
		    xml_model_a);

  // method: writeSymbols
  //
  // symbol_list_a is taken by value
  //
  bool8 writeSymbols(Sof& sof_a, int32 level_a,
		     const String& pname_a, Vector symbol_list_a) const;

  // method: writeGrammars
  //
  bool8 writeGrammars(Sof& sof_a, int32 level_a,
		      HierarchicalDigraph& h_digraph_a) const;

  // method: writeLevelTag
  //
  bool8 writeLevelTag(Sof& sof_a, int32 level_a, const String& tag_a) const;

  // the add* methods below are all const members that receive the token
  // vector and a depth counter by non-const reference (presumably both are
  // updated as tokens are appended -- confirm against the definitions)
  //

  // method: addRuleRef
  //
  bool8 addRuleRef(Vector& token_vector_a,
		   String rule_name_a, int32& depth_a) const;

  // method: addDummyItem
  //
  bool8 addDummyItem(Vector& token_vector_a, int32& depth_a) const;

  // method: addCDATA
  //
  bool8 addCDATA(Vector& token_vector_a,
		 const String& cdata_a, int32& depth_a) const;

  // method: addStartItemTag
  //
  // weight_a defaults to 1, repeat_a to 0 (false) and repeat_prob_a to 1
  //
  bool8 addStartItemTag(Vector& token_vector_a,
			int32& depth_a, float32 weight_a=1,
			bool8 repeat_a=0, float32 repeat_prob_a=1) const;

  // method: addEndItemTag
  //
  bool8 addEndItemTag(Vector& token_vector_a, int32& depth_a) const;

  // method: addStartBranchTag
  //
  bool8 addStartBranchTag(Vector& token_vector_a, int32& depth_a) const;

  // method: addEndBranchTag
  //
  bool8 addEndBranchTag(Vector& token_vector_a, int32& depth_a) const;

  // method: addStartRuleTag
  //
  bool8 addStartRuleTag(Vector& token_vector_a,
			String rule_name_a, int32& depth_a) const;

  // method: addEndRuleTag
  //
  bool8 addEndRuleTag(Vector& token_vector_a, int32& depth_a) const;

  // method: tokensToString
  //
  String tokensToString(Vector& token_vector_a) const;

  // method: removeRedundantItemTags
  //
  // returns a Vector by value
  //
  Vector removeRedundantItemTags(Vector& token_vector_a) const;

  // method: constFindFollowingTagAtDepth
  //
  // const-qualified method with the same parameter list as
  // findFollowingTagAtDepth (declared with the search methods below), but
  // without default values for type_a and value_a
  //
  int32 constFindFollowingTagAtDepth( Vector& token_vector_a,
				     int32 start_index_a,
				     int32 base_depth_a,
				     XMLToken::TYPE type_a,
				     String value_a) const;

  //--------------------------------------------------------
  //
  // BNF to XML conversion methods
  //
  //--------------------------------------------------------

  // method: convertBNFtoXML
  //
  // prset_a is taken by value; returns a Vector by value
  //
  Vector convertBNFtoXML(ProductionRuleSet prset_a);

  // method: extractRule
  //
  ProductionRuleSet extractRule(const String& rule_name_a,
				ProductionRuleSet& prset_a) const;

  // method: addRule
  //
  bool8 addRule(ProductionRuleSet& prset_a,
		Vector& token_vector_a) const;

  // method: addAlternative
  //
  // unlike the add*Tag methods above, depth_a is taken by value here
  //
  bool8 addAlternative(ProductionRule& pr_a,
		       Vector& token_vector_a, int32 depth_a) const;

  //--------------------------------------------------------
  //
  // XML to ABNF conversion methods
  //
  //--------------------------------------------------------

  // method: convertXMLtoABNF
  //
  // token_vector_a is taken by value
  //
  ProductionRule convertXMLtoABNF(Vector token_vector_a);

  // method: appendProduction
  //
  // prod1_a is a non-const reference, prod2_a is taken by value
  //
  bool8 appendProduction(ProductionRule& prod1_a, ProductionRule prod2_a);

  // method: getRange
  //
  // vector_a is taken by value; returns a Vector by value
  //
  Vector getRange(int32 start_a, int32 end_a, Vector vector_a);

  // method: findNextItem
  //
  // vector_a is taken by value
  //
  int32 findNextItem(int32 start_a, Vector vector_a);

  // method: preProcessXMLGrammar
  //
  bool8 preProcessXMLGrammar(Vector& token_vector);

  // method: validateXMLGrammar
  //
  bool8 validateXMLGrammar();

  // method: partitionGrammar
  //
  bool8 partitionGrammar(Vector& token_vector);

  // method: handleStartAndEndElement
  //
  bool8 handleStartAndEndElement (XMLToken xml_token);

  // method: handleStartElement
  //
  bool8 handleStartElement (XMLToken xml_token);

  // method: handleEndElement
  //
  bool8 handleEndElement (XMLToken xml_token);

  // method: handleCharacterData
  //
  bool8 handleCharacterData (XMLToken xml_token);

  // method: handleSpecialRules
  //
  bool8 handleSpecialRules(Vector& token_vector);

  // method: handleRepeatRequests
  //
  bool8 handleRepeatRequests(Vector& token_vector);

  // method: addOptionalRepeatSections
  //
  bool8 addOptionalRepeatSections(Vector& token_vector, Long m);

  // method: addManditoryRepeatSections
  //
  // the identifier is spelled "Manditory" in the code; the comment keeps
  // that spelling so that it matches the declaration
  //
  bool8 addManditoryRepeatSections(Vector& token_vector, Long m);

  //--------------------------------------------------------
  //
  // methods for searching an XML grammar
  //
  //--------------------------------------------------------
  //
  // every method in this section returns an int32 (presumably an index
  // into the token vector; the value used to signal "not found" is not
  // visible in this header -- confirm against the definitions)
  //

  // method: findFirstItemTagOfCDATA
  //
  int32 findFirstItemTagOfCDATA(Vector& token_vector, int32 index);

  // method: findFollowingStructure
  //
  int32 findFollowingStructure(Vector& token_vector, int32 index);

  // method: findForwardTargetStructure
  //
  int32 findForwardTargetStructure(Vector& token_vector, int32 index);

  // method: findImmediateNesting
  //
  // NOTE(review): this is the only method in this section that takes the
  // token vector by value instead of by reference -- confirm that the copy
  // is intended.
  //
  int32 findImmediateNesting(Vector token_vector, int32 index);

  // method: findInternalStructure
  //
  int32 findInternalStructure(Vector& token_vector, int32 index);

  // method: findMatchingStartTag
  //
  int32 findMatchingStartTag(Vector& token_vector, int32 index);

  // method: findMatchingEndTag
  //
  int32 findMatchingEndTag(Vector& token_vector, int32 index);

  // method: findPrecedingStructure
  //
  int32 findPrecedingStructure(Vector& token_vector, int32 index);

  // method: findPrecedingTagAtDepth
  //
  // type defaults to XMLToken::NULL_TAG and value to String::EMPTY
  //
  int32 findPrecedingTagAtDepth(Vector& token_vector,
				int32 start_index,
				int32 base_depth,
				XMLToken::TYPE type = XMLToken::NULL_TAG,
				String value = String::EMPTY);

  // method: findFollowingTagAtDepth
  //
  // type defaults to XMLToken::NULL_TAG and value to String::EMPTY
  //
  int32 findFollowingTagAtDepth(Vector& token_vector,
				int32 start_index,
				int32 base_depth,
				XMLToken::TYPE type = XMLToken::NULL_TAG,
				String value = String::EMPTY);

  //--------------------------------------------------------
  //
  // worker methods
  //
  //--------------------------------------------------------

  // method: clearXMLGrammar
  //
  bool8 clearXMLGrammar();

  // method: displayVector
  //
  bool8 displayVector(Vector& targets);

  // method: displayXMLModel
  //
  bool8 displayXMLModel();

  // method: debugSearchMethod
  //
  // func_ptr points to a non-const member function taking one int32 and
  // returning int32.
  //
  // NOTE(review): no find* method declared in this class has that
  // signature (they take a token vector and an int32) -- confirm which
  // methods this is meant to be called with.
  //
  bool8 debugSearchMethod(int32 (LanguageModelXML::*func_ptr)(int32),
			  Vector token_vector_a);

  // method: debugTargetMethod
  //
  // func_ptr points to a non-const member function taking (int32, Vector&)
  // and returning int32.
  //
  // NOTE(review): the find* methods declared above take their arguments in
  // the opposite order (Vector&, int32) -- confirm which methods this is
  // meant to be called with.
  //
  bool8 debugTargetMethod (int32 (LanguageModelXML::*func_ptr)(int32,
							       Vector&),
			   Vector token_vector_a);

  // method: indexIsA
  //
  // overload taking a String value
  //
  bool8 indexIsA(Vector& token_vector, int32 index, String value);

  // method: indexIsA
  //
  // overload taking an XMLToken::TYPE
  //
  bool8 indexIsA(Vector& token_vector, int32 index, XMLToken::TYPE type);

  // method: isInBounds
  //
  bool8 isInBounds(Vector& token_vector, int32 index);

  // method: getDummySymbol
  //
  String getDummySymbol();

  // method: getGrammarName
  //
  String getGrammarName();

  // method: getHandledValues
  //
  Vector getHandledValues();

  // method: getRuleID
  //
  // rule is taken by value
  //
  String getRuleID(Rule rule);

  // method: getRuleIndexByID
  //
  int32 getRuleIndexByID(String id);

  // method: getSymbolList
  //
  // returns symbol_table_d by value, so the caller receives a copy
  //
  Vector getSymbolList(){
    return symbol_table_d;
  }

  // method: updateSymbolTable
  //
  bool8 updateSymbolTable(XMLToken xml_token);

  // method: tokenizeRepeatValue
  //
  // returns a Triple by value
  //
  Triple tokenizeRepeatValue(XMLToken xml_token);
};

// end of include file
//
#endif