// file: $isip/class/pr/LanguageModelXML/lmxml_07.cc // version: $Id: lmxml_07.cc 10356 2006-01-10 18:52:05Z wholland $ // ISIP include files // #include "LanguageModelXML.h" // method: findFirstItemTagOfCDATA // // arguments: // int32 index: (input) the index of the token whose // first item tag we wish to find // // Vector& token_vector: (input) the token vector // in which we want to search // // return: a bool8 indicating status // // This method finds the first tag in // the sequence of tags in whicn the current vertex // (index_a)resides. // int32 LanguageModelXML::findFirstItemTagOfCDATA(Vector& token_vector_a, int32 index_a) { // only continue to process cdata tags // // make sure index is in bounds // // if the tag preceding this one is out of bounds, // then there is no first item tag // if(!isInBounds(token_vector_a, index_a) || !isInBounds(token_vector_a, index_a -1)|| !indexIsA(token_vector_a, index_a, XMLToken::CDATA)) { return Integral::NO_POS; } // if this cdata is not directly following a sequence of start // item tags, return the position of the cdata itself. for purposes // of determining depth, this is adequate. // if(!indexIsA(token_vector_a, index_a -1, XMLToken::START_TAG)) { return index_a; } // start at the most recent tag in the token_vertex_vector. // iterate backwards until the opening tag of the structure // is found. It MUST be an item tag // // the first decrement is to skip the CDATA tag which corresponds // to our vertex // // loop over the sequence of tags // // exit the loop when we encounter a non , non start tag // for(index_a--; indexIsA(token_vector_a, index_a, XMLToken::START_TAG) && indexIsA(token_vector_a, index_a, ITEM); index_a--); // loop will have iterated one tag past the opening item tag // adjust so that the index points to this opening item tag, // and return the index // return ++index_a; } // method: findFollowingStructure // // arguments: // int32 index: (input) the index of a token, the structure // immediately following which we wish to find. // // this method returns the first start tag or cdata tag // following the tag which begins the current structure // int32 LanguageModelXML::findFollowingStructure (Vector& token_vector_a, int32 index_a) { // prevent vector out of bounds exceptions // if(!isInBounds(token_vector_a, index_a)) { return Integral::NO_POS; } // if this is a start tag // skip to the end of this structure // if(indexIsA(token_vector_a, index_a, XMLToken::START_TAG)) { // set index_a to the end tag of this structure // index_a = findMatchingEndTag(token_vector_a, index_a); } // if this is a cdata tag, check immediately following it // else if (indexIsA(token_vector_a, index_a, XMLToken::CDATA)) { // if the token preceding is a CDATA token, it is the // preceding structure. // set index_a to the immediately preceding token // ++index_a; } // prevent vector out of bounds exceptions // if(!isInBounds(token_vector_a, index_a)) { return Integral::NO_POS; } // iterate until the nearest preceding start tag, or cdata tag, // // unless tags are introduced that are processed but are not // CDATA/END_TAG/START_TAG, this should be the immediately // following token. This loop is only in case such tags should // be added. // while(!indexIsA(token_vector_a, index_a, XMLToken::START_TAG) && !indexIsA(token_vector_a, index_a, XMLToken::CDATA)) { // if index iterates past token_vector_d.length(), there are no // more start tags in the vector // if(!isInBounds(token_vector_a, ++index_a)) { return Integral::NO_POS; } }// end looping over non-start tags // return the index of the start tag or cdata tag. // return index_a; } // method: findForwardTargetStructure // // arguments: // Vector token_vector: (input) the vector to which the index // beongs. // // index: (input) the index of the token whose target structure // we wish to find. Here target structure means the structure // to which the current vertex will have to draw arcs // // return: a int32 containing the index of the end tag of the // structure we wish to arc back to. // // this method is for use on a CDATA tag. // // This method is one of the most confusing in this entire class. // Here's how it works: // // 1. find the start tag in which we are immediately nested. // // 1.1. if there is no such start tag, then we are not nested, // and by default we are in a sequential statement. Therefore, // return the location of the immediately preceding structure (if any) // // 1.2. If we're nested, what are we nested in? // 1.2.1. If we're nested in an , use the index of // the current vertex to proceed. Since we're nested // in an item, if there is a vertex preceding // this one within that item, we want to find it. // alternatively, if there // is a vertex behind this item, that is not // nested in a branch, we want to find it // 1.2.2 If we're nested in a , use the index of // that to proceed. Since we're nested in // a branch, we know we NEVER want to connect two // parallel items, so we may skip past all structures // that are parallel to the current vertex (by being // nested at the same level within this ) // 1.3. Starting from the index decided on above, we wish to begin // looking for the structure immediately preceding this tag, meaning // the nearest preceding end tag or CDATA token. // 1.4. Once that structure is found, we want to know what it is immediately // nested in. As mentioned above, parallel items within branches are // never to be connected. Since we have found a structure that is // seperate from that of our current vertex, the current structure and // this preceding structure may be in parallel if they are both nested // within the same tags. // 1.4.1. If this preceding structure is nested within a tag, // it is parallel to the current structure, and it is not what // we are looking for. All other structures within this branch // may be skipped.go back to 1.3 using the index of the one-of // tag in which we are nested // 1.4.2 If this preceding structure is nested within an tag, // it the target we're looking for. Return the index of this // preceding structure. // int32 LanguageModelXML::findForwardTargetStructure (Vector& token_vector_a, int32 index_a) { // prevent vector out of bounds exceptions // if(!isInBounds(token_vector_a, index_a)) { return Integral::NO_POS; } // only process for tokens that make sense // if(!indexIsA(token_vector_a, index_a, XMLToken::CDATA)) { return Integral::NO_POS; } // declare a variable to store the index of the token in which // index_a is nested // int32 nesting_index = findImmediateNesting(token_vector_a, index_a); // declare a variable to store the index of the target for which // we are searching // int32 target_index = -1; // if we are not nested,then we are at the lowest level // of the entire grammar, and are to treat structures as though // they are in sequence (for lack of one-of tags). return the // preceding structure. // if(!isInBounds(token_vector_a, nesting_index)) { return findPrecedingStructure(token_vector_a, index_a); } // note, if we're nested in a , we want search_index to // point to that, and it already does, so do nothing. // if (indexIsA(token_vector_a, nesting_index, ITEM)) { nesting_index = index_a; } while(true) { // find the preceding structure // target_index = findPrecedingStructure(token_vector_a, nesting_index); if(!isInBounds(token_vector_a, target_index)) { return Integral::NO_POS; } // determine what this preceding structure is nested in // nesting_index = findImmediateNesting(token_vector_a, target_index); // if we are not nested, then the target index is to be treated // as though it is in sequence with our current vertex // exit the loop. // if(!isInBounds(token_vector_a, nesting_index)) { break; } // note, if we're nested in a , we want search_index to // point to that, and it already does, so do nothing. // // on the other hand, if we're nested in an item, then the target // index is in sequence with our current vertex, and we may exit // this loop // if (indexIsA(token_vector_a, nesting_index, ITEM)) { break; } } // return the target index so that it may be connected to the current // vertex // return target_index; } // method: findImmediateNesting // // arguments: // Vector token_vector: (input) the vector to which the index // belongs // // index: (input) the index of the token whose circumscribing // start tag we wish to find // // return: a int32 countaining the index of the start token in which // the token at index_a was nested // // this method assumes a CDATA or START tag is passed to it. NOT // an END tag. // int32 LanguageModelXML::findImmediateNesting (Vector token_vector_a, int32 index_a) { // prevent vector out of bounds exceptions // if(!isInBounds(token_vector_a, index_a)) { return Integral::NO_POS; } // to store the base depth (the depth just before this structure // begins) // int32 base_depth = 0; // if index_a points to a start tag // if(indexIsA(token_vector_a, index_a, XMLToken::START_TAG)) { // get the base depth of the structure in which our vertex resides // (the depth just before this structure starts) // base_depth = token_vector_a(index_a).getDepth() - 1; } else if(indexIsA(token_vector_a, index_a, XMLToken::END_TAG) || indexIsA(token_vector_a, index_a, XMLToken::CDATA)) { // get the base depth of the structure in which our vertex resides // (the depth just before this structure starts) // // both end tags and cdata tags exist at the same depth as the start // tags in which they are nested. // // take, for example, // // // CDATA // // base_depth = token_vector_a(index_a).getDepth(); } // find the opening tag of the structure preceding our vertex // (the first open tag with depth equal to our base depth) // int32 circumscribing_index = findPrecedingTagAtDepth(token_vector_a, index_a, base_depth, XMLToken::START_TAG); return circumscribing_index; } // method: findInternalStructure // // arguments: // int32 index: (input) the index of the token which marks // the end of a structure. // // Vector& token_vector: (input) the token vector // in which we want to search // // return: the index of the end tag of the // last (first preceding) structure which is // nested in the current structure // // this method will only produce meaningful results when // given the index of an END tag // int32 LanguageModelXML::findInternalStructure (Vector& token_vector_a, int32 index_a) { // if this is a start tag, check the index immediately following it // if(indexIsA(token_vector_a, index_a, XMLToken::START_TAG)) { // if the nested tag is a cdata tag, or a start tag, we have found // a nested structure // if(indexIsA(token_vector_a, index_a +1, XMLToken::START_TAG) || indexIsA(token_vector_a, index_a +1, XMLToken::CDATA)) { return ++index_a; } } // if this is an end tag, check the index immediately preceding it // else if(indexIsA(token_vector_a, index_a, XMLToken::END_TAG)) { // if the nested tag is a cdata tag or an end tag, we have found // a nested structure // if (indexIsA(token_vector_a, index_a -1, XMLToken::END_TAG) || indexIsA(token_vector_a, index_a -1, XMLToken::CDATA)) { return --index_a; } } // if the input index is not a start tag or an end tag, it must be a // CDATA tag, which cannot have nested structures // return Integral::NO_POS; } // method: findMatchingStartTag // // arguments: // int32 index: (input) the index of an end tag // // Vector& token_vector: (input) the token vector // in which we want to search // // return: the index of the start tag which matches the end tag at index_a // // this method finds the start tag which matches the // end tag at index_a // int32 LanguageModelXML::findMatchingStartTag (Vector& token_vector_a, int32 index_a) { // prevent vector out of bounds exceptions // if(!isInBounds(token_vector_a, index_a)) { return Integral::NO_POS; } // only find start tags for end tags. // if(!indexIsA(token_vector_a, index_a, XMLToken::END_TAG)) { return Integral::NO_POS; } // here, 1 is added to the depth. see why below // depth = 0 // depth = 1 // depth = 0 // return findPrecedingTagAtDepth(token_vector_a, index_a, token_vector_a(index_a).getDepth()+1, XMLToken::START_TAG, token_vector_a(index_a).getValue()); } // method: findMatchingEndTag // // arguments: // int32 index: (input) the index of a start tag // // Vector& token_vector: (input) the token vector // in which we want to search // // return: the index of the end tag which matches the start tag at index_a // // this method finds the end tag which matches the // start tag at index_a // int32 LanguageModelXML::findMatchingEndTag (Vector& token_vector_a, int32 index_a) { // prevent vector out of bounds exceptions // if(!isInBounds(token_vector_a, index_a)) { return Integral::NO_POS; } // only find start tags for end tags. // if(!indexIsA(token_vector_a, index_a, XMLToken::START_TAG)) { return Integral::NO_POS; } int32 i, nesting=0; // loop through the rest of the vector and find matching tag // for (i=index_a; i& token_vector: (input) the token vector // in which we want to search // // this method returns the first end tag preceding the start tag // which begins the current structure // int32 LanguageModelXML::findPrecedingStructure (Vector& token_vector_a, int32 index_a) { // prevent vector out of bounds exceptions // if(!isInBounds(token_vector_a, index_a)) { return Integral::NO_POS; } // if this is an end tag // skip back to the start of this structure // if(indexIsA(token_vector_a, index_a, XMLToken::END_TAG)) { // set index_a to the start of this structure // index_a = findMatchingStartTag(token_vector_a, index_a); } // if this is a cdata tag, check immediately preceding it // else if (indexIsA(token_vector_a, index_a, XMLToken::CDATA)) { // if the token preceding is a CDATA token, it is the // preceding structure. // set index_a to the immediately preceding token // --index_a; } // prevent vector out of bounds exceptions // if(!isInBounds(token_vector_a, index_a)) { return Integral::NO_POS; } // iterate until the nearest preceding end tag, or cdata tag, // // unless tags are introduced that are processed but are not // CDATA/END_TAG/START_TAG, this should be the immediately // preceding token. This loop is only in case such tags should // be added. // while(!indexIsA(token_vector_a, index_a, XMLToken::END_TAG) && !indexIsA(token_vector_a, index_a, XMLToken::CDATA)) { // if index iterates past 0, there are no // more end tags in the vector // if(!isInBounds(token_vector_a, --index_a)) { return Integral::NO_POS; } } // end looping over non-start tags // return the index of the end tag or cdata tag. // if none were found, index_a will be -1, which is // Integral::NO_POS // return index_a; } // method: findPrecedingTagAtDepth // // arguments: // Vector& token_vector: (input) the token vector // in which we want to search // // int32 start_index: (input) the index of the token from // which the backwards search begins // // int32 base_depth: (input) the depth of the token we wish // to find // // XMLToken::TYPE type: (input) the type of the token we wish // to find // // String value: (input) the value of the token we wish to find // // return: the index of the token that was found // // this general purpose method finds a tag which matches any of the // search criteria which are (optionally) specified. it iterates // backwards starting at start_index_a until a token is found whose // depth, type, and value match the search criteria // int32 LanguageModelXML::findPrecedingTagAtDepth(Vector& token_vector_a, int32 start_index_a, int32 base_depth_a, XMLToken::TYPE type_a, String value_a) { // prevent vector out of bounds exceptions and unreasonable // depths // if(!isInBounds(token_vector_a, start_index_a) || base_depth_a < 0) { return Integral::NO_POS; } // iterate backwards until tag is found whose // total depth matches the argument base_depth // and whose other parameters match the specified // search criteria (given by args value_a and // type_a // for better code readability // store the current token // XMLToken current_token; // loop from the start_index until index 0, which is the // very first token in the token_vertex_vector // for(;start_index_a >= 0; start_index_a--) { current_token = token_vector_a(start_index_a); // if type is null and value is null and depth matches, return // if(type_a == XMLToken::NULL_TAG && value_a.eq(String::EMPTY) && base_depth_a == current_token.getDepth()) { return start_index_a; } // if type is null and value matches and depth matches, return // else if(type_a == XMLToken::NULL_TAG && current_token.isA(value_a) && base_depth_a == current_token.getDepth()) { return start_index_a; } // if type matches and value is null and depth matches, return // else if (current_token.isA(type_a) && value_a.eq(String::DEF_VALUE) && base_depth_a == current_token.getDepth()) { return start_index_a; } // if type matches and value matches and depth matches, return // else if (current_token.isA(type_a) && current_token.isA(value_a) && base_depth_a == current_token.getDepth()) { return start_index_a; } }// end looping over all indeces // if we get here, no tag was found // return Integral::NO_POS; } // method: findFollowingTagAtDepth // // arguments: // Vector& token_vector: (input) the token vector // in which we want to search // // int32 start_index: (input) the index of the token from // which the forward search begins // // int32 base_depth: (input) the depth of the token we wish // to find // // XMLToken::TYPE type: (input) the type of the token we wish // to find // // String value: (input) the value of the token we wish to find // // return: the index of the token that was found // // this general purpose method finds a tag which matches any of the // search criteria which are (optionally) specified. it iterates // forward starting at start_index_a until a token is found whose // depth, type, and value match the search criteria // int32 LanguageModelXML::findFollowingTagAtDepth(Vector& token_vector_a, int32 start_index_a, int32 base_depth_a, XMLToken::TYPE type_a, String value_a) { // prevent vector out of bounds exceptions and unreasonable // depths // if(!isInBounds(token_vector_a, start_index_a) || base_depth_a < 0) { return Integral::NO_POS; } // iterate backwards until tag is found whose // total depth matches the argument base_depth // and whose other parameters match the specified // search criteria (given by args value_a and // type_a // for better code readability // store the current token // XMLToken current_token; // loop from the start_index until index 0, which is the // very first token in the token_vertex_vector // for(int32 i= start_index_a; i < token_vector_a.length(); i++) { current_token = token_vector_a(i); // if type is null and value is null and depth matches, return // if(type_a == XMLToken::NULL_TAG && value_a.eq(String::EMPTY) && base_depth_a == current_token.getDepth()) { return i; } // if type is null and value matches and depth matches, return // else if(type_a == XMLToken::NULL_TAG && current_token.isA(value_a) && base_depth_a == current_token.getDepth()) { return i; } // if type matches and value is null and depth matches, return // else if (current_token.isA(type_a) && value_a.eq(String::DEF_VALUE) && base_depth_a == current_token.getDepth()) { return i; } // if type matches and value matches and depth matches, return // else if (current_token.isA(type_a) && current_token.isA(value_a) && base_depth_a == current_token.getDepth()) { return i; } }// end looping over all indeces // if we get here, no tag was found // return Integral::NO_POS; }