001package bradleyross.opensource.xerces; 002import org.xml.sax.*; 003import org.xml.sax.helpers.*; 004import java.util.*; 005import java.io.File; 006import java.io.IOException; 007import java.io.StringReader; 008import java.io.FileInputStream; 009import java.io.FileNotFoundException; 010/** The purpose of this class is to provide 011* a set of tools for parsing XML documents. 012* <p>When you create a class for parsing XML files, it is necessary 013* to create a subclass of org.xml.sax.ContentHandler 014* for a content handler object. The 015* XML parser calls methods in this class when various events 016* occur while reading the XML files. Events would be such things 017* as the start or end of a tagged element.</p> 018* <p>In order to use this class, this class it is necessary 019* to have xercesImpl.jar and xml-apis.jar 020* from the Apache Xerces project in the 021* CLASSPATH.</p> 022* <p>There are two sets of source code for this 023* class, one for versions previous to Java 5 and the other 024* for Java 5 and later.</p> 025* <p>When constructing code using Vector classes, it is necessary to 026* code differently for Java 5 and later versions because 027* version 5 introduced the idea of parameterizing Vector objects.</p> 028* <p><ul> 029* <li><p>For Java 5 and later, you will see the syntax 030* <code>Vector<String></code>. This means 031* objects making up the Vector are all of type String.</p></li> 032* <li><p>For versions before Java 5, the syntax is simply 033* <code>Vector</code>. This is because there is no method in 034* the earlier versions of restricting Vectors and other list 035* to a single class.</p></li> 036* </ul></p> 037* <p>The following is an example of a file to be processed by this object.</p> 038* <p><html><head></head><br /> 039* <body>aaaa</body></html></code></p> 040* @see bradleyross.demonstrations.getTags 041* @see bradleyross.demonstrations.parseFile 042* @see org.xml.sax.ContentHandler 043* 044* @author Bradley Ross 045*/ 046public class XmlParser 047{ 048/** Indicates mode of parsing operation. */ 049private int mode; 050/** Getter for mode */ 051public int getMode() 052 { return mode; } 053/** Option for listing tags in document. */ 054public static final int LISTTAGS = 1; 055/** Option for searching for strings with specified tags. */ 056public static final int SEARCH = 2; 057/** Vector containing strings or lists of tags in document */ 058private Vector<String> items; 059/** Amount of diagnostic listing to be generated. */ 060private int debugLevel = 0; 061/** 062* This class provides the means of responding to 063* the Xerces parser from the Apache Xerces 064* parser. 065*/ 066protected class MyHandler implements org.xml.sax.ContentHandler 067 { 068 /** 069 * Indicates type of search to be carried out. 070 * <p>Value of LISTTAGS used for the following methods where 071 * the goal is to get a list of the tags and their structure 072 * in the document.</p> 073 * <p><ul> 074 * <li><p>public Vector parseString (Vector start, 075 * File document, String search)</p></li> 076 * <li><p>public Vector listTags (Vector start, String document)</p></li> 077 * <li><p>public Vector listTags (File document, String search)</p></li> 078 * <li><p>public Vector listTags (Vector start, File document, 079 * String search)</p></li> 080 * </ul></p> 081 * <p>Value of SEARCH used for the following methods where the 082 * goal is to get a list of segments of the documents having 083 * the specified tag structure.</p> 084 * <p><ul> 085 * <li><p>public Vector parseString (Vector<String> start, 086 * String document, String 087 * search )</p></li> 088 * <li><p>public Vector parseString (String document, String 089 * search)</p></li> 090 * <li><p>public Vector parseString (File document, String search)</p></li> 091 * <li><p>public Vector parseString (Vector start, 092 * File document, String search)</p></li> 093 * </ul></p> 094 */ 095 int mode; 096 int depth; 097 String searchString; 098 int activeDepth; 099 boolean activeSection = false; 100 StringBuffer activeString = null; 101 boolean testValue; 102 String tags[] = new String[40]; 103 /** 104 * Constructor defining actions taken during parsing. 105 * 106 * @param type This is an integer value defining the type of 107 * parsing operation to be carried out. 108 * @param criteria This String contains the criteria used 109 * for carrying out the parsing operation. 110 */ 111 public MyHandler(int type, String criteria) 112 { 113 mode = type; 114 searchString = criteria; 115 } 116 private void printText (String methodName) 117 { 118 if (debugLevel > 0) 119 { 120 System.out.println("*** Depth: " 121 .concat(Integer.toString(depth)) 122 .concat(" Running ").concat(methodName)); 123 } 124 } 125 private void printText (String methodName, String itemName) 126 { 127 if (debugLevel > 0) 128 { 129 System.out.println("*** Depth: " 130 .concat(Integer.toString(depth)) 131 .concat(" Running ").concat(methodName)); 132 System.out.println("Item: ".concat(itemName)); 133 } 134 } 135 private void printAttributes(Attributes atts) 136 { 137 for (int i=0; i < atts.getLength(); i++) 138 { 139 System.out.println(atts.getLocalName(i).concat(" :: ") 140 .concat(atts.getValue(i))); 141 } 142 } 143 public void setDocumentLocator(Locator locator) 144 { printText ("setDocumentLocation"); } 145 /** 146 * Called by the parser when the start of the document 147 * is encountered. 148 * <p>Initializes fields used in parsing document.</p> 149 */ 150 public void startDocument() throws SAXException 151 { 152 depth = -1 ; 153 activeSection = false; 154 printText("startDocument"); 155 if ((debugLevel > 0) && (mode == SEARCH)) 156 { System.out.println("Search string: ".concat(searchString)); } 157 } 158 /** 159 * Called by the parser when the end of the document 160 * is encountered. 161 */ 162 public void endDocument() throws SAXException 163 { 164 printText("endDocument"); 165 } 166 /** 167 * Called when prefix mapping is started. 168 * <p>No action is taken for this parser action.</p> 169 */ 170 public void startPrefixMapping (String prefix, String uri) 171 throws SAXException 172 { 173 printText ("startPrefixMapping"); 174 } 175 /** 176 * Called when prefix mapping is ended. 177 * <p>No action is taken for this parser action.</p> 178 */ 179 public void endPrefixMapping(String prefix) throws SAXException 180 { 181 printText ("endPrefixMapping"); 182 } 183 /** 184 * Called when a start tag is encountered. 185 * <p>Together with the actions taken in response to 186 * the endElement method, this represents the heart 187 * of the parsing operation.</p> 188 */ 189 public void startElement(String namespaceURI, String localName, 190 String qualifiedName, Attributes atts) throws SAXException 191 { 192 StringBuffer tagList = new StringBuffer(); 193 depth = depth + 1; 194 printText ("startElement", localName); 195 if (debugLevel > 0) 196 { 197 if (atts.getLength() > 0) 198 { printAttributes(atts); } 199 } 200 tags[depth] = localName; 201 tagList = new StringBuffer(); 202 for (int i = 0; i <= depth; i++) 203 { tagList.append("<".concat(tags[i]).concat(">")); } 204 if (debugLevel > 0) 205 { System.out.println(tagList); } 206 if (mode == SEARCH) 207 { 208 if (activeSection) 209 { 210 activeString.append("<".concat(localName).concat(">")); 211 } 212 if ((new String(tagList)).equals(searchString)) 213 { 214 if (debugLevel > 0) 215 { 216 System.out.println("Match found"); 217 } 218 activeSection = true; 219 activeDepth = depth; 220 activeString = new StringBuffer(); 221 } 222 } 223 else if (mode == LISTTAGS) 224 { 225 items.add(new String(tagList)); 226 if (debugLevel > 0) 227 { System.out.println(tagList); } 228 } 229 } 230 /** 231 * Called when an end tag is encountered. 232 */ 233 public void endElement(String namespaceURI, String localName, 234 String qualifiedName) throws SAXException 235 { 236 printText ("endElement", localName); 237 depth = depth - 1; 238 if ((mode == SEARCH) && (activeSection)) 239 { 240 if (depth < activeDepth) 241 { 242 items.add(new String(activeString)); 243 activeString = new StringBuffer(); 244 activeSection = false; 245 } 246 else 247 { 248 activeString.append("</".concat(localName).concat(">")); 249 } 250 } 251 } 252 /** 253 * This method is called when text is encountered between start 254 * and end tags. 255 * <p>Multiple calls of this method may be executed to handle the 256 * text between the tags.</p> 257 */ 258 public void characters(char[] text, int start, int length) 259 throws SAXException 260 { 261 String data = new String(text, start, length) 262 .replaceAll("&", "&") 263 .replaceAll("\\\'", "'") 264 .replaceAll("\\\"", """) 265 .replaceAll("<", "<") 266 .replaceAll(">", ">"); 267 if (mode == SEARCH) 268 { 269 if (activeSection) 270 { 271 activeString.append(data); 272 } 273 } 274 printText ("characters", data); 275 } 276 /** 277 * Called when ignorable whitespace is encountered. 278 * <p>Ignorable whitespace is ignored and no action is 279 * taken.</p> 280 */ 281 public void ignorableWhitespace (char[] text, int start, int length) 282 throws SAXException 283 { 284 printText ("ignorableWhitespace"); 285 } 286 public void processingInstruction(String target, String data) 287 throws SAXException 288 { 289 printText ("processingInstruction"); 290 } 291 public void skippedEntity(String name) throws SAXException 292 { 293 printText("skippedEntity"); 294 } 295 } 296/** 297* Determine amount of diagnostic output. 298* @param level Amount of diagnostic material to be printed. 0 299* is default and results in no diagnostic messages. Higher values 300* produce more diagnostic messages. 301*/ 302public void setDebugLevel (int level) 303 { debugLevel = level; } 304/** 305This method parses an XML document for strings 306* @param start Initial Vector of String objects 307* @param document This string contains the document to be parsed 308* @param search This string indicates the set of tags to be searched 309* for. If the value is 310* <code><Envelope><Body><FetchHandle></code>, 311* the program will return the contents of all 312* <code>FetchHandle</code> tags which are within 313* <code>Body</code> tags which are within 314* <code>Envelope</code> tags. 315* @return Vector of String objects that contains all of the 316* objects from the initial list plus the items found in 317* document. 318*/ 319public Vector<String> parseString (Vector<String> start, String document, 320 String search) 321 { 322 return internalParse(start, 323 new InputSource(new StringReader(document)), 324 search, SEARCH); 325 } 326/** 327This method parses an XML document for strings 328* @param document This string contains the document to be parsed 329* @param search This string indicates the set of tags to be searched 330* for. If the value is 331* <code><Envelope><Body><FetchHandle></code>, 332* the program will return the contents of all 333* <code>FetchHandle</code> tags which are within 334* <code>Body</code> tags which are within 335* <code>Envelope</code> tags. 336* @return Vector of String objects that contains all of the 337* objects found in 338* document. 339*/ 340public Vector<String> parseString (String document, String search) 341 { 342 if (debugLevel > 0) 343 { 344 System.out.println("*** Starting parseString"); 345 System.out.println("Search string is ".concat(search)); 346 System.out.println(document); 347 } 348 return internalParse (new Vector<String>(), 349 new InputSource(new StringReader(document)), 350 search, SEARCH); 351 } 352/** 353This method parses an XML document for strings 354* @param document This string contains the File object representing 355* the file to be parsed. 356* @param search This string indicates the set of tags to be searched 357* for. If the value is 358* <code><Envelope><Body><FetchHandle></code>, 359* the program will return the contents of all 360* <code>FetchHandle</code> tags which are within 361* <code>Body</code> tags which are within 362* <code>Envelope</code> tags. 363* @return Vector of String objects that contains all of the 364* objects found in 365* document. 366*/ 367public Vector<String> parseString (File document, String search) 368 { 369 InputSource source; 370 if (debugLevel > 0) 371 { 372 System.out.println("*** Starting parseString"); 373 System.out.println("Search string is ".concat(search)); 374 System.out.println(document); 375 } 376 try 377 { 378 source = new InputSource(new FileInputStream(document)); 379 } 380 catch (FileNotFoundException e) 381 { 382 System.out.println("Unable to open file"); 383 e.printStackTrace(); 384 return null; 385 } 386 return internalParse (new Vector<String>(), 387 source, 388 search, SEARCH); 389 } 390/** 391This method parses an XML document for strings 392* @param start Vector containing the String objects at the 393* start executing the method 394* @param document This string is the File object to be parsed 395* @param search This string indicates the set of tags to be searched 396* for. If the value is 397* <code><Envelope><Body><FetchHandle></code>, 398* the program will return the contents of all 399* <code>FetchHandle</code> tags which are within 400* <code>Body</code> tags which are within 401* <code>Envelope</code> tags. 402* @return Vector of String objects that contains all of the 403* objects found in 404* document. 405*/ 406public Vector<String> parseString (Vector<String> start, 407 File document, String search) 408 { 409 InputSource source; 410 if (debugLevel > 0) 411 { 412 System.out.println("*** Starting parseString"); 413 System.out.println("Search string is ".concat(search)); 414 System.out.println(document); 415 } 416 try 417 { 418 source = new InputSource(new FileInputStream(document)); 419 } 420 catch (FileNotFoundException e) 421 { 422 System.out.println("Unable to open file"); 423 e.printStackTrace(); 424 return null; 425 } 426 return internalParse (start, 427 source, 428 search, SEARCH); 429 } 430/** 431* List tags contained in an XML document. 432* 433* <p>This method returns a Vector containing String objects.</p> 434* <p>Each String object contains a sequence of tags found 435* in the document.</p> 436* @param start Initial vector of String objects to which 437* items are to be appended. 438* @param document Document to be parsed. 439* @return Vector of String objects containing list of tag 440* combinations 441*/ 442public Vector<String> listTags (Vector<String> start, String document) 443 { 444 return internalParse(start, 445 new InputSource(new StringReader(document)), 446 (String) null, LISTTAGS); 447 } 448/** 449* List strings contained in document. 450* 451* This method returns a Vector containing String objects. 452* Each String object contains a sequence of tags found 453* in the document. 454* @param document Document to be parsed. 455* @return Vector of String objects containing list of tag 456* combinations. 457*/ 458public Vector<String> listTags (String document) 459 { 460 return internalParse(new Vector<String>(), 461 new InputSource(new StringReader(document)), (String) null, 462 LISTTAGS); 463 } 464/** 465This method lists tags contained in an XML document. 466* @param document This string contains the File object representing 467* the file to be parsed. 468* @return Vector of String objects that contains a listing 469* of the tags in the document 470*/ 471public Vector<String> listTags (File document) 472 { 473 InputSource source; 474 if (debugLevel > 0) 475 { 476 System.out.println("*** Starting parseString"); 477 System.out.println(document); 478 } 479 try 480 { 481 source = new InputSource(new FileInputStream(document)); 482 } 483 catch (FileNotFoundException e) 484 { 485 System.out.println("Unable to open file"); 486 e.printStackTrace(); 487 return null; 488 } 489 return internalParse (new Vector<String>(), 490 source, 491 LISTTAGS); 492 } 493/** 494* This method lists tags contained in an XML document. 495* @param start Vector containing the String objects at the 496* start executing the method 497* @param document Object containing the file to be parsed 498* @return Vector of String objects that contains all of the 499* tags found in the 500* document. 501*/ 502public Vector<String> listTags (Vector<String> start, 503 File document) 504 { 505 InputSource source; 506 if (debugLevel > 0) 507 { 508 System.out.println("*** Starting listTags"); 509 System.out.println(document); 510 } 511 try 512 { 513 source = new InputSource(new FileInputStream(document)); 514 } 515 catch (FileNotFoundException e) 516 { 517 System.out.println("Unable to open file"); 518 e.printStackTrace(); 519 return null; 520 } 521 return internalParse (start, 522 source, 523 LISTTAGS); 524 } 525private Vector<String> internalParse(Vector<String> start, 526 InputSource document, 527 int mode) 528 { 529 String search = (String) null; 530 return internalParse (start, document, search, mode); 531 } 532private Vector<String> internalParse(Vector<String> start, 533 InputSource document, 534 String search, int mode) 535 { 536 items = new Vector<String>(start); 537 XMLReader parser = null; 538 try 539 { 540 parser = XMLReaderFactory.createXMLReader 541 ("org.apache.xerces.parsers.SAXParser"); 542 } 543 catch (SAXException e) 544 { 545 System.out.println ("SAXException error when creating XMLReader"); 546 return null; 547 } 548 parser.setContentHandler(new MyHandler(mode, search)); 549 try 550 { 551 /* 552 ** The argument for parse method must be of type 553 ** InputSource 554 */ 555 parser.parse(document); 556 } 557 catch (SAXParseException e) 558 { 559 System.out.println ("SAXParseException"); 560 } 561 catch (SAXException e) 562 { 563 System.out.println ("SAXException while parsing"); 564 } 565 catch (IOException e) 566 { 567 System.out.println ("IOException while parsing"); 568 } 569 return items; 570 } 571}