001package bradleyross.opensource.xerces; 002import org.xml.sax.*; 003import org.xml.sax.helpers.*; 004import java.util.*; 005import java.io.File; 006import java.io.IOException; 007import java.io.StringReader; 008import java.io.FileInputStream; 009import java.io.FileNotFoundException; 010/** The purpose of this class is to provide 011* a set of tools for parsing XML documents. 012* <p>When you create a class for parsing XML files, it is necessary 013* to create a subclass of org.xml.sax.ContentHandler 014* for a content handler object. The 015* XML parser calls methods in this class when various events 016* occur while reading the XML files. Events would be such things 017* as the start or end of a tagged element.</p> 018* <p>In order to use this class, this class it is necessary 019* to have xercesImpl.jar and xml-apis.jar 020* from the Apache Xerces project in the 021* CLASSPATH.</p> 022* <p>There are two sets of source code for this 023* class, one for versions previous to Java 5 and the other 024* for Java 5 and later.</p> 025* <p>When constructing code using Vector classes, it is necessary to 026* code differently for Java 5 and later versions because 027* version 5 introduced the idea of parameterizing Vector objects.</p> 028* <ul> 029* <li><p>For Java 5 and later, you will see the syntax 030* <code>Vector<String></code>. This means 031* objects making up the Vector are all of type String.</p></li> 032* <li><p>For versions before Java 5, the syntax is simply 033* <code>Vector</code>. This is because there is no method in 034* the earlier versions of restricting Vectors and other list 035* to a single class.</p></li> 036* </ul> 037* <p>The following is an example of a file to be processed by this object.</p> 038* <p><code><html><head></head><br> 039* <body>aaaa</body></html></code></p> 040* @see bradleyross.demonstrations.getTags 041* @see bradleyross.demonstrations.parseFile 042* @see org.xml.sax.ContentHandler 043* 044* @author Bradley Ross 045*/ 046public class XmlParser 047{ 048/** Indicates mode of parsing operation. */ 049private int mode; 050/** Getter for mode 051 * @return value of {@link #mode} 052 */ 053public int getMode() 054 { return mode; } 055/** Option for listing tags in document. */ 056public static final int LISTTAGS = 1; 057/** Option for searching for strings with specified tags. */ 058public static final int SEARCH = 2; 059/** Vector containing strings or lists of tags in document */ 060private Vector<String> items; 061/** Amount of diagnostic listing to be generated. */ 062private int debugLevel = 0; 063/** 064* This class provides the means of responding to 065* the Xerces parser from the Apache Xerces 066* parser. 067*/ 068protected class MyHandler implements org.xml.sax.ContentHandler 069 { 070 /** 071 * Indicates type of search to be carried out. 072 * <p>Value of LISTTAGS used for the following methods where 073 * the goal is to get a list of the tags and their structure 074 * in the document.</p> 075 * <p><ul> 076 * <li><p>public Vector parseString (Vector start, 077 * File document, String search)</p></li> 078 * <li><p>public Vector listTags (Vector start, String document)</p></li> 079 * <li><p>public Vector listTags (File document, String search)</p></li> 080 * <li><p>public Vector listTags (Vector start, File document, 081 * String search)</p></li> 082 * </ul></p> 083 * <p>Value of SEARCH used for the following methods where the 084 * goal is to get a list of segments of the documents having 085 * the specified tag structure.</p> 086 * <p><ul> 087 * <li><p>public Vector parseString (Vector<String> start, 088 * String document, String 089 * search )</p></li> 090 * <li><p>public Vector parseString (String document, String 091 * search)</p></li> 092 * <li><p>public Vector parseString (File document, String search)</p></li> 093 * <li><p>public Vector parseString (Vector start, 094 * File document, String search)</p></li> 095 * </ul></p> 096 */ 097 int mode; 098 int depth; 099 String searchString; 100 int activeDepth; 101 boolean activeSection = false; 102 StringBuffer activeString = null; 103 boolean testValue; 104 String tags[] = new String[40]; 105 /** 106 * Constructor defining actions taken during parsing. 107 * 108 * @param type This is an integer value defining the type of 109 * parsing operation to be carried out. 110 * @param criteria This String contains the criteria used 111 * for carrying out the parsing operation. 112 */ 113 public MyHandler(int type, String criteria) 114 { 115 mode = type; 116 searchString = criteria; 117 } 118 private void printText (String methodName) 119 { 120 if (debugLevel > 0) 121 { 122 System.out.println("*** Depth: " 123 .concat(Integer.toString(depth)) 124 .concat(" Running ").concat(methodName)); 125 } 126 } 127 private void printText (String methodName, String itemName) 128 { 129 if (debugLevel > 0) 130 { 131 System.out.println("*** Depth: " 132 .concat(Integer.toString(depth)) 133 .concat(" Running ").concat(methodName)); 134 System.out.println("Item: ".concat(itemName)); 135 } 136 } 137 private void printAttributes(Attributes atts) 138 { 139 for (int i=0; i < atts.getLength(); i++) 140 { 141 System.out.println(atts.getLocalName(i).concat(" :: ") 142 .concat(atts.getValue(i))); 143 } 144 } 145 public void setDocumentLocator(Locator locator) 146 { printText ("setDocumentLocation"); } 147 /** 148 * Called by the parser when the start of the document 149 * is encountered. 150 * <p>Initializes fields used in parsing document.</p> 151 */ 152 public void startDocument() throws SAXException 153 { 154 depth = -1 ; 155 activeSection = false; 156 printText("startDocument"); 157 if ((debugLevel > 0) && (mode == SEARCH)) 158 { System.out.println("Search string: ".concat(searchString)); } 159 } 160 /** 161 * Called by the parser when the end of the document 162 * is encountered. 163 */ 164 public void endDocument() throws SAXException 165 { 166 printText("endDocument"); 167 } 168 /** 169 * Called when prefix mapping is started. 170 * <p>No action is taken for this parser action.</p> 171 */ 172 public void startPrefixMapping (String prefix, String uri) 173 throws SAXException 174 { 175 printText ("startPrefixMapping"); 176 } 177 /** 178 * Called when prefix mapping is ended. 179 * <p>No action is taken for this parser action.</p> 180 */ 181 public void endPrefixMapping(String prefix) throws SAXException 182 { 183 printText ("endPrefixMapping"); 184 } 185 /** 186 * Called when a start tag is encountered. 187 * <p>Together with the actions taken in response to 188 * the endElement method, this represents the heart 189 * of the parsing operation.</p> 190 */ 191 public void startElement(String namespaceURI, String localName, 192 String qualifiedName, Attributes atts) throws SAXException 193 { 194 StringBuffer tagList = new StringBuffer(); 195 depth = depth + 1; 196 printText ("startElement", localName); 197 if (debugLevel > 0) 198 { 199 if (atts.getLength() > 0) 200 { printAttributes(atts); } 201 } 202 tags[depth] = localName; 203 tagList = new StringBuffer(); 204 for (int i = 0; i <= depth; i++) 205 { tagList.append("<".concat(tags[i]).concat(">")); } 206 if (debugLevel > 0) 207 { System.out.println(tagList); } 208 if (mode == SEARCH) 209 { 210 if (activeSection) 211 { 212 activeString.append("<".concat(localName).concat(">")); 213 } 214 if ((new String(tagList)).equals(searchString)) 215 { 216 if (debugLevel > 0) 217 { 218 System.out.println("Match found"); 219 } 220 activeSection = true; 221 activeDepth = depth; 222 activeString = new StringBuffer(); 223 } 224 } 225 else if (mode == LISTTAGS) 226 { 227 items.add(new String(tagList)); 228 if (debugLevel > 0) 229 { System.out.println(tagList); } 230 } 231 } 232 /** 233 * Called when an end tag is encountered. 234 */ 235 public void endElement(String namespaceURI, String localName, 236 String qualifiedName) throws SAXException 237 { 238 printText ("endElement", localName); 239 depth = depth - 1; 240 if ((mode == SEARCH) && (activeSection)) 241 { 242 if (depth < activeDepth) 243 { 244 items.add(new String(activeString)); 245 activeString = new StringBuffer(); 246 activeSection = false; 247 } 248 else 249 { 250 activeString.append("</".concat(localName).concat(">")); 251 } 252 } 253 } 254 /** 255 * This method is called when text is encountered between start 256 * and end tags. 257 * <p>Multiple calls of this method may be executed to handle the 258 * text between the tags.</p> 259 */ 260 public void characters(char[] text, int start, int length) 261 throws SAXException 262 { 263 String data = new String(text, start, length) 264 .replaceAll("&", "&") 265 .replaceAll("\\\'", "'") 266 .replaceAll("\\\"", """) 267 .replaceAll("<", "<") 268 .replaceAll(">", ">"); 269 if (mode == SEARCH) 270 { 271 if (activeSection) 272 { 273 activeString.append(data); 274 } 275 } 276 printText ("characters", data); 277 } 278 /** 279 * Called when ignorable whitespace is encountered. 280 * <p>Ignorable whitespace is ignored and no action is 281 * taken.</p> 282 */ 283 public void ignorableWhitespace (char[] text, int start, int length) 284 throws SAXException 285 { 286 printText ("ignorableWhitespace"); 287 } 288 public void processingInstruction(String target, String data) 289 throws SAXException 290 { 291 printText ("processingInstruction"); 292 } 293 public void skippedEntity(String name) throws SAXException 294 { 295 printText("skippedEntity"); 296 } 297 } 298/** 299* Determine amount of diagnostic output. 300* @param level Amount of diagnostic material to be printed. 0 301* is default and results in no diagnostic messages. Higher values 302* produce more diagnostic messages. 303*/ 304public void setDebugLevel (int level) 305 { debugLevel = level; } 306/** 307This method parses an XML document for strings 308* @param start Initial Vector of String objects 309* @param document This string contains the document to be parsed 310* @param search This string indicates the set of tags to be searched 311* for. If the value is 312* <code><Envelope><Body><FetchHandle></code>, 313* the program will return the contents of all 314* <code>FetchHandle</code> tags which are within 315* <code>Body</code> tags which are within 316* <code>Envelope</code> tags. 317* @return Vector of String objects that contains all of the 318* objects from the initial list plus the items found in 319* document. 320*/ 321public Vector<String> parseString (Vector<String> start, String document, 322 String search) 323 { 324 return internalParse(start, 325 new InputSource(new StringReader(document)), 326 search, SEARCH); 327 } 328/** 329This method parses an XML document for strings 330* @param document This string contains the document to be parsed 331* @param search This string indicates the set of tags to be searched 332* for. If the value is 333* <code><Envelope><Body><FetchHandle></code>, 334* the program will return the contents of all 335* <code>FetchHandle</code> tags which are within 336* <code>Body</code> tags which are within 337* <code>Envelope</code> tags. 338* @return Vector of String objects that contains all of the 339* objects found in 340* document. 341*/ 342public Vector<String> parseString (String document, String search) 343 { 344 if (debugLevel > 0) 345 { 346 System.out.println("*** Starting parseString"); 347 System.out.println("Search string is ".concat(search)); 348 System.out.println(document); 349 } 350 return internalParse (new Vector<String>(), 351 new InputSource(new StringReader(document)), 352 search, SEARCH); 353 } 354/** 355This method parses an XML document for strings 356* @param document This string contains the File object representing 357* the file to be parsed. 358* @param search This string indicates the set of tags to be searched 359* for. If the value is 360* <code><Envelope><Body><FetchHandle></code>, 361* the program will return the contents of all 362* <code>FetchHandle</code> tags which are within 363* <code>Body</code> tags which are within 364* <code>Envelope</code> tags. 365* @return Vector of String objects that contains all of the 366* objects found in 367* document. 368*/ 369public Vector<String> parseString (File document, String search) 370 { 371 InputSource source; 372 if (debugLevel > 0) 373 { 374 System.out.println("*** Starting parseString"); 375 System.out.println("Search string is ".concat(search)); 376 System.out.println(document); 377 } 378 try 379 { 380 source = new InputSource(new FileInputStream(document)); 381 } 382 catch (FileNotFoundException e) 383 { 384 System.out.println("Unable to open file"); 385 e.printStackTrace(); 386 return null; 387 } 388 return internalParse (new Vector<String>(), 389 source, 390 search, SEARCH); 391 } 392/** 393This method parses an XML document for strings 394* @param start Vector containing the String objects at the 395* start executing the method 396* @param document This string is the File object to be parsed 397* @param search This string indicates the set of tags to be searched 398* for. If the value is 399* <code><Envelope><Body><FetchHandle></code>, 400* the program will return the contents of all 401* <code>FetchHandle</code> tags which are within 402* <code>Body</code> tags which are within 403* <code>Envelope</code> tags. 404* @return Vector of String objects that contains all of the 405* objects found in 406* document. 407*/ 408public Vector<String> parseString (Vector<String> start, 409 File document, String search) 410 { 411 InputSource source; 412 if (debugLevel > 0) 413 { 414 System.out.println("*** Starting parseString"); 415 System.out.println("Search string is ".concat(search)); 416 System.out.println(document); 417 } 418 try 419 { 420 source = new InputSource(new FileInputStream(document)); 421 } 422 catch (FileNotFoundException e) 423 { 424 System.out.println("Unable to open file"); 425 e.printStackTrace(); 426 return null; 427 } 428 return internalParse (start, 429 source, 430 search, SEARCH); 431 } 432/** 433* List tags contained in an XML document. 434* 435* <p>This method returns a Vector containing String objects.</p> 436* <p>Each String object contains a sequence of tags found 437* in the document.</p> 438* @param start Initial vector of String objects to which 439* items are to be appended. 440* @param document Document to be parsed. 441* @return Vector of String objects containing list of tag 442* combinations 443*/ 444public Vector<String> listTags (Vector<String> start, String document) 445 { 446 return internalParse(start, 447 new InputSource(new StringReader(document)), 448 (String) null, LISTTAGS); 449 } 450/** 451* List strings contained in document. 452* 453* This method returns a Vector containing String objects. 454* Each String object contains a sequence of tags found 455* in the document. 456* @param document Document to be parsed. 457* @return Vector of String objects containing list of tag 458* combinations. 459*/ 460public Vector<String> listTags (String document) 461 { 462 return internalParse(new Vector<String>(), 463 new InputSource(new StringReader(document)), (String) null, 464 LISTTAGS); 465 } 466/** 467This method lists tags contained in an XML document. 468* @param document This string contains the File object representing 469* the file to be parsed. 470* @return Vector of String objects that contains a listing 471* of the tags in the document 472*/ 473public Vector<String> listTags (File document) 474 { 475 InputSource source; 476 if (debugLevel > 0) 477 { 478 System.out.println("*** Starting parseString"); 479 System.out.println(document); 480 } 481 try 482 { 483 source = new InputSource(new FileInputStream(document)); 484 } 485 catch (FileNotFoundException e) 486 { 487 System.out.println("Unable to open file"); 488 e.printStackTrace(); 489 return null; 490 } 491 return internalParse (new Vector<String>(), 492 source, 493 LISTTAGS); 494 } 495/** 496* This method lists tags contained in an XML document. 497* @param start Vector containing the String objects at the 498* start executing the method 499* @param document Object containing the file to be parsed 500* @return Vector of String objects that contains all of the 501* tags found in the 502* document. 503*/ 504public Vector<String> listTags (Vector<String> start, 505 File document) 506 { 507 InputSource source; 508 if (debugLevel > 0) 509 { 510 System.out.println("*** Starting listTags"); 511 System.out.println(document); 512 } 513 try 514 { 515 source = new InputSource(new FileInputStream(document)); 516 } 517 catch (FileNotFoundException e) 518 { 519 System.out.println("Unable to open file"); 520 e.printStackTrace(); 521 return null; 522 } 523 return internalParse (start, 524 source, 525 LISTTAGS); 526 } 527private Vector<String> internalParse(Vector<String> start, 528 InputSource document, 529 int mode) 530 { 531 String search = (String) null; 532 return internalParse (start, document, search, mode); 533 } 534private Vector<String> internalParse(Vector<String> start, 535 InputSource document, 536 String search, int mode) 537 { 538 items = new Vector<String>(start); 539 XMLReader parser = null; 540 try 541 { 542 parser = XMLReaderFactory.createXMLReader 543 ("org.apache.xerces.parsers.SAXParser"); 544 } 545 catch (SAXException e) 546 { 547 System.out.println ("SAXException error when creating XMLReader"); 548 return null; 549 } 550 parser.setContentHandler(new MyHandler(mode, search)); 551 try 552 { 553 /* 554 ** The argument for parse method must be of type 555 ** InputSource 556 */ 557 parser.parse(document); 558 } 559 catch (SAXParseException e) 560 { 561 System.out.println ("SAXParseException"); 562 } 563 catch (SAXException e) 564 { 565 System.out.println ("SAXException while parsing"); 566 } 567 catch (IOException e) 568 { 569 System.out.println ("IOException while parsing"); 570 } 571 return items; 572 } 573}