001package bradleyross.common; 002import org.xml.sax.*; 003import org.xml.sax.helpers.*; 004import java.util.*; 005import java.io.File; 006import java.io.IOException; 007import java.io.StringReader; 008import java.io.FileInputStream; 009import java.io.FileNotFoundException; 010/** The purpose of this class is to provide 011* a set of tools for parsing XML documents. 012* <p>When you create a class for parsing XML files, it is necessary 013* to create a subclass of org.xml.sax.ContentHandler 014* for a content handler object. The 015* XML parser calls methods in this class when various events 016* occur while reading the XML files. Events would be such things 017* as the start or end of a tagged element.</p> 018* <p>In order to use this class, this class it is necessary 019* to have xercesImpl.jar and xml-apis.jar 020* from the Apache Xerces project in the 021* CLASSPATH.</p> 022* <p>There are two sets of source code for this 023* class, one for versions previous to Java 5 and the other 024* for Java 5 and later.</p> 025* <p>When constructing code using Vector classes, it is necessary to 026* code differently for Java 5 and later versions because 027* version 5 introduced the idea of parameterizing Vector objects.</p> 028* <ul> 029* <li><p>For Java 5 and later, you will see the syntax 030* <code>Vector<String></code>. This means 031* objects making up the Vector are all of type String.</p></li> 032* <li><p>For versions before Java 5, the syntax is simply 033* <code>Vector</code>. This is because there is no method in 034* the earlier versions of restricting Vectors and other list 035* to a single class.</p></li> 036* </ul> 037* <p>The following is an example of a file to be processed by this object.</p> 038* <p><html><head></head><br> 039* <code> <body>aaaa</body></html></code></p> 040* 041* @see bradleyross.demonstrations.getTags 042* @see bradleyross.demonstrations.parseFile 043* @see org.xml.sax.ContentHandler 044* 045* @author Bradley Ross 046*/ 047public class XmlParser 048{ 049/** Indicates mode of parsing operation. */ 050private int mode; 051/** Getter for mode 052 * @return mode value 053 */ 054public int getMode() 055 { return mode; } 056/** Option for listing tags in document. */ 057public static final int LISTTAGS = 1; 058/** Option for searching for strings with specified tags. */ 059public static final int SEARCH = 2; 060/** Vector containing strings or lists of tags in document */ 061private Vector<String> items; 062/** Amount of diagnostic listing to be generated. */ 063private int debugLevel = 0; 064/** 065* This class provides the means of responding to 066* the Xerces parser from the Apache Xerces 067* parser. 068*/ 069protected class MyHandler implements org.xml.sax.ContentHandler 070 { 071 /** 072 * Indicates type of search to be carried out. 073 * <p>Value of LISTTAGS used for the following methods where 074 * the goal is to get a list of the tags and their structure 075 * in the document.</p> 076 * <p><ul> 077 * <li><p>public Vector parseString (Vector start, 078 * File document, String search)</p></li> 079 * <li><p>public Vector listTags (Vector start, String document)</p></li> 080 * <li><p>public Vector listTags (File document, String search)</p></li> 081 * <li><p>public Vector listTags (Vector start, File document, 082 * String search)</p></li> 083 * </ul></p> 084 * <p>Value of SEARCH used for the following methods where the 085 * goal is to get a list of segments of the documents having 086 * the specified tag structure.</p> 087 * <p><ul> 088 * <li><p>public Vector parseString (Vector<String> start, 089 * String document, String 090 * search )</p></li> 091 * <li><p>public Vector parseString (String document, String 092 * search)</p></li> 093 * <li><p>public Vector parseString (File document, String search)</p></li> 094 * <li><p>public Vector parseString (Vector start, 095 * File document, String search)</p></li> 096 * </ul></p> 097 */ 098 int mode; 099 int depth; 100 String searchString; 101 int activeDepth; 102 boolean activeSection = false; 103 StringBuffer activeString = null; 104 boolean testValue; 105 String tags[] = new String[40]; 106 /** 107 * Constructor defining actions taken during parsing. 108 * 109 * @param type This is an integer value defining the type of 110 * parsing operation to be carried out. 111 * @param criteria This String contains the criteria used 112 * for carrying out the parsing operation. 113 */ 114 public MyHandler(int type, String criteria) 115 { 116 mode = type; 117 searchString = criteria; 118 } 119 private void printText (String methodName) 120 { 121 if (debugLevel > 0) 122 { 123 System.out.println("*** Depth: " 124 .concat(Integer.toString(depth)) 125 .concat(" Running ").concat(methodName)); 126 } 127 } 128 private void printText (String methodName, String itemName) 129 { 130 if (debugLevel > 0) 131 { 132 System.out.println("*** Depth: " 133 .concat(Integer.toString(depth)) 134 .concat(" Running ").concat(methodName)); 135 System.out.println("Item: ".concat(itemName)); 136 } 137 } 138 private void printAttributes(Attributes atts) 139 { 140 for (int i=0; i < atts.getLength(); i++) 141 { 142 System.out.println(atts.getLocalName(i).concat(" :: ") 143 .concat(atts.getValue(i))); 144 } 145 } 146 public void setDocumentLocator(Locator locator) 147 { printText ("setDocumentLocation"); } 148 /** 149 * Called by the parser when the start of the document 150 * is encountered. 151 * <p>Initializes fields used in parsing document.</p> 152 */ 153 public void startDocument() throws SAXException 154 { 155 depth = -1 ; 156 activeSection = false; 157 printText("startDocument"); 158 if ((debugLevel > 0) && (mode == SEARCH)) 159 { System.out.println("Search string: ".concat(searchString)); } 160 } 161 /** 162 * Called by the parser when the end of the document 163 * is encountered. 164 */ 165 public void endDocument() throws SAXException 166 { 167 printText("endDocument"); 168 } 169 /** 170 * Called when prefix mapping is started. 171 * <p>No action is taken for this parser action.</p> 172 */ 173 public void startPrefixMapping (String prefix, String uri) 174 throws SAXException 175 { 176 printText ("startPrefixMapping"); 177 } 178 /** 179 * Called when prefix mapping is ended. 180 * <p>No action is taken for this parser action.</p> 181 */ 182 public void endPrefixMapping(String prefix) throws SAXException 183 { 184 printText ("endPrefixMapping"); 185 } 186 /** 187 * Called when a start tag is encountered. 188 * <p>Together with the actions taken in response to 189 * the endElement method, this represents the heart 190 * of the parsing operation.</p> 191 */ 192 public void startElement(String namespaceURI, String localName, 193 String qualifiedName, Attributes atts) throws SAXException 194 { 195 StringBuffer tagList = new StringBuffer(); 196 depth = depth + 1; 197 printText ("startElement", localName); 198 if (debugLevel > 0) 199 { 200 if (atts.getLength() > 0) 201 { printAttributes(atts); } 202 } 203 tags[depth] = localName; 204 tagList = new StringBuffer(); 205 for (int i = 0; i <= depth; i++) 206 { tagList.append("<".concat(tags[i]).concat(">")); } 207 if (debugLevel > 0) 208 { System.out.println(tagList); } 209 if (mode == SEARCH) 210 { 211 if (activeSection) 212 { 213 activeString.append("<".concat(localName).concat(">")); 214 } 215 if ((new String(tagList)).equals(searchString)) 216 { 217 if (debugLevel > 0) 218 { 219 System.out.println("Match found"); 220 } 221 activeSection = true; 222 activeDepth = depth; 223 activeString = new StringBuffer(); 224 } 225 } 226 else if (mode == LISTTAGS) 227 { 228 items.add(new String(tagList)); 229 if (debugLevel > 0) 230 { System.out.println(tagList); } 231 } 232 } 233 /** 234 * Called when an end tag is encountered. 235 */ 236 public void endElement(String namespaceURI, String localName, 237 String qualifiedName) throws SAXException 238 { 239 printText ("endElement", localName); 240 depth = depth - 1; 241 if ((mode == SEARCH) && (activeSection)) 242 { 243 if (depth < activeDepth) 244 { 245 items.add(new String(activeString)); 246 activeString = new StringBuffer(); 247 activeSection = false; 248 } 249 else 250 { 251 activeString.append("</".concat(localName).concat(">")); 252 } 253 } 254 } 255 /** 256 * This method is called when text is encountered between start 257 * and end tags. 258 * <p>Multiple calls of this method may be executed to handle the 259 * text between the tags.</p> 260 */ 261 public void characters(char[] text, int start, int length) 262 throws SAXException 263 { 264 String data = new String(text, start, length) 265 .replaceAll("&", "&") 266 .replaceAll("\\\'", "'") 267 .replaceAll("\\\"", """) 268 .replaceAll("<", "<") 269 .replaceAll(">", ">"); 270 if (mode == SEARCH) 271 { 272 if (activeSection) 273 { 274 activeString.append(data); 275 } 276 } 277 printText ("characters", data); 278 } 279 /** 280 * Called when ignorable whitespace is encountered. 281 * <p>Ignorable whitespace is ignored and no action is 282 * taken.</p> 283 */ 284 public void ignorableWhitespace (char[] text, int start, int length) 285 throws SAXException 286 { 287 printText ("ignorableWhitespace"); 288 } 289 public void processingInstruction(String target, String data) 290 throws SAXException 291 { 292 printText ("processingInstruction"); 293 } 294 public void skippedEntity(String name) throws SAXException 295 { 296 printText("skippedEntity"); 297 } 298 } 299/** 300* Determine amount of diagnostic output. 301* @param level Amount of diagnostic material to be printed. 0 302* is default and results in no diagnostic messages. Higher values 303* produce more diagnostic messages. 304*/ 305public void setDebugLevel (int level) 306 { debugLevel = level; } 307/** 308This method parses an XML document for strings 309* @param start Initial Vector of String objects 310* @param document This string contains the document to be parsed 311* @param search This string indicates the set of tags to be searched 312* for. If the value is 313* <code><Envelope><Body><FetchHandle></code>, 314* the program will return the contents of all 315* <code>FetchHandle</code> tags which are within 316* <code>Body</code> tags which are within 317* <code>Envelope</code> tags. 318* @return Vector of String objects that contains all of the 319* objects from the initial list plus the items found in 320* document. 321*/ 322public Vector<String> parseString (Vector<String> start, String document, 323 String search) 324 { 325 return internalParse(start, 326 new InputSource(new StringReader(document)), 327 search, SEARCH); 328 } 329/** 330This method parses an XML document for strings 331* @param document This string contains the document to be parsed 332* @param search This string indicates the set of tags to be searched 333* for. If the value is 334* <code><Envelope><Body><FetchHandle></code>, 335* the program will return the contents of all 336* <code>FetchHandle</code> tags which are within 337* <code>Body</code> tags which are within 338* <code>Envelope</code> tags. 339* @return Vector of String objects that contains all of the 340* objects found in 341* document. 342*/ 343public Vector<String> parseString (String document, String search) 344 { 345 if (debugLevel > 0) 346 { 347 System.out.println("*** Starting parseString"); 348 System.out.println("Search string is ".concat(search)); 349 System.out.println(document); 350 } 351 return internalParse (new Vector<String>(), 352 new InputSource(new StringReader(document)), 353 search, SEARCH); 354 } 355/** 356This method parses an XML document for strings 357* @param document This string contains the File object representing 358* the file to be parsed. 359* @param search This string indicates the set of tags to be searched 360* for. If the value is 361* <code><Envelope><Body><FetchHandle></code>, 362* the program will return the contents of all 363* <code>FetchHandle</code> tags which are within 364* <code>Body</code> tags which are within 365* <code>Envelope</code> tags. 366* @return Vector of String objects that contains all of the 367* objects found in 368* document. 369*/ 370public Vector<String> parseString (File document, String search) 371 { 372 InputSource source; 373 if (debugLevel > 0) 374 { 375 System.out.println("*** Starting parseString"); 376 System.out.println("Search string is ".concat(search)); 377 System.out.println(document); 378 } 379 try 380 { 381 source = new InputSource(new FileInputStream(document)); 382 } 383 catch (FileNotFoundException e) 384 { 385 System.out.println("Unable to open file"); 386 e.printStackTrace(); 387 return null; 388 } 389 return internalParse (new Vector<String>(), 390 source, 391 search, SEARCH); 392 } 393/** 394This method parses an XML document for strings 395* @param start Vector containing the String objects at the 396* start executing the method 397* @param document This string is the File object to be parsed 398* @param search This string indicates the set of tags to be searched 399* for. If the value is 400* <code><Envelope><Body><FetchHandle></code>, 401* the program will return the contents of all 402* <code>FetchHandle</code> tags which are within 403* <code>Body</code> tags which are within 404* <code>Envelope</code> tags. 405* @return Vector of String objects that contains all of the 406* objects found in 407* document. 408*/ 409public Vector<String> parseString (Vector<String> start, 410 File document, String search) 411 { 412 InputSource source; 413 if (debugLevel > 0) 414 { 415 System.out.println("*** Starting parseString"); 416 System.out.println("Search string is ".concat(search)); 417 System.out.println(document); 418 } 419 try 420 { 421 source = new InputSource(new FileInputStream(document)); 422 } 423 catch (FileNotFoundException e) 424 { 425 System.out.println("Unable to open file"); 426 e.printStackTrace(); 427 return null; 428 } 429 return internalParse (start, 430 source, 431 search, SEARCH); 432 } 433/** 434* List tags contained in an XML document. 435* 436* <p>This method returns a Vector containing String objects.</p> 437* <p>Each String object contains a sequence of tags found 438* in the document.</p> 439* @param start Initial vector of String objects to which 440* items are to be appended. 441* @param document Document to be parsed. 442* @return Vector of String objects containing list of tag 443* combinations 444*/ 445public Vector<String> listTags (Vector<String> start, String document) 446 { 447 return internalParse(start, 448 new InputSource(new StringReader(document)), 449 (String) null, LISTTAGS); 450 } 451/** 452* List strings contained in document. 453* 454* This method returns a Vector containing String objects. 455* Each String object contains a sequence of tags found 456* in the document. 457* @param document Document to be parsed. 458* @return Vector of String objects containing list of tag 459* combinations. 460*/ 461public Vector<String> listTags (String document) 462 { 463 return internalParse(new Vector<String>(), 464 new InputSource(new StringReader(document)), (String) null, 465 LISTTAGS); 466 } 467/** 468This method lists tags contained in an XML document. 469* @param document This string contains the File object representing 470* the file to be parsed. 471* @return Vector of String objects that contains a listing 472* of the tags in the document 473*/ 474public Vector<String> listTags (File document) 475 { 476 InputSource source; 477 if (debugLevel > 0) 478 { 479 System.out.println("*** Starting parseString"); 480 System.out.println(document); 481 } 482 try 483 { 484 source = new InputSource(new FileInputStream(document)); 485 } 486 catch (FileNotFoundException e) 487 { 488 System.out.println("Unable to open file"); 489 e.printStackTrace(); 490 return null; 491 } 492 return internalParse (new Vector<String>(), 493 source, 494 LISTTAGS); 495 } 496/** 497* This method lists tags contained in an XML document. 498* @param start Vector containing the String objects at the 499* start executing the method 500* @param document Object containing the file to be parsed 501* @return Vector of String objects that contains all of the 502* tags found in the 503* document. 504*/ 505public Vector<String> listTags (Vector<String> start, 506 File document) 507 { 508 InputSource source; 509 if (debugLevel > 0) 510 { 511 System.out.println("*** Starting listTags"); 512 System.out.println(document); 513 } 514 try 515 { 516 source = new InputSource(new FileInputStream(document)); 517 } 518 catch (FileNotFoundException e) 519 { 520 System.out.println("Unable to open file"); 521 e.printStackTrace(); 522 return null; 523 } 524 return internalParse (start, 525 source, 526 LISTTAGS); 527 } 528private Vector<String> internalParse(Vector<String> start, 529 InputSource document, 530 int mode) 531 { 532 String search = (String) null; 533 return internalParse (start, document, search, mode); 534 } 535private Vector<String> internalParse(Vector<String> start, 536 InputSource document, 537 String search, int mode) 538 { 539 items = new Vector<String>(start); 540 XMLReader parser = null; 541 try 542 { 543 parser = XMLReaderFactory.createXMLReader 544 ("org.apache.xerces.parsers.SAXParser"); 545 } 546 catch (SAXException e) 547 { 548 System.out.println ("SAXException error when creating XMLReader"); 549 return null; 550 } 551 parser.setContentHandler(new MyHandler(mode, search)); 552 try 553 { 554 /* 555 ** The argument for parse method must be of type 556 ** InputSource 557 */ 558 parser.parse(document); 559 } 560 catch (SAXParseException e) 561 { 562 System.out.println ("SAXParseException"); 563 } 564 catch (SAXException e) 565 { 566 System.out.println ("SAXException while parsing"); 567 } 568 catch (IOException e) 569 { 570 System.out.println ("IOException while parsing"); 571 } 572 return items; 573 } 574}