001package bradleyross.opensource.xerces;
002import org.xml.sax.*;
003import org.xml.sax.helpers.*;
004import java.util.*;
005import java.io.File;
006import java.io.IOException;
007import java.io.StringReader;
008import java.io.FileInputStream;
009import java.io.FileNotFoundException;
/**
 * Provides a set of tools for parsing XML documents.
 * <p>When you create a class for parsing XML files, it is necessary
 *    to create an implementation of org.xml.sax.ContentHandler
 *    to act as the content handler object.  The
 *    XML parser calls methods in that object when various events
 *    occur while reading the XML files.  Events would be such things
 *    as the start or end of a tagged element.</p>
 * <p>This class prefers the SAX driver from the Apache Xerces project
 *    (xercesImpl.jar and xml-apis.jar in the CLASSPATH).  When that
 *    driver cannot be created, the default XMLReader supplied by
 *    XMLReaderFactory is used instead.</p>
 * <p>There are two sets of source code for this
 *    class, one for versions previous to Java 5 and the other
 *    for Java 5 and later.</p>
 * <p>When constructing code using Vector classes, it is necessary to
 *    code differently for Java 5 and later versions because
 *    version 5 introduced the idea of parameterizing Vector objects.</p>
 * <ul>
 * <li><p>For Java 5 and later, you will see the syntax
 *     <code>Vector&lt;String&gt;</code>.  This means
 *     objects making up the Vector are all of type String.</p></li>
 * <li><p>For versions before Java 5, the syntax is simply
 *     <code>Vector</code>.  This is because there is no method in
 *     the earlier versions of restricting Vectors and other lists
 *     to a single class.</p></li>
 * </ul>
 * <p>The following is an example of a file to be processed by this object.</p>
 * <p><code>&lt;html&gt;&lt;head&gt;&lt;/head&gt;<br />
 * &lt;body&gt;aaaa&lt;/body&gt;&lt;/html&gt;</code></p>
 * <p>Each parse stores its results in a field of this object, so a
 *    single instance should not be used for two parses at the same
 *    time.</p>
 * @see bradleyross.demonstrations.getTags
 * @see bradleyross.demonstrations.parseFile
 * @see org.xml.sax.ContentHandler
 *
 * @author Bradley Ross
 */
public class XmlParser {
    /** Option for listing tags in document. */
    public static final int LISTTAGS = 1;
    /** Option for searching for strings with specified tags. */
    public static final int SEARCH = 2;
    /** Class name of the preferred SAX driver (Apache Xerces). */
    private static final String XERCES_DRIVER =
            "org.apache.xerces.parsers.SAXParser";
    /** Initial size of the array holding the stack of open tags. */
    private static final int INITIAL_TAG_CAPACITY = 40;

    /** Mode of the most recently started parsing operation. */
    private int mode;
    /** Vector containing strings or lists of tags in document */
    private Vector<String> items;
    /** Amount of diagnostic listing to be generated. */
    private int debugLevel = 0;

    /**
     * Getter for mode.
     * @return mode of the most recently started parsing operation
     *         ({@link #LISTTAGS} or {@link #SEARCH}), or 0 if no
     *         parse has been started yet
     */
    public int getMode() {
        return mode;
    }

    /**
     * This class provides the means of responding to
     * the events generated by the SAX parser.
     */
    protected class MyHandler implements org.xml.sax.ContentHandler {
        /**
         * Indicates type of search to be carried out.
         * <p>Value of LISTTAGS is used by the listTags methods where
         *    the goal is to get a list of the tags and their structure
         *    in the document.</p>
         * <p>Value of SEARCH is used by the parseString methods where the
         *    goal is to get a list of segments of the documents having
         *    the specified tag structure.</p>
         */
        int mode;
        /** Zero-based nesting depth of the current element; -1 outside the root. */
        int depth;
        /** Tag path (for example <code>&lt;a&gt;&lt;b&gt;</code>) searched for in SEARCH mode. */
        String searchString;
        /** Depth of the element that opened the section being captured. */
        int activeDepth;
        /** True while the content of a matching element is being captured. */
        boolean activeSection = false;
        /** Accumulates the content of the matching element being captured. */
        StringBuffer activeString = null;
        /** Not referenced by this class. */
        boolean testValue;
        /** Names of the currently open elements, indexed by depth; grown on demand. */
        String tags[] = new String[INITIAL_TAG_CAPACITY];

        /**
         * Constructor defining actions taken during parsing.
         *
         * @param type This is an integer value defining the type of
         * parsing operation to be carried out.
         * @param criteria This String contains the criteria used
         * for carrying out the parsing operation.
         */
        public MyHandler(int type, String criteria) {
            mode = type;
            searchString = criteria;
        }

        /** Prints the current depth and callback name when debugging is on. */
        private void printText(String methodName) {
            if (debugLevel > 0) {
                System.out.println("*** Depth: " + depth
                        + " Running " + methodName);
            }
        }

        /** Prints the current depth, callback name and item when debugging is on. */
        private void printText(String methodName, String itemName) {
            if (debugLevel > 0) {
                System.out.println("*** Depth: " + depth
                        + " Running " + methodName);
                System.out.println("Item: " + itemName);
            }
        }

        /** Prints each attribute as <code>name :: value</code>. */
        private void printAttributes(Attributes atts) {
            for (int i = 0; i < atts.getLength(); i++) {
                System.out.println(atts.getLocalName(i) + " :: "
                        + atts.getValue(i));
            }
        }

        /**
         * Chooses the name used for an element: the local name, or the
         * qualified name when the parser reports an empty local name.
         */
        private String elementName(String localName, String qualifiedName) {
            if (localName == null || localName.length() == 0) {
                return qualifiedName;
            }
            return localName;
        }

        /** Grows the tag stack so that <code>index</code> is a valid position. */
        private void ensureTagCapacity(int index) {
            if (index >= tags.length) {
                String[] larger = new String[Math.max(tags.length * 2, index + 1)];
                System.arraycopy(tags, 0, larger, 0, tags.length);
                tags = larger;
            }
        }

        /** No action is taken other than diagnostic output. */
        public void setDocumentLocator(Locator locator) {
            printText("setDocumentLocator");
        }

        /**
         * Called by the parser when the start of the document
         * is encountered.
         * <p>Initializes fields used in parsing document.</p>
         */
        public void startDocument() throws SAXException {
            depth = -1;
            activeSection = false;
            printText("startDocument");
            if ((debugLevel > 0) && (mode == SEARCH)) {
                System.out.println("Search string: " + searchString);
            }
        }

        /**
         * Called by the parser when the end of the document
         * is encountered.
         */
        public void endDocument() throws SAXException {
            printText("endDocument");
        }

        /**
         * Called when prefix mapping is started.
         * <p>No action is taken for this parser action.</p>
         */
        public void startPrefixMapping(String prefix, String uri)
                throws SAXException {
            printText("startPrefixMapping");
        }

        /**
         * Called when prefix mapping is ended.
         * <p>No action is taken for this parser action.</p>
         */
        public void endPrefixMapping(String prefix) throws SAXException {
            printText("endPrefixMapping");
        }

        /**
         * Called when a start tag is encountered.
         * <p>Together with the actions taken in response to
         *    the endElement method, this represents the heart
         *    of the parsing operation.</p>
         * <p>The element is pushed on the tag stack and the path of open
         *    tags is built.  In LISTTAGS mode the path is added to the
         *    result list.  In SEARCH mode a start tag (without attributes)
         *    is appended to the section being captured, and a new capture
         *    starts when the path equals the search string.</p>
         */
        public void startElement(String namespaceURI, String localName,
                String qualifiedName, Attributes atts) throws SAXException {
            String name = elementName(localName, qualifiedName);
            depth = depth + 1;
            printText("startElement", name);
            if ((debugLevel > 0) && (atts.getLength() > 0)) {
                printAttributes(atts);
            }
            ensureTagCapacity(depth);
            tags[depth] = name;
            StringBuffer tagList = new StringBuffer();
            for (int i = 0; i <= depth; i++) {
                tagList.append('<').append(tags[i]).append('>');
            }
            String path = tagList.toString();
            if (debugLevel > 0) {
                System.out.println(path);
            }
            if (mode == SEARCH) {
                if (activeSection) {
                    activeString.append('<').append(name).append('>');
                }
                if (path.equals(searchString)) {
                    if (debugLevel > 0) {
                        System.out.println("Match found");
                    }
                    activeSection = true;
                    activeDepth = depth;
                    activeString = new StringBuffer();
                }
            } else if (mode == LISTTAGS) {
                items.add(path);
            }
        }

        /**
         * Called when an end tag is encountered.
         * <p>In SEARCH mode, closing the element that opened the captured
         *    section stores the captured text in the result list; closing
         *    any element nested inside it appends an end tag to the
         *    captured text.</p>
         */
        public void endElement(String namespaceURI, String localName,
                String qualifiedName) throws SAXException {
            String name = elementName(localName, qualifiedName);
            printText("endElement", name);
            depth = depth - 1;
            if ((mode == SEARCH) && activeSection) {
                if (depth < activeDepth) {
                    // The element that started the capture has just closed.
                    items.add(activeString.toString());
                    activeString = new StringBuffer();
                    activeSection = false;
                } else {
                    activeString.append("</").append(name).append('>');
                }
            }
        }

        /**
         * This method is called when text is encountered between start
         * and end tags.
         * <p>Multiple calls of this method may be executed to handle the
         *    text between the tags.</p>
         * <p>The characters <code>&amp; ' " &lt; &gt;</code> are replaced
         *    by their predefined XML entities before the text is appended
         *    to the section being captured.</p>
         */
        public void characters(char[] text, int start, int length)
                throws SAXException {
            // The ampersand must be escaped first so that the entities
            // produced by the later replacements are not escaped again.
            String data = new String(text, start, length)
                    .replace("&", "&amp;")
                    .replace("'", "&apos;")
                    .replace("\"", "&quot;")
                    .replace("<", "&lt;")
                    .replace(">", "&gt;");
            if ((mode == SEARCH) && activeSection) {
                activeString.append(data);
            }
            printText("characters", data);
        }

        /**
         * Called when ignorable whitespace is encountered.
         * <p>Ignorable whitespace is ignored and no action is
         *    taken.</p>
         */
        public void ignorableWhitespace(char[] text, int start, int length)
                throws SAXException {
            printText("ignorableWhitespace");
        }

        /** No action is taken other than diagnostic output. */
        public void processingInstruction(String target, String data)
                throws SAXException {
            printText("processingInstruction");
        }

        /** No action is taken other than diagnostic output. */
        public void skippedEntity(String name) throws SAXException {
            printText("skippedEntity");
        }
    }

    /**
     * Determine amount of diagnostic output.
     * @param level Amount of diagnostic material to be printed.  0
     * is default and results in no diagnostic messages.  Higher values
     * produce more diagnostic messages.
     */
    public void setDebugLevel(int level) {
        debugLevel = level;
    }

    /**
     * This method parses an XML document for strings
     * @param start Initial Vector of String objects; it is copied, not
     * modified, and null is treated as an empty list
     * @param document This string contains the document to be parsed
     * @param search This string indicates the set of tags to be searched
     * for.  If the value is
     * <code>&lt;Envelope&gt;&lt;Body&gt;&lt;FetchHandle&gt;</code>,
     * the program will return the contents of all
     * <code>FetchHandle</code> tags which are within
     * <code>Body</code> tags which are within
     * <code>Envelope</code> tags.
     * @return Vector of String objects that contains all of the
     * objects from the initial list plus the items found in
     * document, or null if no XML reader could be created.
     */
    public Vector<String> parseString(Vector<String> start, String document,
            String search) {
        return internalParse(start,
                new InputSource(new StringReader(document)),
                search, SEARCH);
    }

    /**
     * This method parses an XML document for strings
     * @param document This string contains the document to be parsed
     * @param search This string indicates the set of tags to be searched
     * for.  If the value is
     * <code>&lt;Envelope&gt;&lt;Body&gt;&lt;FetchHandle&gt;</code>,
     * the program will return the contents of all
     * <code>FetchHandle</code> tags which are within
     * <code>Body</code> tags which are within
     * <code>Envelope</code> tags.
     * @return Vector of String objects that contains all of the
     * objects found in
     * document, or null if no XML reader could be created.
     */
    public Vector<String> parseString(String document, String search) {
        if (debugLevel > 0) {
            System.out.println("*** Starting parseString");
            System.out.println("Search string is " + search);
            System.out.println(document);
        }
        return internalParse(new Vector<String>(),
                new InputSource(new StringReader(document)),
                search, SEARCH);
    }

    /**
     * This method parses an XML document for strings
     * @param document The File object representing
     * the file to be parsed.
     * @param search This string indicates the set of tags to be searched
     * for.  If the value is
     * <code>&lt;Envelope&gt;&lt;Body&gt;&lt;FetchHandle&gt;</code>,
     * the program will return the contents of all
     * <code>FetchHandle</code> tags which are within
     * <code>Body</code> tags which are within
     * <code>Envelope</code> tags.
     * @return Vector of String objects that contains all of the
     * objects found in
     * document, or null if the file could not be opened or no
     * XML reader could be created.
     */
    public Vector<String> parseString(File document, String search) {
        if (debugLevel > 0) {
            System.out.println("*** Starting parseString");
            System.out.println("Search string is " + search);
            System.out.println(document);
        }
        return parseFile(new Vector<String>(), document, search, SEARCH);
    }

    /**
     * This method parses an XML document for strings
     * @param start Vector containing the String objects at the
     * start of executing the method; it is copied, not modified, and
     * null is treated as an empty list
     * @param document The File object to be parsed
     * @param search This string indicates the set of tags to be searched
     * for.  If the value is
     * <code>&lt;Envelope&gt;&lt;Body&gt;&lt;FetchHandle&gt;</code>,
     * the program will return the contents of all
     * <code>FetchHandle</code> tags which are within
     * <code>Body</code> tags which are within
     * <code>Envelope</code> tags.
     * @return Vector of String objects that contains all of the
     * objects from the initial list plus the items found in
     * document, or null if the file could not be opened or no
     * XML reader could be created.
     */
    public Vector<String> parseString(Vector<String> start,
            File document, String search) {
        if (debugLevel > 0) {
            System.out.println("*** Starting parseString");
            System.out.println("Search string is " + search);
            System.out.println(document);
        }
        return parseFile(start, document, search, SEARCH);
    }

    /**
     * List tags contained in an XML document.
     *
     * <p>This method returns a Vector containing String objects.</p>
     * <p>Each String object contains a sequence of tags found
     * in the document.</p>
     * @param start Initial vector of String objects to which
     * items are to be appended; it is copied, not modified, and null
     * is treated as an empty list.
     * @param document Document to be parsed.
     * @return Vector of String objects containing list of tag
     * combinations, or null if no XML reader could be created
     */
    public Vector<String> listTags(Vector<String> start, String document) {
        return internalParse(start,
                new InputSource(new StringReader(document)),
                (String) null, LISTTAGS);
    }

    /**
     * List tags contained in an XML document.
     *
     * This method returns a Vector containing String objects.
     * Each String object contains a sequence of tags found
     * in the document.
     * @param document Document to be parsed.
     * @return Vector of String objects containing list of tag
     * combinations, or null if no XML reader could be created.
     */
    public Vector<String> listTags(String document) {
        return internalParse(new Vector<String>(),
                new InputSource(new StringReader(document)),
                (String) null, LISTTAGS);
    }

    /**
     * This method lists tags contained in an XML document.
     * @param document The File object representing
     * the file to be parsed.
     * @return Vector of String objects that contains a listing
     * of the tags in the document, or null if the file could not be
     * opened or no XML reader could be created
     */
    public Vector<String> listTags(File document) {
        if (debugLevel > 0) {
            System.out.println("*** Starting listTags");
            System.out.println(document);
        }
        return parseFile(new Vector<String>(), document,
                (String) null, LISTTAGS);
    }

    /**
     * This method lists tags contained in an XML document.
     * @param start Vector containing the String objects at the
     * start of executing the method; it is copied, not modified, and
     * null is treated as an empty list
     * @param document Object containing the file to be parsed
     * @return Vector of String objects that contains all of the
     * objects from the initial list plus the tags found in the
     * document, or null if the file could not be opened or no
     * XML reader could be created.
     */
    public Vector<String> listTags(Vector<String> start,
            File document) {
        if (debugLevel > 0) {
            System.out.println("*** Starting listTags");
            System.out.println(document);
        }
        return parseFile(start, document, (String) null, LISTTAGS);
    }

    /**
     * Opens a file, parses it and closes the file again.
     * @return result of the parse, or null if the file could not be opened
     */
    private Vector<String> parseFile(Vector<String> start, File document,
            String search, int mode) {
        FileInputStream stream;
        try {
            stream = new FileInputStream(document);
        } catch (FileNotFoundException e) {
            System.out.println("Unable to open file");
            e.printStackTrace();
            return null;
        }
        try {
            return internalParse(start, new InputSource(stream), search, mode);
        } finally {
            // Close explicitly so the file is released even when the
            // parser was never created or stopped early.
            try {
                stream.close();
            } catch (IOException e) {
                if (debugLevel > 0) {
                    System.out.println("IOException while closing file: "
                            + e.getMessage());
                }
            }
        }
    }

    /** Runs a parse that has no search string. */
    private Vector<String> internalParse(Vector<String> start,
            InputSource document,
            int mode) {
        return internalParse(start, document, (String) null, mode);
    }

    /**
     * Creates the XML reader: the Xerces driver when it can be created,
     * otherwise the default reader of XMLReaderFactory.
     * @return the reader, or null if neither could be created
     */
    private XMLReader createReader() {
        try {
            return XMLReaderFactory.createXMLReader(XERCES_DRIVER);
        } catch (SAXException e) {
            if (debugLevel > 0) {
                System.out.println("Xerces driver not available,"
                        + " using default XMLReader: " + e.getMessage());
            }
        }
        try {
            return XMLReaderFactory.createXMLReader();
        } catch (SAXException e) {
            System.out.println("SAXException error when creating XMLReader");
            return null;
        }
    }

    /**
     * Carries out a parse and collects the results.
     * <p>When the parse fails part way through, a message is printed and
     *    the items collected up to that point are returned.</p>
     * @param start items placed at the front of the result; null is
     *        treated as an empty list
     * @param document source of the XML text
     * @param search tag path to search for (SEARCH mode only)
     * @param mode {@link #LISTTAGS} or {@link #SEARCH}
     * @return the collected items, or null if no XML reader could be created
     */
    private Vector<String> internalParse(Vector<String> start,
            InputSource document,
            String search, int mode) {
        this.mode = mode;
        items = (start == null)
                ? new Vector<String>() : new Vector<String>(start);
        // NOTE(review): entity resolution is left at the reader's defaults;
        // consider restricting it if documents come from untrusted sources.
        XMLReader parser = createReader();
        if (parser == null) {
            return null;
        }
        parser.setContentHandler(new MyHandler(mode, search));
        try {
            parser.parse(document);
        } catch (SAXParseException e) {
            System.out.println("SAXParseException at line "
                    + e.getLineNumber() + ", column " + e.getColumnNumber()
                    + ": " + e.getMessage());
        } catch (SAXException e) {
            System.out.println("SAXException while parsing: "
                    + e.getMessage());
        } catch (IOException e) {
            System.out.println("IOException while parsing: "
                    + e.getMessage());
        }
        return items;
    }
}