001package bradleyross.opensource.xerces;
002import org.xml.sax.*;
003import org.xml.sax.helpers.*;
004import java.util.*;
005import java.io.File;
006import java.io.IOException;
007import java.io.StringReader;
008import java.io.FileInputStream;
009import java.io.FileNotFoundException;
/**
* Provides a set of tools for parsing XML documents.
* <p>When you create a class for parsing XML files, it is necessary
*    to create an implementation of org.xml.sax.ContentHandler
*    for a content handler object.  The
*    XML parser calls methods in this class when various events
*    occur while reading the XML files.  Events would be such things
*    as the start or end of a tagged element.</p>
* <p>This class first tries to use the parser
*    <code>org.apache.xerces.parsers.SAXParser</code> from the Apache
*    Xerces project, which requires xercesImpl.jar and xml-apis.jar
*    in the CLASSPATH.  If that driver cannot be created, the default
*    reader returned by
*    <code>XMLReaderFactory.createXMLReader()</code> is used
*    instead.</p>
* <p>There are two sets of source code for this
*    class, one for versions previous to Java 5 and the other
*    for Java 5 and later.</p>
* <p>When constructing code using Vector classes, it is necessary to
*    code differently for Java 5 and later versions because
*    version 5 introduced the idea of parameterizing Vector objects.</p>
* <ul>
* <li><p>For Java 5 and later, you will see the syntax
*     <code>Vector&lt;String&gt;</code>.  This means
*     objects making up the Vector are all of type String.</p></li>
* <li><p>For versions before Java 5, the syntax is simply
*     <code>Vector</code>.  This is because there is no method in
*     the earlier versions of restricting Vectors and other lists
*     to a single class.</p></li>
* </ul>
* <p>The following is an example of a file to be processed by this object.</p>
* <p><code>&lt;html&gt;&lt;head&gt;&lt;/head&gt;<br>
* &lt;body&gt;aaaa&lt;/body&gt;&lt;/html&gt;</code></p>
* <p>The list of results is held in an instance field while a document
*    is being parsed, so a single instance must not be used for several
*    parsing operations at the same time.</p>
* @see bradleyross.demonstrations.getTags
* @see bradleyross.demonstrations.parseFile
* @see org.xml.sax.ContentHandler
*
* @author Bradley Ross
*/
public class XmlParser
{
/** Name of the Apache Xerces SAX driver that is tried first. */
private static final String XERCES_DRIVER =
        "org.apache.xerces.parsers.SAXParser";
/** Initial size of the array of open tags; the array grows on demand. */
private static final int INITIAL_TAG_CAPACITY = 40;
/**
* Mode of the most recent parsing operation
* ({@link #LISTTAGS} or {@link #SEARCH}); zero until a document
* has been parsed.
*/
private int mode;
/**
* Getter for mode.
* @return mode of the most recent parsing operation, or zero if
* nothing has been parsed yet
*/
public int getMode()
   { return mode; }
/** Option for listing tags in document. */
public static final int LISTTAGS = 1;
/** Option for searching for strings with specified tags. */
public static final int SEARCH = 2;
/** Vector containing strings or lists of tags in document */
private Vector<String> items;
/** Amount of diagnostic listing to be generated. */
private int debugLevel = 0;
/**
* This class provides the means of responding to
* the events generated by the SAX parser.
* <p>Results are appended to the list of items belonging
*    to the enclosing XmlParser object.</p>
*/
protected class MyHandler implements org.xml.sax.ContentHandler
   {
   /**
   * Indicates type of operation to be carried out.
   * <p>The value LISTTAGS is used by the listTags methods, where
   *    the goal is to get a list of the tags and their structure
   *    in the document.</p>
   * <p>The value SEARCH is used by the parseString methods, where the
   *    goal is to get a list of segments of the document having
   *    the specified tag structure.</p>
   */
   int mode;
   /** Depth of the current element; the root element has depth 0. */
   int depth;
   /** Sequence of tags, such as &lt;a&gt;&lt;b&gt;, that is searched for. */
   String searchString;
   /** Depth of the element that started the section being collected. */
   int activeDepth;
   /** True while the parser is inside an element matching the search. */
   boolean activeSection = false;
   /** Contents collected so far for the matching element. */
   StringBuffer  activeString = null;
   /** Not used by this class. */
   boolean testValue;
   /** Names of the currently open elements, indexed by depth. */
   String tags[] = new String[INITIAL_TAG_CAPACITY];
   /**
   * Constructor defining actions taken during parsing.
   *
   * @param type This is an integer value defining the type of
   * parsing operation to be carried out.
   * @param criteria This String contains the criteria used
   * for carrying out the parsing operation.
   */
   public MyHandler(int type, String criteria)
      {
      mode = type;
      searchString = criteria;
      }
   /**
   * Prints the name of the running method if diagnostics are enabled.
   * @param methodName name of the handler method being run
   */
   private void printText (String methodName)
      {
      if (debugLevel > 0)
         {
         System.out.println("*** Depth: " + depth
            + " Running " + methodName);
         }
      }
   /**
   * Prints the name of the running method and the item being
   * processed if diagnostics are enabled.
   * @param methodName name of the handler method being run
   * @param itemName element name or text being processed
   */
   private void printText (String methodName, String itemName)
      {
      if (debugLevel > 0)
         {
         System.out.println("*** Depth: " + depth
            + " Running " + methodName);
         System.out.println("Item: " + itemName);
         }
      }
   /**
   * Prints the local name and value of each attribute.
   * @param atts attributes of the current element
   */
   private void printAttributes(Attributes atts)
      {
      for (int i=0; i < atts.getLength(); i++)
         {
         System.out.println(atts.getLocalName(i) + " :: "
             + atts.getValue(i));
         }
      }
   /**
   * Selects the name used for an element.
   * <p>The local name is empty when the reader does not carry out
   *    namespace processing, so the qualified name is used in
   *    that case.</p>
   * @param localName local name reported by the parser
   * @param qualifiedName qualified name reported by the parser
   * @return local name if it is available, otherwise the qualified name
   */
   private String elementName(String localName, String qualifiedName)
      {
      if ((localName == null) || (localName.length() == 0))
         { return qualifiedName; }
      return localName;
      }
   /**
   * Called by the parser to supply a document locator.
   * <p>No action is taken for this parser action.</p>
   */
   public void setDocumentLocator(Locator locator)
      { printText ("setDocumentLocator");  }
   /**
   * Called by the parser when the start of the document
   * is encountered.
   * <p>Initializes fields used in parsing document.</p>
   */
   public void startDocument() throws SAXException
      {
      depth = -1 ;
      activeSection = false;
      printText("startDocument");
      if ((debugLevel > 0) && (mode == SEARCH))
         { System.out.println("Search string: " + searchString); }
      }
   /**
   * Called by the parser when the end of the document
   * is encountered.
   */
   public void endDocument() throws SAXException
      {
      printText("endDocument");
      }
   /**
   * Called when prefix mapping is started.
   * <p>No action is taken for this parser action.</p>
   */
   public void startPrefixMapping (String prefix, String uri)
          throws SAXException
      {
      printText ("startPrefixMapping");
      }
   /**
   * Called when prefix mapping is ended.
   * <p>No action is taken for this parser action.</p>
   */
   public void endPrefixMapping(String prefix) throws SAXException
      {
      printText ("endPrefixMapping");
      }
   /**
   * Called when a start tag is encountered.
   * <p>Together with the actions taken in response to
   *    the endElement method, this represents the heart
   *    of the parsing operation.</p>
   * <p>In LISTTAGS mode the sequence of open tags is added to the
   *    list of items.  In SEARCH mode the start tag (without its
   *    attributes) is added to the section being collected, and
   *    a new section is started if the sequence of open tags
   *    equals the search string.</p>
   */
   public void startElement(String namespaceURI, String localName,
          String qualifiedName, Attributes atts) throws SAXException
      {
      String name = elementName(localName, qualifiedName);
      depth = depth + 1;
      printText ("startElement", name);
      if ((debugLevel > 0) && (atts.getLength() > 0))
         { printAttributes(atts); }
      /* Enlarge the array rather than failing on deeply nested documents. */
      if (depth >= tags.length)
         {
         String larger[] = new String[Math.max(tags.length * 2, depth + 1)];
         System.arraycopy(tags, 0, larger, 0, tags.length);
         tags = larger;
         }
      tags[depth] = name;
      StringBuffer tagList = new StringBuffer();
      for (int i = 0; i <= depth; i++)
         { tagList.append('<').append(tags[i]).append('>'); }
      String path = tagList.toString();
      if (debugLevel > 0)
         { System.out.println(path); }
      if (mode == SEARCH)
         {
         if (activeSection)
            {
            activeString.append('<').append(name).append('>');
            }
         if (path.equals(searchString))
            {
            if (debugLevel > 0)
               {
               System.out.println("Match found");
               }
            activeSection = true;
            activeDepth = depth;
            activeString = new StringBuffer();
            }
         }
      else if (mode == LISTTAGS)
         {
         items.add(path);
         }
      }
   /**
   * Called when an end tag is encountered.
   * <p>In SEARCH mode, the end tag of the matching element completes
   *    the section being collected, which is then added to the list
   *    of items.  End tags of elements inside the section are added
   *    to the section.</p>
   */
   public void endElement(String namespaceURI, String localName,
           String qualifiedName) throws SAXException
      {
      String name = elementName(localName, qualifiedName);
      printText ("endElement", name);
      depth = depth - 1;
      if ((mode == SEARCH) && (activeSection))
         {
         if (depth < activeDepth)
            {
            items.add(activeString.toString());
            activeString = new StringBuffer();
            activeSection = false;
            }
         else
            {
            activeString.append("</").append(name).append('>');
            }
         }
      }
   /**
   * This method is called when text is encountered between start
   * and end tags.
   * <p>Multiple calls of this method may be executed to handle the
   *    text between the tags.</p>
   * <p>The characters &amp;, ', ", &lt; and &gt; are replaced
   *    by the corresponding XML entities before the text is added
   *    to the section being collected.</p>
   */
   public void characters(char[] text, int start, int length)
           throws SAXException
      {
      /* The ampersand must be replaced first so that the entities
      ** generated by the other replacements are not escaped again. */
      String data = new String(text, start, length)
                  .replace("&", "&amp;")
                  .replace("'", "&apos;")
                  .replace("\"", "&quot;")
                  .replace("<", "&lt;")
                  .replace(">", "&gt;");
      if ((mode == SEARCH) && activeSection)
         {
         activeString.append(data);
         }
      printText ("characters", data);
      }
   /**
   * Called when ignorable whitespace is encountered.
   * <p>Ignorable whitespace is ignored and no action is
   *    taken.</p>
   */
   public void ignorableWhitespace (char[] text, int start, int length)
           throws SAXException
      {
      printText ("ignorableWhitespace");
      }
   /**
   * Called when a processing instruction is encountered.
   * <p>No action is taken for this parser action.</p>
   */
   public void processingInstruction(String target, String data)
           throws SAXException
      {
      printText ("processingInstruction");
      }
   /**
   * Called when an entity is skipped by the parser.
   * <p>No action is taken for this parser action.</p>
   */
   public void skippedEntity(String name) throws SAXException
      {
      printText("skippedEntity");
      }
   }
/**
* Determine amount of diagnostic output.
* @param level Amount of diagnostic material to be printed.  0
* is default and results in no diagnostic messages.  Higher values
* produce more diagnostic messages.
*/
public void setDebugLevel (int level)
   { debugLevel = level; }
/**
* This method parses an XML document for strings
* @param start Initial Vector of String objects.  It is copied and
* not modified; a null value is treated as an empty Vector.
* @param document This string contains the document to be parsed
* @param search This string indicates the set of tags to be searched
* for.  If the value is
* <code>&lt;Envelope&gt;&lt;Body&gt;&lt;FetchHandle&gt;</code>,
* the program will return the contents of all
* <code>FetchHandle</code> tags which are within
* <code>Body</code> tags which are within
* <code>Envelope</code> tags.
* @return Vector of String objects that contains all of the
* objects from the initial list plus the items found in
* document, or null if no parser could be created.
*/
public Vector<String> parseString (Vector<String> start, String document,
        String search)
   {
   return internalParse(start,
          new InputSource(new StringReader(document)),
          search, SEARCH);
   }
/**
* This method parses an XML document for strings
* @param document This string contains the document to be parsed
* @param search This string indicates the set of tags to be searched
* for.  If the value is
* <code>&lt;Envelope&gt;&lt;Body&gt;&lt;FetchHandle&gt;</code>,
* the program will return the contents of all
* <code>FetchHandle</code> tags which are within
* <code>Body</code> tags which are within
* <code>Envelope</code> tags.
* @return Vector of String objects that contains all of the
* objects found in
* document, or null if no parser could be created.
*/
public Vector<String> parseString (String document, String search)
   {
   printStart("parseString", SEARCH, search, document);
   return internalParse (new Vector<String>(),
          new InputSource(new StringReader(document)),
          search, SEARCH);
   }
/**
* This method parses an XML document for strings
* @param document This is the File object representing
* the file to be parsed.
* @param search This string indicates the set of tags to be searched
* for.  If the value is
* <code>&lt;Envelope&gt;&lt;Body&gt;&lt;FetchHandle&gt;</code>,
* the program will return the contents of all
* <code>FetchHandle</code> tags which are within
* <code>Body</code> tags which are within
* <code>Envelope</code> tags.
* @return Vector of String objects that contains all of the
* objects found in
* document, or null if the file could not be opened or no
* parser could be created.
*/
public Vector<String> parseString (File document, String search)
   {
   return parseFile("parseString", new Vector<String>(), document,
          search, SEARCH);
   }
/**
* This method parses an XML document for strings
* @param start Vector containing the String objects at the
* start of executing the method.  It is copied and not modified;
* a null value is treated as an empty Vector.
* @param document This is the File object to be parsed
* @param search This string indicates the set of tags to be searched
* for.  If the value is
* <code>&lt;Envelope&gt;&lt;Body&gt;&lt;FetchHandle&gt;</code>,
* the program will return the contents of all
* <code>FetchHandle</code> tags which are within
* <code>Body</code> tags which are within
* <code>Envelope</code> tags.
* @return Vector of String objects that contains all of the
* objects from the initial list plus the items found in
* document, or null if the file could not be opened or no
* parser could be created.
*/
public Vector<String> parseString (Vector<String> start,
       File document, String search)
   {
   return parseFile("parseString", start, document, search, SEARCH);
   }
/**
* List tags contained in an XML document.
*
* <p>This method returns a Vector containing String objects.</p>
* <p>Each String object contains a sequence of tags found
* in the document.</p>
* @param start Initial vector of String objects to which
* items are to be appended.  It is copied and not modified;
* a null value is treated as an empty Vector.
* @param document Document to be parsed.
* @return Vector of String objects containing list of tag
* combinations, or null if no parser could be created.
*/
public Vector<String> listTags (Vector<String> start, String document)
   {
   return internalParse(start,
          new InputSource(new StringReader(document)),
          LISTTAGS);
   }
/**
* List tags contained in an XML document.
*
* This method returns a Vector containing String objects.
* Each String object contains a sequence of tags found
* in the document.
* @param document Document to be parsed.
* @return Vector of String objects containing list of tag
* combinations, or null if no parser could be created.
*/
public Vector<String> listTags (String document)
   {
   return internalParse(new Vector<String>(),
          new InputSource(new StringReader(document)),
          LISTTAGS);
   }
/**
* This method lists tags contained in an XML document.
* @param document This is the File object representing
* the file to be parsed.
* @return Vector of String objects that contains a listing
* of the tags in the document, or null if the file could not
* be opened or no parser could be created.
*/
public Vector<String> listTags (File document)
   {
   return parseFile("listTags", new Vector<String>(), document,
          (String) null, LISTTAGS);
   }
/**
* This method lists tags contained in an XML document.
* @param start Vector containing the String objects at the
* start of executing the method.  It is copied and not modified;
* a null value is treated as an empty Vector.
* @param document Object containing the file to be parsed
* @return Vector of String objects that contains all of the
* objects from the initial list plus the tags found in the
* document, or null if the file could not be opened or no
* parser could be created.
*/
public Vector<String> listTags (Vector<String> start,
       File document)
   {
   return parseFile("listTags", start, document,
          (String) null, LISTTAGS);
   }
/**
* Prints the diagnostic messages for the start of an operation
* if diagnostics are enabled.
* @param methodName name of the public method being run
* @param parseMode LISTTAGS or SEARCH; the search string is only
* printed for SEARCH
* @param search set of tags to be searched for
* @param document document or file being parsed
*/
private void printStart(String methodName, int parseMode,
        String search, Object document)
   {
   if (debugLevel > 0)
      {
      System.out.println("*** Starting " + methodName);
      if (parseMode == SEARCH)
         { System.out.println("Search string is " + search); }
      System.out.println(document);
      }
   }
/**
* Opens a file, parses it, and closes it again.
* @param methodName name of the public method, used in diagnostics
* @param start initial list of items
* @param document file to be parsed
* @param search set of tags to be searched for (null for LISTTAGS)
* @param parseMode LISTTAGS or SEARCH
* @return list of items, or null if the file could not be opened
* or no parser could be created
*/
private Vector<String> parseFile(String methodName,
        Vector<String> start, File document, String search,
        int parseMode)
   {
   FileInputStream stream;
   printStart(methodName, parseMode, search, document);
   try
      {
      stream = new FileInputStream(document);
      }
   catch (FileNotFoundException e)
      {
      System.out.println("Unable to open file");
      e.printStackTrace();
      return null;
      }
   try
      {
      return internalParse(start, new InputSource(stream),
             search, parseMode);
      }
   finally
      {
      /* The stream belongs to this method, so it is always released
      ** here, even when the parser stops early because of an error. */
      try
         { stream.close(); }
      catch (IOException e)
         {
         System.out.println("IOException while closing file: "
                + e.getMessage());
         }
      }
   }
/**
* Creates the reader used for parsing.
* <p>The Apache Xerces driver is tried first.  If it is not available,
*    the default reader of XMLReaderFactory is used.</p>
* @return reader, or null if no reader could be created
*/
private XMLReader createReader()
   {
   try
      {
      return XMLReaderFactory.createXMLReader(XERCES_DRIVER);
      }
   catch (SAXException e)
      {
      if (debugLevel > 0)
         {
         System.out.println("Unable to create " + XERCES_DRIVER
                + ", using default XMLReader");
         }
      }
   try
      {
      return XMLReaderFactory.createXMLReader();
      }
   catch (SAXException e)
      {
      System.out.println ("SAXException error when creating XMLReader");
      return null;
      }
   }
/**
* Parses a document without a search string.
* @param start initial list of items
* @param document document to be parsed
* @param mode LISTTAGS or SEARCH
* @return list of items, or null if no parser could be created
*/
private Vector<String> internalParse(Vector<String> start,
               InputSource document,
                int mode)
   {
   String search = (String) null;
   return internalParse (start, document, search, mode);
   }
/**
* Parses a document and collects the results.
* <p>If an error occurs while parsing, a message is printed and the
*    items collected before the error are returned.</p>
* @param start initial list of items; it is copied and a null
* value is treated as an empty list
* @param document document to be parsed
* @param search set of tags to be searched for (null for LISTTAGS)
* @param mode LISTTAGS or SEARCH
* @return list of items, or null if no parser could be created
*/
private Vector<String> internalParse(Vector<String> start,
               InputSource document,
               String search, int mode)
   {
   /* Remember the mode so that getMode reports the last operation. */
   this.mode = mode;
   if (start == null)
      { items = new Vector<String>(); }
   else
      { items = new Vector<String>(start); }
   XMLReader parser = createReader();
   if (parser == null)
      { return null; }
   parser.setContentHandler(new MyHandler(mode, search));
   try
      {
      /*
      ** The argument for parse method must be of type
      ** InputSource
      */
      parser.parse(document);
      }
   catch (SAXParseException e)
      {
      System.out.println ("SAXParseException at line "
             + e.getLineNumber() + ": " + e.getMessage());
      }
   catch (SAXException e)
      {
      System.out.println ("SAXException while parsing: "
             + e.getMessage());
      }
   catch (IOException e)
      {
      System.out.println ("IOException while parsing: "
             + e.getMessage());
      }
   return items;
   }
}