001package bradleyross.common;
002import org.xml.sax.*;
003import org.xml.sax.helpers.*;
004import java.util.*;
005import java.io.File;
006import java.io.IOException;
007import java.io.StringReader;
008import java.io.FileInputStream;
009import java.io.FileNotFoundException;
/**
 * A set of tools for parsing XML documents with SAX.
 *
 * <p>Parsing is driven by a {@link org.xml.sax.ContentHandler}: the XML
 *    parser calls methods on the handler when events such as the start or
 *    end of a tagged element occur.  The handler used here is the inner
 *    class {@link MyHandler}.</p>
 * <p>Two operations are offered:</p>
 * <ul>
 * <li><p><code>listTags</code> returns one string per element, holding the
 *     chain of tags from the root to that element, for example
 *     <code>&lt;html&gt;&lt;body&gt;</code>.</p></li>
 * <li><p><code>parseString</code> returns the contents of every element
 *     whose tag chain equals the search string.</p></li>
 * </ul>
 * <p>The Apache Xerces parser (xercesImpl.jar and xml-apis.jar) is used
 *    when it is in the CLASSPATH.  When it is not, the default
 *    <code>XMLReader</code> supplied by
 *    <code>XMLReaderFactory.createXMLReader()</code> is used instead.</p>
 * <p>The code uses parameterized collections such as
 *    <code>Vector&lt;String&gt;</code> and therefore requires Java 5 or
 *    later.</p>
 * <p>The following is an example of a file to be processed by this object.</p>
 * <p>&lt;html&gt;&lt;head&gt;&lt;/head&gt;<br>
 *    <code> &lt;body&gt;aaaa&lt;/body&gt;&lt;/html&gt;</code></p>
 * <p>An instance keeps the result list and the current mode in fields that
 *    are overwritten by every parse call.</p>
 *
 * @see bradleyross.demonstrations.getTags
 * @see bradleyross.demonstrations.parseFile
 * @see org.xml.sax.ContentHandler
 *
 * @author Bradley Ross
 */
public class XmlParser {
    /** Option for listing tags in document. */
    public static final int LISTTAGS = 1;
    /** Option for searching for strings with specified tags. */
    public static final int SEARCH = 2;

    /** Preferred SAX driver; tried first for compatibility with existing setups. */
    private static final String XERCES_PARSER =
            "org.apache.xerces.parsers.SAXParser";
    /** SAX2 feature that makes the reader report local names to the handler. */
    private static final String NAMESPACES_FEATURE =
            "http://xml.org/sax/features/namespaces";

    /** Mode of the most recent parsing operation (0 before the first parse). */
    private int mode;
    /** Vector containing strings or lists of tags in document. */
    private Vector<String> items;
    /** Amount of diagnostic listing to be generated. */
    private int debugLevel = 0;

    /**
     * Getter for mode.
     *
     * @return {@link #LISTTAGS} or {@link #SEARCH} for the most recent
     *         parse, or 0 if nothing has been parsed yet
     */
    public int getMode() {
        return mode;
    }

    /**
     * Content handler that receives the SAX events and fills the
     * enclosing parser's result list.
     */
    protected class MyHandler implements org.xml.sax.ContentHandler {
        /**
         * Type of operation: {@link XmlParser#LISTTAGS} collects the tag
         * chain of every element, {@link XmlParser#SEARCH} collects the
         * contents of elements whose tag chain equals the search string.
         */
        int mode;
        /** Zero-based nesting depth of the current element; -1 outside the root. */
        int depth;
        /** Tag chain to look for in SEARCH mode. */
        String searchString;
        /** Depth of the element whose contents are currently being captured. */
        int activeDepth;
        /** True while inside an element that matched the search string. */
        boolean activeSection = false;
        /** Accumulates the contents of the matched element. */
        StringBuffer activeString = null;
        boolean testValue;
        /** Tag names from the root down to the current element; grown on demand. */
        String tags[] = new String[40];

        /**
         * Constructor defining actions taken during parsing.
         *
         * @param type integer value defining the type of parsing
         *        operation to be carried out
         * @param criteria criteria used for carrying out the parsing
         *        operation (the tag chain to search for); may be null
         */
        public MyHandler(int type, String criteria) {
            mode = type;
            searchString = criteria;
        }

        /** Prints the current depth and callback name when debugging is on. */
        private void printText(String methodName) {
            if (debugLevel > 0) {
                System.out.println("*** Depth: " + depth
                        + " Running " + methodName);
            }
        }

        /** Same as {@link #printText(String)} plus the item being handled. */
        private void printText(String methodName, String itemName) {
            if (debugLevel > 0) {
                printText(methodName);
                // '+' rather than concat so that a null item cannot throw.
                System.out.println("Item: " + itemName);
            }
        }

        /** Prints every attribute as <code>name :: value</code>. */
        private void printAttributes(Attributes atts) {
            for (int i = 0; i < atts.getLength(); i++) {
                System.out.println(atts.getLocalName(i) + " :: "
                        + atts.getValue(i));
            }
        }

        /** No action is taken apart from diagnostic output. */
        public void setDocumentLocator(Locator locator) {
            printText("setDocumentLocator");
        }

        /**
         * Called by the parser when the start of the document
         * is encountered.
         * <p>Initializes fields used in parsing document.</p>
         */
        public void startDocument() throws SAXException {
            depth = -1;
            activeSection = false;
            printText("startDocument");
            if ((debugLevel > 0) && (mode == SEARCH)) {
                System.out.println("Search string: " + searchString);
            }
        }

        /**
         * Called by the parser when the end of the document
         * is encountered.
         */
        public void endDocument() throws SAXException {
            printText("endDocument");
        }

        /**
         * Called when prefix mapping is started.
         * <p>No action is taken for this parser action.</p>
         */
        public void startPrefixMapping(String prefix, String uri)
                throws SAXException {
            printText("startPrefixMapping");
        }

        /**
         * Called when prefix mapping is ended.
         * <p>No action is taken for this parser action.</p>
         */
        public void endPrefixMapping(String prefix) throws SAXException {
            printText("endPrefixMapping");
        }

        /**
         * Called when a start tag is encountered.
         * <p>Together with the actions taken in response to
         *    the endElement method, this represents the heart
         *    of the parsing operation.</p>
         * <p>Attributes are only printed for diagnostics; they are not
         *    part of the tag chain or of the captured contents.</p>
         */
        public void startElement(String namespaceURI, String localName,
                String qualifiedName, Attributes atts) throws SAXException {
            depth = depth + 1;
            printText("startElement", localName);
            if ((debugLevel > 0) && (atts.getLength() > 0)) {
                printAttributes(atts);
            }

            // Grow the tag stack instead of failing on deeply nested input.
            if (depth >= tags.length) {
                String[] larger = new String[Math.max(tags.length * 2, depth + 1)];
                System.arraycopy(tags, 0, larger, 0, tags.length);
                tags = larger;
            }
            tags[depth] = localName;

            StringBuilder tagList = new StringBuilder();
            for (int i = 0; i <= depth; i++) {
                tagList.append('<').append(tags[i]).append('>');
            }
            String path = tagList.toString();
            if (debugLevel > 0) {
                System.out.println(path);
            }

            if (mode == SEARCH) {
                if (activeSection) {
                    // Nested element inside a match: keep its start tag.
                    activeString.append('<').append(localName).append('>');
                }
                if (path.equals(searchString)) {
                    if (debugLevel > 0) {
                        System.out.println("Match found");
                    }
                    activeSection = true;
                    activeDepth = depth;
                    activeString = new StringBuffer();
                }
            } else if (mode == LISTTAGS) {
                items.add(path);
            }
        }

        /**
         * Called when an end tag is encountered.
         * <p>In SEARCH mode this either closes a nested element inside
         *    the captured text or, when the matched element itself ends,
         *    stores the captured text in the result list.</p>
         */
        public void endElement(String namespaceURI, String localName,
                String qualifiedName) throws SAXException {
            printText("endElement", localName);
            depth = depth - 1;
            if ((mode != SEARCH) || !activeSection) {
                return;
            }
            if (depth < activeDepth) {
                // The matched element has just been closed.
                items.add(activeString.toString());
                activeString = new StringBuffer();
                activeSection = false;
            } else {
                activeString.append("</").append(localName).append('>');
            }
        }

        /**
         * This method is called when text is encountered between start
         * and end tags.
         * <p>Multiple calls of this method may be executed to handle the
         *    text between the tags.  The five XML special characters are
         *    written back as entity references.</p>
         */
        public void characters(char[] text, int start, int length)
                throws SAXException {
            boolean capture = (mode == SEARCH) && activeSection;
            if (!capture && (debugLevel <= 0)) {
                return; // nothing would use the escaped text
            }
            // '&' must be replaced first so the entities added below
            // are not escaped a second time.
            String data = new String(text, start, length)
                    .replace("&", "&amp;")
                    .replace("'", "&apos;")
                    .replace("\"", "&quot;")
                    .replace("<", "&lt;")
                    .replace(">", "&gt;");
            if (capture) {
                activeString.append(data);
            }
            printText("characters", data);
        }

        /**
         * Called when ignorable whitespace is encountered.
         * <p>Ignorable whitespace is ignored and no action is
         *    taken.</p>
         */
        public void ignorableWhitespace(char[] text, int start, int length)
                throws SAXException {
            printText("ignorableWhitespace");
        }

        /** No action is taken apart from diagnostic output. */
        public void processingInstruction(String target, String data)
                throws SAXException {
            printText("processingInstruction");
        }

        /** No action is taken apart from diagnostic output. */
        public void skippedEntity(String name) throws SAXException {
            printText("skippedEntity");
        }
    }

    /**
     * Determine amount of diagnostic output.
     *
     * @param level Amount of diagnostic material to be printed.  0
     *        is default and results in no diagnostic messages.  Any
     *        value above 0 turns the diagnostic messages on.
     */
    public void setDebugLevel(int level) {
        debugLevel = level;
    }

    /**
     * This method parses an XML document for strings.
     *
     * @param start Initial Vector of String objects; null is treated as empty
     * @param document This string contains the document to be parsed
     * @param search This string indicates the set of tags to be searched
     *        for.  If the value is
     *        <code>&lt;Envelope&gt;&lt;Body&gt;&lt;FetchHandle&gt;</code>,
     *        the program will return the contents of all
     *        <code>FetchHandle</code> tags which are within
     *        <code>Body</code> tags which are within
     *        <code>Envelope</code> tags.
     * @return Vector of String objects that contains all of the
     *         objects from the initial list plus the items found in
     *         document, or null if no XML reader could be created
     */
    public Vector<String> parseString(Vector<String> start, String document,
            String search) {
        return internalParse(start,
                new InputSource(new StringReader(document)),
                search, SEARCH);
    }

    /**
     * This method parses an XML document for strings.
     *
     * @param document This string contains the document to be parsed
     * @param search This string indicates the set of tags to be searched
     *        for; see {@link #parseString(Vector, String, String)}
     * @return Vector of String objects that contains all of the
     *         objects found in document, or null if no XML reader
     *         could be created
     */
    public Vector<String> parseString(String document, String search) {
        if (debugLevel > 0) {
            System.out.println("*** Starting parseString");
            System.out.println("Search string is " + search);
            System.out.println(document);
        }
        return internalParse(new Vector<String>(),
                new InputSource(new StringReader(document)),
                search, SEARCH);
    }

    /**
     * This method parses an XML file for strings.
     *
     * @param document File object representing the file to be parsed
     * @param search This string indicates the set of tags to be searched
     *        for; see {@link #parseString(Vector, String, String)}
     * @return Vector of String objects that contains all of the
     *         objects found in document, or null if the file could not
     *         be opened or no XML reader could be created
     */
    public Vector<String> parseString(File document, String search) {
        return parseFile(new Vector<String>(), document, search, SEARCH,
                "parseString");
    }

    /**
     * This method parses an XML file for strings.
     *
     * @param start Vector containing the String objects at the
     *        start of executing the method; null is treated as empty
     * @param document File object representing the file to be parsed
     * @param search This string indicates the set of tags to be searched
     *        for; see {@link #parseString(Vector, String, String)}
     * @return Vector of String objects that contains the initial
     *         objects plus the objects found in document, or null if
     *         the file could not be opened or no XML reader could be
     *         created
     */
    public Vector<String> parseString(Vector<String> start,
            File document, String search) {
        return parseFile(start, document, search, SEARCH, "parseString");
    }

    /**
     * List tags contained in an XML document.
     *
     * <p>This method returns a Vector containing String objects.</p>
     * <p>Each String object contains a sequence of tags found
     * in the document.</p>
     *
     * @param start Initial vector of String objects to which
     *        items are to be appended; null is treated as empty
     * @param document Document to be parsed.
     * @return Vector of String objects containing list of tag
     *         combinations, or null if no XML reader could be created
     */
    public Vector<String> listTags(Vector<String> start, String document) {
        return internalParse(start,
                new InputSource(new StringReader(document)),
                (String) null, LISTTAGS);
    }

    /**
     * List tags contained in an XML document.
     *
     * <p>Each String object in the result contains a sequence of tags
     * found in the document.</p>
     *
     * @param document Document to be parsed.
     * @return Vector of String objects containing list of tag
     *         combinations, or null if no XML reader could be created
     */
    public Vector<String> listTags(String document) {
        return internalParse(new Vector<String>(),
                new InputSource(new StringReader(document)),
                (String) null, LISTTAGS);
    }

    /**
     * This method lists tags contained in an XML file.
     *
     * @param document File object representing the file to be parsed
     * @return Vector of String objects that contains a listing
     *         of the tags in the document, or null if the file could
     *         not be opened or no XML reader could be created
     */
    public Vector<String> listTags(File document) {
        return parseFile(new Vector<String>(), document, (String) null,
                LISTTAGS, "listTags");
    }

    /**
     * This method lists tags contained in an XML file.
     *
     * @param start Vector containing the String objects at the
     *        start of executing the method; null is treated as empty
     * @param document Object containing the file to be parsed
     * @return Vector of String objects that contains the initial
     *         objects plus all of the tags found in the document, or
     *         null if the file could not be opened or no XML reader
     *         could be created
     */
    public Vector<String> listTags(Vector<String> start,
            File document) {
        return parseFile(start, document, (String) null, LISTTAGS,
                "listTags");
    }

    /**
     * Opens a file, parses it and closes the stream again.
     *
     * @param caller name of the public method, used in diagnostics only
     * @return result of the parse, or null if the file cannot be opened
     */
    private Vector<String> parseFile(Vector<String> start, File document,
            String search, int mode, String caller) {
        if (debugLevel > 0) {
            System.out.println("*** Starting " + caller);
            if (mode == SEARCH) {
                System.out.println("Search string is " + search);
            }
            System.out.println(document);
        }
        FileInputStream stream;
        try {
            stream = new FileInputStream(document);
        } catch (FileNotFoundException e) {
            System.out.println("Unable to open file");
            e.printStackTrace();
            return null;
        }
        try {
            return internalParse(start, new InputSource(stream), search, mode);
        } finally {
            try {
                stream.close();
            } catch (IOException ignored) {
                // The document has already been read; a failure to
                // close cannot change the result.
            }
        }
    }

    /**
     * Creates the SAX reader: Xerces when available, otherwise the
     * default reader of the runtime.
     *
     * @return the reader, or null if none could be created
     */
    private XMLReader createReader() {
        XMLReader reader;
        try {
            reader = XMLReaderFactory.createXMLReader(XERCES_PARSER);
        } catch (SAXException xercesUnavailable) {
            if (debugLevel > 0) {
                System.out.println("Xerces parser not available, "
                        + "using default XMLReader");
            }
            try {
                reader = XMLReaderFactory.createXMLReader();
            } catch (SAXException e) {
                System.out.println("SAXException error when creating XMLReader");
                return null;
            }
        }
        try {
            // The handler relies on local names being reported.
            reader.setFeature(NAMESPACES_FEATURE, true);
        } catch (SAXException e) {
            if (debugLevel > 0) {
                System.out.println("Unable to enable namespace processing: "
                        + e.getMessage());
            }
        }
        // NOTE(review): DTD and external entity handling are left at the
        // reader's defaults; harden the reader before parsing documents
        // from untrusted sources.
        return reader;
    }

    /**
     * Runs one parse and collects the results.
     *
     * <p>Parsing errors are reported on standard output and the items
     * collected up to the error are still returned.</p>
     *
     * @param start initial contents of the result; null is treated as empty
     * @param document source to be parsed
     * @param search tag chain to search for (SEARCH mode), may be null
     * @param mode {@link #LISTTAGS} or {@link #SEARCH}
     * @return a new Vector holding the initial items followed by the
     *         items found, or null if no XML reader could be created
     */
    private Vector<String> internalParse(Vector<String> start,
            InputSource document,
            String search, int mode) {
        this.mode = mode;
        items = (start == null)
                ? new Vector<String>()
                : new Vector<String>(start);
        XMLReader parser = createReader();
        if (parser == null) {
            return null;
        }
        parser.setContentHandler(new MyHandler(mode, search));
        try {
            parser.parse(document);
        } catch (SAXParseException e) {
            System.out.println("SAXParseException at line "
                    + e.getLineNumber() + ", column " + e.getColumnNumber()
                    + ": " + e.getMessage());
        } catch (SAXException e) {
            System.out.println("SAXException while parsing: "
                    + e.getMessage());
        } catch (IOException e) {
            System.out.println("IOException while parsing: "
                    + e.getMessage());
        }
        return items;
    }
}