SourceForge: xtf/xtf: WEB-INF/src/org/cdlib/xtf/dynaXML/DefaultDocLocator.java@de7d8a406bef
WEB-INF/src/org/cdlib/xtf/dynaXML/DefaultDocLocator.java
author mhaye
Fri Aug 04 18:52:21 2006 +0000 (2006-08-04)
changeset 1035 de7d8a406bef
parent 925 fbe4d6def576
child 1232 a9e223205089
permissions -rw-r--r--
Make whitespace stripping optional.
     1 package org.cdlib.xtf.dynaXML;
     2 
     3 /*
     4  * Copyright (c) 2004, Regents of the University of California
     5  * All rights reserved.
     6  * 
     7  * Redistribution and use in source and binary forms, with or without 
     8  * modification, are permitted provided that the following conditions are met:
     9  *
    10  * - Redistributions of source code must retain the above copyright notice, 
    11  *   this list of conditions and the following disclaimer.
    12  * - Redistributions in binary form must reproduce the above copyright notice, 
    13  *   this list of conditions and the following disclaimer in the documentation 
    14  *   and/or other materials provided with the distribution.
    15  * - Neither the name of the University of California nor the names of its
    16  *   contributors may be used to endorse or promote products derived from this 
    17  *   software without specific prior written permission.
    18  *
    19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
    20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
    21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
    22  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
    23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
    24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
    25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
    26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
    27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
    28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
    29  * POSSIBILITY OF SUCH DAMAGE.
    30  */
    31 
    32 import java.io.File;
    33 import java.io.FileInputStream;
    34 import java.io.IOException;
    35 import java.io.InputStream;
    36 
    37 import javax.xml.parsers.SAXParser;
    38 import javax.xml.transform.Templates;
    39 import javax.xml.transform.sax.SAXResult;
    40 
    41 import net.sf.saxon.event.Receiver;
    42 import net.sf.saxon.event.ReceivingContentHandler;
    43 import net.sf.saxon.om.NamePool;
    44 
    45 import org.cdlib.xtf.lazyTree.LazyTreeBuilder;
    46 import org.cdlib.xtf.servletBase.TextServlet;
    47 import org.cdlib.xtf.textEngine.IndexUtil;
    48 import org.cdlib.xtf.util.DocTypeDeclRemover;
    49 import org.cdlib.xtf.util.Path;
    50 import org.cdlib.xtf.util.StructuredFile;
    51 import org.cdlib.xtf.util.StructuredStore;
    52 import org.xml.sax.Attributes;
    53 import org.xml.sax.ContentHandler;
    54 import org.xml.sax.InputSource;
    55 import org.xml.sax.SAXException;
    56 import org.xml.sax.helpers.DefaultHandler;
    57 
    58 /*
    59  * This file created on Mar 11, 2005 by Martin Haye
    60  */
    61 
    62 /**
    63  * Provides local filesystem-based access to lazy and non-lazy versions of
    64  * a source XML document.
    65  * 
    66  * @author Martin Haye
    67  */
    68 public class DefaultDocLocator implements DocLocator 
    69 {
    70     /** Servlet we are part of */
    71     private TextServlet servlet;
    72     
    73     /** Attach to a servlet */
    74     public void setServlet( TextServlet servlet ) {
    75         this.servlet = servlet;
    76     }
    77     
    78     /**
    79      * Search for a StructuredStore containing the "lazy" or persistent
    80      * representation of a given document. Index parameters are specified,
    81      * since often the lazy file is stored along with the index. This method
    82      * is called first, and if it returns null, then 
    83      * {@link #getInputSource(String, boolean)} will be called as a fall-back.
    84      * 
    85      * @param indexConfigPath Path to the index configuration file
    86      * @param indexName       Name of the index being searched
    87      * @param sourcePath      Path to the source document
    88      * @param preFilter       Prefilter stylesheet to run (or null for none)
    89      * @param removeDoctypeDecl Set to true to remove DOCTYPE declaration from
    90      *                          the XML document.
    91      * 
    92      * @return                Store containing the tree, or null if none
    93      *                        could be found.
    94      */
    95     public StructuredStore getLazyStore( String    indexConfigPath, 
    96                                          String    indexName,
    97                                          String    sourcePath,
    98                                          Templates preFilter,
    99                                          boolean   removeDoctypeDecl ) 
   100         throws IOException 
   101     {
   102         // If no 'index' specified in the docInfo, then there's no way we can
   103         // find the lazy file.
   104         //
   105         boolean useLazy = true;
   106         if( indexConfigPath == null || indexName == null )
   107             return null;
   108         
   109         // If the source isn't a local file, we also can't use a lazy file.
   110         if( sourcePath.startsWith("http:") )
   111             return null;
   112         if( sourcePath.startsWith("https:") )
   113             return null;
   114         
   115         // If it's a directory, something went wrong. No lazy file for sure.
   116         File sourceFile = new File( sourcePath );
   117         if( !sourceFile.isFile() )
   118             return null;
   119             
   120         // Figure out where the lazy file is (or should be.)
   121         File lazyFile = IndexUtil.calcLazyPath( 
   122                               new File(servlet.getRealPath("")),
   123                               new File(indexConfigPath), indexName,
   124                               new File(sourcePath), false );
   125         
   126         // If we can't read it, try to build it instead.
   127         if( !lazyFile.canRead() ) {
   128             boolean stripWhitespace = false;
   129             try {
   130                 stripWhitespace = IndexUtil.getIndexInfo(
   131                     new File(indexConfigPath), indexName).stripWhitespace;
   132             }
   133             catch( Exception e ) { }
   134             
   135             buildLazyStore( lazyFile, sourcePath, preFilter,
   136                             removeDoctypeDecl, stripWhitespace );
   137         }
   138         
   139         // Cool. Open the lazy file.
   140         return StructuredFile.open( lazyFile );
   141         
   142     } // getLazyStore()
   143 
   144     /**
   145      * Retrieve the data stream for an XML source document. 
   146      * 
   147      * @param sourcePath  Path to the source document
   148      * @param removeDoctypeDecl Set to true to remove DOCTYPE declaration from
   149      *                          the XML document.
   150      * 
   151      * @return            Data stream for the document.
   152      */
   153     public InputSource getInputSource( String sourcePath,
   154                                        boolean removeDoctypeDecl ) 
   155         throws IOException 
   156     {
   157         // If it's non-local, load the URL.
   158         if( sourcePath.startsWith("http:") ||
   159             sourcePath.startsWith("https:") )
   160         {
   161             return new InputSource( sourcePath );
   162         }
   163         
   164         // Okay, assume it's a local file.
   165         InputStream inStream = new FileInputStream( sourcePath );
   166         
   167         // Remove DOCTYPE declarations, since the XML reader will barf 
   168         // if it can't resolve the entity reference, and we really 
   169         // don't care one way or the other.
   170         //
   171         if( removeDoctypeDecl )
   172             inStream = new DocTypeDeclRemover( inStream );
   173         
   174         // Make the input source, and give it a real system ID.
   175         InputSource inSrc = new InputSource( inStream );
   176         inSrc.setSystemId( new File(sourcePath).toURL().toString() );
   177         
   178         // All done!
   179         return inSrc;
   180     } // getInputSource()
   181 
   182     /**
   183      * Create a lazy document by loading the original, building the lazy
   184      * tree, and writing it out.
   185      * 
   186      * @param lazyFile      Lazy file to create
   187      * @param sourcePath    Path to the source document
   188      * @param preFilter     A prefilter stylesheet (or null for no pre-filtering.)
   189      * @param removeDoctypeDecl true to remove DOCTYPE declarations from the
   190      *                          XML document
   191      * @param stripWhitespace If set, whitespace will be removed between elements
   192      *                        in the lazy file.
   193      */
   194     private void buildLazyStore( File lazyFile, 
   195                                  String sourcePath,
   196                                  Templates preFilter,
   197                                  boolean removeDoctypeDecl,
   198                                  boolean stripWhitespace )
   199         throws IOException
   200     {
   201         // The directory the lazy file is to be stored in might not exist yet.
   202         // If not, we need to create it now before making the lazy file.
   203         //
   204         Path.createPath( lazyFile.getParent() );
   205       
   206         // While we parse the source document, we're going to also build up 
   207         // a tree that will be written to the lazy file.
   208         //
   209         LazyTreeBuilder lazyBuilder = new LazyTreeBuilder();
   210         StructuredStore lazyStore = StructuredFile.create( lazyFile );
   211         Receiver lazyReceiver = lazyBuilder.begin( lazyStore );
   212         
   213         lazyBuilder.setNamePool( NamePool.getDefaultNamePool() );
   214         
   215         ReceivingContentHandler lazyHandler = new ReceivingContentHandler();
   216         lazyHandler.setReceiver( lazyReceiver );
   217         lazyHandler.setPipelineConfiguration( 
   218             lazyReceiver.getPipelineConfiguration() );
   219         
   220         // Instantiate a new XML parser, being sure to get the right one.
   221         SAXParser xmlParser = IndexUtil.createSAXParser();
   222     
   223         // Open the source file for reading
   224         InputStream inStream = new FileInputStream( sourcePath );
   225           
   226         // Apply the standard set of document filters.
   227         InputSource inSrc = new InputSource( 
   228             IndexUtil.filterXMLDocument(inStream, xmlParser,
   229                                         removeDoctypeDecl) );
   230         
   231         // Put a proper system ID onto the InputSource.
   232         inSrc.setSystemId( new File(sourcePath).toURL().toString() );
   233         
   234         // Make a DefaultHandler that will pass events to the lazy receiver.
   235         LazyPassthru passthru = new LazyPassthru( lazyHandler, stripWhitespace );
   236         
   237         // Apply a prefilter if one was specified.
   238         if( preFilter == null ) {
   239             try {
   240                 xmlParser.parse( inSrc, passthru );
   241             }
   242             catch( Exception e ) { throw new RuntimeException( e ); }
   243         }
   244         else 
   245         {
   246             // Apply the pre-filter.
   247             try {
   248                 Templates[] array = new Templates[1];
   249                 array[0] = preFilter;
   250                 IndexUtil.applyPreFilters( array, 
   251                                            xmlParser.getXMLReader(), 
   252                                            inSrc, 
   253                                            new SAXResult( passthru ) );
   254             }
   255             catch( Exception e ) { throw new RuntimeException( e ); }
   256         }
   257         
   258         // Finish off the lazy file.
   259         lazyBuilder.finish( lazyReceiver, true );
   260         
   261     } // buildLazyStore()
   262     
   263     /** 
   264      * Passes SAX events to a ContentHandler. Also performs character
   265      * buffering that mimics what the textIndexer normally does.
   266      */
   267     private static class LazyPassthru extends DefaultHandler
   268     {
   269       private StringBuffer   charBuf = new StringBuffer();
   270       private ContentHandler lazyHandler;
   271       private boolean        stripWhitespace;
   272       
   273       public LazyPassthru( ContentHandler lazyHandler,
   274                            boolean        stripWhitespace ) 
   275       {
   276           this.lazyHandler = lazyHandler;
   277           this.stripWhitespace = stripWhitespace;
   278       }
   279       
   280       public void startDocument() throws SAXException {
   281           lazyHandler.startDocument();
   282       }
   283       public void startElement( String uri,   String localName,
   284                                 String qName, Attributes atts ) 
   285           throws SAXException
   286       {
   287           flushCharacters();
   288           lazyHandler.startElement( uri, localName, qName, atts );
   289       }
   290       public void endElement( String uri, String localName, String qName )
   291           throws SAXException
   292       {
   293           flushCharacters();
   294           lazyHandler.endElement( uri, localName, qName );
   295       }
   296       public void processingInstruction( String target, String data )
   297           throws SAXException
   298       {
   299           lazyHandler.processingInstruction( target, data );
   300       }
   301       public void endDocument()
   302           throws SAXException
   303       {
   304           lazyHandler.endDocument();
   305       }
   306       public void characters( char[] ch, int start, int length ) {
   307           charBuf.append( ch, start, length );
   308       }
   309       private void flushCharacters()
   310           throws SAXException
   311       {
   312           // If the entire buffer is whitespace (or empty), we can safely 
   313           // strip it.
   314           //
   315           int i = 0;
   316           if (stripWhitespace) {
   317               for( i = 0; i < charBuf.length(); i++ )
   318                   if( !Character.isWhitespace(charBuf.charAt(i)) ) break;
   319           }
   320           if( i < charBuf.length() )
   321               lazyHandler.characters( charBuf.toString().toCharArray(),
   322                                       0, charBuf.length() );
   323           charBuf.setLength( 0 );
   324       }
   325     };
   326 
   327 } // class DefaultDocLocator