Make whitespace stripping optional.
1 package org.cdlib.xtf.dynaXML;
4 * Copyright (c) 2004, Regents of the University of California
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
10 * - Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 * - Redistributions in binary form must reproduce the above copyright notice,
13 * this list of conditions and the following disclaimer in the documentation
14 * and/or other materials provided with the distribution.
15 * - Neither the name of the University of California nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
33 import java.io.FileInputStream;
34 import java.io.IOException;
35 import java.io.InputStream;
37 import javax.xml.parsers.SAXParser;
38 import javax.xml.transform.Templates;
39 import javax.xml.transform.sax.SAXResult;
41 import net.sf.saxon.event.Receiver;
42 import net.sf.saxon.event.ReceivingContentHandler;
43 import net.sf.saxon.om.NamePool;
45 import org.cdlib.xtf.lazyTree.LazyTreeBuilder;
46 import org.cdlib.xtf.servletBase.TextServlet;
47 import org.cdlib.xtf.textEngine.IndexUtil;
48 import org.cdlib.xtf.util.DocTypeDeclRemover;
49 import org.cdlib.xtf.util.Path;
50 import org.cdlib.xtf.util.StructuredFile;
51 import org.cdlib.xtf.util.StructuredStore;
52 import org.xml.sax.Attributes;
53 import org.xml.sax.ContentHandler;
54 import org.xml.sax.InputSource;
55 import org.xml.sax.SAXException;
56 import org.xml.sax.helpers.DefaultHandler;
59 * This file created on Mar 11, 2005 by Martin Haye
63 * Provides local filesystem-based access to lazy and non-lazy versions of
64 * a source XML document.
68 public class DefaultDocLocator implements DocLocator
70 /** Servlet this locator is attached to; assigned by setServlet() and used by getLazyStore() to resolve the servlet's real path. */
71 private TextServlet servlet;
73 /**
 * Attaches this locator to a servlet by storing the reference in the
 * servlet field. No validation is performed on the argument.
 *
 * @param servlet the servlet this locator becomes part of
 */
74 public void setServlet( TextServlet servlet ) {
75 this.servlet = servlet;
79 * Search for a StructuredStore containing the "lazy" or persistent
80 * representation of a given document. Index parameters are specified,
81 * since often the lazy file is stored along with the index. This method
82 * is called first, and if it returns null, then
83 * {@link #getInputSource(String, boolean)} will be called as a fall-back.
85 * @param indexConfigPath Path to the index configuration file
86 * @param indexName Name of the index being searched
87 * @param sourcePath Path to the source document
88 * @param preFilter Prefilter stylesheet to run (or null for none)
89 * @param removeDoctypeDecl Set to true to remove the DOCTYPE declaration from the source document when the lazy file has to be built
92 * @return Store containing the tree, or null if none
// Locates the lazy file for a source document, builds it first when it
// cannot be read, and returns the store opened on that file.
95 public StructuredStore getLazyStore( String indexConfigPath,
99 boolean removeDoctypeDecl )
102 // If no 'index' specified in the docInfo, then there's no way we can
103 // find the lazy file: both the index configuration path and the index
// name are needed to compute its location.
105 boolean useLazy = true;
106 if( indexConfigPath == null || indexName == null )
109 // If the source isn't a local file, we also can't use a lazy file.
// A path is treated as non-local when it starts with http: or https:.
110 if( sourcePath.startsWith("http:") )
112 if( sourcePath.startsWith("https:") )
115 // No lazy file unless the path names an existing regular file; isFile()
// is false for directories and also for paths that do not exist.
116 File sourceFile = new File( sourcePath );
117 if( !sourceFile.isFile() )
120 // Figure out where the lazy file is (or should be.) The location is
// derived from the servlet's real path, the index configuration file,
// the index name and the source file.
// NOTE(review): the meaning of the trailing false argument is defined by
// IndexUtil.calcLazyPath and is not evident here -- confirm.
121 File lazyFile = IndexUtil.calcLazyPath(
122 new File(servlet.getRealPath("")),
123 new File(indexConfigPath), indexName,
124 new File(sourcePath), false );
126 // If we can't read it, try to build it instead.
127 if( !lazyFile.canRead() ) {
// Whitespace stripping follows the index's own setting and defaults to
// false when that setting cannot be obtained.
128 boolean stripWhitespace = false;
130 stripWhitespace = IndexUtil.getIndexInfo(
131 new File(indexConfigPath), indexName).stripWhitespace;
// NOTE(review): every exception from the index-info lookup is silently
// discarded here, so a broken index configuration goes unreported and
// the lazy file is built without stripping -- consider at least logging.
133 catch( Exception e ) { }
135 buildLazyStore( lazyFile, sourcePath, preFilter,
136 removeDoctypeDecl, stripWhitespace );
139 // Cool. Open the lazy file (either pre-existing or just built).
140 return StructuredFile.open( lazyFile );
145 * Retrieve the data stream for an XML source document.
147 * @param sourcePath Path to the source document
148 * @param removeDoctypeDecl Set to true to remove the DOCTYPE declaration from a local source file as it is read (not applied to http/https sources)
151 * @return Data stream for the document.
// Produces the SAX InputSource for a source document that is either a
// remote http/https URL or a local file.
153 public InputSource getInputSource( String sourcePath,
154 boolean removeDoctypeDecl )
157 // If it's non-local, load the URL. The path is passed as the system
// identifier, so fetching is left to whoever consumes the InputSource;
// removeDoctypeDecl has no effect on this branch.
158 if( sourcePath.startsWith("http:") ||
159 sourcePath.startsWith("https:") )
161 return new InputSource( sourcePath );
164 // Okay, assume it's a local file.
// NOTE(review): the stream opened here is wrapped by the InputSource and
// is not closed in the visible code; presumably the consumer is expected
// to close it -- verify.
165 InputStream inStream = new FileInputStream( sourcePath );
167 // Remove DOCTYPE declarations, since the XML reader will fail
168 // if it can't resolve the entity reference, and the declaration
169 // is not needed for anything done here.
171 if( removeDoctypeDecl )
172 inStream = new DocTypeDeclRemover( inStream );
174 // Make the input source, and give it a real system ID (the URL form of
// the local file path).
// NOTE(review): File.toURL() is deprecated because it does not escape
// characters that are illegal in URLs; File.toURI().toURL() is the
// documented replacement. Switching changes the system ID for paths with
// such characters (e.g. spaces) -- confirm nothing depends on the old form.
175 InputSource inSrc = new InputSource( inStream );
176 inSrc.setSystemId( new File(sourcePath).toURL().toString() );
180 } // getInputSource()
183 * Create a lazy document by loading the original, building the lazy
184 * tree, and writing it out.
186 * @param lazyFile Lazy file to create
187 * @param sourcePath Path to the source document
188 * @param preFilter A prefilter stylesheet (or null for no pre-filtering.)
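189 * @param removeDoctypeDecl true to remove DOCTYPE declarations from the source document while it is being read
191 * @param stripWhitespace If set, runs of character data consisting solely of whitespace are dropped instead of being written to the lazy tree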
// Parses the source document, optionally through a prefilter stylesheet,
// and streams the resulting SAX events into a lazy tree written to lazyFile.
194 private void buildLazyStore( File lazyFile,
197 boolean removeDoctypeDecl,
198 boolean stripWhitespace )
201 // The directory the lazy file is to be stored in might not exist yet.
202 // If not, we need to create it now before making the lazy file.
204 Path.createPath( lazyFile.getParent() );
206 // While we parse the source document, we're going to also build up
207 // a tree that will be written to the lazy file.
209 LazyTreeBuilder lazyBuilder = new LazyTreeBuilder();
210 StructuredStore lazyStore = StructuredFile.create( lazyFile );
211 Receiver lazyReceiver = lazyBuilder.begin( lazyStore );
// The builder is pointed at the default name pool.
213 lazyBuilder.setNamePool( NamePool.getDefaultNamePool() );
// Adapter that turns SAX ContentHandler callbacks into Receiver events;
// it shares the receiver's pipeline configuration.
215 ReceivingContentHandler lazyHandler = new ReceivingContentHandler();
216 lazyHandler.setReceiver( lazyReceiver );
217 lazyHandler.setPipelineConfiguration(
218 lazyReceiver.getPipelineConfiguration() );
220 // Instantiate a new XML parser, being sure to get the right one.
221 SAXParser xmlParser = IndexUtil.createSAXParser();
223 // Open the source file for reading
// NOTE(review): no close of this stream is visible in this method; if
// nothing downstream closes it, the file handle leaks (notably when
// parsing throws) -- verify and consider closing it in a finally block.
224 InputStream inStream = new FileInputStream( sourcePath );
226 // Apply the standard set of document filters (including optional
// DOCTYPE removal, controlled by removeDoctypeDecl).
227 InputSource inSrc = new InputSource(
228 IndexUtil.filterXMLDocument(inStream, xmlParser,
229 removeDoctypeDecl) );
231 // Put a proper system ID onto the InputSource.
// NOTE(review): File.toURL() is deprecated (it does not escape characters
// illegal in URLs); File.toURI().toURL() is the documented replacement.
232 inSrc.setSystemId( new File(sourcePath).toURL().toString() );
234 // Make a DefaultHandler that will pass events to the lazy receiver,
// buffering character data and optionally dropping whitespace-only runs.
235 LazyPassthru passthru = new LazyPassthru( lazyHandler, stripWhitespace );
237 // With no prefilter, parse the source straight into the passthru handler.
238 if( preFilter == null ) {
240 xmlParser.parse( inSrc, passthru );
// Any failure is rethrown unchecked with the original exception as cause.
242 catch( Exception e ) { throw new RuntimeException( e ); }
246 // Apply the pre-filter: the single stylesheet is wrapped in a
// one-element array and its output is delivered to the passthru handler.
248 Templates[] array = new Templates[1];
249 array[0] = preFilter;
250 IndexUtil.applyPreFilters( array,
251 xmlParser.getXMLReader(),
253 new SAXResult( passthru ) );
// Same policy as the unfiltered branch: wrap and rethrow unchecked.
255 catch( Exception e ) { throw new RuntimeException( e ); }
258 // Finish off the lazy file.
259 lazyBuilder.finish( lazyReceiver, true );
261 } // buildLazyStore()
264 * Passes SAX events to a ContentHandler. Also performs character
265 * buffering that mimics what the textIndexer normally does.
// SAX handler that relays events to another ContentHandler while buffering
// character data, so that a whole text run can be inspected (and dropped
// when it is only whitespace and stripping is enabled) before it is relayed.
267 private static class LazyPassthru extends DefaultHandler
// Accumulates character data between flushes.
269 private StringBuffer charBuf = new StringBuffer();
// Destination for all relayed events.
270 private ContentHandler lazyHandler;
// When true, whitespace-only text runs are discarded at flush time.
271 private boolean stripWhitespace;
// Stores the destination handler and the whitespace-stripping flag.
273 public LazyPassthru( ContentHandler lazyHandler,
274 boolean stripWhitespace )
276 this.lazyHandler = lazyHandler;
277 this.stripWhitespace = stripWhitespace;
// Relayed unchanged to the destination handler.
280 public void startDocument() throws SAXException {
281 lazyHandler.startDocument();
// Relays the element start to the destination handler.
// NOTE(review): buffered text is presumably flushed via flushCharacters()
// before each relayed structural event -- verify.
283 public void startElement( String uri, String localName,
284 String qName, Attributes atts )
288 lazyHandler.startElement( uri, localName, qName, atts );
// Relays the element end to the destination handler.
290 public void endElement( String uri, String localName, String qName )
294 lazyHandler.endElement( uri, localName, qName );
// Relays the processing instruction to the destination handler.
296 public void processingInstruction( String target, String data )
299 lazyHandler.processingInstruction( target, data );
// Relays the end of the document to the destination handler.
301 public void endDocument()
304 lazyHandler.endDocument();
// Character data is only appended to the buffer here; nothing is relayed
// until the buffer is flushed.
306 public void characters( char[] ch, int start, int length ) {
307 charBuf.append( ch, start, length );
// Relays the buffered text (if it qualifies) as one characters() event and
// then empties the buffer.
309 private void flushCharacters()
312 // If the entire buffer is whitespace (or empty), we can safely
// drop it when stripping is enabled: the scan below leaves i equal to the
// buffer length in that case, so nothing is relayed.
// NOTE(review): with stripping disabled the scan is skipped, so i keeps
// its initial value (presumably 0) and any non-empty buffer is relayed --
// confirm against the declaration of i.
316 if (stripWhitespace) {
317 for( i = 0; i < charBuf.length(); i++ )
318 if( !Character.isWhitespace(charBuf.charAt(i)) ) break;
// The whole buffer is relayed untrimmed, including any leading or trailing
// whitespace around the non-whitespace content.
320 if( i < charBuf.length() )
321 lazyHandler.characters( charBuf.toString().toCharArray(),
322 0, charBuf.length() );
// Reset for the next text run whether or not anything was relayed.
323 charBuf.setLength( 0 );
327 } // class DefaultDocLocator