View Javadoc

1   package net.sourceforge.blogentis.utils;
2   
3   //-----------------------------------------------------------------------
4   //Blogentis - a blog publishing platform.
5   //Copyright (C) 2004 Tassos Bassoukos <abassouk@gmail.com>
6   //
7   //This library is free software; you can redistribute it and/or
8   //modify it under the terms of the GNU Lesser General Public
9   //License as published by the Free Software Foundation; either
10  //version 2.1 of the License, or (at your option) any later version.
11  //
12  //This library is distributed in the hope that it will be useful,
13  //but WITHOUT ANY WARRANTY; without even the implied warranty of
14  //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  //Lesser General Public License for more details.
16  //
17  //You should have received a copy of the GNU Lesser General Public
18  //License along with this library; if not, write to the Free Software
19  //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20  //-----------------------------------------------------------------------
21  //
22  //$Id: JTidyService.java,v 1.1 2004/10/22 17:34:05 tassos Exp $
23  //
24  
25  import java.io.ByteArrayInputStream;
26  import java.io.ByteArrayOutputStream;
27  import java.io.InputStream;
28  import java.io.UnsupportedEncodingException;
29  
30  import javax.xml.transform.TransformerException;
31  
32  import org.apache.commons.logging.Log;
33  import org.apache.commons.logging.LogFactory;
34  import org.apache.turbine.services.TurbineBaseService;
35  import org.apache.turbine.services.TurbineServices;
36  import org.apache.turbine.services.servlet.TurbineServlet;
37  import org.apache.xpath.XPathAPI;
38  import org.w3c.dom.Document;
39  import org.w3c.dom.Node;
40  import org.w3c.dom.NodeList;
41  import org.w3c.tidy.DOMNodeImpl;
42  import org.w3c.tidy.OutImpl;
43  import org.w3c.tidy.PPrint;
44  import org.w3c.tidy.StreamIn;
45  import org.w3c.tidy.Tidy;
46  
47  /***
48   * Service that handles cleaning up HTML with JTidy.
49   * 
50   * @author abas
51   */
52  public class JTidyService
53          extends TurbineBaseService {
54      public static final String DEFAULT_CONFIGURATION = "WEB-INF/conf/JTidy.properties";
55  
56      public static final String CONFIGURATION_PARAMETER = "configuration";
57  
58      private static Log log = LogFactory.getLog(JTidyService.class);
59  
60      public static final String SERVICE_NAME = "JTidyService";
61  
62      public static JTidyService getInstane() {
63          return (JTidyService)TurbineServices.getInstance()
64              .getService(SERVICE_NAME);
65      }
66  
67      private String configurationPath = null;
68  
69      public void init() {
70          configurationPath = TurbineServlet.getRealPath(getConfiguration()
71              .getString(CONFIGURATION_PARAMETER, DEFAULT_CONFIGURATION));
72          setInit(true);
73      }
74  
75      public void shutdown() {
76          configurationPath = null;
77          setInit(false);
78      }
79  
80      public static Tidy getTidy() {
81          Tidy t = new Tidy();
82          t.setConfigurationFromFile(getInstane().configurationPath);
83          return t;
84      }
85  
86      /***
87       * Parse the given HTML fragment and produce the cleaned-up DOM tree.
88       * 
89       * @param html
90       *            the HTML fragment
91       * @return the BODY element of the parsed HTML document. Its children will
92       *         be tags, text will be wrapped in paragraph tags.
93       */
94      public static Node parseHTMLToDOM(String html) {
95          Tidy t = getTidy();
96          InputStream is;
97          try {
98              is = new ByteArrayInputStream(html.getBytes("utf-8"));
99          } catch (UnsupportedEncodingException e1) {
100             log.error("Can't get utf-8 encoding", e1);
101             return null;
102         }
103         Document d = t.parseDOM(is, null);
104         try {
105             return XPathAPI.selectSingleNode(d, "//body");
106         } catch (TransformerException e) {
107             log.error(e);
108             return null;
109         }
110     }
111 
112     /***
113      * Clean us the given string according to JTidy rules and return the
114      * pretty-printed version.
115      * 
116      * @param orig
117      *            the original HTML fragment string.
118      * @return the pretty-printed HTML string.
119      * @throws UnsupportedEncodingException
120      *             When the UTF-8 encoding does not exist.
121      */
122     public static String cleanupString(String orig)
123             throws UnsupportedEncodingException {
124         Tidy t = getTidy();
125         PPrint pp = new PPrint(t.getConfiguration());
126         InputStream is = new ByteArrayInputStream(orig.getBytes("utf-8"));
127         Document d = t.parseDOM(is, null);
128         Node body;
129         try {
130             body = XPathAPI.selectSingleNode(d, "//body");
131         } catch (TransformerException e) {
132             log.error(e);
133             return null;
134         }
135         // an estimate of the final string size
136         ByteArrayOutputStream out = new ByteArrayOutputStream(2 * orig.length());
137         OutImpl o = new OutImpl();
138         o.out = out;
139         o.state = StreamIn.FSM_ASCII;
140         o.encoding = t.getCharEncoding();
141         NodeList nl = body.getChildNodes();
142         for(int i = 0; i < nl.getLength(); i++) {
143             Node n = nl.item(i);
144             if (n instanceof DOMNodeImpl)
145                 pp.printTree(o, (short)0, 0, null, ((DOMNodeImpl)n)
146                     .getAdaptee());
147         }
148         pp.flushLine(o, 0);
149         return new String(out.toByteArray(), "utf-8");
150     }
151 }