1 package net.sourceforge.blogentis.utils;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 import java.io.ByteArrayInputStream;
26 import java.io.ByteArrayOutputStream;
27 import java.io.InputStream;
28 import java.io.UnsupportedEncodingException;
29
30 import javax.xml.transform.TransformerException;
31
32 import org.apache.commons.logging.Log;
33 import org.apache.commons.logging.LogFactory;
34 import org.apache.turbine.services.TurbineBaseService;
35 import org.apache.turbine.services.TurbineServices;
36 import org.apache.turbine.services.servlet.TurbineServlet;
37 import org.apache.xpath.XPathAPI;
38 import org.w3c.dom.Document;
39 import org.w3c.dom.Node;
40 import org.w3c.dom.NodeList;
41 import org.w3c.tidy.DOMNodeImpl;
42 import org.w3c.tidy.OutImpl;
43 import org.w3c.tidy.PPrint;
44 import org.w3c.tidy.StreamIn;
45 import org.w3c.tidy.Tidy;
46
47 /***
48 * Service that handles cleaning up HTML with JTidy.
49 *
50 * @author abas
51 */
52 public class JTidyService
53 extends TurbineBaseService {
54 public static final String DEFAULT_CONFIGURATION = "WEB-INF/conf/JTidy.properties";
55
56 public static final String CONFIGURATION_PARAMETER = "configuration";
57
58 private static Log log = LogFactory.getLog(JTidyService.class);
59
60 public static final String SERVICE_NAME = "JTidyService";
61
62 public static JTidyService getInstane() {
63 return (JTidyService)TurbineServices.getInstance()
64 .getService(SERVICE_NAME);
65 }
66
67 private String configurationPath = null;
68
69 public void init() {
70 configurationPath = TurbineServlet.getRealPath(getConfiguration()
71 .getString(CONFIGURATION_PARAMETER, DEFAULT_CONFIGURATION));
72 setInit(true);
73 }
74
75 public void shutdown() {
76 configurationPath = null;
77 setInit(false);
78 }
79
80 public static Tidy getTidy() {
81 Tidy t = new Tidy();
82 t.setConfigurationFromFile(getInstane().configurationPath);
83 return t;
84 }
85
86 /***
87 * Parse the given HTML fragment and produce the cleaned-up DOM tree.
88 *
89 * @param html
90 * the HTML fragment
91 * @return the BODY element of the parsed HTML document. Its children will
92 * be tags, text will be wrapped in paragraph tags.
93 */
94 public static Node parseHTMLToDOM(String html) {
95 Tidy t = getTidy();
96 InputStream is;
97 try {
98 is = new ByteArrayInputStream(html.getBytes("utf-8"));
99 } catch (UnsupportedEncodingException e1) {
100 log.error("Can't get utf-8 encoding", e1);
101 return null;
102 }
103 Document d = t.parseDOM(is, null);
104 try {
105 return XPathAPI.selectSingleNode(d, "//body");
106 } catch (TransformerException e) {
107 log.error(e);
108 return null;
109 }
110 }
111
112 /***
113 * Clean us the given string according to JTidy rules and return the
114 * pretty-printed version.
115 *
116 * @param orig
117 * the original HTML fragment string.
118 * @return the pretty-printed HTML string.
119 * @throws UnsupportedEncodingException
120 * When the UTF-8 encoding does not exist.
121 */
122 public static String cleanupString(String orig)
123 throws UnsupportedEncodingException {
124 Tidy t = getTidy();
125 PPrint pp = new PPrint(t.getConfiguration());
126 InputStream is = new ByteArrayInputStream(orig.getBytes("utf-8"));
127 Document d = t.parseDOM(is, null);
128 Node body;
129 try {
130 body = XPathAPI.selectSingleNode(d, "//body");
131 } catch (TransformerException e) {
132 log.error(e);
133 return null;
134 }
135
136 ByteArrayOutputStream out = new ByteArrayOutputStream(2 * orig.length());
137 OutImpl o = new OutImpl();
138 o.out = out;
139 o.state = StreamIn.FSM_ASCII;
140 o.encoding = t.getCharEncoding();
141 NodeList nl = body.getChildNodes();
142 for(int i = 0; i < nl.getLength(); i++) {
143 Node n = nl.item(i);
144 if (n instanceof DOMNodeImpl)
145 pp.printTree(o, (short)0, 0, null, ((DOMNodeImpl)n)
146 .getAdaptee());
147 }
148 pp.flushLine(o, 0);
149 return new String(out.toByteArray(), "utf-8");
150 }
151 }