1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package net.sourceforge.blogentis.utils;
19
20 import java.util.HashMap;
21
/**
 * Utilities for String formatting, manipulation, and queries. More information
 * about the class this one is derived from is available from
 * <a target="_top" href="http://ostermiller.org/utils/StringHelper.html">ostermiller.org</a>.
 * <p>
 * This class has been trimmed down by
 * <a href="mailto:abas@aix.meng.auth.gr">abas@aix.meng.auth.gr</a>.
 * <p>
 * All methods are static and return {@code null} when given {@code null}.
 */
public class StringUtils {
    /**
     * Replaces characters that may be confused by an HTML parser with their
     * equivalent character entity references.
     * <p>
     * Any data that will appear as text on a web page should be escaped. This
     * is especially important for data that comes from untrusted sources such
     * as Internet users. A common mistake is to ask a user for data and then
     * put that data on a web page unchanged. For example:
     *
     * <pre>
     * Server: What is your name?
     * User:   &lt;b&gt;Joe&lt;b&gt;
     * Server: Hello &lt;b&gt;Joe&lt;b&gt;, Welcome
     * </pre>
     *
     * If the name is put on the page without sanitizing it, the user could
     * reformat the page, insert scripts, and control the content served.
     * <p>
     * The following replacements are made:
     * <ul>
     * <li>{@code "} becomes {@code &quot;}</li>
     * <li>{@code '} becomes {@code &#39;}</li>
     * <li>{@code &} becomes {@code &amp;}</li>
     * <li>{@code <} becomes {@code &lt;}</li>
     * <li>{@code >} becomes {@code &gt;}</li>
     * <li>control characters below U+0020 are removed, except carriage
     * return, line feed, tab and form feed, which are kept as they are</li>
     * </ul>
     * The result is suitable both for text content and for quoted attribute
     * values such as the {@code value} of a form input.
     *
     * @param s
     *            String to be escaped, may be {@code null}
     * @return the escaped String; the same instance as {@code s} when nothing
     *         had to be changed; {@code null} if {@code s} is {@code null}
     */
    public static String escapeHTML(String s) {
        if (s == null) {
            return null;
        }
        int length = s.length();
        // Allocated lazily, on the first character that needs changing, so
        // that strings needing no escaping are returned without copying.
        // (Comparing the predicted output length with the input length is not
        // a safe "unchanged" test: removed control characters can cancel out
        // the growth caused by an escaped character.)
        StringBuilder sb = null;
        for (int i = 0; i < length; i++) {
            char c = s.charAt(i);
            String replacement = escapeFor(c);
            if (replacement == null) {
                if (sb != null) {
                    sb.append(c);
                }
            } else {
                if (sb == null) {
                    sb = new StringBuilder(length + 16);
                    sb.append(s, 0, i);
                }
                sb.append(replacement);
            }
        }
        return sb == null ? s : sb.toString();
    }

    /**
     * Returns the text that replaces {@code c} in escaped HTML: an entity
     * reference, the empty string for a control character that is dropped, or
     * {@code null} when the character is copied through unchanged.
     */
    private static String escapeFor(char c) {
        switch (c) {
        case '\"':
            return "&quot;";
        case '\'':
            return "&#39;";
        case '&':
            return "&amp;";
        case '<':
            return "&lt;";
        case '>':
            return "&gt;";
        case '\r':
        case '\n':
        case '\t':
        case '\f':
            return null;
        default:
            return c < 32 ? "" : null;
        }
    }

    /** Maps HTML entity names (without {@code &} and {@code ;}) to code points. */
    private static final HashMap<String, Integer> htmlEntities = new HashMap<String, Integer>(512);
    static {
        htmlEntities.put("nbsp", Integer.valueOf(160));
        htmlEntities.put("iexcl", Integer.valueOf(161));
        htmlEntities.put("cent", Integer.valueOf(162));
        htmlEntities.put("pound", Integer.valueOf(163));
        htmlEntities.put("curren", Integer.valueOf(164));
        htmlEntities.put("yen", Integer.valueOf(165));
        htmlEntities.put("brvbar", Integer.valueOf(166));
        htmlEntities.put("sect", Integer.valueOf(167));
        htmlEntities.put("uml", Integer.valueOf(168));
        htmlEntities.put("copy", Integer.valueOf(169));
        htmlEntities.put("ordf", Integer.valueOf(170));
        htmlEntities.put("laquo", Integer.valueOf(171));
        htmlEntities.put("not", Integer.valueOf(172));
        htmlEntities.put("shy", Integer.valueOf(173));
        htmlEntities.put("reg", Integer.valueOf(174));
        htmlEntities.put("macr", Integer.valueOf(175));
        htmlEntities.put("deg", Integer.valueOf(176));
        htmlEntities.put("plusmn", Integer.valueOf(177));
        htmlEntities.put("sup2", Integer.valueOf(178));
        htmlEntities.put("sup3", Integer.valueOf(179));
        htmlEntities.put("acute", Integer.valueOf(180));
        htmlEntities.put("micro", Integer.valueOf(181));
        htmlEntities.put("para", Integer.valueOf(182));
        htmlEntities.put("middot", Integer.valueOf(183));
        htmlEntities.put("cedil", Integer.valueOf(184));
        htmlEntities.put("sup1", Integer.valueOf(185));
        htmlEntities.put("ordm", Integer.valueOf(186));
        htmlEntities.put("raquo", Integer.valueOf(187));
        htmlEntities.put("frac14", Integer.valueOf(188));
        htmlEntities.put("frac12", Integer.valueOf(189));
        htmlEntities.put("frac34", Integer.valueOf(190));
        htmlEntities.put("iquest", Integer.valueOf(191));
        htmlEntities.put("Agrave", Integer.valueOf(192));
        htmlEntities.put("Aacute", Integer.valueOf(193));
        htmlEntities.put("Acirc", Integer.valueOf(194));
        htmlEntities.put("Atilde", Integer.valueOf(195));
        htmlEntities.put("Auml", Integer.valueOf(196));
        htmlEntities.put("Aring", Integer.valueOf(197));
        htmlEntities.put("AElig", Integer.valueOf(198));
        htmlEntities.put("Ccedil", Integer.valueOf(199));
        htmlEntities.put("Egrave", Integer.valueOf(200));
        htmlEntities.put("Eacute", Integer.valueOf(201));
        htmlEntities.put("Ecirc", Integer.valueOf(202));
        htmlEntities.put("Euml", Integer.valueOf(203));
        htmlEntities.put("Igrave", Integer.valueOf(204));
        htmlEntities.put("Iacute", Integer.valueOf(205));
        htmlEntities.put("Icirc", Integer.valueOf(206));
        htmlEntities.put("Iuml", Integer.valueOf(207));
        htmlEntities.put("ETH", Integer.valueOf(208));
        htmlEntities.put("Ntilde", Integer.valueOf(209));
        htmlEntities.put("Ograve", Integer.valueOf(210));
        htmlEntities.put("Oacute", Integer.valueOf(211));
        htmlEntities.put("Ocirc", Integer.valueOf(212));
        htmlEntities.put("Otilde", Integer.valueOf(213));
        htmlEntities.put("Ouml", Integer.valueOf(214));
        htmlEntities.put("times", Integer.valueOf(215));
        htmlEntities.put("Oslash", Integer.valueOf(216));
        htmlEntities.put("Ugrave", Integer.valueOf(217));
        htmlEntities.put("Uacute", Integer.valueOf(218));
        htmlEntities.put("Ucirc", Integer.valueOf(219));
        htmlEntities.put("Uuml", Integer.valueOf(220));
        htmlEntities.put("Yacute", Integer.valueOf(221));
        htmlEntities.put("THORN", Integer.valueOf(222));
        htmlEntities.put("szlig", Integer.valueOf(223));
        htmlEntities.put("agrave", Integer.valueOf(224));
        htmlEntities.put("aacute", Integer.valueOf(225));
        htmlEntities.put("acirc", Integer.valueOf(226));
        htmlEntities.put("atilde", Integer.valueOf(227));
        htmlEntities.put("auml", Integer.valueOf(228));
        htmlEntities.put("aring", Integer.valueOf(229));
        htmlEntities.put("aelig", Integer.valueOf(230));
        htmlEntities.put("ccedil", Integer.valueOf(231));
        htmlEntities.put("egrave", Integer.valueOf(232));
        htmlEntities.put("eacute", Integer.valueOf(233));
        htmlEntities.put("ecirc", Integer.valueOf(234));
        htmlEntities.put("euml", Integer.valueOf(235));
        htmlEntities.put("igrave", Integer.valueOf(236));
        htmlEntities.put("iacute", Integer.valueOf(237));
        htmlEntities.put("icirc", Integer.valueOf(238));
        htmlEntities.put("iuml", Integer.valueOf(239));
        htmlEntities.put("eth", Integer.valueOf(240));
        htmlEntities.put("ntilde", Integer.valueOf(241));
        htmlEntities.put("ograve", Integer.valueOf(242));
        htmlEntities.put("oacute", Integer.valueOf(243));
        htmlEntities.put("ocirc", Integer.valueOf(244));
        htmlEntities.put("otilde", Integer.valueOf(245));
        htmlEntities.put("ouml", Integer.valueOf(246));
        htmlEntities.put("divide", Integer.valueOf(247));
        htmlEntities.put("oslash", Integer.valueOf(248));
        htmlEntities.put("ugrave", Integer.valueOf(249));
        htmlEntities.put("uacute", Integer.valueOf(250));
        htmlEntities.put("ucirc", Integer.valueOf(251));
        htmlEntities.put("uuml", Integer.valueOf(252));
        htmlEntities.put("yacute", Integer.valueOf(253));
        htmlEntities.put("thorn", Integer.valueOf(254));
        htmlEntities.put("yuml", Integer.valueOf(255));
        htmlEntities.put("fnof", Integer.valueOf(402));
        htmlEntities.put("Alpha", Integer.valueOf(913));
        htmlEntities.put("Beta", Integer.valueOf(914));
        htmlEntities.put("Gamma", Integer.valueOf(915));
        htmlEntities.put("Delta", Integer.valueOf(916));
        htmlEntities.put("Epsilon", Integer.valueOf(917));
        htmlEntities.put("Zeta", Integer.valueOf(918));
        htmlEntities.put("Eta", Integer.valueOf(919));
        htmlEntities.put("Theta", Integer.valueOf(920));
        htmlEntities.put("Iota", Integer.valueOf(921));
        htmlEntities.put("Kappa", Integer.valueOf(922));
        htmlEntities.put("Lambda", Integer.valueOf(923));
        htmlEntities.put("Mu", Integer.valueOf(924));
        htmlEntities.put("Nu", Integer.valueOf(925));
        htmlEntities.put("Xi", Integer.valueOf(926));
        htmlEntities.put("Omicron", Integer.valueOf(927));
        htmlEntities.put("Pi", Integer.valueOf(928));
        htmlEntities.put("Rho", Integer.valueOf(929));
        htmlEntities.put("Sigma", Integer.valueOf(931));
        htmlEntities.put("Tau", Integer.valueOf(932));
        htmlEntities.put("Upsilon", Integer.valueOf(933));
        htmlEntities.put("Phi", Integer.valueOf(934));
        htmlEntities.put("Chi", Integer.valueOf(935));
        htmlEntities.put("Psi", Integer.valueOf(936));
        htmlEntities.put("Omega", Integer.valueOf(937));
        htmlEntities.put("alpha", Integer.valueOf(945));
        htmlEntities.put("beta", Integer.valueOf(946));
        htmlEntities.put("gamma", Integer.valueOf(947));
        htmlEntities.put("delta", Integer.valueOf(948));
        htmlEntities.put("epsilon", Integer.valueOf(949));
        htmlEntities.put("zeta", Integer.valueOf(950));
        htmlEntities.put("eta", Integer.valueOf(951));
        htmlEntities.put("theta", Integer.valueOf(952));
        htmlEntities.put("iota", Integer.valueOf(953));
        htmlEntities.put("kappa", Integer.valueOf(954));
        htmlEntities.put("lambda", Integer.valueOf(955));
        htmlEntities.put("mu", Integer.valueOf(956));
        htmlEntities.put("nu", Integer.valueOf(957));
        htmlEntities.put("xi", Integer.valueOf(958));
        htmlEntities.put("omicron", Integer.valueOf(959));
        htmlEntities.put("pi", Integer.valueOf(960));
        htmlEntities.put("rho", Integer.valueOf(961));
        htmlEntities.put("sigmaf", Integer.valueOf(962));
        htmlEntities.put("sigma", Integer.valueOf(963));
        htmlEntities.put("tau", Integer.valueOf(964));
        htmlEntities.put("upsilon", Integer.valueOf(965));
        htmlEntities.put("phi", Integer.valueOf(966));
        htmlEntities.put("chi", Integer.valueOf(967));
        htmlEntities.put("psi", Integer.valueOf(968));
        htmlEntities.put("omega", Integer.valueOf(969));
        htmlEntities.put("thetasym", Integer.valueOf(977));
        htmlEntities.put("upsih", Integer.valueOf(978));
        htmlEntities.put("piv", Integer.valueOf(982));
        htmlEntities.put("bull", Integer.valueOf(8226));
        htmlEntities.put("hellip", Integer.valueOf(8230));
        htmlEntities.put("prime", Integer.valueOf(8242));
        htmlEntities.put("Prime", Integer.valueOf(8243));
        htmlEntities.put("oline", Integer.valueOf(8254));
        htmlEntities.put("frasl", Integer.valueOf(8260));
        htmlEntities.put("weierp", Integer.valueOf(8472));
        htmlEntities.put("image", Integer.valueOf(8465));
        htmlEntities.put("real", Integer.valueOf(8476));
        htmlEntities.put("trade", Integer.valueOf(8482));
        htmlEntities.put("alefsym", Integer.valueOf(8501));
        htmlEntities.put("larr", Integer.valueOf(8592));
        htmlEntities.put("uarr", Integer.valueOf(8593));
        htmlEntities.put("rarr", Integer.valueOf(8594));
        htmlEntities.put("darr", Integer.valueOf(8595));
        htmlEntities.put("harr", Integer.valueOf(8596));
        htmlEntities.put("crarr", Integer.valueOf(8629));
        htmlEntities.put("lArr", Integer.valueOf(8656));
        htmlEntities.put("uArr", Integer.valueOf(8657));
        htmlEntities.put("rArr", Integer.valueOf(8658));
        htmlEntities.put("dArr", Integer.valueOf(8659));
        htmlEntities.put("hArr", Integer.valueOf(8660));
        htmlEntities.put("forall", Integer.valueOf(8704));
        htmlEntities.put("part", Integer.valueOf(8706));
        htmlEntities.put("exist", Integer.valueOf(8707));
        htmlEntities.put("empty", Integer.valueOf(8709));
        htmlEntities.put("nabla", Integer.valueOf(8711));
        htmlEntities.put("isin", Integer.valueOf(8712));
        htmlEntities.put("notin", Integer.valueOf(8713));
        htmlEntities.put("ni", Integer.valueOf(8715));
        htmlEntities.put("prod", Integer.valueOf(8719));
        htmlEntities.put("sum", Integer.valueOf(8721));
        htmlEntities.put("minus", Integer.valueOf(8722));
        htmlEntities.put("lowast", Integer.valueOf(8727));
        htmlEntities.put("radic", Integer.valueOf(8730));
        htmlEntities.put("prop", Integer.valueOf(8733));
        htmlEntities.put("infin", Integer.valueOf(8734));
        htmlEntities.put("ang", Integer.valueOf(8736));
        htmlEntities.put("and", Integer.valueOf(8743));
        htmlEntities.put("or", Integer.valueOf(8744));
        htmlEntities.put("cap", Integer.valueOf(8745));
        htmlEntities.put("cup", Integer.valueOf(8746));
        htmlEntities.put("int", Integer.valueOf(8747));
        htmlEntities.put("there4", Integer.valueOf(8756));
        htmlEntities.put("sim", Integer.valueOf(8764));
        htmlEntities.put("cong", Integer.valueOf(8773));
        htmlEntities.put("asymp", Integer.valueOf(8776));
        htmlEntities.put("ne", Integer.valueOf(8800));
        htmlEntities.put("equiv", Integer.valueOf(8801));
        htmlEntities.put("le", Integer.valueOf(8804));
        htmlEntities.put("ge", Integer.valueOf(8805));
        htmlEntities.put("sub", Integer.valueOf(8834));
        htmlEntities.put("sup", Integer.valueOf(8835));
        htmlEntities.put("nsub", Integer.valueOf(8836));
        htmlEntities.put("sube", Integer.valueOf(8838));
        htmlEntities.put("supe", Integer.valueOf(8839));
        htmlEntities.put("oplus", Integer.valueOf(8853));
        htmlEntities.put("otimes", Integer.valueOf(8855));
        htmlEntities.put("perp", Integer.valueOf(8869));
        htmlEntities.put("sdot", Integer.valueOf(8901));
        htmlEntities.put("lceil", Integer.valueOf(8968));
        htmlEntities.put("rceil", Integer.valueOf(8969));
        htmlEntities.put("lfloor", Integer.valueOf(8970));
        htmlEntities.put("rfloor", Integer.valueOf(8971));
        htmlEntities.put("lang", Integer.valueOf(9001));
        htmlEntities.put("rang", Integer.valueOf(9002));
        htmlEntities.put("loz", Integer.valueOf(9674));
        htmlEntities.put("spades", Integer.valueOf(9824));
        htmlEntities.put("clubs", Integer.valueOf(9827));
        htmlEntities.put("hearts", Integer.valueOf(9829));
        htmlEntities.put("diams", Integer.valueOf(9830));
        htmlEntities.put("quot", Integer.valueOf(34));
        htmlEntities.put("amp", Integer.valueOf(38));
        htmlEntities.put("lt", Integer.valueOf(60));
        htmlEntities.put("gt", Integer.valueOf(62));
        htmlEntities.put("OElig", Integer.valueOf(338));
        htmlEntities.put("oelig", Integer.valueOf(339));
        htmlEntities.put("Scaron", Integer.valueOf(352));
        htmlEntities.put("scaron", Integer.valueOf(353));
        htmlEntities.put("Yuml", Integer.valueOf(376));
        htmlEntities.put("circ", Integer.valueOf(710));
        htmlEntities.put("tilde", Integer.valueOf(732));
        htmlEntities.put("ensp", Integer.valueOf(8194));
        htmlEntities.put("emsp", Integer.valueOf(8195));
        htmlEntities.put("thinsp", Integer.valueOf(8201));
        htmlEntities.put("zwnj", Integer.valueOf(8204));
        htmlEntities.put("zwj", Integer.valueOf(8205));
        htmlEntities.put("lrm", Integer.valueOf(8206));
        htmlEntities.put("rlm", Integer.valueOf(8207));
        htmlEntities.put("ndash", Integer.valueOf(8211));
        htmlEntities.put("mdash", Integer.valueOf(8212));
        htmlEntities.put("lsquo", Integer.valueOf(8216));
        htmlEntities.put("rsquo", Integer.valueOf(8217));
        htmlEntities.put("sbquo", Integer.valueOf(8218));
        htmlEntities.put("ldquo", Integer.valueOf(8220));
        htmlEntities.put("rdquo", Integer.valueOf(8221));
        htmlEntities.put("bdquo", Integer.valueOf(8222));
        htmlEntities.put("dagger", Integer.valueOf(8224));
        htmlEntities.put("Dagger", Integer.valueOf(8225));
        htmlEntities.put("permil", Integer.valueOf(8240));
        htmlEntities.put("lsaquo", Integer.valueOf(8249));
        htmlEntities.put("rsaquo", Integer.valueOf(8250));
        htmlEntities.put("euro", Integer.valueOf(8364));
    }

    /**
     * Turns any HTML escape entities in the string into characters and returns
     * the resulting string.
     * <p>
     * Recognized forms are named entities from the built-in table (for
     * example {@code &amp;} or {@code &eacute;}), decimal references
     * ({@code &#39;}) and hexadecimal references ({@code &#x27;}). A reference
     * is only recognized when its terminating {@code ;} occurs before the
     * next {@code &}. Unknown names, malformed numbers and values that are
     * not valid Unicode code points are copied to the output unchanged.
     *
     * @param s
     *            String to be unescaped, may be {@code null}
     * @return the unescaped String, or {@code null} if {@code s} is
     *         {@code null}
     */
    public static String unescapeHTML(String s) {
        if (s == null) {
            return null;
        }
        StringBuilder result = new StringBuilder(s.length());
        int ampInd = s.indexOf('&');
        int lastEnd = 0; // index of the first input char not yet copied
        while (ampInd >= 0) {
            int nextAmp = s.indexOf('&', ampInd + 1);
            int nextSemi = s.indexOf(';', ampInd + 1);
            // Only treat "&...;" as a reference when no other '&' intervenes.
            if (nextSemi != -1 && (nextAmp == -1 || nextSemi < nextAmp)) {
                String escape = s.substring(ampInd + 1, nextSemi);
                int value = decodeEntity(escape);
                result.append(s, lastEnd, ampInd);
                lastEnd = nextSemi + 1;
                if (Character.isValidCodePoint(value)) {
                    // Emits a surrogate pair for code points above U+FFFF.
                    result.appendCodePoint(value);
                } else {
                    result.append('&').append(escape).append(';');
                }
            }
            ampInd = nextAmp;
        }
        result.append(s, lastEnd, s.length());
        return result.toString();
    }

    /**
     * Resolves the text between {@code &} and {@code ;} to a code point.
     *
     * @return the code point, or a negative number when the reference is an
     *         unknown name or a malformed number
     */
    private static int decodeEntity(String escape) {
        if (escape.startsWith("#")) {
            try {
                if (escape.length() > 1
                        && (escape.charAt(1) == 'x' || escape.charAt(1) == 'X')) {
                    return Integer.parseInt(escape.substring(2), 16);
                }
                return Integer.parseInt(escape.substring(1), 10);
            } catch (NumberFormatException ignored) {
                // Not a number: the caller leaves the reference untouched.
                return -1;
            }
        }
        Integer code = htmlEntities.get(escape);
        return code == null ? -1 : code.intValue();
    }

    /**
     * Replaces {@code &}, {@code <}, {@code >} and {@code "} with
     * {@code &amp;}, {@code &lt;}, {@code &gt;} and {@code &quot;}. Unlike
     * {@link #escapeHTML(String)}, single quotes and control characters are
     * left as they are.
     *
     * @param s
     *            String to be converted, may be {@code null}
     * @return the converted String, or {@code null} if {@code s} is
     *         {@code null}
     */
    public static String entityfyHTML(String s) {
        if (s == null) {
            return null;
        }
        // '&' must be handled first so the entities added below are not
        // escaped a second time.
        return s.replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
                .replace("\"", "&quot;");
    }

    /**
     * Removes HTML tags from the string. A tag is a {@code <}, optionally
     * followed by {@code /}, followed by a letter, up to and including the
     * next {@code >} (or up to the end of the string when the tag is not
     * closed). A {@code <} that is not followed by a letter, as in
     * {@code 1 < 2}, is kept. Entity references are not decoded.
     *
     * @param s
     *            String to strip, may be {@code null}
     * @return the String without tags, or {@code null} if {@code s} is
     *         {@code null}
     */
    public static String removeTags(String s) {
        if (s == null) {
            return null;
        }
        return s.replaceAll("</?[a-zA-Z][^>]*(>|$)", "");
    }

    /**
     * Trims leading and trailing whitespace (newline, carriage return, space,
     * tab) and replaces every run of two or more line breaks with a paragraph
     * boundary, {@code "\n</p><p>\n"}. Single line breaks are kept. The
     * result contains no outer {@code <p>} and {@code </p>}; presumably the
     * caller wraps it in a paragraph element.
     *
     * @param text
     *            text to convert, may be {@code null}
     * @return the converted text, or {@code null} if {@code text} is
     *         {@code null}
     */
    public static String convertBreaks(String text) {
        if (text == null) {
            return null;
        }
        text = text.replaceFirst("[\\n\\r \\t]+$", "");
        text = text.replaceFirst("^[\\n\\r \\t]+", "");
        text = text.replaceAll("\\r*\\n\\r*\\n[\\n\\r]*", "\n</p><p>\n");
        return text;
    }
}