View Javadoc

1   /*
2    * Static String formatting and query routines.
3    * Copyright (C) 2001,2002 Stephen Ostermiller
4    * http://ostermiller.org/contact.pl?regarding=Java+Utilities
5    *
6    * This program is free software; you can redistribute it and/or modify
7    * it under the terms of the GNU General Public License as published by
8    * the Free Software Foundation; either version 2 of the License, or
9    * (at your option) any later version.
10   *
11   * This program is distributed in the hope that it will be useful,
12   * but WITHOUT ANY WARRANTY; without even the implied warranty of
13   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   * GNU General Public License for more details.
15   *
16   * See COPYING.TXT for details.
17   */
18  package net.sourceforge.blogentis.utils;
19  
20  import java.util.HashMap;
21  
/**
 * Utilities for String formatting, manipulation, and queries. More information
 * about this class is available from <a target="_top" href=
 * "http://ostermiller.org/utils/StringHelper.html">ostermiller.org</a>.
 *
 * This class has been trimmed down by <a
 * href="mailto:abas@aix.meng.auth.gr">abas@aix.meng.auth.gr</a>.
 * <p>
 * All methods are static and keep no per-call state; the entity table is
 * filled once in the static initializer and only read afterwards.
 */
public class StringUtils {
    /**
     * Replaces characters that may be confused by a HTML parser with their
     * equivalent character entity references.
     * <p>
     * Any data that will appear as text on a web page should be escaped. This
     * is especially important for data that comes from untrusted sources such
     * as Internet users. A common mistake in CGI programming is to ask a user
     * for data and then put that data on a web page. For example:
     *
     * <pre>
     *   Server: What is your name?
     *   User: &lt;b&gt;Joe&lt;/b&gt;
     *   Server: Hello &lt;b&gt;Joe&lt;/b&gt;, Welcome
     * </pre>
     *
     * If the name is put on the page without checking that it doesn't contain
     * HTML code or without sanitizing that HTML code, the user could reformat
     * the page, insert scripts, and control the content on your web server.
     * <p>
     * This method replaces <code>&quot;</code>, <code>'</code>,
     * <code>&amp;</code>, <code>&lt;</code> and <code>&gt;</code> with
     * <code>&amp;quot;</code>, <code>&amp;#39;</code>, <code>&amp;amp;</code>,
     * <code>&amp;lt;</code> and <code>&amp;gt;</code> respectively, so that the
     * HTML parser interprets them as plain text rather than HTML or script.
     * Control characters below code 32 are removed, except carriage return,
     * line feed, tab and form feed, which are kept unchanged.
     * <p>
     * This method should be used both for data displayed as text in the HTML
     * document and for data put in form elements (attribute values).
     *
     * @param s
     *            String to be escaped, may be null
     * @return the escaped String; the same instance as <code>s</code> when
     *         nothing had to be changed; null if s is null
     */
    public static String escapeHTML(String s) {
        if (s == null) {
            return null;
        }
        int length = s.length();
        int newLength = length;
        // The length alone cannot tell whether the string changes: removed
        // control characters shrink it while escapes grow it, and the two
        // can cancel out. Track modifications explicitly.
        boolean modified = false;
        for (int i = 0; i < length; i++) {
            char c = s.charAt(i);
            if (c < 32) {
                if (!isKeptControl(c)) {
                    newLength -= 1;
                    modified = true;
                }
            } else {
                switch (c) {
                case '\"':
                    newLength += 5;
                    modified = true;
                    break;
                case '&':
                case '\'':
                    newLength += 4;
                    modified = true;
                    break;
                case '<':
                case '>':
                    newLength += 3;
                    modified = true;
                    break;
                default:
                    break;
                }
            }
        }
        if (!modified) {
            // nothing to escape or remove in the string
            return s;
        }
        StringBuffer sb = new StringBuffer(newLength);
        for (int i = 0; i < length; i++) {
            char c = s.charAt(i);
            if (c < 32) {
                if (isKeptControl(c)) {
                    sb.append(c);
                }
                // any other control character is dropped
            } else {
                switch (c) {
                case '\"':
                    sb.append("&quot;");
                    break;
                case '\'':
                    sb.append("&#39;");
                    break;
                case '&':
                    sb.append("&amp;");
                    break;
                case '<':
                    sb.append("&lt;");
                    break;
                case '>':
                    sb.append("&gt;");
                    break;
                default:
                    sb.append(c);
                    break;
                }
            }
        }
        return sb.toString();
    }

    /**
     * Tells whether a control character is one of the whitespace controls
     * that escapeHTML passes through instead of removing.
     */
    private static boolean isKeptControl(char c) {
        return c == '\r' || c == '\n' || c == '\t' || c == '\f';
    }

    /** Maps an HTML entity name (without '&amp;' and ';') to its character code. */
    private static final HashMap htmlEntities = new HashMap();

    /** Registers one named entity in the lookup table. */
    private static void addEntity(String name, int code) {
        htmlEntities.put(name, new Integer(code));
    }

    static {
        addEntity("nbsp", 160);
        addEntity("iexcl", 161);
        addEntity("cent", 162);
        addEntity("pound", 163);
        addEntity("curren", 164);
        addEntity("yen", 165);
        addEntity("brvbar", 166);
        addEntity("sect", 167);
        addEntity("uml", 168);
        addEntity("copy", 169);
        addEntity("ordf", 170);
        addEntity("laquo", 171);
        addEntity("not", 172);
        addEntity("shy", 173);
        addEntity("reg", 174);
        addEntity("macr", 175);
        addEntity("deg", 176);
        addEntity("plusmn", 177);
        addEntity("sup2", 178);
        addEntity("sup3", 179);
        addEntity("acute", 180);
        addEntity("micro", 181);
        addEntity("para", 182);
        addEntity("middot", 183);
        addEntity("cedil", 184);
        addEntity("sup1", 185);
        addEntity("ordm", 186);
        addEntity("raquo", 187);
        addEntity("frac14", 188);
        addEntity("frac12", 189);
        addEntity("frac34", 190);
        addEntity("iquest", 191);
        addEntity("Agrave", 192);
        addEntity("Aacute", 193);
        addEntity("Acirc", 194);
        addEntity("Atilde", 195);
        addEntity("Auml", 196);
        addEntity("Aring", 197);
        addEntity("AElig", 198);
        addEntity("Ccedil", 199);
        addEntity("Egrave", 200);
        addEntity("Eacute", 201);
        addEntity("Ecirc", 202);
        addEntity("Euml", 203);
        addEntity("Igrave", 204);
        addEntity("Iacute", 205);
        addEntity("Icirc", 206);
        addEntity("Iuml", 207);
        addEntity("ETH", 208);
        addEntity("Ntilde", 209);
        addEntity("Ograve", 210);
        addEntity("Oacute", 211);
        addEntity("Ocirc", 212);
        addEntity("Otilde", 213);
        addEntity("Ouml", 214);
        addEntity("times", 215);
        addEntity("Oslash", 216);
        addEntity("Ugrave", 217);
        addEntity("Uacute", 218);
        addEntity("Ucirc", 219);
        addEntity("Uuml", 220);
        addEntity("Yacute", 221);
        addEntity("THORN", 222);
        addEntity("szlig", 223);
        addEntity("agrave", 224);
        addEntity("aacute", 225);
        addEntity("acirc", 226);
        addEntity("atilde", 227);
        addEntity("auml", 228);
        addEntity("aring", 229);
        addEntity("aelig", 230);
        addEntity("ccedil", 231);
        addEntity("egrave", 232);
        addEntity("eacute", 233);
        addEntity("ecirc", 234);
        addEntity("euml", 235);
        addEntity("igrave", 236);
        addEntity("iacute", 237);
        addEntity("icirc", 238);
        addEntity("iuml", 239);
        addEntity("eth", 240);
        addEntity("ntilde", 241);
        addEntity("ograve", 242);
        addEntity("oacute", 243);
        addEntity("ocirc", 244);
        addEntity("otilde", 245);
        addEntity("ouml", 246);
        addEntity("divide", 247);
        addEntity("oslash", 248);
        addEntity("ugrave", 249);
        addEntity("uacute", 250);
        addEntity("ucirc", 251);
        addEntity("uuml", 252);
        addEntity("yacute", 253);
        addEntity("thorn", 254);
        addEntity("yuml", 255);
        addEntity("fnof", 402);
        addEntity("Alpha", 913);
        addEntity("Beta", 914);
        addEntity("Gamma", 915);
        addEntity("Delta", 916);
        addEntity("Epsilon", 917);
        addEntity("Zeta", 918);
        addEntity("Eta", 919);
        addEntity("Theta", 920);
        addEntity("Iota", 921);
        addEntity("Kappa", 922);
        addEntity("Lambda", 923);
        addEntity("Mu", 924);
        addEntity("Nu", 925);
        addEntity("Xi", 926);
        addEntity("Omicron", 927);
        addEntity("Pi", 928);
        addEntity("Rho", 929);
        addEntity("Sigma", 931);
        addEntity("Tau", 932);
        addEntity("Upsilon", 933);
        addEntity("Phi", 934);
        addEntity("Chi", 935);
        addEntity("Psi", 936);
        addEntity("Omega", 937);
        addEntity("alpha", 945);
        addEntity("beta", 946);
        addEntity("gamma", 947);
        addEntity("delta", 948);
        addEntity("epsilon", 949);
        addEntity("zeta", 950);
        addEntity("eta", 951);
        addEntity("theta", 952);
        addEntity("iota", 953);
        addEntity("kappa", 954);
        addEntity("lambda", 955);
        addEntity("mu", 956);
        addEntity("nu", 957);
        addEntity("xi", 958);
        addEntity("omicron", 959);
        addEntity("pi", 960);
        addEntity("rho", 961);
        addEntity("sigmaf", 962);
        addEntity("sigma", 963);
        addEntity("tau", 964);
        addEntity("upsilon", 965);
        addEntity("phi", 966);
        addEntity("chi", 967);
        addEntity("psi", 968);
        addEntity("omega", 969);
        addEntity("thetasym", 977);
        addEntity("upsih", 978);
        addEntity("piv", 982);
        addEntity("bull", 8226);
        addEntity("hellip", 8230);
        addEntity("prime", 8242);
        addEntity("Prime", 8243);
        addEntity("oline", 8254);
        addEntity("frasl", 8260);
        addEntity("weierp", 8472);
        addEntity("image", 8465);
        addEntity("real", 8476);
        addEntity("trade", 8482);
        addEntity("alefsym", 8501);
        addEntity("larr", 8592);
        addEntity("uarr", 8593);
        addEntity("rarr", 8594);
        addEntity("darr", 8595);
        addEntity("harr", 8596);
        addEntity("crarr", 8629);
        addEntity("lArr", 8656);
        addEntity("uArr", 8657);
        addEntity("rArr", 8658);
        addEntity("dArr", 8659);
        addEntity("hArr", 8660);
        addEntity("forall", 8704);
        addEntity("part", 8706);
        addEntity("exist", 8707);
        addEntity("empty", 8709);
        addEntity("nabla", 8711);
        addEntity("isin", 8712);
        addEntity("notin", 8713);
        addEntity("ni", 8715);
        addEntity("prod", 8719);
        addEntity("sum", 8721);
        addEntity("minus", 8722);
        addEntity("lowast", 8727);
        addEntity("radic", 8730);
        addEntity("prop", 8733);
        addEntity("infin", 8734);
        addEntity("ang", 8736);
        addEntity("and", 8743);
        addEntity("or", 8744);
        addEntity("cap", 8745);
        addEntity("cup", 8746);
        addEntity("int", 8747);
        addEntity("there4", 8756);
        addEntity("sim", 8764);
        addEntity("cong", 8773);
        addEntity("asymp", 8776);
        addEntity("ne", 8800);
        addEntity("equiv", 8801);
        addEntity("le", 8804);
        addEntity("ge", 8805);
        addEntity("sub", 8834);
        addEntity("sup", 8835);
        addEntity("nsub", 8836);
        addEntity("sube", 8838);
        addEntity("supe", 8839);
        addEntity("oplus", 8853);
        addEntity("otimes", 8855);
        addEntity("perp", 8869);
        addEntity("sdot", 8901);
        addEntity("lceil", 8968);
        addEntity("rceil", 8969);
        addEntity("lfloor", 8970);
        addEntity("rfloor", 8971);
        addEntity("lang", 9001);
        addEntity("rang", 9002);
        addEntity("loz", 9674);
        addEntity("spades", 9824);
        addEntity("clubs", 9827);
        addEntity("hearts", 9829);
        addEntity("diams", 9830);
        addEntity("quot", 34);
        addEntity("amp", 38);
        addEntity("lt", 60);
        addEntity("gt", 62);
        addEntity("OElig", 338);
        addEntity("oelig", 339);
        addEntity("Scaron", 352);
        addEntity("scaron", 353);
        addEntity("Yuml", 376);
        addEntity("circ", 710);
        addEntity("tilde", 732);
        addEntity("ensp", 8194);
        addEntity("emsp", 8195);
        addEntity("thinsp", 8201);
        addEntity("zwnj", 8204);
        addEntity("zwj", 8205);
        addEntity("lrm", 8206);
        addEntity("rlm", 8207);
        addEntity("ndash", 8211);
        addEntity("mdash", 8212);
        addEntity("lsquo", 8216);
        addEntity("rsquo", 8217);
        addEntity("sbquo", 8218);
        addEntity("ldquo", 8220);
        addEntity("rdquo", 8221);
        addEntity("bdquo", 8222);
        addEntity("dagger", 8224);
        addEntity("Dagger", 8225);
        addEntity("permil", 8240);
        addEntity("lsaquo", 8249);
        addEntity("rsaquo", 8250);
        addEntity("euro", 8364);
    }

    /**
     * Turn any HTML escape entities in the string into characters and return
     * the resulting string.
     * <p>
     * Recognized forms are named entities from the built-in table
     * (<code>&amp;lt;</code>), decimal references (<code>&amp;#65;</code>) and
     * hexadecimal references (<code>&amp;#x41;</code>). A reference that is
     * unknown, malformed, or outside the range 0..0xFFFF is copied to the
     * output unchanged, as is any '&amp;' that is not followed by a ';' before
     * the next '&amp;'.
     *
     * @param s
     *            String to be unescaped, may be null
     * @return unescaped String, or null if s is null
     */
    public static String unescapeHTML(String s) {
        if (s == null) {
            return null;
        }
        StringBuffer result = new StringBuffer(s.length());
        int ampInd = s.indexOf('&');
        int lastEnd = 0; // index just past the last text already copied
        while (ampInd >= 0) {
            int nextAmp = s.indexOf('&', ampInd + 1);
            int nextSemi = s.indexOf(';', ampInd + 1);
            // Only treat "&...;" as a candidate when the ';' closes this
            // '&' rather than a later one.
            if (nextSemi != -1 && (nextAmp == -1 || nextSemi < nextAmp)) {
                String escape = s.substring(ampInd + 1, nextSemi);
                int value = decodeEntity(escape);
                result.append(s.substring(lastEnd, ampInd));
                lastEnd = nextSemi + 1;
                if (value >= 0 && value <= 0xffff) {
                    result.append((char) value);
                } else {
                    // not decodable: keep the original text
                    result.append('&').append(escape).append(';');
                }
            }
            ampInd = nextAmp;
        }
        result.append(s.substring(lastEnd));
        return result.toString();
    }

    /**
     * Resolves the text between '&amp;' and ';' to a character code.
     *
     * @param escape
     *            entity body, e.g. "lt", "#65" or "#x41"
     * @return the character code, or -1 if the entity is unknown or its
     *         number cannot be parsed
     */
    private static int decodeEntity(String escape) {
        if (escape.startsWith("#")) {
            try {
                if (escape.length() > 1
                        && (escape.charAt(1) == 'x' || escape.charAt(1) == 'X')) {
                    return Integer.parseInt(escape.substring(2), 16);
                }
                return Integer.parseInt(escape.substring(1), 10);
            } catch (NumberFormatException ignored) {
                // malformed numeric reference: the caller keeps it verbatim
                return -1;
            }
        }
        Integer code = (Integer) htmlEntities.get(escape);
        return code == null ? -1 : code.intValue();
    }

    /**
     * Replaces '&amp;', '&lt;', '&gt;' and '&quot;' with their named HTML
     * entities. Unlike {@link #escapeHTML(String)}, single quotes and control
     * characters are left untouched.
     *
     * @param s
     *            String to convert, may be null
     * @return the converted String, or null if s is null
     */
    public static String entityfyHTML(String s) {
        if (s == null)
            return null;
        // '&' must be replaced first so the entities added afterwards are
        // not escaped a second time.
        return s.replaceAll("&", "&amp;").replaceAll("<", "&lt;").replaceAll(
                ">", "&gt;").replaceAll("\"", "&quot;");
    }

    /**
     * Removes HTML tags from the string. A tag is a '&lt;', an optional '/',
     * one or more ASCII letters, and everything up to the closing '&gt;' on
     * the same line (or up to the end of the input if the tag is never
     * closed). A '&lt;' that is not followed by a letter is left alone.
     *
     * @param s
     *            String to strip, may be null
     * @return the String without tags, or null if s is null
     */
    public static String removeTags(String s) {
        if (s == null) {
            return null;
        }
        return s.replaceAll("</?[a-zA-Z]+.*?(>|$)", "");
    }

    /**
     * Trims leading and trailing whitespace (space, tab, CR, LF) and turns
     * every blank-line separator (two or more line breaks, with optional
     * carriage returns) into a paragraph break
     * <code>"\n&lt;/p&gt;&lt;p&gt;\n"</code>. Single line breaks are kept.
     *
     * @param text
     *            text to convert, may be null
     * @return the converted text, or null if text is null
     */
    public static String convertBreaks(String text) {
        if (text == null) {
            return null;
        }
        text = text.replaceFirst("[\\n\\r \\t]+$", "");
        text = text.replaceFirst("^[\\n\\r \\t]+", "");
        text = text.replaceAll("\\r*\\n\\r*\\n[\\n\\r]*", "\n</p><p>\n");
        return text;
    }
}