View Javadoc

1   /*
2    *  Licensed to the Apache Software Foundation (ASF) under one
3    *  or more contributor license agreements.  See the NOTICE file
4    *  distributed with this work for additional information
5    *  regarding copyright ownership.  The ASF licenses this file
6    *  to you under the Apache License, Version 2.0 (the
7    *  "License"); you may not use this file except in compliance
8    *  with the License.  You may obtain a copy of the License at
9    * 
10   *  http://www.apache.org/licenses/LICENSE-2.0
11   * 
12   *  Unless required by applicable law or agreed to in writing,
13   *  software distributed under the License is distributed on an
14   *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   *  KIND, either express or implied.  See the License for the
16   *  specific language governing permissions and limitations
17   *  under the License.
18   */
19  package org.apache.myfaces.shared.renderkit.html.util;
20  
21  import java.io.ByteArrayOutputStream;
22  import java.io.IOException;
23  import java.io.OutputStreamWriter;
24  import java.io.Writer;
25  
/**
 * Converts Strings so that they can be used within HTML-Code.
 * <p>
 * Offers two families of static, stateless helpers: the {@code encode} overloads escape
 * text for inclusion in HTML markup, and {@link #encodeURIAtributte(String, String)}
 * escapes a URI that is written into an HTML attribute.
 */
public abstract class HTMLEncoder
{
    private static final String HEX_CHARSET = "0123456789ABCDEF";

    private static final String UTF8 = "UTF-8";

    /**
     * Variant of {@link #encode(String, boolean, boolean, boolean)} where encodeNewline is
     * false and both encodeSubsequentBlanksToNbsp and encodeNonLatin are true.
     *
     * @param string the string to convert, may be null
     * @return the encoded string, or the empty string if {@code string} is null
     */
    public static String encode(String string)
    {
        return encode(string, false, true);
    }

    /**
     * Variant of {@link #encode(String, boolean, boolean, boolean)} where both
     * encodeSubsequentBlanksToNbsp and encodeNonLatin are true.
     *
     * @param string the string to convert, may be null
     * @param encodeNewline if true newline characters are converted to &lt;br/&gt;
     * @return the encoded string, or the empty string if {@code string} is null
     */
    public static String encode(String string, boolean encodeNewline)
    {
        return encode(string, encodeNewline, true);
    }

    /**
     * Variant of {@link #encode(String, boolean, boolean, boolean)} where encodeNonLatin
     * is true.
     *
     * @param string the string to convert, may be null
     * @param encodeNewline if true newline characters are converted to &lt;br/&gt;
     * @param encodeSubsequentBlanksToNbsp if true a blank at the beginning or directly
     *        after another blank is converted to &amp;#160;
     * @return the encoded string, or the empty string if {@code string} is null
     */
    public static String encode(String string, boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp)
    {
        return encode(string, encodeNewline, encodeSubsequentBlanksToNbsp, true);
    }

    /**
     * Encodes the given string, so that it can be used within a html page.
     *
     * @param string the string to convert, may be null
     * @param encodeNewline if true newline characters are converted to &lt;br/&gt;
     * @param encodeSubsequentBlanksToNbsp if true a blank at the beginning or directly
     *        after another blank is converted to &amp;#160;
     * @param encodeNonLatin if true characters above 0x80 are encoded as entities or
     *        numeric character references (a surrogate pair becomes one reference to
     *        its code point)
     * @return the encoded string; the same instance if nothing had to be encoded, or the
     *         empty string if {@code string} is null
     */
    public static String encode(String string,
                                boolean encodeNewline,
                                boolean encodeSubsequentBlanksToNbsp,
                                boolean encodeNonLatin)
    {
        if (string == null)
        {
            return "";
        }

        final int length = string.length();
        StringBuilder sb = null;    //create later on demand
        int i = 0;
        while (i < length)
        {
            // codePointAt yields the supplementary code point for a valid surrogate
            // pair and the plain char value otherwise (including unpaired surrogates).
            final int codePoint = string.codePointAt(i);
            final int consumed = Character.charCount(codePoint);
            final String app;
            if (consumed == 2)
            {
                app = encodeNonLatin ? ("&#" + codePoint + ";") : null;
            }
            else
            {
                //Space at beginning or after another space
                final boolean blankToNbsp = encodeSubsequentBlanksToNbsp
                        && (i == 0 || string.charAt(i - 1) == ' ');
                app = htmlReplacement((char) codePoint, blankToNbsp, encodeNewline, encodeNonLatin);
            }
            sb = appendEncoded(sb, string, i, consumed, app);
            i += consumed;
        }

        return sb == null ? string : sb.toString();
    }

    /**
     * Variant of {@link #encode(char[], int, int, boolean, boolean, boolean, Writer)} where
     * encodeNewline is false and both encodeSubsequentBlanksToNbsp and encodeNonLatin are true.
     *
     * @param string the characters to convert, may be null (nothing is written then)
     * @param offset index of the first character to convert
     * @param length maximum number of characters to convert
     * @param writer the writer receiving the encoded text
     * @throws IOException if the writer fails
     */
    public static void encode(char[] string, int offset, int length, Writer writer) throws IOException
    {
        encode(string, offset, length, false, true, writer);
    }

    /**
     * Variant of {@link #encode(char[], int, int, boolean, boolean, boolean, Writer)} where
     * both encodeSubsequentBlanksToNbsp and encodeNonLatin are true.
     *
     * @param string the characters to convert, may be null (nothing is written then)
     * @param offset index of the first character to convert
     * @param length maximum number of characters to convert
     * @param encodeNewline if true newline characters are converted to &lt;br/&gt;
     * @param writer the writer receiving the encoded text
     * @throws IOException if the writer fails
     */
    public static void encode(char[] string, int offset, int length, boolean encodeNewline, Writer writer)
        throws IOException
    {
        encode(string, offset, length, encodeNewline, true, writer);
    }

    /**
     * Variant of {@link #encode(char[], int, int, boolean, boolean, boolean, Writer)} where
     * encodeNonLatin is true.
     *
     * @param string the characters to convert, may be null (nothing is written then)
     * @param offset index of the first character to convert
     * @param length maximum number of characters to convert
     * @param encodeNewline if true newline characters are converted to &lt;br/&gt;
     * @param encodeSubsequentBlanksToNbsp if true a blank at the beginning of the range or
     *        directly after another blank is converted to &amp;#160;
     * @param writer the writer receiving the encoded text
     * @throws IOException if the writer fails
     */
    public static void encode(char[] string, int offset, int length, boolean encodeNewline,
                              boolean encodeSubsequentBlanksToNbsp, Writer writer) throws IOException
    {
        encode(string, offset, length, encodeNewline, encodeSubsequentBlanksToNbsp, true, writer);
    }

    /**
     * Encodes a range of the given character array, so that it can be used within a html
     * page, and writes the result to the writer. The output equals what
     * {@link #encode(String, boolean, boolean, boolean)} produces for the same characters.
     * <p>
     * Nothing is written if {@code string} is null, {@code length} is negative or
     * {@code offset} lies at or beyond the end of the array. A negative offset is treated
     * as 0 and the length is clipped to the end of the array.
     *
     * @param string the characters to convert
     * @param offset index of the first character to convert
     * @param length maximum number of characters to convert
     * @param encodeNewline if true newline characters are converted to &lt;br/&gt;
     * @param encodeSubsequentBlanksToNbsp if true a blank at the beginning of the range or
     *        directly after another blank is converted to &amp;#160;
     * @param encodeNonLatin if true characters above 0x80 are encoded as entities or
     *        numeric character references
     * @param writer the writer receiving the encoded text
     * @throws IOException if the writer fails
     */
    public static void encode(char[] string, int offset, int length,
                              boolean encodeNewline,
                              boolean encodeSubsequentBlanksToNbsp,
                              boolean encodeNonLatin, Writer writer) throws IOException
    {
        if (string == null || length < 0 || offset >= string.length)
        {
            return;
        }
        offset = Math.max(0, offset);
        final int realLength = Math.min(length, string.length - offset);
        final int end = offset + realLength;

        StringBuilder sb = null;    //create later on demand
        int i = offset;
        while (i < end)
        {
            // The limit keeps a high surrogate at the end of the range from being paired
            // with a character that lies outside of it.
            final int codePoint = Character.codePointAt(string, i, end);
            final int consumed = Character.charCount(codePoint);
            final String app;
            if (consumed == 2)
            {
                app = encodeNonLatin ? ("&#" + codePoint + ";") : null;
            }
            else
            {
                //Space at beginning of the range or after another space; characters
                //in front of offset are not part of the text and must not be inspected.
                final boolean blankToNbsp = encodeSubsequentBlanksToNbsp
                        && (i == offset || string[i - 1] == ' ');
                app = htmlReplacement((char) codePoint, blankToNbsp, encodeNewline, encodeNonLatin);
            }

            if (app != null)
            {
                if (sb == null)
                {
                    sb = new StringBuilder(realLength * 2);
                    sb.append(string, offset, i - offset);
                }
                sb.append(app);
            }
            else if (sb != null)
            {
                sb.append(string, i, consumed);
            }
            i += consumed;
        }

        if (sb == null)
        {
            // Nothing had to be encoded: write the range unchanged.
            writer.write(string, offset, realLength);
        }
        else
        {
            writer.write(sb.toString());
        }
    }

    /**
     * Returns the HTML replacement for a single char, or null if the char is written as is.
     *
     * @param c the char to inspect
     * @param blankToNbsp whether a blank at this position has to become &amp;#160;
     * @param encodeNewline whether a newline has to become &lt;br/&gt;
     * @param encodeNonLatin whether chars above 0x80 have to be encoded
     */
    private static String htmlReplacement(char c, boolean blankToNbsp,
                                          boolean encodeNewline, boolean encodeNonLatin)
    {
        // All characters before letters
        if (c < 0x41)
        {
            switch (c)
            {
                case '"': return "&quot;";
                case '&': return "&amp;";
                case '<': return "&lt;";
                case '>': return "&gt;";
                case ' ': return blankToNbsp ? "&#160;" : null;
                case '\n': return encodeNewline ? "<br/>" : null;
                default: return null;
            }
        }
        if (encodeNonLatin && c > 0x80)
        {
            switch (c)
            {
                //german umlauts
                case '\u00E4': return "&auml;";
                case '\u00C4': return "&Auml;";
                case '\u00F6': return "&ouml;";
                case '\u00D6': return "&Ouml;";
                case '\u00FC': return "&uuml;";
                case '\u00DC': return "&Uuml;";
                case '\u00DF': return "&szlig;";

                //misc
                //0x80 is sometimes used for the euro symbol, but it is deliberately not mapped
                case '\u20AC': return "&euro;";
                case '\u00AB': return "&laquo;";
                case '\u00BB': return "&raquo;";
                case '\u00A0': return "&#160;";

                default:
                    //encode all non basic latin characters
                    return "&#" + ((int) c) + ";";
            }
        }
        return null;
    }

    /**
     * Appends the outcome for {@code consumed} chars of {@code source} starting at
     * {@code index}: the replacement if there is one, otherwise the original chars.
     * The builder is created on demand, pre-filled with everything before {@code index},
     * so that unchanged input never allocates.
     *
     * @return the builder to use from now on, still null if nothing was encoded so far
     */
    private static StringBuilder appendEncoded(StringBuilder sb, String source, int index,
                                               int consumed, String app)
    {
        if (app == null)
        {
            if (sb != null)
            {
                sb.append(source, index, index + consumed);
            }
            return sb;
        }
        if (sb == null)
        {
            sb = new StringBuilder(source.length() + 16);
            sb.append(source, 0, index);
        }
        return sb.append(app);
    }

    /**
     * Encode an URI, escaping or percent-encoding all required characters and
     * following the rules mentioned on RFC 3986.
     * <p>
     * Everything up to the first "?" or "#" is percent-encoded using UTF-8; the
     * remainder (query / fragment) is percent-encoded using {@code characterEncoding}
     * and every "&amp;" that does not already start "&amp;amp;" is written as "&amp;amp;".
     * A "%" that is followed by two hexadecimal digits is assumed to be already encoded
     * and left untouched.
     *
     * @param string the URI to encode, must not be null
     * @param characterEncoding the document character encoding, used for the part after
     *        the first "?" or "#"; if it is not supported, non-ASCII characters of that
     *        part are left unencoded
     * @return the encoded URI; the same instance if nothing had to be encoded
     * @throws IOException declared for compatibility with existing callers
     */
    public static String encodeURIAtributte(final String string, final String characterEncoding)
        throws IOException
    {
        // These are the guidelines to be taken into account by this algorithm to encode:

        // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters
        //
        // control     = <US-ASCII coded characters 00-1F and 7F hexadecimal>
        // space       = <US-ASCII coded character 20 hexadecimal>
        // delims      = "<" | ">" | "#" | "%" | <">
        //               %3C   %3E   %23   %25   %22
        // unwise      = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
        //               %7B   %7D   %7C   %5C   %5E   %5B   %5D   %60
        //
        // ".... Data corresponding to excluded characters must be escaped in order to
        // be properly represented within a URI....."

        // RFC 3986 Section 3.  Syntax Components
        //
        // "... The generic URI syntax consists of a hierarchical sequence of
        // components referred to as the scheme, authority, path, query, and
        // fragment.
        //
        //   URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
        //
        //   hier-part   = "//" authority path-abempty
        //               / path-absolute
        //               / path-rootless
        //               / path-empty
        // ...."

        // RFC 3986 Section 2.2:
        // Reserved characters (should not be percent-encoded)
        // reserved    = gen-delims / sub-delims
        // gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
        //               %3A   %2F   %3F   %23   %5B   %5D   %40
        // sub-delims  = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
        //               %21   %24   %26   %27   %28   %29   %2A   %2B   %2C   %3B   %3D

        // Note that chars "[" and "]" are mentioned as they should be escaped on RFC 2396,
        // but on the part D. Changes from RFC 2396 says about these chars (used on IPv6)
        // "...those rules were redefined to directly specify the characters allowed...."
        // There are also other characters moved from excluded list to reserved:
        // "[" / "]" / "#"

        // RFC 3986 Section 2.3:
        // "... for consistency, percent-encoded octets in the ranges of ALPHA
        // (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
        // underscore (%5F), or tilde (%7E) should not be created by URI
        // producers...."

        // RFC 3986 Section  3.2.2.  Host
        //
        // host = IP-literal / IPv4address / reg-name
        //
        // The reg-name syntax allows percent-encoded octets in order to
        // represent non-ASCII registered names in a uniform way that is
        // independent of the underlying name resolution technology.  Non-ASCII
        // characters must first be encoded according to UTF-8 [STD63], and then
        // each octet of the corresponding UTF-8 sequence must be percent-
        // encoded to be represented as URI characters.  URI producing
        // applications must not use percent-encoding in host unless it is used
        // to represent a UTF-8 character sequence.

        // RFC 3986 Section 3.4 Query
        //         query       = *( pchar / "/" / "?" )
        //
        // "...  However, as query components are often used to carry identifying information
        // in the form of "key=value" pairs and one frequently used value is a reference to
        // another URI, it is sometimes better for usability to avoid percent-encoding those
        // characters....."
        //
        // RFC 3986 Section 2.5 Identifying Data (Apply to query section)
        //
        // When a new URI scheme defines a component that represents textual
        // data consisting of characters from the Universal Character Set [UCS],
        // the data should first be encoded as octets according to the UTF-8
        // character encoding [STD63]; then only those octets that do not
        // correspond to characters in the unreserved set should be percent-
        // encoded.  For example, the character A would be represented as "A",
        // the character LATIN CAPITAL LETTER A WITH GRAVE would be represented
        // as "%C3%80", and the character KATAKANA LETTER A would be represented
        // as "%E3%82%A2".
        //
        // RFC 3986 Section 3.5 Fragment
        //         fragment    = *( pchar / "/" / "?" )
        //
        // Note that follows the same as query

        // Based on the extracts the strategy to apply on this method is:
        //
        // On scheme ":" hier-part
        //
        // Escape or percent encode chars inside :
        //
        // - From %00 to %20,
        // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of
        //                     duplicate encoding, encode it when we are sure
        //                     that there are not encoded twice)
        // - "<" %3C, ">" %3E
        // - "\" %5C, "^" %5E, "`" %60
        // - "{" %7B, "|" %7C, "}" %7D
        // - From %7F ad infinitum (characters from %100 to infinitum should not be used in this
        //   part of an URI, but it is preferred to encode it that omit it).
        //
        // The remaining characters must not be encoded
        //
        // Characters after ? or # should be percent encoding but only the necessary ones:
        //
        // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
        // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of
        //                     duplicate encoding, encode it when we are sure
        //                     that there are not encoded twice)
        // - "<" %3C, ">" %3E,
        // - "\" %5C, "^" %5E, "`" %60
        // - "{" %7B, "|" %7C, "}" %7D
        // - From %7F ad infinitum (each character as many bytes as necessary but take into account
        //   that a single char should contain 2,3 or more bytes!. This data should be encoded
        //   translating from the document character encoding to percent encoding, because this values
        //   could be retrieved from httpRequest.getParameter() and it uses the current character encoding
        //   for decode values)
        //
        // "&" should be encoded as "&amp;" because this link is inside an html page, and
        // put only & is invalid in this context.

        final int length = string.length();
        StringBuilder sb = null;    //create later on demand
        int i = 0;
        while (i < length)
        {
            final char c = string.charAt(i);
            int consumed = 1;
            String app = null;

            if (isExcludedUriChar(c))
            {
                // The percent encoding on this part should be done using UTF-8 charset
                // as RFC 3986 Section 3.2.2 says.
                // Also there is a reference on
                // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars
                // that recommend use of UTF-8 instead the document character encoding.
                // Jetty set by default UTF-8 (see http://jira.codehaus.org/browse/JETTY-113)
                consumed = Character.charCount(string.codePointAt(i));
                app = percentEncode(string, i, consumed, UTF8);
            }
            else if (c == '%')
            {
                // do not percent encode what could be already encoded,
                // because we don't want to encode it twice
                if (!isPercentEncodedOctetAt(string, i))
                {
                    app = percentEncodeAscii(c);
                }
            }
            else if ((c == '?' || c == '#') && i + 1 < length)
            {
                // The remaining part of the URI are data that should be encoded
                // using the document character encoding.
                if (sb == null)
                {
                    sb = new StringBuilder(length + 16);
                    sb.append(string, 0, i);
                }
                sb.append(c).append(encodeURIQuery(string.substring(i + 1), characterEncoding));
                return sb.toString();
            }
            //else: no encoding, the char is copied unchanged below

            sb = appendEncoded(sb, string, i, consumed, app);
            i += consumed;
        }
        return sb == null ? string : sb.toString();
    }

    /**
     * Encode the query part using the document charset encoding provided.
     * <p>
     * Percent-encodes the same characters as the hierarchical part, but with
     * {@code characterEncoding} for non-ASCII characters, and additionally writes
     * "&amp;" as "&amp;amp;" because the link lives inside an html page.
     *
     * @param string the part of the URI after the first "?" or "#"
     * @param characterEncoding the document character encoding
     * @return the encoded query; the same instance if nothing had to be encoded
     */
    private static String encodeURIQuery(final String string, final String characterEncoding)
    {
        final int length = string.length();
        StringBuilder sb = null;    //create later on demand
        int i = 0;
        while (i < length)
        {
            final char c = string.charAt(i);
            int consumed = 1;
            String app = null;

            if (isExcludedUriChar(c))
            {
                // Encoded with the document encoding, because the values could be read
                // back through the request parameters using that same encoding.
                consumed = Character.charCount(string.codePointAt(i));
                app = percentEncode(string, i, consumed, characterEncoding);
            }
            else if (c == '%')
            {
                // do not percent encode what could be already encoded
                if (!isPercentEncodedOctetAt(string, i))
                {
                    app = percentEncodeAscii(c);
                }
            }
            else if (c == '&')
            {
                // An already escaped "&amp;" is skipped, a bare "&" gets escaped.
                if (!string.startsWith("amp;", i + 1))
                {
                    app = "&amp;";
                }
            }
            //else: no encoding, the char is copied unchanged below

            sb = appendEncoded(sb, string, i, consumed, app);
            i += consumed;
        }
        return sb == null ? string : sb.toString();
    }

    /**
     * Tells whether the char always has to be percent-encoded inside an URI: controls and
     * space (0x00-0x20), everything from 0x7F upwards, and the delimiter / unwise
     * characters listed in the strategy of {@link #encodeURIAtributte(String, String)}.
     */
    private static boolean isExcludedUriChar(char c)
    {
        return c <= 0x20 || c >= 0x7F
                || c == '"' || c == '<' || c == '>'
                || c == '\\' || c == '^' || c == '`'
                || c == '{' || c == '|' || c == '}';
    }

    /**
     * Tells whether the "%" at {@code index} is followed by two hexadecimal digits, that
     * is whether it looks like an already percent-encoded octet. RFC 3986 allows both
     * uppercase and lowercase hexadecimal digits.
     */
    private static boolean isPercentEncodedOctetAt(String string, int index)
    {
        return index + 2 < string.length()
                && isHexDigit(string.charAt(index + 1))
                && isHexDigit(string.charAt(index + 2));
    }

    private static boolean isHexDigit(char c)
    {
        return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
    }

    /**
     * Percent-encodes a US-ASCII char (0x00-0x7F range) as "%" plus two uppercase
     * hexadecimal digits.
     */
    private static String percentEncodeAscii(char c)
    {
        return "%" + HEX_CHARSET.charAt((c >> 4) & 0xF) + HEX_CHARSET.charAt(c & 0xF);
    }

    /**
     * Percent-encodes the character made of {@code consumed} chars of {@code source}
     * starting at {@code index}. US-ASCII chars are encoded directly; everything else is
     * first converted to bytes using {@code characterEncoding}. A surrogate pair is
     * converted as a whole, so a supplementary character yields the bytes of one character.
     *
     * @return the percent-encoded form, or null if {@code characterEncoding} is not
     *         supported (callers then leave the character unencoded)
     */
    private static String percentEncode(String source, int index, int consumed, String characterEncoding)
    {
        final char c = source.charAt(index);
        if (consumed == 1 && c <= 0x7F)
        {
            return percentEncodeAscii(c);
        }

        final byte[] bytes;
        try
        {
            bytes = source.substring(index, index + consumed).getBytes(characterEncoding);
        }
        catch (IOException e)
        {
            // Unsupported encoding: best effort, the caller keeps the character as is.
            return null;
        }

        final StringBuilder builder = new StringBuilder(bytes.length * 3);
        for (byte b : bytes)
        {
            builder.append('%');
            builder.append(HEX_CHARSET.charAt((b & 0xFF) >> 4));
            builder.append(HEX_CHARSET.charAt(b & 0x0F));
        }
        return builder.toString();
    }
}