View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.myfaces.shared.renderkit.html.util;
20  
21  import java.io.ByteArrayOutputStream;
22  import java.io.IOException;
23  import java.io.OutputStreamWriter;
24  import java.io.Writer;
25  
26  /**
27   * Converts Strings so that they can be used within HTML-Code.
28   */
29  public abstract class HTMLEncoder
30  {
31      /**
32       * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true.
33       */
34      public static String encode (String string)
35      {
36          return encode(string, false, true);
37      }
38  
39      /**
40       * Variant of {@link #encode} where encodeNbsp is true.
41       */
42      public static String encode (String string, boolean encodeNewline)
43      {
44          return encode(string, encodeNewline, true);
45      }
46  
47      /**
48       * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true 
49       */
50      public static String encode (String string, boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp)
51      {
52          return encode(string, encodeNewline, encodeSubsequentBlanksToNbsp, true);
53      }
54  
55      /**
56       * Encodes the given string, so that it can be used within a html page.
57       * @param string the string to convert
58       * @param encodeNewline if true newline characters are converted to <br>'s
59       * @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to  's
60       * @param encodeNonLatin if true encode non-latin characters as numeric character references
61       */
62      public static String encode (String string,
63                                   boolean encodeNewline,
64                                   boolean encodeSubsequentBlanksToNbsp,
65                                   boolean encodeNonLatin)
66      {
67          if (string == null)
68          {
69              return "";
70          }
71  
72          StringBuilder sb = null;    //create later on demand
73          String app;
74          char c;
75          for (int i = 0; i < string.length (); ++i)
76          {
77              app = null;
78              c = string.charAt(i);
79              
80              // All characters before letters
81              if ((int)c < 0x41)
82              {
83                  switch (c)
84                  {
85                      case '"': app = "&quot;"; break;    //"
86                      case '&': app = "&amp;"; break;     //&
87                      case '<': app = "&lt;"; break;      //<
88                      case '>': app = "&gt;"; break;      //>
89                      case ' ':
90                          if (encodeSubsequentBlanksToNbsp &&
91                                  (i == 0 || (i - 1 >= 0 && string.charAt(i - 1) == ' ')))
92                          {
93                              //Space at beginning or after another space
94                              app = "&#160;";
95                          }
96                          break;
97                      case '\n':
98                          if (encodeNewline)
99                          {
100                             app = "<br/>";
101                         }
102                         break;
103                     default:
104                         break;
105                 }
106             }
107             else if (encodeNonLatin && (int)c > 0x80)
108             {
109                  switch(c)
110                  {
111                     //german umlauts
112                     case '\u00E4' : app = "&auml;";  break;
113                     case '\u00C4' : app = "&Auml;";  break;
114                     case '\u00F6' : app = "&ouml;";  break;
115                     case '\u00D6' : app = "&Ouml;";  break;
116                     case '\u00FC' : app = "&uuml;";  break;
117                     case '\u00DC' : app = "&Uuml;";  break;
118                     case '\u00DF' : app = "&szlig;"; break;
119 
120                     //misc
121                     //case 0x80: app = "&euro;"; break;  sometimes euro symbol is ascii 128, should we suport it?
122                     case '\u20AC': app = "&euro;";  break;
123                     case '\u00AB': app = "&laquo;"; break;
124                     case '\u00BB': app = "&raquo;"; break;
125                     case '\u00A0': app = "&#160;"; break;
126 
127                     default :
128                         //encode all non basic latin characters
129                         app = "&#" + ((int)c) + ";";
130                     break;
131                 }
132             }
133             if (app != null)
134             {
135                 if (sb == null)
136                 {
137                     sb = new StringBuilder(string.substring(0, i));
138                 }
139                 sb.append(app);
140             }
141             else
142             {
143                 if (sb != null)
144                 {
145                     sb.append(c);
146                 }
147             }
148         }
149 
150         if (sb == null)
151         {
152             return string;
153         }
154         else
155         {
156             return sb.toString();
157         }
158     }
159     
160     /**
161      * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true.
162      */
163     public static void encode (Writer writer, String string) throws IOException
164     {
165         encode(writer, string, false, true);
166     }
167 
168     /**
169      * Variant of {@link #encode} where encodeNbsp is true.
170      */
171     public static void encode (Writer writer, String string, boolean encodeNewline) throws IOException
172     {
173         encode(writer, string, encodeNewline, true);
174     }
175 
176     /**
177      * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true 
178      */
179     public static void encode (Writer writer, String string, 
180             boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp) throws IOException
181     {
182         encode(writer, string, encodeNewline, encodeSubsequentBlanksToNbsp, true);
183     }
184     
185     public static void encode (Writer writer, String string,
186                                  boolean encodeNewline,
187                                  boolean encodeSubsequentBlanksToNbsp,
188                                  boolean encodeNonLatin) throws IOException
189     {
190         if (string == null)
191         {
192             return;
193         }
194 
195         int start = 0;
196         String app;
197         char c;
198         for (int i = 0; i < string.length (); ++i)
199         {
200             app = null;
201             c = string.charAt(i);
202             
203             // All characters before letters
204             if ((int)c < 0x41)
205             {
206                 switch (c)
207                 {
208                     case '"': app = "&quot;"; break;    //"
209                     case '&': app = "&amp;"; break;     //&
210                     case '<': app = "&lt;"; break;      //<
211                     case '>': app = "&gt;"; break;      //>
212                     case ' ':
213                         if (encodeSubsequentBlanksToNbsp &&
214                                 (i == 0 || (i - 1 >= 0 && string.charAt(i - 1) == ' ')))
215                         {
216                             //Space at beginning or after another space
217                             app = "&#160;";
218                         }
219                         break;
220                     case '\n':
221                         if (encodeNewline)
222                         {
223                             app = "<br/>";
224                         }
225                         break;
226                     default:
227                         break;
228                 }
229             }
230             else if (encodeNonLatin && (int)c > 0x80)
231             {
232                  switch(c)
233                  {
234                     //german umlauts
235                     case '\u00E4' : app = "&auml;";  break;
236                     case '\u00C4' : app = "&Auml;";  break;
237                     case '\u00F6' : app = "&ouml;";  break;
238                     case '\u00D6' : app = "&Ouml;";  break;
239                     case '\u00FC' : app = "&uuml;";  break;
240                     case '\u00DC' : app = "&Uuml;";  break;
241                     case '\u00DF' : app = "&szlig;"; break;
242 
243                     //misc
244                     //case 0x80: app = "&euro;"; break;  sometimes euro symbol is ascii 128, should we suport it?
245                     case '\u20AC': app = "&euro;";  break;
246                     case '\u00AB': app = "&laquo;"; break;
247                     case '\u00BB': app = "&raquo;"; break;
248                     case '\u00A0': app = "&#160;"; break;
249 
250                     default :
251                         //encode all non basic latin characters
252                         app = "&#" + ((int)c) + ";";
253                     break;
254                 }
255             }
256             if (app != null)
257             {
258                 //if (sb == null)
259                 //{
260                 //    sb = new StringBuilder(string.substring(0, i));
261                 //}
262                 //sb.append(app);
263                 if (start < i)
264                 {
265                     writer.write(string, start, i-start);
266                 }
267                 start = i+1;
268                 writer.write(app);
269             }
270             //else
271             //{
272             //    if (sb != null)
273             //    {
274             //        sb.append(c);
275             //    }
276             //}
277         }
278 
279         //if (sb == null)
280         //{
281         //    return string;
282         //}
283         //else
284         //{
285         //    return sb.toString();
286         //}
287         if (start == 0)
288         {
289             writer.write(string);
290         }
291         else if (start < string.length())
292         {
293             writer.write(string,start,string.length()-start);
294         }
295     }
296 
297 
298     /**
299      * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true.
300      */
301     public static void encode (char[] string, int offset, int length, Writer writer) throws IOException
302     {
303         encode(string, offset, length, false, true, writer);
304     }
305 
306     /**
307      * Variant of {@link #encode} where encodeNbsp is true.
308      */
309     public static void encode (char[] string, int offset, int length, boolean encodeNewline, Writer writer)
310         throws IOException
311     {
312         encode(string, offset, length, encodeNewline, true, writer);
313     }
314 
315     /**
316      * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true 
317      */
318     public static void encode (char[] string, int offset, int length, boolean encodeNewline, 
319             boolean encodeSubsequentBlanksToNbsp, Writer writer) throws IOException
320     {
321         encode(string, offset, length, encodeNewline, encodeSubsequentBlanksToNbsp, true, writer);
322     }
323 
324 
325     /**
326      * Encodes the given string, so that it can be used within a html page.
327      * @param string the string to convert
328      * @param encodeNewline if true newline characters are converted to &lt;br&gt;'s
329      * @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to &amp;nbsp;'s
330      * @param encodeNonLatin if true encode non-latin characters as numeric character references
331      */
332     public static void encode (char[] string, int offset, int length,
333                                  boolean encodeNewline,
334                                  boolean encodeSubsequentBlanksToNbsp,
335                                  boolean encodeNonLatin, Writer writer) throws IOException
336     {
337         if (string == null || length < 0 || offset >= string.length)
338         {
339             return;
340         }
341         offset = Math.max(0, offset);
342         int realLength = Math.min(length, string.length - offset);
343 
344         //StringBuilder sb = null;    //create later on demand
345         String app;
346         char c;
347         int start = offset;
348         
349         for (int i = offset; i < offset + realLength; ++i)
350         {
351             app = null;
352             c = string[i];
353 
354             // All characters before letters
355             if ((int)c < 0x41)
356             {
357                 switch (c)
358                 {
359                     case '"': app = "&quot;"; break;    //"
360                     case '&': app = "&amp;"; break;     //&
361                     case '<': app = "&lt;"; break;      //<
362                     case '>': app = "&gt;"; break;      //>
363                     case ' ':
364                         if (encodeSubsequentBlanksToNbsp &&
365                                 (i == 0 || (i - 1 >= 0 && string[i - 1] == ' ')))
366                         {
367                             //Space at beginning or after another space
368                             app = "&#160;";
369                         }
370                         break;
371                     case '\n':
372                         if (encodeNewline)
373                         {
374                             app = "<br/>";
375                         }
376                         break;
377                     default:
378                         break;
379                 }
380             }
381             else if (encodeNonLatin && (int)c > 0x80)
382             {
383                  switch(c)
384                  {
385                     //german umlauts
386                     case '\u00E4' : app = "&auml;";  break;
387                     case '\u00C4' : app = "&Auml;";  break;
388                     case '\u00F6' : app = "&ouml;";  break;
389                     case '\u00D6' : app = "&Ouml;";  break;
390                     case '\u00FC' : app = "&uuml;";  break;
391                     case '\u00DC' : app = "&Uuml;";  break;
392                     case '\u00DF' : app = "&szlig;"; break;
393 
394                     //misc
395                     //case 0x80: app = "&euro;"; break;  sometimes euro symbol is ascii 128, should we suport it?
396                     case '\u20AC': app = "&euro;";  break;
397                     case '\u00AB': app = "&laquo;"; break;
398                     case '\u00BB': app = "&raquo;"; break;
399                     case '\u00A0': app = "&#160;"; break;
400 
401                     default :
402                         //encode all non basic latin characters
403                         app = "&#" + ((int)c) + ";";
404                     break;
405                 }
406             }
407             if (app != null)
408             {
409                 //if (sb == null)
410                 //{
411                 //    sb = new StringBuilder(realLength*2);
412                 //    sb.append(string, offset, i - offset);
413                 //}
414                 //sb.append(app);
415                 if (start < i)
416                 {
417                     writer.write(string, start, i-start);
418                 }
419                 start = i+1;
420                 writer.write(app);
421             }
422             /*
423             else
424             {
425                 if (sb != null)
426                 {
427                     sb.append(c);
428                 }
429             }*/
430         }
431 
432         //if (sb == null)
433         //{
434         //    writer.write(string, offset, realLength);
435         //}
436         //else
437         //{
438         //    writer.write(sb.toString());
439         //}
440         if (start == offset)
441         {
442             writer.write(string, offset, realLength);
443         }
444         else if (start < offset+realLength)
445         {
446             writer.write(string,start,offset+realLength-start);
447         }
448     }
449     
450     private static final String HEX_CHARSET = "0123456789ABCDEF"; // upper-case hex digits, indexed by a 4-bit value when building "%XX" escapes
451     
452     private static final String UTF8 = "UTF-8"; // charset name handed to percentEncode for the part of a URI before '?' or '#'
453     
454     /**
455      * Encode an URI, escaping or percent-encoding all required characters and
456      * following the rules mentioned on RFC 3986.  
457      * 
458      * @param string
459      * @param encodeNonLatin
460      * @return
461      * @throws IOException
462      */
463     public static String encodeURIAtributte(final String string, final String characterEncoding)
464         throws IOException
465     {
466         StringBuilder sb = null;    //create later on demand
467         String app;
468         char c;
469         boolean endLoop = false;
470         for (int i = 0; i < string.length (); ++i)
471         {
472             app = null;
473             c = string.charAt(i);
474             
475             // This are the guidelines to be taken into account by this algorithm to encode:
476             
477             // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters
478             //
479             // control     = <US-ASCII coded characters 00-1F and 7F hexadecimal>
480             // space       = <US-ASCII coded character 20 hexadecimal>
481             // delims      = "<" | ">" | "#" | "%" | <">
482             //               %3C   %3E   %23   %25   %22
483             // unwise      = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
484             //               %7D   %7B   %7C   %5C   %5E   %5B   %5D   %60
485             //
486             // ".... Data corresponding to excluded characters must be escaped in order to
487             // be properly represented within a URI....."
488             
489             // RFC 3986 Section 3.  Syntax Components
490             //
491             // "... The generic URI syntax consists of a hierarchical sequence of
492             // components referred to as the scheme, authority, path, query, and
493             // fragment.
494             //
495             //   URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
496             //
497             //   hier-part   = "//" authority path-abempty
498             //               / path-absolute
499             //               / path-rootless
500             //               / path-empty
501             // ...."
502             
503             // RFC 3986 Section 2.2:
504             // Reserved characters (should not be percent-encoded)
505             // reserved    = gen-delims / sub-delims
506             // gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
507             //               %3A   %2F   %3F   %23   %5B   %5D   %40
508             // sub-delims  = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
509             //               %21   %24   %26   %27   %28   %29   %2A   %2B   %2C   %3B   %3D
510             
511             // Note than chars "[" and "]" are mentioned as they should be escaped on RFC 2396,
512             // but on the part D. Changes from RFC 2396 says about this chars (used on IPv6) 
513             // "...those rules were redefined to directly specify the characters allowed...."
514             // There is also other characters moved from excluded list to reserved:
515             // "[" / "]" / "#"  
516             
517             // RFC 3986 Section 2.3:
518             // "... for consistency, percent-encoded octets in the ranges of ALPHA
519             // (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
520             // underscore (%5F), or tilde (%7E) should not be created by URI
521             // producers...."
522             
523             // RFC 3986 Section  3.2.2.  Host
524 
525             // host = IP-literal / IPv4address / reg-name
526 
527             // The reg-name syntax allows percent-encoded octets in order to
528             // represent non-ASCII registered names in a uniform way that is
529             // independent of the underlying name resolution technology.  Non-ASCII
530             // characters must first be encoded according to UTF-8 [STD63], and then
531             // each octet of the corresponding UTF-8 sequence must be percent-
532             // encoded to be represented as URI characters.  URI producing
533             // applications must not use percent-encoding in host unless it is used
534             // to represent a UTF-8 character sequence.
535             
536             // RFC 3986 Section 3.4 Query 
537             //         query       = *( pchar / "/" / "?" )
538             //
539             // "...  However, as query components are often used to carry identifying information 
540             // in the form of "key=value" pairs and one frequently used value is a reference to
541             // another URI, it is sometimes better for usability to avoid percent-encoding those characters....."
542             //
543             // RFC 3986 Section 2.5 Identifying Data (Apply to query section)
544             //
545             // When a new URI scheme defines a component that represents textual
546             // data consisting of characters from the Universal Character Set [UCS],
547             // the data should first be encoded as octets according to the UTF-8
548             // character encoding [STD63]; then only those octets that do not
549             // correspond to characters in the unreserved set should be percent-
550             // encoded.  For example, the character A would be represented as "A",
551             // the character LATIN CAPITAL LETTER A WITH GRAVE would be represented
552             // as "%C3%80", and the character KATAKANA LETTER A would be represented
553             // as "%E3%82%A2".
554             //
555             // RFC 3986 Section 3.5 Fragment
556             //         fragment    = *( pchar / "/" / "?" )
557             //
558             // Note that follows the same as query
559             
560             // Based on the extracts the strategy to apply on this method is:
561             // 
562             // On scheme ":" hier-part
563             //
564             // Escape or percent encode chars inside :
565             // 
566             // - From %00 to %20, 
567             // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 
568             //                     duplicate encoding, encode it when we are sure 
569             //                     that there are not encoded twice)
570             // - "<" %3C, ">" %3E
571             // - "\" %5C, "^" %5E, "`" %60 
572             // - "{" %7B, "|" %7C, "}" %7D
573             // - From %7F ad infinitum (characters from %100 to infinitum should not be used in this
574             //   part of an URI, but it is preferred to encode it that omit it).
575             //
576             // The remaining characters must not be encoded
577             //
578             // Characters after ? or # should be percent encoding but only the necessary ones:
579             //
580             // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
581             // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 
582             //                     duplicate encoding, encode it when we are sure 
583             //                     that there are not encoded twice)
584             // - "<" %3C, ">" %3E,
585             // - "\" %5C, "^" %5E, "`" %60 
586             // - "{" %7B, "|" %7C, "}" %7D
587             // - From %7F ad infinitum (each character as many bytes as necessary but take into account
588             //   that a single char should contain 2,3 or more bytes!. This data should be encoded 
589             //   translating from the document character encoding to percent encoding, because this values
590             //   could be retrieved from httpRequest.getParameter() and it uses the current character encoding
591             //   for decode values)
592             //
593             // "&" should be encoded as "&amp;" because this link is inside an html page, and 
594             // put only & is invalid in this context.
595 
596             if (   (c <= (char)0x20) || (c >= (char)0x7F) || 
597                     c == '"' || c == '<' ||
598                     c == '>' || c == '\\' || c == '^' || c == '`' ||
599                     c == '{' || c == '|' || c == '}')
600             {
601                 // The percent encoding on this part should be done using UTF-8 charset
602                 // as RFC 3986 Section 3.2.2 says.
603                 // Also there is a reference on 
604                 // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars
605                 // that recommend use of UTF-8 instead the document character encoding.
606                 // Jetty set by default UTF-8 (see http://jira.codehaus.org/browse/JETTY-113)
607                 app = percentEncode(c, "UTF-8");
608             }
609             else if (c == '%')
610             {
611                 if (i + 2 < string.length())
612                 {
613                     char c1 = string.charAt(i+1);
614                     char c2 = string.charAt(i+2);
615                     if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z')) &&
616                         (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z')))
617                     {
618                         // do not percent encode, because it could be already encoded
619                         // and we don't want encode it twice
620                     }
621                     else
622                     {
623                         app = percentEncode(c, UTF8);
624                     }
625                 }
626                 else
627                 {
628                     app = percentEncode(c, UTF8);
629                 }
630             }
631             else if (c == '?' || c == '#')
632             {
633                 if (i+1 < string.length())
634                 {
635                     // The remaining part of the URI are data that should be encoded
636                     // using the document character encoding.
637                     app = c + encodeURIQuery(string.substring(i+1), characterEncoding);
638                     endLoop = true;
639                 }
640             }
641             else
642             {
643                 //No encoding, just do nothing, char will be added later.
644             }
645                         
646             if (app != null)
647             {
648                 if (sb == null)
649                 {
650                     sb = new StringBuilder(string.substring(0, i));
651                 }
652                 sb.append(app);
653             }
654             else
655             {
656                 if (sb != null)
657                 {
658                     sb.append(c);
659                 }
660             }
661             if (endLoop)
662             {
663                 break;
664             }
665         }
666         if (sb == null)
667         {
668             return string;
669         }
670         else
671         {
672             return sb.toString();
673         }
674     }
675     
676     /**
677      * Encode a unicode char value in percentEncode, decoding its bytes using a specified 
678      * characterEncoding.
679      * 
680      * @param c
681      * @param characterEncoding
682      * @return
683      */
684     private static String percentEncode(char c, String characterEncoding)
685     {
686         String app = null;
687         if (c > (char)((short)0x007F))
688         {
689             //percent encode in the proper encoding to be consistent
690             app = percentEncodeNonUsAsciiCharacter(c, characterEncoding);
691         }
692         else
693         {
694             //percent encode US-ASCII char (0x00-0x7F range)
695             app = "%" + HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)) +HEX_CHARSET.charAt(c % 0x10);
696         }
697         return app;
698     }
699     
700     private static String percentEncodeNonUsAsciiCharacter(char c, String characterEncoding)
701     {
702         ByteArrayOutputStream baos = new ByteArrayOutputStream(10);
703         StringBuilder builder = new StringBuilder();
704         try
705         {
706             OutputStreamWriter writer = new OutputStreamWriter(baos,characterEncoding);
707             writer.write(c);
708             writer.flush();
709         }
710         catch(IOException e)
711         {
712             baos.reset();
713             return null;
714         }
715         
716         byte [] byteArray =  baos.toByteArray();
717         for (int i=0; i < byteArray.length; i++)
718         {
719             builder.append('%');
720             builder.append(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) );
721             builder.append(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10));
722         }
723         
724         return builder.toString();
725     }
726 
727     /**
728      * Encode the query part using the document charset encoding provided.
729      * 
730      * 
731      * @param string
732      * @param characterEncoding
733      * @return
734      */
735     private static String encodeURIQuery(final String string, final String characterEncoding)
736     {
737         StringBuilder sb = null;    //create later on demand
738         String app;
739         char c;
740         boolean endLoop = false;
741         for (int i = 0; i < string.length (); ++i)
742         {
743             app = null;
744             c = string.charAt(i);
745             
746             // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
747             // - <"> %22 (If there is encode of "%", there is a risk of duplicate encoding, so 
748             //            we make easier and omit this one)
749             // - "<" %3C, ">" %3E,
750             // - "\" %5C, "^" %5E, "`" %60 
751             // - "{" %7B, "|" %7C, "}" %7D
752             // - From %7F ad infinitum (each character as many bytes as necessary but take into account
753             //   that a single char should contain 2,3 or more bytes!. This data should be encoded 
754             //   translating from the document character encoding to percent encoding)
755             //
756             // "&" should be encoded as "&amp;" because this link is inside an html page, and 
757             // put & is invalid in this context   
758             
759             if (   (c <= (char)0x20) || (c >= (char)0x7F) || 
760                     c == '"' || c == '<' ||
761                     c == '>' || c == '\\' || c == '^' || c == '`' ||
762                     c == '{' || c == '|' || c == '}')
763             {
764                 // The percent encoding on this part should be done using UTF-8 charset
765                 // as RFC 3986 Section 3.2.2 says
766                 app = percentEncode(c, characterEncoding);
767             }
768             else if (c == '%')
769             {
770                 if (i + 2 < string.length())
771                 {
772                     char c1 = string.charAt(i+1);
773                     char c2 = string.charAt(i+2);
774                     if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z')) &&
775                         (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z')))
776                     {
777                         // do not percent encode, because it could be already encoded
778                     }
779                     else
780                     {
781                         app = percentEncode(c, characterEncoding);
782                     }
783                 }
784                 else
785                 {
786                     app = percentEncode(c, characterEncoding);
787                 }
788             }
789             else if (c == '&')
790             {
791                 if (i+4 < string.length() )
792                 {
793                     if ('a' == string.charAt(i+1) &&
794                         'm' == string.charAt(i+2) &&
795                         'p' == string.charAt(i+3) &&
796                         ';' == string.charAt(i+4))
797                     {
798                         //Skip
799                     }
800                     else
801                     {
802                         app = "&amp;";
803                     }
804                 }
805                 else
806                 {
807                     app = "&amp;";
808                 }
809             }
810             else
811             {
812                 //No encoding, just do nothing, char will be added later.
813             }
814                         
815             if (app != null)
816             {
817                 if (sb == null)
818                 {
819                     sb = new StringBuilder(string.substring(0, i));
820                 }
821                 sb.append(app);
822             }
823             else
824             {
825                 if (sb != null)
826                 {
827                     sb.append(c);
828                 }
829             }
830             if (endLoop)
831             {
832                 break;
833             }
834         }
835         if (sb == null)
836         {
837             return string;
838         }
839         else
840         {
841             return sb.toString();
842         }
843     }
844 
845     /**
846      * Encode an URI, escaping or percent-encoding all required characters and
847      * following the rules mentioned on RFC 3986.  
848      * 
849      * @param string
850      * @param encodeNonLatin
851      * @return
852      * @throws IOException
853      */
854     public static void encodeURIAtributte(Writer writer, final String string, final String characterEncoding)
855         throws IOException
856     {
857         //StringBuilder sb = null;    //create later on demand
858         int start = 0;
859         String app;
860         char c;
861         boolean endLoop = false;
862         for (int i = 0; i < string.length (); ++i)
863         {
864             app = null;
865             c = string.charAt(i);
866             
867             // This are the guidelines to be taken into account by this algorithm to encode:
868             
869             // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters
870             //
871             // control     = <US-ASCII coded characters 00-1F and 7F hexadecimal>
872             // space       = <US-ASCII coded character 20 hexadecimal>
873             // delims      = "<" | ">" | "#" | "%" | <">
874             //               %3C   %3E   %23   %25   %22
875             // unwise      = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
876             //               %7D   %7B   %7C   %5C   %5E   %5B   %5D   %60
877             //
878             // ".... Data corresponding to excluded characters must be escaped in order to
879             // be properly represented within a URI....."
880             
881             // RFC 3986 Section 3.  Syntax Components
882             //
883             // "... The generic URI syntax consists of a hierarchical sequence of
884             // components referred to as the scheme, authority, path, query, and
885             // fragment.
886             //
887             //   URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
888             //
889             //   hier-part   = "//" authority path-abempty
890             //               / path-absolute
891             //               / path-rootless
892             //               / path-empty
893             // ...."
894             
895             // RFC 3986 Section 2.2:
896             // Reserved characters (should not be percent-encoded)
897             // reserved    = gen-delims / sub-delims
898             // gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
899             //               %3A   %2F   %3F   %23   %5B   %5D   %40
900             // sub-delims  = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
901             //               %21   %24   %26   %27   %28   %29   %2A   %2B   %2C   %3B   %3D
902             
903             // Note than chars "[" and "]" are mentioned as they should be escaped on RFC 2396,
904             // but on the part D. Changes from RFC 2396 says about this chars (used on IPv6) 
905             // "...those rules were redefined to directly specify the characters allowed...."
906             // There is also other characters moved from excluded list to reserved:
907             // "[" / "]" / "#"  
908             
909             // RFC 3986 Section 2.3:
910             // "... for consistency, percent-encoded octets in the ranges of ALPHA
911             // (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
912             // underscore (%5F), or tilde (%7E) should not be created by URI
913             // producers...."
914             
915             // RFC 3986 Section  3.2.2.  Host
916 
917             // host = IP-literal / IPv4address / reg-name
918 
919             // The reg-name syntax allows percent-encoded octets in order to
920             // represent non-ASCII registered names in a uniform way that is
921             // independent of the underlying name resolution technology.  Non-ASCII
922             // characters must first be encoded according to UTF-8 [STD63], and then
923             // each octet of the corresponding UTF-8 sequence must be percent-
924             // encoded to be represented as URI characters.  URI producing
925             // applications must not use percent-encoding in host unless it is used
926             // to represent a UTF-8 character sequence.
927             
928             // RFC 3986 Section 3.4 Query 
929             //         query       = *( pchar / "/" / "?" )
930             //
931             // "...  However, as query components are often used to carry identifying information 
932             // in the form of "key=value" pairs and one frequently used value is a reference to
933             // another URI, it is sometimes better for usability to avoid percent-encoding those characters....."
934             //
935             // RFC 3986 Section 2.5 Identifying Data (Apply to query section)
936             //
937             // When a new URI scheme defines a component that represents textual
938             // data consisting of characters from the Universal Character Set [UCS],
939             // the data should first be encoded as octets according to the UTF-8
940             // character encoding [STD63]; then only those octets that do not
941             // correspond to characters in the unreserved set should be percent-
942             // encoded.  For example, the character A would be represented as "A",
943             // the character LATIN CAPITAL LETTER A WITH GRAVE would be represented
944             // as "%C3%80", and the character KATAKANA LETTER A would be represented
945             // as "%E3%82%A2".
946             //
947             // RFC 3986 Section 3.5 Fragment
948             //         fragment    = *( pchar / "/" / "?" )
949             //
950             // Note that follows the same as query
951             
952             // Based on the extracts the strategy to apply on this method is:
953             // 
954             // On scheme ":" hier-part
955             //
956             // Escape or percent encode chars inside :
957             // 
958             // - From %00 to %20, 
959             // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 
960             //                     duplicate encoding, encode it when we are sure 
961             //                     that there are not encoded twice)
962             // - "<" %3C, ">" %3E
963             // - "\" %5C, "^" %5E, "`" %60 
964             // - "{" %7B, "|" %7C, "}" %7D
965             // - From %7F ad infinitum (characters from %100 to infinitum should not be used in this
966             //   part of an URI, but it is preferred to encode it that omit it).
967             //
968             // The remaining characters must not be encoded
969             //
970             // Characters after ? or # should be percent encoding but only the necessary ones:
971             //
972             // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
973             // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 
974             //                     duplicate encoding, encode it when we are sure 
975             //                     that there are not encoded twice)
976             // - "<" %3C, ">" %3E,
977             // - "\" %5C, "^" %5E, "`" %60 
978             // - "{" %7B, "|" %7C, "}" %7D
979             // - From %7F ad infinitum (each character as many bytes as necessary but take into account
980             //   that a single char should contain 2,3 or more bytes!. This data should be encoded 
981             //   translating from the document character encoding to percent encoding, because this values
982             //   could be retrieved from httpRequest.getParameter() and it uses the current character encoding
983             //   for decode values)
984             //
985             // "&" should be encoded as "&amp;" because this link is inside an html page, and 
986             // put only & is invalid in this context.
987 
988             if (   (c <= (char)0x20) || (c >= (char)0x7F) || 
989                     c == '"' || c == '<' ||
990                     c == '>' || c == '\\' || c == '^' || c == '`' ||
991                     c == '{' || c == '|' || c == '}')
992             {
993                 // The percent encoding on this part should be done using UTF-8 charset
994                 // as RFC 3986 Section 3.2.2 says.
995                 // Also there is a reference on 
996                 // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars
997                 // that recommend use of UTF-8 instead the document character encoding.
998                 // Jetty set by default UTF-8 (see http://jira.codehaus.org/browse/JETTY-113)
999                 //app = percentEncode(c, "UTF-8");
1000                 if (start < i)
1001                 {
1002                     writer.write(string, start, i-start);
1003                 }
1004                 start = i+1;
1005                 percentEncode(writer, c, "UTF-8");
1006             }
1007             else if (c == '%')
1008             {
1009                 if (i + 2 < string.length())
1010                 {
1011                     char c1 = string.charAt(i+1);
1012                     char c2 = string.charAt(i+2);
1013                     if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z')) &&
1014                         (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z')))
1015                     {
1016                         // do not percent encode, because it could be already encoded
1017                         // and we don't want encode it twice
1018                     }
1019                     else
1020                     {
1021                         //app = percentEncode(c, UTF8);
1022                         if (start < i)
1023                         {
1024                             writer.write(string, start, i-start);
1025                         }
1026                         start = i+1;
1027                         percentEncode(writer, c, UTF8);
1028                     }
1029                 }
1030                 else
1031                 {
1032                     //app = percentEncode(c, UTF8);
1033                     if (start < i)
1034                     {
1035                         writer.write(string, start, i-start);
1036                     }
1037                     start = i+1;
1038                     percentEncode(writer, c, UTF8);
1039                 }
1040             }
1041             else if (c == '?' || c == '#')
1042             {
1043                 if (i+1 < string.length())
1044                 {
1045                     // The remaining part of the URI are data that should be encoded
1046                     // using the document character encoding.
1047                     //app = c + encodeURIQuery(string.substring(i+1), characterEncoding);
1048                     if (start < i)
1049                     {
1050                         writer.write(string, start, i-start);
1051                     }
1052                     start = i+1;
1053                     writer.write(c);
1054                     //encodeURIQuery(writer, string.substring(i+1), characterEncoding);
1055                     encodeURIQuery(writer, string, i+1, characterEncoding);
1056                     endLoop = true;
1057                 }
1058             }
1059             else
1060             {
1061                 //No encoding, just do nothing, char will be added later.
1062             }
1063                         
1064             if (app != null)
1065             {
1066                 //if (sb == null)
1067                 //{
1068                 //    sb = new StringBuilder(string.substring(0, i));
1069                 //}
1070                 //sb.append(app);
1071                 if (start < i)
1072                 {
1073                     writer.write(string, start, i-start);
1074                 }
1075                 start = i+1;
1076                 writer.write(app);
1077             }
1078             //else
1079             //{
1080             //    if (sb != null)
1081             //    {
1082             //        sb.append(c);
1083             //    }
1084             //}
1085             if (endLoop)
1086             {
1087                 start = string.length();
1088                 break;
1089             }
1090         }
1091         //if (sb == null)
1092         //{
1093         //    return string;
1094         //}
1095         //else
1096         //{
1097         //    return sb.toString();
1098         //}
1099         if (start == 0)
1100         {
1101             writer.write(string);
1102         }
1103         else if (start < string.length())
1104         {
1105             writer.write(string,start,string.length()-start);
1106         }
1107     }
1108 
1109     /**
1110      * Encode a unicode char value in percentEncode, decoding its bytes using a specified 
1111      * characterEncoding.
1112      * 
1113      * @param c
1114      * @param characterEncoding
1115      * @return
1116      */
1117     private static void percentEncode(Writer writer, char c, String characterEncoding) throws IOException
1118     {
1119         String app = null;
1120         if (c > (char)((short)0x007F))
1121         {
1122             //percent encode in the proper encoding to be consistent
1123             //app = percentEncodeNonUsAsciiCharacter(writer c, characterEncoding);
1124             percentEncodeNonUsAsciiCharacter(writer, c, characterEncoding);
1125         }
1126         else
1127         {
1128             //percent encode US-ASCII char (0x00-0x7F range)
1129             //app = "%" + HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)) +HEX_CHARSET.charAt(c % 0x10);
1130             writer.write('%');
1131             writer.write(HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)));
1132             writer.write(HEX_CHARSET.charAt(c % 0x10));
1133         }
1134         //return app;
1135     }
1136     
1137     private static void percentEncodeNonUsAsciiCharacter(Writer currentWriter, char c, String characterEncoding) 
1138         throws IOException
1139     {
1140         ByteArrayOutputStream baos = new ByteArrayOutputStream(10);
1141         StringBuilder builder = new StringBuilder();
1142         try
1143         {
1144             OutputStreamWriter writer = new OutputStreamWriter(baos,characterEncoding);
1145             writer.write(c);
1146             writer.flush();
1147         }
1148         catch(IOException e)
1149         {
1150             baos.reset();
1151             return;
1152         }
1153         
1154         byte [] byteArray =  baos.toByteArray();
1155         for (int i=0; i < byteArray.length; i++)
1156         {
1157             //builder.append('%');
1158             //builder.append(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) );
1159             //builder.append(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10));
1160             currentWriter.write('%');
1161             currentWriter.write(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) );
1162             currentWriter.write(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10));
1163         }
1164         
1165         //return builder.toString();
1166     }
1167     
1168     /**
1169      * Encode the query part using the document charset encoding provided.
1170      * 
1171      * 
1172      * @param string
1173      * @param characterEncoding
1174      * @return
1175      */
1176     private static void encodeURIQuery(Writer writer, final String string, int offset, final String characterEncoding)
1177             throws IOException
1178     {
1179         //StringBuilder sb = null;    //create later on demand
1180         int start = offset;
1181         int realLength = string.length()-offset;
1182         String app;
1183         char c;
1184         //boolean endLoop = false;
1185         for (int i = offset; i < offset+realLength; ++i)
1186         {
1187             app = null;
1188             c = string.charAt(i);
1189             
1190             // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
1191             // - <"> %22 (If there is encode of "%", there is a risk of duplicate encoding, so 
1192             //            we make easier and omit this one)
1193             // - "<" %3C, ">" %3E,
1194             // - "\" %5C, "^" %5E, "`" %60 
1195             // - "{" %7B, "|" %7C, "}" %7D
1196             // - From %7F ad infinitum (each character as many bytes as necessary but take into account
1197             //   that a single char should contain 2,3 or more bytes!. This data should be encoded 
1198             //   translating from the document character encoding to percent encoding)
1199             //
1200             // "&" should be encoded as "&amp;" because this link is inside an html page, and 
1201             // put & is invalid in this context   
1202             
1203             if (   (c <= (char)0x20) || (c >= (char)0x7F) || 
1204                     c == '"' || c == '<' ||
1205                     c == '>' || c == '\\' || c == '^' || c == '`' ||
1206                     c == '{' || c == '|' || c == '}')
1207             {
1208                 // The percent encoding on this part should be done using UTF-8 charset
1209                 // as RFC 3986 Section 3.2.2 says
1210                 //app = percentEncode(c, characterEncoding);
1211                 if (start < i)
1212                 {
1213                     writer.write(string, start, i-start);
1214                 }
1215                 start = i+1;
1216                 percentEncode(writer, c, characterEncoding);
1217             }
1218             else if (c == '%')
1219             {
1220                 if (i + 2 < string.length())
1221                 {
1222                     char c1 = string.charAt(i+1);
1223                     char c2 = string.charAt(i+2);
1224                     if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z')) &&
1225                         (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z')))
1226                     {
1227                         // do not percent encode, because it could be already encoded
1228                     }
1229                     else
1230                     {
1231                         //app = percentEncode(c, characterEncoding);
1232                         if (start < i)
1233                         {
1234                             writer.write(string, start, i-start);
1235                         }
1236                         start = i+1;
1237                         percentEncode(writer, c, characterEncoding);
1238                     }
1239                 }
1240                 else
1241                 {
1242                     //app = percentEncode(c, characterEncoding);
1243                     if (start < i)
1244                     {
1245                         writer.write(string, start, i-start);
1246                     }
1247                     start = i+1;
1248                     percentEncode(writer, c, characterEncoding);
1249                 }
1250             }
1251             else if (c == '&')
1252             {
1253                 if (i+4 < string.length() )
1254                 {
1255                     if ('a' == string.charAt(i+1) &&
1256                         'm' == string.charAt(i+2) &&
1257                         'p' == string.charAt(i+3) &&
1258                         ';' == string.charAt(i+4))
1259                     {
1260                         //Skip
1261                     }
1262                     else
1263                     {
1264                         app = "&amp;";
1265                     }
1266                 }
1267                 else
1268                 {
1269                     app = "&amp;";
1270                 }
1271             }
1272             else
1273             {
1274                 //No encoding, just do nothing, char will be added later.
1275             }
1276                         
1277             if (app != null)
1278             {
1279                 //if (sb == null)
1280                 //{
1281                 //    sb = new StringBuilder(string.substring(0, i));
1282                 //}
1283                 //sb.append(app);
1284                 if (start < i)
1285                 {
1286                     writer.write(string, start, i-start);
1287                 }
1288                 start = i+1;
1289                 writer.write(app);
1290             }
1291             //else
1292             //{
1293             //    if (sb != null)
1294             //    {
1295             //        sb.append(c);
1296             //    }
1297             //}
1298             //if (endLoop)
1299             //{
1300             //    break;
1301             //}
1302         }
1303         
1304         //if (sb == null)
1305         //{
1306         //    return string;
1307         //}
1308         //else
1309         //{
1310         //    return sb.toString();
1311         //}
1312         if (start == offset)
1313         {
1314             writer.write(string, offset, realLength);
1315         }
1316         else if (start < offset+realLength)
1317         {
1318             writer.write(string,start,offset+realLength-start);
1319         }
1320     }
1321 }