1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19 package org.apache.myfaces.shared.renderkit.html.util;
20
21 import java.io.ByteArrayOutputStream;
22 import java.io.IOException;
23 import java.io.OutputStreamWriter;
24 import java.io.Writer;
25
26 /**
27 * Converts Strings so that they can be used within HTML-Code.
28 */
29 public abstract class HTMLEncoder
30 {
31 /**
32 * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true.
33 */
34 public static String encode (String string)
35 {
36 return encode(string, false, true);
37 }
38
39 /**
40 * Variant of {@link #encode} where encodeNbsp is true.
41 */
42 public static String encode (String string, boolean encodeNewline)
43 {
44 return encode(string, encodeNewline, true);
45 }
46
47 /**
48 * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true
49 */
50 public static String encode (String string, boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp)
51 {
52 return encode(string, encodeNewline, encodeSubsequentBlanksToNbsp, true);
53 }
54
55 /**
56 * Encodes the given string, so that it can be used within a html page.
57 * @param string the string to convert
58 * @param encodeNewline if true newline characters are converted to <br>'s
59 * @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to &nbsp;'s
60 * @param encodeNonLatin if true encode non-latin characters as numeric character references
61 */
62 public static String encode (String string,
63 boolean encodeNewline,
64 boolean encodeSubsequentBlanksToNbsp,
65 boolean encodeNonLatin)
66 {
67 if (string == null)
68 {
69 return "";
70 }
71
72 StringBuilder sb = null; //create later on demand
73 String app;
74 char c;
75 for (int i = 0; i < string.length (); ++i)
76 {
77 app = null;
78 c = string.charAt(i);
79
80 // All characters before letters
81 if ((int)c < 0x41)
82 {
83 switch (c)
84 {
85 case '"': app = """; break; //"
86 case '&': app = "&"; break; //&
87 case '<': app = "<"; break; //<
88 case '>': app = ">"; break; //>
89 case ' ':
90 if (encodeSubsequentBlanksToNbsp &&
91 (i == 0 || (i - 1 >= 0 && string.charAt(i - 1) == ' ')))
92 {
93 //Space at beginning or after another space
94 app = " ";
95 }
96 break;
97 case '\n':
98 if (encodeNewline)
99 {
100 app = "<br/>";
101 }
102 break;
103 default:
104 break;
105 }
106 }
107 else if (encodeNonLatin && (int)c > 0x80)
108 {
109 switch(c)
110 {
111 //german umlauts
112 case '\u00E4' : app = "ä"; break;
113 case '\u00C4' : app = "Ä"; break;
114 case '\u00F6' : app = "ö"; break;
115 case '\u00D6' : app = "Ö"; break;
116 case '\u00FC' : app = "ü"; break;
117 case '\u00DC' : app = "Ü"; break;
118 case '\u00DF' : app = "ß"; break;
119
120 //misc
121 //case 0x80: app = "€"; break; sometimes euro symbol is ascii 128, should we suport it?
122 case '\u20AC': app = "€"; break;
123 case '\u00AB': app = "«"; break;
124 case '\u00BB': app = "»"; break;
125 case '\u00A0': app = " "; break;
126
127 default :
128 //encode all non basic latin characters
129 app = "&#" + ((int)c) + ";";
130 break;
131 }
132 }
133 if (app != null)
134 {
135 if (sb == null)
136 {
137 sb = new StringBuilder(string.substring(0, i));
138 }
139 sb.append(app);
140 }
141 else
142 {
143 if (sb != null)
144 {
145 sb.append(c);
146 }
147 }
148 }
149
150 if (sb == null)
151 {
152 return string;
153 }
154 else
155 {
156 return sb.toString();
157 }
158 }
159
160 /**
161 * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true.
162 */
163 public static void encode (Writer writer, String string) throws IOException
164 {
165 encode(writer, string, false, true);
166 }
167
168 /**
169 * Variant of {@link #encode} where encodeNbsp is true.
170 */
171 public static void encode (Writer writer, String string, boolean encodeNewline) throws IOException
172 {
173 encode(writer, string, encodeNewline, true);
174 }
175
176 /**
177 * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true
178 */
179 public static void encode (Writer writer, String string,
180 boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp) throws IOException
181 {
182 encode(writer, string, encodeNewline, encodeSubsequentBlanksToNbsp, true);
183 }
184
185 public static void encode (Writer writer, String string,
186 boolean encodeNewline,
187 boolean encodeSubsequentBlanksToNbsp,
188 boolean encodeNonLatin) throws IOException
189 {
190 if (string == null)
191 {
192 return;
193 }
194
195 int start = 0;
196 String app;
197 char c;
198 for (int i = 0; i < string.length (); ++i)
199 {
200 app = null;
201 c = string.charAt(i);
202
203 // All characters before letters
204 if ((int)c < 0x41)
205 {
206 switch (c)
207 {
208 case '"': app = """; break; //"
209 case '&': app = "&"; break; //&
210 case '<': app = "<"; break; //<
211 case '>': app = ">"; break; //>
212 case ' ':
213 if (encodeSubsequentBlanksToNbsp &&
214 (i == 0 || (i - 1 >= 0 && string.charAt(i - 1) == ' ')))
215 {
216 //Space at beginning or after another space
217 app = " ";
218 }
219 break;
220 case '\n':
221 if (encodeNewline)
222 {
223 app = "<br/>";
224 }
225 break;
226 default:
227 break;
228 }
229 }
230 else if (encodeNonLatin && (int)c > 0x80)
231 {
232 switch(c)
233 {
234 //german umlauts
235 case '\u00E4' : app = "ä"; break;
236 case '\u00C4' : app = "Ä"; break;
237 case '\u00F6' : app = "ö"; break;
238 case '\u00D6' : app = "Ö"; break;
239 case '\u00FC' : app = "ü"; break;
240 case '\u00DC' : app = "Ü"; break;
241 case '\u00DF' : app = "ß"; break;
242
243 //misc
244 //case 0x80: app = "€"; break; sometimes euro symbol is ascii 128, should we suport it?
245 case '\u20AC': app = "€"; break;
246 case '\u00AB': app = "«"; break;
247 case '\u00BB': app = "»"; break;
248 case '\u00A0': app = " "; break;
249
250 default :
251 //encode all non basic latin characters
252 app = "&#" + ((int)c) + ";";
253 break;
254 }
255 }
256 if (app != null)
257 {
258 //if (sb == null)
259 //{
260 // sb = new StringBuilder(string.substring(0, i));
261 //}
262 //sb.append(app);
263 if (start < i)
264 {
265 writer.write(string, start, i-start);
266 }
267 start = i+1;
268 writer.write(app);
269 }
270 //else
271 //{
272 // if (sb != null)
273 // {
274 // sb.append(c);
275 // }
276 //}
277 }
278
279 //if (sb == null)
280 //{
281 // return string;
282 //}
283 //else
284 //{
285 // return sb.toString();
286 //}
287 if (start == 0)
288 {
289 writer.write(string);
290 }
291 else if (start < string.length())
292 {
293 writer.write(string,start,string.length()-start);
294 }
295 }
296
297
298 /**
299 * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true.
300 */
301 public static void encode (char[] string, int offset, int length, Writer writer) throws IOException
302 {
303 encode(string, offset, length, false, true, writer);
304 }
305
306 /**
307 * Variant of {@link #encode} where encodeNbsp is true.
308 */
309 public static void encode (char[] string, int offset, int length, boolean encodeNewline, Writer writer)
310 throws IOException
311 {
312 encode(string, offset, length, encodeNewline, true, writer);
313 }
314
315 /**
316 * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true
317 */
318 public static void encode (char[] string, int offset, int length, boolean encodeNewline,
319 boolean encodeSubsequentBlanksToNbsp, Writer writer) throws IOException
320 {
321 encode(string, offset, length, encodeNewline, encodeSubsequentBlanksToNbsp, true, writer);
322 }
323
324
325 /**
326 * Encodes the given string, so that it can be used within a html page.
327 * @param string the string to convert
328 * @param encodeNewline if true newline characters are converted to <br>'s
329 * @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to &nbsp;'s
330 * @param encodeNonLatin if true encode non-latin characters as numeric character references
331 */
332 public static void encode (char[] string, int offset, int length,
333 boolean encodeNewline,
334 boolean encodeSubsequentBlanksToNbsp,
335 boolean encodeNonLatin, Writer writer) throws IOException
336 {
337 if (string == null || length < 0 || offset >= string.length)
338 {
339 return;
340 }
341 offset = Math.max(0, offset);
342 int realLength = Math.min(length, string.length - offset);
343
344 //StringBuilder sb = null; //create later on demand
345 String app;
346 char c;
347 int start = offset;
348
349 for (int i = offset; i < offset + realLength; ++i)
350 {
351 app = null;
352 c = string[i];
353
354 // All characters before letters
355 if ((int)c < 0x41)
356 {
357 switch (c)
358 {
359 case '"': app = """; break; //"
360 case '&': app = "&"; break; //&
361 case '<': app = "<"; break; //<
362 case '>': app = ">"; break; //>
363 case ' ':
364 if (encodeSubsequentBlanksToNbsp &&
365 (i == 0 || (i - 1 >= 0 && string[i - 1] == ' ')))
366 {
367 //Space at beginning or after another space
368 app = " ";
369 }
370 break;
371 case '\n':
372 if (encodeNewline)
373 {
374 app = "<br/>";
375 }
376 break;
377 default:
378 break;
379 }
380 }
381 else if (encodeNonLatin && (int)c > 0x80)
382 {
383 switch(c)
384 {
385 //german umlauts
386 case '\u00E4' : app = "ä"; break;
387 case '\u00C4' : app = "Ä"; break;
388 case '\u00F6' : app = "ö"; break;
389 case '\u00D6' : app = "Ö"; break;
390 case '\u00FC' : app = "ü"; break;
391 case '\u00DC' : app = "Ü"; break;
392 case '\u00DF' : app = "ß"; break;
393
394 //misc
395 //case 0x80: app = "€"; break; sometimes euro symbol is ascii 128, should we suport it?
396 case '\u20AC': app = "€"; break;
397 case '\u00AB': app = "«"; break;
398 case '\u00BB': app = "»"; break;
399 case '\u00A0': app = " "; break;
400
401 default :
402 //encode all non basic latin characters
403 app = "&#" + ((int)c) + ";";
404 break;
405 }
406 }
407 if (app != null)
408 {
409 //if (sb == null)
410 //{
411 // sb = new StringBuilder(realLength*2);
412 // sb.append(string, offset, i - offset);
413 //}
414 //sb.append(app);
415 if (start < i)
416 {
417 writer.write(string, start, i-start);
418 }
419 start = i+1;
420 writer.write(app);
421 }
422 /*
423 else
424 {
425 if (sb != null)
426 {
427 sb.append(c);
428 }
429 }*/
430 }
431
432 //if (sb == null)
433 //{
434 // writer.write(string, offset, realLength);
435 //}
436 //else
437 //{
438 // writer.write(sb.toString());
439 //}
440 if (start == offset)
441 {
442 writer.write(string, offset, realLength);
443 }
444 else if (start < offset+realLength)
445 {
446 writer.write(string,start,offset+realLength-start);
447 }
448 }
449
450 private static final String HEX_CHARSET = "0123456789ABCDEF";
451
452 private static final String UTF8 = "UTF-8";
453
454 /**
455 * Encode an URI, escaping or percent-encoding all required characters and
456 * following the rules mentioned on RFC 3986.
457 *
458 * @param string
459 * @param encodeNonLatin
460 * @return
461 * @throws IOException
462 */
463 public static String encodeURIAtributte(final String string, final String characterEncoding)
464 throws IOException
465 {
466 StringBuilder sb = null; //create later on demand
467 String app;
468 char c;
469 boolean endLoop = false;
470 for (int i = 0; i < string.length (); ++i)
471 {
472 app = null;
473 c = string.charAt(i);
474
475 // This are the guidelines to be taken into account by this algorithm to encode:
476
477 // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters
478 //
479 // control = <US-ASCII coded characters 00-1F and 7F hexadecimal>
480 // space = <US-ASCII coded character 20 hexadecimal>
481 // delims = "<" | ">" | "#" | "%" | <">
482 // %3C %3E %23 %25 %22
483 // unwise = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
484 // %7D %7B %7C %5C %5E %5B %5D %60
485 //
486 // ".... Data corresponding to excluded characters must be escaped in order to
487 // be properly represented within a URI....."
488
489 // RFC 3986 Section 3. Syntax Components
490 //
491 // "... The generic URI syntax consists of a hierarchical sequence of
492 // components referred to as the scheme, authority, path, query, and
493 // fragment.
494 //
495 // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
496 //
497 // hier-part = "//" authority path-abempty
498 // / path-absolute
499 // / path-rootless
500 // / path-empty
501 // ...."
502
503 // RFC 3986 Section 2.2:
504 // Reserved characters (should not be percent-encoded)
505 // reserved = gen-delims / sub-delims
506 // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
507 // %3A %2F %3F %23 %5B %5D %40
508 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
509 // %21 %24 %26 %27 %28 %29 %2A %2B %2C %3B %3D
510
511 // Note than chars "[" and "]" are mentioned as they should be escaped on RFC 2396,
512 // but on the part D. Changes from RFC 2396 says about this chars (used on IPv6)
513 // "...those rules were redefined to directly specify the characters allowed...."
514 // There is also other characters moved from excluded list to reserved:
515 // "[" / "]" / "#"
516
517 // RFC 3986 Section 2.3:
518 // "... for consistency, percent-encoded octets in the ranges of ALPHA
519 // (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
520 // underscore (%5F), or tilde (%7E) should not be created by URI
521 // producers...."
522
523 // RFC 3986 Section 3.2.2. Host
524
525 // host = IP-literal / IPv4address / reg-name
526
527 // The reg-name syntax allows percent-encoded octets in order to
528 // represent non-ASCII registered names in a uniform way that is
529 // independent of the underlying name resolution technology. Non-ASCII
530 // characters must first be encoded according to UTF-8 [STD63], and then
531 // each octet of the corresponding UTF-8 sequence must be percent-
532 // encoded to be represented as URI characters. URI producing
533 // applications must not use percent-encoding in host unless it is used
534 // to represent a UTF-8 character sequence.
535
536 // RFC 3986 Section 3.4 Query
537 // query = *( pchar / "/" / "?" )
538 //
539 // "... However, as query components are often used to carry identifying information
540 // in the form of "key=value" pairs and one frequently used value is a reference to
541 // another URI, it is sometimes better for usability to avoid percent-encoding those characters....."
542 //
543 // RFC 3986 Section 2.5 Identifying Data (Apply to query section)
544 //
545 // When a new URI scheme defines a component that represents textual
546 // data consisting of characters from the Universal Character Set [UCS],
547 // the data should first be encoded as octets according to the UTF-8
548 // character encoding [STD63]; then only those octets that do not
549 // correspond to characters in the unreserved set should be percent-
550 // encoded. For example, the character A would be represented as "A",
551 // the character LATIN CAPITAL LETTER A WITH GRAVE would be represented
552 // as "%C3%80", and the character KATAKANA LETTER A would be represented
553 // as "%E3%82%A2".
554 //
555 // RFC 3986 Section 3.5 Fragment
556 // fragment = *( pchar / "/" / "?" )
557 //
558 // Note that follows the same as query
559
560 // Based on the extracts the strategy to apply on this method is:
561 //
562 // On scheme ":" hier-part
563 //
564 // Escape or percent encode chars inside :
565 //
566 // - From %00 to %20,
567 // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of
568 // duplicate encoding, encode it when we are sure
569 // that there are not encoded twice)
570 // - "<" %3C, ">" %3E
571 // - "\" %5C, "^" %5E, "`" %60
572 // - "{" %7B, "|" %7C, "}" %7D
573 // - From %7F ad infinitum (characters from %100 to infinitum should not be used in this
574 // part of an URI, but it is preferred to encode it that omit it).
575 //
576 // The remaining characters must not be encoded
577 //
578 // Characters after ? or # should be percent encoding but only the necessary ones:
579 //
580 // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
581 // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of
582 // duplicate encoding, encode it when we are sure
583 // that there are not encoded twice)
584 // - "<" %3C, ">" %3E,
585 // - "\" %5C, "^" %5E, "`" %60
586 // - "{" %7B, "|" %7C, "}" %7D
587 // - From %7F ad infinitum (each character as many bytes as necessary but take into account
588 // that a single char should contain 2,3 or more bytes!. This data should be encoded
589 // translating from the document character encoding to percent encoding, because this values
590 // could be retrieved from httpRequest.getParameter() and it uses the current character encoding
591 // for decode values)
592 //
593 // "&" should be encoded as "&" because this link is inside an html page, and
594 // put only & is invalid in this context.
595
596 if ( (c <= (char)0x20) || (c >= (char)0x7F) ||
597 c == '"' || c == '<' ||
598 c == '>' || c == '\\' || c == '^' || c == '`' ||
599 c == '{' || c == '|' || c == '}')
600 {
601 // The percent encoding on this part should be done using UTF-8 charset
602 // as RFC 3986 Section 3.2.2 says.
603 // Also there is a reference on
604 // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars
605 // that recommend use of UTF-8 instead the document character encoding.
606 // Jetty set by default UTF-8 (see http://jira.codehaus.org/browse/JETTY-113)
607 app = percentEncode(c, "UTF-8");
608 }
609 else if (c == '%')
610 {
611 if (i + 2 < string.length())
612 {
613 char c1 = string.charAt(i+1);
614 char c2 = string.charAt(i+2);
615 if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z')) &&
616 (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z')))
617 {
618 // do not percent encode, because it could be already encoded
619 // and we don't want encode it twice
620 }
621 else
622 {
623 app = percentEncode(c, UTF8);
624 }
625 }
626 else
627 {
628 app = percentEncode(c, UTF8);
629 }
630 }
631 else if (c == '?' || c == '#')
632 {
633 if (i+1 < string.length())
634 {
635 // The remaining part of the URI are data that should be encoded
636 // using the document character encoding.
637 app = c + encodeURIQuery(string.substring(i+1), characterEncoding);
638 endLoop = true;
639 }
640 }
641 else
642 {
643 //No encoding, just do nothing, char will be added later.
644 }
645
646 if (app != null)
647 {
648 if (sb == null)
649 {
650 sb = new StringBuilder(string.substring(0, i));
651 }
652 sb.append(app);
653 }
654 else
655 {
656 if (sb != null)
657 {
658 sb.append(c);
659 }
660 }
661 if (endLoop)
662 {
663 break;
664 }
665 }
666 if (sb == null)
667 {
668 return string;
669 }
670 else
671 {
672 return sb.toString();
673 }
674 }
675
676 /**
677 * Encode a unicode char value in percentEncode, decoding its bytes using a specified
678 * characterEncoding.
679 *
680 * @param c
681 * @param characterEncoding
682 * @return
683 */
684 private static String percentEncode(char c, String characterEncoding)
685 {
686 String app = null;
687 if (c > (char)((short)0x007F))
688 {
689 //percent encode in the proper encoding to be consistent
690 app = percentEncodeNonUsAsciiCharacter(c, characterEncoding);
691 }
692 else
693 {
694 //percent encode US-ASCII char (0x00-0x7F range)
695 app = "%" + HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)) +HEX_CHARSET.charAt(c % 0x10);
696 }
697 return app;
698 }
699
700 private static String percentEncodeNonUsAsciiCharacter(char c, String characterEncoding)
701 {
702 ByteArrayOutputStream baos = new ByteArrayOutputStream(10);
703 StringBuilder builder = new StringBuilder();
704 try
705 {
706 OutputStreamWriter writer = new OutputStreamWriter(baos,characterEncoding);
707 writer.write(c);
708 writer.flush();
709 }
710 catch(IOException e)
711 {
712 baos.reset();
713 return null;
714 }
715
716 byte [] byteArray = baos.toByteArray();
717 for (int i=0; i < byteArray.length; i++)
718 {
719 builder.append('%');
720 builder.append(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) );
721 builder.append(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10));
722 }
723
724 return builder.toString();
725 }
726
727 /**
728 * Encode the query part using the document charset encoding provided.
729 *
730 *
731 * @param string
732 * @param characterEncoding
733 * @return
734 */
735 private static String encodeURIQuery(final String string, final String characterEncoding)
736 {
737 StringBuilder sb = null; //create later on demand
738 String app;
739 char c;
740 boolean endLoop = false;
741 for (int i = 0; i < string.length (); ++i)
742 {
743 app = null;
744 c = string.charAt(i);
745
746 // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
747 // - <"> %22 (If there is encode of "%", there is a risk of duplicate encoding, so
748 // we make easier and omit this one)
749 // - "<" %3C, ">" %3E,
750 // - "\" %5C, "^" %5E, "`" %60
751 // - "{" %7B, "|" %7C, "}" %7D
752 // - From %7F ad infinitum (each character as many bytes as necessary but take into account
753 // that a single char should contain 2,3 or more bytes!. This data should be encoded
754 // translating from the document character encoding to percent encoding)
755 //
756 // "&" should be encoded as "&" because this link is inside an html page, and
757 // put & is invalid in this context
758
759 if ( (c <= (char)0x20) || (c >= (char)0x7F) ||
760 c == '"' || c == '<' ||
761 c == '>' || c == '\\' || c == '^' || c == '`' ||
762 c == '{' || c == '|' || c == '}')
763 {
764 // The percent encoding on this part should be done using UTF-8 charset
765 // as RFC 3986 Section 3.2.2 says
766 app = percentEncode(c, characterEncoding);
767 }
768 else if (c == '%')
769 {
770 if (i + 2 < string.length())
771 {
772 char c1 = string.charAt(i+1);
773 char c2 = string.charAt(i+2);
774 if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z')) &&
775 (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z')))
776 {
777 // do not percent encode, because it could be already encoded
778 }
779 else
780 {
781 app = percentEncode(c, characterEncoding);
782 }
783 }
784 else
785 {
786 app = percentEncode(c, characterEncoding);
787 }
788 }
789 else if (c == '&')
790 {
791 if (i+4 < string.length() )
792 {
793 if ('a' == string.charAt(i+1) &&
794 'm' == string.charAt(i+2) &&
795 'p' == string.charAt(i+3) &&
796 ';' == string.charAt(i+4))
797 {
798 //Skip
799 }
800 else
801 {
802 app = "&";
803 }
804 }
805 else
806 {
807 app = "&";
808 }
809 }
810 else
811 {
812 //No encoding, just do nothing, char will be added later.
813 }
814
815 if (app != null)
816 {
817 if (sb == null)
818 {
819 sb = new StringBuilder(string.substring(0, i));
820 }
821 sb.append(app);
822 }
823 else
824 {
825 if (sb != null)
826 {
827 sb.append(c);
828 }
829 }
830 if (endLoop)
831 {
832 break;
833 }
834 }
835 if (sb == null)
836 {
837 return string;
838 }
839 else
840 {
841 return sb.toString();
842 }
843 }
844
845 /**
846 * Encode an URI, escaping or percent-encoding all required characters and
847 * following the rules mentioned on RFC 3986.
848 *
849 * @param string
850 * @param encodeNonLatin
851 * @return
852 * @throws IOException
853 */
854 public static void encodeURIAtributte(Writer writer, final String string, final String characterEncoding)
855 throws IOException
856 {
857 //StringBuilder sb = null; //create later on demand
858 int start = 0;
859 String app;
860 char c;
861 boolean endLoop = false;
862 for (int i = 0; i < string.length (); ++i)
863 {
864 app = null;
865 c = string.charAt(i);
866
867 // This are the guidelines to be taken into account by this algorithm to encode:
868
869 // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters
870 //
871 // control = <US-ASCII coded characters 00-1F and 7F hexadecimal>
872 // space = <US-ASCII coded character 20 hexadecimal>
873 // delims = "<" | ">" | "#" | "%" | <">
874 // %3C %3E %23 %25 %22
875 // unwise = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
876 // %7D %7B %7C %5C %5E %5B %5D %60
877 //
878 // ".... Data corresponding to excluded characters must be escaped in order to
879 // be properly represented within a URI....."
880
881 // RFC 3986 Section 3. Syntax Components
882 //
883 // "... The generic URI syntax consists of a hierarchical sequence of
884 // components referred to as the scheme, authority, path, query, and
885 // fragment.
886 //
887 // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
888 //
889 // hier-part = "//" authority path-abempty
890 // / path-absolute
891 // / path-rootless
892 // / path-empty
893 // ...."
894
895 // RFC 3986 Section 2.2:
896 // Reserved characters (should not be percent-encoded)
897 // reserved = gen-delims / sub-delims
898 // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
899 // %3A %2F %3F %23 %5B %5D %40
900 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
901 // %21 %24 %26 %27 %28 %29 %2A %2B %2C %3B %3D
902
903 // Note than chars "[" and "]" are mentioned as they should be escaped on RFC 2396,
904 // but on the part D. Changes from RFC 2396 says about this chars (used on IPv6)
905 // "...those rules were redefined to directly specify the characters allowed...."
906 // There is also other characters moved from excluded list to reserved:
907 // "[" / "]" / "#"
908
909 // RFC 3986 Section 2.3:
910 // "... for consistency, percent-encoded octets in the ranges of ALPHA
911 // (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
912 // underscore (%5F), or tilde (%7E) should not be created by URI
913 // producers...."
914
915 // RFC 3986 Section 3.2.2. Host
916
917 // host = IP-literal / IPv4address / reg-name
918
919 // The reg-name syntax allows percent-encoded octets in order to
920 // represent non-ASCII registered names in a uniform way that is
921 // independent of the underlying name resolution technology. Non-ASCII
922 // characters must first be encoded according to UTF-8 [STD63], and then
923 // each octet of the corresponding UTF-8 sequence must be percent-
924 // encoded to be represented as URI characters. URI producing
925 // applications must not use percent-encoding in host unless it is used
926 // to represent a UTF-8 character sequence.
927
928 // RFC 3986 Section 3.4 Query
929 // query = *( pchar / "/" / "?" )
930 //
931 // "... However, as query components are often used to carry identifying information
932 // in the form of "key=value" pairs and one frequently used value is a reference to
933 // another URI, it is sometimes better for usability to avoid percent-encoding those characters....."
934 //
935 // RFC 3986 Section 2.5 Identifying Data (Apply to query section)
936 //
937 // When a new URI scheme defines a component that represents textual
938 // data consisting of characters from the Universal Character Set [UCS],
939 // the data should first be encoded as octets according to the UTF-8
940 // character encoding [STD63]; then only those octets that do not
941 // correspond to characters in the unreserved set should be percent-
942 // encoded. For example, the character A would be represented as "A",
943 // the character LATIN CAPITAL LETTER A WITH GRAVE would be represented
944 // as "%C3%80", and the character KATAKANA LETTER A would be represented
945 // as "%E3%82%A2".
946 //
947 // RFC 3986 Section 3.5 Fragment
948 // fragment = *( pchar / "/" / "?" )
949 //
950 // Note that follows the same as query
951
952 // Based on the extracts the strategy to apply on this method is:
953 //
954 // On scheme ":" hier-part
955 //
956 // Escape or percent encode chars inside :
957 //
958 // - From %00 to %20,
959 // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of
960 // duplicate encoding, encode it when we are sure
961 // that there are not encoded twice)
962 // - "<" %3C, ">" %3E
963 // - "\" %5C, "^" %5E, "`" %60
964 // - "{" %7B, "|" %7C, "}" %7D
965 // - From %7F ad infinitum (characters from %100 to infinitum should not be used in this
966 // part of an URI, but it is preferred to encode it that omit it).
967 //
968 // The remaining characters must not be encoded
969 //
970 // Characters after ? or # should be percent encoding but only the necessary ones:
971 //
972 // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
973 // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of
974 // duplicate encoding, encode it when we are sure
975 // that there are not encoded twice)
976 // - "<" %3C, ">" %3E,
977 // - "\" %5C, "^" %5E, "`" %60
978 // - "{" %7B, "|" %7C, "}" %7D
979 // - From %7F ad infinitum (each character as many bytes as necessary but take into account
980 // that a single char should contain 2,3 or more bytes!. This data should be encoded
981 // translating from the document character encoding to percent encoding, because this values
982 // could be retrieved from httpRequest.getParameter() and it uses the current character encoding
983 // for decode values)
984 //
985 // "&" should be encoded as "&" because this link is inside an html page, and
986 // put only & is invalid in this context.
987
988 if ( (c <= (char)0x20) || (c >= (char)0x7F) ||
989 c == '"' || c == '<' ||
990 c == '>' || c == '\\' || c == '^' || c == '`' ||
991 c == '{' || c == '|' || c == '}')
992 {
993 // The percent encoding on this part should be done using UTF-8 charset
994 // as RFC 3986 Section 3.2.2 says.
995 // Also there is a reference on
996 // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars
997 // that recommend use of UTF-8 instead the document character encoding.
998 // Jetty set by default UTF-8 (see http://jira.codehaus.org/browse/JETTY-113)
999 //app = percentEncode(c, "UTF-8");
1000 if (start < i)
1001 {
1002 writer.write(string, start, i-start);
1003 }
1004 start = i+1;
1005 percentEncode(writer, c, "UTF-8");
1006 }
1007 else if (c == '%')
1008 {
1009 if (i + 2 < string.length())
1010 {
1011 char c1 = string.charAt(i+1);
1012 char c2 = string.charAt(i+2);
1013 if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z')) &&
1014 (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z')))
1015 {
1016 // do not percent encode, because it could be already encoded
1017 // and we don't want encode it twice
1018 }
1019 else
1020 {
1021 //app = percentEncode(c, UTF8);
1022 if (start < i)
1023 {
1024 writer.write(string, start, i-start);
1025 }
1026 start = i+1;
1027 percentEncode(writer, c, UTF8);
1028 }
1029 }
1030 else
1031 {
1032 //app = percentEncode(c, UTF8);
1033 if (start < i)
1034 {
1035 writer.write(string, start, i-start);
1036 }
1037 start = i+1;
1038 percentEncode(writer, c, UTF8);
1039 }
1040 }
1041 else if (c == '?' || c == '#')
1042 {
1043 if (i+1 < string.length())
1044 {
1045 // The remaining part of the URI are data that should be encoded
1046 // using the document character encoding.
1047 //app = c + encodeURIQuery(string.substring(i+1), characterEncoding);
1048 if (start < i)
1049 {
1050 writer.write(string, start, i-start);
1051 }
1052 start = i+1;
1053 writer.write(c);
1054 //encodeURIQuery(writer, string.substring(i+1), characterEncoding);
1055 encodeURIQuery(writer, string, i+1, characterEncoding);
1056 endLoop = true;
1057 }
1058 }
1059 else
1060 {
1061 //No encoding, just do nothing, char will be added later.
1062 }
1063
1064 if (app != null)
1065 {
1066 //if (sb == null)
1067 //{
1068 // sb = new StringBuilder(string.substring(0, i));
1069 //}
1070 //sb.append(app);
1071 if (start < i)
1072 {
1073 writer.write(string, start, i-start);
1074 }
1075 start = i+1;
1076 writer.write(app);
1077 }
1078 //else
1079 //{
1080 // if (sb != null)
1081 // {
1082 // sb.append(c);
1083 // }
1084 //}
1085 if (endLoop)
1086 {
1087 start = string.length();
1088 break;
1089 }
1090 }
1091 //if (sb == null)
1092 //{
1093 // return string;
1094 //}
1095 //else
1096 //{
1097 // return sb.toString();
1098 //}
1099 if (start == 0)
1100 {
1101 writer.write(string);
1102 }
1103 else if (start < string.length())
1104 {
1105 writer.write(string,start,string.length()-start);
1106 }
1107 }
1108
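/**
 * Writes the percent-encoded form of a single char to the given writer.
 * Chars above 0x7F are first converted to bytes using the specified
 * character encoding; each resulting byte is percent-encoded.
 *
 * @param writer the writer that receives the percent-encoded output
 * @param c the char to percent-encode
 * @param characterEncoding name of the encoding used to convert chars above 0x7F to bytes
 * @throws IOException if writing to the writer fails
 */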
1117 private static void percentEncode(Writer writer, char c, String characterEncoding) throws IOException
1118 {
1119 String app = null;
1120 if (c > (char)((short)0x007F))
1121 {
1122 //percent encode in the proper encoding to be consistent
1123 //app = percentEncodeNonUsAsciiCharacter(writer c, characterEncoding);
1124 percentEncodeNonUsAsciiCharacter(writer, c, characterEncoding);
1125 }
1126 else
1127 {
1128 //percent encode US-ASCII char (0x00-0x7F range)
1129 //app = "%" + HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)) +HEX_CHARSET.charAt(c % 0x10);
1130 writer.write('%');
1131 writer.write(HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)));
1132 writer.write(HEX_CHARSET.charAt(c % 0x10));
1133 }
1134 //return app;
1135 }
1136
1137 private static void percentEncodeNonUsAsciiCharacter(Writer currentWriter, char c, String characterEncoding)
1138 throws IOException
1139 {
1140 ByteArrayOutputStream baos = new ByteArrayOutputStream(10);
1141 StringBuilder builder = new StringBuilder();
1142 try
1143 {
1144 OutputStreamWriter writer = new OutputStreamWriter(baos,characterEncoding);
1145 writer.write(c);
1146 writer.flush();
1147 }
1148 catch(IOException e)
1149 {
1150 baos.reset();
1151 return;
1152 }
1153
1154 byte [] byteArray = baos.toByteArray();
1155 for (int i=0; i < byteArray.length; i++)
1156 {
1157 //builder.append('%');
1158 //builder.append(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) );
1159 //builder.append(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10));
1160 currentWriter.write('%');
1161 currentWriter.write(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) );
1162 currentWriter.write(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10));
1163 }
1164
1165 //return builder.toString();
1166 }
1167
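/**
 * Encodes the part of the given string that starts at {@code offset}
 * (the URI data that follows a '?' or '#') and writes the result to the
 * writer, percent-encoding with the document character encoding provided.
 *
 * @param writer the writer that receives the encoded output
 * @param string the string that contains the part to encode
 * @param offset index of the first char of {@code string} to encode
 * @param characterEncoding name of the document character encoding used for percent-encoding
 * @throws IOException if writing to the writer fails
 */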
1176 private static void encodeURIQuery(Writer writer, final String string, int offset, final String characterEncoding)
1177 throws IOException
1178 {
1179 //StringBuilder sb = null; //create later on demand
1180 int start = offset;
1181 int realLength = string.length()-offset;
1182 String app;
1183 char c;
1184 //boolean endLoop = false;
1185 for (int i = offset; i < offset+realLength; ++i)
1186 {
1187 app = null;
1188 c = string.charAt(i);
1189
1190 // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
1191 // - <"> %22 (If there is encode of "%", there is a risk of duplicate encoding, so
1192 // we make easier and omit this one)
1193 // - "<" %3C, ">" %3E,
1194 // - "\" %5C, "^" %5E, "`" %60
1195 // - "{" %7B, "|" %7C, "}" %7D
1196 // - From %7F ad infinitum (each character as many bytes as necessary but take into account
1197 // that a single char should contain 2,3 or more bytes!. This data should be encoded
1198 // translating from the document character encoding to percent encoding)
1199 //
1200 // "&" should be encoded as "&" because this link is inside an html page, and
1201 // put & is invalid in this context
1202
1203 if ( (c <= (char)0x20) || (c >= (char)0x7F) ||
1204 c == '"' || c == '<' ||
1205 c == '>' || c == '\\' || c == '^' || c == '`' ||
1206 c == '{' || c == '|' || c == '}')
1207 {
1208 // The percent encoding on this part should be done using UTF-8 charset
1209 // as RFC 3986 Section 3.2.2 says
1210 //app = percentEncode(c, characterEncoding);
1211 if (start < i)
1212 {
1213 writer.write(string, start, i-start);
1214 }
1215 start = i+1;
1216 percentEncode(writer, c, characterEncoding);
1217 }
1218 else if (c == '%')
1219 {
1220 if (i + 2 < string.length())
1221 {
1222 char c1 = string.charAt(i+1);
1223 char c2 = string.charAt(i+2);
1224 if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z')) &&
1225 (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z')))
1226 {
1227 // do not percent encode, because it could be already encoded
1228 }
1229 else
1230 {
1231 //app = percentEncode(c, characterEncoding);
1232 if (start < i)
1233 {
1234 writer.write(string, start, i-start);
1235 }
1236 start = i+1;
1237 percentEncode(writer, c, characterEncoding);
1238 }
1239 }
1240 else
1241 {
1242 //app = percentEncode(c, characterEncoding);
1243 if (start < i)
1244 {
1245 writer.write(string, start, i-start);
1246 }
1247 start = i+1;
1248 percentEncode(writer, c, characterEncoding);
1249 }
1250 }
1251 else if (c == '&')
1252 {
1253 if (i+4 < string.length() )
1254 {
1255 if ('a' == string.charAt(i+1) &&
1256 'm' == string.charAt(i+2) &&
1257 'p' == string.charAt(i+3) &&
1258 ';' == string.charAt(i+4))
1259 {
1260 //Skip
1261 }
1262 else
1263 {
1264 app = "&";
1265 }
1266 }
1267 else
1268 {
1269 app = "&";
1270 }
1271 }
1272 else
1273 {
1274 //No encoding, just do nothing, char will be added later.
1275 }
1276
1277 if (app != null)
1278 {
1279 //if (sb == null)
1280 //{
1281 // sb = new StringBuilder(string.substring(0, i));
1282 //}
1283 //sb.append(app);
1284 if (start < i)
1285 {
1286 writer.write(string, start, i-start);
1287 }
1288 start = i+1;
1289 writer.write(app);
1290 }
1291 //else
1292 //{
1293 // if (sb != null)
1294 // {
1295 // sb.append(c);
1296 // }
1297 //}
1298 //if (endLoop)
1299 //{
1300 // break;
1301 //}
1302 }
1303
1304 //if (sb == null)
1305 //{
1306 // return string;
1307 //}
1308 //else
1309 //{
1310 // return sb.toString();
1311 //}
1312 if (start == offset)
1313 {
1314 writer.write(string, offset, realLength);
1315 }
1316 else if (start < offset+realLength)
1317 {
1318 writer.write(string,start,offset+realLength-start);
1319 }
1320 }
1321 }