1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19 package org.apache.myfaces.shared.renderkit.html.util;
20
21 import java.io.ByteArrayOutputStream;
22 import java.io.IOException;
23 import java.io.OutputStreamWriter;
24 import java.io.Writer;
25
/**
 * Converts Strings so that they can be used within HTML-Code.
 * <p>
 * Two families of static helpers are provided:
 * </p>
 * <ul>
 * <li>the {@code encode} overloads, which replace markup-significant characters (and, optionally,
 *     newlines, runs of blanks and non-latin characters) by character references;</li>
 * <li>{@link #encodeURIAtributte(String, String)}, which percent-encodes a URI that is going to be
 *     written into an attribute of an HTML page.</li>
 * </ul>
 */
public abstract class HTMLEncoder
{
    /** Upper-case hexadecimal digits, indexed by nibble value, used to build percent-encoded octets. */
    private static final String HEX_CHARSET = "0123456789ABCDEF";

    /** Charset name used for the scheme/authority/path part of a URI and as fallback encoding. */
    private static final String UTF8 = "UTF-8";

    /**
     * Variant of {@link #encode(String, boolean, boolean, boolean)} where encodeNewline is false
     * and both encodeSubsequentBlanksToNbsp and encodeNonLatin are true.
     *
     * @param string the string to convert, may be {@code null}
     * @return the encoded string, or the empty string when {@code string} is {@code null}
     */
    public static String encode(String string)
    {
        return encode(string, false, true);
    }

    /**
     * Variant of {@link #encode(String, boolean, boolean, boolean)} where both
     * encodeSubsequentBlanksToNbsp and encodeNonLatin are true.
     *
     * @param string        the string to convert, may be {@code null}
     * @param encodeNewline if true newline characters are converted to {@code <br/>}
     * @return the encoded string, or the empty string when {@code string} is {@code null}
     */
    public static String encode(String string, boolean encodeNewline)
    {
        return encode(string, encodeNewline, true);
    }

    /**
     * Variant of {@link #encode(String, boolean, boolean, boolean)} where encodeNonLatin is true.
     *
     * @param string                       the string to convert, may be {@code null}
     * @param encodeNewline                if true newline characters are converted to {@code <br/>}
     * @param encodeSubsequentBlanksToNbsp if true a blank at index 0 or directly after another
     *                                     blank is converted to {@code &#160;}
     * @return the encoded string, or the empty string when {@code string} is {@code null}
     */
    public static String encode(String string, boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp)
    {
        return encode(string, encodeNewline, encodeSubsequentBlanksToNbsp, true);
    }

    /**
     * Encodes the given string, so that it can be used within a html page.
     * <p>
     * The characters {@code "}, {@code &}, {@code <} and {@code >} are always replaced by their
     * character references. When nothing has to be replaced the very same string instance is
     * returned and no buffer is allocated.
     * </p>
     *
     * @param string                       the string to convert, may be {@code null}
     * @param encodeNewline                if true newline characters are converted to {@code <br/>}
     * @param encodeSubsequentBlanksToNbsp if true a blank at index 0 or directly after another
     *                                     blank is converted to {@code &#160;}
     * @param encodeNonLatin               if true characters above 0x80 are encoded as character
     *                                     references; a valid surrogate pair is written as ONE
     *                                     numeric reference to its code point
     * @return the encoded string, or the empty string when {@code string} is {@code null}
     */
    public static String encode(String string,
                                boolean encodeNewline,
                                boolean encodeSubsequentBlanksToNbsp,
                                boolean encodeNonLatin)
    {
        if (string == null)
        {
            return "";
        }

        final int length = string.length();
        StringBuilder sb = null; // created on demand, most strings need no replacement at all
        for (int i = 0; i < length; ++i)
        {
            final int start = i; // index of the first code unit covered by "app"
            final char c = string.charAt(i);
            String app = null;

            // All characters before letters
            if (c < 0x41)
            {
                // A blank only becomes a no-break space at the very beginning or after another blank.
                boolean blankToNbsp = c == ' ' && encodeSubsequentBlanksToNbsp
                        && (i == 0 || string.charAt(i - 1) == ' ');
                app = encodeMarkupChar(c, encodeNewline, blankToNbsp);
            }
            else if (encodeNonLatin && c > 0x80)
            {
                if (Character.isHighSurrogate(c) && i + 1 < length
                        && Character.isLowSurrogate(string.charAt(i + 1)))
                {
                    // A reference to a lone surrogate is not a valid character reference,
                    // so the pair is combined and the low surrogate is consumed as well.
                    app = numericReference(Character.toCodePoint(c, string.charAt(i + 1)));
                    ++i;
                }
                else
                {
                    app = encodeNonLatinChar(c);
                }
            }

            if (app != null)
            {
                if (sb == null)
                {
                    sb = new StringBuilder(string.substring(0, start));
                }
                sb.append(app);
            }
            else if (sb != null)
            {
                sb.append(c);
            }
        }

        return sb == null ? string : sb.toString();
    }

    /**
     * Variant of {@link #encode(char[], int, int, boolean, boolean, boolean, Writer)} where
     * encodeNewline is false and both encodeSubsequentBlanksToNbsp and encodeNonLatin are true.
     *
     * @param string the characters to convert
     * @param offset index of the first character to convert
     * @param length maximum number of characters to convert
     * @param writer the writer receiving the encoded characters
     * @throws IOException if the writer fails
     */
    public static void encode(char[] string, int offset, int length, Writer writer) throws IOException
    {
        encode(string, offset, length, false, true, writer);
    }

    /**
     * Variant of {@link #encode(char[], int, int, boolean, boolean, boolean, Writer)} where both
     * encodeSubsequentBlanksToNbsp and encodeNonLatin are true.
     *
     * @param string        the characters to convert
     * @param offset        index of the first character to convert
     * @param length        maximum number of characters to convert
     * @param encodeNewline if true newline characters are converted to {@code <br/>}
     * @param writer        the writer receiving the encoded characters
     * @throws IOException if the writer fails
     */
    public static void encode(char[] string, int offset, int length, boolean encodeNewline, Writer writer)
        throws IOException
    {
        encode(string, offset, length, encodeNewline, true, writer);
    }

    /**
     * Variant of {@link #encode(char[], int, int, boolean, boolean, boolean, Writer)} where
     * encodeNonLatin is true.
     *
     * @param string                       the characters to convert
     * @param offset                       index of the first character to convert
     * @param length                       maximum number of characters to convert
     * @param encodeNewline                if true newline characters are converted to {@code <br/>}
     * @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to {@code &#160;}
     * @param writer                       the writer receiving the encoded characters
     * @throws IOException if the writer fails
     */
    public static void encode(char[] string, int offset, int length, boolean encodeNewline,
                              boolean encodeSubsequentBlanksToNbsp, Writer writer) throws IOException
    {
        encode(string, offset, length, encodeNewline, encodeSubsequentBlanksToNbsp, true, writer);
    }

    /**
     * Encodes a region of the given character array, so that it can be used within a html page,
     * and writes the result to the writer.
     * <p>
     * Nothing is written when {@code string} is {@code null}, {@code length} is negative or
     * {@code offset} lies behind the end of the array. A negative {@code offset} is treated as 0
     * and {@code length} is cut down to the characters really available.
     * </p>
     *
     * @param string                       the characters to convert
     * @param offset                       index of the first character to convert
     * @param length                       maximum number of characters to convert
     * @param encodeNewline                if true newline characters are converted to {@code <br/>}
     * @param encodeSubsequentBlanksToNbsp if true a blank at array index 0 or directly after another
     *                                     blank is converted to {@code &#160;}
     * @param encodeNonLatin               if true characters above 0x80 are encoded as character
     *                                     references; a surrogate pair lying completely inside the
     *                                     region is written as ONE numeric reference
     * @param writer                       the writer receiving the encoded characters
     * @throws IOException if the writer fails
     */
    public static void encode(char[] string, int offset, int length,
                              boolean encodeNewline,
                              boolean encodeSubsequentBlanksToNbsp,
                              boolean encodeNonLatin, Writer writer) throws IOException
    {
        if (string == null || length < 0 || offset >= string.length)
        {
            return;
        }
        offset = Math.max(0, offset);
        final int realLength = Math.min(length, string.length - offset);
        final int end = offset + realLength;

        StringBuilder sb = null; // created on demand
        for (int i = offset; i < end; ++i)
        {
            final int start = i; // index of the first code unit covered by "app"
            final char c = string[i];
            String app = null;

            // All characters before letters
            if (c < 0x41)
            {
                // The preceding array element is inspected even when it lies before "offset".
                boolean blankToNbsp = c == ' ' && encodeSubsequentBlanksToNbsp
                        && (i == 0 || string[i - 1] == ' ');
                app = encodeMarkupChar(c, encodeNewline, blankToNbsp);
            }
            else if (encodeNonLatin && c > 0x80)
            {
                if (Character.isHighSurrogate(c) && i + 1 < end
                        && Character.isLowSurrogate(string[i + 1]))
                {
                    app = numericReference(Character.toCodePoint(c, string[i + 1]));
                    ++i;
                }
                else
                {
                    app = encodeNonLatinChar(c);
                }
            }

            if (app != null)
            {
                if (sb == null)
                {
                    sb = new StringBuilder(realLength * 2);
                    sb.append(string, offset, start - offset);
                }
                sb.append(app);
            }
            else if (sb != null)
            {
                sb.append(c);
            }
        }

        if (sb == null)
        {
            // Nothing had to be replaced: hand the region over untouched.
            writer.write(string, offset, realLength);
        }
        else
        {
            writer.write(sb.toString());
        }
    }

    /**
     * Returns the replacement for a character below 0x41, or {@code null} when the character
     * can be written as it is.
     *
     * @param c             the character to check
     * @param encodeNewline if true a newline is replaced by {@code <br/>}
     * @param blankToNbsp   if true a blank is replaced by {@code &#160;}
     */
    private static String encodeMarkupChar(char c, boolean encodeNewline, boolean blankToNbsp)
    {
        switch (c)
        {
            case '"':
                return "&quot;";
            case '&':
                return "&amp;";
            case '<':
                return "&lt;";
            case '>':
                return "&gt;";
            case ' ':
                // The numeric reference is used because it is valid in HTML as well as in XML.
                return blankToNbsp ? "&#160;" : null;
            case '\n':
                return encodeNewline ? "<br/>" : null;
            default:
                return null;
        }
    }

    /**
     * Returns the character reference for a single character above 0x80: a named entity for
     * the handful of characters listed here, a decimal numeric reference for everything else.
     * Note: the euro symbol sometimes shows up as code 0x80; that value is not handled here.
     */
    private static String encodeNonLatinChar(char c)
    {
        switch (c)
        {
            //german umlauts
            case '\u00E4':
                return "&auml;";
            case '\u00C4':
                return "&Auml;";
            case '\u00F6':
                return "&ouml;";
            case '\u00D6':
                return "&Ouml;";
            case '\u00FC':
                return "&uuml;";
            case '\u00DC':
                return "&Uuml;";
            case '\u00DF':
                return "&szlig;";

            //misc
            case '\u20AC':
                return "&euro;";
            case '\u00AB':
                return "&laquo;";
            case '\u00BB':
                return "&raquo;";
            case '\u00A0':
                return "&#160;";

            default:
                //encode all non basic latin characters
                return numericReference(c);
        }
    }

    /** Builds the decimal numeric character reference for the given code point. */
    private static String numericReference(int codePoint)
    {
        return "&#" + codePoint + ";";
    }

    /**
     * Encode an URI, escaping or percent-encoding all required characters and
     * following the rules mentioned on RFC 3986.
     * <p>
     * Everything up to the first {@code ?} or {@code #} is percent-encoded using UTF-8; the
     * remaining part (query and fragment) is percent-encoded using {@code characterEncoding},
     * and every {@code &} found there that does not already start {@code &amp;} is written as
     * {@code &amp;}. A {@code %} followed by two hexadecimal digits is taken as an already
     * encoded octet and left alone; any other {@code %} becomes {@code %25}.
     * </p>
     *
     * @param string            the URI to encode, may be {@code null}
     * @param characterEncoding name of the document character encoding, used for the part after
     *                          {@code ?} or {@code #}; UTF-8 is used when it is {@code null} or
     *                          not supported
     * @return the encoded URI (the same instance when nothing had to be encoded), or the empty
     *         string when {@code string} is {@code null}
     * @throws IOException declared for backward compatibility, not thrown by this implementation
     */
    public static String encodeURIAtributte(final String string, final String characterEncoding)
        throws IOException
    {
        if (string == null)
        {
            return "";
        }

        // These are the guidelines taken into account by this algorithm:

        // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters
        //
        // control = <US-ASCII coded characters 00-1F and 7F hexadecimal>
        // space   = <US-ASCII coded character 20 hexadecimal>
        // delims  = "<" | ">" | "#" | "%" | <">
        //           %3C   %3E   %23   %25   %22
        // unwise  = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
        //           %7B   %7D   %7C   %5C   %5E   %5B   %5D   %60
        //
        // ".... Data corresponding to excluded characters must be escaped in order to
        // be properly represented within a URI....."

        // RFC 3986 Section 3. Syntax Components
        //
        // "... The generic URI syntax consists of a hierarchical sequence of
        // components referred to as the scheme, authority, path, query, and
        // fragment.
        //
        // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
        //
        // hier-part = "//" authority path-abempty
        //           / path-absolute
        //           / path-rootless
        //           / path-empty
        // ...."

        // RFC 3986 Section 2.2:
        // Reserved characters (should not be percent-encoded)
        // reserved   = gen-delims / sub-delims
        // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
        //              %3A   %2F   %3F   %23   %5B   %5D   %40
        // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
        //              %21   %24   %26   %27   %28   %29   %2A   %2B   %2C   %3B   %3D

        // Note that the chars "[" and "]" are listed by RFC 2396 as chars to be escaped,
        // but part D. "Changes from RFC 2396" of RFC 3986 says about these chars (used on IPv6)
        // "...those rules were redefined to directly specify the characters allowed...."
        // These characters were moved from the excluded list to the reserved list:
        // "[" / "]" / "#"

        // RFC 3986 Section 2.3:
        // "... for consistency, percent-encoded octets in the ranges of ALPHA
        // (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
        // underscore (%5F), or tilde (%7E) should not be created by URI
        // producers...."

        // RFC 3986 Section 3.2.2. Host
        //
        // host = IP-literal / IPv4address / reg-name
        //
        // The reg-name syntax allows percent-encoded octets in order to
        // represent non-ASCII registered names in a uniform way that is
        // independent of the underlying name resolution technology. Non-ASCII
        // characters must first be encoded according to UTF-8 [STD63], and then
        // each octet of the corresponding UTF-8 sequence must be percent-
        // encoded to be represented as URI characters. URI producing
        // applications must not use percent-encoding in host unless it is used
        // to represent a UTF-8 character sequence.

        // RFC 3986 Section 3.4 Query
        // query = *( pchar / "/" / "?" )
        //
        // "... However, as query components are often used to carry identifying information
        // in the form of "key=value" pairs and one frequently used value is a reference to
        // another URI, it is sometimes better for usability to avoid percent-encoding those
        // characters....."
        //
        // RFC 3986 Section 2.5 Identifying Data (applies to the query section)
        //
        // When a new URI scheme defines a component that represents textual
        // data consisting of characters from the Universal Character Set [UCS],
        // the data should first be encoded as octets according to the UTF-8
        // character encoding [STD63]; then only those octets that do not
        // correspond to characters in the unreserved set should be percent-
        // encoded. For example, the character A would be represented as "A",
        // the character LATIN CAPITAL LETTER A WITH GRAVE would be represented
        // as "%C3%80", and the character KATAKANA LETTER A would be represented
        // as "%E3%82%A2".
        //
        // RFC 3986 Section 3.5 Fragment
        // fragment = *( pchar / "/" / "?" )
        //
        // Note that the fragment follows the same rules as the query.

        // Based on these extracts the strategy applied by this method is:
        //
        // On scheme ":" hier-part
        //
        // Escape or percent encode these chars:
        //
        // - From %00 to %20,
        // - <"> %22, "%" %25 (encoding "%" carries the risk of encoding twice, so it is
        //   only encoded when it does not already start a percent-encoded octet)
        // - "<" %3C, ">" %3E
        // - "\" %5C, "^" %5E, "`" %60
        // - "{" %7B, "|" %7C, "}" %7D
        // - From %7F ad infinitum (characters from %100 on should not be used in this
        //   part of an URI, but it is preferred to encode them rather than to omit them).
        //
        // The remaining characters must not be encoded.
        //
        // Characters after ? or # should be percent encoded, but only the necessary ones:
        //
        // - From %00 to %20 (' ' could be encoded as +, but %20 also works, so we keep %20)
        // - <"> %22, "%" %25 (same restriction as above)
        // - "<" %3C, ">" %3E,
        // - "\" %5C, "^" %5E, "`" %60
        // - "{" %7B, "|" %7C, "}" %7D
        // - From %7F ad infinitum (each character as many bytes as necessary, taking into account
        //   that a single character may need 2, 3 or more bytes). This data is encoded
        //   translating from the document character encoding to percent encoding, because the
        //   values could be retrieved with httpRequest.getParameter(), which uses the current
        //   character encoding to decode them.
        //
        // "&" should be encoded as "&amp;" because this link is inside an html page, and
        // a plain "&" is invalid in this context.

        final int length = string.length();
        StringBuilder sb = null; // created on demand
        for (int i = 0; i < length; ++i)
        {
            final int start = i; // index of the first code unit covered by "app"
            final char c = string.charAt(i);
            String app = null;
            boolean restConsumed = false;

            if (isExcludedUriChar(c))
            {
                // The percent encoding on this part should be done using UTF-8 charset
                // as RFC 3986 Section 3.2.2 says.
                // Also there is a reference on
                // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars
                // that recommends the use of UTF-8 instead of the document character encoding.
                // Jetty sets UTF-8 by default (see http://jira.codehaus.org/browse/JETTY-113)
                final int units = codeUnitCount(string, i);
                app = percentEncode(string, i, units, UTF8);
                i += units - 1; // skip the low surrogate that was encoded together with c
            }
            else if (c == '%')
            {
                // Do not percent encode what could be already encoded, we don't want to encode twice.
                if (!isPercentEncodedOctet(string, i))
                {
                    app = percentEncodeAscii(c);
                }
            }
            else if (c == '?' || c == '#')
            {
                if (i + 1 < length)
                {
                    // The remaining part of the URI is data that should be encoded
                    // using the document character encoding.
                    app = c + encodeURIQuery(string.substring(i + 1), characterEncoding);
                    restConsumed = true;
                }
            }
            // else: no encoding, the char is appended below.

            if (app != null)
            {
                if (sb == null)
                {
                    sb = new StringBuilder(string.substring(0, start));
                }
                sb.append(app);
            }
            else if (sb != null)
            {
                sb.append(c);
            }

            if (restConsumed)
            {
                break;
            }
        }

        return sb == null ? string : sb.toString();
    }

    /**
     * Encode the query part using the document charset encoding provided.
     *
     * @param string            everything following the first {@code ?} or {@code #} of the URI
     * @param characterEncoding name of the document character encoding; UTF-8 is used when it is
     *                          {@code null} or not supported
     * @return the encoded query (the same instance when nothing had to be encoded)
     */
    private static String encodeURIQuery(final String string, final String characterEncoding)
    {
        // - From %00 to %20 (' ' could be encoded as +, but %20 also works, so we keep %20)
        // - <"> %22, "%" %25 (only when it does not already start a percent-encoded octet)
        // - "<" %3C, ">" %3E,
        // - "\" %5C, "^" %5E, "`" %60
        // - "{" %7B, "|" %7C, "}" %7D
        // - From %7F ad infinitum (each character as many bytes as necessary, translating from
        //   the document character encoding to percent encoding)
        //
        // "&" should be encoded as "&amp;" because this link is inside an html page, and
        // a plain "&" is invalid in this context.

        final int length = string.length();
        StringBuilder sb = null; // created on demand
        for (int i = 0; i < length; ++i)
        {
            final int start = i; // index of the first code unit covered by "app"
            final char c = string.charAt(i);
            String app = null;

            if (isExcludedUriChar(c))
            {
                final int units = codeUnitCount(string, i);
                app = percentEncode(string, i, units, characterEncoding);
                i += units - 1; // skip the low surrogate that was encoded together with c
            }
            else if (c == '%')
            {
                // Do not percent encode what could be already encoded.
                if (!isPercentEncodedOctet(string, i))
                {
                    app = percentEncodeAscii(c);
                }
            }
            else if (c == '&')
            {
                // Leave an "&amp;" that is already there alone.
                if (!string.startsWith("amp;", i + 1))
                {
                    app = "&amp;";
                }
            }
            // else: no encoding, the char is appended below.

            if (app != null)
            {
                if (sb == null)
                {
                    sb = new StringBuilder(string.substring(0, start));
                }
                sb.append(app);
            }
            else if (sb != null)
            {
                sb.append(c);
            }
        }

        return sb == null ? string : sb.toString();
    }

    /**
     * Tells whether the character always has to be percent-encoded: controls and space
     * (up to 0x20), everything from 0x7F on, and the delimiters/unwise characters
     * {@code " < > \ ^ ` { | }}.
     */
    private static boolean isExcludedUriChar(char c)
    {
        return c <= 0x20 || c >= 0x7F
                || c == '"' || c == '<' || c == '>'
                || c == '\\' || c == '^' || c == '`'
                || c == '{' || c == '|' || c == '}';
    }

    /**
     * Tells whether the {@code %} at {@code index} is followed by two hexadecimal digits,
     * that is, whether it starts an octet that is already percent-encoded.
     */
    private static boolean isPercentEncodedOctet(String string, int index)
    {
        return index + 2 < string.length()
                && isHexDigit(string.charAt(index + 1))
                && isHexDigit(string.charAt(index + 2));
    }

    /** Tells whether the character is an ASCII hexadecimal digit (upper or lower case). */
    private static boolean isHexDigit(char c)
    {
        return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
    }

    /**
     * Returns 2 when a valid surrogate pair starts at {@code index}, 1 otherwise.
     */
    private static int codeUnitCount(String string, int index)
    {
        boolean pair = Character.isHighSurrogate(string.charAt(index))
                && index + 1 < string.length()
                && Character.isLowSurrogate(string.charAt(index + 1));
        return pair ? 2 : 1;
    }

    /**
     * Percent-encodes one character of the US-ASCII range (0x00-0x7F) as a single octet.
     */
    private static String percentEncodeAscii(char c)
    {
        return "%" + HEX_CHARSET.charAt((c >> 0x4) % 0x10) + HEX_CHARSET.charAt(c % 0x10);
    }

    /**
     * Percent-encodes the character starting at {@code offset}. A US-ASCII character yields a
     * single octet; any other character is first converted to bytes using the given encoding
     * and every byte is percent-encoded.
     *
     * @param string            the string holding the character
     * @param offset            index of the first code unit of the character
     * @param count             number of code units of the character (2 for a surrogate pair)
     * @param characterEncoding name of the encoding; UTF-8 is used when it is {@code null} or
     *                          not supported, so that the character never goes out unencoded
     */
    private static String percentEncode(String string, int offset, int count, String characterEncoding)
    {
        final char first = string.charAt(offset);
        if (first <= 0x7F)
        {
            return percentEncodeAscii(first);
        }

        byte[] octets;
        try
        {
            octets = toOctets(string, offset, count, characterEncoding == null ? UTF8 : characterEncoding);
        }
        catch (IOException unsupported)
        {
            // Unknown encoding: fall back to UTF-8 instead of leaving the character unencoded.
            try
            {
                octets = toOctets(string, offset, count, UTF8);
            }
            catch (IOException e)
            {
                throw new IllegalStateException("UTF-8 encoding is not available", e);
            }
        }

        final StringBuilder builder = new StringBuilder(octets.length * 3);
        for (int k = 0; k < octets.length; k++)
        {
            final int value = octets[k] & 0xFF; // treat the byte as unsigned
            builder.append('%');
            builder.append(HEX_CHARSET.charAt(value >> 0x4));
            builder.append(HEX_CHARSET.charAt(value % 0x10));
        }
        return builder.toString();
    }

    /**
     * Converts {@code count} code units of the string to bytes using the named encoding.
     *
     * @throws IOException if the named encoding is not supported
     */
    private static byte[] toOctets(String string, int offset, int count, String characterEncoding)
        throws IOException
    {
        final ByteArrayOutputStream baos = new ByteArrayOutputStream(10);
        final OutputStreamWriter writer = new OutputStreamWriter(baos, characterEncoding);
        writer.write(string, offset, count);
        // The target is an in-memory stream, flushing is all that is needed to collect the bytes.
        writer.flush();
        return baos.toByteArray();
    }
}