1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19 package org.apache.myfaces.renderkit.html.util;
20
21 import org.apache.commons.logging.Log;
22 import org.apache.commons.logging.LogFactory;
23
24 /**
25 * A class which detects the open/close tags in an HTML document and reports
26 * them to a listener class.
27 * <p>
28 * This is unfortunately necessary when using JSF with JSP, as tags in the body
29 * of the document can need to output commands into the document at points
30 * earlier than the tag occurred (particularly into the document HEAD section).
31 * This can only be implemented by buffering the response and post-processing
32 * it to find the relevant HTML tags and modifying the buffer as needed.
33 * <p>
34 * This class tries to do the parsing as quickly as possible; many of the
35 * details of HTML are not relevant for the purposes this class is used for.
36 *
37 * @version $Revision: 673833 $ $Date: 2008-07-03 16:58:05 -0500 (Thu, 03 Jul 2008) $
38 */
39 public class ReducedHTMLParser
40 {
41 // IMPLEMENTATION NOTE:
42 //
43 // Many of the methods on this class are package-scope. This is intended
44 // solely for the purpose of unit-testing. This class does not expect
45 // other classes in this package to access its methods.
46
47 private static final Log log = LogFactory.getLog(ReducedHTMLParser.class);
48
49 public static final int BODY_TAG = 0;
50 public static final int HEAD_TAG = 1;
51 public static final int SCRIPT_TAG = 2;
52
53 private static final int STATE_READY = 0;
54 private static final int STATE_IN_COMMENT = 1;
55 private static final int STATE_IN_TAG = 2;
56 private static final int STATE_IN_MARKED_SECTION = 3;
57 private static final int STATE_EXPECTING_ETAGO = 4;
58
59 private int _offset;
60 private int _lineNumber;
61 private CharSequence _seq;
62 private CallbackListener _listener;
63
64 public static void parse(CharSequence seq, CallbackListener l)
65 {
66 new ReducedHTMLParser(seq, l).parse();
67 }
68
69 /**
70 * Constructor, package-scope for unit testing.
71 *
72 * @param s is the sequence of chars to parse.
73 * @param l is the listener to invoke callbacks on.
74 */
75 ReducedHTMLParser(CharSequence s, CallbackListener l)
76 {
77 _seq = s;
78 _listener = l;
79 }
80
81 /**
82 * Return true if there are no more characters to parse.
83 */
84 boolean isFinished()
85 {
86 return _offset >= _seq.length();
87 }
88
89 int getCurrentLineNumber()
90 {
91 return _lineNumber;
92 }
93
94 /**
95 * Advance the current parse position over any whitespace characters.
96 */
97 void consumeWhitespace()
98 {
99 boolean crSeen = false;
100
101 while (_offset < _seq.length())
102 {
103 char c = _seq.charAt(_offset);
104 if (!Character.isWhitespace(c))
105 {
106 break;
107 }
108
109 // Track line number for error messages.
110 if (c == '\r')
111 {
112 ++_lineNumber;
113 crSeen = true;
114 }
115 else if ((c == '\n') && !crSeen)
116 {
117 ++_lineNumber;
118 }
119 else
120 {
121 crSeen = false;
122 }
123
124 ++_offset;
125 }
126 }
127
128 /**
129 * Eat up a sequence of non-whitespace characters and return them.
130 */
131 String consumeNonWhitespace()
132 {
133 int wordStart = _offset;
134 while (_offset < _seq.length())
135 {
136 char c = _seq.charAt(_offset);
137 if (Character.isWhitespace(c))
138 {
139 break;
140 }
141 ++_offset;
142 }
143 if (wordStart == _offset)
144 {
145 return null;
146 }
147 else
148 {
149 return _seq.subSequence(wordStart, _offset).toString();
150 }
151 }
152
153 /**
154 * If the next chars in the input sequence exactly match the specified
155 * string then skip over them and return true.
156 * <p>
157 * If there is not a match then leave the current parse position
158 * unchanged and return false.
159 *
160 * @param s is the exact string to match.
161 * @return true if the input contains exactly the param s
162 */
163 boolean consumeMatch(String s)
164 {
165 if (_offset + s.length() > _seq.length())
166 {
167 // seq isn't long enough to contain the specified string
168 return false;
169 }
170
171 int i = 0;
172 while (i < s.length())
173 {
174 if (_seq.charAt(_offset+i) == s.charAt(i))
175 {
176 ++i;
177 }
178 else
179 {
180 return false;
181 }
182 }
183
184 _offset += i;
185 return true;
186 }
187
188 /**
189 * Eat up a sequence of chars which form a valid XML element name.
190 * <p>
191 * TODO: implement this properly in compliance with spec
192 */
193 String consumeElementName()
194 {
195 consumeWhitespace();
196 int nameStart = _offset;
197 while (!isFinished())
198 {
199 boolean ok = false;
200 char c = _seq.charAt(_offset);
201 if (Character.isLetterOrDigit(_seq.charAt(_offset)))
202 {
203 ok = true;
204 }
205 else if (c == '_')
206 {
207 ok = true;
208 }
209 else if (c == '-')
210 {
211 ok = true;
212 }
213 else if (c == ':')
214 {
215 ok = true;
216 }
217
218 if (!ok)
219 {
220 break;
221 }
222
223 ++_offset;
224 }
225
226 if (nameStart == _offset)
227 {
228 return null;
229 }
230 else
231 {
232 return _seq.subSequence(nameStart, _offset).toString();
233 }
234 }
235
236 /**
237 * Eat up a sequence of chars which form a valid XML attribute name.
238 * <p>
239 * TODO: implement this properly in compliance with spec
240 */
241 String consumeAttrName()
242 {
243 // for now, assume elements and attributes have same rules
244 return consumeElementName();
245 }
246
247 /**
248 * Eat up a string which is terminated with the specified quote
249 * character. This means handling escaped quote chars within the
250 * string.
251 * <p>
252 * This method assumes that the leading quote has already been
253 * consumed.
254 */
255 String consumeString(char quote)
256 {
257 // TODO: should we consider a string to be terminated by a newline?
258 // that would help with runaway strings but I think that multiline
259 // strings *are* allowed...
260 //
261 // TODO: detect newlines within strings and increment lineNumber.
262 // This isn't so important, though; they aren't common and being a
263 // few lines out in an error message isn't serious either.
264 StringBuffer stringBuf = new StringBuffer();
265 boolean escaping = false;
266 while (!isFinished())
267 {
268 char c = _seq.charAt(_offset);
269 ++_offset;
270 if (c == quote)
271 {
272 if (!escaping)
273 {
274 break;
275 }
276 else
277 {
278 stringBuf.append(c);
279 escaping = false;
280 }
281 }
282 else if (c == '\\')
283 {
284 if (escaping)
285 {
286 // append a real backslash
287 stringBuf.append(c);
288 escaping = false;
289 }
290 else
291 {
292 escaping = true;
293 }
294 }
295 else
296 {
297 if (escaping)
298 {
299 stringBuf.append('\\');
300 escaping = false;
301 }
302
303 stringBuf.append(c);
304 }
305 }
306 return stringBuf.toString();
307 }
308
309 /**
310 * Assuming we have already encountered "attrname=", consume the
311 * value part of the attribute definition. Note that unlike XML,
312 * HTML doesn't have to quote its attribute values.
313 *
314 * @return the attribute value. If the attr-value was quoted,
315 * the returned value will not include the quote chars.
316 */
317 String consumeAttrValue()
318 {
319 consumeWhitespace();
320
321 if (consumeMatch("'"))
322 {
323 return consumeString('\'');
324 }
325 else if (consumeMatch("\""))
326 {
327 return consumeString('"');
328 }
329 else
330 {
331 return consumeNonWhitespace();
332 }
333 }
334
335 /**
336 * Discard all characters in the input until one in the specified
337 * string (character-set) is found.
338 *
339 * @param s is a set of characters that should not be discarded.
340 */
341 void consumeExcept(String s)
342 {
343 boolean crSeen = false;
344
345 while (_offset < _seq.length())
346 {
347 char c = _seq.charAt(_offset);
348 if (s.indexOf(c) >= 0)
349 {
350 // char is in the exception set
351 return;
352 }
353
354 // Track line number for error messages.
355 if (c == '\r')
356 {
357 ++_lineNumber;
358 crSeen = true;
359 }
360 else if ((c == '\n') && !crSeen)
361 {
362 ++_lineNumber;
363 }
364 else
365 {
366 crSeen = false;
367 }
368
369 ++_offset;
370 }
371 }
372
373 /**
374 * Process the entire input buffer, invoking callbacks on the listener
375 * object as appropriate.
376 */
377 void parse()
378 {
379 int state = STATE_READY;
380
381 int currentTagStart = -1;
382 String currentTagName = null;
383
384 _lineNumber = 1;
385 _offset = 0;
386 int lastOffset = _offset -1;
387 while (_offset < _seq.length())
388 {
389 // Sanity check; each pass through this loop must increase the offset.
390 // Failure to do this means a hang situation has occurred.
391 if (_offset <= lastOffset)
392 {
393 // throw new RuntimeException("Infinite loop detected in ReducedHTMLParser");
394 log.error("Infinite loop detected in ReducedHTMLParser; parsing skipped."+
395 " Surroundings: '" + getTagSurroundings() +"'.");
396 //return;
397 }
398 lastOffset = _offset;
399
400 if (state == STATE_READY)
401 {
402 // in this state, nothing but "<" has any significance
403 consumeExcept("<");
404 if (isFinished())
405 {
406 break;
407 }
408
409 if (consumeMatch("<!--"))
410 {
411 // Note that whitespace is *not* permitted in <!--
412 state = STATE_IN_COMMENT;
413 }
414 else if (consumeMatch("<!["))
415 {
416 // Start of a "marked section", eg "<![CDATA" or
417 // "<![INCLUDE" or "<![IGNORE". These always terminate
418 // with "]]>"
419 log.debug("Marked section found at line " + getCurrentLineNumber()+". "+
420 "Surroundings: '" + getTagSurroundings() +"'.");
421 state = STATE_IN_MARKED_SECTION;
422 }
423 else if (consumeMatch("<!DOCTYPE"))
424 {
425 log.debug("DOCTYPE found at line " + getCurrentLineNumber());
426 // we don't need to actually do anything here; the
427 // tag can't contain a bare "<", so the first "<"
428 // indicates the start of the next real tag.
429 //
430 // TODO: Handle case where the DOCTYPE includes an internal DTD. In
431 // that case there *will* be embedded < chars in the document. However
432 // that's very unlikely to be used in a JSF page, so this is pretty low
433 // priority.
434 }
435 else if (consumeMatch("<?"))
436 {
437 // xml processing instruction or <!DOCTYPE> tag
438 // we don't need to actually do anything here; the
439 // tag can't contain a bare "<", so the first "<"
440 // indicates the start of the next real tag.
441 log.debug("PI found at line " + getCurrentLineNumber());
442 }
443 else if (consumeMatch("</"))
444 {
445 if (!processEndTag())
446 {
447 // message already logged
448 return;
449 }
450
451 // stay in state READY
452 state = STATE_READY;
453 }
454 else if (consumeMatch("<"))
455 {
456 // We can't tell the user that the tag has closed until after we have
457 // processed any attributes and found the real end of the tag. So save
458 // the current info until the end of this tag.
459 currentTagStart = _offset - 1;
460 currentTagName = consumeElementName();
461 if (currentTagName == null)
462 {
463 log.warn("Invalid HTML; bare lessthan sign found at line "
464 + getCurrentLineNumber() + ". "+
465 "Surroundings: '" + getTagSurroundings() +"'.");
466 // remain in STATE_READY; this isn't really the start of
467 // an xml element.
468 }
469 else
470 {
471 state = STATE_IN_TAG;
472 }
473 }
474 else
475 {
476 // should never get here
477 throw new Error("Internal error at line " + getCurrentLineNumber());
478 }
479
480 continue;
481 }
482
483 if (state == STATE_IN_COMMENT)
484 {
485 // TODO: handle "-- >", which is a valid way to close a
486 // comment according to the specs.
487
488 // in this state, nothing but "--" has any significance
489 consumeExcept("-");
490 if (isFinished())
491 {
492 break;
493 }
494
495 if (consumeMatch("-->"))
496 {
497 state = STATE_READY;
498 }
499 else
500 {
501 // false call; hyphen is not end of comment
502 consumeMatch("-");
503 }
504
505 continue;
506 }
507
508 if (state == STATE_IN_TAG)
509 {
510 consumeWhitespace();
511
512 if (consumeMatch("/>"))
513 {
514 // ok, end of element
515 state = STATE_READY;
516 closedTag(currentTagStart, _offset, currentTagName);
517
518 // and reset vars just in case...
519 currentTagStart = -1;
520 currentTagName = null;
521 }
522 else if (consumeMatch(">"))
523 {
524 if (currentTagName.equalsIgnoreCase("script")
525 || currentTagName.equalsIgnoreCase("style"))
526 {
527 // We've just started a special tag which can contain anything except
528 // the ETAGO marker ("</"). See
529 // http://www.w3.org/TR/REC-html40/appendix/notes.html#notes-specifying-data
530 state = STATE_EXPECTING_ETAGO;
531 }
532 else
533 {
534 state = STATE_READY;
535 }
536
537 // end of open tag, but not end of element
538 openedTag(currentTagStart, _offset, currentTagName);
539
540 // and reset vars just in case...
541 currentTagStart = -1;
542 currentTagName = null;
543 }
544 else
545 {
546 // xml attribute
547 String attrName = consumeAttrName();
548 if (attrName == null)
549 {
550 // Oops, we found something quite unexpected in this tag.
551 // The best we can do is probably to drop back to looking
552 // for "/>", though that does risk us misinterpreting the
553 // contents of an attribute's associated string value.
554 log.warn("Invalid tag found: unexpected input while looking for attr name or '/>'"
555 + " at line " + getCurrentLineNumber()+". "+
556 "Surroundings: '" + getTagSurroundings() +"'.");
557 state = STATE_EXPECTING_ETAGO;
558 // and consume one character
559 ++_offset;
560 }
561 else
562 {
563 consumeWhitespace();
564
565 // html can have "stand-alone" attributes with no following equals sign
566 if (consumeMatch("="))
567 {
568 consumeAttrValue();
569 }
570 }
571 }
572
573 continue;
574 }
575
576 if (state == STATE_IN_MARKED_SECTION)
577 {
578 // in this state, nothing but "]]>" has any significance
579 consumeExcept("]");
580 if (isFinished())
581 {
582 break;
583 }
584
585 if (consumeMatch("]]>"))
586 {
587 state = STATE_READY;
588 }
589 else
590 {
591 // false call; ] is not end of cdata section
592 consumeMatch("]");
593 }
594
595 continue;
596 }
597
598 if (state == STATE_EXPECTING_ETAGO)
599 {
600 // The term "ETAGO" is the official spec term for "</".
601 consumeExcept("<");
602 if (isFinished())
603 {
604 log.debug("Malformed input page; input terminated while tag not closed.");
605 break;
606 }
607
608 if (consumeMatch("</"))
609 {
610 if (!processEndTag())
611 {
612 return;
613 }
614 state = STATE_READY;
615 }
616 else
617 {
618 // false call; < does not start an ETAGO
619 consumeMatch("<");
620 }
621
622 continue;
623 }
624 }
625 }
626
627 /**
628 * Get details about malformed HTML tag.
629 *
630 * @return Tag surroundings.
631 */
632 private String getTagSurroundings()
633 {
634 int maxLength = 30;
635 int end = _seq.length();
636 if (end - _offset > maxLength) {
637 end = _offset + maxLength;
638 }
639 return _seq.subSequence(_offset, end).toString();
640 }
641
642 /**
643 * Invoked when "</" has been seen in the input, this method
644 * handles the parsing of the end tag and the invocation of the
645 * appropriate callback method.
646 *
647 * @return true if the tag was successfully parsed, and false
648 * if there was a fatal parsing error.
649 */
650 private boolean processEndTag()
651 {
652 int tagStart = _offset - 2;
653 String tagName = consumeElementName();
654 consumeWhitespace();
655 if (!consumeMatch(">"))
656 {
657 // log details about malformed end tag
658 log.error("Malformed end tag '" + tagName + "' at line " + getCurrentLineNumber()
659 + "; skipping parsing. Surroundings: '" + getTagSurroundings() +"'.");
660 return false;
661 }
662
663
664 // inform user that the tag has been closed
665 closedTag(tagStart, _offset, tagName);
666
667 // We can't verify that the tag names balance because this is HTML
668 // we are processing, not XML.
669 return true;
670 }
671
672 /**
673 * Invoke a callback method to inform the listener that we have found a start tag.
674 *
675 * @param startOffset
676 * @param endOffset
677 * @param tagName
678 */
679 void openedTag(int startOffset, int endOffset, String tagName)
680 {
681 //log.debug("Found open tag at " + startOffset + ":" + endOffset + ":" + tagName);
682
683 if ("head".equalsIgnoreCase(tagName))
684 {
685 _listener.openedStartTag(startOffset, HEAD_TAG);
686 _listener.closedStartTag(endOffset, HEAD_TAG);
687 }
688 else if ("body".equalsIgnoreCase(tagName))
689 {
690 _listener.openedStartTag(startOffset, BODY_TAG);
691 _listener.closedStartTag(endOffset, BODY_TAG);
692 }
693 else if ("script".equalsIgnoreCase(tagName))
694 {
695 _listener.openedStartTag(startOffset, SCRIPT_TAG);
696 _listener.closedStartTag(endOffset, SCRIPT_TAG);
697 }
698 }
699
700 void closedTag(int startOffset, int endOffset, String tagName)
701 {
702 //log.debug("Found close tag at " + startOffset + ":" + endOffset + ":" + tagName);
703
704 if ("head".equalsIgnoreCase(tagName))
705 {
706 _listener.openedEndTag(startOffset, HEAD_TAG);
707 _listener.closedEndTag(endOffset, HEAD_TAG);
708 }
709 else if ("body".equalsIgnoreCase(tagName))
710 {
711 _listener.openedEndTag(startOffset, BODY_TAG);
712 _listener.closedEndTag(endOffset, BODY_TAG);
713 }
714 else if ("script".equalsIgnoreCase(tagName))
715 {
716 _listener.openedEndTag(startOffset, SCRIPT_TAG);
717 _listener.closedEndTag(endOffset, SCRIPT_TAG);
718 }
719 }
720 }