/* * [HTMLState.java] * * Summary: Finite state automaton parser to analyse HTML to remove excess whitespace. * * Copyright: (c) 2009-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 2.8 2009-04-04 no longer correct missing entities. Just issue warning messages. * 2.9 2010-01-18 refactor so you first allocate a Compactor object, permitting simultaneous compactings. * 3.0 2010-02-12 trim space inside
..
. * 3.1 2010-12-21 avoid touching JavaScript and other scripts. * 3.2 2010-12-24 handle * Collapses multiple spaces in HTML text, tags and comments to one. * * Trims space from start and end of line. * * Removes whitespace after <dt...><h?...><li..><td...> * * Removes whitespace before </dt></h?></li></td> * * Leaves whitespace alone in <pre>...</pre> * * Leaves whitespace alone inside "..." in tags. * * Normalises newlines to \n. * * If there is whitespace before, or after a comment or a between multiple comments it will be collapsed to a single * space or NL. Macro comments will not remove whitespace entirely before or after. They expand to text, so that * whitespace is significant. * * We emit NLs when we first see one, and avoid emitting subsequent NLs. However, we procrastinate emitting space until * we find the end of the space string. That way we can often eliminate the spaces altogether, replacing it with an NL. * * @author Roedy Green, Canadian Mind Products * @version 3.6 2013-03-01 no longer complain about unescaped " in text. * @see com.mindprod.htmlreflow.HTMLState * @see com.mindprod.jprep.HTMLState * @since 2009 */ enum HTMLState { IN_COMMENT { /* between */ HTMLState next( HTMLCharCategory category, char nextChar ) { switch ( category ) { case BEGIN_TAG: case END_TAG: case QUOTE: case TEXT: emit.append( nextChar ); return IN_COMMENT; case DASH: if ( lookAhead( 2 ).equals( "->" ) ) { // cheat, process 2 extra chars without using state machine charIndex += 2; emit.append( "-->" ); return IN_TEXT; // pick up where we left off as if the comment never happened. // return previousTextState; } else { emit.append( '-' ); return IN_COMMENT; } case IGNORE: return IN_COMMENT; case NL: lineNumber++; emit.append( '\n' ); return IN_COMMENT_REMOVABLE_SPACE; case SPACE: return IN_COMMENT_COMPACTIBLE_SPACE; default: throw new IllegalArgumentException( "program bug: invalid category" ); } } }, IN_COMMENT_COMPACTIBLE_SPACE { /* inside spaces in a comment, which can be collapsed down to a single space */ HTMLState next( HTMLCharCategory category, char nextChar ) { switch ( category ) { case BEGIN_TAG: case END_TAG: case QUOTE: case TEXT: emit.append( ' ' ); emit.append( nextChar ); return IN_COMMENT; case IGNORE: case SPACE: return IN_COMMENT_COMPACTIBLE_SPACE; case DASH: if ( lookAhead( 2 ).equals( "->" ) ) { // cheat, process 2 extra chars without using state machine charIndex += 2; emit.append( " -->" ); return IN_TEXT; // pick up where we left off as if the comment never happened. // return previousTextState; } else { emit.append( " -" ); return IN_COMMENT; } case NL: // we don't suppress NLs inside comments. lineNumber++; emit.append( '\n' ); return IN_COMMENT_REMOVABLE_SPACE; default: throw new IllegalArgumentException( "program bug: invalid category" ); } } }, IN_COMMENT_REMOVABLE_SPACE { /* inside spaces in a comment, after a newline leading on a line, will be totally deleted. */ HTMLState next( HTMLCharCategory category, char nextChar ) { switch ( category ) { case BEGIN_TAG: case END_TAG: case QUOTE: case TEXT: emit.append( nextChar ); return IN_COMMENT; case DASH: if ( lookAhead( 2 ).equals( "->" ) ) { // cheat, process 2 extra chars without using state machine charIndex += 2; emit.append( "-->" ); return IN_TEXT; // pick up where we left off as if the comment never happened. // return previousTextState; } else { emit.append( '-' ); return IN_COMMENT; } case IGNORE: case SPACE: return IN_COMMENT_REMOVABLE_SPACE; case NL: lineNumber++; return IN_COMMENT_REMOVABLE_SPACE; default: throw new IllegalArgumentException( "program bug: invalid category" ); } } }, IN_REMOVABLE_SPACE { /* inside lead spaces on line of ordinary text, or after