Snippet : HTMLState.java

/*
 * [HTMLState.java]
 *
 * Summary: Finite state automaton parser to analyse HTML to remove excess whitespace.
 *
 * Copyright: (c) 2009-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
 *
 * Licence: This software may be copied and used freely for any purpose but military.
 *          http://mindprod.com/contact/nonmil.html
 *
 * Requires: JDK 1.8+
 *
 * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
 *
 * Version History:
 *  2.8 2009-04-04 no longer correct missing entities. Just issue warning messages.
 *  2.9 2010-01-18 refactor so you first allocate a Compactor object, permitting simultaneous compactings.
 *  3.0 2010-02-12 trim space inside <p>..</p>.
 *  3.1 2010-12-21 avoid touching JavaScript and other scripts.
 *  3.2 2010-12-24 handle <script and <?
 *  3.3 2011-11-15 add compactStringAsNeeded
 *  3.4 2012-06-18 no longer strip \n in front of <script
 *  3.5 2012-10-27 show more context where there is an error.
 *  3.6 2013-03-01 no longer complain about unescaped " in text.
 */
package com.mindprod.compactor;

import com.mindprod.entities.EntifyStrings;
import com.mindprod.fastcat.FastCat;

import java.util.regex.Pattern;

import static java.lang.System.*;

/**
 * Finite state automaton parser to analyse HTML to remove excess whitespace.
 * <p/>
 * Collapses multiple spaces in HTML text, tags and comments to one.
 * <p/>
 * Trims space from start and end of line.
 * <p/>
 * Removes whitespace after &lt;dt...&gt;&lt;h?...&gt;&lt;li..&gt;&lt;td...&gt;
 * <p/>
 * Removes whitespace before &lt;/dt&gt;&lt;/h?&gt;&lt;/li&gt;&lt;/td&gt;
 * <p/>
 * Leaves whitespace alone in &lt;pre&gt;...&lt;/pre&gt;
 * <p/>
 * Leaves whitespace alone inside &quot;...&quot; in tags.
 * <p/>
 * Normalises newlines to \n.
 * <p/>
 * If there is whitespace before or after a comment, or between multiple comments, it will be collapsed to a single
 * space or NL. Macro comments will not remove whitespace entirely before or after. They expand to text, so that
 * whitespace is significant.
 * <p/>
 * We emit NLs when we first see one, and avoid emitting subsequent NLs. However, we procrastinate emitting space until
 * we find the end of the space string. That way we can often eliminate the spaces altogether, replacing it with an NL.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 3.6 2013-03-01 no longer complain about unescaped " in text.
 * @see com.mindprod.htmlreflow.HTMLState
 * @see com.mindprod.jprep.HTMLState
 * @since 2009
 */
enum HTMLState
    {
        IN_COMMENT
                    {
                    /* between <!-- and --> */
                    HTMLState next( HTMLCharCategory category, char nextChar )
                        {
                        switch ( category )
                            {
                            case BEGIN_TAG:
                            case END_TAG:
                            case QUOTE:
                            case TEXT:
                                emit.append( nextChar );
                                return IN_COMMENT;
                            case DASH:
                                if ( lookAhead( 2 ).equals( "->" ) )
                                    {
                                    // cheat, process 2 extra chars without using state machine
                                    charIndex += 2;
                                    emit.append( "-->" );
                                    return IN_TEXT;
                                    // pick up where we left off as if the comment never happened.
                                    //   return previousTextState;
                                    }
                                else
                                    {
                                    emit.append( '-' );
                                    return IN_COMMENT;
                                    }
                            case IGNORE:
                                return IN_COMMENT;
                            case NL:
                                lineNumber++;
                                emit.append( '\n' );
                                return IN_COMMENT_REMOVABLE_SPACE;
                            case SPACE:
                                return IN_COMMENT_COMPACTIBLE_SPACE;
                            default:
                                throw new IllegalArgumentException( "program bug: invalid category" );
                            }
                        }
                    },
        IN_COMMENT_COMPACTIBLE_SPACE
                    {
                    /* inside spaces in a comment, which can be collapsed down to a single space */
                    HTMLState next( HTMLCharCategory category, char nextChar )
                        {
                        switch ( category )
                            {
                            case BEGIN_TAG:
                            case END_TAG:
                            case QUOTE:
                            case TEXT:
                                emit.append( ' ' );
                                emit.append( nextChar );
                                return IN_COMMENT;
                            case IGNORE:
                            case SPACE:
                                return IN_COMMENT_COMPACTIBLE_SPACE;
                            case DASH:
                                if ( lookAhead( 2 ).equals( "->" ) )
                                    {
                                    // cheat, process 2 extra chars without using state machine
                                    charIndex += 2;
                                    emit.append( " -->" );
                                    return IN_TEXT;
                                    // pick up where we left off as if the comment never happened.
                                    //   return previousTextState;
                                    }
                                else
                                    {
                                    emit.append( " -" );
                                    return IN_COMMENT;
                                    }
                            case NL:
                                // we don't suppress NLs inside comments.
                                lineNumber++;
                                emit.append( '\n' );
                                return IN_COMMENT_REMOVABLE_SPACE;
                            default:
                                throw new IllegalArgumentException( "program bug: invalid category" );
                            }
                        }
                    },
        IN_COMMENT_REMOVABLE_SPACE
                    {
                    /* inside spaces in a comment, after a newline leading on a line,
            will be totally deleted. */
                    HTMLState next( HTMLCharCategory category, char nextChar )
                        {
                        switch ( category )
                            {
                            case BEGIN_TAG:
                            case END_TAG:
                            case QUOTE:
                            case TEXT:
                                emit.append( nextChar );
                                return IN_COMMENT;
                            case DASH:
                                if ( lookAhead( 2 ).equals( "->" ) )
                                    {
                                    // cheat, process 2 extra chars without using state machine
                                    charIndex += 2;
                                    emit.append( "-->" );
                                    return IN_TEXT;
                                    // pick up where we left off as if the comment never happened.
                                    //  return previousTextState;
                                    }
                                else
                                    {
                                    emit.append( '-' );
                                    return IN_COMMENT;
                                    }
                            case IGNORE:
                            case SPACE:
                                return IN_COMMENT_REMOVABLE_SPACE;
                            case NL:
                                lineNumber++;
                                return IN_COMMENT_REMOVABLE_SPACE;
                            default:
                                throw new IllegalArgumentException( "program bug: invalid category" );
                            }
                        }
                    },
        IN_REMOVABLE_SPACE
                    {
                    /*
                     * Inside lead spaces on a line of ordinary text, or after <td>....
                     * White space seen in this state disappears entirely, which also removes empty lines.
                     */
                    HTMLState next( HTMLCharCategory category, char nextChar )
                        {
                        switch ( category )
                            {
                            case BEGIN_TAG:
                                tagCategory =
                                        // allow for / and >, < already parsed.
                                        TagCategory.categorise( parsePartialTag( lookAhead( LONGEST_COMPRESSIBLE_TAG +
                                                                                            2 ) ) );
                                switch ( tagCategory )
                                    {
                                    case COMMENT:
                                        // skip over the rest of "<!--".
                                        // Will be further later incremented by 1 by charIndexLoop
                                        charIndex += "!--".length();
                                        if ( isKeeperComment() )
                                            {
                                            // keep the comment, but collapse white space inside it
                                            emit.append( "<!--" );
                                            return IN_COMMENT;
                                            }
                                        else
                                            {
                                            // strip the comment
                                            return STRIPPING_COMMENT;
                                            }
                                    case LEFT_TRIM:
                                        undoRecentWhiteSpace();// remove any space or NL emitted earlier
                                        emit.append( '<' );
                                        return IN_TAG;
                                    case INVALID:
                                        // The stray character here is the '<' we are about to emit, so that
                                        // is the one to report, exactly as IN_TEXT does for the same case.
                                        shouldCorrect( '<' );
                                        emit.append( '<' );
                                        return IN_TEXT;
                                    case PLAIN:
                                    case PRE:/* don't go into IN_PRE until end of <pre> */
                                    case RIGHT_TRIM:
                                        emit.append( '<' );
                                        return IN_TAG;
                                    case SCRIPT:
                                        // white space in front of <script is deliberately left alone
                                        emit.append( '<' );
                                        return IN_SCRIPT;
                                    case SLASH_PRE:
                                        err.println( "Error: </pre> unbalanced" + where() );
                                        emit.append( '<' );
                                        return IN_TAG;
                                    case SLASH_SCRIPT:
                                        emit.append( '<' );
                                        return IN_TAG;
                                    default:
                                        throw new IllegalArgumentException( "program bug: invalid tagCategory" );
                                    }
                            case END_TAG:
                                // should have been entity, treat as text, don't change original
                                shouldCorrect( '>' );
                                emit.append( '>' );
                                return IN_TEXT;
                            case DASH:
                            case QUOTE: /* no longer complain about unescaped " */
                            case TEXT:
                                emit.append( nextChar );
                                return IN_TEXT;
                            case IGNORE:
                            case SPACE:
                                // we remove empty lines.
                                return IN_REMOVABLE_SPACE;
                            case NL:
                                // count the line, but emit nothing: empty lines are removed
                                lineNumber++;
                                return IN_REMOVABLE_SPACE;
                            default:
                                throw new IllegalArgumentException( "program bug: invalid category" );
                            }
                        }
                    },
        IN_PRE
                /* Inside <pre>...</pre>, not counting the two tags themselves.
                   Normally lead/trail tags are processed IN_TAG, but inside <pre>...</pre>
                   a tag such as <em> is handled as IN_PRE, not IN_TAG. */
                    {
                    HTMLState next( HTMLCharCategory category, char nextChar )
                        {
                        switch ( category )
                            {
                            case IGNORE:
                                return IN_PRE;
                            case NL:
                                // empty lines are kept inside <pre>
                                lineNumber++;
                                emit.append( '\n' );
                                return IN_PRE;
                            case TEXT:
                            case SPACE:
                            case QUOTE:
                            case END_TAG:
                            case DASH:
                                // preformatted: every character, spaces included, is copied as is
                                emit.append( nextChar );
                                return IN_PRE;
                            case BEGIN_TAG:
                                tagCategory =
                                        // allow for / and >, < already parsed.
                                        TagCategory.categorise( parsePartialTag( lookAhead( LONGEST_COMPRESSIBLE_TAG +
                                                                                            2 ) ) );
                                emit.append( '<' );
                                switch ( tagCategory )
                                    {
                                    case SLASH_PRE:
                                        // the only tag that gets us out: finish </pre> as an ordinary tag
                                        return IN_TAG;
                                    case PRE:
                                        err.println( "Error: <pre> unbalanced" + where() );
                                        return IN_PRE;
                                    // every other tag inside <pre>...</pre> is treated as pre text.
                                    case SLASH_SCRIPT:
                                    case SCRIPT:
                                    case RIGHT_TRIM:
                                    case PLAIN:
                                    case LEFT_TRIM:
                                    case INVALID:
                                    case COMMENT:
                                        return IN_PRE;
                                    default:
                                        throw new IllegalArgumentException( "program bug: invalid tagCategory" );
                                    }
                            default:
                                throw new IllegalArgumentException( "program bug: invalid category" );
                            }
                        }
                    },
        IN_SCRIPT
                    {
                    // Entered on seeing <script. Everything is passed through untouched until </script.
                    // <noscript is treated like an ordinary tag.
                    HTMLState next( HTMLCharCategory category, char nextChar )
                        {
                        switch ( category )
                            {
                            case IGNORE:
                                return IN_SCRIPT;
                            case NL:
                                lineNumber++;
                                emit.append( '\n' );
                                return IN_SCRIPT;
                            case BEGIN_TAG:
                                // allow for / and >, < already parsed
                                tagCategory = TagCategory.categorise( parsePartialTag( lookAhead(
                                        LONGEST_COMPRESSIBLE_TAG + 2 ) ) );
                                emit.append( nextChar );
                                // Only </script ends the script; its remainder is handled like an ordinary tag.
                                // Any other tag is considered part of the script.
                                return tagCategory == TagCategory.SLASH_SCRIPT ? IN_TAG : IN_SCRIPT;
                            case TEXT:
                            case SPACE:
                            case QUOTE:
                            case END_TAG:
                            case DASH:
                                // script content is copied verbatim
                                emit.append( nextChar );
                                return IN_SCRIPT;
                            default:
                                throw new IllegalArgumentException( "program bug: invalid category" );
                            }
                        }
                    },
        IN_TAG
                    {
                    /* Inside <xxx after the first <  ...> or </xxxx ...> or <? ...*> ,
                     * but not inside "..." */
                    HTMLState next( HTMLCharCategory category, char nextChar )
                        {
                        switch ( category )
                            {
                            case IGNORE:
                                return IN_TAG;
                            case SPACE:
                                // nothing emitted yet, the run of spaces may collapse or vanish
                                return IN_TAG_COMPACTIBLE_SPACE;
                            case NL:
                                // NLs inside tags are left in place.
                                lineNumber++;
                                emit.append( '\n' );// ignore preceding white space.
                                return IN_TAG_REMOVABLE_SPACE;
                            case QUOTE:
                                emit.append( '\"' );
                                return IN_TAG_QUOTE;
                            case DASH:
                            case TEXT:
                                emit.append( nextChar );
                                return IN_TAG;
                            case BEGIN_TAG:
                                if ( !lookAhead( 3 ).equals( "!--" ) )
                                    {
                                    shouldCorrect( '<' );
                                    emit.append( '<' );
                                    }
                                else
                                    {
                                    err.println( "Compactor Error: Can't have <!-- comments inside tags" + where() );
                                    // leave it alone. we can't fix it.
                                    emit.append( "<" );
                                    }
                                return IN_TAG;
                            case END_TAG:
                                emit.append( '>' );
                                return followingStateForClosedTag();
                            default:
                                throw new IllegalArgumentException( "program bug: invalid category" );
                            }
                        }

                    /* Picks the state that follows the closing > of the tag,
                     * based on the tagCategory determined back when the opening < was hit. */
                    private HTMLState followingStateForClosedTag()
                        {
                        switch ( tagCategory )
                            {
                            case RIGHT_TRIM:
                                return IN_REMOVABLE_SPACE;
                            case PRE:
                                return IN_PRE;
                            case SCRIPT:
                                return IN_SCRIPT;
                            case INVALID:
                                throw new IllegalArgumentException(
                                        "program bug: Invalid tag encountered in IN_TAG state." );
                            case COMMENT:// we already complained.
                            case LEFT_TRIM:// can't be preceding space or NL
                            case PLAIN:
                            case SLASH_PRE:
                            case SLASH_SCRIPT:
                                return IN_TEXT;
                            default:
                                throw new IllegalArgumentException( "program bug: invalid tagCategory" );
                            }
                        }
                    },
        IN_TAG_COMPACTIBLE_SPACE/* inside spaces inside a tag, multiple spaces to be collapsed to one */
                    {
                    /*
                     * Nothing has been emitted for the run of spaces yet. It collapses to a single space
                     * when tag content follows, and is discarded when > or a newline follows.
                     */
                    HTMLState next( HTMLCharCategory category, char nextChar )
                        {
                        switch ( category )
                            {
                            case BEGIN_TAG:
                                if ( lookAhead( 3 ).equals( "!--" ) )
                                    {
                                    err.println( "Compactor Error: Can't have <!-- comments inside tags" + where() );
                                    // leave it alone. we can't fix it.
                                    emit.append( " <" );
                                    }
                                else
                                    {
                                    shouldCorrect( '<' );
                                    emit.append( " <" );
                                    }
                                return IN_TAG;
                            case DASH:
                            case TEXT:
                                emit.append( ' ' );// collapse all previous spaces down to one
                                emit.append( nextChar );
                                return IN_TAG;
                            case END_TAG:
                                // trailing space before > is not only collapsible, it can be discarded altogether.
                                emit.append( ">" );
                                // tag we encountered back after previous <
                                switch ( tagCategory )
                                    {
                                    case COMMENT:
                                    case LEFT_TRIM:
                                    case PLAIN:
                                    case SLASH_PRE:
                                    case SLASH_SCRIPT:// e.g. "</script >", must not be reported as a program bug
                                        return IN_TEXT;
                                    case INVALID:
                                        throw new IllegalArgumentException(
                                                "program bug: Invalid tag encountered in IN_TAG_COMPACTIBLE_SPACE state." );
                                    case PRE:
                                        return IN_PRE;
                                    case RIGHT_TRIM:
                                        return IN_REMOVABLE_SPACE;
                                    case SCRIPT:// same successor as IN_TAG uses for this category
                                        return IN_SCRIPT;
                                    default:
                                        throw new IllegalArgumentException( "program bug: invalid tagCategory" );
                                    }
                            case IGNORE:
                            case SPACE:
                                return IN_TAG_COMPACTIBLE_SPACE;
                            case NL:
                                // keep NLs inside tags
                                lineNumber++;
                                emit.append( '\n' );// ignore preceding white space.
                                return IN_TAG_REMOVABLE_SPACE;
                            case QUOTE:
                                // one collapsed space, then the opening quote
                                emit.append( " \"" );
                                return IN_TAG_QUOTE;
                            default:
                                throw new IllegalArgumentException( "program bug: invalid category" );
                            }
                        }
                    },
        IN_TAG_REMOVABLE_SPACE/* inside lead spaces on line tag split over lines,
         or immediately after first <.  Any spaces following will be totally discarded. */
                    {
                    HTMLState next( HTMLCharCategory category, char nextChar )
                        {
                        switch ( category )
                            {
                            case BEGIN_TAG:
                                shouldCorrect( '<' );
                                emit.append( '<' );
                                return IN_TAG;
                            case DASH:
                            case TEXT:
                                // ignore lead space
                                emit.append( nextChar );
                                return IN_TAG;
                            case END_TAG:
                                emit.append( '>' );
                                // tag we encountered after previous <
                                switch ( tagCategory )
                                    {
                                    case COMMENT:
                                    case LEFT_TRIM:
                                    case PLAIN:
                                    case SLASH_PRE:
                                    case SLASH_SCRIPT:// e.g. "</script" NL ">", must not be reported as a program bug
                                        return IN_TEXT;
                                    case INVALID:
                                        throw new IllegalArgumentException(
                                                "program bug: Invalid tag encountered in IN_TAG_REMOVABLE_SPACE state." );
                                    case PRE:
                                        return IN_PRE;
                                    case RIGHT_TRIM:
                                        return IN_REMOVABLE_SPACE;
                                    case SCRIPT:// same successor as IN_TAG uses for this category
                                        return IN_SCRIPT;
                                    default:
                                        throw new IllegalArgumentException( "program bug: invalid tagCategory" );
                                    }
                            case IGNORE:
                            case SPACE:
                                // remove empty lines inside tags
                                return IN_TAG_REMOVABLE_SPACE;
                            case NL:
                                // count the line, but emit nothing for it
                                lineNumber++;
                                return IN_TAG_REMOVABLE_SPACE;
                            case QUOTE:
                                emit.append( '\"' );
                                return IN_TAG_QUOTE;
                            default:
                                throw new IllegalArgumentException( "program bug: invalid category" );
                            }
                        }
                    },
        IN_TAG_QUOTE
                    {
                    /* inside "..." */
                    HTMLState next( HTMLCharCategory category, char nextChar )
                        {
                        switch ( category )
                            {
                            case BEGIN_TAG:
                            case DASH:
                            case END_TAG:
                            case SPACE:// don't change at all
                            case TEXT:
                                emit.append( nextChar );
                                return IN_TAG_QUOTE;
                            case IGNORE:
                                return IN_TAG_QUOTE;
                            case NL:
                                err.println( "Warning: Quoted string spanning lines. Left as is." + where() );
                                lineNumber++;
                                emit.append( '\n' );
                                return IN_TAG_QUOTE;
                            case QUOTE:
                                emit.append( '\"' );
                                return IN_TAG;
                            default:
                                throw new IllegalArgumentException( "program bug: invalid category" );
                            }
                        }
                    },
        IN_TEXT
                    {
                    /* inside ordinary HTML text, possibly an entity */
                    // Transition function for ordinary text with no white space pending on the left
                    // (presumably the IN_TEXT state; the constant's header is above this method).
                    HTMLState next( HTMLCharCategory category, char nextChar )
                        {
                        switch ( category )
                            {
                            case SPACE:
                                // nothing emitted yet, the run of spaces is dealt with by the next state
                                return IN_TEXT_COMPACTIBLE_SPACE;
                            case NL:
                                // NLs in text are kept; there was no preceding white space.
                                lineNumber++;
                                emit.append( '\n' );
                                return IN_REMOVABLE_SPACE;
                            case IGNORE:
                                return IN_TEXT;
                            case DASH:
                            case QUOTE:// an unescaped " in text no longer draws a complaint
                            case TEXT:
                                emit.append( nextChar );
                                return IN_TEXT;
                            case END_TAG:
                                // stray > in text: warn, but pass it through unchanged
                                shouldCorrect( '>' );
                                emit.append( '>' );
                                return IN_TEXT;
                            case BEGIN_TAG:
                                // The < is already consumed. Peek far enough to hold the longest
                                // compressible tag name plus a lead / and a trailing >.
                                final String upcoming = lookAhead( LONGEST_COMPRESSIBLE_TAG + 2 );
                                tagCategory = TagCategory.categorise( parsePartialTag( upcoming ) );
                                switch ( tagCategory )
                                    {
                                    case COMMENT:
                                        // Jump over "!--"; the main loop's own increment accounts for one more char.
                                        charIndex += "!--".length();
                                        if ( !isKeeperComment() )
                                            {
                                            return STRIPPING_COMMENT;
                                            }
                                        emit.append( "<!--" );
                                        return IN_COMMENT;
                                    case INVALID:
                                        // not a recognisable tag: treat the < as text and warn about it
                                        shouldCorrect( '<' );
                                        emit.append( '<' );
                                        return IN_TEXT;
                                    case SCRIPT:
                                        // white space already emitted ahead of the script is left alone
                                        emit.append( '<' );
                                        return IN_SCRIPT;
                                    case SLASH_PRE:
                                        err.println( "Error: </pre> unbalanced" + where() );
                                        // fall through: after the complaint it is emitted like any other tag
                                    case LEFT_TRIM:// there cannot be a previous space or NL to trim in this state
                                    case PLAIN:
                                    case PRE:
                                    case RIGHT_TRIM:
                                    case SLASH_SCRIPT:
                                        emit.append( '<' );
                                        return IN_TAG;
                                    default:
                                        throw new IllegalArgumentException( "program bug: invalid tagCategory" );
                                    }
                            default:
                                throw new IllegalArgumentException( "program bug: invalid category" );
                            }
                        }
                    },
        IN_TEXT_COMPACTIBLE_SPACE/* inside spaces in ordinary text */
                    {
                    HTMLState next( HTMLCharCategory category, char nextChar )
                        {
                        switch ( category )
                            {
                            case BEGIN_TAG:
                                tagCategory =
                                        // allow for / and >, < already parsed.
                                        TagCategory.categorise( parsePartialTag( lookAhead( LONGEST_COMPRESSIBLE_TAG +
                                                                                            2 ) ) );
                                switch ( tagCategory )
                                    {
                                    case COMMENT:
                                        charIndex += "!--".length();
                                        // Will be further later incremented by 1 by charIndexLoop.
                                        // Record what we were doing so we can pick up where we left off after comment.
                                        if ( isKeeperComment() )
                                            {
                                            // keep the lead space.
                                            emit.append( " <!--" );
                                            //  previousTextState = IN_TEXT_COMPACTIBLE_SPACE;
                                            return IN_COMMENT;
                                            }
                                        else
                                            {
                                            // strip
                                            // will force whitespace after the comment.
                                            //    previousTextState = IN_TEXT_COMPACTIBLE_SPACE;
                                            return STRIPPING_COMMENT;
                                            }
                                    case LEFT_TRIM:
                                        undoRecentWhiteSpace();// remove any space or NL emitted earlier
                                        emit.append( '<' );
                                        return IN_TAG;
                                    case INVALID:
                                        shouldCorrect( '<' );
                                        emit.append( " <" );
                                        return IN_TEXT;
                                    case PLAIN:
                                    case PRE:/* don't go into IN_PRE until end of <pre> */
                                    case RIGHT_TRIM:
                                        emit.append( " <" );
                                        return IN_TAG;
                                    case SCRIPT:
                                        // undoRecentWhiteSpace();// remove any space or NL emitted earlier
                                        emit.append( '<' );
                                        return IN_SCRIPT;
                                    case SLASH_PRE:
                                        err.println( "Error: </pre> unbalanced" + where() );
                                        emit.append( " <" );
                                        return IN_TAG;
                                    case SLASH_SCRIPT:
                                        emit.append( '<' );
                                        return IN_TAG;
                                    default:
                                        throw new IllegalArgumentException( "program bug: invalid tagCategory" );
                                    }
                            case DASH:
                            case QUOTE: /* no longer complain about unescaped " */
                            case TEXT:
                                emit.append( ' ' );// collapse all previous spaces down to one
                                emit.append( nextChar );
                                return IN_TEXT;
                            case END_TAG:
                                shouldCorrect( '>' );
                                emit.append( " >" );
                                return IN_TEXT;
                            case IGNORE:
                            case SPACE:
                                return IN_TEXT_COMPACTIBLE_SPACE;
                            case NL:
                                // keep NL in text
                                lineNumber++;
                                emit.append( '\n' );// ignore preceding white space.
                                return IN_REMOVABLE_SPACE;
                            default:
                                throw new IllegalArgumentException( "program bug: invalid category" );
                            }
                        }
                    },
        STRIPPING_COMMENT
                    {
                    // Inside a <!-- ... --> comment that is being removed entirely.
                    // Nothing is emitted in this state; we only watch for the closing -->.
                    HTMLState next( HTMLCharCategory category, char nextChar )
                        {
                        switch ( category )
                            {
                            case DASH:
                                if ( !"->".equals( lookAhead( 2 ) ) )
                                    {
                                    // just a dash inside the comment
                                    return STRIPPING_COMMENT;
                                    }
                                // This dash starts the closing -->.
                                // Cheat: swallow the remaining two chars without running them through the state machine.
                                charIndex += 2;
                                return IN_TEXT;
                            case NL:
                                // discarded too, but still counted
                                lineNumber++;
                                return STRIPPING_COMMENT;
                            case BEGIN_TAG:
                            case END_TAG:
                            case IGNORE:
                            case QUOTE:
                            case SPACE:
                            case TEXT:
                                // everything else in the comment is thrown away
                                return STRIPPING_COMMENT;
                            default:
                                throw new IllegalArgumentException( "program bug: invalid category" );
                            }
                        }
                    };
    // declarations

    /**
     * true if want debugging output. When true, compactString prints one trace line per input char showing the
     * current state, the most recent tag category, the char and its category.
     */
    private static final boolean DEBUGGING = false;

    /**
     * Length of the longest tag name that will compress spaces either side of it. States add 2 to this when
     * looking ahead, to allow for a lead / and a trailing &gt;.
     */
    private static final int LONGEST_COMPRESSIBLE_TAG = "blockquote".length();

    /**
     * big input string we are parsing. Set at the start of each compactString call.
     */
    private static String big;

    /**
     * offset in big where we are processing. Advanced by the main loop in compactString; states also bump it
     * directly to skip over multi-char sequences such as !-- and --&gt;.
     */
    private static int charIndex;

    /**
     * where we accumulate the compacted output. Leave as StringBuilder. Set back to null when compactString is done.
     */
    private static StringBuilder emit;

    /**
     * line number we are processing, 1-based, reported in error messages. Incremented as NLs are processed and
     * decremented when undoRecentWhiteSpace removes an emitted NL.
     * NOTE(review): originally described as the line number in the output file; NLs inside stripped comments are
     * counted too, so confirm which file the number is meant to track.
     */
    private static int lineNumber;

    /**
     * lookingAt pattern to recognise SSI-style comments that expand to text, e.g. SSI that must be kept no matter what.
     * null means every comment is kept.
     */
    private static Pattern keepPattern;
    // --Commented out by Inspection START (2014-07-26 6:04 AM):
    //    /**
    //     * lets us remember what we were doing before the comment so we can pick up where we left off
    //     */
    //    private static HTMLState previousTextState;
    // --Commented out by Inspection STOP (2014-07-26 6:04 AM)

    /**
     * category of the most recently encountered tag. Assigned by the states when they see the start of a tag.
     */
    private static TagCategory tagCategory;

    /**
     * used in error messages to indicate where the error occurred, usually the name of the file being compacted.
     */
    private static String where;
    // /declarations
    // methods

    /**
     * Check configuration and regex patterns to decide if this is a macro style comment that must be preserved even if
     * other comments are stripped and whose lead and trail whitespace cannot be totally deleted.
     * <p/>
     * Expects charIndex to have already been advanced past the "!--" of the comment opener, so that the comment
     * body starts at charIndex + 1.
     *
     * @return true if we are pointing to a comment we want to keep. Always true when no keepPattern is configured.
     */
    private static boolean isKeeperComment()
        {
        if ( keepPattern == null )
            {
            return true;  // no pattern configured: we keep everything
            }
        // We have only incremented past 3 of the 4 lead chars <!-- at this point,
        // so the comment body starts at charIndex + 1. We have not scanned for the --> yet,
        // so the pattern may potentially look ahead all the way to the end of big.
        // Match over a region of big rather than a substring: with the default anchoring,
        // non-transparent bounds this gives the same result without copying the rest of the input
        // for every comment encountered.
        return keepPattern.matcher( big )
                .region( charIndex + 1, big.length() )
                .lookingAt();
        }    // /method

    /**
     * Peek at chars in the stream that are yet to be processed. The peek starts at charIndex+1, i.e. just after
     * the char currently being processed, and is clipped at the end of the input.
     *
     * @param howFar how many chars you want
     *
     * @return 0 to howFar chars; the empty string when nothing is left to peek at.
     */
    private static String lookAhead( int howFar )
        {
        final int from = charIndex + 1;
        final int to = Math.min( from + howFar, big.length() );
        return from < to ? big.substring( from, to ) : "";
        }   // /method

    /**
     * Extract the tag name from the raw chars following a &lt;.
     *
     * @param partialTag first LONGEST_COMPRESSIBLE_TAG+2 chars of the tag, possibly including trailing space or &gt; and
     *                   other junk, without lead &lt;.
     *
     * @return lower-case tag name with &lt; &gt; and everything from the first foreign char on stripped,
     * e.g. dt, /dt, !--, /blockquote
     */
    private static String parsePartialTag( final String partialTag )
        {
        // Two openers are recognised by prefix alone.
        // <!-- in particular can be followed directly by anything, even an alphabetic.
        if ( partialTag.startsWith( "!--" ) )
            {
            return "!--";
            }
        if ( partialTag.startsWith( "?" ) )
            {
            return "?";
            }
        // Room for the longest name plus a lead / but not < >.
        // StringBuilder suits char by char work better than FastCat.
        final StringBuilder name = new StringBuilder( LONGEST_COMPRESSIBLE_TAG + 1 );
        for ( final char c : partialTag.toCharArray() )
            {
            final boolean upperCase = 'A' <= c && c <= 'Z';
            final boolean verbatim = ( 'a' <= c && c <= 'z' )
                                     || ( '0' <= c && c <= '9' )
                                     || c == '/'
                                     || c == '!';
            if ( !upperCase && !verbatim )
                {
                // any other char ends the name, e.g. space > tab, nl, # etc.
                break;
                }
            name.append( upperCase ? Character.toLowerCase( c ) : c );
            }
        return name.toString();
        }     // /method

    /**
     * Warn on System.err that a raw char in text ought to have been written as an entity. The char itself is not
     * changed here; the message includes the suggested entity and the current location.
     *
     * @param culprit naughty char, we check only < > ", since these are the only ones relevant to compacting.
     */
    private static void shouldCorrect( char culprit )
        {
        final String warning = "Warning: " + culprit + " in text should be corrected to " +
                               EntifyStrings.toHTMLEntity( culprit ) + where();
        err.println( warning );
        }    // /method

    /**
     * Take back the run of spaces and NLs at the tail end of what has been emitted so far. lineNumber is wound back
     * by one for each NL removed.
     */
    private static void undoRecentWhiteSpace()
        {
        // Walk back from the end to find how much of the output survives, then truncate once.
        int keep = emit.length();
        while ( keep > 0 )
            {
            final char tail = emit.charAt( keep - 1 );
            if ( tail == '\n' )
                {
                lineNumber--;
                }
            else if ( tail != ' ' )
                {
                // hit something that is not white space, stop trimming
                break;
                }
            keep--;
            }
        emit.setLength( keep );
        }  // /method

    /**
     * Describe the spot currently being processed, for use in error messages. The description gives the file label,
     * the line number, the offset, and a snippet of the input either side of the offset with a marker between the
     * two halves.
     *
     * @return string describing where error occurred relative to the output file and context.
     * @see com.mindprod.htmlmacros.Replacer#where
     */
    private static String where()
        {
        // Context window: aim for 100 chars either side of charIndex, then widen outward to a line break
        // when there is one.
        int contextStart = Math.max( 0, charIndex - 100 );
        final int precedingNL = big.lastIndexOf( '\n', contextStart );
        if ( precedingNL >= 0 )
            {
            contextStart = precedingNL;
            }
        int contextEnd = Math.min( charIndex + 100, big.length() );
        final int followingNL = big.indexOf( '\n', contextEnd );
        if ( followingNL >= 0 )
            {
            contextEnd = followingNL;
            }
        final String before = big.substring( contextStart, charIndex ).trim();
        final String after = big.substring( charIndex, contextEnd ).trim();

        // 11 pieces in all
        final FastCat message = new FastCat( 11 );
        message.append( "\n  in file: " );
        message.append( where );
        message.append( " near line: " );
        message.append( lineNumber );
        message.append( " near offset: " );
        message.append( charIndex );
        message.append( "\n  [" );
        message.append( before );
        message.append( "  <><>||||<><>  " );
        message.append( after );
        message.append( "]\n\n" );
        return message.toString();
        }    // /method

    /**
     * Implemented by each enum state to find next state given character. This method is the core of the finite state
     * automaton. compactString calls it once for each char the main loop visits. Implementations may append to the
     * output, adjust the line count, and advance the input offset past chars they handle themselves.
     *
     * @param category Category of the next character to process
     * @param nextChar next character to process
     *
     * @return next state of the automaton.
     */
    abstract HTMLState next( HTMLCharCategory category,
                             char nextChar );

    /**
     * Remove excess whitespace from HTML represented by string.
     * <p/>
     * Runs the finite state automaton over the input one char at a time, starting in IN_REMOVABLE_SPACE.
     * All working state lives in static fields of this enum, so overlapping calls would interfere with each other.
     *
     * @param big         the String to compact.
     * @param where       used in error messages to indicate where the error occurred, usually the name of the
     *                    file being compacted.
     * @param keepPattern regex applied with lookingAt to the text just after the lead &lt;!-- of each comment, to pick
     *                    out the comments you want to keep, e.g. Pattern.compile( "\\s*macro\\s+" ) to keep html
     *                    static macro comments of the form &lt;!-- macro ...
     *                    These style of comments are macros, that expand into text, either through JSP, static
     *                    macros, SSI etc. Further, one char of whitespace will be preserved both before and after
     *                    such a comment. With normal comments, whitespace before or after or both collapses down to
     *                    a single whitespace char, possibly before or after the comment. Patterns are
     *                    case-sensitive unless you use (?i) to turn on case insensitivity in your pattern.
     *                    null means keep all comments.
     *
     * @return the compacted String, big itself if nothing changed.
     */
    @SuppressWarnings( { "SameParameterValue" } )
    public static String compactString( final String big,
                                        final String where,
                                        Pattern keepPattern
    )
        {
        // todo: do a quick pre-scan to see if there is any need to compact.
        // This is a static method, not a constructor: park the arguments where the states can reach them.
        HTMLState.big = big;
        HTMLState.where = where;
        HTMLState.keepPattern = keepPattern;
        final int inputLength = big.length();
        // leave as StringBuilder
        emit = new StringBuilder( inputLength );
        lineNumber = 1;
        HTMLState current = IN_REMOVABLE_SPACE;
        // charIndex is static; states may push it forward themselves to skip chars.
        charIndex = 0;
        while ( charIndex < inputLength )
            {
            final char c = big.charAt( charIndex );
            final HTMLCharCategory category = HTMLCharCategory.categorise( c );
            if ( DEBUGGING )
                {
                out.println( ">>>" +
                             current.toString() +
                             " " +
                             tagCategory +
                             " [" +
                             c +
                             "] " +
                             category.toString() );
                }
            current = current.next( category, c );
            charIndex++;
            }
        // we don't append a final NL.
        final String compacted = emit.toString();
        emit = null;
        // Handing back big itself signals to the caller that nothing changed,
        // and also cuts RAM use in half at next GC.
        final boolean unchanged = compacted.length() == inputLength && compacted.equals( big );
        return unchanged ? big : compacted;
        }  // /method

    /**
     * Debugging harness. The body is switched off with if ( false ); when enabled it compacts one awkward in-RAM
     * sample three ways (no keep pattern, Compactor.MACRO_PATTERN, Compactor.JUST_SSI_PATTERN) and prints the raw
     * and compacted text for each.
     *
     * @param args not used
     */
    public static void main( String[] args )
        {
        if ( false )
            {
            final String test =
                    "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3" +
                    ".org/TR/html4/loose.dtd\">\n" +
                    " <br><!-- leave contents -  -   -  be    --> <!-- macro foot -->  <Td class=\"brown\">  this stuff " +
                    "\n" +
                    "</td>  \n" +
                    "<td   \n" +
                    "    class=\"brown\">    sit <em>still   </em> <!--tight--> <!>  very still  </td> <table\n" +
                    "    class=simple1 > " +
                    "\n XXXXX<script ex=\"A\">  stuff not to touch </script>" +
                    "<div> <dt> \n" +
                    "<!-- PAD Program_Version for Canadian Sales Tax Calculator -->4.4<!-- /PAD -->" +
                    "  contents  <!--#CONFIG TIMEFMT=\"%Y-%m-%d\"-->\n" +
                    "  </dt> </div> <!-- macro foot -->  \n" +
                    " <!-- comment --> <!-- second comment --> stuff2 <!-- third comment -->\n" +
                    " <!-- comment -->x<!--    third    comment --> stuff3 <!-- fourth comment -->\n" +
                    "    x<!-- second comment --> stuff <!-- third comment -->\n" +
                    "<!-- macro SiteSearch 57|58|61|62|65|122|148|154|155|1503 -->" +
                    "xx <!-- macro silly1 -->   yy<!-- macro silly2 -->zz<!-- Macro Silly2 --><!--abc-->" +
                    ">  stray gt; <!--# some ssi --> <!-- # some faux ssi -->";

            // pass 1: no keep pattern
            out.println( "--------RAW---------- keep comments:\n [" + test + "]" );
            final String cookedKeepingComments = compactString( test, "in RAM test", null );
            out.println( "--------COOKED ------ keep comments:\n [" + cookedKeepingComments + "]" );

            // pass 2: macro pattern
            out.println( "--------RAW---------- keep macros:\n [" + test + "]" );
            final String cookedKeepingMacros = compactString( test, "in RAM test", Compactor.MACRO_PATTERN );
            out.println( "--------COOKED ------ keep macros:\n [" + cookedKeepingMacros + "]" );

            // pass 3: SSI pattern
            out.println( "--------RAW---------- keep all but ssi:\n [" + test + "]" );
            final String cookedWithSsiPattern = compactString( test, "in RAM test", Compactor.JUST_SSI_PATTERN );
            out.println( "--------COOKED ------ keep all but ssi:\n [" + cookedWithSsiPattern + "]" );
            }
        }  // /method
    // /method
    // /methods
    }