/*
 * [Compactor.java]
 *
 * Summary: Compacts HTML by removing unnecessary white space.
 *
 * Copyright: (c) 2006-2012 Roedy Green, Canadian Mind Products, http://mindprod.com
 *
 * Licence: This software may be copied and used freely for any purpose but military.
 *          http://mindprod.com/contact/nonmil.html
 *
 * Requires: JDK 1.6+
 *
 * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
 *
 * Version History:
 *  2.2 2006-03-15 Suppress IntelliJ Code Analyse that wants to make this default scope.
 *  2.3 2008-02-15 complete rewrite, mainly to handle removing space around <dt> <li> <h?> and <td> tags.
 *  2.4 2008-02-15 add more tags that get trimmed. Charge $10
 *  2.5 2008-02-28 tighter removal of whitespace surrounding comments.
 *  2.6 2008-02-28 optionally allow comments to be stripped out
 *                 entirely. Preserve some space around configurable magic
 *                 macro comments that expand into text such as <!# SSI or
 *                 <!-- macro.
 *  2.7 2008-07-27 remove all space just before > in a tag. < space will convert to &lt; space.
 *  2.8 2009-04-04 no longer correct missing entities. Just issue warning messages.
 *  2.9 2010-01-18 refactor so you first allocate a Compactor object, permitting simultaneous compactings.
 *  3.0 2010-02-12 trim space inside <p>..</p>.
 *  3.1 2010-12-21 avoid touching JavaScript and other scripts.
 *  3.2 2010-12-24 handle <script and <?
 *  3.3 2011-11-15 add compactStringAsNeeded
 */
package com.mindprod.compactor;

import com.mindprod.commandline.CommandLine;
import com.mindprod.filter.AllButSVNDirectoriesFilter;
import com.mindprod.filter.ExtensionListFilter;
import com.mindprod.hunkio.HunkIO;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.util.regex.Pattern;

import static java.lang.System.err;
import static java.lang.System.out;

/**
 * Compacts HTML by removing unnecessary white space.
 * <p/>
 * We always compact whitespace inside and outside comments.
 * <p/>
 * We don't consolidate tags. e.g.
 * &lt;span class=&quot;x&quot;&gt;this &lt;/span&gt;&lt;span class=&quot;x&quot;&gt;and that&lt;/span&gt; can be collapsed
 * to &lt;span class=&quot;x&quot;&gt;this and that&lt;/span&gt;.
 * <p/>
 * We don't convert tags to lower case e.g. &lt;BR&gt; to &lt;br&gt;
 * <p/>
 * We leave all comments in place. If ever such a feature is implemented, it must
 * not strip SSI comments. It may or may not leave macro comments.
 * <p/>
 * We do not remove macro generations. You can do that with StripGenerated.
 * <p/>
 * We do not remove the macro comments.
 * <p/>
 * We remove space and NLs on the right of &lt;div&gt;&lt;dt&gt;&lt;li&gt;&lt;h?&gt;&lt;ol&gt;&lt;table&gt;&lt;tbody&gt;&lt;td&gt;&lt;th&gt;&lt;thead&gt;&lt;tr&gt;&lt;ul&gt;  tags.
 * <p/>
 * We remove space and NLs on the left of &lt;/div&gt;&lt;/dt&gt;&lt;/li&gt;&lt;/h?&gt;&lt;/ol&gt;&lt;/table&gt;&lt;/tbody&gt;&lt;/td&gt;&lt;/th&gt;&lt;/thead&gt;&lt;/tr&gt;&lt;/ul&gt; tags.
 * <p/>
 * We always remove lead and trailing spaces from lines.
 * <p/>
 * We compact spaces inside HTML text, tags and comments.
 * <p/>
 * We leave spaces as is inside &lt;pre&gt;...&lt;/pre&gt; and inside quoted tag parameters.
 * <p/>
 * We convert &quot; to &amp;quot; &gt; to &amp;gt; when used in raw text.
 * <p/>
 * We don't tokenize to convert to CBF, compact binary format. The catch here is web
 * browsers can't read the result without a plug-in. This would result in a major
 * compaction. Perhaps the XML folk will eventually get disgusted with their obese
 * format and XHTML can inherit a now compact form.
 * <p/>
 * We don't do any LZW compression. the catch is, browsers can't read this without a
 * special plug-in.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 3.3 2011-11-15 add compactStringAsNeeded
 * @since 2006
 */
public class Compactor
    {
    // ------------------------------ CONSTANTS ------------------------------

    /**
     * undisplayed copyright notice
     */
    @SuppressWarnings( { "UnusedDeclaration" } )
    public static final String EMBEDDED_COPYRIGHT =
            "Copyright: (c) 1999-2012 Roedy Green, Canadian Mind Products, http://mindprod.com";

    /**
     * date this version was released.
     */
    @SuppressWarnings( { "UnusedDeclaration" } )
    private static final String RELEASE_DATE = "2011-11-15";

    /**
     * how to use the command line
     */
    private static final String USAGE = "Compactor needs a filename.html or a space-separated list of filenames, with optional -s -q -v switches.";

    /**
     * embedded version string.
     */
    @SuppressWarnings( { "UnusedDeclaration" } )
    public static final String VERSION_STRING = "3.3";

    /**
     * <!-- generated comment pattern
     */
    private static final Pattern GENERATED_PATTERN = Pattern.compile( "\\s*generated\\s" );

    /**
     * <!-- macro comment pattern
     */
    private static final Pattern MACRO_PATTERN = Pattern.compile( "\\s*macro\\s" );

    /**
     * <!-- /generated comment pattern
     */
    private static final Pattern SLASH_GENERATED_PATTERN = Pattern.compile( "\\s*/generated\\s" );

    /**
     * <!--# SSI comment pattern
     */
    private static final Pattern SSI_PATTERN = Pattern.compile( "#" );

    // -------------------------- PUBLIC STATIC METHODS --------------------------

    /**
     * Compact and tidy one file in place.
     * <p/>
     * The file is read in its entirety and compacted with
     * {@link #compactStringKeepingMacrosAndComments}. If the result differs, it is written to a
     * temporary file created via HunkIO.createTempFile, the original is deleted, and the temporary
     * file is renamed to the original name. Files whose names do not end in .html or .htm are
     * rejected with a message on stderr and left untouched.
     *
     * @param quiet              true if want progress messages suppressed
     * @param fileBeingProcessed the file currently being processed.
     *
     * @throws IOException if the file cannot be read or written, if the old file cannot be deleted,
     *                     or if the temporary file cannot be renamed to the old file name.
     * @noinspection WeakerAccess, SameParameterValue, StringEquality
     */
    public static void compactFile( boolean quiet, File fileBeingProcessed ) throws IOException
        {
        final String name = fileBeingProcessed.getName();
        if ( !quiet )
            {
            out.print( "  compacting " + name + " " );
            }
        if ( !( name.endsWith( ".html" ) || name.endsWith( ".htm" ) ) )
            {
            // note the leading space, so the file name and the explanation do not run together.
            err.println( "Cannot compact: "
                         + name
                         + " not .html file" );
            return;
            }
        final String big = HunkIO.readEntireFile( fileBeingProcessed );
        // we don't allow stripping macros and comments. Doing it to original is dangerous without StripGenerated balance checking
        final String result = compactStringKeepingMacrosAndComments( big, fileBeingProcessed.getPath() );
        // use == not equals() because compare already done in compactStringKeepingMacrosAndComments.
        if ( result == big )
            {
            // nothing changed. No need to write results.
            if ( !quiet )
                {
                out.println( "-" );
                }
            return;
            }
        // generate output into a temporary file until we are sure all is ok.
        // create a temp file in the same directory as filename
        if ( !quiet )
            {
            out.println( "*" );
            }
        final File tempFile = HunkIO.createTempFile( "temp", ".tmp", fileBeingProcessed );
        final FileWriter emit = new FileWriter( tempFile );
        try
            {
            emit.write( result );
            }
        finally
            {
            // always release the file handle, even if the write failed.
            emit.close();
            }
        // successfully created output in same directory as input,
        // Now make it replace the input file.

        if ( !fileBeingProcessed.delete() )
            {
            throw new IOException( "Unable to delete the old file " + fileBeingProcessed.getAbsolutePath() );
            }
        if ( !tempFile.renameTo( fileBeingProcessed ) )
            {
            throw new IOException( "Unable to rename the output to the old file name " + fileBeingProcessed.getAbsolutePath() );
            }
        // don't delete tempFile, it has been renamed to a real file
        }// end compactFile

    /**
     * compact a String as needed
     *
     * @param uncompacted uncompacted string
     * @param where       where this string came from, used in error messages to help you track down source
     * @param how         *=compactStringStrippingMacrosAndComments,
     *                    +=compactStringKeepingMacrosAndComments
     *                    -=does nothing
     *                    Q=Quick If first 400 chars contain a double space, compactStringKeepingMacrosAndComments, otherwise do nothing.
     *                    (q is accepted as a synonym for Q.)
     *                    Any other value trips an assertion when assertions are enabled, otherwise does nothing.
     *
     * @return compacted String
     */
    public static String compactStringAsNeeded( final String uncompacted, final String where, final char how )
        {
        switch ( how )
            {
            case '*':
                return Compactor.compactStringStrippingMacrosAndComments( uncompacted, where );

            case '+':
                return Compactor.compactStringKeepingMacrosAndComments( uncompacted, where );

            case '-':
                return uncompacted;

            case 'Q':
            case 'q':
                {
                // Quick heuristic: sample only the leading 400 chars. A double space there
                // means the text has not been compacted yet, so it is worth doing the full compaction.
                final String test = ( uncompacted.length() < 400 ) ? uncompacted : uncompacted.substring( 0, 400 );
                if ( test.contains( "  " ) )
                    {
                    return Compactor.compactStringKeepingMacrosAndComments( uncompacted, where );
                    }
                else
                    {
                    return uncompacted;
                    }
                }

            default:
                assert false : "invalid Compactor.compactStringAsNeeded.how " + how + " It must be one of * + - Q";
                return uncompacted;
            }
        }

    /**
     * Remove excess whitespace from HTML represented by string, keeping all comments.
     * <p/>
     * Delegates to HTMLState.compactString with the keep-comments flag set.
     *
     * @param big   the String to compact.
     * @param where used in error messages to indicate where the error occurred, usually the name of the file being
     *              compacted.
     *
     * @return the compacted String, big itself if nothing changed.
     */
    public static String compactStringKeepingMacrosAndComments( final String big, final String where )
        {
        // NOTE(review): unlike compactStringKeepingMacrosStrippingComments, SSI_PATTERN and GENERATED_PATTERN
        // are not passed here -- confirm against HTMLState.compactString that this is intentional.
        return HTMLState.compactString( big, where, true/* keep comments */,
                MACRO_PATTERN,
                SLASH_GENERATED_PATTERN );
        }

    /**
     * Remove excess whitespace from HTML represented by string, stripping comments.
     * <p/>
     * Delegates to HTMLState.compactString with the keep-comments flag cleared, passing the macro, SSI,
     * generated and /generated comment patterns.
     *
     * @param big   the String to compact.
     * @param where used in error messages to indicate where the error occurred, usually the name of the file being
     *              compacted.
     *
     * @return the compacted String, big itself if nothing changed.
     */
    public static String compactStringKeepingMacrosStrippingComments( final String big, final String where )
        {
        return HTMLState.compactString( big, where, false/* strip comments */,
                MACRO_PATTERN,
                SSI_PATTERN,
                GENERATED_PATTERN,
                SLASH_GENERATED_PATTERN );
        }

    /**
     * Remove excess whitespace from HTML represented by string, strip all macros and comments.
     * <p/>
     * Delegates to HTMLState.compactString with the keep-comments flag cleared, passing only the SSI pattern.
     *
     * @param big   the String to compact.
     * @param where used in error messages to indicate where the error occurred, usually the name of the file being
     *              compacted.
     *
     * @return the compacted String, big itself if nothing changed.
     */
    public static String compactStringStrippingMacrosAndComments( final String big, final String where )
        {
        return HTMLState.compactString( big, where, false/* strip comments, even macros and generated  */,
                SSI_PATTERN // keep only SSI. Macro generate /generate and ordinary comments go.
        );
        }

    // -------------------------- PUBLIC INSTANCE  METHODS --------------------------

    /**
     * constructor. All the work is done by static methods; retained for existing callers.
     */
    public Compactor()
        {
        }

    // --------------------------- main() method ---------------------------

    /**
     * compacts HTML files.
     * <p/>
     * Each file gathered from the command line is compacted in turn. A failure on one file is
     * reported on stderr and does not stop processing of the remaining files.
     *
     * @param args names of files to process, dirs, files, -s, *.*, no wildcards.
     *
     * @throws IllegalArgumentException if the command line yields no files to process.
     */
    public static void main( String[] args )
        {
        // gather all the files mentioned on the command line.
        // either directories, files, *.*, with -s and subdirs option.
        out.println( "Gathering html files to compact..." );
        CommandLine commandLine = new CommandLine( args,
                new AllButSVNDirectoriesFilter(),
                new ExtensionListFilter( "html" ) );
        final boolean quiet = commandLine.isQuiet();
        if ( commandLine.size() == 0 )
            {
            throw new IllegalArgumentException( "No files found to process\n" + USAGE );
            }
        for ( File file : commandLine )
            {
            try
                {
                compactFile( quiet, file );
                }
            catch ( FileNotFoundException e )
                {
                err.println( "Error: "
                             + file.getAbsolutePath()
                             + " not found." );
                }
            catch ( Exception e )
                {
                // report and carry on with the next file.
                err.println();
                e.printStackTrace( err );
                err.println( " in file "
                             + file.getAbsolutePath() );
                err.println();
                }
            }// end for
        }// end main
    }// end Compactor