package com.mindprod.example;

import java.io.UnsupportedEncodingException;
import static java.lang.System.out;

/**
 * Write/encode a String as UTF-8 bytes, without using Java's built-in encoders, to give an 8-bit byte array.
 * <p/>
 * Prepared with IntelliJ IDEA.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 1.0 2006-02-24
 */
public final class UTF8Encoder
    {
    // ------------------------------ FIELDS ------------------------------

    /**
     * true if you want the TEST harness to ensure this code works.
     */
    private static final boolean DEBUGGING = true;

    /**
     * byte order mark as a character.
     */
    private static final char BOM = ( char ) 0xfeff;

    /**
     * first char of the high (leading) surrogate band.
     */
    private static final int MIN_HIGH_SURROGATE = 0xd800;

    /**
     * last char of the high (leading) surrogate band.
     */
    private static final int MAX_HIGH_SURROGATE = 0xdbff;

    /**
     * first char of the low (trailing) surrogate band.
     */
    private static final int MIN_LOW_SURROGATE = 0xdc00;

    /**
     * last char of the low (trailing) surrogate band.
     */
    private static final int MAX_LOW_SURROGATE = 0xdfff;

    /**
     * first code point that needs a surrogate pair in a String, and 4 bytes in UTF-8.
     */
    private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;

    /**
     * byte emitted in place of a surrogate char that is not part of a well-formed pair.
     */
    private static final byte REPLACEMENT = ( byte ) '?';

    // -------------------------- STATIC METHODS --------------------------

    /**
     * encode a String into UTF-8 bytes.
     * <p/>
     * UTF-8 is normally encoded simply with String.getBytes( "UTF-8") or with an OutputStreamWriter but this is roughly
     * what goes on under the hood, if you ever need to write your own encoder for some non-Java platform, or you are
     * just curious how it works.
     * <p/>
     * Chars below 0x80 take one byte, chars below 0x800 take two bytes and the remaining non-surrogate chars take
     * three bytes. A high surrogate (0xd800..0xdbff) immediately followed by a low surrogate (0xdc00..0xdfff) is
     * combined into a single code point in the range 0x10000..0x10ffff and written as four bytes. Any other surrogate
     * char (a low surrogate with no high surrogate before it, or a high surrogate with no low surrogate after it) is
     * not encodable and is written as a single '?' byte.
     *
     * @param input string to be encoded with UTF-8.
     *
     * @return the UTF-8 bytes, in an array of exactly the right length.
     *
     * @throws NullPointerException if input is null.
     */
    private static byte[] encode( String input )
        {
        final int length = input.length();

        // worst case, all chars could require 3-byte encodings.
        // A surrogate pair is two chars yielding only 4 bytes, so it fits within this bound too.
        byte[] output = new byte[length * 3];

        // index output[]
        int j = 0;

        for ( int i = 0; i < length; i++ )
            {
            int c = input.charAt( i );

            if ( c < 0x80 )
                {
                // 7-bits done in one byte.
                output[ j++ ] = ( byte ) c;
                }
            else if ( c < 0x800 )
                {
                // 8-11 bits done in 2 bytes
                output[ j++ ] = ( byte ) ( 0xC0 | ( c >> 6 ) );
                output[ j++ ] = ( byte ) ( 0x80 | ( c & 0x3F ) );
                }
            else if ( c < MIN_HIGH_SURROGATE || c > MAX_LOW_SURROGATE )
                {
                // 12-16 bits done in 3 bytes
                output[ j++ ] = ( byte ) ( 0xE0 | ( c >> 12 ) );
                output[ j++ ] = ( byte ) ( 0x80 | ( ( c >> 6 ) & 0x3F ) );
                output[ j++ ] = ( byte ) ( 0x80 | ( c & 0x3F ) );
                }
            else if ( c <= MAX_HIGH_SURROGATE
                      && i + 1 < length
                      && isLowSurrogate( input.charAt( i + 1 ) ) )
                {
                // well-formed surrogate pair: consume the low surrogate as well.
                int low = input.charAt( ++i );

                // each surrogate carries 10 bits of the code point's offset above 0x10000.
                int codePoint = MIN_SUPPLEMENTARY_CODE_POINT
                                + ( ( c - MIN_HIGH_SURROGATE ) << 10 )
                                + ( low - MIN_LOW_SURROGATE );

                // 17-21 bits done in 4 bytes
                output[ j++ ] = ( byte ) ( 0xF0 | ( codePoint >> 18 ) );
                output[ j++ ] = ( byte ) ( 0x80 | ( ( codePoint >> 12 ) & 0x3F ) );
                output[ j++ ] = ( byte ) ( 0x80 | ( ( codePoint >> 6 ) & 0x3F ) );
                output[ j++ ] = ( byte ) ( 0x80 | ( codePoint & 0x3F ) );
                }
            else
                {
                // unpaired surrogate: it has no UTF-8 encoding, so substitute.
                output[ j++ ] = REPLACEMENT;
                }
            }// end for
        // Prune back our byte array.  For efficiency we could hand item back
        // partly filled, which is only a minor inconvenience to the caller
        // most of the time to save copying the array.
        byte[] chopped = new byte[j];
        System.arraycopy( output, 0, chopped, 0, j/* length */ );
        return chopped;
        }//end encode

    /**
     * test whether a char lies in the low (trailing) surrogate band 0xdc00..0xdfff.
     *
     * @param c char to test.
     *
     * @return true if c is a low surrogate.
     */
    private static boolean isLowSurrogate( char c )
        {
        return MIN_LOW_SURROGATE <= c && c <= MAX_LOW_SURROGATE;
        }

    // --------------------------- main() method ---------------------------

    /**
     * TEST harness to ensure UTF8Encoder works as advertised. It compares our encoding with the one produced by
     * String.getBytes( "UTF-8" ) and prints a line for every difference, followed by a final worked/failed verdict.
     *
     * @param args not used
     *
     * @throws UnsupportedEncodingException declared by String.getBytes( String ).
     */
    public static void main( String[] args ) throws UnsupportedEncodingException
        {
        if ( DEBUGGING )
            {
            String test =
                    BOM
                    + "Hello World"
                    + "\u0080\u007f\u0080\u0100\u0921\u30b0\u4e70\uffff"
                    // surrogate pairs for U+10000 (lowest), U+1F600 and U+10FFFF (highest).
                    + "\ud800\udc00\ud83d\ude00\udbff\udfff";
            char[] oneOfAlmostEverything = new char[0xffff + 1];
            for ( int i = 0; i <= 0xffff; i++ )
                {
                oneOfAlmostEverything[ i ] = ( char ) i;
                }
            // Leave unpaired surrogates out of the comparison with Java: the String.getBytes documentation
            // says its behaviour for unencodable input is unspecified. They are checked separately below.
            for ( int i = MIN_HIGH_SURROGATE; i <= MAX_LOW_SURROGATE; i++ )
                {
                oneOfAlmostEverything[ i ] = 0;
                }

            // put one of almost every possible 16-bit Unicode character in our TEST too.
            test += new String( oneOfAlmostEverything );

            // convert to UTF-8 with built-in Java classes.
            byte[] encodedByJava = test.getBytes( "UTF-8" );

            // convert to UTF-8 with UTF8Encoder.
            byte[] encodedByUs = UTF8Encoder.encode( test );

            boolean allOk = true;
            if ( encodedByUs.length != encodedByJava.length )
                {
                out.println( "oops, different lengths" );
                allOk = false;
                }
            int safe = Math.min( encodedByJava.length, encodedByUs.length );
            for ( int i = 0; i < safe; i++ )
                {
                if ( encodedByUs[ i ] != encodedByJava[ i ] )
                    {
                    // mask with 0xff so bytes 0x80..0xff print as two hex digits, not sign-extended to eight.
                    out.println( "oops "
                                 + encodedByJava[ i ]
                                 + "["
                                 + Integer.toHexString( encodedByJava[ i ] & 0xff )
                                 + "] "
                                 + encodedByUs[ i ]
                                 + "["
                                 + Integer.toHexString( encodedByUs[ i ] & 0xff )
                                 + "]" );
                    allOk = false;
                    }// end if
                }// end for

            // Unpaired surrogates: lone high, lone low, two lows in a row, and a high at the very end.
            byte[] unpairedByUs = UTF8Encoder.encode( "a\ud800b\udc00\udc00\ud800" );
            byte[] unpairedExpected = { 'a', '?', 'b', '?', '?', '?' };
            if ( !java.util.Arrays.equals( unpairedByUs, unpairedExpected ) )
                {
                out.println( "oops, unpaired surrogates not replaced with ?" );
                allOk = false;
                }

            // Empty input must give an empty array.
            if ( UTF8Encoder.encode( "" ).length != 0 )
                {
                out.println( "oops, empty string did not encode to zero bytes" );
                allOk = false;
                }

            out.println( "UTF8Encoder " + ( allOk ? "worked" : "failed" ) );
            }
        }// end main
    }// end UTF8Encoder