/*
 * [TestUTF8.java]
 *
 * Summary: Discover how Java's use of UTF-8 conforms with Unicode standards.
 *
 * Copyright: (c) 2009-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
 *
 * Licence: This software may be copied and used freely for any purpose but military.
 *          http://mindprod.com/contact/nonmil.html
 *
 * Requires: JDK 1.8+
 *
 * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
 *
 * Version History:
 *  1.0 2006-02-25
 */
// TestUTF8
package com.mindprod.example;

import com.mindprod.common18.EIO;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;

import static java.lang.System.*;

/**
 * Discover how Java's use of UTF-8 conforms with Unicode standards.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 1.0 2006-02-25
 * @since 2006-02-25
 */
public final class TestUTF8
    {
    /**
     * byte order mark as a character.
     */
    private static final char BOM = ( char ) 0xfeff;

    /**
     * TEST string of strange characters used to check out how Sun encodes UTF-8
     */
    private static final String TEST = new StringBuilder().append( "bom:" )
            .append( BOM )
            .append( " text:echidna" )
            .append( " x0:" )
            .append( ( char ) 0x00 )
            .append( " xa1:" )
            .append( ( char ) 0xa1 )
            .append( " x100:" )
            .append( ( char ) 0x100 )
            .append( " x0911:" )
            .append( ( char ) 0x0911 )
            .append( " xffff:" )
            .append( ( char ) 0xffff )
            .append( " || 32 bit || x10000:" )
            .appendCodePoint( 0x10000 )
            .append( " x10302:" )
            .appendCodePoint( 0x10302 )
            .append( " x1ffff:" )
            .appendCodePoint( 0x1ffff )
            .append( " x100000:" )
            .appendCodePoint( 0x100000 )
            .append( " x10ffff:" )
            .appendCodePoint( 0x10ffff )
            .toString();

    /**
     * dump the contents of a buffer in hex
     *
     * @param bb buffer of raw bytes to dump, e.g. a ByteBuffer or MappedByteBuffer
     */
    private static void examine( ByteBuffer bb )
        {
        out.println( "position: " + bb.position() );
        out.println( "limit: " + bb.limit() );
        out.println( "capacity: " + bb.capacity() );
        int limit = bb.limit();
        for ( int offset = 0; offset < limit; offset++ )
            {
            int c = bb.get() & 0xff;// want to view unsigned
            // offset, hex, decimal char
            out.printf( "%6d > %2x : %3d : %1c\n", offset, c, c, ( char ) c );
            }
        }

    /**
     * Test CharBuffer.getBytes
     *
     * @throws java.io.IOException on I/O failure.
     */
    private static void testCharBuffer() throws IOException
        {
        // CharBuffer-ByteBuffer style encoding/decoding.
        // Using nio methods for encoding and decoding.
        // These are more efficient because there is less
        // hidden copying of the data as there is when you work
        // with String and byte[] in the previous methods.
        // choose an encoding
        Charset utf8 = Charset.forName( "UTF-8" );
        // for byte to char
        CharsetDecoder decoder = utf8.newDecoder();
        // for char to byte
        CharsetEncoder encoder = utf8.newEncoder();
        // effectively convert char[] to byte[]
        ByteBuffer encoded = encoder.encode( CharBuffer.wrap( TEST ) );
        // effectively convert byte[] to char[]
        CharBuffer charBuffer = decoder.decode( encoded );
        String reconstitutedTest = charBuffer.toString();
        if ( !reconstitutedTest.equals( TEST ) )
            {
            out.println( "oops: charBuffer differs from original" );
            }
        out.println( "<><> charBuffer <><>" );
        out.println( "String length: "
                     + TEST.length()
                     + " UTF-8 length: "
                     + encoded.limit()
                     + " reconstituted length: "
                     + reconstitutedTest.length() );
        encoded.flip();// prepare to read
        examine( encoded );
        /*
        From the output we make the following discoveries.
        This works just like OutputStreamWriter.
        There is no BOM on the front of the file, unless you write one there.
        It does not insert or remove any BOMs.
        0 is encoded in a single byte, as per UTF standard.
        There is no length on the front of the string
        */
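        // sketch, added: verify the "no BOM inserted" discovery noted above.
        // A UTF-8 BOM would show up as the bytes ef bb bf at offset 0.
        // Absolute get( index ) ignores the buffer position, so the dump above does no harm.
        boolean hasLeadingBom = encoded.limit() >= 3
                                && ( encoded.get( 0 ) & 0xff ) == 0xef
                                && ( encoded.get( 1 ) & 0xff ) == 0xbb
                                && ( encoded.get( 2 ) & 0xff ) == 0xbf;
        out.println( "encoder inserted a leading BOM: " + hasLeadingBom );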
        }

    /**
     * Test String.getBytes
     *
     * @throws java.io.IOException on I/O failure.
     */
    private static void testGetBytes() throws IOException
        {
        byte[] encoded = TEST.getBytes( EIO.UTF8 );
        String reconstitutedTest = new String( encoded, EIO.UTF8 );
        if ( !reconstitutedTest.equals( TEST ) )
            {
            out.println( "oops: getBytes differs from original" );
            }
        out.println( "<><> getBytes <><>" );
        out.println( "String length: "
                     + TEST.length()
                     + " UTF-8 length: "
                     + encoded.length
                     + " reconstituted length: "
                     + reconstitutedTest.length() );
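        // sketch, added: String.length() counts UTF-16 chars, not code points;
        // each of the supplementary code points in TEST counts as two chars.
        out.println( "code point count: " + TEST.codePointCount( 0, TEST.length() ) );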
        ByteBuffer encodedBuffer = ByteBuffer.wrap( encoded );
        // encodedBuffer.flip();  // not needed
        examine( encodedBuffer );
        /*
        From the output we make the following discoveries.
        This works just like OutputStreamWriter.
        There is no BOM on the front of the file, unless you write one there.
        It does not insert or remove any BOMs.
        0 is encoded in a single byte, as per UTF standard.
        There is no length on the front of the string.
        */
        }

    /**
     * Test OutputStreamWriter
     *
     * @throws java.io.IOException on I/O failure.
     */
    private static void testOutputStreamWriter() throws IOException
        {
        File tempFile = File.createTempFile( "temp_", "tmp" );
        // O P E N for write
        FileOutputStream fos =
                new FileOutputStream( tempFile, false/* no append */ );
        OutputStreamWriter osw = new OutputStreamWriter( fos, EIO.UTF8 );
        // W R I T E
        osw.write( TEST );
        // C L O S E
        osw.close();
        // O P E N for read
        FileInputStream fis = new FileInputStream( tempFile );
        InputStreamReader isr = new InputStreamReader( fis, EIO.UTF8 );
        // R E A D
        char[] cbuf = new char[ TEST.length() ];
        int charsRead = isr.read( cbuf );  // assume the whole small test file comes back in one read
        String reconstitutedTest = new String( cbuf, 0, charsRead );
        if ( !reconstitutedTest.equals( TEST ) )
            {
            out.println( "oops: InputStreamReader differs from original" );
            }
        // C L O S E
        isr.close();
        out.println( "<><> OutputStreamWriter <><>" );
        out.println( "String length: "
                     + TEST.length()
                     + " UTF-8 length: "
                     + tempFile.length()
                     + " reconstituted length: "
                     + reconstitutedTest.length() );
        fis = new FileInputStream( tempFile );
        FileChannel fc = fis.getChannel();
        ByteBuffer encodedBuffer =
                fc.map( FileChannel.MapMode.READ_ONLY, 0, tempFile.length() );
        // encodedBuffer.flip();  // not needed
        examine( encodedBuffer );
        fc.close();
        fis.close();
        //noinspection ResultOfMethodCallIgnored
        tempFile.delete();
        /*
        From this output we make these discoveries.
        There is no BOM on the front of the file, unless you write one there.
        It does not insert or remove any BOMs.
        0 is encoded in a single byte, as per UTF standard.
        There is no length on the front of the string.
        */
        }
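
    /**
     * Sketch, added: the discoveries above note that OutputStreamWriter writes no BOM
     * unless you write one yourself. This writes a tiny file with an explicit BOM char
     * on the front and dumps it so the leading ef bb bf bytes are visible.
     * It reuses the same EIO.UTF8 encoding constant used elsewhere in this class.
     *
     * @throws java.io.IOException on I/O failure.
     */
    private static void sketchExplicitBom() throws IOException
        {
        File tempFile = File.createTempFile( "temp_", "tmp" );
        // O P E N for write
        OutputStreamWriter osw = new OutputStreamWriter(
                new FileOutputStream( tempFile, false/* no append */ ), EIO.UTF8 );
        // W R I T E the BOM char first; the encoder turns it into the bytes ef bb bf
        osw.write( BOM );
        osw.write( "echidna" );
        // C L O S E
        osw.close();
        out.println( "<><> explicit BOM (sketch) <><>" );
        FileInputStream fis = new FileInputStream( tempFile );
        FileChannel fc = fis.getChannel();
        examine( fc.map( FileChannel.MapMode.READ_ONLY, 0, tempFile.length() ) );
        fc.close();
        fis.close();
        //noinspection ResultOfMethodCallIgnored
        tempFile.delete();
        }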

    /**
     * Test DataOutputStream.writeUTF
     *
     * @throws java.io.IOException on I/O failure
     */
    private static void testWriteUTF() throws IOException
        {
        File tempFile = File.createTempFile( "temp_", "tmp" );
        // O P E N  for write
        FileOutputStream fos =
                new FileOutputStream( tempFile, false/* no append */ );
        DataOutputStream dos = new DataOutputStream( fos );
        // W R I T E
        dos.writeUTF( TEST );
        // C L O S E
        dos.close();
        // O P E N  for read
        FileInputStream fis = new FileInputStream( tempFile );
        DataInputStream dis = new DataInputStream( fis );
        // R E A D
        String reconstitutedTest = dis.readUTF();
        if ( !reconstitutedTest.equals( TEST ) )
            {
            out.println( "oops: readUTF differs from original" );
            }
        // C L O S E
        dis.close();
        out.println( "<><> DataOutputStream.writeUTF <><>" );
        out.println( "String length: "
                     + TEST.length()
                     + " UTF-8 length: "
                     + tempFile.length()
                     + " reconstituted length: "
                     + reconstitutedTest.length() );
        fis = new FileInputStream( tempFile );
        FileChannel fc = fis.getChannel();
        ByteBuffer encodedBuffer =
                fc.map( FileChannel.MapMode.READ_ONLY, 0, tempFile.length() );
        //  encodedBuffer.flip();  // not needed
        examine( encodedBuffer );
        fc.close();
        fis.close();
        tempFile.delete();
        /*
        From this output we discover these differences
        from true UTF-8 encoding.
        There is an unsigned two-byte big-endian length on the front
        that counts the size of the following field in bytes, not chars.
        It does not include itself. This means strings are limited to a mere
        65535 bytes, which is even fewer characters!!
        Somebody goofed big-time here. There should be a scheme to write longer Strings.
        0x00 is encoded as c0 80 instead of 00, to keep C programs from getting
        confused reading such a file and thinking the 00 meant end-of-string.
        The biggest difference is the handling of 32 bit code points.
        UTF-8 codes them as 4-byte sequences. Sun is coding them as 6-byte sequences.
        e.g. consider the encoding of 0x10302
        standard UTF-8 gives
        f0 90 8c 82
        whereas under Sun's writeUTF scheme it encodes as:
        ed a0 80 ed bc 82
        What is going on?  Internally Sun encodes 32-bit code points as a
        surrogate pair of 16-bit chars, effectively using UTF-16 encoding.
        Instead of undoing the UTF-16 encoding before applying
        the UTF-8 transform, Sun applies the transform directly to the surrogate pairs.
        Surrogate pairs lie in the bands 0xd800-0xdbff and 0xdc00-0xdfff.
        Treated as ordinary characters, each surrogate takes 3 bytes to encode
        in UTF-8, giving 6 bytes per supplementary code point.
        It does not insert or remove any BOMs.
        */
        }
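
    /**
     * Sketch, added: contrast standard UTF-8 with the surrogate-pair style encoding
     * that writeUTF produces for a supplementary code point. 0x10302 is the same
     * code point discussed in the discoveries above.
     */
    private static void sketchSupplementaryEncodings()
        {
        int codePoint = 0x10302;
        String s = new String( Character.toChars( codePoint ) );
        out.println( "<><> supplementary code point 0x10302 (sketch) <><>" );
        // standard UTF-8 encodes the code point directly in 4 bytes: f0 90 8c 82
        byte[] standard = s.getBytes( Charset.forName( "UTF-8" ) );
        out.println( "standard UTF-8 byte count: " + standard.length );
        // writeUTF encodes each of the two UTF-16 surrogate chars separately,
        // three bytes apiece, giving the 6-byte sequence ed a0 80 ed bc 82
        out.println( "writeUTF-style byte count: " + s.length() * 3 );
        }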

    /**
     * Examines Java's various UTF implementations for conformance with Unicode Standards.
     *
     * @param args not used
     *
     * @throws java.io.IOException on I/O failure
     */
    public static void main( String[] args ) throws IOException
        {
        testCharBuffer();
        testGetBytes();
        testWriteUTF();
        testOutputStreamWriter();
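        // added sketches, not part of the original four tests
        sketchSupplementaryEncodings();
        sketchExplicitBom();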
        }
    }