/*
 * [TestRegexFindQuotedString.java]
 *
 * Summary: Finding a quoted String with a regex.
 *
 * Copyright: (c) 2012-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
 *
 * Licence: This software may be copied and used freely for any purpose but military.
 *          http://mindprod.com/contact/nonmil.html
 *
 * Requires: JDK 1.8+
 *
 * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
 *
 * Version History:
 *  1.0 2012-05-25 initial release
 *  1.1 2012-05-26 make program verify its own results.
 */
package com.mindprod.example;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static java.lang.System.*;

/**
 * Finding a quoted String with a regex.
 * <p>
 * This program is based on newsgroup posts by markspace (aka Brendan), Lew and Robert Klemme
 * in response to my query about the cleanest way to use a regex to find quoted Strings.
 * <p>
 * Each candidate Pattern is run against a fixed table of test strings and the program reports,
 * for every test, whether a match was found and whether it agrees with the expected result.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 1.1 2012-05-26 make program verify its own results.
 * @since 2012-05-25
 */
public class TestRegexFindQuotedString
    {
    /**
     * Test strings to search, alternating with what the Pattern should extract.
     * An empty expected string marks a test where the Pattern should find no match at all.
     */
    private static final String[] alternatingTestExpectedPairs =
            {
                    "basic: href=\"http://mindprod.com\" ",
                    "\"http://mindprod.com\"",
                    "Nested quote: George said \"that's the ticket\".",
                    "\"that's the ticket\"",
                    "Nested tick: Jeb replied '\"ticket?\"what ticket'.",
                    "'\"ticket?\"what ticket'",
                    "Non-ASCII: \"How na\u00efve!\".",
                    "\"How na\u00efve!\"",
                    "empty: \"\"xx",
                    "\"\"",
                    "\\ escaped: 'Bob\\'s your uncle.'",
                    "'Bob\\'s your uncle.'",
                    "unbalanced (should fail):  'wonky\"",
                    "",
                    // The only closing tick is escaped, so the string is never terminated.
                    "escaped closer, unbalanced (should fail): 'oops\\'",
                    "",
            };

    /**
     * Exercise a pattern to see if it finds the expected quoted string in each test string.
     * Prints one line per test giving the test string, whether anything was found, whether
     * the outcome agrees with the expected value, and the text extracted (null if none).
     *
     * @param pattern candidate Pattern for finding a quoted String.
     */
    private static void exercisePattern( Pattern pattern )
        {
        out.println();
        out.println( "Pattern: " + pattern.toString() ); // display with Java string level quoting peeled off.
        for ( int i = 0; i < alternatingTestExpectedPairs.length; i += 2 )
            {
            final String test = alternatingTestExpectedPairs[ i ];
            final String expected = alternatingTestExpectedPairs[ i + 1 ];
            final Matcher m = pattern.matcher( test );
            final boolean found = m.find();
            final String extracted = found ? m.group( 0 ) : null;
            // When nothing is found, that is the correct outcome only for the "should fail" tests,
            // which are marked with an empty expected string.
            final boolean correct = found ? extracted.equals( expected ) : expected.isEmpty();
            out.println( test + ", found: " + found +
                         ", correct: " + correct + " (" + extracted + ")" );
            }
        }

    /**
     * test harness to exercise various candidate Patterns for finding quoted Strings.
     *
     * @param args not used
     */
    public static void main( String[] args )
        {
        // We want to find Strings of the form "xx'xx" or 'xx"xx'
        // We want to avoid the following problems:
        // 1. Works even if String contains foreign languages, even Russian or accented letters.
        // 2. If starts with " must end with ", if starts with ' must end with '.
        // 3. ' is ok inside "...", and " is ok inside '...'
        // 4. It should accept empty strings "" and ''.
        // 5. We usually don't worry about how to use ' inside '...' since there are so many different conventions,
        //    but for bonus points ignore \' and \".
        // here are some suggested candidate Patterns to find quoted Strings:
        exercisePattern( Pattern.compile( "[\"']\\p{Print}+?[\"']" ) );  // fails 1 2 3 4
        exercisePattern( Pattern.compile( "[\"'][^\"']+[\"']" ) );  // fails 2 3 4
        exercisePattern( Pattern.compile( "([\"'])[^\"']+\\1" ) ); // fails 3 4, uses a capturing group.
        exercisePattern( Pattern.compile( "\"[^\"]+\"|'[^']+'" ) );  // fails 4
        exercisePattern( Pattern.compile( "\"[^\"]*\"|'[^']*'" ) ); // works, but fails bonus 5
        exercisePattern( Pattern.compile( "\"(?:\\\\.|[^\\\\\"])*\"|'(?:\\\\.|[^\\\\'])*'" ) ); // works, even passes 5.
        // (?: ) is a non-capturing group. \\\\ is a literal \. . means any char.
        // The character classes must exclude \ as well as the delimiter. Otherwise, after backtracking,
        // the \ of an escaped quote can be consumed as an ordinary char and that quote then
        // wrongly taken as the closing delimiter, e.g. 'oops\' would match.
        // In the above code, I pass a Pattern rather than a more concise raw String
        // because when I do it that way the IntelliJ IDE
        // does some proofreading and formatting for me on the regexes.
        // A follow-on problem would be to find Patterns that extract just the contents of the string, without
        // the delimiters, and possibly even decode any embedded \.
        // These sorts of problem can get so hairy, it is sometimes simpler to hand code a little parser or modify
        // one you have already done, e.g. a finite state automaton like the ones JDisplay/JPrep uses.
        // These of course will be much faster than a general purpose regex.
        }
    }