/**
 * IBM Confidential
 * OCO Source Materials
 * 5725-H03
 * (C) Copyright IBM Corp. 2013, 2014
 * The source code for this program is not published or 
 * otherwise divested of its trade secrets, irrespective of 
 * what has been deposited with the U.S. Copyright Office.
 */
package com.ibm.pdp.mdl.link.design;

import java.util.ArrayList;
import java.util.List;



/**
 * The aim of this class is to find the variables (DataElements or Pacbase rubriques) in a line of COBOL source.<br>
 * Rules: 
 * -A variable is composed of 2 or more parts joined by a dash (-). The last part, called here the suffix, is 1 to 6 characters long.<br>
 * -The suffix cannot contain a dash. Its value is the name of the variable as known in the RPP repository.<br>
 * -The prefix can be composed of several parts, all joined by a dash. The prefix can be seen as a domain to which the variable belongs.<br>
 * -For example, in CU00-PRODID, PRODID is the data element name (in the repository). CU00 is the name of the data structure to which it belongs.<br> 
 * -A variable cannot be found in area A of the COBOL source (i.e. in columns 8 to 11 inclusive). <br>
 * <br>
 *       *A   B  
 * 1234567890123456...	Column numbers as seen by the COBOL user in the COBOL source (1-based).
 * 0123456789012345...	Offsets as seen by the Java developer (0-based).
 * <br>
 * Examples of valid variables: 1-CU00-CUSTID, WW00-FNAME, W-12345, $100-1AB
 * Examples of invalid variables: F4510-007 in columns 8 to 11, B-12345 if part of a computation, -12345 (prefix missing). 
 *
 */
public class CobolVariablesScanner 
{
	public final static String copyright = "Licensed Materials - Property of IBM\n5725-H03\n(C) Copyright IBM Corp. 2013, 2014.   All rights reserved.\nUS Government Users Restricted Rights - Use, duplication or disclosure restricted by GSA ADP Schedule Contract with IBM Corp.";	//$NON-NLS-1$

	//PFR: the COBOL reserved words are no longer listed here. They live in CobolReservedWords.COBOL_KEYWORDS,
	//which is looked up with DichoSearchForArray.search (see isGoodVariableCandidate).

	/** Characters that end a token. The '-' IS NOT A DELIMITER (it joins the parts of a variable): it must not appear in this list. */
	private static final char[] DELIMITERS = {' ', '.', '=', '\t', '\r', '\n', '!', '+', '/', '*', '(', ')', ':', ',', '<', '>', '"', '\''};

	/** 0-based offset at which the scan of a line starts; a '*' found there marks a comment line. */
	private static final int INDICATOR_OFFSET = 6;

	/** Lines shorter than this are not scanned at all. */
	private static final int MIN_LINE_LENGTH = 8;

	/** Only the first 72 characters of a line are scanned; whatever follows is ignored. */
	private static final int MAX_LINE_LENGTH = 72;

	/** Maximum length of the suffix (the part after the last dash) of a variable. */
	private static final int MAX_SHORT_NAME_LENGTH = 6;

	/**
	 * Tokens starting before this 0-based offset are never reported as variables.
	 * NOTE(review): the class documentation places area A in columns 8 to 11 (offsets 7 to 10), which suggests
	 * 11 here; the historical value 10 is kept unchanged - to be confirmed.
	 */
	private static final int FIRST_VALID_COLUMN_FOR_VAR = 10;

	/** Matches any kind of line break, so that the test file is split correctly whatever the platform it was written on. */
	private static final String LINE_BREAK_REGEX = "\\r\\n|\\r|\\n";				//$NON-NLS-1$

	/** Sample lines scanned by {@link #main(String[])} when the test file content is not available. */
	private static String[] lines = 
	{  //1234567890123456789
		"            MOVE  1  TO   PR-00-GVRC.",									//$NON-NLS-1$
		"            MOVE    UT-ERUT       TO      GE00-ERUT.",						//$NON-NLS-1$
		"      *    IF EN-TTE = 'O' AND EN-PRE = '0' MOVE 2 TO EN-PRE.",			//$NON-NLS-1$
		"              MOVE 4 TO  PR-10-GVNUM2  GO  TO F4510-007.",					//$NON-NLS-1$
		"        IF   PR-10-GVNUM2  NOT  =  '1'",									//$NON-NLS-1$
		"       F40BB-900. MOVE 4 TO  PR-10-GVNUM2  GO  TO F4510-007.",				//$NON-NLS-1$
		"        F4220-090. IF I06 < I04 ADD 1 TO I06 GO TO F4220-010.",			//$NON-NLS-1$
		"        AND GR-TT (I01, I02) = 'I' MOVE 3 TO GR-PR (I01).",				//$NON-NLS-1$
		"        COMPUTE A=B-12345."												//$NON-NLS-1$
	};
	
	/** File scanned by the {@link #main(String[])} test harness. */
	private static final String fileName = "c:/tmp/COBOL_Files/PGCRDG_TestPFR.cbl"; 		//$NON-NLS-1$

	/**
	 * Manual test harness: scans every line of the test file (or the built-in sample lines),
	 * then prints the cumulated scan time, the number of tokens found and their names.
	 * 
	 * @param args not used
	 */
	public static void main(String[] args) 
	{
		CobolVariablesScanner scanner = new CobolVariablesScanner();
		scanner.init();

		long cumulElapsed = 0;
		System.out.println("DEBUT PROGRAMME");	//$NON-NLS-1$
		List<RubriqueToken> allTokens = new ArrayList<RubriqueToken>();
		for (int i=0; i<lines.length; i++)
		{
			//Only the scan itself is timed, not the accumulation of the results.
			long start = System.currentTimeMillis();
			List<RubriqueToken> tokensOfLine = scanner.findVariables(lines[i]);
			cumulElapsed += System.currentTimeMillis() - start;
			allTokens.addAll(tokensOfLine);
		}

		System.out.println("elapsed = " + cumulElapsed);	//$NON-NLS-1$
		System.out.println("nb of tokens found = " + allTokens.size());	//$NON-NLS-1$
		System.out.println("FIN PROGRAMME");	//$NON-NLS-1$
		scanner.displayTrace(allTokens);
	}

	/**
	 * Replaces the sample lines by the lines of the test file.<br>
	 * The content is split on any line break (CR LF, CR or LF) and not on the platform line separator,
	 * otherwise a file written on another platform would be seen as one single line.<br>
	 * The behaviour of Util.readFileContentsQuickly on a missing file is not visible from here: a null
	 * content is guarded defensively and leaves the sample lines in place.
	 */
	private void init()
	{
		String contents = Util.readFileContentsQuickly(fileName);
		if (contents == null)
			return;
		lines = contents.split(LINE_BREAK_REGEX);
	}
	
	/**
	 * Returns true if c is one of the given delimiters.
	 */
	private boolean isDelimiter(char c, char[] delimiters)
	{
		for (int i=0; i<delimiters.length; i++)
		{
			if (delimiters[i] == c)
				return true;
		}
		return false;
	}
	
	/**
	 * Tells whether a token is a DataElement reference.<br>
	 * What is good is a non empty prefix, then a '-', then 1 up to 6 chars which constitute the DataElement name.
	 * The DataElement name must not contain a $ and must not be FILLER.
	 * The part before the last '-' is the prefix of the DataElement and can be quite variable.
	 * For example, it can contain a $ (it is the case for variables in macro).<br>
	 * A token is also rejected when it starts before FIRST_VALID_COLUMN_FOR_VAR, when it is a COBOL reserved word
	 * or when it is the target of a GO TO / THRU / THROUGH / PERFORM.
	 * 
	 * @param candidate the token to check
	 * @param offsetInLine the 0-based offset of the first char of the token in the line
	 * @param line the line containing the token
	 * @return true if the token must be reported as a variable
	 */
	private boolean isGoodVariableCandidate(String candidate, int offsetInLine, String line)
	{
		if (candidate.length() == 0)
			return false;
		
		if (offsetInLine < FIRST_VALID_COLUMN_FOR_VAR)
			return false;
		
		//-------------------------------
		//There must be a dash, which also filters the constants (a pure number contains no dash).
		//The prefix must not be missing: something like -12345 is a signed number or the end of a subtraction.
		int lastDashIndex = candidate.lastIndexOf('-');
		if (lastDashIndex == -1 || candidate.charAt(0) == '-')
			return false;
		
		//The dash must be followed by 1 or max 6 chars to be a valid DataElement.
		String shortName = candidate.substring(lastDashIndex + 1);
		if (shortName.length() == 0 || shortName.length() > MAX_SHORT_NAME_LENGTH)
			return false;
		
		//A $ is accepted in the prefix only, never in the name of the DataElement.
		if (shortName.indexOf('$') != -1)
			return false;
		
		if ("FILLER".equals(shortName))		//$NON-NLS-1$
			return false;
		
		//-------------------------------
		//Filter the reserved words. Most of them are already discarded because they contain no dash,
		//but some of them do contain one (END-IF, COMP-3, ...).
		if (DichoSearchForArray.search(candidate, CobolReservedWords.COBOL_KEYWORDS) > -1)
			return false;
		
		//-------------------------------
		//What remains can still be the name of a Function used as the target of a branch.
		return !isPrecededByGotoThruPerform(line, offsetInLine, candidate);
	}
	
	/**
	 * Tells whether the candidate is a Function name, ie a token starting with 'F', longer than 2 chars,
	 * and directly preceded by GO TO, THRU, THROUGH or PERFORM.<br>
	 * The Functions declared in margin A are already eliminated by their offset. Here the line is read backwards
	 * from the candidate, ignoring the whitespaces, and the chars met are compared to the reversed keywords
	 * ('OTOG' for 'GO TO', ...). The search stops after 4 chars, or after 7 chars if the first 4 ones
	 * can be the end of PERFORM or THROUGH. The char at offset 0 of the line is never examined.<br>
	 * GO TO and THRU are recognized only if they are preceded by a whitespace: otherwise a data name
	 * ending with GO followed by TO (MOVE CU00-CARGO TO FF00-CARGO) would be taken for a GO TO.
	 * 
	 * @param line the line containing the candidate
	 * @param offsetInLine the 0-based offset of the first char of the candidate in the line
	 * @param candidate the token to check
	 * @return true if the candidate is the target of a branch and therefore not a DataElement
	 */
	private boolean isPrecededByGotoThruPerform(String line, int offsetInLine, String candidate)
	{
		if (candidate.length() <= 2 || candidate.charAt(0) != 'F')
			return false;
		
		StringBuilder reversed = new StringBuilder(7);
		for (int i=offsetInLine - 1; i>0; i--)
		{
			char c = line.charAt(i);
			if (Character.isWhitespace(c))
				continue;
			
			reversed.append(c);
			if (reversed.length() == 4)
			{
				String word = reversed.toString();
				boolean isGoto = "OTOG".equalsIgnoreCase(word);		//$NON-NLS-1$
				boolean isThru = "URHT".equalsIgnoreCase(word);		//$NON-NLS-1$
				if (isGoto || isThru)
					return Character.isWhitespace(line.charAt(i-1));	//i > 0, so i-1 is a valid offset.
				
				boolean isEndOfPerform = "MROF".equalsIgnoreCase(word);	//$NON-NLS-1$
				boolean isEndOfThrough = "HGUO".equalsIgnoreCase(word);	//$NON-NLS-1$
				if (!isEndOfPerform && !isEndOfThrough)
					return false;
				//Otherwise we have to look at a few more characters to be sure that we have PERFORM or THROUGH before.
			}
			else if (reversed.length() == 7)
			{
				String word = reversed.toString();
				boolean isPerform = "MROFREP".equalsIgnoreCase(word);	//$NON-NLS-1$
				boolean isThrough = "HGUORHT".equalsIgnoreCase(word);	//$NON-NLS-1$
				return isPerform || isThrough;
			}
		}
		return false;
	}
	
	/**
	 * Input : a line representing a comment, and the offset i of its '*'.<br>
	 * 
	 * Returns true if the line is an interesting MicroPattern declaration (ie referencing a MP containing DataElements),
	 * otherwise returns false (basic COBOL comment, or MP which does not reference DataElements).<br>
	 * A MicroPattern declaration has a '!' just after the '*'. Only the char following the '!' is tested.<br>
	 * List of interesting MP:<br>
	 * A, AD, DAD, DAO, M<br>
	 * We also take $ because it can be a macro parameter, therefore it could be an interesting MP.<br>
	 * List of non interesting MP:<br>
	 * G (GT, GFT, GDB...), O (OPE), P (Perform), R(Read), W (Write, WI, WE, WF, WS), S (SQL, ), Y<br>
	 * 
	 */
	private boolean isInterestingComment(String aLine, int i)
	{
		int typeIndex = i + 2;
		if (typeIndex >= aLine.length())	//No room for the '!' and the first char of the MP name.
			return false;
		
		if (aLine.charAt(i+1) != '!')
			return false;
		
		//We can discard the MicroPatterns that we know that they cannot contain DataElement references such as GT, GFT, P, ...
		char c = aLine.charAt(typeIndex);
		return c == 'A' || c == 'D' || c == 'M' || c == '$';
	}
	
	/**
	 * Checks the token accumulated in the buffer and, if it is a valid variable, adds it to the list of rubriques.<br>
	 * The buffer is not reset by this method.
	 * 
	 * @param sb the buffer containing the token
	 * @param i the offset in the line of the char following the token
	 * @param aLine the line containing the token
	 * @param result the list to which the variable is added
	 * @return true if the variable was added to the list of rubriques, false otherwise.
	 */
	private boolean analyzeAndProcessBuffer(StringBuilder sb, int i, String aLine, List<RubriqueToken> result)
	{
		if (sb.length() == 0)
			return false;
		
		String candidate = sb.toString();
		//i is at the end of the token just found. In RubriqueToken, we need the start index of the token.
		int offsetInLine = i - candidate.length();
		if (!isGoodVariableCandidate(candidate, offsetInLine, aLine))
			return false;
		
		result.add(new RubriqueToken(candidate, offsetInLine));
		return true;
	}
	
	/**
	 * Returns the list of cobol variables found in the inputLine passed in parameter.<br>
	 * Only the 72 first chars of the line are kept, and the scan starts at offset 6 (1st char is at offset 0).
	 * A line whose char at offset 6 is a '*' is a comment: it is ignored, unless it is an interesting MicroPattern declaration.
	 * The content of the strings ('...' or "...") is ignored, except in a MicroPattern declaration.<br>
	 * The inputLine must not contain tabulation ("\t") otherwise it doesn't work. Normally, the PDP COBOL Editor only creates spaces,
	 * but for JUNIT tests, beware to put real spaces and not tabulations with the Text Editor.<br>
	 * This method is able to deal with variables whose last char is stuck to the first char of the ignored chars at column 72 and after. For
	 * that, the line is truncated to only keep the 72 first chars.
	 * 
	 * @param inputLine the COBOL line to scan; null or a line shorter than 8 chars gives an empty list
	 * @return a new list (never null, possibly empty) of the variables found, in their order of appearance in the line
	 */
	public List<RubriqueToken> findVariables(String inputLine)
	{
		//A new list is returned for each call, even when it is empty: a shared empty list
		//could be modified by a caller and would then pollute the results of the next calls.
		List<RubriqueToken> result = new ArrayList<RubriqueToken>();
		
		if (inputLine == null || inputLine.length() < MIN_LINE_LENGTH)
			return result;
		
		//It is possible to have a variable stuck to the characters in column 72 and after.
		//We have to take them into account whereas there is no explicit delimiter.
		//The border is indicated by the position in line (position 72 is the end of the cobol line).
		//To simplify we cut the line before the analysis.
		int lineLen = Math.min(inputLine.length(), MAX_LINE_LENGTH);
		String aLine = inputLine.substring(0, lineLen);
		
		//We jump the REAL comments (but not the MicroPatterns declarations)
		boolean isInAMicroPattern = false;
		if (aLine.charAt(INDICATOR_OFFSET) == '*')
		{
			if (!isInterestingComment(aLine, INDICATOR_OFFSET))
				return result;		//it is a real comment, not an interesting MicroPattern declaration.
			isInAMicroPattern = true;
		}
		
		boolean isInQuoteString = false;
		boolean isInDoubleQuoteString = false;
		StringBuilder sb = new StringBuilder();
		//No need to start at 0. The pertinent characters start at offset 6.
		for (int i=INDICATOR_OFFSET; i<lineLen; i++)
		{
			char c = aLine.charAt(i);
			if (isDelimiter(c, DELIMITERS))
			{
				if (sb.length() > 0)
				{
					analyzeAndProcessBuffer(sb, i, aLine, result);
					sb.setLength(0);	//Reset the buffer
				}
				
				//A quote opens or closes a string, but only if we are not in a string of the other kind:
				//the ' of "IT'S" is a plain char and must not hide the rest of the line.
				if (!isInAMicroPattern)
				{
					if (c == '\'' && !isInDoubleQuoteString)
						isInQuoteString = !isInQuoteString;
					else if (c == '"' && !isInQuoteString)
						isInDoubleQuoteString = !isInDoubleQuoteString;
				}
			}
			else if (!isInQuoteString && !isInDoubleQuoteString)
			{
				sb.append(c);	//The content of the strings is never put in the buffer.
			}
		}
		
		//There can be something in the buffer: the last token is ended by the end of the line.
		if (sb.length() != 0)
			analyzeAndProcessBuffer(sb, lineLen, aLine, result);

		return result;
	}
	
	/**
	 * Prints the names of the given tokens on the standard output, separated by commas and enclosed in brackets.
	 */
	private void displayTrace(List<RubriqueToken> vars)
	{
		System.out.print("Tokens=[");	//$NON-NLS-1$

		String separator = "";	//$NON-NLS-1$
		for (RubriqueToken token : vars)
		{
			System.out.print(separator);
			System.out.print(token.getName());
			separator = ", ";	//$NON-NLS-1$
		}
		System.out.println("]");	//$NON-NLS-1$
		System.out.println("");	//$NON-NLS-1$
	}
	
}
