# ---------------------------------------------------------------
# Program 'cleancsv'
#
# This program will read comma-separated text files exported from
# Microsoft Excel, and produce corresponding text files that are
# more easy to read by Fortran programs.
#
# Input files have file extension *.CSV.  Output files have
# extension *.TXT and are written into the same directory as the
# corresponding input files.
#
# A list of input files to process is read from a master data
# file named 'cleancsv.txt'.  This is a text file that contains
# full or partial path names of input files, one on each line.
#
# Author and contact:
#
#         Petr Kuzmic, Ph.D.
#         BioKin Ltd. - Software & Consulting
#
#         pkuzmic@biokin.com
#         http://www.biokin.com
#
# 04/02/2002 added the line: $csv::input_file_extension = "CSV";
# to force the script to read capital .CSV extensions as well 
# as lower case - Tammy W. Beaty, taw@ornl.gov
# ---------------------------------------------------------------

use strict;
package csv;

# ----------------------
# Define some constants.
# ----------------------
$csv::input_file_list           = "cleancsv.txt";
$csv::input_file_extension      = "csv";
$csv::input_file_extension	= "CSV";
$csv::output_file_extension     = "txt";
$csv::empty_cell                = " "; # change to "" if desired

# ---------------------------------
# Get the list of files to process.
# ---------------------------------
open (LIST, $csv::input_file_list) or die "Could not open file $csv::input_file_list!\n";
@csv::list = <LIST>;
close (LIST);

# ------------------
# Process the files.
# ------------------
foreach $csv::input_file (@csv::list) {
   chomp ($csv::input_file);
   process_file ($csv::input_file);
}

# ---------
# All done.
# ---------
print "\n\nAll done.  Press Enter key to exit program.";
<STDIN>;
exit ();

# ---------------------------------------------------------------
# Process a single input file.
# ---------------------------------------------------------------
sub process_file
{
	# ------------------------
   # Get the input file name.
	# ------------------------
   my ($input_file) = @_;
	my ($status, $output_file);
	my (@input, $line, $number_of_lines, @fields, $field, $length, $counter);

	# ---------------------
	# Try to open the file.
	# ---------------------
	$status = open (IN, $input_file);
	if ($status != 1) {
	   print "ERROR: The file $input_file could not be found!\n";
		return;
	}

	print "processing $input_file...\n";

   # ------------------------
	# Create output file name.
   # ------------------------

	$input_file =~ m|(.+)$csv::input_file_extension$|;
	$output_file = "$1$csv::output_file_extension";

   # ---------------------------------
   # Open the output file for writing.
   # ---------------------------------
	$status = open (OUT, ">$output_file");
	if ($status != 1) {
	   print "ERROR: The file $output_file could not be opened!\n";
		return;
	}

   # ----------------------------------
   # Process input file line by line.
   # ----------------------------------
	$number_of_lines = 0;
	foreach $line (<IN>) {
		chomp ($line);
		$number_of_lines++;

		@fields = split_line ($line);
		mark_fields_as_quoted (\@fields);

		$length = @fields;
		$counter = 0;
		foreach $field (@fields) {
		   if ($field eq "") {
			   $field = $csv::empty_cell;
			}
		   if ($csv::quoted[$counter]) {
				print OUT "\"$field\"";
			}
			else {
				print OUT $field;
			}
			if ($counter++ == $length-1) {
				print OUT "\n";
			}
			else {
				print OUT ",";
			}
		}
	}
	close (OUT);
	print "written    $number_of_lines lines.\n\n";
}

# ---------------------------------------------------------------
# Split a comma-separated text line into fields.
# Some fields may contain commas, in which case they are enclosed
# in double quotes (").
# ---------------------------------------------------------------
sub split_line
{
   my ($line) = @_;
	my (@fields, $inside_quoted_field, $field, $length, $i, $char);

   undef @fields;
	$inside_quoted_field = 0;
   $field = "";
	$length = length ($line);

	for ($i = 0; $i < $length; $i++) {
	   $char = substr ($line, $i, 1);
		if ($char eq '"') {
		   if (! $inside_quoted_field) {
			   $inside_quoted_field = 1;
			}
			else {
			   $inside_quoted_field = 0;
			}
		}
		elsif (($char eq ',') && ! $inside_quoted_field) {
			push @fields, $field;
		   $field = "";
		}
		else {
		   $field .= $char;		   
		}
	}
	push @fields, $field;

	return @fields
}


# ---------------------------------------------------------------
# Mark the fields that should be enclosed in double quotes.
# ---------------------------------------------------------------
sub mark_fields_as_quoted
{
   my ($input) = @_;
	my (@fields, $number_of_fields, $numerical_value, $i);

	@fields = @$input;
	$number_of_fields = @fields;
	undef @csv::quoted;

   # --------------------------------
   # First mark all fields as quoted.
   # --------------------------------
	for ($i = 0; $i < $number_of_fields; $i++) {
		$csv::quoted[$i] = 1;
	}

   # --------------------------------------------------
	# Now un-mark all numerical fields.
   # --------------------------------------------------
	for ($i = 0; $i < $number_of_fields; $i++) {
	   if (is_a_number ($fields[$i])) {
		   $csv::quoted[$i] = 0;
		}
	}
}

# ---------------------------------------------------------------
# Determine if a given text represents a number.
# ---------------------------------------------------------------
sub is_a_number
{
   my ($input) = @_;
	my ($is_a_number);

   # --------------------------------------
   # Assume that the input is not a number.
   # --------------------------------------
	$is_a_number = 0;

   # --------------------------------------------
   # Now check if the input actually is a number.
   # --------------------------------------------

   if ($input =~ m|^\-?\d+$|) {

      # ----------------------------------------------------
	   # The input matches an integer.
		# Example: "1", "23", "123", "-1", "-23", "-123", etc.
      # ----------------------------------------------------
	   $is_a_number = 1;
	}
   elsif ($input =~ m|^\-?\d+\.\d+$|) {

      # --------------------------------------------------------------
	   # The input matches a real number in the simple notation
		# Example: "1.12", "23.4", "-1.12", "-23.4",  etc.
		# Note: ".123", meaning 0.123, is NOT accepted by this filter.
		# A number must not begin with a period, or a negative sign
		# followed by a period.
      # --------------------------------------------------------------
	   $is_a_number = 1;
	}
   elsif ($input =~ m|^\-?\d+\.\d+[eE]\-?\d+$|) {

      # --------------------------------------------------------------
	   # The input matches a real number in the "scientific" notation
		# Example: "1.12e02", "23.4E001", "-1.12e-1", "-23.4E-001",  etc.
		# Note: ".123e04", meaning 123.0, is NOT accepted by this filter.
		# A number must not begin with a period, or a negative sign
		# followed by a period.
      # --------------------------------------------------------------
	   $is_a_number = 1;
	}

	return $is_a_number;
}
