% File: /usr/share/slsh/csv.sl
% Copyright (C) 2012-2017,2018 John E. Davis
%
% This file is part of the S-Lang Library and may be distributed under the
% terms of the GNU General Public License.  See the file COPYING for
% more information.
%---------------------------------------------------------------------------
import ("csv");

private define read_fp_callback (info)
{
   variable line, comment_char = info.comment_char;
   forever
     {
	if (-1 == fgets (&line, info.fp))
	  return NULL;

	if ((line[0] == comment_char)
	     && (0 == strnbytecmp (line, info.comment, info.comment_len)))
	  continue;

	return line;
     }
}

private define read_strings_callback (str_info)
{
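   % Strings supplied by the caller need not end in a newline.  When one
   % does not, output_crlf is set so that the next call returns "\n",
   % giving the decoder a row terminator before the following string.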
   variable line;

   if (str_info.output_crlf)
     {
	str_info.output_crlf = 0;
	return "\n";
     }
   variable i = str_info.i;
   if (i >= str_info.n)
     return NULL;
   line = str_info.strings[i];
   str_info.i = i+1;
   if (line[-1] != '\n')
     str_info.output_crlf = 1;

   return line;
}

private define resize_arrays (list, n)
{
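   % Resize each array in the list to length n: truncate arrays that are
   % longer, otherwise grow them, preserving the existing elements.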
   _for (0, length(list)-1, 1)
     {
	variable i = ();
	variable a = list[i];
	variable m = length(a);
	if (m > n)
	  {
	     list[i] = a[[:n-1]];
	     continue;
	  }
	variable b = _typeof(a)[n];
	b[[:m-1]] = a;
	list[i] = b;
     }
}

private define atofloat (x)
{
   typecast (atof(x), Float_Type);
}

private define get_blankrows_bits (val)
{
   if (val == "skip") return CSV_SKIP_BLANK_ROWS;
   if (val == "stop") return CSV_STOP_BLANK_ROWS;
   return 0;
}

private define read_row (csv)
{
   % By default, blank-row handling follows the setting given when the csv object was created.
   if (qualifier_exists ("blankrows"))
     {
	return _csv_decode_row (csv.decoder,
			       get_blankrows_bits (qualifier("blankrows")));
     }
   return _csv_decode_row (csv.decoder);
}
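
% A minimal sketch of overriding the blank-row behavior for a single call
% ("csv" is assumed to be a decoder object from csv_decoder_new, defined
% below; blankrows may be "skip" or "stop"):
#iffalse
variable row = csv.readrow (; blankrows="stop");
#endif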

private define fixup_header_names (names)
{
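   % Canonicalize header/column names: lowercase them and replace empty
   % names with "colN" (N is the 1-based column number).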
   if (names == NULL) return names;
   if (typeof (names) == List_Type)
     names = list_to_array (names);
   if (_typeof(names) != String_Type)
     return names;

   variable is_scalar = (typeof (names) != Array_Type);
   if (is_scalar)
     names = [names];

   names = strlow (names);
   variable i = where (names == "");
   names[i] = array_map (String_Type, &sprintf, "col%d", i+1);

#iffalse
   % This code is no longer necessary since S-Lang now allows arbitrary
   % structure names.
   names = strtrans (names, "^\\w", "_");
   names = strcompress (names, "_");

   _for i (0, length(names)-1, 1)
     {
	if ('0' <= names[i][0] <= '9')
	  names[i] = "_" + names[i];
     }
#endif
   if (is_scalar) names = names[0];
   return names;
}

private define pop_columns_as_array (n)
{
   if (n == 0)
     return String_Type[0];

   try
     {
	% allow a mixture of arrays and scalars
	variable columns = __pop_list (n);
	columns = [__push_list(columns)];
	return columns;
     }
   catch TypeMismatchError:
     {
	throw TypeMismatchError, "Column arguments cannot be a mixture of ints and strings";
     }
}


private define read_cols ()
{
   if ((_NARGS == 0) || (qualifier_exists ("help")))
     {
	usage("struct = .readcol ([columns] ; qualifiers)\n\
where columns is an optional 1-based array of column numbers,\n\
 or array of column names.\n\
Qualifiers:\n\
 header=header, fields=[array of field names],\n\
 type=value|array|string of 's','i','l','f','d' (str,int,long,float,dbl)\n\
 typeNTH=val (specify the type of the NTH requested column)\n\
 snan=\"\", inan=0, lnan=0L, fnan=_NaN, dnan=_NaN (defaults for empty fields),\n\
 nanNTH=val (value used for an empty field in the NTH requested column)\n\
"
	     );
     }

   variable columns = NULL;
   if (_NARGS > 1)
     {
	columns = pop_columns_as_array (_NARGS-1);
     }
   variable csv = ();

   variable fields = qualifier ("fields");
   variable header = qualifier ("header");
   variable types = qualifier ("type");
   variable snan = qualifier ("snan", "");
   variable dnan = qualifier ("dnan", _NaN);
   variable fnan = qualifier ("fnan", typecast(_NaN,Float_Type));
   variable inan = qualifier ("inan", 0);
   variable lnan = qualifier ("lnan", 0L);

   if ((fields != NULL) && (columns != NULL)
       && (length(fields) != length(columns)))
     throw InvalidParmError, "The fields qualifier must be the same size as the number of columns";

   variable flags = get_blankrows_bits (qualifier("blankrows", "skip"));

   header = fixup_header_names (header);
   columns = fixup_header_names (columns);

   variable columns_are_string = _typeof(columns) == String_Type;

   if ((header == NULL) && columns_are_string)
     throw InvalidParmError, "No header was supplied to map column names";

   variable column_ints = columns, col, i, j;
   if (columns_are_string)
     {
	column_ints = Int_Type[length(columns)];
	_for i (0, length(columns)-1, 1)
	  {
	     col = columns[i];
	     j = wherefirst (col == header);
	     if (j == NULL)
	       throw InvalidParmError, "Unknown (canonical) column name $col"$;
	     column_ints[i] = j+1;
	  }
     }

   variable row_data = _csv_decode_row (csv.decoder, flags);
   if (column_ints == NULL)
     column_ints = [1:length(row_data)];

   if (any(column_ints>length(row_data)))
     {
	throw InvalidParmError, "column number is too large for data";
     }
   variable ncols = length(column_ints);

   variable datastruct = NULL;
   if (fields == NULL)
     {
	if (columns_are_string)
	  fields = columns;
	else if (header != NULL)
	  fields = header[column_ints-1];
	else
	  fields = array_map(String_Type, &sprintf, "col%d", column_ints);
     }
   datastruct = @Struct_Type(fields);

   column_ints -= 1;		       %  make 0-based

   variable convert_funcs = Ref_Type[ncols], convert_func, val;
   variable nan_values = {}; loop(ncols) list_append(nan_values, snan);

   if (types == NULL)
     {
	types = qualifier_exists ("auto") ? 'A' : 's';
     }

   if (typeof(types) == List_Type)
     types = list_to_array (types);

   if (typeof(types) == String_Type)
     types = bstring_to_array (types);

   if ((typeof(types) == Array_Type) && (length(types) != ncols))
     throw InvalidParmError, "types array must be equal to the number of columns";

   if (typeof (types) != Array_Type)
     types = types[Int_Type[ncols]];   %  single (default) type specified

   variable i1;
   _for i (1, ncols, 1)
     {
	i1 = i-1;
	types[i1] = qualifier ("type$i"$, types[i1]);
     }

   i = where(types=='i');
   convert_funcs[i] = &atoi; nan_values[i] = typecast(inan, Int_Type);
   i = where(types=='l');
   convert_funcs[i] = &atol; nan_values[i] = typecast(lnan, Long_Type);
   i = where(types=='f');
   convert_funcs[i] = &atofloat; nan_values[i] = typecast (fnan, Float_Type);
   i = where(types=='d');
   convert_funcs[i] = &atof; nan_values[i] = typecast(dnan, Double_Type);

   _for i (1, ncols, 1)
     {
	i1 = i-1;

	if (types[i1] == 'A')
	  {
	     variable type = _slang_guess_type (row_data[i1]);
	     if (type == Double_Type)
	       {
		  convert_funcs[i1] = &atof;
		  nan_values[i1] = dnan;
		  types[i1] = 'd';
	       }
	     else if (type == Int_Type)
	       {
		  convert_funcs[i1] = &atoi;
		  nan_values[i1] = inan;
		  types[i1] = 'i';
	       }
	     else types[i1] = 's';
	  }

	val = nan_values[i1];
	nan_values[i1] = typecast (qualifier ("nan$i"$, val), typeof(val));
     }

   variable list_of_arrays = {}, array;
   variable init_size = 0x8000;
   variable dsize = init_size;
   variable max_allocated = init_size;
   _for i (0, ncols-1, 1)
     {
	val = row_data[column_ints[i]];
	array = typeof(nan_values[i])[max_allocated];
	ifnot (strbytelen(val))
	  val = nan_values[i];
	else
	  {
	     convert_func = convert_funcs[i];
	     if (convert_func != NULL)
	       val = (@convert_func)(val);
	  }
	array[0] = val;
	list_append (list_of_arrays, array);
     }

   variable nread = 1;
   variable min_row_size = 1+max(column_ints);
   while (row_data = _csv_decode_row (csv.decoder, flags), row_data != NULL)
     {
	if (length (row_data) < min_row_size)
	  {
	     % FIXME-- make what to do here configurable
	     if (length(row_data) == 0)
	       break;

	     continue;
	  }

	if (nread >= max_allocated)
	  {
	     max_allocated += dsize;
	     resize_arrays (list_of_arrays, max_allocated);
	  }

	_for i (0, ncols-1, 1)
	  {
	     val = row_data[column_ints[i]];
	     ifnot (strbytelen(val))
	       {
		  list_of_arrays[i][nread] = nan_values[i];
		  continue;
	       }
	     convert_func = convert_funcs[i];
	     if (convert_func == NULL)
	       {
		  list_of_arrays[i][nread] = val;
		  continue;
	       }
	     list_of_arrays[i][nread] = (@convert_func)(val);
	  }
	nread++;
     }
   resize_arrays (list_of_arrays, nread);
   set_struct_fields (datastruct, __push_list(list_of_arrays));
   return datastruct;
}
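
% Sketches of the per-column qualifiers accepted by .readcol ("csv" is
% assumed to be a decoder object from csv_decoder_new, defined below, and
% the header array "hdr" and the column names are illustrative assumptions):
#iffalse
% Read file columns 1 and 3.  typeNTH/nanNTH refer to the NTH requested
% column, so type1 applies to file column 1 and type2 to file column 3;
% nan1=-1 is used for empty fields in the first requested column.
variable s = csv.readcol (1, 3; type1='i', type2='d', nan1=-1);

% Columns may also be requested by (case-insensitive) name, provided a
% header array is supplied to map names to column numbers.
variable t = csv.readcol ("time", "flux"; header=hdr);
#endif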

define csv_decoder_new ()
{
   if (_NARGS != 1)
     usage ("\
obj = csv_decoder_new (file|fp|strings ; qualifiers);\n\
Qualifiers:\n\
  quote='\"', delim=',', skiplines=0, comment=string");

   variable fp = ();
   variable type = typeof(fp);
   variable func = &read_fp_callback;
   variable func_data;

   variable skiplines = qualifier("skiplines", 0);
   variable delim = qualifier("delim", ',');
   variable quote = qualifier("quote", '"');
   variable comment = qualifier("comment", NULL);
   variable comment_char = (comment == NULL) ? NULL : comment[0];
   variable flags = get_blankrows_bits (qualifier("blankrows", "skip"));

   if ((type == Array_Type) || (type == List_Type))
     {
	func = &read_strings_callback;
	func_data = struct
	  {
	     strings = fp,
	     i = skiplines, n = length(fp),
	     output_crlf = 0,
	     comment_char = comment_char,
	     comment = comment,
	  };
     }
   else
     {
	if (type != File_Type)
	  {
	     fp = fopen (fp, "r");
	     if (fp == NULL)
	       throw OpenError, "Unable to open CSV file"$;
	  }

	func_data = struct
	  {
	     fp = fp,
	     comment_char = comment_char,
	     comment = comment,
	     comment_len = ((comment == NULL) ? 0 : strbytelen(comment)),
	  };
	variable line;
	loop (skiplines)
	  () = fgets (&line, fp);
     }

   variable csv = struct
     {
	decoder = _csv_decoder_new (func, func_data, delim, quote, flags),
	readrow = &read_row,
	readcol = &read_cols,
     };

   return csv;
}
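
% Usage sketch for the decoder interface (the file name "table.csv" and the
% literal strings below are illustrative assumptions):
#iffalse
% Decode a file, skipping one leading line and '#'-comment lines:
variable csv = csv_decoder_new ("table.csv"; skiplines=1, comment="#");
variable hdr = csv.readrow ();                  % header row as a string array
variable s = csv.readcol (; header=hdr, auto);  % auto: guess column types from the first data row

% The input may also be an array (or list) of strings:
variable csv2 = csv_decoder_new (["x,y", "1,2", "3,4"]);
#endif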

% Encoder

private define writecol ()
{
   if ((_NARGS < 3) || qualifier_exists("help"))
     {
	usage("\
writecol (file|fp, list_of_column_data | datastruct | col1,col2,...)\n\
Qualifiers:\n\
  names=array-of-column-names, noheader, quoteall, quotesome, rdb\n\
"
	     );
     }

   variable csv, data, file;
   if (_NARGS == 3)
     {
	(csv, file, data) = ();
     }
   else
     {
	data = __pop_list (_NARGS-2);
	(csv, file) = ();
     }

   variable type = typeof (data);
   if ((type != List_Type) && (type != Array_Type)
       && not is_struct_type (data))
     data = {data};

   variable flags = 0;
   if (qualifier_exists ("quoteall")) flags |= CSV_QUOTE_ALL;
   if (qualifier_exists ("quotesome")) flags |= CSV_QUOTE_SOME;
   variable rdb = qualifier_exists ("rdb");

   variable fp = file;
   if (typeof(file) != File_Type)
     fp = fopen (file, "wb");
   if (fp == NULL)
     throw OpenError, "Error opening $file in write mode"$;

   variable names = NULL;
   ifnot (qualifier_exists ("noheader"))
     {
	names = qualifier ("names");
	if ((names == NULL) && is_struct_type (data))
	  names = get_struct_field_names (data);
     }

   if (is_struct_type (data))
     {
	data = {(_push_struct_field_values(data), pop())};
	list_reverse (data);
     }

   EXIT_BLOCK
     {
	ifnot (__is_same(file, fp))
	  {
	     if (-1 == fclose (fp))
	       throw WriteError, "Error closing $file"$;
	  }
     }

   variable ncols = length(data);
   if (length (data) == 0)
     return;
   variable nrows = length(data[0]), i, j;
   _for i (1, ncols-1, 1)
     {
	if (nrows != length(data[i]))
	  throw InvalidParmError, "CSV data columns must be the same length";
     }

   variable str, encoder = csv.encoder;

   if (names != NULL)
     {
	if (typeof (names) == List_Type)
	  names = list_to_array (names);
	str = _csv_encode_row (encoder, names, flags);
	if (-1 == fputs (str, fp))
	  throw WriteError, "Write to CSV file failed";
	if (rdb)
	  {
	     variable types = String_Type[ncols];
	     _for i (0, ncols-1, 1)
	       types[i] = __is_datatype_numeric (_typeof(data[i])) ? "N" : "S";

	     str = _csv_encode_row (encoder, types, flags);
	     if (-1 == fputs (str, fp))
	       throw WriteError, "Write to CSV file failed";
	  }
     }

   variable row_data = String_Type[ncols];
   _for i (0, nrows-1, 1)
     {
	_for j (0, ncols-1, 1)
	  row_data[j] = string (data[j][i]);

	str = _csv_encode_row (encoder, row_data, flags);
	if (-1 == fputs (str, fp))
	  throw WriteError, "Write to CSV file failed";
     }
}
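
% Sketch of the datastruct form of writecol ("enc" is assumed to be an
% encoder object from csv_encoder_new, defined below; the field names and
% file name are illustrative):
#iffalse
variable tbl = struct { x = [1, 2, 3], label = ["a", "b", "c"] };
enc.writecol ("out.csv", tbl);            % header row taken from field names
enc.writecol ("out.csv", tbl; noheader);  % data rows only
#endif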

define csv_encoder_new ()
{
   if (qualifier_exists ("help"))
     {
	usage ("csv = csv_encoder_new ();\n\
Qualifiers:\n\
  delim=','\n\
  quote='\"'\n\
  quotesome, quoteall\n\
  rdb\n\
"
	      );
     }

   variable flags = 0;
   if (qualifier_exists ("quoteall")) flags |= CSV_QUOTE_ALL;
   if (qualifier_exists ("quotesome")) flags |= CSV_QUOTE_SOME;
   variable quotechar = qualifier ("quote", '"');
   variable delimchar = qualifier ("delim",
				   qualifier_exists ("rdb") ? '\t' : ',');

   variable csv = struct
     {
	encoder = _csv_encoder_new (delimchar, quotechar, flags),
	writecol = &writecol,
     };

   return csv;
}
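
% Usage sketch for the encoder interface (the file name and column data are
% illustrative assumptions):
#iffalse
variable enc = csv_encoder_new (; delim='|', quotesome);
enc.writecol ("out.csv", [1, 2, 3], ["a", "b", "c"]; names=["n", "s"]);
#endif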

define csv_writecol ()
{
   if ((_NARGS < 2) || qualifier_exists("help"))
     {
	usage("\
csv_writecol (file|fp, list_of_column_data | datastruct | col1,col2,...)\n\
Qualifiers:\n\
  names=array-of-column-names, noheader, quote=val, quoteall, quotesome\n\
"
	     );
     }

   variable args = __pop_list (_NARGS);
   variable csv = csv_encoder_new (;;__qualifiers);
   csv.writecol (__push_list(args);;__qualifiers);
}
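
% Convenience-function sketch (the file name and data are illustrative):
#iffalse
csv_writecol ("out.csv", struct { x = [1, 2], y = ["a", "b"] }; quotesome);
#endif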

private define convert_to_numeric (s, name)
{
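   % Convert a string-valued struct field to a numeric array in place,
   % choosing a numeric type from the guessed types of its elements.
   % (Used for rdb columns whose declared type is "N".)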
   variable val = get_struct_field (s, name);
   variable num = length (val);
   if ((num == 0) || (_typeof (val) != String_Type))
     return;

   EXIT_BLOCK
     {
	set_struct_field (s, name, val);
     }

   variable types = DataType_Type[num];
   _for (0, length (val)-1, 1)
     {
	variable i = ();
	variable type = _slang_guess_type (val[i]);
	if (type == Double_Type)
	  {
	     val = atof (val);
	     return;
	  }
	types[i] = type;
     }

   if (all (types == Int_Type))
     {
	val = atoi (val);
	return;
     }

   if (any (types == Float_Type))
     {
	val = atofloat (val);
	return;
     }

   if (any (types == Long_Type))
     {
	val = atol (val);
	return;
     }

   if (any (types == Int_Type))
     {
	val = atoi (val);
	return;
     }

   val = atof (val);
}

define csv_readcol ()
{
   if ((_NARGS == 0) || qualifier_exists("help"))
     {
	usage ("struct = csvreadcol (file|fp [,columns] ;qualifier)\n\
where columns is an optional 1-based array of column numbers,\n\
 or array of column names.\n\
Qualifiers:\n\
 quote='\"', delim=',', skiplines=0, comment=string, has_header,\n\
 header=header, fields=[array of field names],\n\
 type=value|array of 's','i','l','f','d' (string,int,long,float,double)\n\
 typeNTH=val (specify the type of the NTH requested column)\n\
 snan=\"\", inan=0, lnan=0L, fnan=_NaN, dnan=_NaN (defaults for empty fields),\n\
 nanNTH=val (value used for an empty field in the NTH requested column)\n\
"
	      );
     }

   variable file, columns;
   columns = __pop_list (_NARGS-1);
   file = ();

   variable q = __qualifiers ();
   variable rdb = qualifier_exists ("rdb");

   % rdb files are tab-delimited files, # is a comment character,
   % the first non-comment line contains the field names, the
   % second line gives the field types.
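   %
   % A hypothetical example of the first lines of such a file, where
   % <TAB> stands for a literal tab character:
   %    # a comment line
   %    name<TAB>count
   %    8S<TAB>4N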
   if (rdb)
     {
	q = struct { comment = "#", delim = '\t' };
     }
   variable types = NULL;
   variable csv = csv_decoder_new (file ;; q);
   if (rdb || qualifier_exists ("has_header"))
     {
	variable header = csv.readrow ();
	q = struct { header=header, @q };
	if (rdb)
	  {
	     % The type field consists of an integer, followed by a
	     % type specifier, and a justification character.  The
	     % integer and justification characters are for display
	     % purposes. The type specifier is N for numeric, S for
	     % string, M for month.  Here, M and S will be treated the
	     % same.
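	     % For example, a hypothetical type row "8S<TAB>4>N" reduces
	     % to ["S", "N"] once the strtrans call below strips the
	     % digits and the '<'/'>' justification characters.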
	     types = csv.readrow ();
	     types = strtrans (types, "0-9<>", "");
	  }
     }

   variable s = csv.readcol (__push_list(columns) ;; q);
   if (rdb)
     {
	ifnot (length (columns))
	  columns = header;

	header = fixup_header_names (header);
	foreach (columns)
	  {
	     variable col = ();
	     if (typeof (col) == String_Type)
	       col = fixup_header_names (col);
	     else
	       col = header[col-1];

	     variable i = wherefirst (col == header);
	     if ((i == NULL) || (types[i] != "N"))
	       continue;

	     convert_to_numeric (s, col);
	  }
     }
   return s;
}
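
% Usage sketches (the file names are illustrative assumptions):
#iffalse
% Plain CSV with a header row; the auto qualifier guesses column types:
variable s = csv_readcol ("table.csv"; has_header, auto);

% Tab-delimited rdb file; columns typed "N" are converted to numeric arrays:
variable r = csv_readcol ("table.rdb"; rdb);
#endif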