pike module repository: Public.Standards.CSV

modules.gotpike.org
Modules
ADT
Database
GTK2
GUI
IP
PiJAX
Public
Sql
Stdio
Subversion
System
Tools
Xosd
lua
v4l2
wx
Recent Changes
Public.Parser.XML2 1.50
Public.ZeroMQ 1.1
Public.Template.Mustache 1.0
Public.Protocols.XMPP 1.4
Sql.Provider.jdbc 1.0
Popular Downloads
Public.Parser.JSON2 1.0
Public.Parser.JSON 0.2
GTK2 2.23
Public.Web.FCGI 1.8
Public.Parser.XML2 1.48
Module Information
Public.Standards.CSV
Viewing contents of Public_Standards_CSV-0.1/module.pmod.in
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * Based on Format.CSV Pike module by
 * James Tyson, DogStar SOFTWARE .
 * Portions created by the Initial Developer are Copyright (C) 2005
 * the Initial Developer. All Rights Reserved.
 *
 * Author(s):
 *   Bertrand LUPART 
 *
 * ***** END LICENSE BLOCK ***** */

/* $Id$ */

// Module metadata: version, author and the files making up the module.
string __version = "0.1";
string __author = "Bertrand LUPART ";
array __components = ({ "Public.pmod/Standards.pmod/CSV.pmod/module.pmod" });

// Initial value of the type detection flag of a FILE object
// (0: read_row() hands the fields back as strings).
static int default_type_detection = 0;

// Matches a field containing a comma, a double quote, a line feed or a
// carriage return. FILE.write_row() encloses such fields in double quotes.
static object _enquote = Regexp("(,|\"|\n|\r)");

// Patterns used by detect_type(), tried in this order.
static object _int = Regexp("^[0-9]+$"); // Digits only: no sign is accepted
static object _float = Regexp("^[0-9]+\\\.[0-9]*$"); // Digits, a dot, optional digits: no sign, no exponent
static object _string = Regexp("^\"*.+\"*$"); // One or more characters, optionally wrapped in double quotes



/* Common CSV functions */

//! Escape a value so that it can be stored in a CSV field.
//! Every double quote character is doubled: " -> ""
//! The enclosing double quotes, if any are needed, are not added here.
//!
//! @param in
//! The raw string to escape
//!  Example: John "foo" Doe
//!
//! @returns
//! The escaped string, ready to be written in a CSV file
//!  Example: John ""foo"" Doe
string enquote(string in)
{
	string escaped = replace(in, "\"", "\"\"");
	return escaped;
}


//! Unescape a value read from a CSV field.
//! Every doubled double quote is reduced to a single one: "" -> "
//! Enclosing double quotes are not removed here.
//!
//! @param in
//! The escaped string, as found in the CSV data
//!  Example: John ""foo"" Doe
//!
//! @returns
//! The unescaped string, ready to be processed
//!  Example: John "foo" Doe
string dequote(string in)
{
	string unescaped = replace(in, "\"\"", "\"");
	return unescaped;
}


//! Guess the "human" type hidden behind a CSV field.
//!
//! CSV only stores strings, but a field may well hold a number.
//!
//! Example:
//!  "42" -> 42
//!  "3.14" -> 3.14
//!  "foo" -> "foo"
//!
//! @param v
//! The value to inspect. Anything that is not a string is returned untouched.
//!
//! @returns
//! The value casted to the detected type:
//!  the zero_type function for MySQL's NULL marker (\N),
//!  an int for a digits-only string,
//!  a float for digits followed by a dot and optional digits,
//!  the string itself if it matches the string pattern,
//!  the empty string otherwise.
mixed detect_type(mixed v)
{
	// Only strings carry a type to be guessed
	if (!stringp(v))
		return v;

	// MySQL dumps NULL as \N. The zero_type function itself is handed back
	// as the NULL marker (the original author wanted something other than a
	// plain 0 here).
	if (v == "\\N")
		return zero_type;

	// Digits only: an int
	if (_int->match(v))
		return (int)v;

	// Digits, a dot, optional digits: a float
	if (_float->match(v))
		return (float)v;

	// Whatever the string pattern accepts is kept as it is; the empty
	// string is the fallback.
	return _string->match(v) ? v : "";
}


/* CSVIterator */

// This CSVIterator takes a Stdio.FILE as argument, reads data from it using
// Stdio.FILE()'s line_iterator and converts the CSV data on the fly.
// Since that's a generic Iterator, it could easily be updated to take anything
// as an argument for parsing CSV from it.
//! Iterator over the rows of a CSV file.
//!
//! Each value is an array of strings holding the fields of one CSV row.
//! A row may span several lines of the file when a quoted field contains
//! a line break.
class CSVIterator
{
	static int csv_index=-1; // index of the current CSV row, -1 before the first one
	static array csv_line = ({ }); // fields of the current CSV row, 0 once exhausted


	static Stdio.FILE input_file; // The file containing the data
	static int file_remaining = 1; // Is there still some data in the file to read?

	// file_iterator reads the file a line at a time
	object file_iterator;


	/* Iterator API */

	//! @param _input
	//! The file containing the CSV data
	void create(Stdio.FILE _input)
	{
		input_file = _input;

		// Get the line_iterator from Stdio.FILE.
		// This allows to handle \n, \r\n, and \r files
		file_iterator = input_file->line_iterator(1);

		// Load the first row right away, so that value() is usable
		next();
	}

	//! @returns
	//! 1 once there is nothing left to read, 0 otherwise
	int `!()
	{
		return !file_remaining;
	}

	//! Advance the iterator.
	//!
	//! @param steps
	//! The number of CSV rows to move forward
	//!
	//! @returns
	//! The iterator itself
	CSVIterator `+=(int steps)
	{
		for(int i=0; i<steps; i++)
		{
			string in = file_iterator->value();

			// No more line in the file: the iterator is exhausted
			if (!in)
			{
				file_remaining=0; // there is no file remaining
				csv_line = 0; // current CSV line is empty
				return this; // exit
			}

			// Feed csv_line. This may consume more lines from file_iterator
			// if a quoted field contains a line break.
			parse_csv(in);

			// Go to the next element
			csv_index++;
			file_iterator->next();
		}

		return this;
	}

	//! @returns
	//! The index of the current CSV row, starting at 0
	int index()
	{
		return csv_index;
	}

	//! Move to the next CSV row.
	//!
	//! @returns
	//! 1 if a row is available after the move, 0 if the data is exhausted
	int next()
	{
		`+=(1);
		return file_remaining;
	}

	//! @returns
	//! The fields of the current CSV row, or 0 if the data is exhausted
	int|array value()
	{
		return csv_line;
	}


	/* CSVIterator-specific methods */

	// Turns a raw field, as cut out of the line, into its value:
	// the enclosing double quotes are removed first, then the doubled
	// quotes are reduced. Doing it in this order keeps a field made only of
	// escaped quotes intact ("""" decodes to a single ").
	static string clean_field(string raw)
	{
		if (sizeof(raw) && (raw[0] == '\"') && (raw[sizeof(raw)-1] == '\"'))
			raw = raw[1..sizeof(raw)-2];

		return dequote(raw);
	}

	//! Parses a string and tries to find some CSV data in it.
	//! The csv_line array is fed with the data.
	//!
	//! If the line ends inside a quoted field, the following lines of the
	//! file are appended (joined with \n) until the quote is closed or the
	//! file ends.
	//!
	//! No heuristic is done yet to try to manage malformed CSV data.
	//!
	//! @param in
	//!  The line from the file we want to parse, as a string
	static void parse_csv(string in)
	{
		// We can't just divide the string on comma, since commas can be quoted
		int quoted = 0; // are we inside a quote sequence?
		int last = 0; // start of the field currently being read
		int i = 0; // our current position in the line
		array result = ({ });

		// sizeof(in) is evaluated on each turn, since the line can grow below
		while(i < sizeof(in))
		{
			switch(in[i])
			{
				// a " is found, reverse the quote status
				case '\"':
					quoted = !quoted;
					break;

				// a , is found
				case ',':
					// if we are not in a quote sequence, split the string
					if(!quoted)
					{
						result += ({ clean_field(in[last..(i-1)]) });
						last = i+1;
					}
					break;
			}

			// If we're at the end of the line and quoted, we have a line break
			// in a field: append the next line of the file and keep going
			if(quoted && i==(sizeof(in)-1))
			{
				// FIXME: we're adding \n here, regardless the original data was \n, \r
				// or \r\n
				if(file_iterator->next())
					in += "\n"+file_iterator->value();
				else
				{
					// Unterminated quote at the end of the file: stop here,
					// otherwise we would loop forever on the last character
					file_remaining = 0;
					break;
				}
			}

			i++;
		}

		// Adding the last part
		result += ({ clean_field(in[last..]) });

		csv_line = result;
	}
}


/* Public.Standards.CSV.FILE */

//! A Stdio.FILE able to read and write CSV rows.
class FILE
{
	inherit Stdio.FILE;

	static int _standards=1; // Do we want to be standards compliant for output?
	static int do_type_detection=default_type_detection; // Do we detect types when reading?

	// csv_iterator reads a CSV line at a time
	// a CSV line can be split into multiple file lines
	// It is created by the first call to read_row()
	object csv_iterator;

	//! If standards compliant, not all the fields will be enclosed in double
	//! quotes, only those containing double quotes, commas and newlines
	//!
	//! @param t
	//!  1 sets the file to be standards compliant
	//!  0 unsets it
	void set_standard_compliance(int t)
	{
		_standards = t;
	}


	//! If standards compliant, not all the fields will be enclosed in double
	//! quotes, only those containing double quotes, commas and newlines
	//!
	//! @returns
	//! 1 or 0 whether the file has been set standards compliant or not
	int get_standard_compliance()
	{
		return _standards;
	}


	//! Enable or disable the type detection done by read_row().
	//!
	//! @param t
	//!  1 sets the file to detect types
	//!  0 unsets it
	void set_type_detection(int t)
	{
		do_type_detection = t;
	}


	//! Check if type detection is enabled or not.
	//!
	//! @returns
	//!  1 or 0 whether the file has been set to detect types or not
	int get_type_detection()
	{
		return do_type_detection;
	}

	//! Write a row
	//!
	//! Each value is casted to a string and its double quotes are doubled.
	//! The fields are joined with commas and the row is ended with \n.
	//!
	//! @param row
	//!  The data to write, either as several arguments or as a single array
	//!
	//! @returns
	//! The number of bytes written
	int write_row(mixed... row)
	{
		// A single array argument is the row itself
		if ((sizeof(row) == 1) && arrayp(row[0]))
			row = row[0];

		array result = ({});
		foreach(row, mixed r)
		{
			string v = (string)r;

			// Non standards compliant output quotes every field, standards
			// compliant output only the fields that need it
			if (!_standards || _enquote->match(v))
				v = sprintf("\"%s\"", enquote(v));

			result += ({ v });
		}

		return ::write((result * ",") + "\n");
	}



	//! Read a row
	//!
	//! @returns
	//! The row split into an array
	//! 0 if no data
	int|array read_row()
	{
		// We have to instantiate the CSVIterator the first time
		if(!objectp(csv_iterator))
		{
			csv_iterator = CSVIterator(this_object());
		}

		mixed res = csv_iterator->value();

		// Type detection has not made it into CSVIterator, because it was hell
		// to set/unset type detection on the fly this way.
		// res is 0 once the data is exhausted: there is nothing to detect then.
		if(do_type_detection && arrayp(res))
		{
			foreach(res; mixed indice; mixed value)
			{
				res[indice] = detect_type(value);
			}
		}

		// Move the iterators to the next line
		csv_iterator->next();

		return res;
	}

	// Iterating over the file (foreach) yields CSV rows.
	// A new CSVIterator is created on each call.
	static object _get_iterator()
	{
		return  CSVIterator(this_object());
	}

	// Same output as the inherited _sprintf, with this class's name
	// substituted for "Stdio.FILE"
	static string _sprintf(mixed... args)
	{
		return replace(::_sprintf(@args), "Stdio.FILE", "Public.Standards.CSV.FILE");
	}
}