X-Git-Url: http://nitlanguage.org

diff --git a/lib/csv/csv.nit b/lib/csv/csv.nit
index 441f95c..abc4437 100644
--- a/lib/csv/csv.nit
+++ b/lib/csv/csv.nit
@@ -15,371 +15,328 @@
 # CSV document handling.
 module csv
 
-# Specifies a CSV format.
-class CsvFormat
-	# The character that delimits escaped value.
-	#
-	# The delimiter is escaped by doubling it.
-	var delimiter: Char
-
-	# The character that split each cell in a row.
-	var separator: Char
-
-	# The character that ends a row (end of line).
-	var eol: String
+redef class Text
+	# Escape the content of `self` for inclusion in a CSV document
+	private fun escape_to_csv(sep_char, delim_char: Char, eol: String): String do
+		var add_sp = chars_to_escape_csv(sep_char, delim_char, eol)
+		if add_sp == 0 then return to_s
+		var bf = new Buffer.with_cap(add_sp + byte_length)
+		bf.add '"'
+		for i in [0 .. length[ do
+			var c = self[i]
+			if c == delim_char then
+				bf.add c
+			end
+			bf.add c
+		end
+		bf.add '"'
+		return bf.to_s
+	end
 
-	# Escape sequence for the delimiter.
-	private var escaping = "{delimiter}{delimiter}" is lazy
+	# How many more bytes should be allocated for CSV escaping ?
+	private fun chars_to_escape_csv(sep_char, delim_char: Char, eol: String): Int do
+		var more_ln = 0
+		var ln = length
+		var need_esc = false
+		var fst_eol = eol.first
+		var i = 0
+		while i < ln do
+			var c = self[i]
+			if c == delim_char then more_ln += 1
+			if c == fst_eol then
+				need_esc = true
+				for j in [1 .. eol.length[ do
+					i += 1
+					c = self[i]
+					if c != eol[j] then
+						i -= j
+						need_esc = false
+						break
+					end
+				end
+			end
+			if c == sep_char then need_esc = true
+			i += 1
+		end
+		var more = more_ln * delim_char.u8char_len
+		if need_esc then more += 2
+		return more
+	end
 
-	# Escape the specified cell.
-	private fun escape_cell(cell: String): Text do
-		var result = new RopeBuffer
-		result.add delimiter
-		result.append cell.replace(delimiter, escaping)
-		result.add delimiter
-		return result
+	# Unescape the content of `self` from CSV format to Nit String
+	private fun unescape_csv(delim_char: Char): String do
+		var to_un = chars_to_unescape_csv(delim_char)
+		if to_un == 0 then return to_s
+		var buf = new Buffer.with_cap(byte_length - to_un)
+		var pos = 0
+		var ln = length
+		while pos < ln do
+			var c = self[pos]
+			if c == delim_char then pos += 1
+			buf.add c
+			pos += 1
+		end
+		return buf.to_s
 	end
 
-	# Can the specified value be inserted without any escaping?
-	private fun is_value_clean(value: String): Bool do
-		for c in value.chars do
-			if c == delimiter then return false
-			if c == separator then return false
-			if eol.chars.has(c) then return false
+	# How many bytes should be removed for CSV unescaping ?
+	private fun chars_to_unescape_csv(delim_char: Char): Int do
+		var pos = 0
+		var to_un = 0
+		var ln = length
+		while pos < ln do
+			var c = self[pos]
+			if c == delim_char then
+				pos += 1
+				to_un += 1
+			end
+			pos += 1
 		end
-		return true
+		return to_un
 	end
 end
 
+# Shared properties by all CSV-related classes
+#
+# This class is basically only here for implementation purposes and should not be used
+# by clients for typing.
+abstract class CsvStream
+	# The character that delimits escaped value.
+	#
+	# The delimiter is escaped by doubling it.
+	var delimiter = '"' is writable
+
+	# The character that split each cell in a record.
+	var separator = ',' is writable
+
+	# The character that ends a record (end of line).
+	var eol = "\n" is writable
+end
+
 # A CSV document representation.
 class CsvDocument
-	super Streamable
-
-	# The format to use.
-	#
-	# Defaults to `rfc4180`.
-	var format: CsvFormat = rfc4180 is writable
+	super Writable
+	super CsvStream
 
 	# The header.
 	#
 	# Contains the name of all fields in this table.
-	var header: Array[String] = new Array[String] is writable
+	var header = new Array[String] is writable, optional
 
 	# The list of the records.
 	#
 	# All records must have the same length than `header`.
-	var records: Array[Array[String]] = new Array[Array[String]]
+	var records = new Array[Array[String]] is writable, optional
 
-	# Replace the header by the specified row.
-	fun set_header(values: Object...) do
-		header.clear
-		for value in values do header.add(value.to_s)
-	end
-
-	# Append the specfied record.
-	fun add_record(values: Object...) do
-		assert values.length == header.length else
-			sys.stderr.write "CSV error: Header declares {header.length} columns, record contains {values.length} values.\n"
-		end
-		var record = new Array[String]
-		for value in values do record.add(value.to_s)
-		records.add(record)
+	# Adds a new record to document containing the values in `objs`
+	fun add_record(objs: Object...) do
+		var ln = new Array[String].with_capacity(objs.length)
+		for i in objs do ln.add(i.to_s)
+		records.add ln
 	end
 
 	redef fun write_to(stream) do
-		var writer = new CsvWriter.with_format(stream, format)
-		writer.write_sequence(header)
-		for record in records do writer.write_sequence(record)
+		var s = new CsvWriter(stream)
+		s.separator = separator
+		s.eol = eol
+		s.delimiter = delimiter
+		if not header.is_empty then
+			s.write_line header
+		end
+		s.write_lines(records)
 	end
 
-	# Deprecated alias for `write_to_file`.
-	fun save(file: String) do write_to_file(file)
-
 	# Load from the specified stream.
 	#
 	# Parameters:
 	#
 	# * `stream`: Input stream.
-	# * `has_header`: Is the first row the header?
-	# * `skip_empty`: Do we skip the empty lines?
-	# For details, see `CsvReader.skip_empty`.
-	fun load_from(stream: IStream, has_header: Bool, skip_empty: Bool) do
-		var reader = new CsvReader.with_format(stream, format)
+	# * `has_header`: Is the first record the header? - defaults to true
+	# * `skip_empty`: Do we skip the empty lines? - defaults to true
+	fun load_from(stream: Reader, has_header: nullable Bool, skip_empty: nullable Bool) do
+		if has_header == null then has_header = true
+		if skip_empty == null then skip_empty = true
+		var reader = new CsvReader(stream)
+		reader.separator = separator
+		reader.eol = eol
+		reader.delimiter = delimiter
 		reader.skip_empty = skip_empty
-		if has_header then
-			if reader.is_ok then
-				header = reader.item
-			else
-				header.clear
-			end
-		end
-		records.clear
-		for record in reader do records.add(record)
-	end
-
-	# Load from the specified file.
-	#
-	# Parameters:
-	#
-	# * `path`: Path of the file.
-	# * `has_header`: Is the first row the header?
-	# * `skip_empty`: Do we skip the empty lines?
-	fun load(path: String, has_header: Bool, skip_empty: Bool) do
-		var istream = new IFStream.open(path)
-		load_from(istream, has_header, skip_empty)
-		istream.close
 	end
 end
 
-# Appends CSV rows to a file.
+# Appends CSV records to a file.
 #
 # By default, uses the format recommended by RFC 4180 (see `rfc4180`).
 #
-# Note: If a row contains only an empty cell, its representation is
+# Note: If a record contains only an empty cell, its representation is
 # undistinguishable from an empty line. This is because the empty values are
 # always written unescaped in order to avoid them to be interpreted as escaped
 # delimiters by some parsers.
 #
 # ~~~nit
-# var out = new StringOStream
+# var out = new StringWriter
 # var writer = new CsvWriter(out)
-# writer.write_row(1, 2.0, "foo\nbar")
-# writer.write_sequence([""])
-# assert out.to_s == """1,2.0,"foo\nbar"\r\n\r\n"""
+# writer.write_elements(1, 2.0, "foo\nbar")
+# writer.write_line([""])
+# assert out.to_s == """1,2.0,"foo\nbar"\n\n"""
 # ~~~
 class CsvWriter
+	super CsvStream
 
 	# The output stream.
-	var ostream: OStream
+	var ostream: Writer
 
-	# The format to use.
-	#
-	# Defaults to `rfc4180`.
-	var format: CsvFormat = rfc4180
+	# Write several lines to a stream
+	fun write_lines(lines: Array[Array[Object]]) do for i in lines do write_line i
 
-	# Do we escape all cells (except empty ones)?
-	#
-	# If `false` (the default), escape only cells that contain a metacharacter
-	# of the format. In all cases, empty cells are not escaped. This option
-	# permits to choose between the optimization of the performances (when
-	# `true`) and optimization of the size of the output (when `false`).
-	#
-	# Note: Escaping may not be correctly recognized by some parsers.
-	var always_escape = false is writable
-
-	# Create a new writer with the specified format.
-	init with_format(ostream:OStream, format: CsvFormat) do
-		self.ostream = ostream
-		self.format = format
-	end
-
-	# Append the specified sequence as a row.
+	# Append the elements in `els` as a record.
 	#
 	# The representation of each cell is determined by `to_s`.
-	fun write_sequence(row: SequenceRead[Object]) do
-		if not row.is_empty then
-			var i = row.iterator
-			var separator = format.separator.to_s
-			write_cell i.item.to_s
-			i.next
-			for cell in i do
-				ostream.write separator
-				write_cell cell.to_s
-			end
+	fun write_elements(els: Object...) do
+		var os = ostream
+		var esc = delimiter
+		var sep = separator
+		var eol = eol
+		for i in [0 .. els.length - 1[ do
+			os.write(els[i].to_s.escape_to_csv(sep, esc, eol))
+			os.write_char(sep)
 		end
-		ostream.write format.eol
+		os.write(els.last.to_s.escape_to_csv(sep, esc, eol))
+		os.write(eol)
 	end
 
-	# Append the specified row.
+	# Append the specified record.
 	#
 	# The representation of each cell is determined by `to_s`.
-	fun write_row(row: Object...) do write_sequence(row)
-
-	# Close the output stream.
-	fun close do ostream.close
-
-	private fun write_cell(cell: String) do
-		if cell.is_empty then return
-		if not always_escape and format.is_value_clean(cell) then
-			ostream.write cell
-		else
-			ostream.write format.escape_cell(cell)
+	fun write_line(line: Array[Object]) do
+		var os = ostream
+		var esc = delimiter
+		var sep = separator
+		var eol = eol
+		for i in [0 .. line.length - 1[ do
+			os.write(line[i].to_s.escape_to_csv(sep, esc, eol))
+			os.write_char(sep)
 		end
+		os.write(line.last.to_s.escape_to_csv(sep, esc, eol))
+		os.write(eol)
 	end
 end
 
-# Reads rows from a CSV file.
+# Reads records from a CSV file.
 #
-# By default, uses the format recommended by RFC 4180 (see `rfc4180`).
+# By default, the format recognizes EOLs as `\n`
 #
 # ~~~nit
-# var example = new StringIStream("""
-# foo,bar\r
-# "Hello, word!",1234.5 + 42\r
-# "Something\r
-# ""else""\", baz\r
-# """)
-# var reader = new CsvReader(example)
-# var table = new Array[Array[String]]
+# var example = """
+# foo,bar
+# "Hello, word!",1234.5 + 42
+# "Something
+# ""else""\", baz
+# """
+# var reader = new CsvReader.from_string(example)
+# var table = reader.read_all
 #
-# for row in reader do table.add row
-# assert table == [
-# 			["foo","bar"],
-# 			["Hello, word!","1234.5 + 42"],
-# 			["Something\r\n\"else\""," baz"]
-# 		]
+# assert table.header  == ["foo","bar"]
+# assert table.records == [["Hello, word!","1234.5 + 42"],
+# 			["Something\n\"else\""," baz"]]
 # ~~~
 class CsvReader
-	super Iterator[Array[String]]
+	super CsvStream
 
 	# The input stream.
-	var istream: IStream
-
-	# The format to use.
-	#
-	# Defaults to `rfc4180`.
-	var format: CsvFormat = rfc4180 is lazy
+	var istream: Reader
 
 	# Do we skip the empty lines?
 	#
 	# Note: Even if this attribute is `false`, the presence of an line ending at
-	# end of the last row does not change the number of returned rows.
+	# end of the last record does not change the number of returned record.
 	# This is because the line endings are processed as terminators, not as
 	# separators. Therefore, when there is more than one line ending at the end
-	# of the file, the additional lines are interpreted as empty rows that
+	# of the file, the additional lines are interpreted as empty records that
 	# are skipped only if `skip_empty` is set to `true`.
 	#
 	# `false` by default.
 	var skip_empty: Bool = false is writable
 
-	# The last read row.
-	private var row: nullable Array[String] = null
-
-	# Did we read something?
-	private var started = false
-
-	# Create a new reader with the specified format.
-	init with_format(istream:IStream, format: CsvFormat) do
-		self.istream = istream
-		self.format = format
-	end
-
-	# Read the first row, if needed.
-	fun prepare do
-		if not started then
-			row = read_row
-			started = true
-		end
-	end
-
-	redef fun next do
-		prepare
-		assert is_ok else
-			sys.stderr.write "Already at the end of the stream.\n"
-		end
-		row = read_row
-	end
-
-	# Return the last read row.
-	redef fun item do
-		prepare
-		return row.as(not null)
-	end
+	# Creates a new CSVReader from a `string` data
+	init from_string(s: String) do init(new StringReader(s))
 
-	redef fun is_ok do
-		prepare
-		return row != null
-	end
-
-	# Free some internal ressources and set `is_ok` to `false`.
+	# Reads the content of the Stream and interprets it as a CSV Document
 	#
-	# Do not close the input stream.
-	redef fun finish do row = null
-
-	# Close the input stream.
-	fun close do istream.close
-
-	private fun read_row: nullable Array[String] do
-		if istream.eof then return null
-		var row = new Array[String]
-		var value = new RopeBuffer
-
-		# Number of unescaped characters since the last delimiter or separator.
-		var unescaped = 0
-
-		# Do we read the start of a row?
-		var got_row = false
-
-		# Do we found a delimited string in the current cell?
-		var got_delimiter = false
-
-		loop
-			var i = istream.read_char
-			var c: Char
-
-			if i < 0 then
-				if got_row then
-					row.add value.to_s
-					return row
-				else
-					return null
-				end
-			end
-			c = i.ascii
-
-			if c == format.delimiter then
-				if got_delimiter and unescaped == 0 then
-					# Got an escaped delimiter.
-					value.add format.delimiter
-				end
-				# Read all bytes until the delimiter.
-				loop
-					i = istream.read_char
-					assert not_eof: i >= 0 else
-						sys.stderr.write "Unexpected end of file before the end of a delimited value.\n"
+	# Optional parameter `has_header` determines whether the first line
+	# of the CSV Document is header data.
+	# Defaults to true
+	fun read_all(has_header: nullable Bool): CsvDocument do
+		var header: nullable Array[String] = null
+		if has_header == null then has_header = true
+		var iss = istream
+		var res_data = new Array[Array[String]]
+		var eol_st = eol.first
+		var line = new Array[String]
+		var esc = delimiter
+		var sep = separator
+		var eol = eol
+		var is_eol = false
+		var eol_buf = new Buffer.with_cap(eol.length)
+		var c = iss.read_char
+		var el = new Buffer
+		while not iss.eof do
+			if c == null then continue
+			loop
+				if c == esc then
+					c = iss.read_char
+					loop
+						if c == esc then
+							c = iss.read_char
+							if c != esc then break
+						end
+						if c == null then break
+						el.add c
+						c = iss.read_char
 					end
-					c = i.ascii
-					if c == format.delimiter then break
-					value.add c
 				end
-				unescaped = 0
-				got_row = true
-				got_delimiter = true
-			else if c == format.separator then
-				# Flush the value to the row.
-				row.add value.to_s
-				value.clear
-				unescaped = 0
-				got_delimiter = false
-			else
-				value.add c
-				unescaped += 1
-				if unescaped >= format.eol.length and
-						value.has_suffix(format.eol) then
-					var value_trimed = value.substring(0,
-							value.length - format.eol.length).to_s
-					if skip_empty and row.is_empty and
-							value_trimed.is_empty and
-							not got_delimiter then
-						# Skip the empty line.
-						value.clear
-						unescaped = 0
-						got_row = false
-					else
-						row.add value_trimed
-						return row
+				if c == sep then break
+				if c == eol_st then
+					eol_buf.add c.as(not null)
+					is_eol = true
+					for i in [1 .. eol.length[ do
+						c = iss.read_char
+						if c == null or c != eol[i] then
+							is_eol = false
+							el.append(eol_buf)
+							eol_buf.clear
+							break
+						end
+						eol_buf.add c
 					end
-				else
-					got_row = true
+					if not is_eol then continue
+					eol_buf.clear
+					break
 				end
+				if c == sep then break
+				el.add c.as(not null)
+				c = iss.read_char
+				if c == null then break
 			end
+			line.add el.to_s
+			el.clear
+			if is_eol or iss.eof then
+				c = iss.read_char
+				is_eol = false
+				if skip_empty and line.is_empty then
+					continue
+				end
+				if has_header and header == null then
+					header = line
+				else res_data.add line
+				line = new Array[String]
+			end
+			if c == sep then c = iss.read_char
 		end
+		if header == null then header = new Array[String]
+		var doc = new CsvDocument
+		doc.header = header
+		doc.records = res_data
+		return doc
 	end
 end
-
-# The CSV format recommended by [RFC 4180](https://tools.ietf.org/html/rfc4180).
-#
-# * `delimiter`: `'"'`
-# * `separator`: `','`
-# * `eol`: `"\r\n"`
-fun rfc4180: CsvFormat do return once new CsvFormat('"', ',', "\r\n")