# This file is part of NIT ( http://www.nitlanguage.org ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# CSV document handling.
module csv

# Description of a CSV dialect.
#
# A format is fully defined by three elements: the character used to quote
# (delimit) a value, the character that separates cells, and the string that
# terminates a row.
class CsvFormat
	# The character placed around an escaped value.
	#
	# Inside an escaped value, a literal delimiter is written twice.
	var delimiter: Char

	# The character placed between two cells of the same row.
	var separator: Char

	# The string that terminates a row (end of line).
	var eol: String

	# The doubled delimiter that stands for one literal delimiter inside an
	# escaped value. Built on first use.
	private var escaping = "{delimiter}{delimiter}" is lazy

	# Return `cell` enclosed in delimiters, with every delimiter it contains
	# replaced by `escaping`.
	private fun escape_cell(cell: String): Text do
		var escaped = new RopeBuffer
		escaped.add delimiter
		escaped.append cell.replace(delimiter, escaping)
		escaped.add delimiter
		return escaped
	end

	# Is `value` free of any character that has a special meaning in this format?
	#
	# A value is not clean as soon as it contains the delimiter, the separator
	# or any of the characters of `eol`.
	private fun is_value_clean(value: String): Bool do
		for c in value.chars do
			if c == delimiter or c == separator or eol.chars.has(c) then
				return false
			end
		end
		return true
	end
end

# An in-memory CSV document: one header row plus a list of records.
#
# The document can be written to any stream (see `write_to`) and filled from
# a stream or a file (see `load_from` and `load`).
class CsvDocument
	super Writable

	# The format to use when reading and writing.
	#
	# Defaults to `rfc4180`.
	var format: CsvFormat = rfc4180 is writable

	# The header.
	#
	# Contains the name of all fields in this table.
	var header: Array[String] = new Array[String] is writable

	# The list of the records.
	#
	# All records must have the same length as `header`.
	var records: Array[Array[String]] = new Array[Array[String]]

	# Replace the header by the specified row.
	#
	# Each value is converted with `to_s`.
	fun set_header(values: Object...) do
		header.clear
		for value in values do header.add(value.to_s)
	end

	# Append the specified record.
	#
	# Each value is converted with `to_s`.
	# Aborts (assertion failure) if the number of values differs from the
	# number of columns declared by `header`.
	fun add_record(values: Object...) do
		assert values.length == header.length else
			sys.stderr.write "CSV error: Header declares {header.length} columns, record contains {values.length} values.\n"
		end
		var record = new Array[String]
		for value in values do record.add(value.to_s)
		records.add(record)
	end

	# Write the header, then every record, to `stream` using `format`.
	redef fun write_to(stream) do
		var writer = new CsvWriter.with_format(stream, format)
		writer.write_sequence(header)
		for record in records do writer.write_sequence(record)
	end

	# Deprecated alias for `write_to_file`.
	fun save(file: String) do write_to_file(file)

	# Load from the specified stream.
	#
	# The current records are discarded and replaced by the rows read from
	# `stream`. The stream is not closed.
	#
	# Parameters:
	#
	# * `stream`: Input stream.
	# * `has_header`: Is the first row the header? If `true`, the first row
	# replaces `header` (or `header` is cleared when the stream contains no
	# row) and is not stored in `records`. If `false`, `header` is left
	# untouched.
	# * `skip_empty`: Do we skip the empty lines?
	# For details, see `CsvReader.skip_empty`.
	fun load_from(stream: Reader, has_header: Bool, skip_empty: Bool) do
		var reader = new CsvReader.with_format(stream, format)
		reader.skip_empty = skip_empty
		if has_header then
			if reader.is_ok then
				header = reader.item
				# `item` does not advance the reader: consume the header row
				# so that it is not loaded a second time as the first record.
				reader.next
			else
				header.clear
			end
		end
		records.clear
		for record in reader do records.add(record)
	end

	# Load from the specified file.
	#
	# Opens the file, delegates to `load_from`, then closes the file.
	#
	# Parameters:
	#
	# * `path`: Path of the file.
	# * `has_header`: Is the first row the header?
	# * `skip_empty`: Do we skip the empty lines?
	fun load(path: String, has_header: Bool, skip_empty: Bool) do
		var istream = new FileReader.open(path)
		load_from(istream, has_header, skip_empty)
		istream.close
	end
end

# Writes CSV rows to an output stream.
#
# By default, uses the format recommended by RFC 4180 (see `rfc4180`).
#
# Note: A row made of a single empty cell cannot be told apart from an empty
# line. Empty values are always written without delimiters, so that parsers
# do not mistake them for an escaped delimiter.
#
# ~~~nit
# var out = new StringWriter
# var writer = new CsvWriter(out)
# writer.write_row(1, 2.0, "foo\nbar")
# writer.write_sequence([""])
# assert out.to_s == """1,2.0,"foo\nbar"\r\n\r\n"""
# ~~~
class CsvWriter

	# The stream that receives the rows.
	var ostream: Writer

	# The format to use.
	#
	# Defaults to `rfc4180`.
	var format: CsvFormat = rfc4180

	# Do we escape every non-empty cell?
	#
	# When `false` (the default), a cell is escaped only if it contains a
	# metacharacter of the format. Empty cells are never escaped, whatever
	# the value of this option. Setting it to `true` skips the scan of each
	# cell (faster), while leaving it to `false` produces a smaller output.
	#
	# Note: Escaping may not be correctly recognized by some parsers.
	var always_escape = false is writable

	# Create a new writer on `ostream` that uses the specified format.
	init with_format(ostream:Writer, format: CsvFormat) do
		self.format = format
		self.ostream = ostream
	end

	# Append the specified sequence as a row.
	#
	# The representation of each cell is determined by `to_s`.
	# An empty sequence produces only the end of line.
	fun write_sequence(row: SequenceRead[Object]) do
		var separator = format.separator.to_s
		var at_first_cell = true
		for cell in row do
			# A separator goes before every cell except the first one.
			if at_first_cell then
				at_first_cell = false
			else
				ostream.write separator
			end
			write_cell cell.to_s
		end
		ostream.write format.eol
	end

	# Append the specified row.
	#
	# The representation of each cell is determined by `to_s`.
	fun write_row(row: Object...) do write_sequence(row)

	# Close the output stream.
	fun close do ostream.close

	# Write one cell, escaped when required.
	#
	# Nothing is written for an empty cell.
	private fun write_cell(cell: String) do
		if cell.is_empty then return
		if always_escape or not format.is_value_clean(cell) then
			ostream.write format.escape_cell(cell)
		else
			ostream.write cell
		end
	end
end

# Reads rows from a CSV stream.
#
# The reader is an iterator: each item is one row, given as the array of its
# cells.
#
# By default, uses the format recommended by RFC 4180 (see `rfc4180`).
#
# ~~~nit
# var example = new StringReader("""
# foo,bar\r
# "Hello, word!",1234.5 + 42\r
# "Something\r
# ""else""\", baz\r
# """)
# var reader = new CsvReader(example)
# var table = new Array[Array[String]]
#
# for row in reader do table.add row
# assert table == [
# 			["foo","bar"],
# 			["Hello, word!","1234.5 + 42"],
# 			["Something\r\n\"else\""," baz"]
# 		]
# ~~~
class CsvReader
	super Iterator[Array[String]]

	# The input stream.
	var istream: Reader

	# The format to use.
	#
	# Defaults to `rfc4180`.
	var format: CsvFormat = rfc4180 is lazy

	# Do we skip the empty lines?
	#
	# Note: Even if this attribute is `false`, the presence of a line ending at
	# the end of the last row does not change the number of returned rows.
	# This is because the line endings are processed as terminators, not as
	# separators. Therefore, when there is more than one line ending at the end
	# of the file, the additional lines are interpreted as empty rows that
	# are skipped only if `skip_empty` is set to `true`.
	#
	# `false` by default.
	var skip_empty: Bool = false is writable

	# The last read row, or `null` when there is no more row.
	private var row: nullable Array[String] = null

	# Did we already try to read the first row (or was `finish` called)?
	private var started = false

	# Create a new reader on `istream` that uses the specified format.
	init with_format(istream:Reader, format: CsvFormat) do
		self.istream = istream
		self.format = format
	end

	# Read the first row, if needed.
	#
	# Reading is deferred until the first access so that options such as
	# `skip_empty` can be set after the construction.
	fun prepare do
		if not started then
			row = read_row
			started = true
		end
	end

	# Read the next row.
	#
	# Aborts (assertion failure) if the reader is already at the end.
	redef fun next do
		prepare
		assert is_ok else
			sys.stderr.write "Already at the end of the stream.\n"
		end
		row = read_row
	end

	# Return the last read row.
	#
	# Does not advance the reader. Requires `is_ok`.
	redef fun item do
		prepare
		return row.as(not null)
	end

	# Is there a current row?
	redef fun is_ok do
		prepare
		return row != null
	end

	# Free some internal resources and set `is_ok` to `false`.
	#
	# Do not close the input stream.
	redef fun finish do
		row = null
		# Mark the reader as started, so that `prepare` does not read a row
		# (and turn `is_ok` back to `true`) when `finish` is called before
		# anything was read.
		started = true
	end

	# Close the input stream.
	fun close do istream.close

	# Read one row from `istream`.
	#
	# Return `null` when the end of the stream is reached before the start of
	# a row. Aborts (assertion failure) if the stream ends inside a delimited
	# value.
	private fun read_row: nullable Array[String] do
		if istream.eof then return null
		var row = new Array[String]
		var value = new RopeBuffer

		# Number of unescaped characters since the last delimiter or separator.
		var unescaped = 0

		# Did we read the start of a row?
		var got_row = false

		# Did we find a delimited string in the current cell?
		var got_delimiter = false

		loop
			var i = istream.read_char
			var c: Char

			if i < 0 then
				# End of stream: flush the pending cell if a row was started.
				if got_row then
					row.add value.to_s
					return row
				else
					return null
				end
			end
			c = i.ascii

			if c == format.delimiter then
				if got_delimiter and unescaped == 0 then
					# Got an escaped delimiter: a delimited section starts right
					# after the end of the previous one.
					value.add format.delimiter
				end
				# Read all bytes until the delimiter.
				loop
					i = istream.read_char
					assert not_eof: i >= 0 else
						sys.stderr.write "Unexpected end of file before the end of a delimited value.\n"
					end
					c = i.ascii
					if c == format.delimiter then break
					value.add c
				end
				unescaped = 0
				got_row = true
				got_delimiter = true
			else if c == format.separator then
				# Flush the value to the row.
				row.add value.to_s
				value.clear
				unescaped = 0
				got_delimiter = false
				# A separator is part of a row: without this, a last row made
				# only of empty cells and not followed by a line ending would
				# be dropped at the end of the stream.
				got_row = true
			else
				value.add c
				unescaped += 1
				# Only unescaped characters may form a line ending.
				if unescaped >= format.eol.length and
						value.has_suffix(format.eol) then
					var value_trimed = value.substring(0,
							value.length - format.eol.length).to_s
					if skip_empty and row.is_empty and
							value_trimed.is_empty and
							not got_delimiter then
						# Skip the empty line.
						value.clear
						unescaped = 0
						got_row = false
					else
						row.add value_trimed
						return row
					end
				else
					got_row = true
				end
			end
		end
	end
end

# The shared `CsvFormat` instance that follows the recommendations of
# [RFC 4180](https://tools.ietf.org/html/rfc4180).
#
# * `delimiter`: `'"'`
# * `separator`: `','`
# * `eol`: `"\r\n"`
#
# The instance is created on the first call, then reused.
fun rfc4180: CsvFormat do
	var shared_format = once new CsvFormat('"', ',', "\r\n")
	return shared_format
end