X-Git-Url: http://nitlanguage.org diff --git a/lib/csv/csv.nit b/lib/csv/csv.nit index 38680d0..c6f0b41 100644 --- a/lib/csv/csv.nit +++ b/lib/csv/csv.nit @@ -12,21 +12,71 @@ # See the License for the specific language governing permissions and # limitations under the License. -# CSV output facilities +# CSV document handling. module csv +# Specifies a CSV format. +class CsvFormat + # The character that delimits escaped values. + # + # The delimiter is escaped by doubling it. + var delimiter: Char + + # The character that separates the cells of a row. + var separator: Char + + # The character sequence that ends a row (end of line). + var eol: String + + # Escape sequence for the delimiter. + private var escaping = "{delimiter}{delimiter}" is lazy + + # Escape the specified cell. + private fun escape_cell(cell: String): Text do + var result = new RopeBuffer + result.add delimiter + result.append cell.replace(delimiter, escaping) + result.add delimiter + return result + end + + # Can the specified value be inserted without any escaping? + private fun is_value_clean(value: String): Bool do + for c in value.chars do + if c == delimiter then return false + if c == separator then return false + if eol.chars.has(c) then return false + end + return true + end +end + # A CSV document representation. class CsvDocument - super Streamable + super Writable + # The format to use. + # + # Defaults to `rfc4180`. + var format: CsvFormat = rfc4180 is writable + + # The header. + # + # Contains the names of all fields in this table. var header: Array[String] = new Array[String] is writable + + # The list of the records. + # + # All records must have the same length as `header`. var records: Array[Array[String]] = new Array[Array[String]] + # Replace the header with the specified row. fun set_header(values: Object...) do header.clear for value in values do header.add(value.to_s) end + # Append the specified record. fun add_record(values: Object...) 
do assert values.length == header.length else sys.stderr.write "CSV error: Header declares {header.length} columns, record contains {values.length} values.\n" @@ -36,26 +86,300 @@ class CsvDocument records.add(record) end - private fun write_line_to(line: Collection[String], stream: OStream) - do - var i = line.iterator - if i.is_ok then - stream.write(i.item) + redef fun write_to(stream) do + var writer = new CsvWriter.with_format(stream, format) + writer.write_sequence(header) + for record in records do writer.write_sequence(record) + end + + # Deprecated alias for `write_to_file`. + fun save(file: String) do write_to_file(file) + + # Load from the specified stream. + # + # Parameters: + # + # * `stream`: Input stream. + # * `has_header`: Is the first row the header? + # * `skip_empty`: Do we skip the empty lines? + # For details, see `CsvReader.skip_empty`. NOTE(review): with `has_header`, the header row is taken via `item` but `next` is not called before the records loop below; confirm the header row is not also appended to `records`. + fun load_from(stream: Reader, has_header: Bool, skip_empty: Bool) do + var reader = new CsvReader.with_format(stream, format) + reader.skip_empty = skip_empty + if has_header then + if reader.is_ok then + header = reader.item + else + header.clear + end + end + records.clear + for record in reader do records.add(record) + end + + # Load from the specified file. + # + # Parameters: + # + # * `path`: Path of the file. + # * `has_header`: Is the first row the header? + # * `skip_empty`: Do we skip the empty lines? + fun load(path: String, has_header: Bool, skip_empty: Bool) do + var istream = new FileReader.open(path) + load_from(istream, has_header, skip_empty) + istream.close + end +end + +# Appends CSV rows to an output stream. +# +# By default, uses the format recommended by RFC 4180 (see `rfc4180`). +# +# Note: If a row contains only an empty cell, its representation is +# indistinguishable from an empty line. This is because the empty values are +# always written unescaped, to prevent some parsers from interpreting them as +# escaped delimiters. 
+# +# ~~~nit +# var out = new StringWriter +# var writer = new CsvWriter(out) +# writer.write_row(1, 2.0, "foo\nbar") +# writer.write_sequence([""]) +# assert out.to_s == """1,2.0,"foo\nbar"\r\n\r\n""" +# ~~~ +class CsvWriter + + # The output stream. + var ostream: Writer + + # The format to use. + # + # Defaults to `rfc4180`. + var format: CsvFormat = rfc4180 + + # Do we escape all cells (except empty ones)? + # + # If `false` (the default), escape only cells that contain a metacharacter + # of the format. In all cases, empty cells are not escaped. This option + # lets the caller choose between optimizing performance (when + # `true`) and optimizing the size of the output (when `false`). + # + # Note: Escaping may not be correctly recognized by some parsers. + var always_escape = false is writable + + # Create a new writer with the specified format. + init with_format(ostream:Writer, format: CsvFormat) do + self.ostream = ostream + self.format = format + end + + # Append the specified sequence as a row. + # + # The representation of each cell is determined by `to_s`. + fun write_sequence(row: SequenceRead[Object]) do + if not row.is_empty then + var i = row.iterator + var separator = format.separator.to_s + write_cell i.item.to_s i.next - while i.is_ok do - stream.write(";") - stream.write(i.item) - i.next + for cell in i do + ostream.write separator + write_cell cell.to_s end end - stream.write("\n") + ostream.write format.eol end - redef fun write_to(stream) do - write_line_to(header, stream) - for record in records do write_line_to(record, stream) + # Append the specified row. + # + # The representation of each cell is determined by `to_s`. + fun write_row(row: Object...) do write_sequence(row) + + # Close the output stream. 
+ fun close do ostream.close + + private fun write_cell(cell: String) do + if cell.is_empty then return + if not always_escape and format.is_value_clean(cell) then + ostream.write cell + else + ostream.write format.escape_cell(cell) + end end +end - # Deprecated alias for `write_to_file`. - fun save(file: String) do write_to_file(file) +# Reads rows from a CSV input stream. +# +# By default, uses the format recommended by RFC 4180 (see `rfc4180`). +# +# ~~~nit +# var example = new StringReader(""" +# foo,bar\r +# "Hello, word!",1234.5 + 42\r +# "Something\r +# ""else""\", baz\r +# """) +# var reader = new CsvReader(example) +# var table = new Array[Array[String]] +# +# for row in reader do table.add row +# assert table == [ +# ["foo","bar"], +# ["Hello, word!","1234.5 + 42"], +# ["Something\r\n\"else\""," baz"] +# ] +# ~~~ +class CsvReader + super Iterator[Array[String]] + + # The input stream. + var istream: Reader + + # The format to use. + # + # Defaults to `rfc4180`. + var format: CsvFormat = rfc4180 is lazy + + # Do we skip the empty lines? + # + # Note: Even if this attribute is `false`, the presence of a line ending at + # the end of the last row does not change the number of returned rows. + # This is because the line endings are processed as terminators, not as + # separators. Therefore, when there is more than one line ending at the end + # of the file, the additional lines are interpreted as empty rows that + # are skipped only if `skip_empty` is set to `true`. + # + # `false` by default. + var skip_empty: Bool = false is writable + + # The last read row. + private var row: nullable Array[String] = null + + # Did we read something? + private var started = false + + # Create a new reader with the specified format. + init with_format(istream:Reader, format: CsvFormat) do + self.istream = istream + self.format = format + end + + # Read the first row, if needed. 
+ fun prepare do + if not started then + row = read_row + started = true + end + end + + redef fun next do + prepare + assert is_ok else + sys.stderr.write "Already at the end of the stream.\n" + end + row = read_row + end + + # Return the last read row. + redef fun item do + prepare + return row.as(not null) + end + + redef fun is_ok do + prepare + return row != null + end + + # Free some internal resources and set `is_ok` to `false`. + # + # Do not close the input stream. + redef fun finish do row = null + + # Close the input stream. + fun close do istream.close + + private fun read_row: nullable Array[String] do + if istream.eof then return null + var row = new Array[String] + var value = new RopeBuffer + + # Number of unescaped characters since the last delimiter or separator. + var unescaped = 0 + + # Did we read the start of a row? + var got_row = false + + # Did we find a delimited string in the current cell? + var got_delimiter = false + + loop + var i = istream.read_char + var c: Char + + if i < 0 then + if got_row then + row.add value.to_s + return row + else + return null + end + end + c = i.ascii + + if c == format.delimiter then + if got_delimiter and unescaped == 0 then + # Got an escaped delimiter. + value.add format.delimiter + end + # Read all characters until the next delimiter. + loop + i = istream.read_char + assert not_eof: i >= 0 else + sys.stderr.write "Unexpected end of file before the end of a delimited value.\n" + end + c = i.ascii + if c == format.delimiter then break + value.add c + end + unescaped = 0 + got_row = true + got_delimiter = true + else if c == format.separator then + # Flush the value to the row. 
+ row.add value.to_s + value.clear + unescaped = 0 + got_delimiter = false + else + value.add c + unescaped += 1 + if unescaped >= format.eol.length and + value.has_suffix(format.eol) then + var value_trimed = value.substring(0, + value.length - format.eol.length).to_s + if skip_empty and row.is_empty and + value_trimed.is_empty and + not got_delimiter then + # Skip the empty line: discard the buffered line ending and keep reading. + value.clear + unescaped = 0 + got_row = false + else + row.add value_trimed + return row + end + else + got_row = true + end + end + end + end end + +# The CSV format recommended by [RFC 4180](https://tools.ietf.org/html/rfc4180). +# +# * `delimiter`: `'"'` +# * `separator`: `','` +# * `eol`: `"\r\n"` +fun rfc4180: CsvFormat do return once new CsvFormat('"', ',', "\r\n")