From: Lucas Bajolet Date: Fri, 29 Apr 2016 17:43:16 +0000 (-0400) Subject: lib/csv: Major rewriting for performance and simplicity X-Git-Url: http://nitlanguage.org lib/csv: Major rewriting for performance and simplicity Signed-off-by: Lucas Bajolet --- diff --git a/lib/csv/csv.nit b/lib/csv/csv.nit index 57590ad..4542553 100644 --- a/lib/csv/csv.nit +++ b/lib/csv/csv.nit @@ -15,127 +15,160 @@ # CSV document handling. module csv -# Specifies a CSV format. -class CsvFormat - # The character that delimits escaped value. - # - # The delimiter is escaped by doubling it. - var delimiter: Char - - # The character that split each cell in a row. - var separator: Char - - # The character that ends a row (end of line). - var eol: String +redef class Text + # Escape the content of `self` for inclusion in a CSV document + private fun escape_to_csv(sep_char, delim_char: Char, eol: String): String do + var add_sp = chars_to_escape_csv(sep_char, delim_char, eol) + if add_sp == 0 then return to_s + var bf = new Buffer.with_cap(add_sp + bytelen) + bf.add '"' + for i in [0 .. length[ do + var c = self[i] + if c == delim_char then + bf.add c + end + bf.add c + end + bf.add '"' + return bf.to_s + end - # Escape sequence for the delimiter. - private var escaping = "{delimiter}{delimiter}" is lazy + # How many more bytes should be allocated for CSV escaping ? + private fun chars_to_escape_csv(sep_char, delim_char: Char, eol: String): Int do + var more_ln = 0 + var ln = length + var need_esc = false + var fst_eol = eol.first + var i = 0 + while i < ln do + var c = self[i] + if c == delim_char then more_ln += 1 + if c == fst_eol then + need_esc = true + for j in [1 .. eol.length[ do + i += 1 + c = self[i] + if c != eol[j] then + i -= j + need_esc = false + break + end + end + end + if c == sep_char then need_esc = true + i += 1 + end + var more = more_ln * delim_char.u8char_len + if need_esc then more += 2 + return more + end - # Escape the specified cell. - private fun escape_cell(cell: String): Text do - var result = new RopeBuffer - result.add delimiter - result.append cell.replace(delimiter, escaping) - result.add delimiter - return result + # Unescape the content of `self` from CSV format to Nit String + private fun unescape_csv(delim_char: Char): String do + var to_un = chars_to_unescape_csv(delim_char) + if to_un == 0 then return to_s + var buf = new Buffer.with_cap(bytelen - to_un) + var pos = 0 + var ln = length + while pos < ln do + var c = self[pos] + if c == delim_char then pos += 1 + buf.add c + pos += 1 + end + return buf.to_s end - # Can the specified value be inserted without any escaping? - private fun is_value_clean(value: String): Bool do - for c in value.chars do - if c == delimiter then return false - if c == separator then return false - if eol.chars.has(c) then return false + # How many bytes should be removed for CSV unescaping ? + private fun chars_to_unescape_csv(delim_char: Char): Int do + var pos = 0 + var to_un = 0 + var ln = length + while pos < ln do + var c = self[pos] + if c == delim_char then + pos += 1 + to_un += 1 + end + pos += 1 end - return true + return to_un end end +# Shared properties by all CSV-related classes +# +# This class is basically only here for implementation purposes and should not be used +# by clients for typing. +abstract class CsvStream + # The character that delimits escaped value. + # + # The delimiter is escaped by doubling it. + var delimiter = '"' is writable + + # The character that split each cell in a record. + var separator = ',' is writable + + # The character that ends a record (end of line). + var eol = "\n" is writable +end + # A CSV document representation. class CsvDocument super Writable - - # The format to use. - # - # Defaults to `rfc4180`. - var format: CsvFormat = rfc4180 is writable + super CsvStream # The header. # # Contains the name of all fields in this table. - var header: Array[String] = new Array[String] is writable + var header = new Array[String] is writable, optional # The list of the records. # # All records must have the same length than `header`. - var records: Array[Array[String]] = new Array[Array[String]] + var records = new Array[Array[String]] is writable, optional - # Replace the header by the specified row. - fun set_header(values: Object...) do - header.clear - for value in values do header.add(value.to_s) - end - - # Append the specfied record. - fun add_record(values: Object...) do - assert values.length == header.length else - sys.stderr.write "CSV error: Header declares {header.length} columns, record contains {values.length} values.\n" - end - var record = new Array[String] - for value in values do record.add(value.to_s) - records.add(record) + # Adds a new record to document containing the values in `objs` + fun add_record(objs: Object...) do + var ln = new Array[String].with_capacity(objs.length) + for i in objs do ln.add(i.to_s) + records.add ln end redef fun write_to(stream) do - var writer = new CsvWriter.with_format(stream, format) - writer.write_sequence(header) - for record in records do writer.write_sequence(record) + var s = new CsvWriter(stream) + s.separator = separator + s.eol = eol + s.delimiter = delimiter + if not header.is_empty then + s.write_line header + end + s.write_lines(records) end - # Deprecated alias for `write_to_file`. - fun save(file: String) do write_to_file(file) - # Load from the specified stream. # # Parameters: # # * `stream`: Input stream. - # * `has_header`: Is the first row the header? - # * `skip_empty`: Do we skip the empty lines? - # For details, see `CsvReader.skip_empty`. - fun load_from(stream: Reader, has_header: Bool, skip_empty: Bool) do - var reader = new CsvReader.with_format(stream, format) + # * `has_header`: Is the first record the header? - defaults to true + # * `skip_empty`: Do we skip the empty lines? - defaults to true + fun load_from(stream: Reader, has_header: nullable Bool, skip_empty: nullable Bool) do + if has_header == null then has_header = true + if skip_empty == null then skip_empty = true + var reader = new CsvReader(stream) + reader.separator = separator + reader.eol = eol + reader.delimiter = delimiter reader.skip_empty = skip_empty - if has_header then - if reader.is_ok then - header = reader.item - else - header.clear - end - end - records.clear - for record in reader do records.add(record) - end - - # Load from the specified file. - # - # Parameters: - # - # * `path`: Path of the file. - # * `has_header`: Is the first row the header? - # * `skip_empty`: Do we skip the empty lines? - fun load(path: String, has_header: Bool, skip_empty: Bool) do - var istream = new FileReader.open(path) - load_from(istream, has_header, skip_empty) - istream.close end end -# Appends CSV rows to a file. +# Appends CSV records to a file. # # By default, uses the format recommended by RFC 4180 (see `rfc4180`). # -# Note: If a row contains only an empty cell, its representation is +# Note: If a record contains only an empty cell, its representation is # undistinguishable from an empty line. This is because the empty values are # always written unescaped in order to avoid them to be interpreted as escaped # delimiters by some parsers. @@ -143,240 +176,167 @@ end # ~~~nit # var out = new StringWriter # var writer = new CsvWriter(out) -# writer.write_row(1, 2.0, "foo\nbar") -# writer.write_sequence([""]) -# assert out.to_s == """1,2.0,"foo\nbar"\r\n\r\n""" +# writer.write_elements(1, 2.0, "foo\nbar") +# writer.write_line([""]) +# assert out.to_s == """1,2.0,"foo\nbar"\n\n""" # ~~~ class CsvWriter + super CsvStream # The output stream. var ostream: Writer - # The format to use. - # - # Defaults to `rfc4180`. - var format: CsvFormat = rfc4180 + # Write several lines to a stream + fun write_lines(lines: Array[Array[Object]]) do for i in lines do write_line i - # Do we escape all cells (except empty ones)? - # - # If `false` (the default), escape only cells that contain a metacharacter - # of the format. In all cases, empty cells are not escaped. This option - # permits to choose between the optimization of the performances (when - # `true`) and optimization of the size of the output (when `false`). - # - # Note: Escaping may not be correctly recognized by some parsers. - var always_escape = false is writable - - # Create a new writer with the specified format. - init with_format(ostream:Writer, format: CsvFormat) do - init(ostream) - self.format = format - end - - # Append the specified sequence as a row. + # Append the elements in `els` as a record. # # The representation of each cell is determined by `to_s`. - fun write_sequence(row: SequenceRead[Object]) do - if not row.is_empty then - var i = row.iterator - var separator = format.separator.to_s - write_cell i.item.to_s - i.next - for cell in i do - ostream.write separator - write_cell cell.to_s - end + fun write_elements(els: Object...) do + var os = ostream + var esc = delimiter + var sep = separator + var eol = eol + for i in [0 .. els.length - 1[ do + os.write(els[i].to_s.escape_to_csv(sep, esc, eol)) + os.write_char(sep) end - ostream.write format.eol + os.write(els.last.to_s.escape_to_csv(sep, esc, eol)) + os.write(eol) end - # Append the specified row. + # Append the specified record. # # The representation of each cell is determined by `to_s`. - fun write_row(row: Object...) do write_sequence(row) - - # Close the output stream. - fun close do ostream.close - - private fun write_cell(cell: String) do - if cell.is_empty then return - if not always_escape and format.is_value_clean(cell) then - ostream.write cell - else - ostream.write format.escape_cell(cell) + fun write_line(line: Array[Object]) do + var os = ostream + var esc = delimiter + var sep = separator + var eol = eol + for i in [0 .. line.length - 1[ do + os.write(line[i].to_s.escape_to_csv(sep, esc, eol)) + os.write_char(sep) end + os.write(line.last.to_s.escape_to_csv(sep, esc, eol)) + os.write(eol) end end -# Reads rows from a CSV file. +# Reads records from a CSV file. # -# By default, uses the format recommended by RFC 4180 (see `rfc4180`). +# By default, the format recognizes EOLs as `\n` # # ~~~nit -# var example = new StringReader(""" -# foo,bar\r -# "Hello, word!",1234.5 + 42\r -# "Something\r -# ""else""\", baz\r -# """) -# var reader = new CsvReader(example) -# var table = new Array[Array[String]] +# var example = """ +# foo,bar +# "Hello, word!",1234.5 + 42 +# "Something +# ""else""\", baz +# """ +# var reader = new CsvReader.from_string(example) +# var table = reader.read_all # -# for row in reader do table.add row -# assert table == [ -# ["foo","bar"], -# ["Hello, word!","1234.5 + 42"], -# ["Something\r\n\"else\""," baz"] -# ] +# assert table.header == ["foo","bar"] +# assert table.records == [["Hello, word!","1234.5 + 42"], +# ["Something\n\"else\""," baz"]] # ~~~ class CsvReader - super Iterator[Array[String]] + super CsvStream # The input stream. var istream: Reader - # The format to use. - # - # Defaults to `rfc4180`. - var format: CsvFormat = rfc4180 is lazy - # Do we skip the empty lines? # # Note: Even if this attribute is `false`, the presence of an line ending at - # end of the last row does not change the number of returned rows. + # end of the last record does not change the number of returned record. # This is because the line endings are processed as terminators, not as # separators. Therefore, when there is more than one line ending at the end - # of the file, the additional lines are interpreted as empty rows that + # of the file, the additional lines are interpreted as empty records that # are skipped only if `skip_empty` is set to `true`. # # `false` by default. var skip_empty: Bool = false is writable - # The last read row. - private var row: nullable Array[String] = null - - # Did we read something? - private var started = false - - # Create a new reader with the specified format. - init with_format(istream:Reader, format: CsvFormat) do - init(istream) - self.format = format - end - - # Read the first row, if needed. - fun prepare do - if not started then - row = read_row - started = true - end - end - - redef fun next do - prepare - assert is_ok else - sys.stderr.write "Already at the end of the stream.\n" - end - row = read_row - end - - # Return the last read row. - redef fun item do - prepare - return row.as(not null) - end + # Creates a new CSVReader from a `string` data + init from_string(s: String) do init(new StringReader(s)) - redef fun is_ok do - prepare - return row != null - end - - # Free some internal ressources and set `is_ok` to `false`. + # Reads the content of the Stream and interprets it as a CSV Document # - # Do not close the input stream. - redef fun finish do row = null - - # Close the input stream. - fun close do istream.close - - private fun read_row: nullable Array[String] do - if istream.eof then return null - var row = new Array[String] - var value = new RopeBuffer - - # Number of unescaped characters since the last delimiter or separator. - var unescaped = 0 - - # Do we read the start of a row? - var got_row = false - - # Do we found a delimited string in the current cell? - var got_delimiter = false - - loop - var c = istream.read_char - - if c == null then - if got_row then - row.add value.to_s - return row - else - return null - end - end - - if c == format.delimiter then - if got_delimiter and unescaped == 0 then - # Got an escaped delimiter. - value.add format.delimiter - end - # Read all bytes until the delimiter. - loop - c = istream.read_char - assert not_eof: c != null else - sys.stderr.write "Unexpected end of file before the end of a delimited value.\n" + # Optional parameter `has_header` determines whether the first line + # of the CSV Document is header data. + # Defaults to true + fun read_all(has_header: nullable Bool): CsvDocument do + var header: nullable Array[String] = null + if has_header == null then has_header = true + var iss = istream + var res_data = new Array[Array[String]] + var eol_st = eol.first + var line = new Array[String] + var esc = delimiter + var sep = separator + var eol = eol + var is_eol = false + var eol_buf = new Buffer.with_cap(eol.length) + var c = iss.read_char + var el = new Buffer + while not iss.eof do + if c == null then continue + loop + if c == esc then + c = iss.read_char + loop + if c == esc then + c = iss.read_char + if c != esc then break + end + if c == null then break + el.add c + c = iss.read_char end - if c == format.delimiter then break - value.add c end - unescaped = 0 - got_row = true - got_delimiter = true - else if c == format.separator then - # Flush the value to the row. - row.add value.to_s - value.clear - unescaped = 0 - got_delimiter = false - else - value.add c - unescaped += 1 - if unescaped >= format.eol.length and - value.has_suffix(format.eol) then - var value_trimed = value.substring(0, - value.length - format.eol.length).to_s - if skip_empty and row.is_empty and - value_trimed.is_empty and - not got_delimiter then - # Skip the empty line. - value.clear - unescaped = 0 - got_row = false - else - row.add value_trimed - return row + if c == sep then break + if c == eol_st then + eol_buf.add c.as(not null) + is_eol = true + for i in [1 .. eol.length[ do + c = iss.read_char + if c == null or c != eol[i] then + is_eol = false + el.append(eol_buf) + eol_buf.clear + break + end + eol_buf.add c end - else - got_row = true + if not is_eol then continue + eol_buf.clear + break end + if c == sep then break + el.add c.as(not null) + c = iss.read_char + if c == null then break end + line.add el.to_s + el.clear + if is_eol or iss.eof then + c = iss.read_char + is_eol = false + if skip_empty and line.is_empty then + continue + end + if has_header and header == null then + header = line + else res_data.add line + line = new Array[String] + end + if c == sep then c = iss.read_char end + if header == null then header = new Array[String] + var doc = new CsvDocument + doc.header = header + doc.records = res_data + return doc end end - -# The CSV format recommended by [RFC 4180](https://tools.ietf.org/html/rfc4180). -# -# * `delimiter`: `'"'` -# * `separator`: `','` -# * `eol`: `"\r\n"` -fun rfc4180: CsvFormat do return once new CsvFormat('"', ',', "\r\n") diff --git a/lib/csv/test_csv.nit b/lib/csv/test_csv.nit deleted file mode 100644 index d23d007..0000000 --- a/lib/csv/test_csv.nit +++ /dev/null @@ -1,201 +0,0 @@ -# This file is part of NIT ( http://www.nitlanguage.org ). -# -# This file is free software, which comes along with NIT. This software is -# distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; -# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. You can modify it is you want, provided this header -# is kept unaltered, and a notification of the changes is added. -# You are allowed to redistribute it and sell it, alone or is a part of -# another product. - -# Tests for `csv`. -module test_csv is test_suite - -import test_suite -import csv - -class TestCsvWriter - super TestSuite - - # The custom CSV format used in the tests. - private var custom_format = new CsvFormat('/', ':', "#") - - # Expect to write `row` as `expected_rfc4180` and as `expected_custom`. - # - # Parameters: - # - # * `always_escape`: value of the `always_escape` option. - # * `row`: row to write. - # * `expected_rfc4180`: expected result in RFC 4180. - # * `expected_custom`: expected result in the custom CSV format. - private fun expect(always_escape: Bool, row: SequenceRead[String], - expected_rfc4180: String, - expected_custom: String) do - var out = new StringWriter - var writer = new CsvWriter(out) - - writer.always_escape = always_escape - writer.write_sequence(row) - assert out.to_s == expected_rfc4180 else - sys.stderr.write "\nFormat: RFC 4180\n" - sys.stderr.write "Expecting: \"{expected_rfc4180.escape_to_nit}\"\n" - sys.stderr.write "Got: \"{out.to_s.escape_to_nit}\"\n" - end - writer.close - - out = new StringWriter - writer = new CsvWriter.with_format(out, custom_format) - writer.always_escape = always_escape - writer.write_sequence(row) - assert out.to_s == expected_custom else - sys.stderr.write "\nFormat: {custom_format.delimiter}" - sys.stderr.write " {custom_format.separator}" - sys.stderr.write " {custom_format.eol.escape_to_nit}\n" - sys.stderr.write "Expecting: \"{expected_custom.escape_to_nit}\"\n" - sys.stderr.write "Got: \"{out.to_s.escape_to_nit}\"\n" - end - writer.close - end - - fun test_empty do expect(true, new Array[String], "\r\n", "#") - - fun test_one_cell do expect(true, ["foo/\"\r\n,"], - "\"foo/\"\"\r\n,\"\r\n", - "/foo//\"\r\n,/#") - - fun test_optimize_size_escaped do expect(false, ["foo/\"\r\n,"], - "\"foo/\"\"\r\n,\"\r\n", - "/foo//\"\r\n,/#") - - fun test_optimize_size_eol do expect(false, ["foo\r#\n"], - "\"foo\r#\n\"\r\n", - "/foo\r#\n/#") - - fun test_optimize_size_unescaped do expect(false, ["foo"], - "foo\r\n", - "foo#") - - fun test_multiple_cells do expect(true, ["1", "", "/"], - "\"1\",,\"/\"\r\n", - "/1/::////#") - - fun test_multiple_cells_optimize_size do expect(false, ["1", "", "/"], - "1,,/\r\n", - "1::////#") -end - -class TestCsvReader - super TestSuite - - # The custom CSV format used in the tests. - private var custom_format = new CsvFormat('/', ':', "#") - - # Expect to read `expected`. - # - # Parameters: - # - # * `skip_empty`: value of the `skip_empty` option. - # * `modal_escaping`: value of the `modal_escaping` option. - # * `input_rfc4180`: input in the RFC 4180 format. - # * `input_custom`: input in the custom CSV format. - # * `expected`: expected resulting table. - private fun expect(skip_empty: Bool, - input_rfc4180: String, - input_custom: String, - expected: SequenceRead[SequenceRead[String]]) do - var istream: Reader - var reader: CsvReader - - istream = new StringReader(input_rfc4180) - reader = new CsvReader(istream) - reader.skip_empty = skip_empty - assert_table_equals("RFC 4180", reader, expected.iterator) - - istream = new StringReader(input_custom) - reader = new CsvReader.with_format(istream, custom_format) - reader.skip_empty = skip_empty - assert_table_equals("{custom_format.delimiter} " + - "{custom_format.separator} " + - "{custom_format.eol.escape_to_nit}", reader, expected.iterator) - end - - # Check if tables are equal. - private fun assert_table_equals(format: String, - actual: Iterator[SequenceRead[String]], - expected: Iterator[SequenceRead[String]]) do - var i = 0 - - for actual_row in actual do - assert expected.is_ok else fail(format,"Too many rows.") - var expected_row = expected.item - assert_row_equals(format, i, actual_row, expected_row) - expected.next - i += 1 - end - assert not expected.is_ok else fail(format, "Not enough rows.") - expected.finish - end - - # Check if rows are equal. - private fun assert_row_equals(format: String, - row_index: Int, - actual: SequenceRead[String], - expected: SequenceRead[String]) do - assert actual == expected else - fail(format, """ -At row {{{row_index}}}. -Expecting: {{{expected.join("|")}}} -Got: {{{actual.join("|")}}}""") - end - end - - # Output an error message with an indication of the format used. - private fun fail(format: Text, message: Text) do - sys.stderr.write "\nFormat: {format}\n" - sys.stderr.write message - sys.stderr.write "\n" - end - - fun test_empty do expect(false, "", "", new Array[Array[String]]) - - fun test_empty_eol do expect(false, "\r\n", "#", [[""]]) - - fun test_empty_skip do expect(true, "", "", new Array[Array[String]]) - - fun test_empty_skip1 do expect(true, "\r\n", "#", new Array[Array[String]]) - - fun test_empty_skip2 do expect(true, "\r\n\r\n", "##", new Array[Array[String]]) - - fun test_escaped do expect(false, "\"foo/\"\"\r\n,\"\r\n", - "/foo//\"\r\n,/#", [["foo/\"\r\n,"]]) - - fun test_unescaped do expect(false, "foo bar\r\n", - "foo bar#", [["foo bar"]]) - - fun test_escaped_no_eol do expect(false, "\"foo/\"\"\r\n,\"", - "/foo//\"\r\n,/", [["foo/\"\r\n,"]]) - - fun test_unescaped_no_eol do expect(false, "foo bar", - "foo bar", [["foo bar"]]) - - fun test_multiple_cells do expect(false, "\"1\",,\"/\"\r\n", - "/1/::////#", [["1", "", "/"]]) - - fun test_multiple_cells_unescaped do expect(false, "1,,/\r\n", - "1::////#", [["1", "", "/"]]) - - fun test_modal_escaping do expect(false, """a"b""/c","d"e""", - """/ab"///c:d/e/""", [["""ab"/c""", "de"]]) - - fun test_skip_start do expect(true, "\r\n1,,/\r\n", - "#1::////#", [["1", "", "/"]]) - - fun test_dont_skip_empty_delimited do expect(true, "\"\"\r\n", - "//#", [[""]]) - - fun test_dont_skip_multiple_empty_cells do expect(true, ",\r\n", - ":#", [["",""]]) - - fun test_mutiple_rows do expect(false, "\"a\r\nb#\",c\r\nd,\r\n,e\r\n", - "/a\r\nb#/:c#d:#:e#", [["a\r\nb#", "c"], ["d", ""], ["", "e"]]) -end