X-Git-Url: http://nitlanguage.org diff --git a/lib/core/text/abstract_text.nit b/lib/core/text/abstract_text.nit index 64c22c0..58a1c20 100644 --- a/lib/core/text/abstract_text.nit +++ b/lib/core/text/abstract_text.nit @@ -68,7 +68,7 @@ abstract class Text fun substring(from: Int, count: Int): SELFTYPE is abstract # Iterates on the substrings of self if any - fun substrings: Iterator[FlatText] is abstract + private fun substrings: Iterator[FlatText] is abstract # Is the current Text empty (== "") # @@ -248,7 +248,17 @@ abstract class Text # If `self` contains only digits and alpha <= 'f', return the corresponding integer. # # assert "ff".to_hex == 255 - fun to_hex: Int do return a_to(16) + fun to_hex(pos, ln: nullable Int): Int do + var res = 0 + if pos == null then pos = 0 + if ln == null then ln = length - pos + var max = pos + ln + for i in [pos .. max[ do + res <<= 4 + res += self[i].from_hex + end + return res + end # If `self` contains only digits <= '7', return the corresponding integer. # @@ -295,26 +305,32 @@ abstract class Text end end - # Returns `true` if the string contains only Numeric values (and one "," or one "." character) + # Is this string in a valid numeric format compatible with `to_f`? # # assert "123".is_numeric == true # assert "1.2".is_numeric == true - # assert "1,2".is_numeric == true + # assert "-1.2".is_numeric == true + # assert "-1.23e-2".is_numeric == true # assert "1..2".is_numeric == false + # assert "".is_numeric == false fun is_numeric: Bool do - var has_point_or_comma = false + var has_point = false + var e_index = -1 for i in [0..length[ do var c = chars[i] if not c.is_numeric then - if (c == '.' or c == ',') and not has_point_or_comma then - has_point_or_comma = true + if c == '.' 
and not has_point then + has_point = true + else if c == 'e' and e_index == -1 and i > 0 and i < length - 1 and chars[i-1] != '-' then + e_index = i + else if c == '-' and i == e_index + 1 and i < length - 1 then else return false end end end - return true + return not is_empty end # Returns `true` if the string contains only Hex chars @@ -733,21 +749,32 @@ abstract class Text # assert "\\ud800\\udfd3".from_utf16_escape == '𐏓' # assert "\\u00e8".from_utf16_escape == 'è' # assert "\\u3042".from_utf16_escape == 'あ' - fun from_utf16_escape: Char do - var ln = length - if ln != 6 and ln != 12 then return 0xFFFD.code_point - var cphi = substring(2, 4).to_hex - if cphi < 0xD800 then return cphi.code_point - if cphi > 0xDFFF then return cphi.code_point - if cphi > 0xDBFF then return 0xFFFD.code_point - var cp = 0 - cp += (cphi - 0xD800) << 10 - var cplo = substring(8, 4).to_hex + fun from_utf16_escape(pos, ln: nullable Int): Char do + if pos == null then pos = 0 + if ln == null then ln = length - pos + if ln < 6 then return 0xFFFD.code_point + var cp = from_utf16_digit(pos + 2) + if cp < 0xD800 then return cp.code_point + if cp > 0xDFFF then return cp.code_point + if cp > 0xDBFF then return 0xFFFD.code_point + if ln == 6 then return 0xFFFD.code_point + if ln < 12 then return 0xFFFD.code_point + cp <<= 16 + cp += from_utf16_digit(pos + 8) + var cplo = cp & 0xFFFF if cplo < 0xDC00 then return 0xFFFD.code_point if cplo > 0xDFFF then return 0xFFFD.code_point - cp += cplo - 0xDC00 - cp += 0x10000 - return cp.code_point + return cp.from_utf16_surr.code_point + end + + # Returns the integer value of the 4 hexadecimal digits starting at `pos` + # + # var s = "\\ud800\\udfd3" + # assert s.from_utf16_digit(2) == 0xD800 + # assert s.from_utf16_digit(8) == 0xDFD3 + fun from_utf16_digit(pos: nullable Int): Int do + if pos == null then pos = 0 + return to_hex(pos, 4) end # Encode `self` to percent (or URL) encoding @@ -831,7 +858,7 @@ abstract class Text l += 1 end - return buf.to_s_with_length(l) + return buf.to_s_unsafe(l) 
end # Escape the characters `<`, `>`, `&`, `"`, `'` and `/` as HTML/XML entity references. @@ -974,6 +1001,65 @@ abstract class Text return s.plain_to_s end + # Return the Levenshtein distance between two strings + # + # ~~~ + # assert "abcd".levenshtein_distance("abcd") == 0 + # assert "".levenshtein_distance("abcd") == 4 + # assert "abcd".levenshtein_distance("") == 4 + # assert "abcd".levenshtein_distance("xyz") == 4 + # assert "abcd".levenshtein_distance("xbdy") == 3 + # ~~~ + fun levenshtein_distance(other: String): Int + do + var slen = self.length + var olen = other.length + + # fast cases + if slen == 0 then return olen + if olen == 0 then return slen + if self == other then return 0 + + # previous row of distances + var v0 = new Array[Int].with_capacity(olen+1) + + # current row of distances + var v1 = new Array[Int].with_capacity(olen+1) + + for j in [0..olen] do + # prefix insert cost + v0[j] = j + end + + for i in [0..slen[ do + + # prefix delete cost + v1[0] = i + 1 + + for j in [0..olen[ do + # delete cost + var cost1 = v1[j] + 1 + # insert cost + var cost2 = v0[j + 1] + 1 + # same char cost (+0) + var cost3 = v0[j] + # change cost + if self[i] != other[j] then cost3 += 1 + # keep the min + v1[j+1] = cost1.min(cost2).min(cost3) + end + + # Switch rows: + # * v1 becomes v0 in the next iteration + # * old v0 is reused as the new v1 + var tmp = v1 + v1 = v0 + v0 = tmp + end + + return v0[olen] + end + # Copies `n` bytes from `self` at `src_offset` into `dest` starting at `dest_offset` # # Basically a high-level synonym of NativeString::copy_to @@ -982,7 +1068,7 @@ abstract class Text # # var ns = new NativeString(8) # "Text is String".copy_to_native(ns, 8, 2, 0) - # assert ns.to_s_with_length(8) == "xt is St" + # assert ns.to_s_unsafe(8) == "xt is St" # fun copy_to_native(dest: NativeString, n, src_offset, dest_offset: Int) do var mypos = src_offset @@ -998,7 +1084,7 @@ abstract class Text end # All kinds of array-based text representations. 
-private abstract class FlatText +abstract class FlatText super Text # Underlying C-String (`char*`) @@ -1461,14 +1547,14 @@ redef class Byte var ns = new NativeString(nslen + 1) ns[nslen] = 0u8 native_byte_to_s(ns, nslen + 1) - return ns.to_s_with_length(nslen) + return ns.to_s_unsafe(nslen) end end redef class Int # Wrapper of strerror C function - private fun strerror_ext: NativeString `{ return strerror(self); `} + private fun strerror_ext: NativeString `{ return strerror((int)self); `} # Returns a string describing error number fun strerror: String do return strerror_ext.to_s @@ -1600,6 +1686,12 @@ redef class Char # assert 'ま'.bytes == [0xE3u8, 0x81u8, 0xBEu8] fun bytes: SequenceRead[Byte] do return to_s.bytes + # Is `self` a UTF-16 surrogate code point (`0xD800` to `0xDFFF`)? + fun is_surrogate: Bool do + var cp = code_point + return cp >= 0xD800 and cp <= 0xDFFF + end + # Length of `self` in a UTF-8 String private fun u8char_len: Int do var c = self.code_point @@ -1616,7 +1708,7 @@ redef class Char var ln = u8char_len var ns = new NativeString(ln + 1) u8char_tos(ns, ln) - return ns.to_s_with_length(ln) + return ns.to_s_unsafe(ln) end # Returns `self` escaped to UTF-16 @@ -1709,6 +1801,16 @@ redef class Char return (self >= 'a' and self <= 'z') or (self >= 'A' and self <= 'Z') end + # Is `self` a hexadecimal digit? 
+ # + # assert 'A'.is_hexdigit + # assert not 'G'.is_hexdigit + # assert 'a'.is_hexdigit + # assert not 'g'.is_hexdigit + # assert '5'.is_hexdigit + fun is_hexdigit: Bool do return (self >= '0' and self <= '9') or (self >= 'A' and self <= 'F') or + (self >= 'a' and self <= 'f') + # Returns true if the char is an alpha or a numeric digit # # assert 'a'.is_alphanumeric @@ -1722,6 +1824,19 @@ redef class Char do return self.is_numeric or self.is_alpha end + + # Returns the integer value of `self` read as a hexadecimal digit + # + # REQUIRE: `is_hexdigit` + fun from_hex: Int do + if self >= '0' and self <= '9' then return code_point - 0x30 + if self >= 'A' and self <= 'F' then return code_point - 0x37 + if self >= 'a' and self <= 'f' then return code_point - 0x57 + # Happens if self is not a hexdigit + assert self.is_hexdigit + # To make flow analysis happy + abort + end end redef class Collection[E] @@ -1892,6 +2007,12 @@ redef class NativeString # Returns `self` as a String of `length`. fun to_s_with_length(length: Int): String is abstract + # Returns a new instance of `String` with self as `_items` + # + # /!\: Does not clean the items for compliance with UTF-8, + # Use only if you know what you are doing + fun to_s_unsafe(len: nullable Int): String is abstract + # Returns `self` as a String with `bytelen` and `length` set # # SEE: `abstract_text::Text` for more infos on the difference