X-Git-Url: http://nitlanguage.org diff --git a/lib/core/text/abstract_text.nit b/lib/core/text/abstract_text.nit index b1e6a04..d4873ad 100644 --- a/lib/core/text/abstract_text.nit +++ b/lib/core/text/abstract_text.nit @@ -68,7 +68,7 @@ abstract class Text fun substring(from: Int, count: Int): SELFTYPE is abstract # Iterates on the substrings of self if any - fun substrings: Iterator[FlatText] is abstract + private fun substrings: Iterator[FlatText] is abstract # Is the current Text empty (== "") # @@ -146,15 +146,7 @@ abstract class Text # Returns -1 if not found # # DEPRECATED : Use self.chars.last_index_of_from instead - fun last_index_of_from(item: Char, pos: Int): Int - do - var iter = self.chars.reverse_iterator_from(pos) - while iter.is_ok do - if iter.item == item then return iter.index - iter.next - end - return -1 - end + fun last_index_of_from(item: Char, pos: Int): Int do return chars.last_index_of_from(item, pos) # Gets an iterator on the chars of self # @@ -543,7 +535,7 @@ abstract class Text if c >= '0' and c <= '9' then res.add('_') - res.append(c.ascii.to_s) + res.append(c.code_point.to_s) res.add('d') start = 1 end @@ -555,7 +547,7 @@ abstract class Text continue end if underscore then - res.append('_'.ascii.to_s) + res.append('_'.code_point.to_s) res.add('d') end if c >= '0' and c <= '9' then @@ -566,13 +558,13 @@ abstract class Text underscore = true else res.add('_') - res.append(c.ascii.to_s) + res.append(c.code_point.to_s) res.add('d') underscore = false end end if underscore then - res.append('_'.ascii.to_s) + res.append('_'.code_point.to_s) res.add('d') end return res.to_s @@ -587,7 +579,7 @@ abstract class Text # Three digits are always used to avoid following digits to be interpreted as an element # of the octal sequence. # - # assert "{0.ascii}{1.ascii}{8.ascii}{31.ascii}{32.ascii}".escape_to_c == "\\000\\001\\010\\037 " + # assert "{0.code_point}{1.code_point}{8.code_point}{31.code_point}{32.code_point}".escape_to_c == "\\000\\001\\010\\037 " # # The exceptions are the common `\t` and `\n`. fun escape_to_c: String @@ -605,9 +597,9 @@ abstract class Text b.append("\\\'") else if c == '\\' then b.append("\\\\") - else if c.ascii < 32 then + else if c.code_point < 32 then b.add('\\') - var oct = c.ascii.to_base(8, false) + var oct = c.code_point.to_base(8, false) # Force 3 octal digits since it is the # maximum allowed in the C specification if oct.length == 1 then @@ -680,8 +672,8 @@ abstract class Text else if c == ':' or c == ' ' or c == '#' then b.add('\\') b.add(c) - else if c.ascii < 32 or c == ';' or c == '|' or c == '\\' or c == '=' then - b.append("?{c.ascii.to_base(16, false)}") + else if c.code_point < 32 or c == ';' or c == '|' or c == '\\' or c == '=' then + b.append("?{c.code_point.to_base(16, false)}") else b.add(c) end @@ -695,7 +687,7 @@ abstract class Text # assert s.length == 2 # var u = s.unescape_nit # assert u.length == 1 - # assert u.chars[0].ascii == 10 # (the ASCII value of the "new line" character) + # assert u.chars[0].code_point == 10 # (the ASCII value of the "new line" character) fun unescape_nit: String do var res = new Buffer.with_cap(self.length) @@ -726,6 +718,38 @@ abstract class Text return res.to_s end + # Returns `self` with all characters escaped with their UTF-16 representation + # + # assert "Aèあ𐏓".escape_to_utf16 == "\\u0041\\u00e8\\u3042\\ud800\\udfd3" + fun escape_to_utf16: String do + var buf = new Buffer + for i in chars do buf.append i.escape_to_utf16 + return buf.to_s + end + + # Returns the Unicode char escaped by `self` + # + # assert "\\u0041".from_utf16_escape == 'A' + # assert "\\ud800\\udfd3".from_utf16_escape == '𐏓' + # assert "\\u00e8".from_utf16_escape == 'è' + # assert "\\u3042".from_utf16_escape == 'あ' + fun from_utf16_escape: Char do + var ln = length + if ln != 6 and ln != 12 then return 0xFFFD.code_point + var cphi = substring(2, 4).to_hex + if cphi < 0xD800 then return cphi.code_point + if cphi > 0xDFFF then return cphi.code_point + if cphi > 0xDBFF then return 0xFFFD.code_point + var cp = 0 + cp += (cphi - 0xD800) << 10 + var cplo = substring(8, 4).to_hex + if cplo < 0xDC00 then return 0xFFFD.code_point + if cplo > 0xDFFF then return 0xFFFD.code_point + cp += cplo - 0xDC00 + cp += 0x10000 + return cp.code_point + end + # Encode `self` to percent (or URL) encoding # # assert "aBc09-._~".to_percent_encoding == "aBc09-._~" @@ -787,7 +811,7 @@ abstract class Text if c == '%' then if i + 2 >= length then # What follows % has been cut off - buf[l] = '?'.ascii.to_b + buf[l] = '?'.ascii else i += 1 var hex_s = substring(i, 2) @@ -797,11 +821,11 @@ abstract class Text i += 1 else # What follows a % is not Hex - buf[l] = '?'.ascii.to_b + buf[l] = '?'.ascii i -= 1 end end - else buf[l] = c.ascii.to_b + else buf[l] = c.ascii i += 1 l += 1 @@ -905,7 +929,7 @@ abstract class Text for i in [0..length[ do var char = chars[i] - h = (h << 5) + h + char.ascii + h = (h << 5) + h + char.code_point end hash_cache = h @@ -950,6 +974,65 @@ abstract class Text return s.plain_to_s end + # Return the Levenshtein distance between two strings + # + # ~~~ + # assert "abcd".levenshtein_distance("abcd") == 0 + # assert "".levenshtein_distance("abcd") == 4 + # assert "abcd".levenshtein_distance("") == 4 + # assert "abcd".levenshtein_distance("xyz") == 4 + # assert "abcd".levenshtein_distance("xbdy") == 3 + # ~~~ + fun levenshtein_distance(other: String): Int + do + var slen = self.length + var olen = other.length + + # fast cases + if slen == 0 then return olen + if olen == 0 then return slen + if self == other then return 0 + + # previous row of distances + var v0 = new Array[Int].with_capacity(olen+1) + + # current row of distances + var v1 = new Array[Int].with_capacity(olen+1) + + for j in [0..olen] do + # prefix insert cost + v0[j] = j + end + + for i in [0..slen[ do + + # prefix delete cost + v1[0] = i + 1 + + for j in [0..olen[ do + # delete cost + var cost1 = v1[j] + 1 + # insert cost + var cost2 = v0[j + 1] + 1 + # same char cost (+0) + var cost3 = v0[j] + # change cost + if self[i] != other[j] then cost3 += 1 + # keep the min + v1[j+1] = cost1.min(cost2).min(cost3) + end + + # Switch columns: + # * v1 become v0 in the next iteration + # * old v0 is reused as the new v1 + var tmp = v1 + v1 = v0 + v0 = tmp + end + + return v0[olen] + end + # Copies `n` bytes from `self` at `src_offset` into `dest` starting at `dest_offset` # # Basically a high-level synonym of NativeString::copy_to @@ -981,10 +1064,7 @@ abstract class FlatText # # Warning : Might be void in some subclasses, be sure to check # if set before using it. - private var items: NativeString is noinit - - # Real items, used as cache for to_cstring is called - private var real_items: nullable NativeString = null + var items: NativeString is noinit # Returns a char* starting at position `first_byte` # @@ -1001,7 +1081,7 @@ abstract class FlatText # # As always, do not modify the content of the String in C code, if this is what you want # copy locally the char* as Nit Strings are immutable. - private fun fast_cstring: NativeString is abstract + fun fast_cstring: NativeString is abstract redef var length = 0 @@ -1050,7 +1130,7 @@ private abstract class StringByteView redef fun is_empty do return target.is_empty - redef fun length do return target.length + redef fun length do return target.bytelen redef fun iterator do return self.iterator_from(0) @@ -1573,9 +1653,15 @@ end redef class Char + # Returns a sequence with the UTF-8 bytes of `self` + # + # assert 'a'.bytes == [0x61u8] + # assert 'ま'.bytes == [0xE3u8, 0x81u8, 0xBEu8] + fun bytes: SequenceRead[Byte] do return to_s.bytes + # Length of `self` in a UTF-8 String private fun u8char_len: Int do - var c = self.ascii + var c = self.code_point if c < 0x80 then return 1 if c <= 0x7FF then return 2 if c <= 0xFFFF then return 3 @@ -1592,6 +1678,46 @@ redef class Char return ns.to_s_with_length(ln) end + # Returns `self` escaped to UTF-16 + # + # i.e. Represents `self`.`code_point` using UTF-16 codets escaped + # with a `\u` + # + # assert 'A'.escape_to_utf16 == "\\u0041" + # assert 'è'.escape_to_utf16 == "\\u00e8" + # assert 'あ'.escape_to_utf16 == "\\u3042" + # assert '𐏓'.escape_to_utf16 == "\\ud800\\udfd3" + fun escape_to_utf16: String do + var cp = code_point + var buf: Buffer + if cp < 0xD800 or (cp >= 0xE000 and cp <= 0xFFFF) then + buf = new Buffer.with_cap(6) + buf.append("\\u0000") + var hx = cp.to_hex + var outid = 5 + for i in hx.chars.reverse_iterator do + buf[outid] = i + outid -= 1 + end + else + buf = new Buffer.with_cap(12) + buf.append("\\u0000\\u0000") + var lo = (((cp - 0x10000) & 0x3FF) + 0xDC00).to_hex + var hi = ((((cp - 0x10000) & 0xFFC00) >> 10) + 0xD800).to_hex + var out = 2 + for i in hi do + buf[out] = i + out += 1 + end + out = 8 + for i in lo do + buf[out] = i + out += 1 + end + end + return buf.to_s + end + private fun u8char_tos(r: NativeString, len: Int) `{ r[len] = '\0'; switch(len){ @@ -1642,6 +1768,16 @@ redef class Char return (self >= 'a' and self <= 'z') or (self >= 'A' and self <= 'Z') end + # Is `self` an hexadecimal digit ? + # + # assert 'A'.is_hexdigit + # assert not 'G'.is_hexdigit + # assert 'a'.is_hexdigit + # assert not 'g'.is_hexdigit + # assert '5'.is_hexdigit + fun is_hexdigit: Bool do return (self >= '0' and self <= '9') or (self >= 'A' and self <= 'F') or + (self >= 'a' and self <= 'f') + # Returns true if the char is an alpha or a numeric digit # # assert 'a'.is_alphanumeric