X-Git-Url: http://nitlanguage.org diff --git a/lib/core/text/abstract_text.nit b/lib/core/text/abstract_text.nit index 1faaad3..baf8ae7 100644 --- a/lib/core/text/abstract_text.nit +++ b/lib/core/text/abstract_text.nit @@ -146,15 +146,7 @@ abstract class Text # Returns -1 if not found # # DEPRECATED : Use self.chars.last_index_of_from instead - fun last_index_of_from(item: Char, pos: Int): Int - do - var iter = self.chars.reverse_iterator_from(pos) - while iter.is_ok do - if iter.item == item then return iter.index - iter.next - end - return -1 - end + fun last_index_of_from(item: Char, pos: Int): Int do return chars.last_index_of_from(item, pos) # Gets an iterator on the chars of self # @@ -231,15 +223,6 @@ abstract class Text # assert "abcd".has_suffix("bcd") == true fun has_suffix(suffix: String): Bool do return has_substring(suffix, length - suffix.length) - # Returns a copy of `self` minus all occurences of `c` - # - # assert "__init__".remove_all('_') == "init" - fun remove_all(c: Char): String do - var b = new Buffer - for i in chars do if i != c then b.add i - return b.to_s - end - # Returns `self` as the corresponding integer # # assert "123".to_i == 123 @@ -552,7 +535,7 @@ abstract class Text if c >= '0' and c <= '9' then res.add('_') - res.append(c.ascii.to_s) + res.append(c.code_point.to_s) res.add('d') start = 1 end @@ -564,7 +547,7 @@ abstract class Text continue end if underscore then - res.append('_'.ascii.to_s) + res.append('_'.code_point.to_s) res.add('d') end if c >= '0' and c <= '9' then @@ -575,13 +558,13 @@ abstract class Text underscore = true else res.add('_') - res.append(c.ascii.to_s) + res.append(c.code_point.to_s) res.add('d') underscore = false end end if underscore then - res.append('_'.ascii.to_s) + res.append('_'.code_point.to_s) res.add('d') end return res.to_s @@ -596,7 +579,7 @@ abstract class Text # Three digits are always used to avoid following digits to be interpreted as an element # of the octal sequence. # - # assert "{0.ascii}{1.ascii}{8.ascii}{31.ascii}{32.ascii}".escape_to_c == "\\000\\001\\010\\037 " + # assert "{0.code_point}{1.code_point}{8.code_point}{31.code_point}{32.code_point}".escape_to_c == "\\000\\001\\010\\037 " # # The exceptions are the common `\t` and `\n`. fun escape_to_c: String @@ -608,17 +591,15 @@ abstract class Text b.append("\\n") else if c == '\t' then b.append("\\t") - else if c == '\0' then - b.append("\\000") else if c == '"' then b.append("\\\"") else if c == '\'' then b.append("\\\'") else if c == '\\' then b.append("\\\\") - else if c.ascii < 32 then + else if c.code_point < 32 then b.add('\\') - var oct = c.ascii.to_base(8, false) + var oct = c.code_point.to_base(8, false) # Force 3 octal digits since it is the # maximum allowed in the C specification if oct.length == 1 then @@ -691,8 +672,8 @@ abstract class Text else if c == ':' or c == ' ' or c == '#' then b.add('\\') b.add(c) - else if c.ascii < 32 or c == ';' or c == '|' or c == '\\' or c == '=' then - b.append("?{c.ascii.to_base(16, false)}") + else if c.code_point < 32 or c == ';' or c == '|' or c == '\\' or c == '=' then + b.append("?{c.code_point.to_base(16, false)}") else b.add(c) end @@ -706,7 +687,7 @@ abstract class Text # assert s.length == 2 # var u = s.unescape_nit # assert u.length == 1 - # assert u.chars[0].ascii == 10 # (the ASCII value of the "new line" character) + # assert u.chars[0].code_point == 10 # (the ASCII value of the "new line" character) fun unescape_nit: String do var res = new Buffer.with_cap(self.length) @@ -742,6 +723,7 @@ abstract class Text # assert "aBc09-._~".to_percent_encoding == "aBc09-._~" # assert "%()< >".to_percent_encoding == "%25%28%29%3c%20%3e" # assert ".com/post?e=asdf&f=123".to_percent_encoding == ".com%2fpost%3fe%3dasdf%26f%3d123" + # assert "éあいう".to_percent_encoding == "%c3%a9%e3%81%82%e3%81%84%e3%81%86" fun to_percent_encoding: String do var buf = new Buffer @@ -755,7 +737,10 @@ abstract class Text c == '_' or c == '~' then buf.add c - else buf.append "%{c.ascii.to_hex}" + else + var bytes = c.to_s.bytes + for b in bytes do buf.append "%{b.to_i.to_hex}" + end end return buf.to_s @@ -771,36 +756,50 @@ abstract class Text # assert "%25%28%29%3C%20%3E".from_percent_encoding == "%()< >" # assert "incomplete %".from_percent_encoding == "incomplete ?" # assert "invalid % usage".from_percent_encoding == "invalid ? usage" + # assert "%c3%a9%e3%81%82%e3%81%84%e3%81%86".from_percent_encoding == "éあいう" fun from_percent_encoding: String do - var buf = new Buffer + var len = bytelen + var has_percent = false + for c in chars do + if c == '%' then + len -= 2 + has_percent = true + end + end + # If no transformation is needed, return self as a string + if not has_percent then return to_s + + var buf = new NativeString(len) var i = 0 + var l = 0 while i < length do var c = chars[i] if c == '%' then if i + 2 >= length then # What follows % has been cut off - buf.add '?' + buf[l] = '?'.ascii else i += 1 var hex_s = substring(i, 2) if hex_s.is_hex then var hex_i = hex_s.to_hex - buf.add hex_i.ascii + buf[l] = hex_i.to_b i += 1 else # What follows a % is not Hex - buf.add '?' + buf[l] = '?'.ascii i -= 1 end end - else buf.add c + else buf[l] = c.ascii i += 1 + l += 1 end - return buf.to_s + return buf.to_s_with_length(l) end # Escape the characters `<`, `>`, `&`, `"`, `'` and `/` as HTML/XML entity references. @@ -898,7 +897,7 @@ abstract class Text for i in [0..length[ do var char = chars[i] - h = (h << 5) + h + char.ascii + h = (h << 5) + h + char.code_point end hash_cache = h @@ -967,17 +966,14 @@ abstract class Text end # All kinds of array-based text representations. -abstract class FlatText +private abstract class FlatText super Text # Underlying C-String (`char*`) # # Warning : Might be void in some subclasses, be sure to check # if set before using it. - private var items: NativeString is noinit - - # Real items, used as cache for to_cstring is called - private var real_items: nullable NativeString = null + var items: NativeString is noinit # Returns a char* starting at position `first_byte` # @@ -994,7 +990,7 @@ abstract class FlatText # # As always, do not modify the content of the String in C code, if this is what you want # copy locally the char* as Nit Strings are immutable. - private fun fast_cstring: NativeString is abstract + fun fast_cstring: NativeString is abstract redef var length = 0 @@ -1043,7 +1039,7 @@ private abstract class StringByteView redef fun is_empty do return target.is_empty - redef fun length do return target.length + redef fun length do return target.bytelen redef fun iterator do return self.iterator_from(0) @@ -1566,9 +1562,15 @@ end redef class Char + # Returns a sequence with the UTF-8 bytes of `self` + # + # assert 'a'.bytes == [0x61u8] + # assert 'ま'.bytes == [0xE3u8, 0x81u8, 0xBEu8] + fun bytes: SequenceRead[Byte] do return to_s.bytes + # Length of `self` in a UTF-8 String private fun u8char_len: Int do - var c = self.ascii + var c = self.code_point if c < 0x80 then return 1 if c <= 0x7FF then return 2 if c <= 0xFFFF then return 3 @@ -1817,6 +1819,12 @@ redef class NativeString # Returns `self` as a String of `length`. fun to_s_with_length(length: Int): String is abstract + + # Returns `self` as a String with `bytelen` and `length` set + # + # SEE: `abstract_text::Text` for more infos on the difference + # between `Text::bytelen` and `Text::length` + fun to_s_full(bytelen, unilen: Int): String is abstract end redef class NativeArray[E]