X-Git-Url: http://nitlanguage.org diff --git a/lib/core/text/abstract_text.nit b/lib/core/text/abstract_text.nit index 859808a..e07e0d4 100644 --- a/lib/core/text/abstract_text.nit +++ b/lib/core/text/abstract_text.nit @@ -495,18 +495,21 @@ abstract class Text end end - # Justify a self in a space of `length` + # Justify `self` in a space of `length` # # `left` is the space ratio on the left side. # * 0.0 for left-justified (no space at the left) # * 1.0 for right-justified (all spaces at the left) # * 0.5 for centered (half the spaces at the left) # + # `char`, or `' '` by default, is repeated to pad the empty space. + # # Examples # # assert "hello".justify(10, 0.0) == "hello " # assert "hello".justify(10, 1.0) == " hello" # assert "hello".justify(10, 0.5) == " hello " + # assert "hello".justify(10, 0.5, '.') == "..hello..." # # If `length` is not enough, `self` is returned as is. # @@ -515,13 +518,14 @@ abstract class Text # REQUIRE: `left >= 0.0 and left <= 1.0` # ENSURE: `self.length <= length implies result.length == length` # ENSURE: `self.length >= length implies result == self` - fun justify(length: Int, left: Float): String + fun justify(length: Int, left: Float, char: nullable Char): String do + var pad = (char or else ' ').to_s var diff = length - self.length if diff <= 0 then return to_s assert left >= 0.0 and left <= 1.0 var before = (diff.to_f * left).to_i - return " " * before + self + " " * (diff-before) + return pad * before + self + pad * (diff-before) end # Mangle a string to be a unique string only made of alphanumeric characters and underscores. 
@@ -586,10 +590,13 @@ abstract class Text return res.to_s end - # Escape " \ ' and non printable characters using the rules of literal C strings and characters + # Escape `"` `\` `'`, trigraphs and non printable characters using the rules of literal C strings and characters # - # assert "abAB12<>&".escape_to_c == "abAB12<>&" + # assert "abAB12<>&".escape_to_c == "abAB12<>&" # assert "\n\"'\\".escape_to_c == "\\n\\\"\\'\\\\" + # assert "allo???!".escape_to_c == "allo??\\?!" + # assert "??=??/??'??(??)".escape_to_c == "?\\?=?\\?/??\\'?\\?(?\\?)" + # assert "??!????-".escape_to_c == "?\\?!?\\??\\?-" # # Most non-printable characters (bellow ASCII 32) are escaped to an octal form `\nnn`. # Three digits are always used to avoid following digits to be interpreted as an element @@ -613,6 +620,24 @@ abstract class Text b.append("\\\'") else if c == '\\' then b.append("\\\\") + else if c == '?' then + # Escape if it is the last question mark of a ANSI C trigraph. + var j = i + 1 + if j < length then + var next = chars[j] + # We ignore `??'` because it will be escaped as `??\'`. + if + next == '!' or + next == '(' or + next == ')' or + next == '-' or + next == '/' or + next == '<' or + next == '=' or + next == '>' + then b.add('\\') + end + b.add('?') else if c.code_point < 32 then b.add('\\') var oct = c.code_point.to_base(8) @@ -636,6 +661,7 @@ abstract class Text # The result might no be legal in C but be used in other languages # # assert "ab|\{\}".escape_more_to_c("|\{\}") == "ab\\|\\\{\\\}" + # assert "allo???!".escape_more_to_c("") == "allo??\\?!" fun escape_more_to_c(chars: String): String do var b = new Buffer @@ -1091,6 +1117,39 @@ abstract class Text end end + # Packs the content of a string in packs of `ln` chars. 
+ # This variant ensures that only the last element might be smaller than `ln` + # + # ~~~nit + # var s = "abcdefghijklmnopqrstuvwxyz" + # assert s.pack_l(4) == ["abcd","efgh","ijkl","mnop","qrst","uvwx","yz"] + # ~~~ + fun pack_l(ln: Int): Array[Text] do + var st = 0 + var retarr = new Array[Text].with_capacity(length / ln + length % ln) + while st < length do + retarr.add(substring(st, ln)) + st += ln + end + return retarr + end + + # Packs the content of a string in packs of `ln` chars. + # This variant ensures that only the first element might be smaller than `ln` + # + # ~~~nit + # var s = "abcdefghijklmnopqrstuvwxyz" + # assert s.pack_r(4) == ["ab","cdef","ghij","klmn","opqr","stuv","wxyz"] + # ~~~ + fun pack_r(ln: Int): Array[Text] do + var st = length + var retarr = new Array[Text].with_capacity(length / ln + length % ln) + while st > 0 do + retarr.add(substring(st - ln, ln)) + st -= ln + end + return retarr.reversed + end end # All kinds of array-based text representations. @@ -1323,30 +1382,19 @@ abstract class String # Letters that follow a letter are lowercased # Letters that follow a non-letter are upcased. # + # If `keep_upper = true`, already uppercase letters are not lowercased. + # # SEE : `Char::is_letter` for the definition of letter. # # assert "jAVASCRIPT".capitalized == "Javascript" # assert "i am root".capitalized == "I Am Root" # assert "ab_c -ab0c ab\nc".capitalized == "Ab_C -Ab0C Ab\nC" - fun capitalized: SELFTYPE do + # assert "preserve my ACRONYMS".capitalized(keep_upper=true) == "Preserve My ACRONYMS" + fun capitalized(keep_upper: nullable Bool): SELFTYPE do if length == 0 then return self var buf = new Buffer.with_cap(length) - - var curr = chars[0].to_upper - var prev = curr - buf[0] = curr - - for i in [1 .. 
length[ do - prev = curr - curr = self[i] - if prev.is_letter then - buf[i] = curr.to_lower - else - buf[i] = curr.to_upper - end - end - + buf.capitalize(keep_upper=keep_upper, src=self) return buf.to_s end end @@ -1441,6 +1489,13 @@ abstract class Buffer # Letters that follow a letter are lowercased # Letters that follow a non-letter are upcased. # + # If `keep_upper = true`, uppercase letters are not lowercased. + # + # When `src` is specified, this method reads from `src` instead of `self` + # but it still writes the result to the beginning of `self`. + # This requires `self` to have the capacity to receive all of the + # capitalized content of `src`. + # # SEE: `Char::is_letter` for the definition of a letter. # # var b = new FlatBuffer.from("jAVAsCriPt") @@ -1452,16 +1507,32 @@ abstract class Buffer # b = new FlatBuffer.from("ab_c -ab0c ab\nc") # b.capitalize # assert b == "Ab_C -Ab0C Ab\nC" - fun capitalize do + # + # b = new FlatBuffer.from("12345") + # b.capitalize(src="foo") + # assert b == "Foo45" + # + # b = new FlatBuffer.from("preserve my ACRONYMS") + # b.capitalize(keep_upper=true) + # assert b == "Preserve My ACRONYMS" + fun capitalize(keep_upper: nullable Bool, src: nullable Text) do + src = src or else self + var length = src.length if length == 0 then return - var c = self[0].to_upper + keep_upper = keep_upper or else false + + var c = src[0].to_upper self[0] = c var prev = c for i in [1 .. length[ do prev = c - c = self[i] + c = src[i] if prev.is_letter then - self[i] = c.to_lower + if keep_upper then + self[i] = c + else + self[i] = c.to_lower + end else self[i] = c.to_upper end @@ -1477,6 +1548,42 @@ abstract class Buffer # In Buffers, the internal sequence of character is mutable # Thus, `chars` can be used to modify the buffer. 
redef fun chars: Sequence[Char] is abstract + + # Appends `length` chars from `s` starting at index `from` + # + # ~~~nit + # var b = new Buffer + # b.append_substring("abcde", 1, 2) + # assert b == "bc" + # b.append_substring("vwxyz", 2, 3) + # assert b == "bcxyz" + # b.append_substring("ABCDE", 4, 300) + # assert b == "bcxyzE" + # b.append_substring("VWXYZ", 400, 1) + # assert b == "bcxyzE" + # ~~~ + fun append_substring(s: Text, from, length: Int) do + if from < 0 then + length += from + from = 0 + end + var ln = s.length + if (length + from) > ln then length = ln - from + if length <= 0 then return + append_substring_impl(s, from, length) + end + + # Unsafe version of `append_substring` for performance + # + # NOTE: Use only if sure about `from` and `length`, no checks + # or bound recalculation is done + fun append_substring_impl(s: Text, from, length: Int) do + var pos = from + for i in [0 .. length[ do + self.add s[pos] + pos += 1 + end + end end # View for chars on Buffer objects, extends Sequence @@ -1718,8 +1825,20 @@ redef class Char return cp >= 0xD800 and cp <= 0xDFFF end + # Is `self` a UTF-16 high surrogate ? + fun is_hi_surrogate: Bool do + var cp = code_point + return cp >= 0xD800 and cp <= 0xDBFF + end + + # Is `self` a UTF-16 low surrogate ? + fun is_lo_surrogate: Bool do + var cp = code_point + return cp >= 0xDC00 and cp <= 0xDFFF + end + # Length of `self` in a UTF-8 String - private fun u8char_len: Int do + fun u8char_len: Int do var c = self.code_point if c < 0x80 then return 1 if c <= 0x7FF then return 2 @@ -2017,7 +2136,12 @@ end # see `alpha_comparator` private class AlphaComparator super Comparator - redef fun compare(a, b) do return a.to_s <=> b.to_s + redef fun compare(a, b) do + if a == b then return 0 + if a == null then return -1 + if b == null then return 1 + return a.to_s <=> b.to_s + end end # Stateless comparator that naively use `to_s` to compare things. 
@@ -2037,23 +2161,49 @@ do end redef class NativeString - # Returns `self` as a new String. + # Get a `String` from the data at `self` copied into Nit memory + # + # Require: `self` is a null-terminated string. fun to_s_with_copy: String is abstract - # Returns `self` as a String of `length`. + # Get a `String` from `length` bytes at `self` + # + # The result may point to the data at `self` or + # it may make a copy in Nit controlled memory. + # This method should only be used when `self` was allocated by the Nit GC, + # or when manually controlling the deallocation of `self`. fun to_s_with_length(length: Int): String is abstract - # Returns a new instance of `String` with self as `_items` + # Get a `String` from the raw `length` bytes at `self` # - # /!\: Does not clean the items for compliance with UTF-8, - # Use only if you know what you are doing - fun to_s_unsafe(len: nullable Int): String is abstract + # The default value of `length` is the number of bytes before + # the first null character. + # + # The created `String` points to the data at `self`. + # This method should be used when `self` was allocated by the Nit GC, + # or when manually controlling the deallocation of `self`. + # + # /!\: This service does not clean the items for compliance with UTF-8, + # use only when the data has already been verified as valid UTF-8. + fun to_s_unsafe(length: nullable Int): String is abstract - # Returns `self` as a String with `bytelen` and `length` set + # Get a `String` from the raw `bytelen` bytes at `self` with `unilen` Unicode characters + # + # The created `String` points to the data at `self`. + # This method should be used when `self` was allocated by the Nit GC, + # or when manually controlling the deallocation of `self`. 
# - # SEE: `abstract_text::Text` for more infos on the difference - # between `Text::bytelen` and `Text::length` + # /!\: This service does not clean the items for compliance with UTF-8, + # use only when the data has already been verified as valid UTF-8. + # + # SEE: `abstract_text::Text` for more info on the difference + # between `Text::bytelen` and `Text::length`. fun to_s_full(bytelen, unilen: Int): String is abstract + + # Copies the content of `src` to `self` + # + # NOTE: `self` must be large enough to hold `src.bytelen` bytes + fun fill_from(src: Text) do src.copy_to_native(self, src.bytelen, 0, 0) end redef class NativeArray[E]