From 33e0c9fce238d334f160ee9dbaa29dea5a412718 Mon Sep 17 00:00:00 2001
From: Lucas Bajolet
Date: Thu, 27 Aug 2015 14:09:33 -0400
Subject: [PATCH] lib/standard/text: Added methods for bytes to char position translation

Signed-off-by: Lucas Bajolet
---
 lib/standard/text/flat.nit   |   48 ++++++++++++++++++++++++++++++++++++++++++++++++
 lib/standard/text/native.nit |   23 +++++++++++++++++++++++
 2 files changed, 71 insertions(+)

diff --git a/lib/standard/text/flat.nit b/lib/standard/text/flat.nit
index 917b0e5..75cf729 100644
--- a/lib/standard/text/flat.nit
+++ b/lib/standard/text/flat.nit
@@ -84,6 +84,54 @@ redef class FlatText
 		return ns_i
 	end
 
+	# Translates the byte position `index` into the index of the corresponding char
+	#
+	# The seek starts from whichever known position is nearest to `index`:
+	# the beginning of the text, the cached `bytepos`/`position` pair
+	# or the last char, and the cache is then moved to the located char.
+	#
+	# NOTE(review): `index` is bound-checked against `bytelen` while the seek
+	# starts at `first_byte`; confirm whether callers are expected to pass
+	# an offset relative to the text or an absolute offset in `items`.
+	private fun byte_to_char_index(index: Int): Int do
+		var ln = bytelen
+		assert index >= 0
+		assert index < bytelen
+
+		# Find best insertion point
+		var delta_begin = index
+		var delta_end = (ln - 1) - index
+		var delta_cache = (bytepos - index).abs
+		var min = delta_begin
+		var its = items
+
+		if delta_cache < min then min = delta_cache
+		if delta_end < min then min = delta_end
+
+		var ns_i: Int
+		var my_i: Int
+
+		if min == delta_begin then
+			ns_i = first_byte
+			my_i = 0
+		else if min == delta_cache then
+			ns_i = bytepos
+			my_i = position
+		else
+			ns_i = its.find_beginning_of_char_at(last_byte)
+			my_i = length - 1
+		end
+
+		my_i = its.byte_to_char_index_cached(index, my_i, ns_i)
+
+		position = my_i
+		# `index` may designate a byte in the middle of a char: cache the first
+		# byte of that char so `bytepos` and `position` always match
+		bytepos = its.find_beginning_of_char_at(index)
+
+		return my_i
+	end
+
 	redef fun [](index) do return items.char_at(char_to_byte_index(index))
 end
 
diff --git a/lib/standard/text/native.nit b/lib/standard/text/native.nit
index 170b196..11c8d34 100644
--- a/lib/standard/text/native.nit
+++ b/lib/standard/text/native.nit
@@ -130,6 +130,29 @@ extern class NativeString `{ char* `}
 		return ns_i
 	end
 
+	# Gets the char index of the byte at position `n` in UTF-8 String
+	#
+	# `char_from` and `byte_from` are cached values to seek from.
+	#
+	# NOTE: char_from and byte_from are not guaranteed to be valid cache values
+	# It is up to the client to ensure the validity of the information
+	fun byte_to_char_index_cached(n, char_from, byte_from: Int): Int do
+		var ns_i = byte_from # byte cursor, seeded by the caller
+		var my_i = char_from # char counter kept in step with `ns_i`
+
+		while ns_i < n do
+			ns_i += length_of_char_at(ns_i) # forward seek: one char per iteration
+			my_i += 1
+		end
+
+		while ns_i > n do
+			ns_i = find_beginning_of_char_at(ns_i - 1) # backward seek, also undoes a forward overshoot
+			my_i -= 1
+		end
+
+		return my_i
+	end
+
 	# Returns the beginning position of the char at position `pos`
 	#
 	# If the char is invalid UTF-8, `pos` is returned as-is
-- 
1.7.9.5