From: Jean Privat
Date: Tue, 20 Oct 2015 02:32:54 +0000 (-0400)
Subject: Merge: Optimize hash collection
X-Git-Tag: v0.7.9~22
X-Git-Url: http://nitlanguage.org?hp=a06f54d8d709b41c9aba0a55d5debae80599080b

Merge: Optimize hash collection

Some optimizations when dealing with empty hash-based collections.

user time for nitc nitc.nit:

* before: 0m6.640s
* after: 0m6.344s (-4.5%)

Pull-Request: #1768
Reviewed-by: Alexis Laferrière
Reviewed-by: Alexandre Terrasa

---
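The lib/core change that produces the speedup is not part of the diffs below, which carry related cleanups. As an illustration only of the general idea (keeping empty hash-based collections cheap by allocating their backing storage lazily), here is a minimal sketch with hypothetical names, not the actual lib/core implementation:

~~~nitish
# Illustration: a map that allocates its real storage on first write,
# so empty instances avoid any allocation.
class LazyMap[K, V]
	private var storage: nullable HashMap[K, V] = null

	# Reading from an empty map is an error; otherwise delegate.
	fun [](key: K): V
	do
		var s = storage
		assert s != null
		return s[key]
	end

	# Allocate the backing map on demand, then delegate.
	fun []=(key: K, value: V)
	do
		var s = storage
		if s == null then
			s = new HashMap[K, V]
			storage = s
		end
		s[key] = value
	end
end
~~~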
diff --git a/contrib/opportunity/src/templates/meetup.nit b/contrib/opportunity/src/templates/meetup.nit
index 0358f33..043c86b 100644
--- a/contrib/opportunity/src/templates/meetup.nit
+++ b/contrib/opportunity/src/templates/meetup.nit
@@ -211,7 +211,11 @@ class OpportunityMeetupPage
 		.fail(function(data){
 			//TODO: Notify of failure
 		});
+
+		// Remember the participant's name client-side
+		set_cookie("opportunity_participant_name", pname);
 	}
+
 	function remove_people(ele){
 		var arr = ele.id.split("_")
 		var pid = arr[1]
@@ -226,6 +230,7 @@ class OpportunityMeetupPage
 			}
 		});
 	}
+
 	// ID of line currently open for modification
 	var in_modification_id = null;
 	function modify_people(ele, id){
@@ -248,6 +253,30 @@ class OpportunityMeetupPage
 			in_modification_id = null;
 		}
 	}
+
+	function get_cookie(cookie_name) {
+		var name = cookie_name + "=";
+		var ca = document.cookie.split(';');
+		for(var i = 0; i < ca.length; i++) {
+			var c = ca[i];
+			while (c.charAt(0) == ' ') c = c.substring(1);
+			if (c.indexOf(name) == 0) return c.substring(name.length, c.length);
+		}
+		return "";
+	}
+
+	function set_cookie(cookie_name, value) {
+		var date = new Date();
+		date.setTime(date.getTime() + (365*24*60*60*1000));
+		var expires = "expires=" + date.toUTCString();
+		document.cookie = cookie_name + "=" + value + "; " + expires;
+	}
+
+	// Restore the last participant's name from the client-side cookie
+	window.onload = function () {
+		var name_field = document.getElementById("new_name");
+		name_field.value = get_cookie("opportunity_participant_name");
+	}
 """
 end
diff --git a/lib/html/html.nit b/lib/html/html.nit
index d3a92eb..72284ef 100644
--- a/lib/html/html.nit
+++ b/lib/html/html.nit
@@ -107,7 +107,12 @@ class HTMLTag
 	# `"div"` for `<div>`.
 	var tag: String

 	init do
-		self.is_void = (once ["area", "base", "br", "col", "command", "embed", "hr", "img", "input", "keygen", "link", "meta", "param", "source", "track", "wbr"]).has(tag)
+		self.is_void = (once void_list).has(tag)
+	end
+
+	private fun void_list: Set[String]
+	do
+		return new HashSet[String].from(["area", "base", "br", "col", "command", "embed", "hr", "img", "input", "keygen", "link", "meta", "param", "source", "track", "wbr"])
 	end

 	# Is the HTML element a void element?
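In the html.nit hunk above, the literal array of void tags is hoisted into a dedicated method that builds a `HashSet`, and the result is cached with `once`: the set is constructed a single time instead of on every `HTMLTag` initialization, and `has` becomes a hash lookup rather than a linear scan. The same pattern in isolation, with hypothetical names:

~~~nitish
# `once` evaluates its operand a single time and caches the result,
# so the set is shared by all subsequent calls.
fun is_void_tag(tag: String): Bool
do
	var voids = once new HashSet[String].from(["br", "hr", "img", "meta"])
	return voids.has(tag)
end

print is_void_tag("br")  # true
print is_void_tag("div") # false
~~~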
diff --git a/lib/nlp/README.md b/lib/nlp/README.md
new file mode 100644
index 0000000..6545537
--- /dev/null
+++ b/lib/nlp/README.md
@@ -0,0 +1,89 @@
+# Nit wrapper for Stanford CoreNLP
+
+Stanford CoreNLP provides a set of natural language analysis tools which can take
+raw text input and give the base forms of words, their parts of speech, whether
+they are names of companies, people, etc., normalize dates, times, and numeric
+quantities, and mark up the structure of sentences in terms of phrases and word
+dependencies, indicate which noun phrases refer to the same entities, indicate
+sentiment, etc.
+
+This wrapper needs the Stanford CoreNLP jars that run on Java 1.8+.
+
+See http://nlp.stanford.edu/software/corenlp.shtml.
+
+## Usage
+
+~~~nitish
+var proc = new NLPProcessor("path/to/StanfordCoreNLP/jars")
+
+var doc = proc.process("String to analyze")
+
+for sentence in doc.sentences do
+	for token in sentence.tokens do
+		print "{token.lemma}: {token.pos}"
+	end
+end
+~~~
+
+## Nit API
+
+For ease of use, this wrapper introduces a Nit model to handle CoreNLP XML results.
+
+### NLPDocument
+
+[[doc: NLPDocument]]
+
+[[doc: NLPDocument::from_xml]]
+[[doc: NLPDocument::from_xml_file]]
+[[doc: NLPDocument::sentences]]
+
+### NLPSentence
+
+[[doc: NLPSentence]]
+
+[[doc: NLPSentence::tokens]]
+
+### NLPToken
+
+[[doc: NLPToken]]
+
+[[doc: NLPToken::word]]
+[[doc: NLPToken::lemma]]
+[[doc: NLPToken::pos]]
+
+### NLP Processor
+
+[[doc: NLPProcessor]]
+
+[[doc: NLPProcessor::java_cp]]
+
+[[doc: NLPProcessor::process]]
+[[doc: NLPProcessor::process_file]]
+[[doc: NLPProcessor::process_files]]
+
+## Vector Space Model
+
+[[doc: NLPVector]]
+
+[[doc: NLPDocument::vector]]
+
+[[doc: NLPVector::cosine_similarity]]
+
+## NitNLP binary
+
+The `nitnlp` binary is given as an example of a NitNLP client.
+It compares two strings and displays their cosine similarity value.
+
+Usage:
+
+~~~raw
+nitnlp --cp "/path/to/jars" "sort" "Sorting array data"
+0.577
+~~~
+
+## TODO
+
+* Use JWrapper
+* Use options to choose CoreNLP analyzers
+* Analyze sentence dependencies
+* Analyze sentiment
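The README's usage example covers single-string processing; the batch entry point listed under `NLPProcessor::process_files` can be sketched as follows (hypothetical paths, assuming the jars and input files exist):

~~~nitish
var proc = new NLPProcessor("path/to/StanfordCoreNLP/jars")

# Process several files in one Java run; results are keyed by input path.
var docs = proc.process_files(["a.txt", "b.txt"], "nlp_out")
for path, doc in docs do
	print "{path}: {doc.sentences.length} sentences"
end
~~~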
diff --git a/lib/nlp/nitnlp.nit b/lib/nlp/nitnlp.nit
new file mode 100644
index 0000000..72946bc
--- /dev/null
+++ b/lib/nlp/nitnlp.nit
@@ -0,0 +1,49 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

+# Natural Language Processor based on the StanfordNLP core.
+#
+# This tool provides a document comparison service from the command line,
+# based on StanfordNLP and NLPVector cosine similarity.
+#
+# See http://nlp.stanford.edu/software/corenlp.shtml.
+module nitnlp
+
+import opts
+import nlp
+
+# Option management
+var opt_java_cp = new OptionString("Java classpath for StanfordNLP jars", "--cp")
+var options = new OptionContext
+options.add_option(opt_java_cp)
+options.parse(args)
+var arguments = options.rest
+
+# Processor initialization
+var java_cp = opt_java_cp.value
+if java_cp == null then java_cp = "*"
+var proc = new NLPProcessor(java_cp)
+
+if arguments.length != 2 then
+	print "Usage: nitnlp text1 text2\n"
+	options.usage
+	sys.exit 1
+end
+
+var doc1 = proc.process(arguments.first)
+print doc1.vector.join(":", ",")
+var doc2 = proc.process(arguments.last)
+print doc2.vector.join(":", ",")
+
+print doc1.vector.cosine_similarity(doc2.vector)
diff --git a/lib/nlp/nlp.nit b/lib/nlp/nlp.nit
new file mode 100644
index 0000000..4dd7cc9
--- /dev/null
+++ b/lib/nlp/nlp.nit
@@ -0,0 +1,71 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

+# Natural Language Processor based on the StanfordNLP core.
+#
+# See http://nlp.stanford.edu/software/corenlp.shtml.
+module nlp
+
+import stanford
+import vsm
+
+redef class NLPDocument
+
+	# `NLPVector` representing `self`.
+	var vector: NLPVector is lazy do
+		var vector = new NLPVector
+		for sentence in sentences do
+			for token in sentence.tokens do
+				if not keep_pos_token(token) then continue
+				var lemma = token.lemma
+				if not keep_lemma(lemma) then continue
+				if not vector.has_key(lemma) then
+					vector[lemma] = 1
+				else
+					vector[lemma] += 1
+				end
+			end
+		end
+		return vector
+	end
+
+	# Should we keep `token` when composing the vector?
+	#
+	# Choice is based on the POS tag of the token.
+	# See `allowed_pos_prefixes`.
+	private fun keep_pos_token(token: NLPToken): Bool do
+		var pos = token.pos
+		for prefix in allowed_pos_prefixes do
+			if pos.has_prefix(prefix) then return true
+		end
+		return false
+	end
+
+	# Should we keep `lemma` when composing the vector?
+	#
+	# See `lemma_black_list`.
+	private fun keep_lemma(lemma: String): Bool do
+		return not lemma_black_list.has(lemma)
+	end
+
+	# Allowed POS tag prefixes.
+	#
+	# When building a vector from `self`, only tokens tagged with one of these
+	# prefixes are kept.
+	# Other tokens are ignored.
+	var allowed_pos_prefixes: Array[String] = ["NN", "VB", "RB"] is writable
+
+	# Ignored lemmas.
+	var lemma_black_list: Array[String] = ["module", "class", "method"] is writable
+end
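Both filters are `writable`, so a client can tune them before the lazy `vector` attribute is computed for the first time. A hedged sketch (hypothetical input string, reusing `proc` from the README example):

~~~nitish
var doc = proc.process("Sorting integer arrays in a module")

# Keep nouns only and disable the default lemma blacklist.
doc.allowed_pos_prefixes = ["NN"]
doc.lemma_black_list = new Array[String]

# The vector is composed lazily, on this first access.
print doc.vector.join(":", ",")
~~~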
diff --git a/lib/nlp/package.ini b/lib/nlp/package.ini
new file mode 100644
index 0000000..789aa44
--- /dev/null
+++ b/lib/nlp/package.ini
@@ -0,0 +1,11 @@
+[package]
+name=nlp
+tags=nlp,lib
+maintainer=Alexandre Terrasa
+license=Apache-2.0
+[upstream]
+browse=https://github.com/nitlang/nit/tree/master/lib/nlp/
+git=https://github.com/nitlang/nit.git
+git.directory=lib/nlp/
+homepage=http://nitlanguage.org
+issues=https://github.com/nitlang/nit/issues
diff --git a/lib/nlp/stanford.nit b/lib/nlp/stanford.nit
new file mode 100644
index 0000000..734a228
--- /dev/null
+++ b/lib/nlp/stanford.nit
@@ -0,0 +1,258 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

+# Natural Language Processor based on the StanfordNLP core.
+#
+# See http://nlp.stanford.edu/software/corenlp.shtml.
+module stanford
+
+import opts
+import dom
+
+# Wrapper around the StanfordNLP jar.
+#
+# NLPProcessor provides natural language processing of input text files and
+# an API to handle analysis results.
+#
+# FIXME this should use the Java FFI.
+class NLPProcessor
+
+	# Classpath to give to Java when loading the StanfordNLP jars.
+	var java_cp: String
+
+	# Process a string and return a new NLPDocument from it.
+	fun process(string: String): NLPDocument do
+		var tmp_file = ".nlp.in"
+		var file = new FileWriter.open(tmp_file)
+		file.write string
+		file.close
+		var doc = process_file(tmp_file)
+		tmp_file.file_delete
+		return doc
+	end
+
+	# Process the `input` file and return a new NLPDocument from it.
+	fun process_file(input: String): NLPDocument do
+		# TODO opt annotators
+		var tmp_file = "{input.basename}.xml"
+		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -file {input}"
+		var doc = new NLPDocument.from_xml_file(tmp_file)
+		tmp_file.file_delete
+		return doc
+	end
+
+	# Batch mode.
+	#
+	# Returns a map of file paths associated with their NLPDocument.
+	fun process_files(inputs: Collection[String], output_dir: String): Map[String, NLPDocument] do
+		# Prepare the input file list
+		var input_file = "inputs.list"
+		var fw = new FileWriter.open(input_file)
+		for input in inputs do fw.write "{input}\n"
+		fw.close
+
+		# Run the Stanford NLP jar
+		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -filelist {input_file} -outputDirectory {output_dir}"
+
+		# Parse the output
+		var map = new HashMap[String, NLPDocument]
+		for input in inputs do
+			var out_file = output_dir / "{input.basename}.xml"
+			map[input] = new NLPDocument.from_xml_file(out_file)
+		end
+		input_file.file_delete
+		return map
+	end
+end
+
+# A `Document` represents a text input given to the NLP processor.
+#
+# Once processed, it contains a list of sentences that contain tokens.
+class NLPDocument
+
+	# NLPSentences contained in `self`
+	var sentences = new Array[NLPSentence]
+
+	# Init `self` from an XML element.
+	#
+	# ~~~
+	# var xml = """
+	# <root>
+	#   <document>
+	#     <sentences>
+	#       <sentence id="1">
+	#         <tokens>
+	#           <token id="1">
+	#             <word>Stanford</word>
+	#             <lemma>Stanford</lemma>
+	#             <CharacterOffsetBegin>0</CharacterOffsetBegin>
+	#             <CharacterOffsetEnd>8</CharacterOffsetEnd>
+	#             <POS>NNP</POS>
+	#           </token>
+	#           <token id="2">
+	#             <word>University</word>
+	#             <lemma>University</lemma>
+	#             <CharacterOffsetBegin>9</CharacterOffsetBegin>
+	#             <CharacterOffsetEnd>19</CharacterOffsetEnd>
+	#             <POS>NNP</POS>
+	#           </token>
+	#         </tokens>
+	#       </sentence>
+	#       <sentence id="2">
+	#         <tokens>
+	#           <token id="1">
+	#             <word>UQAM</word>
+	#             <lemma>UQAM</lemma>
+	#             <CharacterOffsetBegin>0</CharacterOffsetBegin>
+	#             <CharacterOffsetEnd>4</CharacterOffsetEnd>
+	#             <POS>NNP</POS>
+	#           </token>
+	#           <token id="2">
+	#             <word>University</word>
+	#             <lemma>University</lemma>
+	#             <CharacterOffsetBegin>5</CharacterOffsetBegin>
+	#             <CharacterOffsetEnd>15</CharacterOffsetEnd>
+	#             <POS>NNP</POS>
+	#           </token>
+	#         </tokens>
+	#       </sentence>
+	#     </sentences>
+	#   </document>
+	# </root>
+	# """.to_xml.as(XMLDocument)
+	#
+	# var document = new NLPDocument.from_xml(xml)
+	# assert document.sentences.length == 2
+	# assert document.sentences.first.tokens.first.word == "Stanford"
+	# assert document.sentences.last.tokens.first.word == "UQAM"
+	# ~~~
+	init from_xml(xml: XMLDocument) do
+		for obj in xml["root"].first["document"].first["sentences"].first["sentence"] do
+			if obj isa XMLStartTag then
+				sentences.add new NLPSentence.from_xml(obj)
+			else
+				print "Warning: malformed xml, `sentences` is supposed to contain `sentence` tags"
+			end
+		end
+	end
+
+	# Init `self` from an XML file.
+	init from_xml_file(path: String) do
+		var file = new FileReader.open(path)
+		var xml = file.read_lines
+		file.close
+		xml.shift # remove the XML doctype
+		xml.shift # remove the XSLT link
+		from_xml(xml.join("\n").to_xml.as(XMLDocument))
+	end
+end
+
+# Represents one sentence in a `Document`.
+class NLPSentence
+
+	# Index of this sentence in the input text.
+	var index: Int
+
+	# NLPTokens contained in `self`.
+	var tokens = new Array[NLPToken]
+
+	# Init `self` from an XML element.
+	#
+	# ~~~
+	# var xml = """
+	# <sentence id="1">
+	#   <tokens>
+	#     <token id="1">
+	#       <word>Stanford</word>
+	#       <lemma>Stanford</lemma>
+	#       <CharacterOffsetBegin>0</CharacterOffsetBegin>
+	#       <CharacterOffsetEnd>8</CharacterOffsetEnd>
+	#       <POS>NNP</POS>
+	#     </token>
+	#     <token id="2">
+	#       <word>University</word>
+	#       <lemma>University</lemma>
+	#       <CharacterOffsetBegin>9</CharacterOffsetBegin>
+	#       <CharacterOffsetEnd>19</CharacterOffsetEnd>
+	#       <POS>NNP</POS>
+	#     </token>
+	#   </tokens>
+	# </sentence>
+	# """.to_xml["sentence"].first.as(XMLStartTag)
+	#
+	# var sentence = new NLPSentence.from_xml(xml)
+	# assert sentence.index == 1
+	# assert sentence.tokens.length == 2
+	# ~~~
+	init from_xml(xml: XMLStartTag) do
+		var index = xml.attributes.first.as(XMLStringAttr).value.to_i
+		for obj in xml["tokens"].first["token"] do
+			if obj isa XMLStartTag then
+				tokens.add new NLPToken.from_xml(obj)
+			else
+				print "Warning: malformed xml, `tokens` is supposed to contain `token` tags"
+			end
+		end
+		init(index)
+	end
+end
+
+# Represents one word (or punctuation mark) in a `NLPSentence`.
+class NLPToken
+
+	# Index of this word in the sentence.
+	var index: Int
+
+	# Original word
+	var word: String
+
+	# `word` lemma
+	var lemma: String
+
+	# Position of the first character in the input
+	var begin_offset: Int
+
+	# Position of the last character in the input
+	var end_offset: Int
+
+	# Part Of Speech tag
+	var pos: String
+
+	# Init `self` from an XML element.
+	#
+	# ~~~
+	# var xml = """
+	# <token id="2">
+	#   <word>University</word>
+	#   <lemma>University</lemma>
+	#   <CharacterOffsetBegin>9</CharacterOffsetBegin>
+	#   <CharacterOffsetEnd>19</CharacterOffsetEnd>
+	#   <POS>NNP</POS>
+	# </token>
+	# """.to_xml["token"].first.as(XMLStartTag)
+	#
+	# var token = new NLPToken.from_xml(xml)
+	# assert token.index == 2
+	# assert token.word == "University"
+	# assert token.lemma == "University"
+	# assert token.begin_offset == 9
+	# assert token.end_offset == 19
+	# assert token.pos == "NNP"
+	# ~~~
+	init from_xml(xml: XMLStartTag) do
+		var index = xml.attributes.first.as(XMLStringAttr).value.to_i
+		var word = xml["word"].first.as(XMLStartTag).data
+		var lemma = xml["lemma"].first.as(XMLStartTag).data
+		var begin_offset = xml["CharacterOffsetBegin"].first.as(XMLStartTag).data.to_i
+		var end_offset = xml["CharacterOffsetEnd"].first.as(XMLStartTag).data.to_i
+		var pos = xml["POS"].first.as(XMLStartTag).data
+		init(index, word, lemma, begin_offset, end_offset, pos)
+	end
+end
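A short sketch of reloading a previously generated CoreNLP XML output without re-running the Java pipeline (hypothetical file name):

~~~nitish
var doc = new NLPDocument.from_xml_file("nlp_out/input.txt.xml")
for sentence in doc.sentences do
	print "sentence {sentence.index}: {sentence.tokens.length} tokens"
end
~~~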
diff --git a/lib/nlp/vsm.nit b/lib/nlp/vsm.nit
new file mode 100644
index 0000000..7fe0a84
--- /dev/null
+++ b/lib/nlp/vsm.nit
@@ -0,0 +1,92 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

+# Vector Space Model.
+#
+# The Vector Space Model (VSM) is used to compare natural language texts.
+# Texts are translated into multidimensional vectors, then compared by cosine
+# similarity.
+module vsm
+
+import counter
+
+# A multi-dimensional vector.
+class NLPVector
+	super Counter[String]
+
+	# Cosine similarity of `self` and `other`.
+	#
+	# Gives the proximity in the range `[0.0 .. 1.0]`, where 0.0 means that the
+	# two vectors are orthogonal and 1.0 means that they are identical.
+	#
+	# ~~~
+	# var v1 = new NLPVector
+	# v1["x"] = 1
+	# v1["y"] = 2
+	# v1["z"] = 3
+	#
+	# var v2 = new NLPVector
+	# v2["x"] = 1
+	# v2["y"] = 2
+	# v2["z"] = 3
+	#
+	# var v3 = new NLPVector
+	# v3["a"] = 1
+	# v3["b"] = 2
+	# v3["c"] = 3
+	#
+	# assert v1.cosine_similarity(v2).is_approx(1.0, 0.001)
+	# assert v1.cosine_similarity(v3) == 0.0
+	# ~~~
+	fun cosine_similarity(other: SELF): Float do
+		# Collect terms
+		var terms = new HashSet[String]
+		for k in self.keys do terms.add k
+		for k in other.keys do terms.add k
+
+		# Compute the dot product of the two vectors
+		var dot = 0
+		for term in terms do
+			dot += self.get_or_default(term, 0) * other.get_or_default(term, 0)
+		end
+
+		return dot.to_f / (self.norm * other.norm)
+	end
+
+	# The norm of the vector.
+	#
+	# `||x|| = (x1 ** 2 + x2 ** 2 + ... + xn ** 2).sqrt`
+	#
+	# ~~~
+	# var v = new NLPVector
+	# v["x"] = 1
+	# v["y"] = 1
+	# v["z"] = 1
+	# v["t"] = 1
+	# assert v.norm.is_approx(2.0, 0.001)
+	#
+	# v["x"] = 1
+	# v["y"] = 2
+	# v["z"] = 3
+	# v["t"] = 0
+	# assert v.norm.is_approx(3.742, 0.001)
+	# ~~~
+	fun norm: Float do
+		var sum = 0
+		for v in self.values do sum += v ** 2
+		return sum.to_f.sqrt
+	end
+end
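For reference, `cosine_similarity` above computes the standard quantity:

~~~latex
\cos(\theta) = \frac{\sum_i x_i y_i}{\sqrt{\sum_i x_i^2}\,\sqrt{\sum_i y_i^2}}
~~~

that is, the dot product over the collected terms divided by the product of the two norms, matching `dot.to_f / (self.norm * other.norm)` in the code.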
diff --git a/src/parser/parser.nit b/src/parser/parser.nit
index d907890..88b4136 100644
--- a/src/parser/parser.nit
+++ b/src/parser/parser.nit
@@ -10,7 +10,6 @@ redef class Parser
 	redef fun build_reduce_table
 	do
 		var reduce_table = new Array[ReduceAction].with_capacity(1091)
-		self.reduce_table = reduce_table
 		reduce_table.add new ReduceAction0(0)
 		reduce_table.add new ReduceAction1(0)
 		reduce_table.add new ReduceAction2(0)
@@ -1102,6 +1101,7 @@ redef class Parser
 		reduce_table.add new ReduceAction1088(220)
 		reduce_table.add new ReduceAction473(221)
 		reduce_table.add new ReduceAction492(221)
+		return reduce_table
 	end
 end
diff --git a/src/parser/parser_work.nit b/src/parser/parser_work.nit
index a5bef33..217a09a 100644
--- a/src/parser/parser_work.nit
+++ b/src/parser/parser_work.nit
@@ -40,7 +40,7 @@ class Parser
 	init
 	do
-		build_reduce_table
+		self.reduce_table = once build_reduce_table
 	end

 	# Do a transition in the automata
@@ -155,7 +155,7 @@ class Parser
 	end

 	private var reduce_table: Array[ReduceAction] is noinit
-	private fun build_reduce_table is abstract
+	private fun build_reduce_table: Array[ReduceAction] is abstract
 end

 redef class Prod
diff --git a/src/parser/xss/parser.xss b/src/parser/xss/parser.xss
index 8e38001..5b1d60e 100644
--- a/src/parser/xss/parser.xss
+++ b/src/parser/xss/parser.xss
@@ -21,10 +21,10 @@ redef class Parser
 	redef fun build_reduce_table
 	do
 		var reduce_table = new Array[ReduceAction].with_capacity(${count(rules/rule)})
-		self.reduce_table = reduce_table
$ foreach {rules/rule}
 		reduce_table.add new ReduceAction@index(@leftside)
$ end foreach
+		return reduce_table
 	end
 end
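The three parser hunks turn `build_reduce_table` from a procedure that assigned the table itself into a function that returns it, so the caller can memoize the result with `once`: the 1091-entry table is now built a single time, even when several `Parser` instances are created. The pattern in isolation, with hypothetical names:

~~~nitish
class Machine
	# Table shared by every instance, thanks to `once` below.
	private var table: Array[Int] is noinit

	# `once` caches the first result of `build_table` at this call site,
	# so later instances reuse the same array.
	init do table = once build_table

	# The expensive construction, now executed a single time.
	private fun build_table: Array[Int] do return [1, 2, 3]
end
~~~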