Merge: loader: print error if bad files given to `scan_full` or `parse_full`
author Jean Privat <jean@pryen.org>
Wed, 21 Oct 2015 01:10:02 +0000 (21:10 -0400)
committer Jean Privat <jean@pryen.org>
Wed, 21 Oct 2015 01:10:02 +0000 (21:10 -0400)
The last series (#1750) removed them silently.

A test is also added to avoid regression.

Pull-Request: #1773
Reviewed-by: Alexis Laferrière <alexis.laf@xymus.net>

18 files changed:
lib/core/collection/hash_collection.nit
lib/hash_debug.nit
lib/html/html.nit
lib/nlp/README.md [new file with mode: 0644]
lib/nlp/nitnlp.nit [new file with mode: 0644]
lib/nlp/nlp.nit [new file with mode: 0644]
lib/nlp/package.ini [new file with mode: 0644]
lib/nlp/stanford.nit [new file with mode: 0644]
lib/nlp/vsm.nit [new file with mode: 0644]
src/doc/doc_phases/doc_html.nit
src/doc/doc_phases/doc_phases.nit
src/doc/doc_phases/doc_test.nit [new file with mode: 0644]
src/nitdoc.nit
src/parser/parser.nit
src/parser/parser_work.nit
src/parser/xss/parser.xss
tests/nitdoc.args
tests/sav/test_hash_debug.res

index 8ee3491..890ecd9 100644 (file)
@@ -24,7 +24,7 @@ end
 private abstract class HashCollection[K]
        type N: HashNode[K]
 
-       var array: nullable NativeArray[nullable N] = null # Used to store items
+       var array: NativeArray[nullable N] is noautoinit # Used to store items
        var capacity: Int = 0 # Size of _array
        var the_length: Int = 0 # Number of items in the map
 
@@ -50,6 +50,7 @@ private abstract class HashCollection[K]
        # Return the node associated with the key
        fun node_at(k: nullable Object): nullable N
        do
+               if _the_length == 0 then return null
                # cache: `is` is used instead of `==` because it is a faster filter (even if not exact)
                if k.is_same_instance(_last_accessed_key) then return _last_accessed_node
 
@@ -62,6 +63,7 @@ private abstract class HashCollection[K]
        # Return the node associated with the key (but with the index already known)
        fun node_at_idx(i: Int, k: nullable Object): nullable N
        do
+               if _the_length == 0 then return null
                var c = _array[i]
                while c != null do
                        var ck = c._key
@@ -111,6 +113,7 @@ private abstract class HashCollection[K]
        # Remove the node assosiated with the key
        fun remove_node(k: nullable Object)
        do
+               if _the_length == 0 then return
                var i = index_at(k)
                var node = node_at_idx(i, k)
                if node == null then return
@@ -162,7 +165,6 @@ private abstract class HashCollection[K]
        # Force a capacity
        fun enlarge(cap: Int)
        do
-               var old_cap = _capacity
                # get a new capacity
                if cap < _the_length + 1 then cap = _the_length + 1
                if cap <= _capacity then return
@@ -173,15 +175,6 @@ private abstract class HashCollection[K]
                var new_array = new NativeArray[nullable N](cap)
                _array = new_array
 
-               # clean the new array
-               var i = cap - 1
-               while i >=0 do
-                       new_array[i] = null
-                       i -= 1
-               end
-
-               if _capacity <= old_cap then return
-
                # Reput items in the array
                var node = _first_item
                while node != null do
@@ -253,6 +246,7 @@ class HashMap[K, V]
 
        redef fun []=(key, v)
        do
+               if _capacity == 0 then enlarge(17) # 17 because magic in `store`
                var i = index_at(key)
                var c = node_at_idx(i, key)
                if c != null then
@@ -269,7 +263,6 @@ class HashMap[K, V]
        do
                _capacity = 0
                _the_length = 0
-               enlarge(0)
        end
 
        redef var keys: RemovableCollection[K] = new HashMapKeys[K, V](self) is lazy
@@ -442,6 +435,7 @@ class HashSet[E]
 
        redef fun add(item)
        do
+               if _capacity == 0 then enlarge(17) # 17 because magic in `store`
                var i = index_at(item)
                var c = node_at_idx(i, item)
                if c != null then
@@ -461,7 +455,6 @@ class HashSet[E]
        do
                _capacity = 0
                _the_length = 0
-               enlarge(0)
        end
 
        # Build a list filled with the items of `coll`.
index 75d4160..5caf83d 100644 (file)
@@ -55,6 +55,13 @@ redef class Sys
        # Total capacity of hash collections receiver `HashCollection::store`
        var st_tot_cap = 0
 
+       # Number of calls of `HashCollection::enlarge`
+       var en_count = 0
+       # Total length of hash collections receiver of `HashCollection::enlarge`
+       var en_tot_length = 0
+       # Total capacity of hash collections receiver of `HashCollection::enlarge`
+       var en_tot_cap = 0
+
        private fun div(n,d: Int): String
        do
                if d == 0 then return "NA"
@@ -90,6 +97,11 @@ number of collisions: {{{st_coll}}} ({{{div(st_coll*100,st_count)}}}%)
 average length of collisions: {{{div(st_tot_coll,st_coll)}}}
 average length of considered collections: {{{div(st_tot_length,sys.st_count)}}}
 average capacity or considered collections: {{{div(st_tot_cap,sys.st_count)}}} ({{{div(st_tot_cap*100,st_tot_length)}}}%)
+
+ENLARGE:
+number of enlarge: {{{en_count}}}
+average length of considered collections: {{{div(en_tot_length,sys.en_count)}}}
+average capacity or considered collections: {{{div(en_tot_cap,sys.en_count)}}} ({{{div(en_tot_cap*100,en_tot_length)}}}%)
 ~~~~~~"""
        end
 
@@ -133,6 +145,14 @@ redef class HashCollection[K]
                return super
        end
 
+       redef fun enlarge(c)
+       do
+               super
+               sys.en_count += 1
+               sys.en_tot_length += _the_length
+               sys.en_tot_cap += _capacity
+       end
+
        # Count and update length of collisions for `node_at_idx`
        # Note for dynamic call-graph analysis: callers of this functions are
        # responsible of collisions.
index d3a92eb..72284ef 100644 (file)
@@ -107,7 +107,12 @@ class HTMLTag
        # `"div"` for `<div></div>`.
        var tag: String
        init do
-               self.is_void = (once ["area", "base", "br", "col", "command", "embed", "hr", "img", "input", "keygen", "link", "meta", "param", "source", "track", "wbr"]).has(tag)
+               self.is_void = (once void_list).has(tag)
+       end
+
+       private fun void_list: Set[String]
+       do
+               return new HashSet[String].from(["area", "base", "br", "col", "command", "embed", "hr", "img", "input", "keygen", "link", "meta", "param", "source", "track", "wbr"])
        end
 
        # Is the HTML element a void element?
diff --git a/lib/nlp/README.md b/lib/nlp/README.md
new file mode 100644 (file)
index 0000000..6545537
--- /dev/null
@@ -0,0 +1,89 @@
+# Nit wrapper for Stanford CoreNLP
+
+Stanford CoreNLP provides a set of natural language analysis tools which can take
+raw text input and give the base forms of words, their parts of speech, whether
+they are names of companies, people, etc., normalize dates, times, and numeric
+quantities, and mark up the structure of sentences in terms of phrases and word
+dependencies, indicate which noun phrases refer to the same entities, indicate
+sentiment, etc.
+
+This wrapper needs the Stanford CoreNLP jars that run on Java 1.8+.
+
+See http://nlp.stanford.edu/software/corenlp.shtml.
+
+## Usage
+
+~~~nitish
+var proc = new NLPProcessor("path/to/StanfordCoreNLP/jars")
+
+var doc = proc.process("String to analyze")
+
+for sentence in doc.sentences do
+       for token in sentence.tokens do
+               print "{token.lemma}: {token.pos}"
+       end
+end
+~~~
+
+## Nit API
+
+For ease of use, this wrapper introduces a Nit model to handle CoreNLP XML results.
+
+### NLPDocument
+
+[[doc: NLPDocument]]
+
+[[doc: NLPDocument::from_xml]]
+[[doc: NLPDocument::from_xml_file]]
+[[doc: NLPDocument::sentences]]
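+
+A minimal sketch (the file name `doc.xml` is illustrative and assumed to be an XML
+output produced by CoreNLP):
+
+~~~nitish
+var doc = new NLPDocument.from_xml_file("doc.xml")
+print "{doc.sentences.length} sentences"
+~~~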
+
+### NLPSentence
+
+[[doc: NLPSentence]]
+
+[[doc: NLPSentence::tokens]]
+
+### NLPToken
+
+[[doc: NLPToken]]
+
+[[doc: NLPToken::word]]
+[[doc: NLPToken::lemma]]
+[[doc: NLPToken::pos]]
+
+### NLP Processor
+
+[[doc: NLPProcessor]]
+
+[[doc: NLPProcessor::java_cp]]
+
+[[doc: NLPProcessor::process]]
+[[doc: NLPProcessor::process_file]]
+[[doc: NLPProcessor::process_files]]
+
+## Vector Space Model
+
+[[doc: NLPVector]]
+
+[[doc: NLPDocument::vector]]
+
+[[doc: NLPVector::cosine_similarity]]
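+
+A minimal sketch of a document comparison, assuming the CoreNLP jars are in
+`path/to/jars` (paths and input strings are illustrative):
+
+~~~nitish
+var proc = new NLPProcessor("path/to/jars")
+var doc1 = proc.process("Sorting arrays")
+var doc2 = proc.process("Array sorting algorithms")
+print doc1.vector.cosine_similarity(doc2.vector)
+~~~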
+
+## NitNLP binary
+
+The `nitnlp` binary is given as an example of a NitNLP client.
+It compares two strings and displays their cosine similarity value.
+
+Usage:
+
+~~~raw
+nitnlp --cp "/path/to/jars" "sort" "Sorting array data"
+0.577
+~~~
+
+## TODO
+
+* Use JWrapper
+* Use options to choose CoreNLP analyzers
+* Analyze sentences dependencies
+* Analyze sentiment
diff --git a/lib/nlp/nitnlp.nit b/lib/nlp/nitnlp.nit
new file mode 100644 (file)
index 0000000..72946bc
--- /dev/null
@@ -0,0 +1,49 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Natural Language Processor based on the StanfordNLP core.
+#
+# This tool provides a document comparison service from the command line based on
+# StanfordNLP and NLPVector cosine similarity.
+#
+# See http://nlp.stanford.edu/software/corenlp.shtml.
+module nitnlp
+
+import opts
+import nlp
+
+# Option management
+var opt_java_cp = new OptionString("Java classpath for StanfordNLP jars", "--cp")
+var options = new OptionContext
+options.add_option(opt_java_cp)
+options.parse(args)
+var arguments = options.rest
+
+# Processor initialization
+var java_cp = opt_java_cp.value
+if java_cp == null then java_cp = "*"
+var proc = new NLPProcessor(java_cp)
+
+if arguments.length != 2 then
+       print "Usage: nitnlp text1 text2\n"
+       options.usage
+       sys.exit 1
+end
+
+var doc1 = proc.process(arguments.first)
+print doc1.vector.join(":", ",")
+var doc2 = proc.process(arguments.last)
+print doc2.vector.join(":", ",")
+
+print doc1.vector.cosine_similarity(doc2.vector)
diff --git a/lib/nlp/nlp.nit b/lib/nlp/nlp.nit
new file mode 100644 (file)
index 0000000..4dd7cc9
--- /dev/null
@@ -0,0 +1,71 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Natural Language Processor based on the StanfordNLP core.
+#
+# See http://nlp.stanford.edu/software/corenlp.shtml.
+module nlp
+
+import stanford
+import vsm
+
+redef class NLPDocument
+
+       # `NLPVector` representing `self`.
+       var vector: NLPVector is lazy do
+               var vector = new NLPVector
+               for sentence in sentences do
+                       for token in sentence.tokens do
+                               if not keep_pos_token(token) then continue
+                               var lemma = token.lemma
+                               if lemma_black_list.has(lemma) then continue
+                               if not vector.has_key(lemma) then
+                                       vector[lemma] = 1
+                               else
+                                       vector[lemma] += 1
+                               end
+                       end
+               end
+               return vector
+       end
+
+       # Should we keep `token` when composing the vector?
+       #
+       # Choice is based on the POS tag of the token.
+       # See `allowed_pos_prefixes`.
+       private fun keep_pos_token(token: NLPToken): Bool do
+               var pos = token.pos
+               for prefix in allowed_pos_prefixes do
+                       if pos.has_prefix(prefix) then return true
+               end
+               return false
+       end
+
+       # Should we keep `lemma` when composing the vector?
+       #
+       # See `lemma_black_list`.
+       private fun keep_lemma(lemma: String): Bool do
+               return true
+       end
+
+       # Allowed POS tag prefixes.
+       #
+       # When building a vector from `self`, only tokens tagged with one of these
+       # prefixes are kept.
+       # Other tokens are ignored.
+       var allowed_pos_prefixes: Array[String] = ["NN", "VB", "RB"] is writable
+
+       # Ignored lemmas.
+       var lemma_black_list: Array[String] = ["module", "class", "method"] is writable
+end
diff --git a/lib/nlp/package.ini b/lib/nlp/package.ini
new file mode 100644 (file)
index 0000000..789aa44
--- /dev/null
@@ -0,0 +1,11 @@
+[package]
+name=nlp
+tags=nlp,lib
+maintainer=Alexandre Terrasa <alexandre@moz-code.org>
+license=Apache-2.0
+[upstream]
+browse=https://github.com/nitlang/nit/tree/master/lib/nlp/
+git=https://github.com/nitlang/nit.git
+git.directory=lib/nlp/
+homepage=http://nitlanguage.org
+issues=https://github.com/nitlang/nit/issues
diff --git a/lib/nlp/stanford.nit b/lib/nlp/stanford.nit
new file mode 100644 (file)
index 0000000..734a228
--- /dev/null
@@ -0,0 +1,258 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Natural Language Processor based on the StanfordNLP core.
+#
+# See http://nlp.stanford.edu/software/corenlp.shtml.
+module stanford
+
+import opts
+import dom
+
+# Wrapper around StanfordNLP jar.
+#
+# NLPProcessor provides natural language processing of input text files and
+# an API to handle analysis results.
+#
+# FIXME this should use the Java FFI.
+class NLPProcessor
+
+       # Classpath to give to Java when loading the StanfordNLP jars.
+       var java_cp: String
+
+       # Process a string and return a new NLPDocument from this.
+       fun process(string: String): NLPDocument do
+               var tmp_file = ".nlp.in"
+               var file = new FileWriter.open(tmp_file)
+               file.write string
+               file.close
+               var doc = process_file(tmp_file)
+               tmp_file.file_delete
+               return doc
+       end
+
+       # Process the `input` file and return a new NLPDocument from this.
+       fun process_file(input: String): NLPDocument do
+               # TODO opt annotators
+               var tmp_file = "{input.basename}.xml"
+               sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -file {input}"
+               var doc = new NLPDocument.from_xml_file(tmp_file)
+               tmp_file.file_delete
+               return doc
+       end
+
+       # Batch mode.
+       #
+       # Returns a map associating each input file path with its NLPDocument.
+       fun process_files(inputs: Collection[String], output_dir: String): Map[String, NLPDocument] do
+               # Prepare the input file list
+               var input_file = "inputs.list"
+               var fw = new FileWriter.open(input_file)
+               for input in inputs do fw.write "{input}\n"
+               fw.close
+
+               # Run Stanford NLP jar
+               sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -filelist {input_file} -outputDirectory {output_dir}"
+               # Parse output
+               var map = new HashMap[String, NLPDocument]
+               for input in inputs do
+                       var out_file = output_dir / "{input.basename}.xml"
+                       map[input] = new NLPDocument.from_xml_file(out_file)
+               end
+               input_file.file_delete
+               return map
+       end
+end
+
+# A `Document` represents a text input given to the NLP processor.
+#
+# Once processed, it contains a list of sentences that contain tokens.
+class NLPDocument
+
+       #  NLPSentences contained in `self`
+       var sentences = new Array[NLPSentence]
+
+       # Init `self` from an xml element.
+       #
+       # ~~~
+       # var xml = """
+       # <root>
+       #   <document>
+       #     <sentences>
+       #       <sentence id="1">
+       #         <tokens>
+       #           <token id="1">
+       #             <word>Stanford</word>
+       #             <lemma>Stanford</lemma>
+       #             <CharacterOffsetBegin>0</CharacterOffsetBegin>
+       #             <CharacterOffsetEnd>8</CharacterOffsetEnd>
+       #             <POS>NNP</POS>
+       #           </token>
+       #           <token id="2">
+       #             <word>University</word>
+       #             <lemma>University</lemma>
+       #             <CharacterOffsetBegin>9</CharacterOffsetBegin>
+       #             <CharacterOffsetEnd>19</CharacterOffsetEnd>
+       #             <POS>NNP</POS>
+       #           </token>
+       #         </tokens>
+       #       </sentence>
+       #       <sentence id="2">
+       #         <tokens>
+       #           <token id="1">
+       #             <word>UQAM</word>
+       #             <lemma>UQAM</lemma>
+       #             <CharacterOffsetBegin>0</CharacterOffsetBegin>
+       #             <CharacterOffsetEnd>4</CharacterOffsetEnd>
+       #             <POS>NNP</POS>
+       #           </token>
+       #           <token id="2">
+       #             <word>University</word>
+       #             <lemma>University</lemma>
+       #             <CharacterOffsetBegin>5</CharacterOffsetBegin>
+       #             <CharacterOffsetEnd>15</CharacterOffsetEnd>
+       #             <POS>NNP</POS>
+       #           </token>
+       #         </tokens>
+       #       </sentence>
+       #     </sentences>
+       #   </document>
+       # </root>""".to_xml.as(XMLDocument)
+       #
+       # var document = new NLPDocument.from_xml(xml)
+       # assert document.sentences.length == 2
+       # assert document.sentences.first.tokens.first.word == "Stanford"
+       # assert document.sentences.last.tokens.first.word == "UQAM"
+       # ~~~
+       init from_xml(xml: XMLDocument) do
+               for obj in xml["root"].first["document"].first["sentences"].first["sentence"] do
+                       if obj isa XMLStartTag then
+                               sentences.add new NLPSentence.from_xml(obj)
+                       else
+                               print "Warning: malformed xml, `sentences` is supposed to contain `sentence` tags"
+                       end
+               end
+       end
+
+       # Init `self` from a XML file.
+       init from_xml_file(path: String) do
+               var file = new FileReader.open(path)
+               var xml = file.read_lines
+               file.close
+               xml.shift # remove xml doctype
+               xml.shift # remove xslt link
+               from_xml(xml.join("\n").to_xml.as(XMLDocument))
+       end
+end
+
+# Represents one sentence in a `Document`.
+class NLPSentence
+
+       # Index of this sentence in the input text.
+       var index: Int
+
+       #  NLPTokens contained in `self`.
+       var tokens = new Array[NLPToken]
+
+       # Init `self` from an XML element.
+       #
+       # ~~~
+       # var xml = """
+       # <sentence id="1">
+       #   <tokens>
+       #     <token id="1">
+       #       <word>Stanford</word>
+       #       <lemma>Stanford</lemma>
+       #       <CharacterOffsetBegin>0</CharacterOffsetBegin>
+       #       <CharacterOffsetEnd>8</CharacterOffsetEnd>
+       #       <POS>NNP</POS>
+       #     </token>
+       #     <token id="2">
+       #       <word>University</word>
+       #       <lemma>University</lemma>
+       #       <CharacterOffsetBegin>9</CharacterOffsetBegin>
+       #       <CharacterOffsetEnd>19</CharacterOffsetEnd>
+       #       <POS>NNP</POS>
+       #     </token>
+       #   </tokens>
+       # </sentence>""".to_xml["sentence"].first.as(XMLStartTag)
+       #
+       # var sentence = new  NLPSentence.from_xml(xml)
+       # assert sentence.index == 1
+       # assert sentence.tokens.length == 2
+       # ~~~
+       init from_xml(xml: XMLStartTag) do
+               var index = xml.attributes.first.as(XMLStringAttr).value.to_i
+               for obj in xml["tokens"].first["token"] do
+                       if obj isa XMLStartTag then
+                               tokens.add new NLPToken.from_xml(obj)
+                       else
+                               print "Warning: malformed xml, `tokens` is supposed to contain `token` tags"
+                       end
+               end
+               init(index)
+       end
+end
+
+# Represents one word (or punctuation mark) in a `NLPSentence`.
+class NLPToken
+
+       # Index of this word in the sentence.
+       var index: Int
+
+       # Original word
+       var word: String
+
+       # `word` lemma
+       var lemma: String
+
+       # Position of the first character in the input
+       var begin_offset: Int
+
+       # Position of the last character in the input
+       var end_offset: Int
+
+       # Part Of Speech tag
+       var pos: String
+
+       # Init `self` from an XML element.
+       #
+       # ~~~
+       # var xml = """
+       #  <token id="2">
+       #       <word>University</word>
+       #       <lemma>University</lemma>
+       #       <CharacterOffsetBegin>9</CharacterOffsetBegin>
+       #       <CharacterOffsetEnd>19</CharacterOffsetEnd>
+       #       <POS>NNP</POS>
+       #  </token>""".to_xml["token"].first.as(XMLStartTag)
+       #
+       # var token = new  NLPToken.from_xml(xml)
+       # assert token.index == 2
+       # assert token.word == "University"
+       # assert token.lemma == "University"
+       # assert token.begin_offset == 9
+       # assert token.end_offset == 19
+       # assert token.pos == "NNP"
+       # ~~~
+       init from_xml(xml: XMLStartTag) do
+               var index = xml.attributes.first.as(XMLStringAttr).value.to_i
+               var word = xml["word"].first.as(XMLStartTag).data
+               var lemma = xml["lemma"].first.as(XMLStartTag).data
+               var begin_offset = xml["CharacterOffsetBegin"].first.as(XMLStartTag).data.to_i
+               var end_offset = xml["CharacterOffsetEnd"].first.as(XMLStartTag).data.to_i
+               var pos = xml["POS"].first.as(XMLStartTag).data
+               init(index, word, lemma, begin_offset, end_offset, pos)
+       end
+end
diff --git a/lib/nlp/vsm.nit b/lib/nlp/vsm.nit
new file mode 100644 (file)
index 0000000..7fe0a84
--- /dev/null
@@ -0,0 +1,92 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Vector Space Model.
+#
+# The Vector Space Model (VSM) is used to compare natural language texts.
+# Texts are translated to multidimensional vectors and then compared by cosine
+# similarity.
+module vsm
+
+import counter
+
+# A multi-dimensional vector.
+class NLPVector
+       super Counter[String]
+
+       # Cosine similarity of `self` and `other`.
+       #
+       # Gives the proximity in the range `[0.0 .. 1.0]` where 0.0 means that the
+       # two vectors are orthogonal and 1.0 means that they are identical.
+       #
+       # ~~~
+       # var v1 = new NLPVector
+       # v1["x"] = 1
+       # v1["y"] = 2
+       # v1["z"] = 3
+       #
+       # var v2 = new NLPVector
+       # v2["x"] = 1
+       # v2["y"] = 2
+       # v2["z"] = 3
+       #
+       # var v3 = new NLPVector
+       # v3["a"] = 1
+       # v3["b"] = 2
+       # v3["c"] = 3
+       #
+       # assert v1.cosine_similarity(v2).is_approx(1.0, 0.001)
+       # assert v1.cosine_similarity(v3) == 0.0
+       # ~~~
+       fun cosine_similarity(other: SELF): Float do
+               # Collect terms
+               var terms = new HashSet[String]
+               for k in self.keys do terms.add k
+               for k in other.keys do terms.add k
+
+               # Get the dot product of the two vectors
+               var dot = 0
+               for term in terms do
+                       dot += self.get_or_default(term, 0) * other.get_or_default(term, 0)
+               end
+
+               return dot.to_f / (self.norm * other.norm)
+       end
+
+       # The norm of the vector.
+       #
+       # `||x|| = (x1 ** 2 + ... + xn ** 2).sqrt`
+       #
+       # ~~~
+       # var v = new NLPVector
+       # v["x"] = 1
+       # v["y"] = 1
+       # v["z"] = 1
+       # v["t"] = 1
+       # assert v.norm.is_approx(2.0, 0.001)
+       #
+       # v["x"] = 1
+       # v["y"] = 2
+       # v["z"] = 3
+       # v["t"] = 0
+       # assert v.norm.is_approx(3.742, 0.001)
+       # ~~~
+       fun norm: Float do
+               var sum = 0
+               for v in self.values do sum += v ** 2
+               return sum.to_f.sqrt
+       end
+end
index ebc1193..30a15c2 100644 (file)
@@ -70,6 +70,9 @@ redef class ToolContext
        # FIXME redo the plugin
        var opt_github_gitdir = new OptionString("Git working directory used to resolve path name (ex: /home/me/mypackage/)", "--github-gitdir")
 
+       # Do not produce HTML files
+       var opt_no_render = new OptionBool("do not render HTML files", "--no-render")
+
        redef init do
                super
 
@@ -77,7 +80,8 @@ redef class ToolContext
                        opt_source, opt_sharedir, opt_shareurl, opt_custom_title,
                        opt_custom_footer, opt_custom_intro, opt_custom_brand,
                        opt_github_upstream, opt_github_base_sha1, opt_github_gitdir,
-                       opt_piwik_tracker, opt_piwik_site_id)
+                       opt_piwik_tracker, opt_piwik_site_id,
+                       opt_no_render)
        end
 
        redef fun process_options(args) do
@@ -103,6 +107,7 @@ class RenderHTMLPhase
        var name_sorter = new MEntityNameSorter
 
        redef fun apply do
+               if ctx.opt_no_render.value then return
                init_output_dir
                for page in doc.pages.values do
                        page.render(self, doc).write_to_file("{ctx.output_dir.to_s}/{page.html_url}")
index fefd1ac..24969e4 100644 (file)
@@ -19,3 +19,4 @@ module doc_phases
 
 import doc_html
 import doc_indexing
+import doc_test
diff --git a/src/doc/doc_phases/doc_test.nit b/src/doc/doc_phases/doc_test.nit
new file mode 100644 (file)
index 0000000..ddb5eb7
--- /dev/null
@@ -0,0 +1,59 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Print the generated DocModel to stdout.
+#
+# Mainly used for tests.
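+#
+# A possible invocation, using the options added in this change (the module name
+# and output directory are illustrative):
+#
+# ~~~raw
+# nitdoc --no-render --test my_module.nit -d out/
+# ~~~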
+module doc_test
+
+import doc_structure
+import counter
+
+redef class ToolContext
+
+       # Print test data instead of rendering HTML pages.
+       var opt_test = new OptionBool("print test data", "--test")
+
+       redef init do
+               super
+               option_context.add_option(opt_test)
+       end
+end
+
+# Display the DocModel on stdout.
+class DocTestPhase
+       super DocPhase
+
+       redef fun apply do
+               if not ctx.opt_test.value then return
+               # Pages metrics
+               var page_counter = new Counter[String]
+               var pages = doc.pages.keys.to_a
+               default_comparator.sort(pages)
+               for title in pages do
+                       var page = doc.pages[title]
+                       page_counter.inc page.class_name
+                       print page.pretty_print.write_to_string
+               end
+               print "Generated {doc.pages.length} pages"
+               page_counter.print_elements(100)
+               # Model metrics
+               var model_counter = new Counter[String]
+               for mentity in doc.mentities do
+                       model_counter.inc mentity.class_name
+               end
+               print "Found {doc.mentities.length} mentities"
+               model_counter.print_elements(100)
+       end
+end
index bda79e7..2591863 100644 (file)
@@ -19,19 +19,12 @@ module nitdoc
 
 import modelbuilder
 import doc
-import counter
 
 redef class ToolContext
        # Nitdoc generation phase.
        var docphase: Phase = new Nitdoc(self, null)
 
-       # File pattern used to link documentation to source code.
-       var opt_test = new OptionBool("do not render anything, only print test data", "--test")
-
-       redef init do
-               super
-               option_context.add_option(opt_test)
-       end
+       init do super # to fix ambiguous linearization
 end
 
 # Nitdoc phase explores the model and generate pages for each mentities found
@@ -52,37 +45,14 @@ private class Nitdoc
                        new IntroRedefListPhase(toolcontext, doc),
                        new LinListPhase(toolcontext, doc),
                        new GraphPhase(toolcontext, doc),
-                       new ReadmePhase(toolcontext, doc): DocPhase]
-
-               if not toolcontext.opt_test.value then
-                       phases.add new RenderHTMLPhase(toolcontext, doc)
-               end
+                       new ReadmePhase(toolcontext, doc),
+                       new RenderHTMLPhase(toolcontext, doc),
+                       new DocTestPhase(toolcontext, doc): DocPhase]
 
                for phase in phases do
                        toolcontext.info("# {phase.class_name}", 1)
                        phase.apply
                end
-
-               if toolcontext.opt_test.value then
-                       # Pages metrics
-                       var page_counter = new Counter[String]
-                       var pages = doc.pages.keys.to_a
-                       default_comparator.sort(pages)
-                       for title in pages do
-                               var page = doc.pages[title]
-                               page_counter.inc page.class_name
-                               print page.pretty_print.write_to_string
-                       end
-                       print "Generated {doc.pages.length} pages"
-                       page_counter.print_elements(100)
-                       # Model metrics
-                       var model_counter = new Counter[String]
-                       for mentity in doc.mentities do
-                               model_counter.inc mentity.class_name
-                       end
-                       print "Found {doc.mentities.length} mentities"
-                       model_counter.print_elements(100)
-               end
        end
 end
 
index d907890..88b4136 100644 (file)
@@ -10,7 +10,6 @@ redef class Parser
        redef fun build_reduce_table
        do
                var reduce_table = new Array[ReduceAction].with_capacity(1091)
-               self.reduce_table = reduce_table
                reduce_table.add new ReduceAction0(0)
                reduce_table.add new ReduceAction1(0)
                reduce_table.add new ReduceAction2(0)
@@ -1102,6 +1101,7 @@ redef class Parser
                reduce_table.add new ReduceAction1088(220)
                reduce_table.add new ReduceAction473(221)
                reduce_table.add new ReduceAction492(221)
+               return reduce_table
        end
 end
 
index a5bef33..217a09a 100644 (file)
@@ -40,7 +40,7 @@ class Parser
 
        init
        do
-               build_reduce_table
+               self.reduce_table = once build_reduce_table
        end
 
        # Do a transition in the automata
@@ -155,7 +155,7 @@ class Parser
        end
 
        private var reduce_table: Array[ReduceAction] is noinit
-       private fun build_reduce_table is abstract
+       private fun build_reduce_table: Array[ReduceAction] is abstract
 end
 
 redef class Prod
index 8e38001..5b1d60e 100644 (file)
@@ -21,10 +21,10 @@ redef class Parser
        redef fun build_reduce_table
        do
                var reduce_table = new Array[ReduceAction].with_capacity(${count(rules/rule)})
-               self.reduce_table = reduce_table
 $ foreach {rules/rule}
                reduce_table.add new ReduceAction@index(@leftside)
 $ end foreach
+               return reduce_table
        end
 end
 
index de4c3e4..ee5fd03 100644 (file)
@@ -1,4 +1,4 @@
 module_1.nit -d $WRITE
 base_attr_nullable.nit -d $WRITE
 --private base_attr_nullable.nit -d $WRITE
---test test_prog -d $WRITE
+--no-render --test test_prog -d $WRITE
index b52a9c5..c9b5f48 100644 (file)
 
 a1
 false
+~~~No hash statistics~~~
 ~~~Hash statistics~~~
 GET:
 number of get and has_key: 1
 number of collisions: 0 (0.00%)
 average length of collisions: NA
 average length of considered collections: 0.00
-average capacity of considered collections: 1.00 (NA%)
+average capacity of considered collections: 17.00 (NA%)
 
 STORE:
-number of stores: 0
-number of collisions: 0 (NA%)
-average length of collisions: NA
-average length of considered collections: NA
-average capacity or considered collections: NA (NA%)
-~~~~~~
-~~~Hash statistics~~~
-GET:
-number of get and has_key: 2
+number of stores: 1
 number of collisions: 0 (0.00%)
 average length of collisions: NA
 average length of considered collections: 0.00
-average capacity of considered collections: 1.00 (NA%)
+average capacity or considered collections: 17.00 (NA%)
 
-STORE:
-number of stores: 1
-number of collisions: 0 (0.00%)
-average length of collisions: NA
+ENLARGE:
+number of enlarge: 1
 average length of considered collections: 0.00
-average capacity or considered collections: 1.00 (NA%)
+average capacity or considered collections: 17.00 (NA%)
 ~~~~~~
 ~~~Hash statistics~~~
 GET:
-number of get and has_key: 3
+number of get and has_key: 2
 number of collisions: 0 (0.00%)
 average length of collisions: NA
-average length of considered collections: 0.33
-average capacity of considered collections: 6.33 (1900.00%)
+average length of considered collections: 0.50
+average capacity of considered collections: 17.00 (3400.00%)
 
 STORE:
 number of stores: 1
 number of collisions: 0 (0.00%)
 average length of collisions: NA
 average length of considered collections: 0.00
-average capacity or considered collections: 1.00 (NA%)
+average capacity or considered collections: 17.00 (NA%)
+
+ENLARGE:
+number of enlarge: 1
+average length of considered collections: 0.00
+average capacity or considered collections: 17.00 (NA%)
 ~~~~~~
 true
 ~~~Hash statistics~~~
 GET:
-number of get and has_key: 4
+number of get and has_key: 2
 number of collisions: 0 (0.00%)
 average length of collisions: NA
 average length of considered collections: 0.50
-average capacity of considered collections: 9.00 (1800.00%)
+average capacity of considered collections: 17.00 (3400.00%)
 
 STORE:
 number of stores: 1
 number of collisions: 0 (0.00%)
 average length of collisions: NA
 average length of considered collections: 0.00
-average capacity or considered collections: 1.00 (NA%)
+average capacity or considered collections: 17.00 (NA%)
+
+ENLARGE:
+number of enlarge: 1
+average length of considered collections: 0.00
+average capacity or considered collections: 17.00 (NA%)
 ~~~~~~
 
 a2
 false
 ~~~Hash statistics~~~
 GET:
-number of get and has_key: 5
+number of get and has_key: 3
 number of collisions: 0 (0.00%)
 average length of collisions: NA
-average length of considered collections: 0.60
-average capacity of considered collections: 10.60 (1766.67%)
+average length of considered collections: 0.67
+average capacity of considered collections: 17.00 (2550.00%)
 
 STORE:
 number of stores: 1
 number of collisions: 0 (0.00%)
 average length of collisions: NA
 average length of considered collections: 0.00
-average capacity or considered collections: 1.00 (NA%)
+average capacity or considered collections: 17.00 (NA%)
+
+ENLARGE:
+number of enlarge: 1
+average length of considered collections: 0.00
+average capacity or considered collections: 17.00 (NA%)
 ~~~~~~
 ~~~Hash statistics~~~
 GET:
-number of get and has_key: 6
+number of get and has_key: 4
 number of collisions: 0 (0.00%)
 average length of collisions: NA
-average length of considered collections: 0.67
-average capacity of considered collections: 11.67 (1750.00%)
+average length of considered collections: 0.75
+average capacity of considered collections: 17.00 (2266.67%)
 
 STORE:
 number of stores: 2
 number of collisions: 1 (50.00%)
 average length of collisions: 2.00
 average length of considered collections: 0.50
-average capacity or considered collections: 9.00 (1800.00%)
+average capacity or considered collections: 17.00 (3400.00%)
+
+ENLARGE:
+number of enlarge: 1
+average length of considered collections: 0.00
+average capacity or considered collections: 17.00 (NA%)
 ~~~~~~
 ~~~Hash statistics~~~
 GET:
-number of get and has_key: 7
-number of collisions: 1 (14.29%)
+number of get and has_key: 5
+number of collisions: 1 (20.00%)
 average length of collisions: 2.00
-average length of considered collections: 0.86
-average capacity of considered collections: 12.43 (1450.00%)
+average length of considered collections: 1.00
+average capacity of considered collections: 17.00 (1700.00%)
 
 STORE:
 number of stores: 2
 number of collisions: 1 (50.00%)
 average length of collisions: 2.00
 average length of considered collections: 0.50
-average capacity or considered collections: 9.00 (1800.00%)
+average capacity or considered collections: 17.00 (3400.00%)
+
+ENLARGE:
+number of enlarge: 1
+average length of considered collections: 0.00
+average capacity or considered collections: 17.00 (NA%)
 ~~~~~~
 true
 ~~~Hash statistics~~~
 GET:
-number of get and has_key: 7
-number of collisions: 1 (14.29%)
+number of get and has_key: 5
+number of collisions: 1 (20.00%)
 average length of collisions: 2.00
-average length of considered collections: 0.86
-average capacity of considered collections: 12.43 (1450.00%)
+average length of considered collections: 1.00
+average capacity of considered collections: 17.00 (1700.00%)
 
 STORE:
 number of stores: 2
 number of collisions: 1 (50.00%)
 average length of collisions: 2.00
 average length of considered collections: 0.50
-average capacity or considered collections: 9.00 (1800.00%)
+average capacity or considered collections: 17.00 (3400.00%)
+
+ENLARGE:
+number of enlarge: 1
+average length of considered collections: 0.00
+average capacity or considered collections: 17.00 (NA%)
 ~~~~~~
 
 end
 ~~~Hash statistics~~~
 GET:
-number of get and has_key: 7
-number of collisions: 1 (14.29%)
+number of get and has_key: 5
+number of collisions: 1 (20.00%)
 average length of collisions: 2.00
-average length of considered collections: 0.86
-average capacity of considered collections: 12.43 (1450.00%)
+average length of considered collections: 1.00
+average capacity of considered collections: 17.00 (1700.00%)
 
 STORE:
 number of stores: 2
 number of collisions: 1 (50.00%)
 average length of collisions: 2.00
 average length of considered collections: 0.50
-average capacity or considered collections: 9.00 (1800.00%)
+average capacity or considered collections: 17.00 (3400.00%)
+
+ENLARGE:
+number of enlarge: 1
+average length of considered collections: 0.00
+average capacity or considered collections: 17.00 (NA%)
 ~~~~~~