nlp :: NLPDocument
Document
represents a text input given to the NLP processor. Once processed, it contains a list of sentences that contain tokens.
nlp :: NLPDocument :: defaultinit
nlp :: NLPDocument :: sentences=
NLPSentences contained in self
nlp $ NLPDocument :: SELF
Type of this instance, automatically specialized in every class
core :: Object :: class_factory
Implementation used by get_class
to create the specific class.
nlp :: NLPDocument :: defaultinit
core :: Object :: defaultinit
core :: Object :: is_same_instance
Return true if self
and other
are the same instance (i.e. same identity).
core :: Object :: is_same_serialized
Is self
the same as other
in a serialization context?
core :: Object :: is_same_type
Return true if self
and other
have the same dynamic type.
core :: Object :: output_class_name
Display class name on stdout (debug only).
nlp :: NLPDocument :: sentences=
NLPSentences contained in self
# A `Document` represents a text input given to the NLP processor.
#
# Once processed, it contains a list of sentences that contain tokens.
class NLPDocument

	# NLPSentences contained in `self`
	var sentences = new Array[NLPSentence]

	# Init `self` from an xml element.
	#
	# Each item found under `root/document/sentences/sentence` that is a
	# `XMLStartTag` is turned into a `NLPSentence` and added to `sentences`.
	# Any other item is skipped and a warning is printed instead.
	#
	# ~~~
	# var xml = """
	# <root>
	# <document>
	# <sentences>
	# <sentence id="1">
	# <tokens>
	# <token id="1">
	# <word>Stanford</word>
	# <lemma>Stanford</lemma>
	# <CharacterOffsetBegin>0</CharacterOffsetBegin>
	# <CharacterOffsetEnd>8</CharacterOffsetEnd>
	# <POS>NNP</POS>
	# </token>
	# <token id="2">
	# <word>University</word>
	# <lemma>University</lemma>
	# <CharacterOffsetBegin>9</CharacterOffsetBegin>
	# <CharacterOffsetEnd>19</CharacterOffsetEnd>
	# <POS>NNP</POS>
	# </token>
	# </tokens>
	# </sentence>
	# <sentence id="2">
	# <tokens>
	# <token id="1">
	# <word>UQAM</word>
	# <lemma>UQAM</lemma>
	# <CharacterOffsetBegin>0</CharacterOffsetBegin>
	# <CharacterOffsetEnd>4</CharacterOffsetEnd>
	# <POS>NNP</POS>
	# </token>
	# <token id="2">
	# <word>University</word>
	# <lemma>University</lemma>
	# <CharacterOffsetBegin>5</CharacterOffsetBegin>
	# <CharacterOffsetEnd>15</CharacterOffsetEnd>
	# <POS>NNP</POS>
	# </token>
	# </tokens>
	# </sentence>
	# </sentences>
	# </document>
	# </root>""".to_xml.as(XMLDocument)
	#
	# var document = new NLPDocument.from_xml(xml)
	# assert document.sentences.length == 2
	# assert document.sentences.first.tokens.first.word == "Stanford"
	# assert document.sentences.last.tokens.first.word == "UQAM"
	# ~~~
	init from_xml(xml: XMLDocument) do
		for obj in xml["root"].first["document"].first["sentences"].first["sentence"] do
			if obj isa XMLStartTag then
				sentences.add new NLPSentence.from_xml(obj)
			else
				# Not a start tag: report it and keep going with the other items.
				print "Warning: malformed xml, `sentences` is supposed to contain `sentence` tags"
			end
		end
	end

	# Init `self` from a XML file.
	#
	# The file at `path` is read line by line, its first two lines are
	# dropped, and the remaining lines are joined and parsed before being
	# handed to `from_xml`.
	#
	# NOTE(review): the two `shift` calls assume the file starts with an xml
	# doctype line followed by an xslt link line; confirm that every producer
	# of these files emits both, otherwise real content would be discarded.
	init from_xml_file(path: String) do
		var file = new FileReader.open(path)
		var xml = file.read_lines
		file.close

		xml.shift # remove xml doctype
		xml.shift # remove xslt link
		from_xml(xml.join("\n").to_xml.as(XMLDocument))
	end
end
lib/nlp/stanford.nit:109,1--188,3