Commit a0cbbf58 authored by Raul Sirel

fix span errors caused by sentence tokenization

parent 699e9e30
Pipeline #6539 passed with stages in 19 minutes and 6 seconds
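The core of the fix: sentences are now joined with " \n " rather than a bare "\n", so the line break becomes its own whitespace-delimited token and text, lemmas, and pos_tags keep a one-to-one token alignment under a plain `.split(" ")`. A minimal standalone sketch of the difference (example data only, not code from this commit):

```python
# Why " \n ".join fixes token alignment (standalone sketch).
sent_words = [["Siin", "on", "üks", "lause", "."],
              ["See", "on", "teine", "lause", "."]]

old_text = "\n".join(" ".join(words) for words in sent_words)
new_text = " \n ".join(" ".join(words) for words in sent_words)

# The old join fuses tokens around the break: '.\nSee' becomes a single
# field, so 10 words collapse into 9 space-separated tokens.
assert len(old_text.split(" ")) == 9

# The new join keeps every token, including the break, space-delimited:
# 10 words plus one '\n' token.
assert len(new_text.split(" ")) == 11
```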
@@ -217,8 +217,9 @@ def test_that_ner_output_contains_correct_doc_path(mlp: MLP):
 def test_sentences_separated_with_newline(mlp: MLP):
     result = mlp.process("Siin on üks lause. See on teine lause.")
     mlp_result = result["text"]
-    assert mlp_result["text"] == 'Siin on üks lause .\nSee on teine lause .'
-    assert mlp_result["lemmas"] == 'siin olema üks lause .\nsee olema teine lause .'
+    assert mlp_result["text"] == 'Siin on üks lause . \n See on teine lause .'
+    assert mlp_result["lemmas"] == 'siin olema üks lause . \n see olema teine lause .'
+    assert len(mlp_result["pos_tags"].split(" ")) == len(mlp_result["lemmas"].split(" "))


 def test_sentences_not_separated_with_newline(mlp: MLP):
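The added assertion pins the invariant down numerically: the lemma string splits into ten lemma tokens plus the "\n" break token, so pos_tags must yield the same eleven fields (ten tags plus one break tag). A quick count check against the expected string from the test above:

```python
# Count check on the test's expected lemma string.
lemmas = 'siin olema üks lause . \n see olema teine lause .'
assert len(lemmas.split(" ")) == 11  # 10 lemmas + 1 "\n" break token
```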
@@ -162,7 +162,7 @@ class Document:
     def to_json(self, use_default_doc_path=True) -> dict:
         container = dict()
-        container["text"] = self.get_words(ssplit="sentences" in self.analyzers)
+        container["text"] = self.get_words()
         texta_facts = self.facts_to_json()
         container["language"] = {
             "detected": self.dominant_language_code,
@@ -175,7 +175,6 @@ class Document:
             container["lemmas"] = self.get_lemma()
         if "pos_tags" in self.analyzers:
             container["pos_tags"] = self.get_pos_tags()
-        # if "sentiment" in self.analyzers: container["sentiment"] = self.get_sentiment()
         if "transliteration" in self.analyzers and self.__transliteration:
             container["transliteration"] = self.get_transliteration()
         if use_default_doc_path:
@@ -195,30 +194,38 @@ class Document:
             sentences.append(" ".join([a.strip() for a in sent_lemmas]))
         if "sentences" in self.analyzers:
-            return "\n".join(sentences)
+            return " \n ".join(sentences)
         else:
             return " ".join(sentences)

-    def sentences(self):
-        pass

     def words(self):
         for sent in self.stanza_sentences:
             self.__words.append([word.text for word in sent])

+    def sentences(self):
+        pass

-    def get_words(self, ssplit=False) -> str:
-        if ssplit:
-            return "\n".join([" ".join(sent_words) for sent_words in self.__words])
+    def get_words(self) -> str:
+        if "sentences" in self.analyzers:
+            return " \n ".join([" ".join(sent_words) for sent_words in self.__words])
         else:
             return " ".join([" ".join(sent_words) for sent_words in self.__words])

     def pos_tags(self):
-        self.__pos_tags = [word.xpos if word and word.xpos and word.xpos != "_" else "X" if word.xpos == "_" else "X"
-                           for word in self.stanza_words]
+        if "sentences" in self.analyzers:
+            for i, sent in enumerate(self.stanza_sentences):
+                tags_in_sent = [word.xpos if word and word.xpos and word.xpos != "_" else "X" if word.xpos == "_" else "X" for word in sent]
+                for tag in tags_in_sent:
+                    self.__pos_tags.append(tag)
+                # if not last item
+                if i + 1 < len(self.stanza_sentences):
+                    self.__pos_tags.append("LBR")
+        else:
+            self.__pos_tags = [word.xpos if word and word.xpos and word.xpos != "_" else "X" if word.xpos == "_" else "X" for word in self.stanza_words]

     def get_pos_tags(self) -> str:
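With the reworked get_words and pos_tags, the "LBR" tag occupies the same position in the tag sequence as the "\n" token does in the text, so downstream code can zip the two streams and recover per-sentence pairs. A sketch of consuming that format (the non-"LBR" tag values here are hypothetical, not taken from the commit):

```python
# Consuming the aligned output (hypothetical tag values except "LBR").
text = "Siin on üks lause . \n See on teine lause ."
pos_tags = "P V P S Z LBR P V P S Z"

tokens, tags = text.split(" "), pos_tags.split(" ")
assert len(tokens) == len(tags)  # the invariant this commit restores

# Splitting both streams at the break markers recovers the sentences.
sentences, current = [], []
for token, tag in zip(tokens, tags):
    if tag == "LBR":  # the paired token is "\n"
        sentences.append(current)
        current = []
    else:
        current.append((token, tag))
sentences.append(current)
assert len(sentences) == 2 and len(sentences[0]) == 5
```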
@@ -380,7 +387,9 @@ class Document:
         tokenized_text = self.get_words()
         known_entities = Document.FACT_NAMES_NER
         not_entities = self.concat_resources["not_entities"]
         for entity in self.stanza_entities:
+            if entity.text.lower() in not_entities:
+                continue
             if entity.type in known_entities:
@@ -275,6 +275,7 @@ class MLP:
                 for word in sentence.words:
                     words.append(word)
+                sentences.append(words)
                 for entity in sentence.entities:
                     entities.append(entity)
@@ -288,11 +289,6 @@ class MLP:
             return sentences, entities, e
-
-    def _get_stanza_ner(self, lang: str, raw_text: str):
-        pipeline = self.get_stanza_pipeline(lang)(raw_text)
-        return [entity for sentence in pipeline.sentences for entity in sentence.entities]
-
     @staticmethod
     def _get_stanza_processors(lang):
         """