Commit ab7f52da authored by Marko Kollo's avatar Marko Kollo 😄
Browse files

Rewrote Document class to include whole stanza.Document instance.

parent fc672343
Pipeline #6244 passed with stage
in 15 minutes and 9 seconds
......@@ -101,8 +101,6 @@ def test_remove_duplicate_facts_by_span_in_doc(expected_non_duplicate_facts, tes
original_text="",
dominant_language_code="en",
analysis_lang="en",
stanza_sentences=[],
stanza_entities=[],
entity_mapper=None,
doc_path="text",
json_doc=None,
......@@ -135,8 +133,6 @@ def test_bound_close_ones(expected_close_BOUNDS, test_input):
original_text="",
dominant_language_code="en",
analysis_lang="en",
stanza_sentences=[],
stanza_entities=[],
entity_mapper=None,
doc_path="text",
json_doc=None,
......@@ -168,8 +164,6 @@ def test_remove_overlaping_in_bounded(expected_bounds_no_overlap, test_input):
original_text="",
dominant_language_code="en",
analysis_lang="en",
stanza_sentences=[],
stanza_entities=[],
entity_mapper=None,
doc_path="text",
json_doc=None,
......@@ -195,8 +189,6 @@ def test_concatenate_subset_bounds(expected_bounds_no_subsets, test_input):
original_text="",
dominant_language_code="en",
analysis_lang="en",
stanza_sentences=[],
stanza_entities=[],
entity_mapper=None,
doc_path="text",
json_doc=None,
......@@ -221,8 +213,6 @@ def test_concatenate_subset_bounds(key_value_single_pairs, test_input):
original_text="",
dominant_language_code="en",
analysis_lang="en",
stanza_sentences=[],
stanza_entities=[],
entity_mapper=None,
doc_path="text",
json_doc=None,
......@@ -243,8 +233,6 @@ def test_space_between_ok(mlp, ok_spaces):
original_text="",
dominant_language_code="en",
analysis_lang="en",
stanza_sentences=[],
stanza_entities=[],
entity_mapper=None,
doc_path="text",
json_doc=None,
......@@ -269,8 +257,6 @@ def test_space_between_not_ok(mlp, not_ok_spaces):
original_text="",
dominant_language_code="en",
analysis_lang="en",
stanza_sentences=[],
stanza_entities=[],
entity_mapper=None,
doc_path="text",
json_doc=None,
......@@ -293,8 +279,6 @@ def test_clean_similar_in_strval(similar_cleaned_str_val, test_input):
original_text="",
dominant_language_code="en",
analysis_lang="en",
stanza_sentences=[],
stanza_entities=[],
entity_mapper=None,
doc_path="text",
json_doc=None,
......
......@@ -3,6 +3,7 @@ import math
from typing import List, Optional
import regex as re
import stanza
from lang_trans.arabic import buckwalter
from pelecanus import PelicanJson
......@@ -44,9 +45,8 @@ class Document:
original_text: str,
dominant_language_code: str,
analysis_lang: str,
stanza_sentences: [list],
stanza_entities,
concat_resources: dict,
stanza_document: stanza.Document = None,
entity_mapper: Optional[EntityMapper] = None,
doc_path: str = "text",
json_doc: dict = None,
......@@ -63,12 +63,14 @@ class Document:
self.json_doc = json_doc
self.entity_mapper = entity_mapper
self.stanza_sentences = stanza_sentences
self.stanza_words = [word for sentence in self.stanza_sentences for word in sentence]
self.stanza_entities = stanza_entities
self.stanza_document = stanza_document
self.concat_resources = concat_resources
self.__stanza_sentences = []
self.__stanza_words = []
self.__stanza_entities = []
self.__words = []
self.__lemmas = []
self.__pos_tags = []
......@@ -76,7 +78,34 @@ class Document:
self.__texta_facts: List[Fact] = []
self.__handle_existing_facts()
self.words()
if self.stanza_document:
self.words()
@property
def stanza_sentences(self):
    """Lazily collect and cache the sentences of the attached stanza document.

    Returns:
        list: The document's ``stanza`` sentence objects, or an empty list
        when no stanza document was attached (``stanza_document`` defaults
        to ``None`` in the constructor, e.g. for empty input text).

    The original body dereferenced ``self.stanza_document.sentences``
    unconditionally, raising ``AttributeError`` whenever the document was
    ``None``; the guard below makes the property safe in that case.
    """
    # NOTE(review): a document with zero sentences leaves the cache empty,
    # so it is re-checked on every access — harmless, but not memoized.
    if not self.__stanza_sentences and self.stanza_document is not None:
        self.__stanza_sentences = list(self.stanza_document.sentences)
    return self.__stanza_sentences
@property
def stanza_words(self):
    """Lazily flatten and cache all words of all sentences in the document.

    Returns:
        list: Every ``stanza`` word object across all sentences, in order.

    Bug fix: the original iterated the private cache
    ``self.__stanza_sentences`` directly, which is only populated once the
    ``stanza_sentences`` property has been accessed — reading
    ``stanza_words`` first therefore always yielded an empty list. Going
    through the public property guarantees the sentence cache is filled.
    """
    if not self.__stanza_words:
        self.__stanza_words = [
            word
            for sentence in self.stanza_sentences
            for word in sentence.words
        ]
    return self.__stanza_words
@property
def stanza_entities(self):
    """Lazily collect and cache the named entities of the stanza document.

    Returns:
        list: The document's ``stanza`` entity objects, or an empty list
        when no stanza document was attached (``stanza_document`` defaults
        to ``None`` in the constructor).

    The original body dereferenced ``self.stanza_document.entities``
    unconditionally, raising ``AttributeError`` whenever the document was
    ``None``; the guard below makes the property safe in that case.
    """
    if not self.__stanza_entities and self.stanza_document is not None:
        self.__stanza_entities = list(self.stanza_document.entities)
    return self.__stanza_entities
def __get_doc_path(self, field: str) -> str:
......@@ -186,7 +215,7 @@ class Document:
def lemmas(self):
for sent in self.stanza_sentences:
self.__lemmas.append([word.lemma.replace("_", "") if word and word.lemma else "X" for word in sent])
self.__lemmas.append([word.lemma.replace("_", "") if word and word.lemma else "X" for word in sent.words])
def get_lemma(self) -> str:
......@@ -201,8 +230,8 @@ class Document:
def words(self):
for sent in self.stanza_sentences:
self.__words.append([word.text for word in sent])
for sentence in self.stanza_sentences:
self.__words.append([word.text for word in sentence.words])
def sentences(self):
......@@ -217,8 +246,11 @@ class Document:
def pos_tags(self):
self.__pos_tags = [word.xpos if word and word.xpos and word.xpos != "_" else "X" if word.xpos == "_" else "X"
for word in self.stanza_words]
for word in self.stanza_words:
if word and word.xpos and word.xpos != "_":
self.__pos_tags.append(word.xpos)
else:
self.__pos_tags.append("X")
def get_pos_tags(self) -> str:
......@@ -324,19 +356,19 @@ class Document:
if self.dominant_language_code in Document.langs_to_transliterate:
for word in self.stanza_words:
if self.dominant_language_code == "ru":
translit_word = self._transliterate_russian_word(word)
translit_word = self._transliterate_russian_word(word.text)
elif self.dominant_language_code == "ar":
translit_word = self._transliterate_arabic_word(word)
translit_word = self._transliterate_arabic_word(word.text)
self.__transliteration.append(translit_word)
@staticmethod
def _transliterate_russian_word(word):
translit_word = russian_transliterator([word.text.strip()])
def _transliterate_russian_word(word: str):
translit_word = russian_transliterator([word.strip()])
try:
translit_word = translit_word[0].strip()
except IndexError:
translit_word = word.text.strip()
translit_word = word.strip()
return translit_word
......
......@@ -206,17 +206,16 @@ class MLP:
'''
if lang not in self.supported_langs:
analysis_lang = self.default_lang
sentences, entities, e = self._get_stanza_tokens(analysis_lang, processed_text) if processed_text else ([], [], "")
document, e = self._get_stanza_document(analysis_lang, processed_text) if processed_text else (None, "")
else:
analysis_lang = lang
sentences, entities, e = self._get_stanza_tokens(analysis_lang, processed_text) if processed_text else ([], [], "")
document, e = self._get_stanza_document(analysis_lang, processed_text) if processed_text else (None, "")
document = Document(
original_text=processed_text,
dominant_language_code=lang,
analysis_lang=analysis_lang,
stanza_sentences=sentences,
stanza_entities=entities,
stanza_document=document,
analyzers=analyzers,
json_doc=json_object,
doc_path=doc_paths,
......@@ -239,6 +238,7 @@ class MLP:
self._entity_mapper = self._load_entity_mapper()
return self._entity_mapper
def get_stanza_pipeline(self, lang: str):
if lang not in self._stanza_pipelines:
try:
......@@ -263,29 +263,18 @@ class MLP:
return self._stanza_pipelines[lang]
def _get_stanza_tokens(self, lang: str, raw_text: str):
sentences = []
entities = []
def _get_stanza_document(self, lang: str, raw_text: str):
e = ""
try:
pipeline = self.get_stanza_pipeline(lang)(raw_text)
for sentence in pipeline.sentences:
words = []
for word in sentence.words:
words.append(word)
sentences.append(words)
for entity in sentence.entities:
entities.append(entity)
document = self.get_stanza_pipeline(lang)(raw_text)
return document, e
except KeyError as e:
raise LanguageNotSupported(f"Language {lang} not supported. Check the list of supported languages.")
except Exception as e:
self.logger.exception(e)
return sentences, entities, repr(e)
return sentences, entities, e
return None, repr(e)
def _get_stanza_ner(self, lang: str, raw_text: str):
......@@ -379,7 +368,7 @@ class MLP:
doc_texts = self.parse_doc_texts(doc_path, document)
for raw_text in doc_texts:
analyzers = self._load_analyzers(analyzers, SUPPORTED_ANALYZERS)
doc = self.generate_document(raw_text, analyzers, document, doc_paths=doc_path)
doc = self.generate_document(raw_text, analyzers=analyzers, json_object=document, doc_paths=doc_path, )
if doc:
for analyzer in analyzers:
# For every analyzer, activate the function that processes it from the
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment