Commit 7c142123 authored by Wael Ramadan
Browse files

Merge branch 'improve_error_handling' into 'master'

Improve error handling

See merge request !12
parents 2f8dba52 e230874c
Pipeline #5715 passed with stage
in 21 minutes and 42 seconds
...@@ -51,6 +51,7 @@ class Document: ...@@ -51,6 +51,7 @@ class Document:
doc_path: str = "text", doc_path: str = "text",
json_doc: dict = None, json_doc: dict = None,
analyzers: list = [], analyzers: list = [],
error: str = "",
): ):
self.original_text = original_text self.original_text = original_text
...@@ -58,6 +59,7 @@ class Document: ...@@ -58,6 +59,7 @@ class Document:
self.analyzers = analyzers self.analyzers = analyzers
self.dominant_language_code = dominant_language_code self.dominant_language_code = dominant_language_code
self.analysis_lang = analysis_lang self.analysis_lang = analysis_lang
self.error = error
self.json_doc = json_doc self.json_doc = json_doc
self.entity_mapper = entity_mapper self.entity_mapper = entity_mapper
...@@ -166,6 +168,8 @@ class Document: ...@@ -166,6 +168,8 @@ class Document:
"detected": self.dominant_language_code, "detected": self.dominant_language_code,
"analysis": self.analysis_lang "analysis": self.analysis_lang
} }
if self.error:
container["error"] = self.error
if "lemmas" in self.analyzers: if "lemmas" in self.analyzers:
container["lemmas"] = self.get_lemma() container["lemmas"] = self.get_lemma()
......
...@@ -5,3 +5,7 @@ class LanguageNotSupported(Exception): ...@@ -5,3 +5,7 @@ class LanguageNotSupported(Exception):
class BoundedListEmpty(Exception): class BoundedListEmpty(Exception):
"""Raised when in Concatenator class the BOUNDS are not yet loaded, but concatenate() is tried""" """Raised when in Concatenator class the BOUNDS are not yet loaded, but concatenate() is tried"""
pass pass
class StanzaPipelineFail(Exception):
    """Raised when Stanza pipelines fail to load.

    NOTE(review): no code visible in this diff raises this exception;
    confirm where it is meant to be raised.
    """
...@@ -202,10 +202,10 @@ class MLP: ...@@ -202,10 +202,10 @@ class MLP:
''' '''
if lang not in self.supported_langs: if lang not in self.supported_langs:
analysis_lang = self.default_lang analysis_lang = self.default_lang
sentences, entities = self._get_stanza_tokens(analysis_lang, processed_text) if processed_text else ([], []) sentences, entities, e = self._get_stanza_tokens(analysis_lang, processed_text) if processed_text else ([], [], "")
else: else:
analysis_lang = lang analysis_lang = lang
sentences, entities = self._get_stanza_tokens(analysis_lang, processed_text) if processed_text else ([], []) sentences, entities, e = self._get_stanza_tokens(analysis_lang, processed_text) if processed_text else ([], [], "")
document = Document( document = Document(
original_text=processed_text, original_text=processed_text,
...@@ -217,7 +217,8 @@ class MLP: ...@@ -217,7 +217,8 @@ class MLP:
json_doc=json_object, json_doc=json_object,
doc_path=doc_paths, doc_path=doc_paths,
entity_mapper=self.entity_mapper, entity_mapper=self.entity_mapper,
concat_resources=self.concat_resources concat_resources=self.concat_resources,
error=e
) )
return document return document
...@@ -230,19 +231,26 @@ class MLP: ...@@ -230,19 +231,26 @@ class MLP:
def _get_stanza_tokens(self, lang: str, raw_text: str): def _get_stanza_tokens(self, lang: str, raw_text: str):
pipeline = self.stanza_pipelines[lang](raw_text)
sentences = [] sentences = []
entities = [] entities = []
pip_pat = re.compile(r"(?<=\d)_(?=\d)") e = ""
for sentence in pipeline.sentences: try:
words = [] pipeline = self.stanza_pipelines[lang](raw_text)
for word in sentence.words:
words.append(word) pip_pat = re.compile(r"(?<=\d)_(?=\d)")
sentences.append(words) for sentence in pipeline.sentences:
for entity in sentence.entities: words = []
entities.append(entity) for word in sentence.words:
return sentences, entities words.append(word)
sentences.append(words)
for entity in sentence.entities:
entities.append(entity)
except Exception as e:
self.logger.exception(e)
return sentences, entities, repr(e)
return sentences, entities, e
def _get_stanza_ner(self, lang: str, raw_text: str): def _get_stanza_ner(self, lang: str, raw_text: str):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment