Commit e230874c authored by Wael Ramadan's avatar Wael Ramadan
Browse files

Improve error handling

parent 2f8dba52
......@@ -51,6 +51,7 @@ class Document:
doc_path: str = "text",
json_doc: dict = None,
analyzers: list = [],
error: str = "",
):
self.original_text = original_text
......@@ -58,6 +59,7 @@ class Document:
self.analyzers = analyzers
self.dominant_language_code = dominant_language_code
self.analysis_lang = analysis_lang
self.error = error
self.json_doc = json_doc
self.entity_mapper = entity_mapper
......@@ -166,6 +168,8 @@ class Document:
"detected": self.dominant_language_code,
"analysis": self.analysis_lang
}
if self.error:
container["error"] = self.error
if "lemmas" in self.analyzers:
container["lemmas"] = self.get_lemma()
......
......@@ -5,3 +5,7 @@ class LanguageNotSupported(Exception):
class BoundedListEmpty(Exception):
    """Raised when Concatenator.concatenate() is attempted before its BOUNDS have been loaded."""
class StanzaPipelineFail(Exception):
    """Signals that the Stanza NLP pipelines could not be loaded."""
......@@ -202,10 +202,10 @@ class MLP:
'''
if lang not in self.supported_langs:
analysis_lang = self.default_lang
sentences, entities = self._get_stanza_tokens(analysis_lang, processed_text) if processed_text else ([], [])
sentences, entities, e = self._get_stanza_tokens(analysis_lang, processed_text) if processed_text else ([], [], "")
else:
analysis_lang = lang
sentences, entities = self._get_stanza_tokens(analysis_lang, processed_text) if processed_text else ([], [])
sentences, entities, e = self._get_stanza_tokens(analysis_lang, processed_text) if processed_text else ([], [], "")
document = Document(
original_text=processed_text,
......@@ -217,7 +217,8 @@ class MLP:
json_doc=json_object,
doc_path=doc_paths,
entity_mapper=self.entity_mapper,
concat_resources=self.concat_resources
concat_resources=self.concat_resources,
error=e
)
return document
......@@ -230,19 +231,26 @@ class MLP:
def _get_stanza_tokens(self, lang: str, raw_text: str):
pipeline = self.stanza_pipelines[lang](raw_text)
sentences = []
entities = []
pip_pat = re.compile(r"(?<=\d)_(?=\d)")
for sentence in pipeline.sentences:
words = []
for word in sentence.words:
words.append(word)
sentences.append(words)
for entity in sentence.entities:
entities.append(entity)
return sentences, entities
e = ""
try:
pipeline = self.stanza_pipelines[lang](raw_text)
pip_pat = re.compile(r"(?<=\d)_(?=\d)")
for sentence in pipeline.sentences:
words = []
for word in sentence.words:
words.append(word)
sentences.append(words)
for entity in sentence.entities:
entities.append(entity)
except Exception as e:
self.logger.exception(e)
return sentences, entities, repr(e)
return sentences, entities, e
def _get_stanza_ner(self, lang: str, raw_text: str):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment