Commit 9b0d43e6 authored by Marko Kollo

Load Stanza and Entity Mapper resources on demand.

parent efb078d1
Pipeline #6165 passed in 16 minutes and 32 seconds
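
For context before the diff: the commit replaces eager loading in MLP.__init__ (building every Stanza pipeline and the entity mapper up front) with cached, on-demand getters that fall back to CPU when GPU initialization fails. Below is a minimal, self-contained sketch of that lazy-cache-with-CPU-fallback pattern; it assumes nothing about the texta-mlp or Stanza APIs, and LazyPipelines, its _build helper, and the toy callable it returns are illustrative stand-ins only.

import logging


class LazyPipelines:
    """Sketch of the pattern introduced by this commit: build a pipeline only
    when its language is first requested, cache it, and retry on CPU if the
    GPU attempt raises a RuntimeError (e.g. CUDA out of memory)."""

    def __init__(self, use_gpu: bool = True):
        self.use_gpu = use_gpu
        self._pipelines = {}  # lang -> pipeline, filled on demand

    def _build(self, lang: str, use_gpu: bool):
        # Stand-in for stanza.Pipeline(...); returns a toy callable so the
        # sketch runs without any models being downloaded.
        logging.info("Building pipeline for %s (use_gpu=%s)", lang, use_gpu)
        return lambda text: f"processed[{lang}]: {text}"

    def get(self, lang: str):
        if lang not in self._pipelines:
            try:
                self._pipelines[lang] = self._build(lang, self.use_gpu)
            except RuntimeError:
                # CUDA OOM and similar: fall back to CPU instead of failing.
                self._pipelines[lang] = self._build(lang, use_gpu=False)
        return self._pipelines[lang]


pipelines = LazyPipelines()
print(pipelines.get("et")("Tere maailm"))  # first call builds and caches
print(pipelines.get("et")("Teine lause"))  # second call reuses the cache

The same shape appears twice in the diff below: get_stanza_pipeline caches one pipeline per language with the CPU fallback, and get_entity_mapper caches a single object behind an is-None check.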
@@ -81,6 +81,11 @@ class MLP:
        self.use_default_lang = use_default_language_code
        self.resource_dir = resource_dir
        self.__stanza_pipelines = {}
        self.logging_level = logging_level
        self.use_gpu = use_gpu
        self.stanza_resource_path = pathlib.Path(self.resource_dir) / "stanza"
        self.resource_dir_pathlib = pathlib.Path(resource_dir)
        self.not_entities_path = self.resource_dir_pathlib / "concatenator" / "not_entities.txt"
        self.space_between_not_ok_path = self.resource_dir_pathlib / "concatenator" / "space_between_not_ok.txt"
@@ -88,14 +93,7 @@ class MLP:
        self.prepare_resources(refresh_data)
        # This is for CUDA OOM exceptions. Fall back to CPU if needed.
        try:
            self.stanza_pipelines = self._load_stanza_pipelines(logging_level, use_gpu)
        except RuntimeError:
            # Try loading using CPU
            self.stanza_pipelines = self._load_stanza_pipelines(logging_level, False)
        self.entity_mapper = self._load_entity_mapper()
        self.__entity_mapper = None
        self.loaded_entity_files = []
        self.not_entities = self._load_not_entities()
@@ -222,7 +220,7 @@ class MLP:
            analyzers=analyzers,
            json_doc=json_object,
            doc_path=doc_paths,
            entity_mapper=self.entity_mapper,
            entity_mapper=self.get_entity_mapper(),
            concat_resources=self.concat_resources,
            error=e
        )
@@ -236,14 +234,43 @@ class MLP:
        return [analyzer for analyzer in analyzers if (analyzer in supported_analyzers and analyzer != "all")]

    def get_entity_mapper(self):
        if self.__entity_mapper is None:
            self.__entity_mapper = self._load_entity_mapper()
        return self.__entity_mapper

    def get_stanza_pipeline(self, lang: str):
        if lang not in self.__stanza_pipelines:
            try:
                self.__stanza_pipelines[lang] = stanza.Pipeline(
                    lang=lang,
                    dir=str(self.stanza_resource_path),
                    processors=self._get_stanza_processors(lang),
                    use_gpu=self.use_gpu,
                    logging_level=self.logging_level,
                )
            # This is for CUDA OOM exceptions. Fall back to CPU if needed.
            except RuntimeError:
                self.__stanza_pipelines[lang] = stanza.Pipeline(
                    lang=lang,
                    dir=str(self.stanza_resource_path),
                    processors=self._get_stanza_processors(lang),
                    use_gpu=False,
                    logging_level=self.logging_level,
                )
        return self.__stanza_pipelines[lang]

    def _get_stanza_tokens(self, lang: str, raw_text: str):
        sentences = []
        entities = []
        e = ""
        try:
            pipeline = self.stanza_pipelines[lang](raw_text)
            pipeline = self.get_stanza_pipeline(lang)(raw_text)
            pip_pat = re.compile(r"(?<=\d)_(?=\d)")
            for sentence in pipeline.sentences:
                words = []
                for word in sentence.words:
@@ -263,27 +290,10 @@ class MLP:
    def _get_stanza_ner(self, lang: str, raw_text: str):
        pipeline = self.stanza_pipelines[lang](raw_text)
        pipeline = self.get_stanza_pipeline(lang)(raw_text)
        return [entity for sentence in pipeline.sentences for entity in sentence.entities]

    def _load_stanza_pipelines(self, logging_level, use_gpu):
        """
        Initializes Stanza Pipeline objects all at once to save time later.
        """
        stanza_pipelines = {}
        for lang in self.supported_langs:
            stanza_resource_path = pathlib.Path(self.resource_dir) / "stanza"
            stanza_pipelines[lang] = stanza.Pipeline(
                lang=lang,
                dir=str(stanza_resource_path),
                processors=self._get_stanza_processors(lang),
                use_gpu=use_gpu,
                logging_level=logging_level
            )
        return stanza_pipelines

    @staticmethod
    def _get_stanza_processors(lang):
        """
......