Commit ef4e3eb8 authored by Raul Sirel's avatar Raul Sirel
Browse files

Merge remote-tracking branch 'origin/readd_optimization'

parents ff95827f c69f89f7
...@@ -10,5 +10,5 @@ def mlp(request) -> MLP: ...@@ -10,5 +10,5 @@ def mlp(request) -> MLP:
You don’t need to import the fixture you want to use in a test, it automatically gets discovered by pytest. You don’t need to import the fixture you want to use in a test, it automatically gets discovered by pytest.
The discovery of fixture functions starts at test classes, then test modules, then conftest.py files and finally builtin and third party plugins. The discovery of fixture functions starts at test classes, then test modules, then conftest.py files and finally builtin and third party plugins.
""" """
mlp = MLP(language_codes=["et", "ru", "en"], logging_level="info", use_gpu=False) mlp = MLP(language_codes=["et", "en", "ru", "fi", "hr", "lv", "es"], logging_level="info", use_gpu=False)
return mlp return mlp
...@@ -21,6 +21,14 @@ test_texts = [ ...@@ -21,6 +21,14 @@ test_texts = [
"ja siis veel seesama number ka 8-910-431-03-73", "ja siis veel seesama number ka 8-910-431-03-73",
"+7 (903) 474-47-20 или +79034744720 или иван.сергеев@пример.рф", "+7 (903) 474-47-20 или +79034744720 или иван.сергеев@пример.рф",
"иван.сергеев@пример.рф", "иван.сергеев@пример.рф",
# Finnish
"sosiaaliturvakelaPerustoimeentuloa kootaan Kansaneläkelaitoksen katon alleHelsinki, 27. 10. (STT)Yhä useammat perustoimeentuloon liittyvät etuudet maksetaan muutaman vuoden kuluttua Kansaneläkelaitoksen kautta. Lapsilisät ja lasten kotihoidon tuki siirtyvät kuntien sosiaalitoimelta Kelan hoidettaviksi jo ensi vuoden alusta ja vuonna 1994 Kelan luukuilta myönnetään myös äitiysavustukset, sotilasavustukset sekä asumis- ja opintotuki.",
# Croatian
"Svako ima pravo na školovanje. Školovanje treba da bude besplatno bar u osnovnim i nižim školama. Osnovna nastava je obavezna. Tehnička i stručna nastava treba da bude opšte dostupna, a viša nastava treba da bude svima podjednako pristupačna na osnovu utvrdjenih kriterijuma.",
# Latvian
"Ikvienam ir tiesìbas uz izglìtìbu. Izglìtìbai, vismaz pamata un vispârèjai izglìtìbai, ir jâbút bezmaksas. Pamatizglìtìbai ir jâbút obligâtai. Tehniskajai un profesionâlajai izglìtìbai ir jâbút vispâr pieejamai, un augstâkajai izglìtìbai jâbút vienâdi pieejamai visiem atbilstoøi spèjâm.",
# Spanish
"Cuando mencionamos el deporte, a todos nos viene a la cabeza la palabra “salud”. La vida sedentaria que muchos de nosotros llevamos, sin mencionar otro tipo de hábitos perjudiciales para el organismo, son el gran enemigo de nuestro cuerpo."
] ]
...@@ -78,7 +86,7 @@ def test_mlp_process(mlp: MLP): ...@@ -78,7 +86,7 @@ def test_mlp_process(mlp: MLP):
# Skip checks for phones with area codes. # Skip checks for phones with area codes.
# TODO Rewrite this test taking the problem with parenthesis into consideration. # TODO Rewrite this test taking the problem with parenthesis into consideration.
if not ("(" in text and ")" in text): if not ("(" in text and ")" in text):
#print(text, spanned_text, fact) # print(text, spanned_text, fact)
assert spanned_text == str_val assert spanned_text == str_val
...@@ -111,7 +119,7 @@ def test_no_company(mlp: MLP, test_input): ...@@ -111,7 +119,7 @@ def test_no_company(mlp: MLP, test_input):
@pytest.mark.parametrize("expected_lemmas, test_input", [ @pytest.mark.parametrize("expected_lemmas, test_input", [
('Mari olema loll', 'Mari on loll'), ('Mari olema loll nagu kapsas', 'Mari on loll nagu kapsas'),
('Barack Obama be give Donald Trump a heart attack', 'Barack Obama is giving Donald Trump a heart attack') ('Barack Obama be give Donald Trump a heart attack', 'Barack Obama is giving Donald Trump a heart attack')
]) ])
def test_mlp_lemmatize(mlp: MLP, expected_lemmas, test_input): def test_mlp_lemmatize(mlp: MLP, expected_lemmas, test_input):
......
...@@ -260,12 +260,25 @@ class MLP: ...@@ -260,12 +260,25 @@ class MLP:
stanza_pipelines[lang] = stanza.Pipeline( stanza_pipelines[lang] = stanza.Pipeline(
lang=lang, lang=lang,
dir=str(stanza_resource_path), dir=str(stanza_resource_path),
processors=self._get_stanza_processors(lang),
use_gpu=self.use_gpu, use_gpu=self.use_gpu,
logging_level=logging_level logging_level=logging_level
) )
return stanza_pipelines return stanza_pipelines
@staticmethod
def _get_stanza_processors(lang):
    """
    Return the Stanza processor chain for the given language code.

    :param lang: ISO language code used to key the Stanza pipeline.
    :return: Comma-separated processor string; includes "ner" only for
        languages listed in STANZA_NER_SUPPORT, since Stanza ships NER
        models for a subset of languages only.
    """
    # Fix: removed a leftover debug print(lang) that polluted stdout on
    # every pipeline construction.
    if lang in STANZA_NER_SUPPORT:
        return "tokenize,pos,lemma,ner"
    return "tokenize,pos,lemma"
def process(self, raw_text: str, analyzers: list = ["all"], lang=None): def process(self, raw_text: str, analyzers: list = ["all"], lang=None):
""" """
Processes raw text. Processes raw text.
...@@ -280,7 +293,7 @@ class MLP: ...@@ -280,7 +293,7 @@ class MLP:
for analyzer in loaded_analyzers: for analyzer in loaded_analyzers:
# For every analyzer, activate the function that processes it from the # For every analyzer, activate the function that processes it from the
# document class. # document class.
getattr(document, analyzer)() self.__apply_analyzer(document, analyzer)
return document.to_json() return document.to_json()
else: else:
return None return None
...@@ -318,6 +331,13 @@ class MLP: ...@@ -318,6 +331,13 @@ class MLP:
return [] return []
def __apply_analyzer(self, doc, analyzer):
    """
    Invoke a single analyzer method on a document, best-effort.

    Any exception raised by the analyzer is logged with a traceback and
    swallowed, so one failing analyzer does not abort processing of the
    remaining analyzers or documents.

    :param doc: Document object exposing the analyzer as a method.
    :param analyzer: Name of the analyzer method to call on ``doc``.
    """
    try:
        analyzer_callable = getattr(doc, analyzer)
        analyzer_callable()
    except Exception as err:
        # Deliberately broad: analyzer failures must never propagate.
        self.logger.exception(err)
def process_docs(self, docs: List[dict], doc_paths: List[str], analyzers=["all"]): def process_docs(self, docs: List[dict], doc_paths: List[str], analyzers=["all"]):
""" """
:param docs: Contains tuples with two dicts inside them, the first being the document to be analyzed and the second is the meta information that corresponds to the document for transport purposes later on. :param docs: Contains tuples with two dicts inside them, the first being the document to be analyzed and the second is the meta information that corresponds to the document for transport purposes later on.
...@@ -339,7 +359,7 @@ class MLP: ...@@ -339,7 +359,7 @@ class MLP:
for analyzer in analyzers: for analyzer in analyzers:
# For every analyzer, activate the function that processes it from the # For every analyzer, activate the function that processes it from the
# document class. # document class.
getattr(doc, analyzer)() self.__apply_analyzer(doc, analyzer)
result = doc.document_to_json(use_default_doc_path=False) result = doc.document_to_json(use_default_doc_path=False)
new_facts = result.pop("texta_facts", []) new_facts = result.pop("texta_facts", [])
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment