Commit ef4e3eb8 authored by Raul Sirel's avatar Raul Sirel
Browse files

Merge remote-tracking branch 'origin/readd_optimization'

parents ff95827f c69f89f7
......@@ -10,5 +10,5 @@ def mlp(request) -> MLP:
You don’t need to import the fixture you want to use in a test; pytest discovers it automatically.
The discovery of fixture functions starts at test classes, then test modules, then conftest.py files and finally builtin and third party plugins.
"""
mlp = MLP(language_codes=["et", "ru", "en"], logging_level="info", use_gpu=False)
mlp = MLP(language_codes=["et", "en", "ru", "fi", "hr", "lv", "es"], logging_level="info", use_gpu=False)
return mlp
......@@ -21,6 +21,14 @@ test_texts = [
"ja siis veel seesama number ka 8-910-431-03-73",
"+7 (903) 474-47-20 или +79034744720 или иван.сергеев@пример.рф",
"иван.сергеев@пример.рф",
# Finnish
"sosiaaliturvakelaPerustoimeentuloa kootaan Kansaneläkelaitoksen katon alleHelsinki, 27. 10. (STT)Yhä useammat perustoimeentuloon liittyvät etuudet maksetaan muutaman vuoden kuluttua Kansaneläkelaitoksen kautta. Lapsilisät ja lasten kotihoidon tuki siirtyvät kuntien sosiaalitoimelta Kelan hoidettaviksi jo ensi vuoden alusta ja vuonna 1994 Kelan luukuilta myönnetään myös äitiysavustukset, sotilasavustukset sekä asumis- ja opintotuki.",
# Croatian
"Svako ima pravo na školovanje. Školovanje treba da bude besplatno bar u osnovnim i nižim školama. Osnovna nastava je obavezna. Tehnička i stručna nastava treba da bude opšte dostupna, a viša nastava treba da bude svima podjednako pristupačna na osnovu utvrdjenih kriterijuma.",
# Latvian
"Ikvienam ir tiesìbas uz izglìtìbu. Izglìtìbai, vismaz pamata un vispârèjai izglìtìbai, ir jâbút bezmaksas. Pamatizglìtìbai ir jâbút obligâtai. Tehniskajai un profesionâlajai izglìtìbai ir jâbút vispâr pieejamai, un augstâkajai izglìtìbai jâbút vienâdi pieejamai visiem atbilstoøi spèjâm.",
# Spanish
"Cuando mencionamos el deporte, a todos nos viene a la cabeza la palabra “salud”. La vida sedentaria que muchos de nosotros llevamos, sin mencionar otro tipo de hábitos perjudiciales para el organismo, son el gran enemigo de nuestro cuerpo."
]
......@@ -78,7 +86,7 @@ def test_mlp_process(mlp: MLP):
# Skip checks for phones with area codes.
# TODO Rewrite this test to take the problem with parentheses into consideration.
if not ("(" in text and ")" in text):
#print(text, spanned_text, fact)
# print(text, spanned_text, fact)
assert spanned_text == str_val
......@@ -111,7 +119,7 @@ def test_no_company(mlp: MLP, test_input):
@pytest.mark.parametrize("expected_lemmas, test_input", [
('Mari olema loll', 'Mari on loll'),
('Mari olema loll nagu kapsas', 'Mari on loll nagu kapsas'),
('Barack Obama be give Donald Trump a heart attack', 'Barack Obama is giving Donald Trump a heart attack')
])
def test_mlp_lemmatize(mlp: MLP, expected_lemmas, test_input):
......
......@@ -260,12 +260,25 @@ class MLP:
stanza_pipelines[lang] = stanza.Pipeline(
lang=lang,
dir=str(stanza_resource_path),
processors=self._get_stanza_processors(lang),
use_gpu=self.use_gpu,
logging_level=logging_level
)
return stanza_pipelines
@staticmethod
def _get_stanza_processors(lang: str) -> str:
    """
    Returns processor options based on language and NER support in Stanza.

    :param lang: Language identifier that is checked against STANZA_NER_SUPPORT.
    :return: Comma-separated processor names; "ner" is appended only when
        the language is listed in STANZA_NER_SUPPORT.
    """
    if lang in STANZA_NER_SUPPORT:
        return "tokenize,pos,lemma,ner"
    # No NER support listed for this language: leave the "ner" processor out.
    return "tokenize,pos,lemma"
def process(self, raw_text: str, analyzers: list = ["all"], lang=None):
"""
Processes raw text.
......@@ -280,7 +293,7 @@ class MLP:
for analyzer in loaded_analyzers:
# For every analyzer, activate the function that processes it from the
# document class.
getattr(document, analyzer)()
self.__apply_analyzer(document, analyzer)
return document.to_json()
else:
return None
......@@ -318,6 +331,13 @@ class MLP:
return []
def __apply_analyzer(self, doc, analyzer):
    """
    Runs a single analyzer on the document and logs any failure.

    The analyzer is looked up by name on the document and called without
    arguments. Any exception raised while doing so (including a missing
    attribute) is logged together with its traceback and is not re-raised,
    so the caller can carry on with its next step.

    :param doc: Object exposing the analyzers as argument-less methods.
    :param analyzer: Name of the method to call on the document.
    """
    try:
        getattr(doc, analyzer)()
    except Exception:
        # Best-effort boundary: name the failing analyzer in the message;
        # logger.exception() attaches the active traceback on its own.
        self.logger.exception("Analyzer '%s' failed", analyzer)
def process_docs(self, docs: List[dict], doc_paths: List[str], analyzers=["all"]):
"""
:param docs: Contains tuples with two dicts inside them, the first being the document to be analyzed and the second is the meta information that corresponds to the document for transport purposes later on.
......@@ -339,7 +359,7 @@ class MLP:
for analyzer in analyzers:
# For every analyzer, activate the function that processes it from the
# document class.
getattr(doc, analyzer)()
self.__apply_analyzer(doc, analyzer)
result = doc.document_to_json(use_default_doc_path=False)
new_facts = result.pop("texta_facts", [])
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment