Commit 9dcbb14d authored by Raul Sirel

add ci file

parent e9d5cda6
Pipeline #2077 failed
image: continuumio/miniconda3:latest

stages:
  - test
  - build

Test:
  before_script:
    - conda create -n texta-mlp python=3.7 && source activate texta-mlp
    - pip install -r requirements.txt
  stage: test
  tags:
    - ci-test
  variables:
    TEXTA_ES_URL: http://172.17.0.1:9200
    TEXTA_MLP_URL: http://texta-mlp:5000
    TEXTA_REDIS_URL: redis://redis:6379
  script:
    - python -m pytest -v tests

Build:
  stage: build
  tags:
    - ci-test
  script:
    - pip install -U twine
    - python setup.py sdist
    - twine upload dist/*
  only:
    - tags
from texta_mlp.mlp import MLP

test_text = "Jossif Stalin astus Laial tänaval koerasita sisse ning sisenes Lai 42 hoonesse."

mlp = MLP(use_default_language_code=True)
print('#', mlp.process(test_text))
print('#', mlp.lemmatize(test_text))
print('#', mlp.lemmatize({"foo": 1}))
print('#', mlp.process("Sputnik Эстония перешёл на работу в чрезвычайном режиме. Заявление шеф-редактора Читать далее: https://ee.sputniknews.ru/"))
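Going by the assertions in the test module below, process() returns a dict with at least the top-level keys "text" and "texta_facts"; a minimal sketch of inspecting that result (the inner structure of each value is an assumption, not confirmed by this commit):

result = mlp.process(test_text)
assert "text" in result and "texta_facts" in result  # mirrors the pytest assertions below
for fact in result["texta_facts"]:
    print(fact)  # each entry is assumed to be one serialised Fact built in document.py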
from texta_mlp.mlp import MLP

mlp = MLP()

test_texts = [
    "Erinevad riigid käituvad siin väga erinevalt. Eestil on suhtes viirusinfoga arenguruumi, leiab Raul Rebane Vikerraadio päevakommentaaris.",
    "Jossif Stalin astus Laial tänaval koerasita sisse ning sisenes Lai 42 hoonesse.",
    "Sputnik Эстония перешёл на работу в чрезвычайном режиме. Заявление шеф-редактора Читать далее: https://ee.sputniknews.ru/",
    1,
    #"",
    {"foo": "bar"}
]


def test_mlp_process():
    for test_text in test_texts:
        # process text
        result = mlp.process(test_text)
        assert "text" in result
        assert "texta_facts" in result


# prefixed with run_ instead of test_, so pytest does not collect it by default
def run_test_mlp_lemmatize():
    for test_text in test_texts:
        # lemmatise text
        result = mlp.lemmatize(test_text)
        assert isinstance(result, str)
        assert len(result) > 0
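The CI Test job above runs this module with `python -m pytest -v tests`; the same run can be triggered programmatically (a sketch, assuming the module lives in a tests/ directory relative to the working directory):

import pytest

# programmatic equivalent of the CI job's `python -m pytest -v tests` line
raise SystemExit(pytest.main(["-v", "tests"]))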
@@ -20,7 +20,6 @@ class Document:
                 original_text: str,
                 dominant_language_code: str,
                 stanford,
                 #polyglot,
                 doc_path: str = "text",
                 json_doc: dict = None,
                 analyzers: list = ("lemmas", "pos_tags", "transliteration", "ner", "sentiment", "entity_mapper")
@@ -55,13 +54,10 @@ class Document:
        list_of_path_keys = self.doc_path.split(".")
        root_key = "{}_mlp".format(list_of_path_keys[-1])
        path_to_mlp = list_of_path_keys[:-1] + [root_key] if len(list_of_path_keys) > 1 else [root_key]
        mlp_result = self.to_json()
        nested_dict_wrapper = NestedDict(self.json_doc)
        nested_dict_wrapper.insert(path_to_mlp, mlp_result["text"])
        nested_dict_wrapper.insert(["texta_facts"], mlp_result["texta_facts"])
        return json.loads(nested_dict_wrapper.dumps())
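For a dotted doc_path such as "comments.text", the lines above produce the insertion path ["comments", "text_mlp"], so the MLP output lands next to the source field. NestedDict itself is not part of this diff; a minimal sketch of the insert() behaviour it is assumed to provide:

# hypothetical stand-in for NestedDict.insert(); the real class is not shown in this commit
def insert(doc: dict, path: list, value):
    node = doc
    for key in path[:-1]:
        node = node.setdefault(key, {})
    node[path[-1]] = value

doc = {"comments": {"text": "Tere!"}}
insert(doc, ["comments", "text_mlp"], {"lemmas": "tere"})
insert(doc, ["texta_facts"], [])
# doc == {"comments": {"text": "Tere!", "text_mlp": {"lemmas": "tere"}}, "texta_facts": []}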
@@ -84,7 +80,6 @@ class Document:
        else:
            self.__lemmas.append('X')

    def get_lemma(self) -> str:
        return " ".join([a.strip() for a in self.__lemmas])
@@ -112,22 +107,6 @@ class Document:
        return " ".join([a.strip() for a in self.__pos_tags])

    def transliteration(self):
        if self.dominant_language_code in Document.langs_to_transliterate:
            transliterator = Transliterator(source_lang=self.dominant_language_code, target_lang="et")
            for word in self.stanford:
                translit_word = str(transliterator.transliterate(word.text)).strip()
                if not translit_word:
                    translit_word = word.text.strip()
                self.__transliteration.append(translit_word)

    def get_transliteration(self) -> str:
        return " ".join(['X' if not a.strip() else a for a in self.__transliteration])

    def entity_mapper(self):
        text = self.get_words()
        hits = ENTITY_MAPPER.map_entities(text)
@@ -138,22 +117,15 @@ class Document:
            self.__texta_facts.append(new_fact)

    def ner(self):
        entities = []
        seen = set()
        # def transliteration(self):
        #     if self.dominant_language_code in Document.langs_to_transliterate:
        #         transliterator = Transliterator(source_lang=self.dominant_language_code, target_lang="et")
        #         for word in self.stanford:
        #             translit_word = str(transliterator.transliterate(word.text)).strip()
        #             if not translit_word:
        #                 translit_word = word.text.strip()
        #             self.__transliteration.append(translit_word)
        # deduplicate entities by their normalised string value
        for entity in self.polyglot.entities:
            str_val = " ".join(entity).strip(string.punctuation + " ")
            if str_val not in seen:
                entities.append(entity)
                seen.add(str_val)
        # anchor each entity to whitespace boundaries so partial-word hits are skipped
        for entity in entities:
            str_val = " ".join(entity).strip(string.punctuation + " ")
            pattern = re.compile(r'(?<!\S)' + re.escape(str_val) + r'(?!\S)')
            for match in pattern.finditer(self.get_words()):
                new_fact = Fact(fact_type=entity.tag, fact_value=str_val, doc_path=self.doc_path, spans=[[match.start(), match.end()]])
                self.__texta_facts.append(new_fact)

    # def get_transliteration(self) -> str:
    #     return " ".join(['X' if not a.strip() else a for a in self.__transliteration])
import os
import pathlib

import requests
import stanfordnlp

from settings import (
    ENTITY_MAPPER_DIR,
    RESOURCES_DIR,
    RESOURCES_REPO,
    DEFAULT_LANG_CODES
)


def download_entity_mapper_data():
    entity_mapper_path = pathlib.Path(ENTITY_MAPPER_DIR)
    files = ["addresses.json", "companies.json"]
    for file_name in files:
        file_path = entity_mapper_path / file_name
        # don't download if the file already exists
        if not os.path.exists(file_path):
            url = f"{RESOURCES_REPO}raw/master/entity_mapper/{file_name}"
            file_content = requests.get(url).text
            if file_content:
                with open(file_path, "w", encoding="utf8") as fh:
                    fh.write(file_content)
    return True


def download_external_models():
    stanford_path = pathlib.Path(RESOURCES_DIR) / "stanfordnlp"
    stanford_path.mkdir(parents=True, exist_ok=True)
    for language_code in DEFAULT_LANG_CODES:
        # rglob is recursive filename pattern matching; if it matches nothing,
        # the necessary files do not exist and we should download them
        if not list(stanford_path.rglob("{}*".format(language_code))):
            stanfordnlp.download(language_code, stanford_path, force=True)
    return True


if __name__ == "__main__":
    download_entity_mapper_data()
    download_external_models()
@@ -40,6 +40,10 @@ class MLP:

    def _prepare_resources(self):
        """
        Downloads StanfordNLP resources if they are not present in the resources directory.
        By default everything is downloaded into the data directory under the package directory.
        """
        stanford_resource_path = pathlib.Path(self.resource_dir) / "stanfordnlp"
        stanford_resource_path.mkdir(parents=True, exist_ok=True)
        for language_code in self.supported_langs:
...