Commit a8e43152 authored by Raul Sirel
Browse files

Change the web location of resources. Add an option to refresh data. Äriregister data is updated weekly.

parent d9827cab
Pipeline #4811 failed with stage
in 3 minutes and 20 seconds
import logging
import os
import pathlib
import shutil
from typing import List
from urllib.parse import urlparse
from urllib.request import urlopen
import regex as re
import stanza
from bs4 import BeautifulSoup
......@@ -14,6 +14,7 @@ from pelecanus import PelicanJson
from texta_mlp.document import Document
from texta_mlp.entity_mapper import EntityMapper
from texta_mlp.exceptions import LanguageNotSupported
from texta_mlp.utils import parse_bool_env
# Languages supported by default.
......@@ -21,21 +22,24 @@ DEFAULT_LANG_CODES = ("et", "ru", "en", "ar")
# URLs for default Entity Mapper data sources.
# Every entry must be followed by a comma: two adjacent string literals are
# silently concatenated by Python into a single (invalid) URL.
# NOTE(review): both the git.texta.ee and the packages.texta.ee locations are
# listed for the same three files — confirm whether the git.texta.ee entries
# are superseded and can be dropped.
ENTITY_MAPPER_DATA_URLS = (
    "https://git.texta.ee/texta/texta-resources/-/raw/master/entity_mapper/addresses.json",
    "https://git.texta.ee/texta/texta-resources/-/raw/master/entity_mapper/companies.json",
    "https://git.texta.ee/texta/texta-resources/-/raw/master/entity_mapper/currencies.json",
    "https://packages.texta.ee/texta-resources/entity_mapper/addresses.json",
    "https://packages.texta.ee/texta-resources/entity_mapper/companies.json",
    "https://packages.texta.ee/texta-resources/entity_mapper/currencies.json",
)
# URLs for Concatenator data sources.
# Three files are listed (months.txt, not_entities.txt, space_between_not_ok.txt),
# each under both the git.texta.ee and the packages.texta.ee location.
# NOTE(review): confirm whether the git.texta.ee entries are still needed.
CONCATENATOR_DATA_FILES = (
"https://git.texta.ee/texta/texta-resources/-/raw/master/concatenator/months.txt",
"https://git.texta.ee/texta/texta-resources/-/raw/master/concatenator/not_entities.txt",
"https://git.texta.ee/texta/texta-resources/-/raw/master/concatenator/space_between_not_ok.txt",
"https://packages.texta.ee/texta-resources/concatenator/months.txt",
"https://packages.texta.ee/texta-resources/concatenator/not_entities.txt",
"https://packages.texta.ee/texta-resources/concatenator/space_between_not_ok.txt",
)
# Location of the resource dir where models are downloaded.
# Taken from the TEXTA_MLP_DATA_DIR environment variable; when it is unset the
# fallback is a "data" directory under the current working directory, resolved
# when this assignment is executed.
DEFAULT_RESOURCE_DIR = os.getenv("TEXTA_MLP_DATA_DIR", os.path.join(os.getcwd(), "data"))
# Data refresh means deleting all existing models and downloading new ones.
# Read from the TEXTA_MLP_REFRESH_DATA environment variable via parse_bool_env,
# with True passed as the default, i.e. refreshing is on unless disabled.
REFRESH_DATA = parse_bool_env("TEXTA_MLP_REFRESH_DATA", True)
# List of all analyzers supported by MLP
SUPPORTED_ANALYZERS = (
"lemmas",
......@@ -66,7 +70,8 @@ class MLP:
use_default_language_code=True,
resource_dir: str = DEFAULT_RESOURCE_DIR,
logging_level="error",
use_gpu=True
use_gpu=True,
refresh_data=REFRESH_DATA
):
self.supported_langs = language_codes
self.logger = logging.getLogger()
......@@ -80,9 +85,7 @@ class MLP:
self.space_between_not_ok_path = self.resource_dir_pathlib / "concatenator" / "space_between_not_ok.txt"
self.months_path = self.resource_dir_pathlib / "concatenator" / "months.txt"
self.download_stanza_resources(self.resource_dir, self.supported_langs, logger=self.logger)
self.download_entity_mapper_resources(self.resource_dir, logger=self.logger)
self.download_concatenator_resources(self.resource_dir, logger=self.logger)
self.prepare_resources(refresh_data)
self.stanza_pipelines = self._load_stanza_pipelines(logging_level)
self.entity_mapper = self._load_entity_mapper()
......@@ -98,6 +101,20 @@ class MLP:
}
def prepare_resources(self, refresh_data):
    """
    Prepares all resources for MLP.

    :param refresh_data: When truthy, the resource directory is removed
        before the download steps run.
    """
    # Delete data if refresh asked. On a fresh install the directory does not
    # exist yet; that must not abort start-up, so a missing directory is
    # treated as "nothing to delete" instead of raising FileNotFoundError.
    if refresh_data:
        try:
            shutil.rmtree(self.resource_dir)
        except FileNotFoundError:
            self.logger.info("MLP data directory not found, nothing to delete.")
        else:
            self.logger.info("MLP data directory deleted.")
    # Download resources.
    self.download_stanza_resources(self.resource_dir, self.supported_langs, logger=self.logger)
    self.download_entity_mapper_resources(self.resource_dir, logger=self.logger)
    self.download_concatenator_resources(self.resource_dir, logger=self.logger)
@staticmethod
def download_stanza_resources(resource_dir: str, supported_langs: List[str], logger=None):
"""
......
import os
def parse_bool_env(env_name: str, default: bool):
    """Read a boolean flag from the environment variable ``env_name``.

    The raw value (or ``str(default)`` when the variable is unset) is
    lower-cased and compared with ``"true"``; any other text yields ``False``.
    """
    raw_value = os.getenv(env_name, str(default))
    return raw_value.lower() == "true"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment