Commit 211ffd65 authored by Raul Sirel's avatar Raul Sirel
Browse files

settings + multi-gpu support

parent 1020e71c
Pipeline #6879 canceled with stage
......@@ -15,75 +15,18 @@ from langdetect import detect
from texta_mlp.document import Document
from texta_mlp.entity_mapper import EntityMapper
from texta_mlp.exceptions import LanguageNotSupported, CUDAException
from texta_mlp.utils import parse_bool_env
# Languages supported by default.
DEFAULT_LANG_CODES = ("et", "ru", "en", "ar")
# URLs for default Entity Mapper data sources.
ENTITY_MAPPER_DATA_URLS = (
"https://packages.texta.ee/texta-resources/entity_mapper/addresses.json",
"https://packages.texta.ee/texta-resources/entity_mapper/companies.json",
"https://packages.texta.ee/texta-resources/entity_mapper/currencies.json"
)
# URLs for Concatenator data sources.
CONCATENATOR_DATA_FILES = (
"https://packages.texta.ee/texta-resources/concatenator/months.txt",
"https://packages.texta.ee/texta-resources/concatenator/not_entities.txt",
"https://packages.texta.ee/texta-resources/concatenator/space_between_not_ok.txt",
)
# URLs for Custom NER model downloads.
CUSTOM_NER_MODELS = {
"et": "https://packages.texta.ee/texta-resources/ner_models/_estonian_nertagger.pt",
}
# Location of the resource dir where models are downloaded
DEFAULT_RESOURCE_DIR = os.getenv("TEXTA_MLP_DATA_DIR", os.path.join(os.getcwd(), "data"))
# Data refresh means deleting all existing models and downloading new ones
REFRESH_DATA = parse_bool_env("TEXTA_MLP_REFRESH_DATA", False)
# List of all analyzers supported by MLP
SUPPORTED_ANALYZERS = (
"lemmas",
"pos_tags",
"word_features",
"transliteration",
"ner",
"addresses",
"emails",
"phone_strict",
"entities",
"currency_sum",
"sentences"
from texta_mlp.settings import (
DEFAULT_LANG_CODES,
DEFAULT_RESOURCE_DIR,
REFRESH_DATA,
CUSTOM_NER_MODEL_LANGS,
CUSTOM_NER_MODELS,
STANZA_NER_SUPPORT,
SUPPORTED_ANALYZERS,
DEFAULT_ANALYZERS,
ENTITY_MAPPER_DATA_URLS
)
DEFAULT_ANALYZERS = [
"lemmas",
"pos_tags",
"word_features",
"transliteration",
"ner",
"addresses",
"emails",
"phone_strict",
"entities",
"sentences",
"currency_sum"
]
# Here we define languages with NER support to avoid Stanza trying to load them for languages without NER support.
# This significantly increases performance for languages without NER.
# https://stanfordnlp.github.io/stanza/available_models.html#available-ner-models
STANZA_NER_SUPPORT = ("ar", "zh", "nl", "en", "fr", "de", "ru", "es", "uk")
# Here we add langs that will have custom ner models.
CUSTOM_NER_MODEL_LANGS = ["et"]
class MLP:
......@@ -115,16 +58,15 @@ class MLP:
self.loaded_entity_files = []
self.use_gpu = use_gpu
if self.use_gpu:
# check if cuda available
if not torch.cuda.isavailable():
if not torch.cuda.is_available():
raise CUDAException("Your machine does not support CUDA!")
# select gpu based on env
if gpu_device_id > 0:
device_count = torch.cuda.device_count()
if gpu_device_id > device_count-1:
raise CUDAException(f"Invalid device id: {gpu_device_id}! Your machine only has {device_count} devices.")
raise CUDAException(f"Invalid device id: {gpu_device_id}! Your machine only has {device_count} device(s).")
torch.cuda.set_device(gpu_device_id)
......
import os

from texta_mlp.utils import parse_bool_env

# Languages supported by default.
DEFAULT_LANG_CODES = ("et", "ru", "en", "ar")

# URLs for default Entity Mapper data sources.
ENTITY_MAPPER_DATA_URLS = (
    "https://packages.texta.ee/texta-resources/entity_mapper/addresses.json",
    "https://packages.texta.ee/texta-resources/entity_mapper/companies.json",
    "https://packages.texta.ee/texta-resources/entity_mapper/currencies.json"
)

# URLs for Concatenator data sources.
CONCATENATOR_DATA_FILES = (
    "https://packages.texta.ee/texta-resources/concatenator/months.txt",
    "https://packages.texta.ee/texta-resources/concatenator/not_entities.txt",
    "https://packages.texta.ee/texta-resources/concatenator/space_between_not_ok.txt",
)

# URLs for Custom NER model downloads, keyed by language code.
CUSTOM_NER_MODELS = {
    "et": "https://packages.texta.ee/texta-resources/ner_models/_estonian_nertagger.pt",
}

# Location of the resource dir where models are downloaded.
# Overridable via the TEXTA_MLP_DATA_DIR environment variable.
DEFAULT_RESOURCE_DIR = os.getenv("TEXTA_MLP_DATA_DIR", os.path.join(os.getcwd(), "data"))

# Data refresh means deleting all existing models and downloading new ones.
REFRESH_DATA = parse_bool_env("TEXTA_MLP_REFRESH_DATA", False)

# List of all analyzers supported by MLP.
SUPPORTED_ANALYZERS = (
    "lemmas",
    "pos_tags",
    "word_features",
    "transliteration",
    "ner",
    "addresses",
    "emails",
    "phone_strict",
    "entities",
    "currency_sum",
    "sentences"
)

# Analyzers applied when the caller does not specify any.
DEFAULT_ANALYZERS = [
    "lemmas",
    "pos_tags",
    "word_features",
    "transliteration",
    "ner",
    "addresses",
    "emails",
    "phone_strict",
    "entities",
    "sentences",
    "currency_sum"
]

# Here we define languages with NER support to avoid Stanza trying to load them for languages without NER support.
# This significantly increases performance for languages without NER.
# https://stanfordnlp.github.io/stanza/available_models.html#available-ner-models
STANZA_NER_SUPPORT = ("ar", "zh", "nl", "en", "fr", "de", "ru", "es", "uk")

# Languages that have custom NER models. Derived from CUSTOM_NER_MODELS so the
# two constants can never drift out of sync when a new model is added above.
CUSTOM_NER_MODEL_LANGS = list(CUSTOM_NER_MODELS)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment