Commit 5cb7a9f7 authored by Raul Sirel
Browse files

Merge branch 'cuda-device-selection'

parents 53a869f9 211ffd65
Pipeline #6881 canceled with stages
in 14 minutes and 33 seconds
......@@ -9,3 +9,7 @@ class BoundedListEmpty(Exception):
class StanzaPipelineFail(Exception):
    """Raised when Stanza pipelines fail to load."""
class CUDAException(Exception):
    """Raised for problems with CUDA settings or support.

    Examples are CUDA not being available on the machine, or a GPU device id
    that does not exist.
    """
......@@ -2,6 +2,7 @@ import logging
import os
import pathlib
import shutil
import torch
from typing import List, Optional
from urllib.parse import urlparse
from urllib.request import urlopen
......@@ -13,89 +14,33 @@ from langdetect import detect
from texta_mlp.document import Document
from texta_mlp.entity_mapper import EntityMapper
from texta_mlp.exceptions import LanguageNotSupported
from texta_mlp.utils import parse_bool_env
# Languages supported by default.
DEFAULT_LANG_CODES = ("et", "ru", "en", "ar")
# URLs for default Entity Mapper data sources.
ENTITY_MAPPER_DATA_URLS = (
"https://packages.texta.ee/texta-resources/entity_mapper/addresses.json",
"https://packages.texta.ee/texta-resources/entity_mapper/companies.json",
"https://packages.texta.ee/texta-resources/entity_mapper/currencies.json"
)
# URLs for Concatenator data sources.
CONCATENATOR_DATA_FILES = (
"https://packages.texta.ee/texta-resources/concatenator/months.txt",
"https://packages.texta.ee/texta-resources/concatenator/not_entities.txt",
"https://packages.texta.ee/texta-resources/concatenator/space_between_not_ok.txt",
)
# URLs for Custom NER model downloads.
CUSTOM_NER_MODELS = {
"et": "https://packages.texta.ee/texta-resources/ner_models/_estonian_nertagger.pt",
}
# Location of the resource dir where models are downloaded
DEFAULT_RESOURCE_DIR = os.getenv("TEXTA_MLP_DATA_DIR", os.path.join(os.getcwd(), "data"))
# Data refresh means deleting all existing models and downloading new ones
REFRESH_DATA = parse_bool_env("TEXTA_MLP_REFRESH_DATA", False)
# List of all analyzers supported by MLP
SUPPORTED_ANALYZERS = (
"lemmas",
"pos_tags",
"word_features",
"transliteration",
"ner",
"addresses",
"emails",
"phone_strict",
"entities",
"currency_sum",
"sentences"
from texta_mlp.exceptions import LanguageNotSupported, CUDAException
from texta_mlp.settings import (
DEFAULT_LANG_CODES,
DEFAULT_RESOURCE_DIR,
REFRESH_DATA,
CUSTOM_NER_MODEL_LANGS,
CUSTOM_NER_MODELS,
STANZA_NER_SUPPORT,
SUPPORTED_ANALYZERS,
DEFAULT_ANALYZERS,
ENTITY_MAPPER_DATA_URLS
)
DEFAULT_ANALYZERS = [
"lemmas",
"pos_tags",
"word_features",
"transliteration",
"ner",
"addresses",
"emails",
"phone_strict",
"entities",
"sentences",
"currency_sum"
]
# Here we define languages with NER support to avoid Stanza trying to load them for languages without NER support.
# This significantly increases performance for languages without NER.
# https://stanfordnlp.github.io/stanza/available_models.html#available-ner-models
STANZA_NER_SUPPORT = ("ar", "zh", "nl", "en", "fr", "de", "ru", "es", "uk")
# Here we add langs that will have custom ner models.
CUSTOM_NER_MODEL_LANGS = ["et"]
class MLP:
def __init__(
self,
language_codes=DEFAULT_LANG_CODES,
default_language_code=DEFAULT_LANG_CODES[0],
language_codes = DEFAULT_LANG_CODES,
default_language_code = DEFAULT_LANG_CODES[0],
use_default_language_code=True,
resource_dir: str = DEFAULT_RESOURCE_DIR,
ner_model_langs: list = CUSTOM_NER_MODEL_LANGS,
logging_level="error",
use_gpu=True,
refresh_data=REFRESH_DATA
logging_level: str ="error",
use_gpu: bool = True,
gpu_device_id: int = 0,
refresh_data: bool = REFRESH_DATA
):
self.supported_langs = language_codes
self.logger = logging.getLogger()
......@@ -106,15 +51,23 @@ class MLP:
self._stanza_pipelines = {}
self.custom_ner_model_langs = ner_model_langs
self.logging_level = logging_level
self.use_gpu = use_gpu
self.stanza_resource_path = pathlib.Path(self.resource_dir) / "stanza"
self.custom_ner_model_Path = pathlib.Path(self.resource_dir) / "ner_models"
self.prepare_resources(refresh_data)
self._entity_mapper = None
self.loaded_entity_files = []
self.use_gpu = use_gpu
if self.use_gpu:
# check if cuda available
if not torch.cuda.is_available():
raise CUDAException("Your machine does not support CUDA!")
# select gpu based on env
if gpu_device_id > 0:
device_count = torch.cuda.device_count()
if gpu_device_id > device_count-1:
raise CUDAException(f"Invalid device id: {gpu_device_id}! Your machine only has {device_count} device(s).")
torch.cuda.set_device(gpu_device_id)
def prepare_resources(self, refresh_data):
......
import os
from texta_mlp.utils import parse_bool_env
# Language codes that are enabled when the caller does not specify any.
DEFAULT_LANG_CODES = ("et", "ru", "en", "ar")

# Download locations of the default Entity Mapper data files.
ENTITY_MAPPER_DATA_URLS = (
    "https://packages.texta.ee/texta-resources/entity_mapper/addresses.json",
    "https://packages.texta.ee/texta-resources/entity_mapper/companies.json",
    "https://packages.texta.ee/texta-resources/entity_mapper/currencies.json",
)

# Download locations of the Concatenator data files.
CONCATENATOR_DATA_FILES = (
    "https://packages.texta.ee/texta-resources/concatenator/months.txt",
    "https://packages.texta.ee/texta-resources/concatenator/not_entities.txt",
    "https://packages.texta.ee/texta-resources/concatenator/space_between_not_ok.txt",
)

# Custom NER model download URL, keyed by language code.
CUSTOM_NER_MODELS = {
    "et": "https://packages.texta.ee/texta-resources/ner_models/_estonian_nertagger.pt",
}

# Resource directory where models are downloaded. Taken from the
# TEXTA_MLP_DATA_DIR environment variable; when it is unset, falls back to a
# "data" directory under the working directory at import time.
DEFAULT_RESOURCE_DIR = os.environ.get(
    "TEXTA_MLP_DATA_DIR",
    os.path.join(os.getcwd(), "data"),
)

# A data refresh deletes all existing models and downloads new ones.
# Controlled by the TEXTA_MLP_REFRESH_DATA environment variable (default: off).
REFRESH_DATA = parse_bool_env("TEXTA_MLP_REFRESH_DATA", False)

# Every analyzer MLP supports.
SUPPORTED_ANALYZERS = (
    "lemmas",
    "pos_tags",
    "word_features",
    "transliteration",
    "ner",
    "addresses",
    "emails",
    "phone_strict",
    "entities",
    "currency_sum",
    "sentences",
)

# Default analyzer selection. NOTE(review): presumably what is applied when
# the caller does not choose analyzers -- confirm against the MLP class.
DEFAULT_ANALYZERS = [
    "lemmas",
    "pos_tags",
    "word_features",
    "transliteration",
    "ner",
    "addresses",
    "emails",
    "phone_strict",
    "entities",
    "sentences",
    "currency_sum",
]

# Languages for which Stanza ships a NER model. Listing them explicitly keeps
# Stanza from trying to load NER for languages that have none, which
# significantly improves performance for those languages.
# https://stanfordnlp.github.io/stanza/available_models.html#available-ner-models
STANZA_NER_SUPPORT = ("ar", "zh", "nl", "en", "fr", "de", "ru", "es", "uk")

# Languages that get a custom NER model.
CUSTOM_NER_MODEL_LANGS = ["et"]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment