texta / texta-mlp-python

Commit 211ffd65
Authored Oct 22, 2021 by Raul Sirel

settings + multi-gpu support

Parent: 1020e71c
Pipeline #6879: canceled
Changes: 3
Pipelines: 1
VERSION

-1.15.0
+1.15.1
texta_mlp/mlp.py
@@ -15,75 +15,18 @@ from langdetect import detect
 from texta_mlp.document import Document
 from texta_mlp.entity_mapper import EntityMapper
 from texta_mlp.exceptions import LanguageNotSupported, CUDAException
-from texta_mlp.utils import parse_bool_env
-
-# Languages supported by default.
-DEFAULT_LANG_CODES = ("et", "ru", "en", "ar")
-
-# URLs for default Entity Mapper data sources.
-ENTITY_MAPPER_DATA_URLS = (
-    "https://packages.texta.ee/texta-resources/entity_mapper/addresses.json",
-    "https://packages.texta.ee/texta-resources/entity_mapper/companies.json",
-    "https://packages.texta.ee/texta-resources/entity_mapper/currencies.json"
-)
-
-# URLs for Concatenator data sources.
-CONCATENATOR_DATA_FILES = (
-    "https://packages.texta.ee/texta-resources/concatenator/months.txt",
-    "https://packages.texta.ee/texta-resources/concatenator/not_entities.txt",
-    "https://packages.texta.ee/texta-resources/concatenator/space_between_not_ok.txt",
-)
-
-# URLs for Custom NER model downloads.
-CUSTOM_NER_MODELS = {
-    "et": "https://packages.texta.ee/texta-resources/ner_models/_estonian_nertagger.pt",
-}
-
-# Location of the resource dir where models are downloaded
-DEFAULT_RESOURCE_DIR = os.getenv("TEXTA_MLP_DATA_DIR", os.path.join(os.getcwd(), "data"))
-
-# Data refresh means deleting all existing models and downloading new ones
-REFRESH_DATA = parse_bool_env("TEXTA_MLP_REFRESH_DATA", False)
-
-# List of all analyzers supported by MLP
-SUPPORTED_ANALYZERS = (
-    "lemmas",
-    "pos_tags",
-    "word_features",
-    "transliteration",
-    "ner",
-    "addresses",
-    "emails",
-    "phone_strict",
-    "entities",
-    "currency_sum",
-    "sentences"
-)
-
-DEFAULT_ANALYZERS = [
-    "lemmas",
-    "pos_tags",
-    "word_features",
-    "transliteration",
-    "ner",
-    "addresses",
-    "emails",
-    "phone_strict",
-    "entities",
-    "sentences",
-    "currency_sum"
-]
-
-# Here we define languages with NER support to avoid Stanza trying to load them for languages without NER support.
-# This significantly increases performance for languages without NER.
-# https://stanfordnlp.github.io/stanza/available_models.html#available-ner-models
-STANZA_NER_SUPPORT = ("ar", "zh", "nl", "en", "fr", "de", "ru", "es", "uk")
-
-# Here we add langs that will have custom ner models.
-CUSTOM_NER_MODEL_LANGS = ["et"]
+from texta_mlp.settings import (
+    DEFAULT_LANG_CODES,
+    DEFAULT_RESOURCE_DIR,
+    REFRESH_DATA,
+    CUSTOM_NER_MODEL_LANGS,
+    CUSTOM_NER_MODELS,
+    STANZA_NER_SUPPORT,
+    SUPPORTED_ANALYZERS,
+    DEFAULT_ANALYZERS,
+    ENTITY_MAPPER_DATA_URLS
+)

 class MLP:
@@ -115,16 +58,15 @@ class MLP:
         self.loaded_entity_files = []
         self.use_gpu = use_gpu
         if self.use_gpu:
             # check if cuda available
-            if not torch.cuda.isavailable():
+            if not torch.cuda.is_available():
                 raise CUDAException("Your machine does not support CUDA!")
             # select gpu based on env
             if gpu_device_id > 0:
                 device_count = torch.cuda.device_count()
                 if gpu_device_id > device_count - 1:
-                    raise CUDAException(f"Invalid device id: {gpu_device_id}! Your machine only has {device_count} devices.")
+                    raise CUDAException(f"Invalid device id: {gpu_device_id}! Your machine only has {device_count} device(s).")
                 torch.cuda.set_device(gpu_device_id)
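Note on the hunk above: GPU use is gated on torch.cuda.is_available(), and a non-zero gpu_device_id is validated against torch.cuda.device_count() before torch.cuda.set_device() pins the process to that card. A minimal usage sketch follows; it assumes the MLP constructor exposes use_gpu and gpu_device_id as keyword arguments (both names appear in the hunk, but the full __init__ signature is not visible in this diff):

# Minimal sketch. Only use_gpu and gpu_device_id are taken from the hunk above;
# everything else about the constructor call is an assumption.
import torch

from texta_mlp.mlp import MLP

if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    # Pin MLP to the second GPU (device ids are zero-based); an out-of-range
    # id raises CUDAException per the validation in the hunk above.
    mlp = MLP(use_gpu=True, gpu_device_id=1)
else:
    # Fall back to CPU when no additional GPU is present.
    mlp = MLP(use_gpu=False)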
texta_mlp/settings.py (new file, 0 → 100644)
import os

from texta_mlp.utils import parse_bool_env

# Languages supported by default.
DEFAULT_LANG_CODES = ("et", "ru", "en", "ar")

# URLs for default Entity Mapper data sources.
ENTITY_MAPPER_DATA_URLS = (
    "https://packages.texta.ee/texta-resources/entity_mapper/addresses.json",
    "https://packages.texta.ee/texta-resources/entity_mapper/companies.json",
    "https://packages.texta.ee/texta-resources/entity_mapper/currencies.json"
)

# URLs for Concatenator data sources.
CONCATENATOR_DATA_FILES = (
    "https://packages.texta.ee/texta-resources/concatenator/months.txt",
    "https://packages.texta.ee/texta-resources/concatenator/not_entities.txt",
    "https://packages.texta.ee/texta-resources/concatenator/space_between_not_ok.txt",
)

# URLs for Custom NER model downloads.
CUSTOM_NER_MODELS = {
    "et": "https://packages.texta.ee/texta-resources/ner_models/_estonian_nertagger.pt",
}

# Location of the resource dir where models are downloaded
DEFAULT_RESOURCE_DIR = os.getenv("TEXTA_MLP_DATA_DIR", os.path.join(os.getcwd(), "data"))

# Data refresh means deleting all existing models and downloading new ones
REFRESH_DATA = parse_bool_env("TEXTA_MLP_REFRESH_DATA", False)

# List of all analyzers supported by MLP
SUPPORTED_ANALYZERS = (
    "lemmas",
    "pos_tags",
    "word_features",
    "transliteration",
    "ner",
    "addresses",
    "emails",
    "phone_strict",
    "entities",
    "currency_sum",
    "sentences"
)

DEFAULT_ANALYZERS = [
    "lemmas",
    "pos_tags",
    "word_features",
    "transliteration",
    "ner",
    "addresses",
    "emails",
    "phone_strict",
    "entities",
    "sentences",
    "currency_sum"
]

# Here we define languages with NER support to avoid Stanza trying to load them for languages without NER support.
# This significantly increases performance for languages without NER.
# https://stanfordnlp.github.io/stanza/available_models.html#available-ner-models
STANZA_NER_SUPPORT = ("ar", "zh", "nl", "en", "fr", "de", "ru", "es", "uk")

# Here we add langs that will have custom ner models.
CUSTOM_NER_MODEL_LANGS = ["et"]
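Two of these settings are environment-driven: DEFAULT_RESOURCE_DIR reads TEXTA_MLP_DATA_DIR via os.getenv, and REFRESH_DATA is parsed from TEXTA_MLP_REFRESH_DATA by parse_bool_env from texta_mlp.utils. parse_bool_env itself is not part of this commit, so the sketch below is only an assumed reading of what it likely does, shown together with how the variables would be set before texta_mlp.settings is imported:

# Hypothetical sketch of parse_bool_env; not the actual texta_mlp.utils code.
# Assumption: it reads the named environment variable and maps common truthy
# strings to True, otherwise returning the supplied default.
import os


def parse_bool_env(env_name: str, default: bool) -> bool:
    value = os.getenv(env_name)
    if value is None:
        return default
    return value.strip().lower() in ("1", "true", "yes", "y")


# Example: relocate the model/resource directory and force a re-download.
# Both variable names come from settings.py above; the path is illustrative.
os.environ["TEXTA_MLP_DATA_DIR"] = "/srv/texta/mlp_data"
os.environ["TEXTA_MLP_REFRESH_DATA"] = "true"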