Commit 3924f7d3 authored by Raul Sirel

Merge branch 'handle_unsupported_lang' into 'master'

Handle unsupported lang

See merge request !5
parents 9cbeb8b2 47faf09a
Pipeline #4637 canceled with stage in 41 seconds
@@ -99,6 +99,7 @@ def test_remove_duplicate_facts_by_span_in_doc(expected_non_duplicate_facts, test_input
     doc = Document(
         original_text="",
         dominant_language_code="en",
+        analysis_lang="en",
         stanza_words=[],
         stanza_entities=[],
         entity_mapper=None,
@@ -132,6 +133,7 @@ def test_bound_close_ones(expected_close_BOUNDS, test_input):
     doc = Document(
         original_text="",
         dominant_language_code="en",
+        analysis_lang="en",
         stanza_words=[],
         stanza_entities=[],
         entity_mapper=None,
@@ -164,6 +166,7 @@ def test_remove_overlaping_in_bounded(expected_bounds_no_overlap, test_input):
     doc = Document(
         original_text="",
         dominant_language_code="en",
+        analysis_lang="en",
         stanza_words=[],
         stanza_entities=[],
         entity_mapper=None,
@@ -190,6 +193,7 @@ def test_concatenate_subset_bounds(expected_bounds_no_subsets, test_input):
     doc = Document(
         original_text="",
         dominant_language_code="en",
+        analysis_lang="en",
         stanza_words=[],
         stanza_entities=[],
         entity_mapper=None,
@@ -215,6 +219,7 @@ def test_concatenate_subset_bounds(key_value_single_pairs, test_input):
     doc = Document(
         original_text="",
         dominant_language_code="en",
+        analysis_lang="en",
         stanza_words=[],
         stanza_entities=[],
         entity_mapper=None,
@@ -236,6 +241,7 @@ def test_space_between_ok(mlp, ok_spaces):
     doc = Document(
         original_text="",
         dominant_language_code="en",
+        analysis_lang="en",
         stanza_words=[],
         stanza_entities=[],
         entity_mapper=None,
@@ -261,6 +267,7 @@ def test_space_between_not_ok(mlp, not_ok_spaces):
     doc = Document(
         original_text="",
         dominant_language_code="en",
+        analysis_lang="en",
         stanza_words=[],
         stanza_entities=[],
         entity_mapper=None,
@@ -284,6 +291,7 @@ def test_clean_similar_in_strval(similar_cleaned_str_val, test_input):
     doc = Document(
         original_text="",
         dominant_language_code="en",
+        analysis_lang="en",
         stanza_words=[],
         stanza_entities=[],
         entity_mapper=None,
@@ -38,12 +38,12 @@ def test_mlp_process(mlp: MLP):
     assert isinstance(mlp_text["lemmas"], str) == True
     assert "lang" in mlp_text
-    assert isinstance(mlp_text["lang"], str) == True
+    assert isinstance(mlp_text["lang"], dict) == True
     assert "texta_facts" in result
     assert isinstance(result["texta_facts"], list) == True
-    if mlp_text["lang"] in ("ru", "ar"):
+    if mlp_text["lang"]["analysis_lang"] in ("ru", "ar"):
         assert "transliteration" in mlp_text
     assert "style" not in mlp_text["text"]
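
With this change, mlp_text["lang"] is a dictionary instead of a plain language-code string. A minimal sketch of what the reworked assertions above rely on; the example input is illustrative, "mlp" is the test fixture, and the relation mlp_text = result["text"] is an assumption (only the "analysis_lang" key is confirmed by the diff):

    # Sketch only, not part of this commit.
    result = mlp.process("Пример текста на русском языке.")  # illustrative Russian input
    mlp_text = result["text"]                                 # assumed result layout

    assert isinstance(mlp_text["lang"], dict)                 # previously a plain string
    if mlp_text["lang"]["analysis_lang"] in ("ru", "ar"):     # key confirmed by the tests
        assert "transliteration" in mlp_text
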
@@ -165,12 +165,6 @@ class MLP:
             lang = detect(text)
         except:
             lang = None
-        # check if detected language is supported
-        if lang not in self.supported_langs:
-            if self.use_default_lang:
-                lang = self.default_lang
-            else:
-                raise LanguageNotSupported("Detected language is not supported: {}.".format(lang))
         return lang
@@ -179,12 +173,21 @@ class MLP:
         # detect language
         if not lang:
             lang = self.detect_language(processed_text)
-        words, entities = self._get_stanza_tokens(lang, processed_text) if processed_text else ([], [])
+        '''
+        check if detected language is supported if the language is not supported it will use default_lang to load
+        stanza models yet keep the document lang as the detected language
+        '''
+        if lang not in self.supported_langs:
+            analysis_lang = self.default_lang
+            words, entities = self._get_stanza_tokens(analysis_lang, processed_text) if processed_text else ([], [])
+        else:
+            analysis_lang = lang
+            words, entities = self._get_stanza_tokens(analysis_lang, processed_text) if processed_text else ([], [])
         document = Document(
             original_text=processed_text,
             dominant_language_code=lang,
+            analysis_lang=analysis_lang,
             stanza_words=words,
             stanza_entities=entities,
             analyzers=analyzers,
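
In short: when the detected language is not in supported_langs, the Stanza models for default_lang are used for analysis, while the document keeps the detected language as its dominant_language_code. A minimal sketch of the resulting behaviour; the import path, constructor arguments and example values are assumptions rather than part of this diff:

    from texta_mlp.mlp import MLP  # assumed import path

    mlp = MLP(default_lang="en")   # hypothetical configuration without Polish models

    # Polish is assumed not to be in supported_langs, so analysis falls back to "en".
    result = mlp.process("Witaj świecie, to jest przykładowy tekst.")
    lang_info = result["text"]["lang"]   # assumed result layout, mirroring the tests above

    # Only "analysis_lang" is confirmed by this diff:
    # lang_info["analysis_lang"] == "en"  -> models used for tokens and entities
    # the detected language is still stored as the Document's dominant_language_code
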