Commit d21f5c0b authored by Marko Kollo's avatar Marko Kollo 😄
Browse files

Merge branch 'stanza_1.2' into 'master'

Stanza 1.2

See merge request !9
parents 0e94c4eb 75fb9090
Pipeline #5518 failed with stage
in 4 minutes and 13 seconds
......@@ -16,7 +16,7 @@ dependencies:
- langdetect
- lang-trans
- pytest
- stanza==1.1.*
- stanza==1.2.*
- regex
- phonenumberslite
- celery==4.*
......
......@@ -5,7 +5,7 @@ import pytest
('ул. Малиновского , д. 25', 'Луганск, ул. Малиновского, д. 25,'),
('ул. Газеты Луганской правды , 130 В', 'г. Луганск, ул. Газеты Луганской правды, 130 В.\n'),
('ул. Матросская Тишина , д. 14А', 'в направлении подконтрольной территории Украины ул. Матросская Тишина, д.14А,'),
('ул. Матросская Тишина , д. 14А', 'в направлении подконтрольной территории Украины ул. Матросская Тишина,д.14А,'),
('ул. Матросская Тишина ,д.14А', 'в направлении подконтрольной территории Украины ул. Матросская Тишина,д.14А,'),
('ул. Матросская Тишина , д.14', 'в направлении подконтрольной территории Украины ул. Матросская Тишина , д.14 , '),
('ул. Курчатова 10а', 'Божьей Матери «Умиление»;- 10.00, г. Луганск, ул. Курчатова 10а, ГБОУ СПО ЛНР'),
('ул. Ильинка 3 / 8', 'по региональной и международной деятельности\nг. Москва, ул. Ильинка 3/8 строение 5'),
......@@ -16,7 +16,7 @@ import pytest
def test_address(mlp, expected_value, test_input):
result = mlp.process(test_input)
facts = result['texta_facts']
entities = [fact['str_val'] for fact in facts if fact['fact'] == 'ADDR']
entities = [fact['str_val'] for fact in facts if fact['fact'] in ('ADDR', 'LOC')]
if expected_value is None:
assert not entities
else:
......
......@@ -8,7 +8,7 @@ from texta_mlp.mlp import MLP
test_texts = [
"Barack Obama is giving Donald Trump a heart attack. Barack Obama and Lavrenti Beria are in the second sentence. Washington is under siege and New York is burning. Washington again. Microsoft is planning cutoffs. Washington DC is located on Potomac river."
"Erinevad riigid käituvad siin väga erinevalt. Eestil on suhtes viirusinfoga arenguruumi, leiab Raul Rebane Vikerraadio päevakommentaaris.",
"Esimene. Jossif Stalin astus Laial tänaval koerasita sisse ning sisenes Lai 42 hoonesse.",
"Jossif Stalin astus Laial tänaval koerasita sisse ning sisenes Lai 42 hoonesse.",
"Sputnik Эстония перешёл на работу в чрезвычайном режиме. Заявление шеф-редактора Читать далее: https://ee.sputniknews.ru/",
"المادة 1 يولد جميع الناس أحرارًا متساوين في الكرامة والحقوق. وقد وهبوا عقلاً وضميرًا وعليهم أن يعامل بعضهم بعضا بروح الإخاء.",
1,
......@@ -78,6 +78,7 @@ def test_mlp_process(mlp: MLP):
# Skip checks for phones with area codes.
# TODO Rewrite this test taking the problem with parenthesis into consideration.
if not ("(" in text and ")" in text):
#print(text, spanned_text, fact)
assert spanned_text == str_val
......
......@@ -11,9 +11,7 @@ import pytest
])
def test_single_namemail(mlp, expected_namemail, test_input):
""" test for single namemails """
print(test_input)
result = mlp.process(test_input, analyzers=("lemmas", "pos_tags", "transliteration", "ner", "contacts", "entities", "namemail"))
print(result)
facts = result['texta_facts']
detected_phone = [fact['str_val'] for fact in facts if fact['fact'] == 'NAMEMAIL']
assert detected_phone == [expected_namemail]
......
......@@ -59,7 +59,6 @@ def test_multiple_phones(mlp, expected_phones, test_input):
def test_no_phones(mlp, test_input):
""" test for no potential phones"""
result = mlp.process(test_input)
print(result['texta_facts'])
facts = result['texta_facts']
phones = [fact['str_val'] for fact in facts if fact['fact'] == 'PHONE_strict']
assert len(phones) == 0
......
......@@ -105,6 +105,8 @@ class EntityMapper():
ngrams = self._ngrams(text, n)
for ngram in ngrams:
ngram = ' '.join(ngram)
# This is kinda hacky, but it kinda fixes Stanza tokenization errors.
ngram = ngram.replace(".", "").strip()
# Checks if ngram is in the entity map. If so, define its type and add it to the mappings
if ngram in self.entity_map:
entity_type = self.entity_map[ngram]
......@@ -200,7 +202,6 @@ class EntityMapper():
if entity_type not in facts:
facts[entity_type] = []
facts[entity_type] = values_with_spans
return facts
......
......@@ -230,12 +230,6 @@ class MLP:
def _get_stanza_tokens(self, lang: str, raw_text: str):
# This is a HACK to compensate Stanza errors in tokenizing Russian phone numbers
# Replaces "-" between digits to "_" so it won't be split into separate tokens (1/2)
pat = re.compile(r"(?<=\d)-(?=\d)")
if lang == "ru":
raw_text = pat.sub("_", raw_text)
pipeline = self.stanza_pipelines[lang](raw_text)
sentences = []
......@@ -244,10 +238,6 @@ class MLP:
for sentence in pipeline.sentences:
words = []
for word in sentence.words:
# Russian HACK (2/2)
# replaces back "#" to "-" between digits.
if lang == "ru":
word.text = pip_pat.sub("-", word.text)
words.append(word)
sentences.append(words)
for entity in sentence.entities:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment