Commit 43e67721 authored by Raul Sirel

add sent support for ner spans

parent a0cbbf58
Pipeline #6544 passed with stages in 18 minutes and 53 seconds
import pytest
from texta_mlp.document import Document
from texta_mlp.fact import Fact
from texta_mlp.mlp import MLP

test_facts_aleksander = [
    Fact(
        fact_type="NAMEMAIL",
        fact_value="Aleksander Great aleksandersuur356eKr@mail.ee",
        doc_path="text",
        spans=[[30, 75]]
    ),
    Fact(
        fact_type="PER",
        fact_value="Aleksander Great",
        doc_path="text",
        spans=[[30, 46]]
    )
]
test_facts_to_be_bound = [
    Fact(
        fact_type="EMAIL",
        fact_value="aleksandersuur356eKr@mail.ee",
        doc_path="text",
        spans=[[47, 75]]
    ),
    Fact(
        fact_type="PER",
        fact_value="Aleksander Great",
        doc_path="text",
        spans=[[30, 46]]
    ),
    Fact(
        fact_type="PHONE_high_recall",
        fact_value="356356356",
        doc_path="text",
        spans=[[80, 89]]
    ),
    Fact(
        fact_type="EMAIL",
        fact_value="aleksandersuur356eKr@mail.ee",
        doc_path="text",
        spans=[[97, 125]]
    ),
    Fact(
        fact_type="PER",
        fact_value="Julius Caecar",
        doc_path="text",
        spans=[[300, 312]]
    )
]
test_facts_to_be_bound2 = test_facts_to_be_bound + [
    Fact(
        fact_type="PHONE_high_recall",
        fact_value="356356356",
        doc_path="text",
        spans=[[400, 409]]
    ),
    Fact(
        fact_type="EMAIL",
        fact_value="aleksandersuur356eKr@mail.ee",
        doc_path="text",
        spans=[[417, 485]]
    )
]

test_text1 = "От кого: Канцелярия Президента <presidendikantelei@mail.ru>\nКому: Яан Тамм <vanas6ber@bk.ru>\nДата: Четверг, 3 октября 2013, 17:27 +04:00\nТема: контакт\nТелефон президента 45667788, он живет в Москве. Вы также можете отправить ему электронное письмо по адресу\nolenpresident@gmail.com\nС уважением,\nКанцелярия Президента"
test_text1_bounds = [
    "{'PER': ['Канцелярия Президента'], 'EMAIL': ['presidendikantelei@mail.ru']}",
    "{'PHONE_high_recall': ['45667788'], 'LOC': ['Москве'], 'EMAIL': ['olenpresident@gmail.com']}",
    "{'PER': ['Яан Тамм'], 'EMAIL': ['vanas6ber@bk.ru']}"
]

mlp_wrapper = MLP(language_codes=["et", "ru", "en"], logging_level="info", use_gpu=False)

def test_created_bounds():
    result = mlp_wrapper.process(test_text1)
    bounded = [fact['str_val'] for fact in result['texta_facts'] if fact['fact'] == 'BOUNDED']
    for bound in bounded:
        assert bound in test_text1_bounds
    for fact in result["texta_facts"]:
        assert isinstance(fact["doc_path"], str) is True
        assert fact["doc_path"] == "text.text"
        assert isinstance(fact["str_val"], str) is True
        assert isinstance(fact["spans"], str) is True
        assert isinstance(fact["fact"], str) is True

@pytest.mark.parametrize("expected_non_duplicate_facts, test_input", [
([('aleksandersuur356eKr@mail.ee', 'EMAIL', [(47, 75)]), ('Aleksander Great', 'PER', [(30, 46)])], test_facts_aleksander)
])
def test_remove_duplicate_facts_by_span_in_doc(expected_non_duplicate_facts, test_input):
""" test for namemails unbounding"""
doc = Document(
original_text="",
dominant_language_code="en",
analysis_lang="en",
stanza_sentences=[],
stanza_entities=[],
entity_mapper=None,
doc_path="text",
json_doc=None,
analyzers=[],
concat_resources=mlp_wrapper.concat_resources
)
output_facts = doc.remove_duplicate_facts_by_span(test_input)
output_list = []
for o_f in output_facts:
output_list += [(o_f.fact_value, o_f.fact_type, o_f.spans)]
assert output_list == expected_non_duplicate_facts
@pytest.mark.parametrize("expected_close_BOUNDS, test_input", [
([{'doc_path': 'text',
'fact': 'BOUNDED',
'spans': [[30, 46], [47, 75], [80, 89], [97, 125]],
'str_val': {'PER': ['Aleksander Great'],
'EMAIL': ['aleksandersuur356eKr@mail.ee', 'aleksandersuur356eKr@mail.ee'],
'PHONE_high_recall': ['356356356']},
'str_values': [('Aleksander Great', 'PER'),
('aleksandersuur356eKr@mail.ee', 'EMAIL'),
('356356356', 'PHONE_high_recall'),
('aleksandersuur356eKr@mail.ee', 'EMAIL')]}], test_facts_to_be_bound)
])
def test_bound_close_ones(expected_close_BOUNDS, test_input):
""" test for namemails unbounding"""
doc = Document(
original_text="",
dominant_language_code="en",
analysis_lang="en",
stanza_sentences=[],
stanza_entities=[],
entity_mapper=None,
doc_path="text",
json_doc=None,
analyzers=[],
concat_resources=mlp_wrapper.concat_resources
)
output_list = doc.bound_close_ones(test_input)
for i, BOUND in enumerate(output_list):
assert BOUND["fact"] == "BOUNDED"
assert BOUND["str_val"].keys() == expected_close_BOUNDS[i]["str_val"].keys()
for key in BOUND["str_val"]:
for element in BOUND["str_val"][key]:
assert element in expected_close_BOUNDS[i]["str_val"][key]
for str_value in BOUND["str_values"]:
assert str_value in expected_close_BOUNDS[i]["str_values"]
@pytest.mark.parametrize("expected_bounds_no_overlap, test_input", [
([{'doc_path': 'text',
'fact': 'BOUNDED',
'spans': [[30, 46], [47, 75], [80, 89], [97, 125]],
'str_val': {'PER': ['Aleksander Great'],
'EMAIL': ['aleksandersuur356eKr@mail.ee'],
'PHONE_high_recall': ['356356356']}}], test_facts_to_be_bound)
])
def test_remove_overlaping_in_bounded(expected_bounds_no_overlap, test_input):
doc = Document(
original_text="",
dominant_language_code="en",
analysis_lang="en",
stanza_sentences=[],
stanza_entities=[],
entity_mapper=None,
doc_path="text",
json_doc=None,
analyzers=[],
concat_resources=mlp_wrapper.concat_resources
)
bounded_facts = doc.bound_close_ones(test_input)
output_list = doc.remove_overlaping_in_bounded(bounded_facts)
for i, BOUND in enumerate(output_list):
assert BOUND["fact"] == "BOUNDED"
assert BOUND["str_val"].keys() == expected_bounds_no_overlap[i]["str_val"].keys()
for key in BOUND["str_val"]:
for element in BOUND["str_val"][key]:
assert element in expected_bounds_no_overlap[i]["str_val"][key]
@pytest.mark.parametrize("expected_bounds_no_subsets, test_input", [
([("{'PER': ['Aleksander Great'], 'EMAIL': ['aleksandersuur356eKr@mail.ee'], 'PHONE_high_recall': ['356356356']}", 'BOUNDED', [[30, 46], [47, 75], [80, 89], [97, 125], [400, 409], [417, 485]])], test_facts_to_be_bound2)
])
def test_concatenate_subset_bounds(expected_bounds_no_subsets, test_input):
doc = Document(
original_text="",
dominant_language_code="en",
analysis_lang="en",
stanza_sentences=[],
stanza_entities=[],
entity_mapper=None,
doc_path="text",
json_doc=None,
analyzers=[],
concat_resources=mlp_wrapper.concat_resources
)
bounded_facts = doc.bound_close_ones(test_input)
bounded_facts = doc.remove_overlaping_in_bounded(bounded_facts)
output_facts = doc.concatenate_subset_bounds(bounded_facts)
output_list = []
for o_f in output_facts:
output_list += [(o_f.fact_value, o_f.fact_type, o_f.spans)]
assert output_list == expected_bounds_no_subsets
@pytest.mark.parametrize("key_value_single_pairs, test_input", [
([('a', 1), ('a', 2), ('b', 3), ('b', 4)], {'a': [1, 2], 'b': [3, 4]})
])
def test_concatenate_subset_bounds(key_value_single_pairs, test_input):
doc = Document(
original_text="",
dominant_language_code="en",
analysis_lang="en",
stanza_sentences=[],
stanza_entities=[],
entity_mapper=None,
doc_path="text",
json_doc=None,
analyzers=[],
concat_resources=mlp_wrapper.concat_resources
)
result = doc.key_value_single_pairs(test_input)
assert result == key_value_single_pairs
@pytest.mark.parametrize("ok_spaces", [
("Kalevipoeg kirjutas Sulevipojale", (0, 9), (20, 31))
])
def test_space_between_ok(mlp, ok_spaces):
""" test for okay spaces"""
doc = Document(
original_text="",
dominant_language_code="en",
analysis_lang="en",
stanza_sentences=[],
stanza_entities=[],
entity_mapper=None,
doc_path="text",
json_doc=None,
analyzers=[],
concat_resources=mlp_wrapper.concat_resources
)
result = doc.space_between_ok(ok_spaces[0], ok_spaces[1], ok_spaces[2])
assert result is True
@pytest.mark.parametrize("not_ok_spaces", [
("Vasia Ivanov <am2311@hotmail.com> написал(а): > >> Павел, >>", (0, 12), (52, 57)),
("gmail.com\nSaadetud: 21.04.2015, 12:13:14\nSaaja: Magnus", (0, 8), (41, 46)),
("От кого: Dmitry Chubasskoel <dmitry.chubasdknoi@gmail.com>\nКому: 89178882978@mail.ru", (28, 56), (64, 81)),
("salapärane@yandex.ru>:\nПавел", (0, 18), (24, 27)),
("Павел\n\nПятница, 11 ноября 2016 г., 15:31 +0300 от Александр", (0, 5), (53, 62)),
("entity С уважением, Александр", (0, 5), (20, 28))
])
def test_space_between_not_ok(mlp, not_ok_spaces):
# test for spaces that are not ok
doc = Document(
original_text="",
dominant_language_code="en",
analysis_lang="en",
stanza_sentences=[],
stanza_entities=[],
entity_mapper=None,
doc_path="text",
json_doc=None,
analyzers=[],
concat_resources=mlp_wrapper.concat_resources
)
result = doc.space_between_ok(not_ok_spaces[0], not_ok_spaces[1], not_ok_spaces[2])
assert result == False
@pytest.mark.parametrize("similar_cleaned_str_val, test_input", [
({'PER': ['Pipi Pikk Sukk']}, {'PER': ['Pipi', 'Pipi Pikk Sukk>', 'Pipi Pikk Sukk']}),
({'PER': ['Pipi Pikk Sukk', 'Pipi Pikk Jalg']}, {'PER': ['Pipi', 'Pipi Pikk Sukk', 'Pipi Pikk Jalg']}),
({'PER': ['Tom Marvolo Riddle', 'Tom Marvolo']}, {'PER': ['Tom', 'Tom Marvolo', 'Tom Marvolo Riddle']}),
({'ORG': ['Eesti Vabariik', 'Eesti Valitsus', 'Valitsus']}, {'ORG': ['Eesti Vabariik', 'Eesti Valitsus', 'Valitsus']}),
({'PER': ['Pipi Pikk Sukk'], 'ORG': ['Народного Совета по промышленности и торговле']}, {'PER': ['Pipi Sukk', 'Pipi Pikk Sukk'], 'ORG': ['Народного Совета по промышленности и торговле', 'Народного Совета']})
])
def test_clean_similar_in_strval(similar_cleaned_str_val, test_input):
doc = Document(
original_text="",
dominant_language_code="en",
analysis_lang="en",
stanza_sentences=[],
stanza_entities=[],
entity_mapper=None,
doc_path="text",
json_doc=None,
analyzers=[],
concat_resources=mlp_wrapper.concat_resources
)
result = doc.clean_similar_in_strval(dict(test_input))
assert result.keys() == similar_cleaned_str_val.keys()
for key in result:
for element in result[key]:
assert element in similar_cleaned_str_val[key]
......@@ -10,7 +10,7 @@ import pytest
    ('30 рубль', 'Рубль — название современных валют России (российский рубль), Белоруссии (белорусский рубль). 30 рубль.')
])
def test_currency_parse(mlp, expected_value, test_input):
-    result = mlp.process(test_input)
+    result = mlp.process(test_input, analyzers=["currency_sum", "entities"])
    facts = result['texta_facts']
    detected_facts = [fact['str_val'] for fact in facts if fact['fact'] == 'CURRENCY_SUM']
    assert expected_value in detected_facts
......@@ -23,9 +23,9 @@ def test_currency_parse(mlp, expected_value, test_input):
def test_doc_path_parsing_inside_doc(mlp):
    payload = {
        "docs": [{"text": {"subsection": "Eesti keeles on see 2,5 eurot."}}],
-        "doc_paths": ["text.subsection"]
+        "doc_paths": ["text.subsection"],
    }
-    results = mlp.process_docs(**payload)
+    results = mlp.process_docs(**payload, analyzers=["currency_sum", "entities"])
    facts = results[0]["texta_facts"]
    assert len(facts) > 1
    for fact in facts:
......
......@@ -65,6 +65,7 @@ def test_mlp_process(mlp: MLP):
        fact_value = fact["str_val"]
        fact_spans = fact["spans"]
        fact_name = fact["fact"]
+        sent_index = fact["sent_index"]
        assert isinstance(fact_path, str) is True
        assert isinstance(fact_value, str) is True
......@@ -84,11 +85,12 @@ def test_mlp_process(mlp: MLP):
            if fact_name == "EMAIL":
                spanned_text = spanned_text.replace(' ', '')
            str_val = fact_value.lower()
            # Skip checks for phones with area codes.
            # TODO Rewrite this test taking the problem with parenthesis into consideration.
            if not ("(" in text and ")" in text):
                # print(text, spanned_text, fact)
+                if sent_index >= 1:
+                    sent = text.split(" \n ")[sent_index]
+                    spanned_text = sent[span[0]:span[1]].lower()
                assert spanned_text == str_val
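
The hunks above encode the span convention this commit introduces: a fact in the first sentence keeps text-absolute spans, while a fact with sent_index >= 1 carries spans relative to its own sentence, with sentences joined by " \n " in the tokenized output. A minimal hedged sketch of a consumer of that convention (the helper name and the fact dict shape are invented for illustration):

def resolve_span(text: str, fact: dict) -> str:
    # Sentences in the tokenized output are assumed to be separated by " \n ".
    start, end = fact["spans"][0]
    if fact["sent_index"] >= 1:
        # Spans of later sentences index into that sentence, not the whole text.
        sentence = text.split(" \n ")[fact["sent_index"]]
        return sentence[start:end]
    return text[start:end]
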
......@@ -133,11 +135,12 @@ def test_mlp_lemmatize(mlp: MLP, expected_lemmas, test_input):
def test_existing_facts_not_being_overwritten(mlp: MLP):
    payload = {
        "texts": ["Edgar Savisaar elas Nõmme tänav 24"],
-        "texta_facts": [{"fact": "PER", "lemma": None, "str_val": "Edgar Savisaar", "spans": "[[0, 14]]", "doc_path": "texts_mlp.text"}]
+        "texta_facts": [{"fact": "PER", "lemma": None, "str_val": "Edgar Savisaar", "sent_index": 0, "spans": "[[0, 14]]", "doc_path": "texts_mlp.text"}]
    }
-    result = mlp.process_docs([payload], doc_paths=["texts"])
+    result = mlp.process_docs([payload], doc_paths=["texts"], analyzers=["entities"])
    original_facts = result[0]["texta_facts"]
-    assert len(original_facts) == 3  # Two normal facts + 1 BOUNDED.
+    assert len(original_facts) == 2  # Two normal facts, no BOUNDED.
    facts = [fact for fact in original_facts if fact["fact"] != "BOUNDED"]
    assert len(facts) == 2  # Both facts are normal; the existing one was not overwritten.
......@@ -149,7 +152,7 @@ def test_existing_facts_not_being_overwritten(mlp: MLP):
def test_removal_of_duplicate_facts(mlp: MLP):
    payload = {
        "texts": ["Edgar Savisaar elas Nõmme tänav 24"],
-        "texta_facts": [{'doc_path': 'texts_mlp.text', 'lemma': None, 'fact': 'ADDR', 'spans': '[[20, 34]]', 'str_val': 'nõmme tänav 24'}]
+        "texta_facts": [{'doc_path': 'texts_mlp.text', 'lemma': None, 'fact': 'ADDR', 'spans': '[[20, 34]]', 'str_val': 'nõmme tänav 24', 'sent_index': 0}]
    }
    result = mlp.process_docs([payload], doc_paths=["texts"])
    facts = result[0]["texta_facts"]
......
import pytest
@pytest.mark.parametrize("expected_namemail, test_input", [
('Керсти Кальюлайд kaljulaidkersti@yandex.ru', 'Отправлено с iPhone06.08.2015, в 20:05, Керсти Кальюлайд kaljulaidkersti@yandex.ru написал(а): '),
('Керсти Кальюлайд < kersti1298@mail.ru', 'Воскресенье, 17 января 2016, 1:10 +03:00 от Керсти Кальюлайд < kersti1298@mail.ru >:'),
# ('Павлович Данилов <danilov64@bk.ru>', 'Кому: Павел Павлович Данилов <danilov64@bk.ru>\nДата: Среда, 10 декабря 2014, 0:31 +03:00'),
('Tuule Tormi < tormituule@gmail.com', 'От кого: Tuule Tormi < tormituule@gmail.com >'),
('Ove Üllar < 123ove@gmail.com', 'Ove Üllar < 123ove@gmail.com >'),
('Антон Казарезов antonkazarezov@mail.ru', 'С уважением,\nАнтон Казарезов\nantonkazarezov@mail.ru\n37259087634')
])
def test_single_namemail(mlp, expected_namemail, test_input):
""" test for single namemails """
result = mlp.process(test_input, analyzers=("lemmas", "pos_tags", "transliteration", "ner", "contacts", "entities", "namemail"))
facts = result['texta_facts']
detected_phone = [fact['str_val'] for fact in facts if fact['fact'] == 'NAMEMAIL']
assert detected_phone == [expected_namemail]
for fact in facts:
assert fact["doc_path"] == "text.text"
@pytest.mark.parametrize("expected_namemails, test_input", [
(['Julia Kondraeva < jlkondrat@gmail.com', 'Василий Мав < mavvvasili@bk.ru'], '---------- Пересылаемое сообщение ----------\nОт кого: Julia Kondraeva < jlkondrat@gmail.com >\nДата: 12 ноября 2015 г., 13:54\nТема: мой день рождения (завтра)\nКому: Василий Мав < mavvvasili@bk.ru >'),
])
def test_multiple_namemails(mlp, expected_namemails, test_input):
""" test for multiple potential namemails """
result = mlp.process(test_input)
facts = result['texta_facts']
detected_phones = [fact['str_val'] for fact in facts if fact['fact'] == 'NAMEMAIL']
assert sorted(detected_phones) == sorted(expected_namemails)
for fact in facts:
assert fact["doc_path"] == "text.text"
@pytest.mark.parametrize("test_input", [
'можно по адресу https://groups.google.com/d/msgid/mediaplanlnr/9d899b4-ok53-290i-678p-b56399fa6f98%40googlegroups.com.',
'To make sure you can receive our emails, please add noreply@actuallyreply.com to your [trusted contacts]',
'This email was sent to amanda@mail.me',
'С уважением, Михайл.\n\n\nFrom: 123456789@mail.ru'
])
def test_no_namemails(mlp, test_input):
""" test for no potential namemails"""
result = mlp.process(test_input)
facts = result['texta_facts']
phones = [fact['str_val'] for fact in facts if fact['fact'] == 'NAMEMAIL']
assert len(phones) == 0
for fact in facts:
assert fact["doc_path"] == "text.text"
def test_doc_path_parsing_inside_doc(mlp):
    payload = {
        "docs": [{"text": {"subsection": "To make sure you can receive our emails, please add noreply@actuallyreply.com to your [trusted contacts]"}}],
        "doc_paths": ["text.subsection"]
    }
    result = mlp.process_docs(**payload)[0]
    facts = result["texta_facts"]
    assert len(facts) > 0
    for fact in facts:
        assert fact["doc_path"] == "text.subsection_mlp.text"
......@@ -46,7 +46,6 @@ class Document:
        analysis_lang: str,
        stanza_sentences: [list],
        stanza_entities,
-        concat_resources: dict,
        entity_mapper: Optional[EntityMapper] = None,
        doc_path: str = "text",
        json_doc: dict = None,
......@@ -62,13 +61,13 @@
        self.error = error
        self.json_doc = json_doc
        self.entities_processed = False
        self.entity_mapper = entity_mapper
        self.stanza_sentences = stanza_sentences
        self.stanza_words = [word for sentence in self.stanza_sentences for word in sentence]
        self.stanza_entities = stanza_entities
        self.concat_resources = concat_resources
        self.__words = []
        self.__lemmas = []
        self.__pos_tags = []
......@@ -131,6 +130,37 @@
        self.__texta_facts.append(fact)

+    def fact_spans_to_sent(self):
+        """
+        Updates fact spans to use sentence-based spans.
+        """
+        tokenized_text = self.get_words()
+        # browse through the facts
+        for fact in self.__texta_facts:
+            new_spans = []
+            sent_index = 0
+            for span in fact.spans:
+                span_len = span[1] - span[0]
+                sent_index = tokenized_text[:span[0]].count("\n")
+                # find the last sentence break before the match
+                matches = list(re.finditer(" \n ", tokenized_text[:span[0]]))
+                # check if there are any preceding sentence breaks
+                if matches:
+                    # take the last sentence break
+                    last_match = matches[-1]
+                    # compute the new span inside the given sentence
+                    new_start = span[0] - last_match.span()[1]
+                    new_end = new_start + span_len
+                    new_spans.append([new_start, new_end])
+            # update the spans on the fact object
+            if new_spans:
+                fact.spans = new_spans
+            fact.sent_index = sent_index
    @staticmethod
    def edit_doc(doc: dict, doc_path: str, new_value) -> dict:
        """
......@@ -190,9 +220,10 @@
    def get_lemma(self) -> str:
        sentences = []
        if not self.__lemmas:
            self.lemmas()
        for sent_lemmas in self.__lemmas:
            sentences.append(" ".join([a.strip() for a in sent_lemmas]))
        if "sentences" in self.analyzers:
            return " \n ".join(sentences)
        else:
......@@ -272,6 +303,9 @@
        Extracts currency + sum and sum + currency patterns from text using regexp.
        Saves extractions as facts.
        """
+        if not self.entities_processed:
+            self.entities()
        text = self.get_words()
        currency_facts = [fact for fact in self.__texta_facts if fact.fact_type == "CURRENCY"]
        for fact in currency_facts:
......@@ -289,7 +323,7 @@
                    fact_type="CURRENCY_SUM",
                    fact_value=fact_value,
                    doc_path=self.__get_doc_path("text"),
-                    spans=[match.start(), match.end()]
+                    spans=[[match.start(), match.end()]]
                )
                self.__texta_facts.append(new_fact)
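
The currency regexes themselves sit outside this hunk; purely as an illustration of the sum + currency pattern style the docstring describes (the pattern below is invented for this sketch, not the library's actual expression), using the Estonian test sentence from the currency tests above:

import re

# Hypothetical pattern: an amount followed by a currency word or symbol.
SUM_CURRENCY = re.compile(r"(?P<sum>\d+(?:[.,]\d+)?)\s?(?P<currency>€|eurot?|рубль)", re.IGNORECASE)

text = "Eesti keeles on see 2,5 eurot."
match = SUM_CURRENCY.search(text)
fact_value = f"{match.group('sum')} {match.group('currency')}"  # -> "2,5 eurot"
spans = [[match.start(), match.end()]]  # nested list, matching the fix above
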
......@@ -386,12 +420,8 @@
    def ner(self):
        tokenized_text = self.get_words()
        known_entities = Document.FACT_NAMES_NER
-        not_entities = self.concat_resources["not_entities"]
        for entity in self.stanza_entities:
-            if entity.text.lower() in not_entities:
-                continue
            if entity.type in known_entities:
                # finds the closest spans in tokenized text
                # this is because stanza returns spans from non-tokenized text
......@@ -409,289 +439,13 @@
                # create and append fact
                # ignore facts whose match fails
                if best_matching_span:
+                    text_before_match = tokenized_text[:best_matching_span[0]]
+                    sentence_index = text_before_match.count("\n")
                    new_fact = Fact(
                        fact_type=entity.type,
                        fact_value=entity.text,
                        doc_path=self.__get_doc_path("text"),
-                        spans=[best_matching_span]
+                        spans=[best_matching_span],
+                        sent_index=sentence_index
                    )
                    self.__texta_facts.append(new_fact)
    def namemail(self):
        """
        Find name-email pairs.
        """
        text = self.get_words()
        email_name_pairs = ContactEmailNamePairParser(text).parse()  # bounded -> str "name mail"
        self.__texta_facts.extend(
            (emailpair.to_fact(Document.FACT_NAME_NAMEMAIL, self.__get_doc_path("text")) for emailpair in email_name_pairs))

    def remove_duplicate_facts_by_span(self, facts):
        """If there are email pairs, then:
        [{fact_type: "NAMEMAIL", value: "Aleksander Great aleksandersuur356eKr@mail.ee", spans: (30, 60)}, {fact_type: "PER", value: "Aleksander Great", spans: (30, 40)}] ==>
        [{fact_type: "EMAIL", value: "aleksandersuur356eKr@mail.ee", spans: (40, 60)}, {fact_type: "PER", value: "Aleksander Great", spans: (30, 40)}]
        NAMEMAIL is used because sometimes the ner_tagger only gets the PER or the EMAIL. This makes double sure that we get most of the entities
        and that there are no overlapping entities (as there were before).
        """
        starts = []
        ends = []
        new_facts = []
        facts_values = list()
        for fact in facts:
            fact.fact_value = fact.fact_value.strip("><():;-.,!?")
        for fact in facts:
            facts_values += [fact.fact_value]
            if fact.fact_type == Document.FACT_NAME_NAMEMAIL:
                splitted_value = fact.fact_value.split(" ")
                name_fact_value = " ".join(splitted_value[:2])
                name_fact = Fact(
                    fact_type="PER",
                    fact_value=name_fact_value.strip("><():;-.,!?"),
                    doc_path=self.__get_doc_path("text"),
                    spans=[(fact.spans[0][0], fact.spans