Commit 9a58af57 authored by Raul Sirel's avatar Raul Sirel
Browse files

Merge branch 'master' into mlp_batch

parents 80179254 6bcc9962
# TEXTA MLP Python package
http://pypi.texta.ee/texta-mlp/
https://pypi.org/project/texta-mlp/
## Installation
### Requirements
......
......@@ -8,7 +8,7 @@ channels:
dependencies:
- python=3.7
- pip
- pytorch==1.5.*
- pytorch==1.7.*
- lxml
- pip:
- beautifulsoup4>=4.9.*
......
......@@ -21,7 +21,7 @@ def test_address(mlp, expected_value, test_input):
assert expected_value in entities
for fact in facts:
assert fact["doc_path"] == "text.text"
assert fact["doc_path"] == "text_mlp.text"
@pytest.mark.parametrize("expected_output, test_input", [
......@@ -38,7 +38,7 @@ def test_addresses_by_list(mlp, expected_output, test_input):
assert expected_output in entities
for fact in facts:
assert fact["doc_path"] == "text.text"
assert fact["doc_path"] == "text_mlp.text"
@pytest.mark.parametrize("test_input", [
......@@ -53,7 +53,7 @@ def test_no_address(mlp, test_input):
addrs = [fact['str_val'] for fact in facts if fact['fact'] == 'ADDR']
assert len(addrs) == 0
for fact in facts:
assert fact["doc_path"] == "text.text"
assert fact["doc_path"] == "text_mlp.text"
def test_doc_path_parsing_inside_doc(mlp):
......
import pytest
from texta_mlp.document import Document
from texta_mlp.fact import Fact
from texta_mlp.mlp import MLP
# A NAMEMAIL fact (name + e-mail in one span, [30, 75]) together with the
# PER fact it fully contains ([30, 46]); used to verify that
# remove_duplicate_facts_by_span unbinds NAMEMAIL into its parts.
test_facts_aleksander = [
    Fact(
        fact_type="NAMEMAIL",
        fact_value="Aleksander Great aleksandersuur356eKr@mail.ee",
        doc_path="text",
        spans=[[30, 75]]
    ),
    Fact(
        fact_type="PER",
        fact_value="Aleksander Great",
        doc_path="text",
        spans=[[30, 46]]
    )
]

# Facts whose spans lie close together (30..125) and should be grouped into
# one BOUNDED fact, plus a distant PER ("Julius Caecar", span 300+) that
# must be excluded from the group.
test_facts_to_be_bound = [
    Fact(
        fact_type="EMAIL",
        fact_value="aleksandersuur356eKr@mail.ee",
        doc_path="text",
        spans=[[47, 75]]
    ),
    Fact(
        fact_type="PER",
        fact_value="Aleksander Great",
        doc_path="text",
        spans=[[30, 46]]
    ),
    Fact(
        fact_type="PHONE_high_recall",
        fact_value="356356356",
        doc_path="text",
        spans=[[80, 89]]
    ),
    Fact(
        fact_type="EMAIL",
        fact_value="aleksandersuur356eKr@mail.ee",
        doc_path="text",
        spans=[[97, 125]]
    ),
    Fact(
        fact_type="PER",
        fact_value="Julius Caecar",
        doc_path="text",
        spans=[[300, 312]]
    )
]

# Same facts with a second, distant PHONE/EMAIL pair (400..485) appended;
# used to test that a bound whose entities are a subset of another bound is
# concatenated away.
test_facts_to_be_bound2 = test_facts_to_be_bound + [
    Fact(
        fact_type="PHONE_high_recall",
        fact_value="356356356",
        doc_path="text",
        spans=[[400, 409]]
    ),
    Fact(
        fact_type="EMAIL",
        fact_value="aleksandersuur356eKr@mail.ee",
        doc_path="text",
        spans=[[417, 485]]
    )
]

# Russian e-mail text expected to yield three BOUNDED fact groups; the
# expected groups are given below as their str_val string representations.
test_text1 = "От кого: Канцелярия Президента <presidendikantelei@mail.ru>\nКому: Яан Тамм <vanas6ber@bk.ru>\nДата: Четверг, 3 октября 2013, 17:27 +04:00\nТема: контакт\nТелефон президента 45667788, он живет в Москве. Вы также можете отправить ему электронное письмо по адресу\nolenpresident@gmail.com\nС уважением,\nКанцелярия Президента"
test_text1_bounds = ["{'PER': ['Канцелярия Президента'], 'EMAIL': ['presidendikantelei@mail.ru']}",
                     "{'PHONE_high_recall': ['45667788'], 'LOC': ['Москве'], 'EMAIL': ['olenpresident@gmail.com']}",
                     "{'PER': ['Яан Тамм'], 'EMAIL': ['vanas6ber@bk.ru']}"]

# Shared MLP instance for the whole module — model loading is expensive, so
# it is built once at import time and reused by every test below.
mlp_wrapper = MLP(language_codes=["et", "ru", "en"], logging_level="info", use_gpu=False)
def test_created_bounds():
    """Process test_text1 and check the produced BOUNDED facts.

    Every BOUNDED str_val must be one of the expected groups in
    test_text1_bounds, and every emitted fact must carry string-typed
    doc_path / str_val / spans / fact fields.
    """
    result = mlp_wrapper.process(test_text1)
    facts = result["texta_facts"]
    bounded_values = [fact["str_val"] for fact in facts if fact["fact"] == "BOUNDED"]
    for value in bounded_values:
        assert value in test_text1_bounds
    for fact in facts:
        assert isinstance(fact["doc_path"], str) is True
        # Consistency fix: MLP writes its output under the "_mlp"-suffixed
        # key, so fact paths are "text_mlp.text" as asserted by the rest of
        # the test suite (was "text.text").
        assert fact["doc_path"] == "text_mlp.text"
        assert isinstance(fact["str_val"], str) is True
        assert isinstance(fact["spans"], str) is True
        assert isinstance(fact["fact"], str) is True
@pytest.mark.parametrize("expected_non_duplicate_facts, test_input", [
    ([('aleksandersuur356eKr@mail.ee', 'EMAIL', [(47, 75)]), ('Aleksander Great', 'PER', [(30, 46)])], test_facts_aleksander)
])
def test_remove_duplicate_facts_by_span_in_doc(expected_non_duplicate_facts, test_input):
    """Test that NAMEMAIL facts are unbound without leaving span duplicates."""
    doc = Document(
        original_text="",
        dominant_language_code="en",
        analysis_lang="en",
        entity_mapper=None,
        doc_path="text",
        json_doc=None,
        analyzers=[],
        concat_resources=mlp_wrapper.concat_resources
    )
    deduplicated = doc.remove_duplicate_facts_by_span(test_input)
    summary = [(fact.fact_value, fact.fact_type, fact.spans) for fact in deduplicated]
    assert summary == expected_non_duplicate_facts
@pytest.mark.parametrize("expected_close_BOUNDS, test_input", [
    ([{'doc_path': 'text',
       'fact': 'BOUNDED',
       'spans': [[30, 46], [47, 75], [80, 89], [97, 125]],
       'str_val': {'PER': ['Aleksander Great'],
                   'EMAIL': ['aleksandersuur356eKr@mail.ee', 'aleksandersuur356eKr@mail.ee'],
                   'PHONE_high_recall': ['356356356']},
       'str_values': [('Aleksander Great', 'PER'),
                      ('aleksandersuur356eKr@mail.ee', 'EMAIL'),
                      ('356356356', 'PHONE_high_recall'),
                      ('aleksandersuur356eKr@mail.ee', 'EMAIL')]}], test_facts_to_be_bound)
])
def test_bound_close_ones(expected_close_BOUNDS, test_input):
    """Test that facts with nearby spans are grouped into BOUNDED facts."""
    doc = Document(
        original_text="",
        dominant_language_code="en",
        analysis_lang="en",
        entity_mapper=None,
        doc_path="text",
        json_doc=None,
        analyzers=[],
        concat_resources=mlp_wrapper.concat_resources
    )
    bounds = doc.bound_close_ones(test_input)
    for index, bound in enumerate(bounds):
        expected = expected_close_BOUNDS[index]
        assert bound["fact"] == "BOUNDED"
        assert bound["str_val"].keys() == expected["str_val"].keys()
        for entity_type, values in bound["str_val"].items():
            for value in values:
                assert value in expected["str_val"][entity_type]
        for pair in bound["str_values"]:
            assert pair in expected["str_values"]
@pytest.mark.parametrize("expected_bounds_no_overlap, test_input", [
    ([{'doc_path': 'text',
       'fact': 'BOUNDED',
       'spans': [[30, 46], [47, 75], [80, 89], [97, 125]],
       'str_val': {'PER': ['Aleksander Great'],
                   'EMAIL': ['aleksandersuur356eKr@mail.ee'],
                   'PHONE_high_recall': ['356356356']}}], test_facts_to_be_bound)
])
def test_remove_overlaping_in_bounded(expected_bounds_no_overlap, test_input):
    """Test that duplicated entity values inside a BOUNDED fact are removed."""
    doc = Document(
        original_text="",
        dominant_language_code="en",
        analysis_lang="en",
        entity_mapper=None,
        doc_path="text",
        json_doc=None,
        analyzers=[],
        concat_resources=mlp_wrapper.concat_resources
    )
    bounds = doc.remove_overlaping_in_bounded(doc.bound_close_ones(test_input))
    for index, bound in enumerate(bounds):
        expected = expected_bounds_no_overlap[index]
        assert bound["fact"] == "BOUNDED"
        assert bound["str_val"].keys() == expected["str_val"].keys()
        for entity_type, values in bound["str_val"].items():
            for value in values:
                assert value in expected["str_val"][entity_type]
@pytest.mark.parametrize("expected_bounds_no_subsets, test_input", [
    ([("{'PER': ['Aleksander Great'], 'EMAIL': ['aleksandersuur356eKr@mail.ee'], 'PHONE_high_recall': ['356356356']}", 'BOUNDED', [[30, 46], [47, 75], [80, 89], [97, 125], [400, 409], [417, 485]])], test_facts_to_be_bound2)
])
def test_concatenate_subset_bounds(expected_bounds_no_subsets, test_input):
    """Test that a bound whose entity set is a subset of another is merged into it."""
    doc = Document(
        original_text="",
        dominant_language_code="en",
        analysis_lang="en",
        entity_mapper=None,
        doc_path="text",
        json_doc=None,
        analyzers=[],
        concat_resources=mlp_wrapper.concat_resources
    )
    bounds = doc.bound_close_ones(test_input)
    bounds = doc.remove_overlaping_in_bounded(bounds)
    merged = doc.concatenate_subset_bounds(bounds)
    summary = [(fact.fact_value, fact.fact_type, fact.spans) for fact in merged]
    assert summary == expected_bounds_no_subsets
@pytest.mark.parametrize("key_value_single_pairs, test_input", [
    ([('a', 1), ('a', 2), ('b', 3), ('b', 4)], {'a': [1, 2], 'b': [3, 4]})
])
def test_key_value_single_pairs(key_value_single_pairs, test_input):
    """Test flattening of a dict of lists into single (key, value) pairs.

    Renamed from ``test_concatenate_subset_bounds``: the original name
    collided with the test defined above, so pytest collected only one of
    the two and this one was silently never run.
    """
    doc = Document(
        original_text="",
        dominant_language_code="en",
        analysis_lang="en",
        entity_mapper=None,
        doc_path="text",
        json_doc=None,
        analyzers=[],
        concat_resources=mlp_wrapper.concat_resources
    )
    result = doc.key_value_single_pairs(test_input)
    assert result == key_value_single_pairs
@pytest.mark.parametrize("ok_spaces", [
    ("Kalevipoeg kirjutas Sulevipojale", (0, 9), (20, 31))
])
def test_space_between_ok(mlp, ok_spaces):
    """Test that plain text between two entity spans allows binding them."""
    text, first_span, second_span = ok_spaces
    doc = Document(
        original_text="",
        dominant_language_code="en",
        analysis_lang="en",
        entity_mapper=None,
        doc_path="text",
        json_doc=None,
        analyzers=[],
        concat_resources=mlp_wrapper.concat_resources
    )
    assert doc.space_between_ok(text, first_span, second_span) is True
@pytest.mark.parametrize("not_ok_spaces", [
    ("Vasia Ivanov <am2311@hotmail.com> написал(а): > >> Павел, >>", (0, 12), (52, 57)),
    ("gmail.com\nSaadetud: 21.04.2015, 12:13:14\nSaaja: Magnus", (0, 8), (41, 46)),
    ("От кого: Dmitry Chubasskoel <dmitry.chubasdknoi@gmail.com>\nКому: 89178882978@mail.ru", (28, 56), (64, 81)),
    ("salapärane@yandex.ru>:\nПавел", (0, 18), (24, 27)),
    ("Павел\n\nПятница, 11 ноября 2016 г., 15:31 +0300 от Александр", (0, 5), (53, 62)),
    ("entity С уважением, Александр", (0, 5), (20, 28))
])
def test_space_between_not_ok(mlp, not_ok_spaces):
    """Test that separator-like text between two spans disqualifies binding."""
    text, first_span, second_span = not_ok_spaces
    doc = Document(
        original_text="",
        dominant_language_code="en",
        analysis_lang="en",
        entity_mapper=None,
        doc_path="text",
        json_doc=None,
        analyzers=[],
        concat_resources=mlp_wrapper.concat_resources
    )
    result = doc.space_between_ok(text, first_span, second_span)
    # `is False` pins the boolean return; the previous `== False` would
    # also have accepted 0 or other falsy non-bool values.
    assert result is False
@pytest.mark.parametrize("similar_cleaned_str_val, test_input", [
    ({'PER': ['Pipi Pikk Sukk']}, {'PER': ['Pipi', 'Pipi Pikk Sukk>', 'Pipi Pikk Sukk']}),
    ({'PER': ['Pipi Pikk Sukk', 'Pipi Pikk Jalg']}, {'PER': ['Pipi', 'Pipi Pikk Sukk', 'Pipi Pikk Jalg']}),
    ({'PER': ['Tom Marvolo Riddle', 'Tom Marvolo']}, {'PER': ['Tom', 'Tom Marvolo', 'Tom Marvolo Riddle']}),
    ({'ORG': ['Eesti Vabariik', 'Eesti Valitsus', 'Valitsus']}, {'ORG': ['Eesti Vabariik', 'Eesti Valitsus', 'Valitsus']}),
    ({'PER': ['Pipi Pikk Sukk'], 'ORG': ['Народного Совета по промышленности и торговле']}, {'PER': ['Pipi Sukk', 'Pipi Pikk Sukk'], 'ORG': ['Народного Совета по промышленности и торговле', 'Народного Совета']})
])
def test_clean_similar_in_strval(similar_cleaned_str_val, test_input):
    """Test that near-duplicate entity strings inside str_val are collapsed."""
    doc = Document(
        original_text="",
        dominant_language_code="en",
        analysis_lang="en",
        entity_mapper=None,
        doc_path="text",
        json_doc=None,
        analyzers=[],
        concat_resources=mlp_wrapper.concat_resources
    )
    cleaned = doc.clean_similar_in_strval(dict(test_input))
    assert cleaned.keys() == similar_cleaned_str_val.keys()
    for entity_type, values in cleaned.items():
        for value in values:
            assert value in similar_cleaned_str_val[entity_type]
......@@ -10,22 +10,22 @@ import pytest
('30 рубль', 'Рубль — название современных валют России (российский рубль), Белоруссии (белорусский рубль). 30 рубль.')
])
def test_currency_parse(mlp, expected_value, test_input):
result = mlp.process(test_input)
result = mlp.process(test_input, analyzers=["currency_sum", "entities"])
facts = result['texta_facts']
detected_facts = [fact['str_val'] for fact in facts if fact['fact'] == 'CURRENCY_SUM']
assert expected_value in detected_facts
for detected_fact in detected_facts:
assert detected_fact == expected_value
for fact in facts:
assert fact["doc_path"] == "text.text"
assert fact["doc_path"] == "text_mlp.text"
def test_doc_path_parsing_inside_doc(mlp):
payload = {
"docs": [{"text": {"subsection": "Eesti keeles on see 2,5 eurot."}}],
"doc_paths": ["text.subsection"]
"doc_paths": ["text.subsection"],
}
results = mlp.process_docs(**payload)
results = mlp.process_docs(**payload, analyzers=["currency_sum","entities"])
facts = results[0]["texta_facts"]
assert len(facts) > 1
for fact in facts:
......
......@@ -16,7 +16,7 @@ def test_emails(mlp, expected_email, test_input):
detected_emails = [fact['str_val'] for fact in result['texta_facts'] if fact['fact'] == 'EMAIL']
for fact in result["texta_facts"]:
assert isinstance(fact["doc_path"], str) is True
assert fact["doc_path"] == "text.text"
assert fact["doc_path"] == "text_mlp.text"
assert isinstance(fact["str_val"], str) is True
assert isinstance(fact["spans"], str) is True
assert isinstance(fact["fact"], str) is True
......@@ -39,7 +39,7 @@ def test_no_email(mlp, test_input):
emails = [fact['str_val'] for fact in facts if fact['fact'] == 'EMAIL']
assert len(emails) == 0
for fact in facts:
assert fact["doc_path"] == "text.text"
assert fact["doc_path"] == "text_mlp.text"
def test_doc_path_parsing_inside_doc(mlp):
......
......@@ -37,12 +37,12 @@ test_texts = [
def test_mlp_process(mlp: MLP):
for test_text in test_texts:
# process text
result = mlp.process(test_text)
result = mlp.process(test_text, spans="sentence")
print("\nMLP process output:", result)
# test result form
assert "text" in result
mlp_text = result["text"]
assert "text_mlp" in result
mlp_text = result["text_mlp"]
assert "lemmas" in mlp_text
assert isinstance(mlp_text["lemmas"], str) is True
......@@ -65,12 +65,13 @@ def test_mlp_process(mlp: MLP):
fact_value = fact["str_val"]
fact_spans = fact["spans"]
fact_name = fact["fact"]
sent_index = fact["sent_index"]
assert isinstance(fact_path, str) is True
assert isinstance(fact_value, str) is True
assert isinstance(fact_spans, str) is True
assert isinstance(fact_name, str) is True
assert fact_path == "text.text" or fact_path == "text.lemmas"
assert fact_path == "text_mlp.text" or fact_path == "text_mlp.lemmas"
if fact_name != "BOUNDED":
span_list = json.loads(fact_spans)
......@@ -84,11 +85,12 @@ def test_mlp_process(mlp: MLP):
if fact_name == "EMAIL":
spanned_text = spanned_text.replace(' ', '')
str_val = fact_value.lower()
# Skip checks for phones with area codes.
# TODO Rewrite this test taking the problem with parenthesis into consideration.
if not ("(" in text and ")" in text):
# print(text, spanned_text, fact)
if sent_index >= 1:
sent = text.split(" \n ")[sent_index]
spanned_text = sent[span[0]:span[1]].lower()
assert spanned_text == str_val
......@@ -100,7 +102,7 @@ def test_companies_by_list(mlp: MLP, expected_output, test_input):
result = mlp.process(test_input)
entities = []
for fact in result["texta_facts"]:
assert fact["doc_path"] == "text.text"
assert fact["doc_path"] == "text_mlp.text"
if fact["fact"] == "COMPANY":
entities.append(fact["str_val"])
if expected_output is None:
......@@ -133,11 +135,12 @@ def test_mlp_lemmatize(mlp: MLP, expected_lemmas, test_input):
def test_existing_facts_not_being_overwritten(mlp: MLP):
payload = {
"texts": ["Edgar Savisaar elas Nõmme tänav 24"],
"texta_facts": [{"fact": "PER", "lemma": None, "str_val": "Edgar Savisaar", "spans": "[[0, 14]]", "doc_path": "texts_mlp.text"}]
"texta_facts": [{"fact": "PER", "lemma": None, "str_val": "Edgar Savisaar", "sent_index": 0, "spans": "[[0, 14]]", "doc_path": "texts_mlp.text"}]
}
result = mlp.process_docs([payload], doc_paths=["texts"])
result = mlp.process_docs([payload], doc_paths=["texts"], analyzers=["entities"])
original_facts = result[0]["texta_facts"]
assert len(original_facts) == 3 # Two normal facts + 1 BOUNDED.
assert len(original_facts) == 2 # Two normal facts + 1 BOUNDED.
facts = [fact for fact in original_facts if fact["fact"] != "BOUNDED"]
assert len(facts) == 2 # One BOUNDED added
......@@ -149,14 +152,13 @@ def test_existing_facts_not_being_overwritten(mlp: MLP):
def test_removal_of_duplicate_facts(mlp: MLP):
payload = {
"texts": ["Edgar Savisaar elas Nõmme tänav 24"],
"texta_facts": [{'doc_path': 'texts_mlp.text', 'lemma': None, 'fact': 'ADDR', 'spans': '[[20, 34]]', 'str_val': 'nõmme tänav 24'}]
"texta_facts": [{'doc_path': 'texts_mlp.text', 'lemma': None, 'fact': 'ADDR', 'spans': '[[20, 34]]', 'str_val': 'nõmme tänav 24', "sent_index": 0}]
}
result = mlp.process_docs([payload], doc_paths=["texts"])
facts = result[0]["texta_facts"]
fact = facts[0]
assert len(facts) == 1
assert fact["str_val"] == "nõmme tänav 24"
assert len(facts) == 3
assert fact["doc_path"] == "texts_mlp.text"
......@@ -207,7 +209,6 @@ def test_processing_documents_with_multiple_doc_paths(mlp: MLP):
def test_that_ner_output_contains_correct_doc_path(mlp: MLP):
result = mlp.process_docs(docs=[{"comment": {"text": "Barack Obama was one of the presidents of the United States of America!"}}], doc_paths=["comment.text"])[0]
assert "text_mlp" in result["comment"]
assert "text" in result["comment"]
assert "texta_facts" in result
facts = result.get("texta_facts")
usa_fact = facts[0]
......@@ -216,14 +217,15 @@ def test_that_ner_output_contains_correct_doc_path(mlp: MLP):
def test_sentences_separated_with_newline(mlp: MLP):
result = mlp.process("Siin on üks lause. See on teine lause.")
mlp_result = result["text"]
assert mlp_result["text"] == 'Siin on üks lause .\nSee on teine lause .'
assert mlp_result["lemmas"] == 'siin olema üks lause .\nsee olema teine lause .'
mlp_result = result["text_mlp"]
assert mlp_result["text"] == 'Siin on üks lause . \n See on teine lause .'
assert mlp_result["lemmas"] == 'siin olema üks lause . \n see olema teine lause .'
assert len(mlp_result["pos_tags"].split(" ")) == len(mlp_result["lemmas"].split(" "))
def test_sentences_not_separated_with_newline(mlp: MLP):
result = mlp.process("Siin on üks lause. See on teine lause.", analyzers=["lemmas", "pos_tags"])
mlp_result = result["text"]
mlp_result = result["text_mlp"]
assert mlp_result["text"] == 'Siin on üks lause . See on teine lause .'
assert mlp_result["lemmas"] == 'siin olema üks lause . see olema teine lause .'
......
import pytest
@pytest.mark.parametrize("expected_namemail, test_input", [
    ('Керсти Кальюлайд kaljulaidkersti@yandex.ru', 'Отправлено с iPhone06.08.2015, в 20:05, Керсти Кальюлайд kaljulaidkersti@yandex.ru написал(а): '),
    ('Керсти Кальюлайд < kersti1298@mail.ru', 'Воскресенье, 17 января 2016, 1:10 +03:00 от Керсти Кальюлайд < kersti1298@mail.ru >:'),
    # ('Павлович Данилов <danilov64@bk.ru>', 'Кому: Павел Павлович Данилов <danilov64@bk.ru>\nДата: Среда, 10 декабря 2014, 0:31 +03:00'),
    ('Tuule Tormi < tormituule@gmail.com', 'От кого: Tuule Tormi < tormituule@gmail.com >'),
    ('Ove Üllar < 123ove@gmail.com', 'Ove Üllar < 123ove@gmail.com >'),
    ('Антон Казарезов antonkazarezov@mail.ru', 'С уважением,\nАнтон Казарезов\nantonkazarezov@mail.ru\n37259087634')
])
def test_single_namemail(mlp, expected_namemail, test_input):
    """Test that exactly one NAMEMAIL fact is detected per input text."""
    result = mlp.process(test_input, analyzers=("lemmas", "pos_tags", "transliteration", "ner", "contacts", "entities", "namemail"))
    facts = result['texta_facts']
    detected_namemails = [fact['str_val'] for fact in facts if fact['fact'] == 'NAMEMAIL']
    assert detected_namemails == [expected_namemail]
    for fact in facts:
        # Consistency fix: MLP writes results under the "_mlp"-suffixed key,
        # so fact paths are "text_mlp.text" as in the rest of the test suite
        # (this module was missed when the suite was updated).
        assert fact["doc_path"] == "text_mlp.text"
@pytest.mark.parametrize("expected_namemails, test_input", [
    (['Julia Kondraeva < jlkondrat@gmail.com', 'Василий Мав < mavvvasili@bk.ru'], '---------- Пересылаемое сообщение ----------\nОт кого: Julia Kondraeva < jlkondrat@gmail.com >\nДата: 12 ноября 2015 г., 13:54\nТема: мой день рождения (завтра)\nКому: Василий Мав < mavvvasili@bk.ru >'),
])
def test_multiple_namemails(mlp, expected_namemails, test_input):
    """Test that several NAMEMAIL facts are detected in one text."""
    result = mlp.process(test_input)
    facts = result['texta_facts']
    detected_namemails = [fact['str_val'] for fact in facts if fact['fact'] == 'NAMEMAIL']
    assert sorted(detected_namemails) == sorted(expected_namemails)
    for fact in facts:
        # Consistency fix: MLP output lives under the "_mlp"-suffixed key,
        # so the expected doc_path is "text_mlp.text" (was "text.text").
        assert fact["doc_path"] == "text_mlp.text"
@pytest.mark.parametrize("test_input", [
    'можно по адресу https://groups.google.com/d/msgid/mediaplanlnr/9d899b4-ok53-290i-678p-b56399fa6f98%40googlegroups.com.',
    'To make sure you can receive our emails, please add noreply@actuallyreply.com to your [trusted contacts]',
    'This email was sent to amanda@mail.me',
    'С уважением, Михайл.\n\n\nFrom: 123456789@mail.ru'
])
def test_no_namemails(mlp, test_input):
    """Test that texts with bare e-mails/URLs produce no NAMEMAIL facts."""
    result = mlp.process(test_input)
    facts = result['texta_facts']
    namemails = [fact['str_val'] for fact in facts if fact['fact'] == 'NAMEMAIL']
    assert len(namemails) == 0
    for fact in facts:
        # Consistency fix: MLP output lives under the "_mlp"-suffixed key,
        # so the expected doc_path is "text_mlp.text" (was "text.text").
        assert fact["doc_path"] == "text_mlp.text"
def test_doc_path_parsing_inside_doc(mlp):
    """Test that facts from a nested doc_path carry the "_mlp"-suffixed path."""
    docs = [{"text": {"subsection": "To make sure you can receive our emails, please add noreply@actuallyreply.com to your [trusted contacts]"}}]
    result = mlp.process_docs(docs=docs, doc_paths=["text.subsection"])[0]
    facts = result["texta_facts"]
    assert len(facts) > 0
    for fact in facts:
        assert fact["doc_path"] == "text.subsection_mlp.text"
......@@ -21,7 +21,7 @@ def test_single_phone(mlp, expected_phone, test_input):
detected_phone = [fact['str_val'] for fact in facts if fact['fact'] == 'PHONE_strict']
assert detected_phone == [expected_phone]
for fact in facts:
assert fact["doc_path"] == "text.text"
assert fact["doc_path"] == "text_mlp.text"
@pytest.mark.parametrize("expected_phones, test_input", [
......@@ -36,7 +36,7 @@ def test_multiple_phones(mlp, expected_phones, test_input):
detected_phones = [fact['str_val'] for fact in facts if fact['fact'] == 'PHONE_strict']
assert sorted(detected_phones) == sorted(expected_phones)
for fact in facts:
assert fact["doc_path"] == "text.text"
assert fact["doc_path"] == "text_mlp.text"
@pytest.mark.parametrize("test_input", [
......@@ -63,7 +63,7 @@ def test_no_phones(mlp, test_input):
phones = [fact['str_val'] for fact in facts if fact['fact'] == 'PHONE_strict']
assert len(phones) == 0
for fact in facts:
assert fact["doc_path"] == "text.text"
assert fact["doc_path"] == "text_mlp.text"