Commit c99d1ef8 authored by Raul Sirel

revise mlp process output

parent 549f45ee
Pipeline #6688 passed with stages in 33 minutes and 16 seconds
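This commit moves the MLP enrichment out of the `text` key and into `text_mlp`, so processing no longer shadows an input document's own `text` field; fact `doc_path` values follow suit, from `text.text` to `text_mlp.text`. A minimal sketch of the new output shape, assuming an initialized `MLP` instance named `mlp` (the exact analyzer fields depend on configuration):

```python
# Sketch only: illustrates the renamed output key, not the full result.
result = mlp.process("Barack Obama was a president.")

assert "text_mlp" in result    # previously the container was keyed "text"
mlp_text = result["text_mlp"]  # holds "text", "lemmas", "pos_tags", etc.
for fact in result["texta_facts"]:
    # facts now point at the renamed container
    assert fact["doc_path"] in ("text_mlp.text", "text_mlp.lemmas")
```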
@@ -21,7 +21,7 @@ def test_address(mlp, expected_value, test_input):
     assert expected_value in entities
     for fact in facts:
-        assert fact["doc_path"] == "text.text"
+        assert fact["doc_path"] == "text_mlp.text"


 @pytest.mark.parametrize("expected_output, test_input", [
@@ -38,7 +38,7 @@ def test_addresses_by_list(mlp, expected_output, test_input):
     assert expected_output in entities
     for fact in facts:
-        assert fact["doc_path"] == "text.text"
+        assert fact["doc_path"] == "text_mlp.text"


 @pytest.mark.parametrize("test_input", [
@@ -53,7 +53,7 @@ def test_no_address(mlp, test_input):
     addrs = [fact['str_val'] for fact in facts if fact['fact'] == 'ADDR']
     assert len(addrs) == 0
     for fact in facts:
-        assert fact["doc_path"] == "text.text"
+        assert fact["doc_path"] == "text_mlp.text"


 def test_doc_path_parsing_inside_doc(mlp):
......
@@ -17,7 +17,7 @@ def test_currency_parse(mlp, expected_value, test_input):
     for detected_fact in detected_facts:
         assert detected_fact == expected_value
     for fact in facts:
-        assert fact["doc_path"] == "text.text"
+        assert fact["doc_path"] == "text_mlp.text"


 def test_doc_path_parsing_inside_doc(mlp):
......
@@ -16,7 +16,7 @@ def test_emails(mlp, expected_email, test_input):
     detected_emails = [fact['str_val'] for fact in result['texta_facts'] if fact['fact'] == 'EMAIL']
     for fact in result["texta_facts"]:
         assert isinstance(fact["doc_path"], str) is True
-        assert fact["doc_path"] == "text.text"
+        assert fact["doc_path"] == "text_mlp.text"
         assert isinstance(fact["str_val"], str) is True
         assert isinstance(fact["spans"], str) is True
         assert isinstance(fact["fact"], str) is True
@@ -39,7 +39,7 @@ def test_no_email(mlp, test_input):
     emails = [fact['str_val'] for fact in facts if fact['fact'] == 'EMAIL']
     assert len(emails) == 0
     for fact in facts:
-        assert fact["doc_path"] == "text.text"
+        assert fact["doc_path"] == "text_mlp.text"


 def test_doc_path_parsing_inside_doc(mlp):
......
@@ -41,8 +41,8 @@ def test_mlp_process(mlp: MLP):
     print("\nMLP process output:", result)
     # test result form
-    assert "text" in result
-    mlp_text = result["text"]
+    assert "text_mlp" in result
+    mlp_text = result["text_mlp"]
     assert "lemmas" in mlp_text
     assert isinstance(mlp_text["lemmas"], str) is True
@@ -71,7 +71,7 @@ def test_mlp_process(mlp: MLP):
         assert isinstance(fact_value, str) is True
         assert isinstance(fact_spans, str) is True
         assert isinstance(fact_name, str) is True
-        assert fact_path == "text.text" or fact_path == "text.lemmas"
+        assert fact_path == "text_mlp.text" or fact_path == "text_mlp.lemmas"
         if fact_name != "BOUNDED":
             span_list = json.loads(fact_spans)
@@ -102,7 +102,7 @@ def test_companies_by_list(mlp: MLP, expected_output, test_input):
     result = mlp.process(test_input)
     entities = []
     for fact in result["texta_facts"]:
-        assert fact["doc_path"] == "text.text"
+        assert fact["doc_path"] == "text_mlp.text"
         if fact["fact"] == "COMPANY":
             entities.append(fact["str_val"])
     if expected_output is None:
@@ -209,7 +209,6 @@ def test_processing_documents_with_multiple_doc_paths(mlp: MLP):
 def test_that_ner_output_contains_correct_doc_path(mlp: MLP):
     result = mlp.process_docs(docs=[{"comment": {"text": "Barack Obama was one of the presidents of the United States of America!"}}], doc_paths=["comment.text"])[0]
     assert "text_mlp" in result["comment"]
-    assert "text" in result["comment"]
     assert "texta_facts" in result
     facts = result.get("texta_facts")
     usa_fact = facts[0]
@@ -218,7 +217,7 @@ def test_that_ner_output_contains_correct_doc_path(mlp: MLP):
 def test_sentences_separated_with_newline(mlp: MLP):
     result = mlp.process("Siin on üks lause. See on teine lause.")
-    mlp_result = result["text"]
+    mlp_result = result["text_mlp"]
     assert mlp_result["text"] == 'Siin on üks lause . \n See on teine lause .'
     assert mlp_result["lemmas"] == 'siin olema üks lause . \n see olema teine lause .'
     assert len(mlp_result["pos_tags"].split(" ")) == len(mlp_result["lemmas"].split(" "))
@@ -226,7 +225,7 @@ def test_sentences_separated_with_newline(mlp: MLP):
 def test_sentences_not_separated_with_newline(mlp: MLP):
     result = mlp.process("Siin on üks lause. See on teine lause.", analyzers=["lemmas", "pos_tags"])
-    mlp_result = result["text"]
+    mlp_result = result["text_mlp"]
     assert mlp_result["text"] == 'Siin on üks lause . See on teine lause .'
     assert mlp_result["lemmas"] == 'siin olema üks lause . see olema teine lause .'
......
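When explicit `doc_paths` are given, the enrichment lands beside the source field under a sibling `*_mlp` key, as `test_that_ner_output_contains_correct_doc_path` above exercises. A usage sketch, under the same assumption of a ready `mlp` instance:

```python
# Sketch only: mirrors the assertions visible in the test above.
docs = [{"comment": {"text": "Barack Obama was one of the presidents of the United States of America!"}}]
enriched = mlp.process_docs(docs=docs, doc_paths=["comment.text"])[0]

assert "text_mlp" in enriched["comment"]  # enrichment sits next to the original field
assert "texta_facts" in enriched          # extracted facts are collected at the top level
```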
@@ -21,7 +21,7 @@ def test_single_phone(mlp, expected_phone, test_input):
     detected_phone = [fact['str_val'] for fact in facts if fact['fact'] == 'PHONE_strict']
     assert detected_phone == [expected_phone]
     for fact in facts:
-        assert fact["doc_path"] == "text.text"
+        assert fact["doc_path"] == "text_mlp.text"


 @pytest.mark.parametrize("expected_phones, test_input", [
@@ -36,7 +36,7 @@ def test_multiple_phones(mlp, expected_phones, test_input):
     detected_phones = [fact['str_val'] for fact in facts if fact['fact'] == 'PHONE_strict']
     assert sorted(detected_phones) == sorted(expected_phones)
     for fact in facts:
-        assert fact["doc_path"] == "text.text"
+        assert fact["doc_path"] == "text_mlp.text"


 @pytest.mark.parametrize("test_input", [
@@ -63,7 +63,7 @@ def test_no_phones(mlp, test_input):
     phones = [fact['str_val'] for fact in facts if fact['fact'] == 'PHONE_strict']
     assert len(phones) == 0
     for fact in facts:
-        assert fact["doc_path"] == "text.text"
+        assert fact["doc_path"] == "text_mlp.text"


 def test_doc_path_parsing_inside_doc(mlp):
......
@@ -47,7 +47,7 @@ class Document:
             stanza_sentences: [list],
             stanza_entities,
             entity_mapper: Optional[EntityMapper] = None,
-            doc_path: str = "text",
+            doc_path: str = "text_mlp",
             json_doc: dict = None,
             analyzers: list = [],
             error: str = "",
@@ -154,7 +154,6 @@ class Document:
                 new_end = new_start + span_len
                 new_span = [new_start, new_end]
                 new_spans.append(new_span)
-                print(new_spans)
             # update spans in object
             if new_spans:
                 fact.spans = new_spans
@@ -184,7 +183,7 @@ class Document:
         path_to_mlp = list_of_path_keys[:-1] + [root_key] if len(list_of_path_keys) > 1 else [root_key]
         mlp_result = self.to_json(use_default_doc_path)
         nested_dict_wrapper = PelicanJson(self.json_doc)
-        nested_dict_wrapper.set_nested_value(path_to_mlp, mlp_result["text"], force=True)
+        nested_dict_wrapper.set_nested_value(path_to_mlp, mlp_result["text_mlp"], force=True)
         nested_dict_wrapper.set_nested_value(["texta_facts"], mlp_result["texta_facts"], force=True)
         return nested_dict_wrapper.convert()
@@ -209,8 +208,8 @@ class Document:
         container["transliteration"] = self.get_transliteration()
         if use_default_doc_path:
             for fact in texta_facts["texta_facts"]:
-                fact["doc_path"] = "text.text"
-        return {"text": container, **texta_facts}
+                fact["doc_path"] = "text_mlp.text"
+        return {"text_mlp": container, **texta_facts}

     def lemmas(self):
......
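In `Document` itself, the default `doc_path` and the `to_json` container key change in lockstep: with `use_default_doc_path=True`, every fact's `doc_path` is rewritten to `text_mlp.text` and the analyzer container is returned under `text_mlp`. A hypothetical illustration of the resulting shape (field contents are placeholders, and which analyzer fields appear depends on the analyzers that ran):

```python
# Illustrative only, not literal output.
expected_shape = {
    "text_mlp": {
        "text": "...",             # tokenized source text
        "lemmas": "...",           # space-separated lemmas
        "transliteration": "...",  # present when the transliteration analyzer ran
    },
    "texta_facts": [
        {"fact": "EMAIL", "str_val": "...", "spans": "...", "doc_path": "text_mlp.text"},
    ],
}
```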
@@ -377,7 +377,7 @@ class MLP:
         :return: Lemmatized string.
         """
         document = self.process(raw_text, analyzers=["lemmas"], lang=lang)
-        return document["text"]["lemmas"]
+        return document["text_mlp"]["lemmas"]

     def parse_doc_texts(self, doc_path: str, document: dict) -> list:
......
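`MLP.lemmatize` reads from the renamed key internally, so its own return value is unchanged: a plain lemmatized string. A usage sketch, assuming language detection handles the Estonian input and with the expected value taken from the sentence tests above:

```python
# lemmatize() still returns a string; only the internal lookup key changed.
lemmas = mlp.lemmatize("Siin on üks lause.")
assert lemmas == "siin olema üks lause ."
```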