Commit 1236e4c4 authored by Marko Kollo 😄
Browse files

Fixed mistake with NER doc_path not being correct.

parent 60cd61eb
Pipeline #4742 passed with stage
in 21 minutes and 24 seconds
......@@ -180,3 +180,13 @@ def test_processing_documents_with_multiple_doc_paths(mlp: MLP):
assert "texta_facts" in document
assert "first_name_mlp" in document["entity"]
assert "last_name_mlp" in document["entity"]
def test_that_ner_output_contains_correct_doc_path(mlp: MLP):
    """Check that a fact's ``doc_path`` points at the processed ``*_mlp.text`` field.

    A single document with a nested ``comment.text`` field is run through
    ``mlp.process_docs``. The test verifies that the original text is kept,
    that a ``text_mlp`` entry appears next to it, and that the first fact in
    ``texta_facts`` reports ``comment.text_mlp.text`` as its ``doc_path``.

    Args:
        mlp: The MLP instance under test (presumably supplied by a pytest
            fixture defined elsewhere -- confirm in the test setup).
    """
    sentence = "Barack Obama was one of the presidents of the United States of America!"
    documents = [{"comment": {"text": sentence}}]
    result = mlp.process_docs(docs=documents, doc_paths=["comment.text"])[0]

    # The source field must survive and the processed output must sit beside it.
    assert "text_mlp" in result["comment"]
    assert "text" in result["comment"]
    assert "texta_facts" in result

    facts = result["texta_facts"]
    # Guard the index below: an empty list should fail with a readable
    # assertion message rather than a bare IndexError.
    assert facts, "Expected at least one fact to be extracted from the input sentence."

    # Only the first fact is inspected; which entity it belongs to is not
    # guaranteed here, so the name stays neutral.
    first_fact = facts[0]
    assert first_fact["doc_path"] == "comment.text_mlp.text"
......@@ -299,8 +299,7 @@ class Document:
if entity.type in known_entities:
# finds the closest spans in tokenized text
# this is because stanza returns spans from non-tokenized text
pattern = re.compile(re.escape(
entity.text)) # Use re.escape to avoid trouble with special characters existing in the text.
pattern = re.compile(re.escape(entity.text)) # Use re.escape to avoid trouble with special characters existing in the text.
matching_tokenized_spans = [(match.start(), match.end()) for match in pattern.finditer(tokenized_text)]
best_matching_span = None
best_matching_distance = math.inf
......@@ -317,7 +316,7 @@ class Document:
new_fact = Fact(
fact_type=entity.type,
fact_value=entity.text,
doc_path=self.doc_path,
doc_path=f"{self.doc_path}_mlp.text",
spans=[best_matching_span]
)
self.__texta_facts.append(new_fact)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment