Commit a5343de7 authored by Raul Sirel
Browse files

Merge branch 'doc_path_fixes' into 'master'

Doc path fixes

See merge request !6
parents 1236e4c4 0b26d62d
Pipeline #4789 passed with stages
in 28 minutes and 41 seconds
......@@ -15,12 +15,16 @@ import pytest
])
def test_address(mlp, expected_value, test_input):
    """ADDR facts are extracted and every produced fact carries the default doc_path."""
    result = mlp.process(test_input)
    facts = result['texta_facts']
    # Only ADDR facts matter for the value check; other fact types may coexist.
    entities = [fact['str_val'] for fact in facts if fact['fact'] == 'ADDR']
    if expected_value is None:
        assert not entities
    else:
        assert expected_value in entities
    for fact in facts:
        assert fact["doc_path"] == "text.text"
@pytest.mark.parametrize("expected_output, test_input", [
('vana-lõuna 39', 'TEXTA OÜ asub aadressil Vana-Lõuna 39.')
......@@ -28,23 +32,39 @@ def test_address(mlp, expected_value, test_input):
def test_addresses_by_list(mlp, expected_output, test_input):
    """ testing addresses from a list """
    result = mlp.process(test_input)
    facts = result['texta_facts']
    # Only ADDR facts matter for the value check; other fact types may coexist.
    entities = [fact['str_val'] for fact in facts if fact['fact'] == 'ADDR']
    if expected_output is None:
        assert not entities
    else:
        assert expected_output in entities
    for fact in facts:
        assert fact["doc_path"] == "text.text"
@pytest.mark.parametrize("test_input", [
    'kana@kukeleegu.com',
    'Huntington London, 64',
    "Welcome to the Zoom Family\nWelcome to the Zoom Family!\nLet's Get Started!\nRead our\nGetting Started\nguide to learn about hosting meetings\nMeet\nwith a product specialist for a demo of key features\nQuestions? Submit a\nSupport Request\nat any time\nStay On Top\nAttend\nupcoming webinars to learn about new features, how-to's, add-ons, and\nmore\nKeep up with us on our\nBlog\n!\nCopyright ©️2016 Zoom Video Communications, Inc. All rights reserved.\nOur mailing address is:\n55 Almaden Boulevard, 6th Floor, San Jose, CA 95113\n+1.888.799.9666\nIf you no longer wish to receive these emails you may unsubscribe at any time."
])
def test_no_address(mlp, test_input):
    """ testing for no potential addresses """
    result = mlp.process(test_input)
    facts = result['texta_facts']
    # No ADDR facts expected; any other facts still keep the default doc_path.
    addrs = [fact['str_val'] for fact in facts if fact['fact'] == 'ADDR']
    assert len(addrs) == 0
    for fact in facts:
        assert fact["doc_path"] == "text.text"
def test_doc_path_parsing_inside_doc(mlp):
    """Facts found inside a nested doc field get the '<path>_mlp.text' doc_path."""
    docs = [{"text": {"subsection": "Huntington London, 64"}}]
    doc_paths = ["text.subsection"]
    first_result = mlp.process_docs(docs=docs, doc_paths=doc_paths)[0]
    extracted_facts = first_result["texta_facts"]
    assert len(extracted_facts) > 0
    assert all(fact["doc_path"] == "text.subsection_mlp.text" for fact in extracted_facts)
......@@ -85,10 +85,11 @@ def test_created_bounds():
assert bound1 in test_text1_bounds
for fact in result["texta_facts"]:
assert isinstance(fact["doc_path"], str) == True
assert isinstance(fact["str_val"], str) == True
assert isinstance(fact["spans"], str) == True
assert isinstance(fact["fact"], str) == True
assert isinstance(fact["doc_path"], str) is True
assert fact["doc_path"] == "text.text"
assert isinstance(fact["str_val"], str) is True
assert isinstance(fact["spans"], str) is True
assert isinstance(fact["fact"], str) is True
@pytest.mark.parametrize("expected_non_duplicate_facts, test_input", [
......@@ -251,7 +252,7 @@ def test_space_between_ok(mlp, ok_spaces):
concat_resources=mlp_wrapper.concat_resources
)
result = doc.space_between_ok(ok_spaces[0], ok_spaces[1], ok_spaces[2])
assert result == True
assert result is True
@pytest.mark.parametrize("not_ok_spaces", [
......
......@@ -11,7 +11,22 @@ import pytest
])
def test_currency_parse(mlp, expected_value, test_input):
    """CURRENCY_SUM facts are detected, normalized to one value, with default doc_path."""
    result = mlp.process(test_input)
    facts = result['texta_facts']
    detected_facts = [fact['str_val'] for fact in facts if fact['fact'] == 'CURRENCY_SUM']
    assert expected_value in detected_facts
    # Every detected currency fact must normalize to the same expected value.
    for detected_fact in detected_facts:
        assert detected_fact == expected_value
    for fact in facts:
        assert fact["doc_path"] == "text.text"
def test_doc_path_parsing_inside_doc(mlp):
    """Currency facts inside a nested doc field use the '<path>_mlp.<field>' doc_path."""
    docs = [{"text": {"subsection": "Eesti keeles on see 2,5 eurot."}}]
    doc_paths = ["text.subsection"]
    facts = mlp.process_docs(docs=docs, doc_paths=doc_paths)[0]["texta_facts"]
    assert len(facts) > 1
    allowed_paths = {"text.subsection_mlp.text", "text.subsection_mlp.lemmas"}
    for fact in facts:
        assert fact["doc_path"] in allowed_paths
......@@ -14,10 +14,11 @@ def test_emails(mlp, expected_email, test_input):
result = mlp.process(test_input)
detected_emails = [fact['str_val'] for fact in result['texta_facts'] if fact['fact'] == 'EMAIL']
for fact in result["texta_facts"]:
assert isinstance(fact["doc_path"], str) == True
assert isinstance(fact["str_val"], str) == True
assert isinstance(fact["spans"], str) == True
assert isinstance(fact["fact"], str) == True
assert isinstance(fact["doc_path"], str) is True
assert fact["doc_path"] == "text.text"
assert isinstance(fact["str_val"], str) is True
assert isinstance(fact["spans"], str) is True
assert isinstance(fact["fact"], str) is True
if expected_email is None:
assert not detected_emails
......@@ -33,6 +34,20 @@ def test_emails(mlp, expected_email, test_input):
])
def test_no_email(mlp, test_input):
    """No EMAIL facts for inputs without addresses; other facts keep the default doc_path."""
    result = mlp.process(test_input)
    facts = result['texta_facts']
    emails = [fact['str_val'] for fact in facts if fact['fact'] == 'EMAIL']
    assert len(emails) == 0
    for fact in facts:
        assert fact["doc_path"] == "text.text"
def test_doc_path_parsing_inside_doc(mlp):
    """Email facts inside a nested doc field get the '<path>_mlp.text' doc_path."""
    docs = [{"text": {"subsection": "why not Dörte@Sörensen.example.com"}}]
    doc_paths = ["text.subsection"]
    first_result = mlp.process_docs(docs=docs, doc_paths=doc_paths)[0]
    extracted_facts = first_result["texta_facts"]
    assert len(extracted_facts) > 0
    assert all(fact["doc_path"] == "text.subsection_mlp.text" for fact in extracted_facts)
......@@ -35,13 +35,13 @@ def test_mlp_process(mlp: MLP):
assert "text" in result
mlp_text = result["text"]
assert "lemmas" in mlp_text
assert isinstance(mlp_text["lemmas"], str) == True
assert isinstance(mlp_text["lemmas"], str) is True
assert "lang" in mlp_text
assert isinstance(mlp_text["lang"], dict) == True
assert isinstance(mlp_text["lang"], dict) is True
assert "texta_facts" in result
assert isinstance(result["texta_facts"], list) == True
assert isinstance(result["texta_facts"], list) is True
if mlp_text["lang"]["analysis_lang"] in ("ru", "ar"):
assert "transliteration" in mlp_text
......@@ -52,23 +52,29 @@ def test_mlp_process(mlp: MLP):
# test fact spans
text = mlp_text["text"]
for fact in result["texta_facts"]:
assert isinstance(fact["doc_path"], str) == True
assert isinstance(fact["str_val"], str) == True
assert isinstance(fact["spans"], str) == True
assert isinstance(fact["fact"], str) == True
if fact["fact"] != "BOUNDED":
span_list = json.loads(fact["spans"])
fact_path = fact["doc_path"]
fact_value = fact["str_val"]
fact_spans = fact["spans"]
fact_name = fact["fact"]
assert isinstance(fact_path, str) is True
assert isinstance(fact_value, str) is True
assert isinstance(fact_spans, str) is True
assert isinstance(fact_name, str) is True
assert fact_path == "text.text" or fact_path == "text.lemmas"
if fact_name != "BOUNDED":
span_list = json.loads(fact_spans)
for span in span_list:
spanned_text = text[span[0]:span[1]].lower()
# if phones, do replaces
if fact["fact"] == "PHONE_strict":
if fact_name == "PHONE_strict":
patt = re.compile(r'\([^)]+\)')
spanned_text = re.sub(patt, '', spanned_text).replace(' ', '').replace('-', '').replace('+', '')
# if emails, replace ' ' -> ''
if fact["fact"] == "EMAIL":
if fact_name == "EMAIL":
spanned_text = spanned_text.replace(' ', '')
str_val = fact["str_val"].lower()
str_val = fact_value.lower()
# Skip checks for phones with area codes.
# TODO Rewrite this test taking the problem with parenthesis into consideration.
......@@ -82,7 +88,11 @@ def test_mlp_process(mlp: MLP):
def test_companies_by_list(mlp: MLP, expected_output, test_input):
""" testing companies from a list """
result = mlp.process(test_input)
entities = [fact['str_val'] for fact in result['texta_facts'] if fact['fact'] == 'COMPANY']
entities = []
for fact in result["texta_facts"]:
assert fact["doc_path"] == "text.text"
if fact["fact"] == "COMPANY":
entities.append(fact["str_val"])
if expected_output is None:
assert not entities
else:
......@@ -123,6 +133,7 @@ def test_existing_facts_not_being_overwritten(mlp: MLP):
assert len(facts) == 2 # One BOUNDED added
for fact in facts:
assert fact["str_val"] in ("Edgar Savisaar", "nõmme tänav 24")
assert fact["doc_path"] == "texts_mlp.text"
def test_removal_of_duplicate_facts(mlp: MLP):
......@@ -136,6 +147,7 @@ def test_removal_of_duplicate_facts(mlp: MLP):
assert len(facts) == 1
assert fact["str_val"] == "nõmme tänav 24"
assert fact["doc_path"] == "texts_mlp.text"
def test_processing_docs_with_missing_docpath(mlp: MLP):
......
......@@ -4,18 +4,21 @@ import pytest
@pytest.mark.parametrize("expected_namemail, test_input", [
    ('Керсти Кальюлайд kaljulaidkersti@yandex.ru', 'Отправлено с iPhone06.08.2015, в 20:05, Керсти Кальюлайд kaljulaidkersti@yandex.ru написал(а): '),
    ('Керсти Кальюлайд < kersti1298@mail.ru', 'Воскресенье, 17 января 2016, 1:10 +03:00 от Керсти Кальюлайд < kersti1298@mail.ru >:'),
    # Disabled case, kept for reference:
    # ('Павлович Данилов <danilov64@bk.ru>', 'Кому: Павел Павлович Данилов <danilov64@bk.ru>\nДата: Среда, 10 декабря 2014, 0:31 +03:00'),
    ('Tuule Tormi < tormituule@gmail.com', 'От кого: Tuule Tormi < tormituule@gmail.com >'),
    ('Ove Üllar < 123ove@gmail.com', 'Ove Üllar < 123ove@gmail.com >'),
    ('Антон Казарезов antonkazarezov@mail.ru', 'С уважением,\nАнтон Казарезов\nantonkazarezov@mail.ru\n37259087634')
])
def test_single_namemail(mlp, expected_namemail, test_input):
    """ test for single namemails """
    # Run the full analyzer chain so the NAMEMAIL analyzer is applied.
    result = mlp.process(test_input, analyzers=("lemmas", "pos_tags", "transliteration", "ner", "contacts", "entities", "namemail"))
    facts = result['texta_facts']
    detected_namemails = [fact['str_val'] for fact in facts if fact['fact'] == 'NAMEMAIL']
    assert detected_namemails == [expected_namemail]
    for fact in facts:
        assert fact["doc_path"] == "text.text"
@pytest.mark.parametrize("expected_namemails, test_input", [
......@@ -24,8 +27,11 @@ def test_single_namemail(mlp, expected_namemail, test_input):
def test_multiple_namemails(mlp, expected_namemails, test_input):
    """ test for multiple potential namemails """
    result = mlp.process(test_input)
    facts = result['texta_facts']
    detected_namemails = [fact['str_val'] for fact in facts if fact['fact'] == 'NAMEMAIL']
    # Order of detection is not guaranteed, so compare as sorted lists.
    assert sorted(detected_namemails) == sorted(expected_namemails)
    for fact in facts:
        assert fact["doc_path"] == "text.text"
@pytest.mark.parametrize("test_input", [
......@@ -34,9 +40,23 @@ def test_multiple_namemails(mlp, expected_namemails, test_input):
'This email was sent to amanda@mail.me',
'С уважением, Михайл.\n\n\nFrom: 123456789@mail.ru'
])
def test_no_namemails(mlp, test_input):
    """ test for no potential namemails"""
    result = mlp.process(test_input)
    facts = result['texta_facts']
    detected_namemails = [fact['str_val'] for fact in facts if fact['fact'] == 'NAMEMAIL']
    assert len(detected_namemails) == 0
    for fact in facts:
        assert fact["doc_path"] == "text.text"
def test_doc_path_parsing_inside_doc(mlp):
    """Namemail facts inside a nested doc field get the '<path>_mlp.text' doc_path."""
    docs = [{"text": {"subsection": "To make sure you can receive our emails, please add noreply@actuallyreply.com to your [trusted contacts]"}}]
    doc_paths = ["text.subsection"]
    first_result = mlp.process_docs(docs=docs, doc_paths=doc_paths)[0]
    extracted_facts = first_result["texta_facts"]
    assert len(extracted_facts) > 0
    assert all(fact["doc_path"] == "text.subsection_mlp.text" for fact in extracted_facts)
import pytest
@pytest.mark.parametrize("expected_phone, test_input", [
('74956456601', 'tema number pole +7 495-645-6601'),
('7(903)4744720', '+7 (903) 474-47-20'),
......@@ -16,8 +17,11 @@ import pytest
def test_single_phone(mlp, expected_phone, test_input):
    """ test for single phone numbers """
    result = mlp.process(test_input)
    facts = result["texta_facts"]
    detected_phone = [fact['str_val'] for fact in facts if fact['fact'] == 'PHONE_strict']
    assert detected_phone == [expected_phone]
    for fact in facts:
        assert fact["doc_path"] == "text.text"
@pytest.mark.parametrize("expected_phones, test_input", [
......@@ -28,8 +32,11 @@ def test_single_phone(mlp, expected_phone, test_input):
def test_multiple_phones(mlp, expected_phones, test_input):
    """ test for multiple potential phone numbers """
    result = mlp.process(test_input)
    facts = result['texta_facts']
    detected_phones = [fact['str_val'] for fact in facts if fact['fact'] == 'PHONE_strict']
    # Order of detection is not guaranteed, so compare as sorted lists.
    assert sorted(detected_phones) == sorted(expected_phones)
    for fact in facts:
        assert fact["doc_path"] == "text.text"
@pytest.mark.parametrize("test_input", [
......@@ -53,5 +60,20 @@ def test_no_phones(mlp, test_input):
""" test for no potential phones"""
result = mlp.process(test_input)
print(result['texta_facts'])
phones = [fact['str_val'] for fact in result['texta_facts'] if fact['fact'] == 'PHONE_strict']
facts = result['texta_facts']
phones = [fact['str_val'] for fact in facts if fact['fact'] == 'PHONE_strict']
assert len(phones) == 0
for fact in facts:
assert fact["doc_path"] == "text.text"
def test_doc_path_parsing_inside_doc(mlp):
    """Phone facts inside a nested doc field get the '<path>_mlp.text' doc_path."""
    docs = [{"text": {"subsection": "tema number pole +7 495-645-6601 (7085), vaid on hoopis +372 5012 3123"}}]
    doc_paths = ["text.subsection"]
    first_result = mlp.process_docs(docs=docs, doc_paths=doc_paths)[0]
    extracted_facts = first_result["texta_facts"]
    assert len(extracted_facts) > 0
    assert all(fact["doc_path"] == "text.subsection_mlp.text" for fact in extracted_facts)
......@@ -12,6 +12,7 @@ from .parsers import AddressParser, ContactEmailNamePairParser, ContactEmailPars
ContactPhoneParserHighRecall, ContactPhoneParserStrict
from .russian_transliterator import Transliterate
russian_transliterator = Transliterate()
......@@ -37,6 +38,7 @@ class Document:
CLOSE_FACT_DISTANCE = 150
def __init__(
self,
original_text: str,
......@@ -73,6 +75,16 @@ class Document:
self.__handle_existing_facts()
self.words()
def __get_doc_path(self, field: str) -> str:
    """
    Build the MLP-suffixed doc_path for a fact.

    :param field: Target sub-field of the MLP output ("text" or "lemmas").
    :return: doc_path of the form "<original_path>_mlp.<field>".
    """
    return f"{self.doc_path}_mlp.{field}"
def __handle_existing_facts(self):
"""
Add existing texta_facts inside the document into the private
......@@ -84,6 +96,7 @@ class Document:
for fact in facts:
self.add_fact(fact)
@staticmethod
def remove_duplicate_facts(facts: List[dict]):
if facts:
......@@ -93,14 +106,17 @@ class Document:
else:
return []
def facts_to_json(self) -> dict:
    """Serialize collected facts, dropping duplicates, under the 'texta_facts' key."""
    serialized = []
    for fact in self.__texta_facts:
        serialized.append(fact.to_json())
    return {"texta_facts": Document.remove_duplicate_facts(serialized)}
def add_fact(self, fact: Fact):
    """Register a single fact; duplicates are filtered out later during serialization."""
    self.__texta_facts.append(fact)
def document_to_json(self, use_default_doc_path=True) -> dict:
"""
:param use_default_doc_path: Normal string values will be given the default path for facts but for dictionary input you already have them.
......@@ -115,6 +131,7 @@ class Document:
return nested_dict_wrapper.convert()
def to_json(self, use_default_doc_path=True) -> dict:
container = dict()
container["text"] = self.get_words()
......@@ -131,25 +148,32 @@ class Document:
fact["doc_path"] = "text.text"
return {"text": container, **texta_facts}
def lemmas(self):
    """Cache one lemma per stanza word; missing lemmas become the placeholder 'X'."""
    collected = []
    for word in self.stanza_words:
        if word and word.lemma:
            # Stanza joins multiword lemmas with '_'; strip the separator.
            collected.append(word.lemma.replace("_", ""))
        else:
            collected.append("X")
    self.__lemmas = collected
def get_lemma(self) -> str:
    """Return the lemmatized text as a single space-separated string."""
    stripped = (lemma.strip() for lemma in self.__lemmas)
    return " ".join(stripped)
def words(self):
    """Cache the surface text of every stanza word for later joining."""
    tokens = []
    for token in self.stanza_words:
        tokens.append(token.text)
    self.__words = tokens
def get_words(self) -> str:
    """Return the tokenized text joined with single spaces."""
    separator = " "
    return separator.join(self.__words)
def pos_tags(self):
    """Cache one POS tag per stanza word.

    Words that are missing, have no xpos, or carry the placeholder tag "_"
    are recorded as "X". The previous dangling else-branch
    ('"X" if word.xpos == "_" else "X"') always produced "X" anyway, but
    re-read word.xpos and therefore crashed with AttributeError on a None word.
    """
    self.__pos_tags = [
        word.xpos if word and word.xpos and word.xpos != "_" else "X"
        for word in self.stanza_words
    ]
def get_pos_tags(self) -> str:
    """Return the cached POS tags as one space-separated string."""
    return " ".join(tag.strip() for tag in self.__pos_tags)
def entities(self):
"""
Retrieves list-based entities.
......@@ -166,7 +190,7 @@ class Document:
new_fact = Fact(
fact_type=entity_type,
fact_value=entity_value["value"],
doc_path=f"{self.doc_path}_mlp.text",
doc_path=self.__get_doc_path("text"),
spans=[[entity_value["span"][0], entity_value["span"][1]]]
)
self.__texta_facts.append(new_fact)
......@@ -176,7 +200,7 @@ class Document:
new_fact = Fact(
fact_type=entity_type,
fact_value=entity_value["value"],
doc_path=f"{self.doc_path}_mlp.lemmas",
doc_path=self.__get_doc_path("lemmas"),
spans=[[entity_value["span"][0], entity_value["span"][1]]]
)
self.__texta_facts.append(new_fact)
......@@ -184,6 +208,7 @@ class Document:
# declare the entities processed
self.entities_processed = True
def currency_sum(self):
"""
Extracts currency + sum and sum + currency patterns from text using regexp.
......@@ -205,38 +230,44 @@ class Document:
new_fact = Fact(
fact_type="CURRENCY_SUM",
fact_value=fact_value,
doc_path=self.doc_path,
doc_path=self.__get_doc_path("text"),
spans=[match.start(), match.end()]
)
self.__texta_facts.append(new_fact)
def emails(self):
    """Extract EMAIL facts from the tokenized text; doc_path targets the mlp text field.

    The stale pre-merge extend (using plain self.doc_path) is removed — with both
    diff lines present the same facts were appended twice.
    """
    text = self.get_words()
    emails = ContactEmailParser(text).parse()
    self.__texta_facts.extend(
        email.to_fact(Document.FACT_NAME_EMAIL, self.__get_doc_path("text")) for email in emails
    )
def phone_strict(self):
    """Extract strict-pattern phone number facts; doc_path targets the mlp text field."""
    text = self.get_words()
    phone_numbers_strict = ContactPhoneParserStrict(text).parse()
    # Post-merge form only: the leftover pre-merge argument line (plain self.doc_path) is dropped.
    self.__texta_facts.extend(
        number.to_fact(Document.FACT_NAME_PHONE_STRICT, self.__get_doc_path("text"))
        for number in phone_numbers_strict
    )
def phone_high_recall(self):
    """Extract high-recall phone number facts; doc_path targets the mlp text field."""
    text = self.get_words()
    # months resource filters out date-like digit runs — TODO confirm against parser docs.
    phone_numbers = ContactPhoneParserHighRecall(text, months=self.concat_resources["months"]).parse()
    # Post-merge form only: the leftover pre-merge argument line (plain self.doc_path) is dropped.
    self.__texta_facts.extend(
        number.to_fact(Document.FACT_NAME_PHONE_HIGH_RECALL, self.__get_doc_path("text"))
        for number in phone_numbers
    )
def phone_high_precision(self):
    """Extract high-precision phone number facts; doc_path targets the mlp text field."""
    text = self.get_words()
    phone_numbers_high_precision = ContactPhoneParserHighPrecision(text).parse()
    # Post-merge form only: the leftover pre-merge argument line (plain self.doc_path) is dropped.
    self.__texta_facts.extend(
        number.to_fact(Document.FACT_NAME_PHONE_HIGH_PRECISION, self.__get_doc_path("text"))
        for number in phone_numbers_high_precision
    )
def addresses(self):
    """Extract ADDR facts via language-aware address parsing; doc_path targets the mlp text field."""
    text = self.get_words()
    addresses = AddressParser(text, self.stanza_entities, self.dominant_language_code).parse()
    # Post-merge form only: the leftover pre-merge extend line (plain self.doc_path) is dropped.
    self.__texta_facts.extend(
        addr.to_fact(Document.FACT_NAME_ADDRESS, self.__get_doc_path("text")) for addr in addresses
    )
def transliteration(self):
if self.dominant_language_code in Document.langs_to_transliterate:
......@@ -247,6 +278,7 @@ class Document:
translit_word = self._transliterate_arabic_word(word)
self.__transliteration.append(translit_word)
@staticmethod
def _transliterate_russian_word(word):
translit_word = russian_transliterator([word.text.strip()])
......@@ -256,6 +288,7 @@ class Document:
translit_word = word.text.strip()
return translit_word
@staticmethod
def _transliterate_arabic_word(word):
translit_word = buckwalter.transliterate(word.text.strip())
......@@ -263,9 +296,11 @@ class Document:
translit_word = word.text.strip()
return translit_word
def get_transliteration(self) -> str:
    """Join transliterated tokens with spaces; blank tokens are replaced by 'X'."""
    tokens = []
    for token in self.__transliteration:
        tokens.append(token if token.strip() else 'X')
    return " ".join(tokens)
def entity_lemmas(self, entity_value):
lemmas = ""
splitted = entity_value.split(" ")
......@@ -289,6 +324,7 @@ class Document:
return word.lemma
return lemmas
def ner(self):
tokenized_text = self.get_words()
known_entities = Document.FACT_NAMES_NER
......@@ -316,11 +352,12 @@ class Document:
new_fact = Fact(
fact_type=entity.type,
fact_value=entity.text,
doc_path=f"{self.doc_path}_mlp.text",
doc_path=self.__get_doc_path("text"),
spans=[best_matching_span]
)
self.__texta_facts.append(new_fact)
def namemail(self):
"""
Find name-email pairs.
......@@ -329,7 +366,8 @@ class Document:
text = self.get_words()
email_name_pairs = ContactEmailNamePairParser(text).parse() # bounded -> str "name mail"
self.__texta_facts.extend(
(emailpair.to_fact(Document.FACT_NAME_NAMEMAIL, self.doc_path) for emailpair in email_name_pairs))
(emailpair.to_fact(Document.FACT_NAME_NAMEMAIL, self.__get_doc_path("text")) for emailpair in email_name_pairs))
def remove_duplicate_facts_by_span(self, facts):
"""if there are emailpairs, then:
......@@ -352,7 +390,7 @@ class Document:
name_fact = Fact(
fact_type="PER",
fact_value=name_fact_value.strip("><\)\(:;-\.,\!\?"),
doc_path=self.doc_path,
doc_path=self.__get_doc_path("text"),
spans=[(fact.spans[0][0], fact.spans[0][0] + len(name_fact_value))]
)
......@@ -360,7 +398,7 @@ class Document:
email_fact = Fact(
fact_type="EMAIL",
fact_value=email_fact_value.strip("><\)\(:;-\.,\!\?"),
doc_path=self.doc_path,
doc_path=self.__get_doc_path("text"),
spans=[(fact.spans[0][1] - len(email_fact_value), fact.spans[0][1])]
)
......@@ -378,6 +416,7 @@ class Document: