Commit a116c70f authored by Marko Kollo, committed by Raul Sirel

Doc path hotfix

parent 4cf97a29
@@ -113,7 +113,7 @@ def test_mlp_lemmatize(mlp: MLP, expected_lemmas, test_input):
def test_existing_facts_not_being_overwritten(mlp: MLP):
payload = {
"texts": ["Edgar Savisaar elas Nõmme tänav 24"],
"texta_facts": [{"fact": "PER", "str_val": "Edgar Savisaar", "spans": "[[0, 14]]", "doc_path": "texts"}]
"texta_facts": [{"fact": "PER", "lemma": None, "str_val": "Edgar Savisaar", "spans": "[[0, 14]]", "doc_path": "texts_mlp.text"}]
}
result = mlp.process_docs([payload], doc_paths=["texts"])
original_facts = result[0]["texta_facts"]
@@ -128,7 +128,7 @@ def test_existing_facts_not_being_overwritten(mlp: MLP):
def test_removal_of_duplicate_facts(mlp: MLP):
payload = {
"texts": ["Edgar Savisaar elas Nõmme tänav 24"],
"texta_facts": [{'doc_path': 'texts', 'fact': 'ADDR', 'spans': '[[20, 34]]', 'str_val': 'nõmme tänav 24'}]
"texta_facts": [{'doc_path': 'texts_mlp.text', 'lemma': None, 'fact': 'ADDR', 'spans': '[[20, 34]]', 'str_val': 'nõmme tänav 24'}]
}
result = mlp.process_docs([payload], doc_paths=["texts"])
facts = result[0]["texta_facts"]
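Both test payloads above reflect the new fact shape: pre-existing facts now carry an explicit lemma key and a doc_path that points into the generated <field>_mlp.text subdocument rather than the raw field. A minimal sketch of that shape, with values taken from the tests above:

```python
# Sketch of a texta_facts entry in the format used by these tests.
fact = {
    "fact": "PER",
    "lemma": None,                  # lemma is carried explicitly, even when absent
    "str_val": "Edgar Savisaar",
    "spans": "[[0, 14]]",           # spans are stored as a JSON-encoded string
    "doc_path": "texts_mlp.text",   # points into the MLP-generated subdocument
}
assert fact["doc_path"].endswith("_mlp.text")
```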
@@ -140,12 +140,12 @@ def test_removal_of_duplicate_facts(mlp: MLP):
def test_processing_docs_with_missing_docpath(mlp: MLP):
result = mlp.process_docs(docs=[{"text": {"presidents": "Barack Obama"}}], doc_paths=["text.presidents.people"])
assert result == []
assert result == [{'text': {'presidents': 'Barack Obama'}, 'texta_facts': []}]
def test_processing_docs_with_missing_list_value(mlp: MLP):
result = mlp.process_docs(docs=[{"text": {"presidents": ["Barack Obama"]}}], doc_paths=["text.presidents.people"])
assert result == []
assert result == [{'text': {'presidents': ['Barack Obama']}, 'texta_facts': []}]
def test_processing_docs_with_correct_docpath(mlp: MLP):
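The updated assertions above capture a behaviour change: when a doc_path cannot be resolved, process_docs no longer returns an empty list but passes the original document through with an empty texta_facts list. A minimal sketch, assuming an initialised MLP instance is available as mlp:

```python
# Assumes `mlp` is an initialised texta_mlp MLP instance.
docs = [{"text": {"presidents": "Barack Obama"}}]
result = mlp.process_docs(docs=docs, doc_paths=["text.presidents.people"])
# The document is returned untouched, with an empty facts list attached.
assert result == [{"text": {"presidents": "Barack Obama"}, "texta_facts": []}]
```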
@@ -159,7 +159,7 @@ def test_processing_docs_with_correct_docpath(mlp: MLP):
def test_processing_docs_with_list_value(mlp: MLP):
result = mlp.process_docs(docs=[{"text": {"presidents": ["Barack Obama", "Joe Biden"]}}], doc_paths=["text.presidents"])
assert len(result) == 2
assert len(result) == 1
assert result
for hit in result:
assert "presidents_mlp" in hit["text"]
@@ -170,4 +170,13 @@ def test_processing_docs_with_list_value(mlp: MLP):
def test_processing_docs_with_none_value(mlp: MLP):
result = mlp.process_docs(docs=[{"text": {"presidents": None}}], doc_paths=["text.presidents"])
assert result == []
assert result == [{'text': {'presidents': None}, 'texta_facts': []}]
def test_processing_documents_with_multiple_doc_paths(mlp: MLP):
result = mlp.process_docs(docs=[{"entity": {"first_name": "Barack", "last_name": "Obama"}}], doc_paths=["entity.first_name", "entity.last_name"])
assert len(result) == 1
document = result[0]
assert "texta_facts" in document
assert "first_name_mlp" in document["entity"]
assert "last_name_mlp" in document["entity"]
@@ -12,8 +12,7 @@ from .parsers import AddressParser, ContactEmailNamePairParser, ContactEmailPars
from .russian_transliterator import Transliterate
# transliterator
transliterate = Transliterate()
russian_transliterator = Transliterate()
class Document:
@@ -86,7 +85,8 @@ class Document:
self.add_fact(fact)
def __remove_duplicate_facts(self, facts: List[dict]):
@staticmethod
def remove_duplicate_facts(facts: List[dict]):
if facts:
set_of_jsons = {json.dumps(fact, sort_keys=True, ensure_ascii=False) for fact in facts}
without_duplicates = [json.loads(unique_fact) for unique_fact in set_of_jsons]
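remove_duplicate_facts is promoted to a static method so it can also be called from MLP.process_docs further below; it deduplicates by serialising each fact to a canonical JSON string. A standalone sketch of the same idea:

```python
import json
from typing import List


def remove_duplicate_facts(facts: List[dict]) -> List[dict]:
    # Sorting the keys makes dicts with identical content serialise to identical strings.
    if not facts:
        return []
    unique_jsons = {json.dumps(fact, sort_keys=True, ensure_ascii=False) for fact in facts}
    return [json.loads(unique_fact) for unique_fact in unique_jsons]


facts = [
    {"fact": "PER", "str_val": "Edgar Savisaar", "doc_path": "texts_mlp.text"},
    {"doc_path": "texts_mlp.text", "str_val": "Edgar Savisaar", "fact": "PER"},  # same content, different key order
]
assert len(remove_duplicate_facts(facts)) == 1
```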
@@ -97,7 +97,7 @@ class Document:
def facts_to_json(self) -> dict:
facts = [fact.to_json() for fact in self.__texta_facts]
unique_facts = self.__remove_duplicate_facts(facts)
unique_facts = Document.remove_duplicate_facts(facts)
return {"texta_facts": unique_facts}
@@ -105,11 +105,14 @@ class Document:
self.__texta_facts.append(fact)
def document_to_json(self):
def document_to_json(self, use_default_doc_path=True) -> dict:
"""
:param use_default_doc_path: Normal string values will be given the default path for facts but for dictionary input you already have them.
"""
list_of_path_keys = self.doc_path.split(".")
root_key = "{}_mlp".format(list_of_path_keys[-1])
path_to_mlp = list_of_path_keys[:-1] + [root_key] if len(list_of_path_keys) > 1 else [root_key]
mlp_result = self.to_json()
mlp_result = self.to_json(use_default_doc_path)
nested_dict_wrapper = PelicanJson(self.json_doc)
nested_dict_wrapper.set_nested_value(path_to_mlp, mlp_result["text"], force=True)
nested_dict_wrapper.set_nested_value(["texta_facts"], mlp_result["texta_facts"], force=True)
@@ -117,7 +120,7 @@ class Document:
return nested_dict_wrapper.convert()
def to_json(self) -> dict:
def to_json(self, use_default_doc_path=True) -> dict:
container = dict()
container["text"] = self.get_words()
texta_facts = self.facts_to_json()
@@ -126,8 +129,9 @@ class Document:
if "pos_tags" in self.analyzers: container["pos_tags"] = self.get_pos_tags()
# if "sentiment" in self.analyzers: container["sentiment"] = self.get_sentiment()
if "transliteration" in self.analyzers and self.__transliteration: container["transliteration"] = self.get_transliteration()
for fact in texta_facts["texta_facts"]:
fact["doc_path"] = "text.text"
if use_default_doc_path:
for fact in texta_facts["texta_facts"]:
fact["doc_path"] = "text.text"
return {"text": container, **texta_facts}
@@ -164,21 +168,31 @@ class Document:
hits = self.entity_mapper.map_entities(text)
lemma_hits = self.entity_mapper.map_entities(lemmas, entity_types=["CURRENCY"])
# combine hits
all_hits = z = {**hits, **lemma_hits}
# make facts
for entity_type, entity_values in all_hits.items():
for entity_type, entity_values in hits.items():
for entity_value in entity_values:
new_fact = Fact(
fact_type=entity_type,
fact_value=entity_value["value"],
doc_path=self.doc_path,
doc_path=f"{self.doc_path}_mlp.text",
spans=[[entity_value["span"][0], entity_value["span"][1]]]
)
self.__texta_facts.append(new_fact)
for entity_type, entity_values in lemma_hits.items():
for entity_value in entity_values:
new_fact = Fact(
fact_type=entity_type,
fact_value=entity_value["value"],
doc_path=f"{self.doc_path}_mlp.lemmas",
spans=[[entity_value["span"][0], entity_value["span"][1]]]
)
self.__texta_facts.append(new_fact)
# declare the entities processed
self.entities_processed = True
def currency_sum(self):
"""
@@ -205,7 +219,7 @@ class Document:
spans=[match.start(), match.end()]
)
self.__texta_facts.append(new_fact)
def emails(self):
text = self.get_words()
@@ -249,7 +263,7 @@ class Document:
@staticmethod
def _transliterate_russian_word(word):
translit_word = transliterate([word.text.strip()])
translit_word = russian_transliterator([word.text.strip()])
try:
translit_word = translit_word[0].strip()
except IndexError:
@@ -259,7 +273,7 @@ class Document:
@staticmethod
def _transliterate_arabic_word(word):
translit_word = buckwalter.transliterate(word.text.strip())
translit_word = buckwalter.russian_transliterator(word.text.strip())
if not translit_word:
translit_word = word.text.strip()
return translit_word
@@ -569,7 +583,7 @@ class Document:
remove previous phone facts
add phoneparser made for emails (ContactPhoneParserHighRecall)
"""
new_facts = [fact for fact in facts if not re.match("PHONE|"+Document.FACT_NAME_BOUNDED, fact.fact_type)]
new_facts = [fact for fact in facts if not re.match("PHONE|" + Document.FACT_NAME_BOUNDED, fact.fact_type)]
text = self.get_words()
phone_numbers = ContactPhoneParserHighRecall(text, months=self.concat_resources["months"]).parse()
new_facts.extend((number.to_fact(Document.FACT_NAME_PHONE_HIGH_RECALL, self.doc_path) for number in phone_numbers))
@@ -15,6 +15,7 @@ from texta_mlp.document import Document
from texta_mlp.entity_mapper import EntityMapper
from texta_mlp.exceptions import LanguageNotSupported
# Languages supported by default.
DEFAULT_LANG_CODES = ("et", "ru", "en", "ar")
@@ -71,9 +72,8 @@ class MLP:
self.logger = logging.getLogger()
self.default_lang = default_language_code
self.use_default_lang = use_default_language_code
self.langs_to_transliterate = ["ru"]
self.resource_dir = resource_dir
self.use_gpu=use_gpu
self.use_gpu = use_gpu
self.resource_dir_pathlib = pathlib.Path(resource_dir)
self.not_entities_path = self.resource_dir_pathlib / "concatenator" / "not_entities.txt"
@@ -249,7 +249,7 @@ class MLP:
logging_level=logging_level
)
return stanza_pipelines
@staticmethod
def _get_stanza_processors(lang):
@@ -268,7 +268,7 @@ class MLP:
Processes raw text.
:param: raw_text str: Text to be processed.
:param: analyzers list: List of analyzers to be used.
:return: Processed text as document ready for Elastic.
:return: Processed text as document ready for Elastic.
"""
loaded_analyzers = self._load_analyzers(analyzers, SUPPORTED_ANALYZERS)
document = self.generate_document(raw_text, loaded_analyzers, lang=lang)
@@ -330,10 +330,22 @@ class MLP:
# For every analyzer, activate the function that processes it from the
# document class.
getattr(doc, analyzer)()
data = doc.document_to_json()
container.append(data)
else:
container.append(None)
result = doc.document_to_json(use_default_doc_path=False)
new_facts = result.pop("texta_facts", [])
existing_facts = document.get("texta_facts", [])
unique_facts = Document.remove_duplicate_facts(new_facts + existing_facts)
result["texta_facts"] = unique_facts
document = result
if document:
# Add in texta_facts even if nothing was done due to missing values.
facts = document.get("texta_facts", [])
document["texta_facts"] = facts
container.append(document)
else:
# Add in at least something to avoid problems with operations that include indexing.
container.append({})
return container
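Taken together, process_docs now keeps facts that were already on the document, merges them with the newly extracted ones via Document.remove_duplicate_facts, and appends an empty dict when a document is falsy. A usage sketch, assuming texta_mlp and its resources are installed (the constructor arguments follow the project's public examples and are otherwise assumptions):

```python
from texta_mlp.mlp import MLP

# Assumed constructor arguments; adjust resource_dir to where the models live.
mlp = MLP(language_codes=["et", "en"], resource_dir="./mlp_resources")

docs = [{
    "texts": ["Edgar Savisaar elas Nõmme tänav 24"],
    "texta_facts": [{"fact": "PER", "lemma": None, "str_val": "Edgar Savisaar",
                     "spans": "[[0, 14]]", "doc_path": "texts_mlp.text"}],
}]
result = mlp.process_docs(docs, doc_paths=["texts"])
# Pre-existing facts survive processing; duplicates introduced by reprocessing are removed.
assert any(fact["fact"] == "PER" for fact in result[0]["texta_facts"])
```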