Commit b3761b32 authored by Raul Sirel

Merge branch 'mlp_batch'

parents 6bcc9962 9d5a381c
Pipeline #6817 passed with stage in 7 minutes and 40 seconds
......@@ -46,6 +46,8 @@ def test_mlp_process(mlp: MLP):
assert "lemmas" in mlp_text
assert isinstance(mlp_text["lemmas"], str) is True
assert "word_features" in mlp_text
assert "language" in mlp_text
assert isinstance(mlp_text["language"], dict) is True
......@@ -164,12 +166,12 @@ def test_removal_of_duplicate_facts(mlp: MLP):
def test_processing_docs_with_missing_docpath(mlp: MLP):
result = mlp.process_docs(docs=[{"text": {"presidents": "Barack Obama"}}], doc_paths=["text.presidents.people"])
assert result == [{'text': {'presidents': 'Barack Obama'}, 'texta_facts': []}]
assert result == [{'text': {'presidents': 'Barack Obama'}}]
def test_processing_docs_with_missing_list_value(mlp: MLP):
result = mlp.process_docs(docs=[{"text": {"presidents": ["Barack Obama"]}}], doc_paths=["text.presidents.people"])
assert result == [{'text': {'presidents': ['Barack Obama']}, 'texta_facts': []}]
assert result == [{'text': {'presidents': ['Barack Obama']}}]
def test_processing_docs_with_correct_docpath(mlp: MLP):
......@@ -194,7 +196,7 @@ def test_processing_docs_with_list_value(mlp: MLP):
def test_processing_docs_with_none_value(mlp: MLP):
result = mlp.process_docs(docs=[{"text": {"presidents": None}}], doc_paths=["text.presidents"])
assert result == [{'text': {'presidents': None}, 'texta_facts': []}]
assert result == [{'text': {'presidents': None}}]
def test_processing_documents_with_multiple_doc_paths(mlp: MLP):
......
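For reference, a minimal usage sketch of the batched entry point these tests exercise; it assumes `mlp` is an already initialised MLP instance (as in the test fixture) and that the output keys follow the process_docs docstring further below:

docs = [
    {"text": {"presidents": "Barack Obama"}},
    {"text": {"presidents": "Angela Merkel"}},
]
results = mlp.process_docs(docs=docs, doc_paths=["text.presidents"])
# Each document is returned in its original position, with the MLP output
# stored under "presidents_mlp" and the extracted facts under "texta_facts".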
......@@ -3,6 +3,7 @@ import math
from typing import List, Optional
import regex as re
import stanza
from lang_trans.arabic import buckwalter
from pelecanus import PelicanJson
......@@ -44,8 +45,7 @@ class Document:
original_text: str,
dominant_language_code: str,
analysis_lang: str,
stanza_sentences: [list],
stanza_entities,
stanza_document: stanza.Document = None,
entity_mapper: Optional[EntityMapper] = None,
doc_path: str = "text_mlp",
json_doc: dict = None,
......@@ -64,10 +64,11 @@ class Document:
self.entities_processed = False
self.entity_mapper = entity_mapper
self.stanza_sentences = stanza_sentences
self.stanza_words = [word for sentence in self.stanza_sentences for word in sentence]
self.stanza_entities = stanza_entities
self.stanza_document = stanza_document
self.__stanza_sentences = []
self.__stanza_words = []
self.__stanza_entities = []
self.__words = []
self.__lemmas = []
self.__pos_tags = []
......@@ -76,7 +77,35 @@ class Document:
self.__texta_facts: List[Fact] = []
self.__handle_existing_facts()
self.words()
if self.stanza_document:
self.words()
@property
def stanza_sentences(self):
if not self.__stanza_sentences and self.stanza_document:
for sentence in self.stanza_document.sentences:
self.__stanza_sentences.append(sentence)
return self.__stanza_sentences
@property
def stanza_words(self):
if not self.__stanza_words and self.stanza_document:
for sentence in self.stanza_sentences:
for word in sentence.words:
self.__stanza_words.append(word)
return self.__stanza_words
@property
def stanza_entities(self):
if not self.__stanza_entities and self.stanza_document:
for entity in self.stanza_document.entities:
self.__stanza_entities.append(entity)
return self.__stanza_entities
def __get_doc_path(self, field: str) -> str:
......@@ -93,7 +122,7 @@ class Document:
Add existing texta_facts inside the document into the private
fact container variable so that they wouldn't be overwritten.
"""
if self.json_doc and "texta_facts" in self.json_doc:
if self.json_doc:
existing_facts = self.json_doc.get("texta_facts", [])
facts = Fact.from_json(existing_facts)
for fact in facts:
......@@ -174,6 +203,29 @@ class Document:
return wrapper.convert()
@staticmethod
def parse_doc(doc_path: str, document: dict) -> list:
"""
Function for parsing text values from a nested dictionary given a field path.
:param doc_path: Dot separated path of fields to the value we wish to parse.
:param document: Document to be worked on.
:return: List of text fields that will be processed by MLP.
"""
wrapper = PelicanJson(document)
doc_path_as_list = doc_path.split(".")
content = wrapper.safe_get_nested_value(doc_path_as_list, default=[])
if content and isinstance(content, str):
return [content]
# Check that the content is a non-empty list containing only strings.
elif content and isinstance(content, list) and all([isinstance(list_content, str) for list_content in content]):
return content
# In case the field path is faulty and it gives you a dictionary instead.
elif isinstance(content, dict):
return []
else:
return []
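A minimal illustration of how this helper resolves dot-separated paths, using the document shapes from the tests above; the return values follow the branches of the method:

doc = {"text": {"presidents": ["Barack Obama"]}}
Document.parse_doc("text.presidents", doc)         # -> ["Barack Obama"]
Document.parse_doc("text.presidents.people", doc)  # -> [] (path resolves to nothing)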
def document_to_json(self, use_default_doc_path=True) -> dict:
"""
:param use_default_doc_path: Plain string values are given the default path for facts, while dictionary input already carries its own paths.
......@@ -216,7 +268,7 @@ class Document:
def lemmas(self):
for sent in self.stanza_sentences:
self.__lemmas.append([word.lemma.replace("_", "") if word and word.lemma else "X" for word in sent])
self.__lemmas.append([word.lemma.replace("_", "") if word and word.lemma else "X" for word in sent.words])
def get_lemma(self) -> str:
......@@ -236,8 +288,8 @@ class Document:
def words(self):
for sent in self.stanza_sentences:
self.__words.append([word.text for word in sent])
for sentence in self.stanza_sentences:
self.__words.append([word.text for word in sentence.words])
def get_words(self) -> str:
......@@ -250,8 +302,7 @@ class Document:
def pos_tags(self):
if "sentences" in self.analyzers:
for i,sent in enumerate(self.stanza_sentences):
#print(sent)
tags_in_sent = [word.upos if word and word.upos and word.upos != "_" else "X" if word.upos == "_" else "X" for word in sent]
tags_in_sent = [word.upos if word and word.upos and word.upos != "_" else "X" for word in sent.words]
for tag in tags_in_sent:
self.__pos_tags.append(tag)
# if not last item
......@@ -268,7 +319,7 @@ class Document:
def word_features(self):
if "sentences" in self.analyzers:
for i,sent in enumerate(self.stanza_sentences):
tags_in_sent = [word.feats if word and word.feats and word.feats != "_" else "X" if word.feats == "_" else "X" for word in sent]
tags_in_sent = [word.feats if word and word.feats and word.feats != "_" else "X" for word in sent.words]
for tag in tags_in_sent:
self.__word_features.append(tag)
# if not last item
......@@ -384,19 +435,19 @@ class Document:
if self.dominant_language_code in Document.langs_to_transliterate:
for word in self.stanza_words:
if self.dominant_language_code == "ru":
translit_word = self._transliterate_russian_word(word)
translit_word = self._transliterate_russian_word(word.text)
elif self.dominant_language_code == "ar":
translit_word = self._transliterate_arabic_word(word)
translit_word = self._transliterate_arabic_word(word.text)
self.__transliteration.append(translit_word)
@staticmethod
def _transliterate_russian_word(word):
translit_word = russian_transliterator([word.text.strip()])
def _transliterate_russian_word(word: str):
translit_word = russian_transliterator([word.strip()])
try:
translit_word = translit_word[0].strip()
except IndexError:
translit_word = word.text.strip()
translit_word = word.strip()
return translit_word
......
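For reference, a minimal sketch of building a Document straight from a pre-parsed stanza.Document, which the refactored constructor above allows; the pipeline setup and processor list are assumptions and not part of this diff:

import stanza
from texta_mlp.document import Document

# Assumes the English Stanza models have been downloaded beforehand.
pipeline = stanza.Pipeline("en", processors="tokenize,pos,lemma,ner")
stanza_doc = pipeline("Barack Obama was the 44th president.")

doc = Document(
    original_text="Barack Obama was the 44th president.",
    dominant_language_code="en",
    analysis_lang="en",
    stanza_document=stanza_doc,
    analyzers=["lemmas", "pos_tags", "ner"],
)
doc.lemmas()
print(doc.get_lemma())  # sentences, words and entities are derived lazily from stanza_doc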
......@@ -10,12 +10,11 @@ import regex as re
import stanza
from bs4 import BeautifulSoup
from langdetect import detect
from pelecanus import PelicanJson
from texta_mlp.document import Document
from texta_mlp.entity_mapper import EntityMapper
from texta_mlp.utils import parse_bool_env
from texta_mlp.exceptions import LanguageNotSupported
from texta_mlp.utils import parse_bool_env
# Languages supported by default.
......@@ -64,7 +63,7 @@ SUPPORTED_ANALYZERS = (
DEFAULT_ANALYZERS = [
"lemmas",
"pos_tags",
#"word_features",
"word_features",
"transliteration",
"ner",
"addresses",
......@@ -224,8 +223,10 @@ class MLP:
return lang
def generate_document(self, raw_text: str, analyzers: List[str], json_object: dict = None, doc_paths="text", lang=None):
def generate_document(self, raw_text: str, analyzers: List[str], json_object: dict = None, doc_paths="text", lang=None, stanza_document=None):
processed_text = MLP.normalize_input_text(raw_text)
e = ""
# detect language
if not lang:
lang = self.detect_language(processed_text)
......@@ -233,19 +234,26 @@ class MLP:
check whether the detected language is supported; if it is not, default_lang is used to load the
stanza models while the document's lang is kept as the detected language
'''
# Resolve the language.
if lang not in self.supported_langs:
analysis_lang = self.default_lang
sentences, entities, e = self._get_stanza_tokens(analysis_lang, processed_text) if processed_text else ([], [], "")
else:
analysis_lang = lang
sentences, entities, e = self._get_stanza_tokens(analysis_lang, processed_text) if processed_text else ([], [], "")
# Use the pre-computed Stanza document if it was given, otherwise run the pipeline ourselves.
if processed_text and stanza_document is None:
document, e = self._get_stanza_document(analysis_lang, processed_text) if processed_text else (None, "")
elif stanza_document and processed_text:
document = stanza_document
else:
document = None
# Create the overall wrapper.
document = Document(
original_text=processed_text,
dominant_language_code=lang,
analysis_lang=analysis_lang,
stanza_sentences=sentences,
stanza_entities=entities,
stanza_document=document,
analyzers=analyzers,
json_doc=json_object,
doc_path=doc_paths,
......@@ -267,6 +275,7 @@ class MLP:
self._entity_mapper = self._load_entity_mapper()
return self._entity_mapper
def get_stanza_pipeline(self, lang: str):
if lang not in self._stanza_pipelines:
if lang in self.custom_ner_model_langs:
......@@ -289,12 +298,11 @@ class MLP:
return self._stanza_pipelines[lang]
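A hedged sketch of feeding a pre-computed stanza.Document into generate_document so the pipeline is not run a second time; it assumes `mlp` is an initialised MLP instance:

text = "Barack Obama was the 44th president."
pipeline = mlp.get_stanza_pipeline("en")
stanza_doc = pipeline(text)
doc = mlp.generate_document(
    text,
    analyzers=["lemmas", "pos_tags", "ner"],
    lang="en",
    stanza_document=stanza_doc,
)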
def _get_stanza_tokens(self, lang: str, raw_text: str):
sentences = []
entities = []
def _get_stanza_document(self, lang: str, raw_text: str):
e = ""
try:
pipeline = self.get_stanza_pipeline(lang)(raw_text)
document = self.get_stanza_pipeline(lang)(raw_text)
return document, e
for sentence in pipeline.sentences:
words = []
......@@ -310,9 +318,12 @@ class MLP:
except Exception as e:
self.logger.exception(e)
return sentences, entities, repr(e)
return None, repr(e)
return sentences, entities, e
def _get_stanza_ner(self, lang: str, raw_text: str):
pipeline = self.get_stanza_pipeline(lang)(raw_text)
return [entity for sentence in pipeline.sentences for entity in sentence.entities]
@staticmethod
......@@ -360,28 +371,6 @@ class MLP:
return document["text_mlp"]["lemmas"]
def parse_doc_texts(self, doc_path: str, document: dict) -> list:
"""
Function for parsing text values from a nested dictionary given a field path.
:param doc_path: Dot separated path of fields to the value we wish to parse.
:param document: Document to be worked on.
:return: List of text fields that will be processed by MLP.
"""
wrapper = PelicanJson(document)
doc_path_as_list = doc_path.split(".")
content = wrapper.safe_get_nested_value(doc_path_as_list, default=[])
if content and isinstance(content, str):
return [content]
# Check that content is non-empty list and there are only stings in the list.
elif content and isinstance(content, list) and all([isinstance(list_content, str) for list_content in content]):
return content
# In case the field path is faulty and it gives you a dictionary instead.
elif isinstance(content, dict):
return []
else:
return []
def __apply_analyzer(self, doc, analyzer):
try:
getattr(doc, analyzer)()
......@@ -397,15 +386,47 @@ class MLP:
:return: List of dictionaries where the MLP output is stored under the last field of the doc_path in the format {doc_path}_mlp and the extracted facts inside texta_facts.
"""
# Container for keeping the tuples of the doc and meta pairs.
container = []
for document in docs:
for doc_path in doc_paths:
analyzers = self._load_analyzers(analyzers, SUPPORTED_ANALYZERS)
for doc_path in doc_paths:
lang_group = {}
texts = [Document.parse_doc(doc_path, document) for document in docs]
for index, text in enumerate(texts):
text = text[0] if text else ""
lang = self.detect_language(text)
if lang not in self.supported_langs:
lang = self.default_lang
if lang and lang not in lang_group:
lang_group[lang] = [{"index": index, "text": text}]
elif lang in lang_group:
lang_group[lang].append({"index": index, "text": text})
intermediary = []
for lang, items in lang_group.items():
pipeline = self.get_stanza_pipeline(lang)
# Create the batch of Stanza Documents to feed into the pipeline.
documents = []
for item in items:
text = item.get("text", "")
text = text if text else ""
documents.append(stanza.Document([], text=text))
# Analyze the batch.
results = pipeline(documents)
for index, result in enumerate(results):
actual_index = items[index]["index"]
# Tie together the original document, its position in the list (for replacement) and the relevant Stanza document.
intermediary.insert(actual_index, ({"actual_doc": docs[actual_index], "actual_index": actual_index, "lang": lang}, result))
for meta_info, stanza_document in intermediary:
# Traverse the (possibly) nested dicts and extract their text values as a list of strings.
# Since the nested doc_path can point to a list, there may be multiple pieces of text to process.
doc_texts = self.parse_doc_texts(doc_path, document)
actual_document = meta_info["actual_doc"]
actual_index = meta_info["actual_index"]
lang = meta_info["lang"]
doc_texts = Document.parse_doc(doc_path, actual_document)
for raw_text in doc_texts:
analyzers = self._load_analyzers(analyzers, SUPPORTED_ANALYZERS)
doc = self.generate_document(raw_text, analyzers, document, doc_paths=doc_path)
doc = self.generate_document(raw_text, analyzers=analyzers, json_object=actual_document, lang=lang, stanza_document=stanza_document, doc_paths=doc_path)
if doc:
for analyzer in analyzers:
# For every analyzer, activate the function that processes it from the
......@@ -416,22 +437,10 @@ class MLP:
doc.fact_spans_to_sent()
result = doc.document_to_json(use_default_doc_path=False)
new_facts = result.pop("texta_facts", [])
existing_facts = document.get("texta_facts", [])
unique_facts = Document.remove_duplicate_facts(new_facts + existing_facts)
result["texta_facts"] = unique_facts
document = result
if document:
# Add in texta_facts even if nothing was done due to missing values.
facts = document.get("texta_facts", [])
document["texta_facts"] = facts
container.append(document)
else:
# Add in at least something to avoid problems with operations that include indexing.
container.append({})
docs[actual_index] = result
return docs
return container
@staticmethod
def download_concatenator_resources(resource_dir: str, logger):
......
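The heart of the new process_docs flow is Stanza's support for running a list of stanza.Document objects through a pipeline in a single call; a standalone sketch of that pattern (the pipeline setup is an assumption, not part of this diff):

import stanza

# Assumes the English Stanza models have been downloaded beforehand.
pipeline = stanza.Pipeline("en", processors="tokenize,pos,lemma,ner")
texts = ["First text to analyze.", "Second text to analyze."]
in_docs = [stanza.Document([], text=text) for text in texts]
out_docs = pipeline(in_docs)  # one batched call instead of one call per text
for out_doc in out_docs:
    print(len(out_doc.sentences))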