Commit 18c432c2 authored by Hele-Andra Kuulmets

add sentence split

parent 45a1a7a7
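This commit makes MLP keep sentence boundaries: Stanza tokens are now grouped per sentence, and a new "sentences" analyzer makes the joined text and lemma output newline-separated per sentence. A minimal sketch of the intended behavior, assuming an initialized MLP instance like the one used in the test fixtures below:

    # Sketch only: mlp is an initialized MLP instance (see the tests below).
    result = mlp.process("Siin on üks lause. See on teine lause.")
    # With the "sentences" analyzer active, sentences come back newline-separated:
    result["text"]["text"]   # 'Siin on üks lause .\nSee on teine lause .'
    # Passing an explicit analyzer list without "sentences" keeps single-line output:
    result = mlp.process("Siin on üks lause. See on teine lause.", analyzers=["lemmas", "pos_tags"])
    result["text"]["text"]   # 'Siin on üks lause . See on teine lause .'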
@@ -101,7 +101,7 @@ def test_remove_duplicate_facts_by_span_in_doc(expected_non_duplicate_facts, test_input):
         original_text="",
         dominant_language_code="en",
         analysis_lang="en",
-        stanza_words=[],
+        stanza_sentences=[],
         stanza_entities=[],
         entity_mapper=None,
         doc_path="text",
@@ -135,7 +135,7 @@ def test_bound_close_ones(expected_close_BOUNDS, test_input):
         original_text="",
         dominant_language_code="en",
         analysis_lang="en",
-        stanza_words=[],
+        stanza_sentences=[],
         stanza_entities=[],
         entity_mapper=None,
         doc_path="text",
@@ -168,7 +168,7 @@ def test_remove_overlaping_in_bounded(expected_bounds_no_overlap, test_input):
         original_text="",
         dominant_language_code="en",
         analysis_lang="en",
-        stanza_words=[],
+        stanza_sentences=[],
         stanza_entities=[],
         entity_mapper=None,
         doc_path="text",
@@ -195,7 +195,7 @@ def test_concatenate_subset_bounds(expected_bounds_no_subsets, test_input):
         original_text="",
         dominant_language_code="en",
         analysis_lang="en",
-        stanza_words=[],
+        stanza_sentences=[],
         stanza_entities=[],
         entity_mapper=None,
         doc_path="text",
@@ -221,7 +221,7 @@ def test_concatenate_subset_bounds(key_value_single_pairs, test_input):
         original_text="",
         dominant_language_code="en",
         analysis_lang="en",
-        stanza_words=[],
+        stanza_sentences=[],
         stanza_entities=[],
         entity_mapper=None,
         doc_path="text",
@@ -243,7 +243,7 @@ def test_space_between_ok(mlp, ok_spaces):
         original_text="",
         dominant_language_code="en",
         analysis_lang="en",
-        stanza_words=[],
+        stanza_sentences=[],
         stanza_entities=[],
         entity_mapper=None,
         doc_path="text",
@@ -269,7 +269,7 @@ def test_space_between_not_ok(mlp, not_ok_spaces):
         original_text="",
         dominant_language_code="en",
         analysis_lang="en",
-        stanza_words=[],
+        stanza_sentences=[],
         stanza_entities=[],
         entity_mapper=None,
         doc_path="text",
@@ -293,7 +293,7 @@ def test_clean_similar_in_strval(similar_cleaned_str_val, test_input):
         original_text="",
         dominant_language_code="en",
         analysis_lang="en",
-        stanza_words=[],
+        stanza_sentences=[],
         stanza_entities=[],
         entity_mapper=None,
         doc_path="text",
......
@@ -202,3 +202,20 @@ def test_that_ner_output_contains_correct_doc_path(mlp: MLP):
     facts = result.get("texta_facts")
     usa_fact = facts[0]
     assert usa_fact["doc_path"] == "comment.text_mlp.text"
+
+
+def test_sentences_separated_with_newline(mlp: MLP):
+    result = mlp.process("Siin on üks lause. See on teine lause.")
+    mlp_result = result["text"]
+    assert mlp_result["text"] == 'Siin on üks lause .\nSee on teine lause .'
+    assert mlp_result["lemmas"] == 'siin olema üks lause .\nsee olema teine lause .'
+
+
+def test_sentences_not_separated_with_newline(mlp: MLP):
+    result = mlp.process("Siin on üks lause. See on teine lause.", analyzers=["lemmas", "pos_tags"])
+    mlp_result = result["text"]
+    assert mlp_result["text"] == 'Siin on üks lause . See on teine lause .'
+    assert mlp_result["lemmas"] == 'siin olema üks lause . see olema teine lause .'
@@ -44,7 +44,7 @@ class Document:
             original_text: str,
             dominant_language_code: str,
             analysis_lang: str,
-            stanza_words,
+            stanza_sentences: list,
             stanza_entities,
             concat_resources: dict,
             entity_mapper: Optional[EntityMapper] = None,
@@ -61,7 +61,8 @@ class Document:
         self.json_doc = json_doc
         self.entity_mapper = entity_mapper
-        self.stanza_words = stanza_words
+        self.stanza_sentences = stanza_sentences
+        self.stanza_words = [word for sentence in self.stanza_sentences for word in sentence]
         self.stanza_entities = stanza_entities
         self.concat_resources = concat_resources
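For backwards compatibility the constructor still exposes stanza_words, now derived by flattening the per-sentence lists in order, so existing word-level code is unaffected. A standalone illustration of the flattening comprehension (toy strings instead of Stanza Word objects):

    sentences = [["Siin", "on", "üks", "lause", "."], ["See", "on", "teine", "lause", "."]]
    # Same result as itertools.chain.from_iterable(sentences):
    words = [word for sentence in sentences for word in sentence]
    print(words[:5])   # ['Siin', 'on', 'üks', 'lause', '.']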
@@ -134,7 +135,7 @@ class Document:
     def to_json(self, use_default_doc_path=True) -> dict:
         container = dict()
-        container["text"] = self.get_words()
+        container["text"] = self.get_words(ssplit="sentences" in self.analyzers)
         texta_facts = self.facts_to_json()
         container["language"] = {"detected": self.dominant_language_code,
                                  "analysis": self.analysis_lang}
@@ -150,19 +151,34 @@ class Document:
     def lemmas(self):
-        self.__lemmas = [word.lemma.replace("_", "") if word and word.lemma else "X" for word in self.stanza_words]
+        for sent in self.stanza_sentences:
+            self.__lemmas.append([word.lemma.replace("_", "") if word and word.lemma else "X" for word in sent])
 
     def get_lemma(self) -> str:
-        return " ".join([a.strip() for a in self.__lemmas])
+        sentences = []
+        for sent_lemmas in self.__lemmas:
+            sentences.append(" ".join([a.strip() for a in sent_lemmas]))
+        if "sentences" in self.analyzers:
+            return "\n".join(sentences)
+        else:
+            return " ".join(sentences)
 
     def words(self):
-        self.__words = [word.text for word in self.stanza_words]
+        for sent in self.stanza_sentences:
+            self.__words.append([word.text for word in sent])
+
+    def sentences(self):
+        pass
 
-    def get_words(self) -> str:
-        return " ".join(self.__words)
+    def get_words(self, ssplit=False) -> str:
+        if ssplit:
+            return "\n".join([" ".join(sent_words) for sent_words in self.__words])
+        else:
+            return " ".join([" ".join(sent_words) for sent_words in self.__words])
 
     def pos_tags(self):
......
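get_lemma() and get_words() now share the same shape: join the words of each sentence with spaces, then join the sentences with a newline when sentence splitting is requested, or a space otherwise. A minimal standalone sketch of that join logic (hypothetical helper, not part of the Document API):

    def join_sentences(sentences, ssplit=False):
        # Space-join each sentence's tokens, then pick the sentence separator.
        separator = "\n" if ssplit else " "
        return separator.join(" ".join(words) for words in sentences)

    tokens = [["Siin", "on", "üks", "lause", "."], ["See", "on", "teine", "lause", "."]]
    print(join_sentences(tokens))               # Siin on üks lause . See on teine lause .
    print(join_sentences(tokens, ssplit=True))  # two lines, one sentence per line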
@@ -52,7 +52,8 @@ SUPPORTED_ANALYZERS = (
     "entities",
     "namemail",
     "bounded",
-    "currency_sum"
+    "currency_sum",
+    "sentences"
 )
 # Here we define languages with NER support to avoid Stanza trying to load them for languages without NER support.
@@ -196,16 +197,16 @@ class MLP:
         '''
         if lang not in self.supported_langs:
             analysis_lang = self.default_lang
-            words, entities = self._get_stanza_tokens(analysis_lang, processed_text) if processed_text else ([], [])
+            sentences, entities = self._get_stanza_tokens(analysis_lang, processed_text) if processed_text else ([], [])
         else:
             analysis_lang = lang
-            words, entities = self._get_stanza_tokens(analysis_lang, processed_text) if processed_text else ([], [])
+            sentences, entities = self._get_stanza_tokens(analysis_lang, processed_text) if processed_text else ([], [])
         document = Document(
             original_text=processed_text,
             dominant_language_code=lang,
             analysis_lang=analysis_lang,
-            stanza_words=words,
+            stanza_sentences=sentences,
             stanza_entities=entities,
             analyzers=analyzers,
             json_doc=json_object,
@@ -230,23 +231,23 @@ class MLP:
         if lang == "ru":
             raw_text = pat.sub("_", raw_text)
         # Collect the words of each sentence into a separate list.
         pipeline = self.stanza_pipelines[lang](raw_text)
-        words = []
+        sentences = []
         entities = []
         pip_pat = re.compile(r"(?<=\d)_(?=\d)")
         for sentence in pipeline.sentences:
+            words = []
             for word in sentence.words:
                 # Russian HACK (2/2)
                 # replaces "_" back to "-" between digits.
                 if lang == "ru":
                     word.text = pip_pat.sub("-", word.text)
                 words.append(word)
+            sentences.append(words)
             for entity in sentence.entities:
                 entities.append(entity)
-        return words, entities
+        return sentences, entities
 
     def _get_stanza_ner(self, lang: str, raw_text: str):
......
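_get_stanza_tokens() now returns a list of sentences, each a list of Stanza Word objects, instead of one flat word list. For reference, the Stanza structures it iterates look roughly like this (sketch only; the language code and processor list are illustrative, and the model must be downloaded first):

    import stanza

    stanza.download("et")  # one-time model download (assumed available)
    nlp = stanza.Pipeline("et", processors="tokenize,pos,lemma")
    doc = nlp("Siin on üks lause. See on teine lause.")
    # doc.sentences is a list of Sentence objects, each exposing .words:
    sentences = [[word for word in sentence.words] for sentence in doc.sentences]
    lemmas = [[word.lemma for word in sentence.words] for sentence in doc.sentences]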