Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
texta
texta-mlp-python
Commits
ab7f52da
Commit
ab7f52da
authored
Aug 30, 2021
by
Marko Kollo
😄
Browse files
Rewrote Document class to include whole stanza.Document instance.
parent
fc672343
Pipeline
#6244
passed with stage
in 15 minutes and 9 seconds
Changes
3
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
tests/test_creating_bounded_facts.py
View file @
ab7f52da
...
...
@@ -101,8 +101,6 @@ def test_remove_duplicate_facts_by_span_in_doc(expected_non_duplicate_facts, tes
original_text
=
""
,
dominant_language_code
=
"en"
,
analysis_lang
=
"en"
,
stanza_sentences
=
[],
stanza_entities
=
[],
entity_mapper
=
None
,
doc_path
=
"text"
,
json_doc
=
None
,
...
...
@@ -135,8 +133,6 @@ def test_bound_close_ones(expected_close_BOUNDS, test_input):
original_text
=
""
,
dominant_language_code
=
"en"
,
analysis_lang
=
"en"
,
stanza_sentences
=
[],
stanza_entities
=
[],
entity_mapper
=
None
,
doc_path
=
"text"
,
json_doc
=
None
,
...
...
@@ -168,8 +164,6 @@ def test_remove_overlaping_in_bounded(expected_bounds_no_overlap, test_input):
original_text
=
""
,
dominant_language_code
=
"en"
,
analysis_lang
=
"en"
,
stanza_sentences
=
[],
stanza_entities
=
[],
entity_mapper
=
None
,
doc_path
=
"text"
,
json_doc
=
None
,
...
...
@@ -195,8 +189,6 @@ def test_concatenate_subset_bounds(expected_bounds_no_subsets, test_input):
original_text
=
""
,
dominant_language_code
=
"en"
,
analysis_lang
=
"en"
,
stanza_sentences
=
[],
stanza_entities
=
[],
entity_mapper
=
None
,
doc_path
=
"text"
,
json_doc
=
None
,
...
...
@@ -221,8 +213,6 @@ def test_concatenate_subset_bounds(key_value_single_pairs, test_input):
original_text
=
""
,
dominant_language_code
=
"en"
,
analysis_lang
=
"en"
,
stanza_sentences
=
[],
stanza_entities
=
[],
entity_mapper
=
None
,
doc_path
=
"text"
,
json_doc
=
None
,
...
...
@@ -243,8 +233,6 @@ def test_space_between_ok(mlp, ok_spaces):
original_text
=
""
,
dominant_language_code
=
"en"
,
analysis_lang
=
"en"
,
stanza_sentences
=
[],
stanza_entities
=
[],
entity_mapper
=
None
,
doc_path
=
"text"
,
json_doc
=
None
,
...
...
@@ -269,8 +257,6 @@ def test_space_between_not_ok(mlp, not_ok_spaces):
original_text
=
""
,
dominant_language_code
=
"en"
,
analysis_lang
=
"en"
,
stanza_sentences
=
[],
stanza_entities
=
[],
entity_mapper
=
None
,
doc_path
=
"text"
,
json_doc
=
None
,
...
...
@@ -293,8 +279,6 @@ def test_clean_similar_in_strval(similar_cleaned_str_val, test_input):
original_text
=
""
,
dominant_language_code
=
"en"
,
analysis_lang
=
"en"
,
stanza_sentences
=
[],
stanza_entities
=
[],
entity_mapper
=
None
,
doc_path
=
"text"
,
json_doc
=
None
,
...
...
texta_mlp/document.py
View file @
ab7f52da
...
...
@@ -3,6 +3,7 @@ import math
from
typing
import
List
,
Optional
import
regex
as
re
import
stanza
from
lang_trans.arabic
import
buckwalter
from
pelecanus
import
PelicanJson
...
...
@@ -44,9 +45,8 @@ class Document:
original_text
:
str
,
dominant_language_code
:
str
,
analysis_lang
:
str
,
stanza_sentences
:
[
list
],
stanza_entities
,
concat_resources
:
dict
,
stanza_document
:
stanza
.
Document
=
None
,
entity_mapper
:
Optional
[
EntityMapper
]
=
None
,
doc_path
:
str
=
"text"
,
json_doc
:
dict
=
None
,
...
...
@@ -63,12 +63,14 @@ class Document:
self
.
json_doc
=
json_doc
self
.
entity_mapper
=
entity_mapper
self
.
stanza_sentences
=
stanza_sentences
self
.
stanza_words
=
[
word
for
sentence
in
self
.
stanza_sentences
for
word
in
sentence
]
self
.
stanza_entities
=
stanza_entities
self
.
stanza_document
=
stanza_document
self
.
concat_resources
=
concat_resources
self
.
__stanza_sentences
=
[]
self
.
__stanza_words
=
[]
self
.
__stanza_entities
=
[]
self
.
__words
=
[]
self
.
__lemmas
=
[]
self
.
__pos_tags
=
[]
...
...
@@ -76,7 +78,34 @@ class Document:
self
.
__texta_facts
:
List
[
Fact
]
=
[]
self
.
__handle_existing_facts
()
self
.
words
()
if
self
.
stanza_document
:
self
.
words
()
@
property
def
stanza_sentences
(
self
):
if
not
self
.
__stanza_sentences
:
for
sentence
in
self
.
stanza_document
.
sentences
:
self
.
__stanza_sentences
.
append
(
sentence
)
return
self
.
__stanza_sentences
@
property
def
stanza_words
(
self
):
if
not
self
.
__stanza_words
:
for
sentence
in
self
.
__stanza_sentences
:
for
word
in
sentence
.
words
:
self
.
__stanza_words
.
append
(
word
)
return
self
.
__stanza_words
@
property
def
stanza_entities
(
self
):
if
not
self
.
__stanza_entities
:
for
entity
in
self
.
stanza_document
.
entities
:
self
.
__stanza_entities
.
append
(
entity
)
return
self
.
__stanza_entities
def
__get_doc_path
(
self
,
field
:
str
)
->
str
:
...
...
@@ -186,7 +215,7 @@ class Document:
def
lemmas
(
self
):
for
sent
in
self
.
stanza_sentences
:
self
.
__lemmas
.
append
([
word
.
lemma
.
replace
(
"_"
,
""
)
if
word
and
word
.
lemma
else
"X"
for
word
in
sent
])
self
.
__lemmas
.
append
([
word
.
lemma
.
replace
(
"_"
,
""
)
if
word
and
word
.
lemma
else
"X"
for
word
in
sent
.
words
])
def
get_lemma
(
self
)
->
str
:
...
...
@@ -201,8 +230,8 @@ class Document:
def
words
(
self
):
for
sent
in
self
.
stanza_sentences
:
self
.
__words
.
append
([
word
.
text
for
word
in
sent
])
for
sentence
in
self
.
stanza_sentences
:
self
.
__words
.
append
([
word
.
text
for
word
in
sentence
.
words
])
def
sentences
(
self
):
...
...
@@ -217,8 +246,11 @@ class Document:
def
pos_tags
(
self
):
self
.
__pos_tags
=
[
word
.
xpos
if
word
and
word
.
xpos
and
word
.
xpos
!=
"_"
else
"X"
if
word
.
xpos
==
"_"
else
"X"
for
word
in
self
.
stanza_words
]
for
word
in
self
.
stanza_words
:
if
word
and
word
.
xpos
and
word
.
xpos
!=
"_"
:
self
.
__pos_tags
.
append
(
word
.
xpos
)
else
:
self
.
__pos_tags
.
append
(
"X"
)
def
get_pos_tags
(
self
)
->
str
:
...
...
@@ -324,19 +356,19 @@ class Document:
if
self
.
dominant_language_code
in
Document
.
langs_to_transliterate
:
for
word
in
self
.
stanza_words
:
if
self
.
dominant_language_code
==
"ru"
:
translit_word
=
self
.
_transliterate_russian_word
(
word
)
translit_word
=
self
.
_transliterate_russian_word
(
word
.
text
)
elif
self
.
dominant_language_code
==
"ar"
:
translit_word
=
self
.
_transliterate_arabic_word
(
word
)
translit_word
=
self
.
_transliterate_arabic_word
(
word
.
text
)
self
.
__transliteration
.
append
(
translit_word
)
@
staticmethod
def
_transliterate_russian_word
(
word
):
translit_word
=
russian_transliterator
([
word
.
text
.
strip
()])
def
_transliterate_russian_word
(
word
:
str
):
translit_word
=
russian_transliterator
([
word
.
strip
()])
try
:
translit_word
=
translit_word
[
0
].
strip
()
except
IndexError
:
translit_word
=
word
.
text
.
strip
()
translit_word
=
word
.
strip
()
return
translit_word
...
...
texta_mlp/mlp.py
View file @
ab7f52da
...
...
@@ -206,17 +206,16 @@ class MLP:
'''
if
lang
not
in
self
.
supported_langs
:
analysis_lang
=
self
.
default_lang
sentences
,
entities
,
e
=
self
.
_get_stanza_tokens
(
analysis_lang
,
processed_text
)
if
processed_text
else
(
[],
[]
,
""
)
document
,
e
=
self
.
_get_stanza_document
(
analysis_lang
,
processed_text
)
if
processed_text
else
(
None
,
""
)
else
:
analysis_lang
=
lang
sentences
,
entities
,
e
=
self
.
_get_stanza_tokens
(
analysis_lang
,
processed_text
)
if
processed_text
else
(
[],
[]
,
""
)
document
,
e
=
self
.
_get_stanza_document
(
analysis_lang
,
processed_text
)
if
processed_text
else
(
None
,
""
)
document
=
Document
(
original_text
=
processed_text
,
dominant_language_code
=
lang
,
analysis_lang
=
analysis_lang
,
stanza_sentences
=
sentences
,
stanza_entities
=
entities
,
stanza_document
=
document
,
analyzers
=
analyzers
,
json_doc
=
json_object
,
doc_path
=
doc_paths
,
...
...
@@ -239,6 +238,7 @@ class MLP:
self
.
_entity_mapper
=
self
.
_load_entity_mapper
()
return
self
.
_entity_mapper
def
get_stanza_pipeline
(
self
,
lang
:
str
):
if
lang
not
in
self
.
_stanza_pipelines
:
try
:
...
...
@@ -263,29 +263,18 @@ class MLP:
return
self
.
_stanza_pipelines
[
lang
]
def
_get_stanza_tokens
(
self
,
lang
:
str
,
raw_text
:
str
):
sentences
=
[]
entities
=
[]
def
_get_stanza_document
(
self
,
lang
:
str
,
raw_text
:
str
):
e
=
""
try
:
pipeline
=
self
.
get_stanza_pipeline
(
lang
)(
raw_text
)
for
sentence
in
pipeline
.
sentences
:
words
=
[]
for
word
in
sentence
.
words
:
words
.
append
(
word
)
sentences
.
append
(
words
)
for
entity
in
sentence
.
entities
:
entities
.
append
(
entity
)
document
=
self
.
get_stanza_pipeline
(
lang
)(
raw_text
)
return
document
,
e
except
KeyError
as
e
:
raise
LanguageNotSupported
(
f
"Language
{
lang
}
not supported. Check the list of supported languages."
)
except
Exception
as
e
:
self
.
logger
.
exception
(
e
)
return
sentences
,
entities
,
repr
(
e
)
return
sentences
,
entities
,
e
return
None
,
repr
(
e
)
def
_get_stanza_ner
(
self
,
lang
:
str
,
raw_text
:
str
):
...
...
@@ -379,7 +368,7 @@ class MLP:
doc_texts
=
self
.
parse_doc_texts
(
doc_path
,
document
)
for
raw_text
in
doc_texts
:
analyzers
=
self
.
_load_analyzers
(
analyzers
,
SUPPORTED_ANALYZERS
)
doc
=
self
.
generate_document
(
raw_text
,
analyzers
,
document
,
doc_paths
=
doc_path
)
doc
=
self
.
generate_document
(
raw_text
,
analyzers
=
analyzers
,
json_object
=
document
,
doc_paths
=
doc_path
,
)
if
doc
:
for
analyzer
in
analyzers
:
# For every analyzer, activate the function that processes it from the
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment