texta / texta-mlp-python / Commits / 37ea7c67

Commit 37ea7c67, authored Mar 10, 2021 by Hele-Andra Kuulmets

Merge branch 'sentences' into 'master'

add sentence split

See merge request !7
Parents: 45a1a7a7, 18c432c2
Pipeline #4896 passed with stages in 21 minutes and 13 seconds
Changes: 5
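In short, this merge introduces a "sentences" analyzer: when it is active, both the tokenized text and the lemmas are emitted with one sentence per line. A minimal usage sketch follows; the MLP constructor arguments and the fact that "sentences" is on by default are inferred from the diffs below, not confirmed elsewhere.

    # Minimal sketch of the new "sentences" analyzer, inferred from the tests
    # in this commit. The MLP constructor arguments here are assumptions.
    from texta_mlp.mlp import MLP

    mlp = MLP(language_codes=["et"])

    # Default analyzers (which, per the tests below, include "sentences"):
    # sentences end up on separate lines.
    result = mlp.process("Siin on üks lause. See on teine lause.")
    print(result["text"]["text"])   # 'Siin on üks lause .\nSee on teine lause .'

    # Explicit analyzers without "sentences": everything stays on one line.
    result = mlp.process("Siin on üks lause. See on teine lause.",
                         analyzers=["lemmas", "pos_tags"])
    print(result["text"]["text"])   # 'Siin on üks lause . See on teine lause .'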
VERSION

-1.9.1
+1.10.0
tests/test_creating_bounded_facts.py

@@ -101,7 +101,7 @@ def test_remove_duplicate_facts_by_span_in_doc(expected_non_duplicate_facts, test_input):
         original_text="",
         dominant_language_code="en",
         analysis_lang="en",
-        stanza_words=[],
+        stanza_sentences=[],
         stanza_entities=[],
         entity_mapper=None,
         doc_path="text",
@@ -135,7 +135,7 @@ def test_bound_close_ones(expected_close_BOUNDS, test_input):
         original_text="",
         dominant_language_code="en",
         analysis_lang="en",
-        stanza_words=[],
+        stanza_sentences=[],
         stanza_entities=[],
         entity_mapper=None,
         doc_path="text",
@@ -168,7 +168,7 @@ def test_remove_overlaping_in_bounded(expected_bounds_no_overlap, test_input):
         original_text="",
         dominant_language_code="en",
         analysis_lang="en",
-        stanza_words=[],
+        stanza_sentences=[],
         stanza_entities=[],
         entity_mapper=None,
         doc_path="text",
@@ -195,7 +195,7 @@ def test_concatenate_subset_bounds(expected_bounds_no_subsets, test_input):
         original_text="",
         dominant_language_code="en",
         analysis_lang="en",
-        stanza_words=[],
+        stanza_sentences=[],
         stanza_entities=[],
         entity_mapper=None,
         doc_path="text",
@@ -221,7 +221,7 @@ def test_concatenate_subset_bounds(key_value_single_pairs, test_input):
         original_text="",
         dominant_language_code="en",
         analysis_lang="en",
-        stanza_words=[],
+        stanza_sentences=[],
         stanza_entities=[],
         entity_mapper=None,
         doc_path="text",
@@ -243,7 +243,7 @@ def test_space_between_ok(mlp, ok_spaces):
         original_text="",
         dominant_language_code="en",
         analysis_lang="en",
-        stanza_words=[],
+        stanza_sentences=[],
         stanza_entities=[],
         entity_mapper=None,
         doc_path="text",
@@ -269,7 +269,7 @@ def test_space_between_not_ok(mlp, not_ok_spaces):
         original_text="",
         dominant_language_code="en",
         analysis_lang="en",
-        stanza_words=[],
+        stanza_sentences=[],
         stanza_entities=[],
         entity_mapper=None,
         doc_path="text",
@@ -293,7 +293,7 @@ def test_clean_similar_in_strval(similar_cleaned_str_val, test_input):
         original_text="",
         dominant_language_code="en",
         analysis_lang="en",
-        stanza_words=[],
+        stanza_sentences=[],
         stanza_entities=[],
         entity_mapper=None,
         doc_path="text",
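All eight hunks above make the same fixture change: the Document under test is now built from stanza_sentences rather than a flat stanza_words list. A minimal sketch of the updated call shape; the concat_resources value is a hypothetical placeholder (the constructor hunk in texta_mlp/document.py below shows it is a required dict, but the tests' actual value is elided here).

    # Sketch of the updated Document construction used by these tests.
    from texta_mlp.document import Document

    doc = Document(
        original_text="",
        dominant_language_code="en",
        analysis_lang="en",
        stanza_sentences=[],   # was: stanza_words=[]
        stanza_entities=[],
        concat_resources={},   # hypothetical placeholder
        entity_mapper=None,
        doc_path="text",
    )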
tests/test_mlp.py

@@ -202,3 +202,20 @@ def test_that_ner_output_contains_correct_doc_path(mlp: MLP):
     facts = result.get("texta_facts")
     usa_fact = facts[0]
     assert usa_fact["doc_path"] == "comment.text_mlp.text"
+
+
+def test_sentences_separated_with_newline(mlp: MLP):
+    result = mlp.process("Siin on üks lause. See on teine lause.")
+    mlp_result = result["text"]
+    assert mlp_result["text"] == 'Siin on üks lause .\nSee on teine lause .'
+    assert mlp_result["lemmas"] == 'siin olema üks lause .\nsee olema teine lause .'
+
+
+def test_sentences_not_separated_with_newline(mlp: MLP):
+    result = mlp.process("Siin on üks lause. See on teine lause.", analyzers=["lemmas", "pos_tags"])
+    mlp_result = result["text"]
+    assert mlp_result["text"] == 'Siin on üks lause . See on teine lause .'
+    assert mlp_result["lemmas"] == 'siin olema üks lause . see olema teine lause .'
texta_mlp/document.py

@@ -44,7 +44,7 @@ class Document:
                  original_text: str,
                  dominant_language_code: str,
                  analysis_lang: str,
-                 stanza_words,
+                 stanza_sentences: [list],
                  stanza_entities,
                  concat_resources: dict,
                  entity_mapper: Optional[EntityMapper] = None,
@@ -61,7 +61,8 @@ class Document:
         self.json_doc = json_doc
         self.entity_mapper = entity_mapper
-        self.stanza_words = stanza_words
+        self.stanza_sentences = stanza_sentences
+        self.stanza_words = [word for sentence in self.stanza_sentences for word in sentence]
         self.stanza_entities = stanza_entities
         self.concat_resources = concat_resources
@@ -134,7 +135,7 @@ class Document:
     def to_json(self, use_default_doc_path=True) -> dict:
         container = dict()
-        container["text"] = self.get_words()
+        container["text"] = self.get_words(ssplit="sentences" in self.analyzers)
         texta_facts = self.facts_to_json()
         container["language"] = {"detected": self.dominant_language_code, "analysis": self.analysis_lang}
@@ -150,19 +151,34 @@ class Document:
     def lemmas(self):
-        self.__lemmas = [word.lemma.replace("_", "") if word and word.lemma else "X" for word in self.stanza_words]
+        self.__lemmas = []
+        for sent in self.stanza_sentences:
+            self.__lemmas.append([word.lemma.replace("_", "") if word and word.lemma else "X" for word in sent])

     def get_lemma(self) -> str:
-        return " ".join([a.strip() for a in self.__lemmas])
+        sentences = []
+        for sent_lemmas in self.__lemmas:
+            sentences.append(" ".join([a.strip() for a in sent_lemmas]))
+        if "sentences" in self.analyzers:
+            return "\n".join(sentences)
+        else:
+            return " ".join(sentences)

     def words(self):
-        self.__words = [word.text for word in self.stanza_words]
+        self.__words = []
+        for sent in self.stanza_sentences:
+            self.__words.append([word.text for word in sent])

+    def sentences(self):
+        pass

-    def get_words(self) -> str:
-        return " ".join(self.__words)
+    def get_words(self, ssplit=False) -> str:
+        if ssplit:
+            return "\n".join([" ".join(sent_words) for sent_words in self.__words])
+        else:
+            return " ".join([" ".join(sent_words) for sent_words in self.__words])

     def pos_tags(self):
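The substance of the change is visible here: Document now keeps its tokens as a list of sentences (each a list of stanza words), flattens them for the word-level views, and joins sentences with a newline only when the "sentences" analyzer was requested. A standalone sketch of that logic, with plain namespaces standing in for stanza word objects:

    # Self-contained sketch of the sentence-aware join logic added above.
    from types import SimpleNamespace

    stanza_sentences = [
        [SimpleNamespace(text="Siin", lemma="siin"), SimpleNamespace(text="on", lemma="olema")],
        [SimpleNamespace(text="See", lemma="see"), SimpleNamespace(text="on", lemma="olema")],
    ]

    # Flat word list, as built in __init__ after this commit.
    stanza_words = [word for sentence in stanza_sentences for word in sentence]
    assert [w.text for w in stanza_words] == ["Siin", "on", "See", "on"]

    # words() / get_words(ssplit=...) reduced to their essence:
    words = [[word.text for word in sent] for sent in stanza_sentences]

    def get_words(ssplit=False):
        sep = "\n" if ssplit else " "
        return sep.join(" ".join(sent_words) for sent_words in words)

    print(get_words(ssplit=True))    # "Siin on\nSee on"
    print(get_words(ssplit=False))   # "Siin on See on"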
texta_mlp/mlp.py

@@ -52,7 +52,8 @@ SUPPORTED_ANALYZERS = (
     "entities",
     "namemail",
     "bounded",
-    "currency_sum"
+    "currency_sum",
+    "sentences"
 )

 # Here we define languages with NER support to avoid Stanza trying to load them for languages without NER support.
@@ -196,16 +197,16 @@ class MLP:
         '''
         if lang not in self.supported_langs:
             analysis_lang = self.default_lang
-            words, entities = self._get_stanza_tokens(analysis_lang, processed_text) if processed_text else ([], [])
+            sentences, entities = self._get_stanza_tokens(analysis_lang, processed_text) if processed_text else ([], [])
         else:
             analysis_lang = lang
-            words, entities = self._get_stanza_tokens(analysis_lang, processed_text) if processed_text else ([], [])
+            sentences, entities = self._get_stanza_tokens(analysis_lang, processed_text) if processed_text else ([], [])
         document = Document(
             original_text=processed_text,
             dominant_language_code=lang,
             analysis_lang=analysis_lang,
-            stanza_words=words,
+            stanza_sentences=sentences,
             stanza_entities=entities,
             analyzers=analyzers,
             json_doc=json_object,
@@ -230,23 +231,23 @@ class MLP:
         if lang == "ru":
             raw_text = pat.sub("_", raw_text)
         # Separate all the words into a separate list.
         pipeline = self.stanza_pipelines[lang](raw_text)
-        words = []
+        sentences = []
         entities = []
         pip_pat = re.compile(r"(?<=\d)_(?=\d)")
         for sentence in pipeline.sentences:
+            words = []
             for word in sentence.words:
                 # Russian HACK (2/2)
                 # replaces back "#" to "-" between digits.
                 if lang == "ru":
                     word.text = pip_pat.sub("-", word.text)
                 words.append(word)
+            sentences.append(words)
             for entity in sentence.entities:
                 entities.append(entity)
-        return words, entities
+        return sentences, entities

     def _get_stanza_ner(self, lang: str, raw_text: str):
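_get_stanza_tokens now returns one list per stanza sentence instead of a flat word list, which is what lets Document recover sentence boundaries downstream. The back-substitution regex in the last hunk is also worth a close look: despite its comment, it turns an underscore back into a hyphen only when flanked by digits on both sides. A small sketch of both, with plain strings standing in for stanza word objects:

    import re

    # Back-substitution from the Russian hack (2/2): an underscore becomes a
    # hyphen again only between two digits, so identifiers are left untouched.
    pip_pat = re.compile(r"(?<=\d)_(?=\d)")
    assert pip_pat.sub("-", "10_000 stanza_words") == "10-000 stanza_words"

    # Return shape after this commit: a list of sentences, each a list of
    # word tokens.
    sentences = [["Siin", "on", "üks", "lause", "."],
                 ["See", "on", "teine", "lause", "."]]
    entities = []
    # ...and the method now ends with: return sentences, entities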