Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
texta
texta-mlp-python
Commits
a0cbbf58
Commit
a0cbbf58
authored
Sep 27, 2021
by
Raul Sirel
Browse files
fix span errors caused by sentence tokenization
parent
699e9e30
Pipeline
#6539
passed with stages
in 19 minutes and 6 seconds
Changes
4
Pipelines
2
Hide whitespace changes
Inline
Side-by-side
VERSION
View file @
a0cbbf58
1.11.
5
1.11.
6
tests/test_mlp.py
View file @
a0cbbf58
...
...
@@ -217,8 +217,9 @@ def test_that_ner_output_contains_correct_doc_path(mlp: MLP):
def test_sentences_separated_with_newline(mlp: MLP):
    """Sentence-split output should join sentences with newlines in both
    the tokenized text and the lemmas, and pos_tags must stay aligned
    (token-for-token) with the lemmas."""
    result = mlp.process("Siin on üks lause. See on teine lause.")
    mlp_result = result["text"]
    assert mlp_result["text"] == 'Siin on üks lause .\nSee on teine lause .'
    assert mlp_result["lemmas"] == 'siin olema üks lause .\nsee olema teine lause .'
    # One POS tag per lemma: a mismatch here indicates tokenization span errors.
    assert len(mlp_result["pos_tags"].split(" ")) == len(mlp_result["lemmas"].split(" "))
def
test_sentences_not_separated_with_newline
(
mlp
:
MLP
):
...
...
texta_mlp/document.py
View file @
a0cbbf58
...
...
@@ -162,7 +162,7 @@ class Document:
def
to_json
(
self
,
use_default_doc_path
=
True
)
->
dict
:
container
=
dict
()
container
[
"text"
]
=
self
.
get_words
(
ssplit
=
"sentences"
in
self
.
analyzers
)
container
[
"text"
]
=
self
.
get_words
()
texta_facts
=
self
.
facts_to_json
()
container
[
"language"
]
=
{
"detected"
:
self
.
dominant_language_code
,
...
...
@@ -175,7 +175,6 @@ class Document:
container
[
"lemmas"
]
=
self
.
get_lemma
()
if
"pos_tags"
in
self
.
analyzers
:
container
[
"pos_tags"
]
=
self
.
get_pos_tags
()
# if "sentiment" in self.analyzers: container["sentiment"] = self.get_sentiment()
if
"transliteration"
in
self
.
analyzers
and
self
.
__transliteration
:
container
[
"transliteration"
]
=
self
.
get_transliteration
()
if
use_default_doc_path
:
...
...
@@ -195,30 +194,38 @@ class Document:
sentences
.
append
(
" "
.
join
([
a
.
strip
()
for
a
in
sent_lemmas
]))
if
"sentences"
in
self
.
analyzers
:
return
"
\n
"
.
join
(
sentences
)
return
"
\n
"
.
join
(
sentences
)
else
:
return
" "
.
join
(
sentences
)
def sentences(self):
    """Placeholder kept for interface compatibility; does nothing."""
    pass


def words(self):
    """Collect token texts from the Stanza output into self.__words,
    one inner list per sentence (preserves sentence boundaries for
    later newline-joined rendering)."""
    for sent in self.stanza_sentences:
        self.__words.append([word.text for word in sent])
def get_words(self, ssplit=False) -> str:
    """Render the collected tokens as a single string.

    :param ssplit: when True, sentences are separated by newlines;
        otherwise everything is joined with single spaces.
    :return: space-joined tokens, with or without sentence breaks.
    """
    if ssplit:
        return "\n".join([" ".join(sent_words) for sent_words in self.__words])
    else:
        return " ".join([" ".join(sent_words) for sent_words in self.__words])
def pos_tags(self):
    """Populate self.__pos_tags from the Stanza output.

    Missing or placeholder ("_") xpos values are replaced with "X".
    When sentence splitting is active, an "LBR" marker is inserted
    between sentences (but not after the last one) so the tag sequence
    stays aligned with the newline-joined text/lemmas output.

    Note: the original expression ended with a redundant
    `"X" if word.xpos == "_" else "X"` (always "X", and raising
    AttributeError for a None word); simplified to a plain else "X".
    """
    if "sentences" in self.analyzers:
        for i, sent in enumerate(self.stanza_sentences):
            tags_in_sent = [
                word.xpos if word and word.xpos and word.xpos != "_" else "X"
                for word in sent
            ]
            for tag in tags_in_sent:
                self.__pos_tags.append(tag)
            # if not last item, mark the sentence boundary
            if i + 1 < len(self.stanza_sentences):
                self.__pos_tags.append("LBR")
    else:
        self.__pos_tags = [
            word.xpos if word and word.xpos and word.xpos != "_" else "X"
            for word in self.stanza_words
        ]
def
get_pos_tags
(
self
)
->
str
:
...
...
@@ -380,7 +387,9 @@ class Document:
tokenized_text
=
self
.
get_words
()
known_entities
=
Document
.
FACT_NAMES_NER
not_entities
=
self
.
concat_resources
[
"not_entities"
]
for
entity
in
self
.
stanza_entities
:
if
entity
.
text
.
lower
()
in
not_entities
:
continue
if
entity
.
type
in
known_entities
:
...
...
texta_mlp/mlp.py
View file @
a0cbbf58
...
...
@@ -275,6 +275,7 @@ class MLP:
for
word
in
sentence
.
words
:
words
.
append
(
word
)
sentences
.
append
(
words
)
for
entity
in
sentence
.
entities
:
entities
.
append
(
entity
)
...
...
@@ -288,11 +289,6 @@ class MLP:
return
sentences
,
entities
,
e
def
_get_stanza_ner
(
self
,
lang
:
str
,
raw_text
:
str
):
pipeline
=
self
.
get_stanza_pipeline
(
lang
)(
raw_text
)
return
[
entity
for
sentence
in
pipeline
.
sentences
for
entity
in
sentence
.
entities
]
@
staticmethod
def
_get_stanza_processors
(
lang
):
"""
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment