Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
texta
texta-mlp-python
Commits
9269fc2e
Commit
9269fc2e
authored
Mar 23, 2021
by
Marko Kollo
😄
Browse files
Fix issue with parsing non-string values in the process_docs function.
parent
889bc6a2
Pipeline
#5089
passed with stages
in 20 minutes and 12 seconds
Changes
3
Pipelines
2
Hide whitespace changes
Inline
Side-by-side
VERSION
View file @
9269fc2e
1.10.
1
1.10.
2
tests/test_mlp.py
View file @
9269fc2e
import
json
import
pytest
import
regex
as
re
...
...
@@ -218,4 +217,13 @@ def test_sentences_not_separated_with_newline(mlp: MLP):
assert
mlp_result
[
"lemmas"
]
==
'siin olema üks lause . see olema teine lause .'
def test_parsing_non_text_value_in_dictionary(mlp: MLP):
    """Run process_docs on a document whose targeted field holds an integer.

    The field named in ``doc_paths`` contains a number instead of text; the
    test passes when no item of the returned collection contains ``"mlp"``.
    """
    field_name = "non_text_field"
    documents = [{field_name: 1324331241}]
    processed = mlp.process_docs(documents, doc_paths=[field_name])
    # Every returned item must be free of an "mlp" entry.
    assert all("mlp" not in item for item in processed)
def test_parsing_empty_list_in_dictionary(mlp: MLP):
    """Run process_docs on a document whose targeted field is an empty list.

    The field named in ``doc_paths`` holds ``[]``; the test passes when no
    item of the returned collection contains ``"mlp"``.
    """
    field_name = "empty_list_field"
    documents = [{field_name: []}]
    processed = mlp.process_docs(documents, doc_paths=[field_name])
    # Every returned item must be free of an "mlp" entry.
    assert all("mlp" not in item for item in processed)
texta_mlp/mlp.py
View file @
9269fc2e
import
logging
import
os
import
pathlib
import
shutil
from
typing
import
List
from
urllib.parse
import
urlparse
from
urllib.request
import
urlopen
import
regex
as
re
import
shutil
import
stanza
from
bs4
import
BeautifulSoup
from
langdetect
import
detect
from
pelecanus
import
PelicanJson
from
typing
import
List
from
urllib.parse
import
urlparse
from
urllib.request
import
urlopen
from
texta_mlp.document
import
Document
from
texta_mlp.entity_mapper
import
EntityMapper
...
...
@@ -323,10 +323,14 @@ class MLP:
"""
wrapper
=
PelicanJson
(
document
)
doc_path_as_list
=
doc_path
.
split
(
"."
)
doc_texts
=
wrapper
.
safe_get_nested_value
(
doc_path_as_list
,
default
=
[])
doc_texts
=
[]
if
doc_texts
is
None
else
doc_texts
doc_texts
=
[
doc_texts
]
if
isinstance
(
doc_texts
,
str
)
else
doc_texts
return
doc_texts
content
=
wrapper
.
safe_get_nested_value
(
doc_path_as_list
,
default
=
[])
if
content
and
isinstance
(
content
,
str
):
return
[
content
]
# Check that content is a non-empty list and that it contains only strings.
elif
content
and
isinstance
(
content
,
list
)
and
all
([
isinstance
(
list_content
,
str
)
for
list_content
in
content
]):
return
content
else
:
return
[]
def
process_docs
(
self
,
docs
:
List
[
dict
],
doc_paths
:
List
[
str
],
analyzers
=
[
"all"
]):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment