Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
texta
texta-mlp-python
Commits
c728e137
Commit
c728e137
authored
Mar 31, 2021
by
Marko Kollo
😄
Browse files
Make the function responsible for parsing dictionaries public.
parent
9269fc2e
Pipeline
#5159
passed with stages
in 20 minutes and 38 seconds
Changes
2
Pipelines
2
Hide whitespace changes
Inline
Side-by-side
VERSION
View file @
c728e137
1.10.
2
1.10.
3
texta_mlp/mlp.py
View file @
c728e137
...
...
@@ -7,7 +7,7 @@ import stanza
from
bs4
import
BeautifulSoup
from
langdetect
import
detect
from
pelecanus
import
PelicanJson
from
typing
import
List
from
typing
import
List
,
Optional
from
urllib.parse
import
urlparse
from
urllib.request
import
urlopen
...
...
@@ -171,7 +171,7 @@ class MLP:
return
text
def
detect_language
(
self
,
text
:
str
):
def
detect_language
(
self
,
text
:
str
)
->
Optional
[
str
]
:
"""
Detects language of input text.
If the language is not in the supported list, a default language is used or an exception is raised.
...
...
@@ -314,7 +314,7 @@ class MLP:
return
document
[
"text"
][
"lemmas"
]
def
__
parse_doc_texts
(
self
,
doc_path
:
str
,
document
:
dict
)
->
list
:
def
parse_doc_texts
(
self
,
doc_path
:
str
,
document
:
dict
)
->
list
:
"""
Function for parsing text values from a nested dictionary given a field path.
:param doc_path: Dot separated path of fields to the value we wish to parse.
...
...
@@ -329,6 +329,9 @@ class MLP:
# Check that content is a non-empty list and that it contains only strings.
elif
content
and
isinstance
(
content
,
list
)
and
all
([
isinstance
(
list_content
,
str
)
for
list_content
in
content
]):
return
content
# In case the field path is faulty and it gives you a dictionary instead.
elif
isinstance
(
content
,
dict
):
return
[]
else
:
return
[]
...
...
@@ -346,7 +349,7 @@ class MLP:
for
doc_path
in
doc_paths
:
# Traverse the (possibly) nested dicts and extract their text values as a list of strings.
# Since the nested doc_path could lead to a list, there may be multiple pieces of text that need to be processed.
doc_texts
=
self
.
__
parse_doc_texts
(
doc_path
,
document
)
doc_texts
=
self
.
parse_doc_texts
(
doc_path
,
document
)
for
raw_text
in
doc_texts
:
analyzers
=
self
.
_load_analyzers
(
analyzers
,
SUPPORTED_ANALYZERS
)
doc
=
self
.
generate_document
(
raw_text
,
analyzers
,
document
,
doc_paths
=
doc_path
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment