Project: texta / texta-mlp-python
Commit a5343de7, authored Mar 01, 2021 by Raul Sirel

Merge branch 'doc_path_fixes' into 'master'

Doc path fixes

See merge request !6

Parents: 1236e4c4, 0b26d62d
Pipeline #4789 passed with stages in 28 minutes and 41 seconds
Changes: 9    Pipelines: 2
VERSION (view file @ a5343de7)
-1.8.1
+1.8.2
tests/test_address_parser.py (view file @ a5343de7)
...
@@ -15,12 +15,16 @@ import pytest
])
def test_address(mlp, expected_value, test_input):
    result = mlp.process(test_input)
-    entities = [fact['str_val'] for fact in result['texta_facts'] if fact['fact'] == 'ADDR']
+    facts = result['texta_facts']
+    entities = [fact['str_val'] for fact in facts if fact['fact'] == 'ADDR']
    if expected_value is None:
        assert not entities
    else:
        assert expected_value in entities
+    for fact in facts:
+        assert fact["doc_path"] == "text.text"


@pytest.mark.parametrize("expected_output, test_input", [
    ('vana-lõuna 39', 'TEXTA OÜ asub aadressil Vana-Lõuna 39.'),
...
@@ -28,23 +32,39 @@ def test_address(mlp, expected_value, test_input):
def test_addresses_by_list(mlp, expected_output, test_input):
    """ testing addresses from a list """
    result = mlp.process(test_input)
-    entities = [fact['str_val'] for fact in result['texta_facts'] if fact['fact'] == 'ADDR']
+    facts = result['texta_facts']
+    entities = [fact['str_val'] for fact in facts if fact['fact'] == 'ADDR']
    if expected_output is None:
        assert not entities
    else:
        assert expected_output in entities
+    for fact in facts:
+        assert fact["doc_path"] == "text.text"


@pytest.mark.parametrize("test_input", [
    'kana@kukeleegu.com',
    'Huntington London, 64',
    "Welcome to the Zoom Family\nWelcome to the Zoom Family!\nLet's Get Started!\nRead our\nGetting Started\nguide to learn about hosting meetings\nMeet\nwith a product specialist for a demo of key features\nQuestions? Submit a\nSupport Request\nat any time\nStay On Top\nAttend\nupcoming webinars to learn about new features, how-to's, add-ons, and\nmore\nKeep up with us on our\nBlog\n!\nCopyright ©2016 Zoom Video Communications, Inc. All rights reserved.\nOur mailing address is:\n55 Almaden Boulevard, 6th Floor, San Jose, CA 95113\n+1.888.799.9666\nIf you no longer wish to receive these emails you may unsubscribe at any time."
])
def test_no_address(mlp, test_input):
    """ testing for no potential addresses """
    result = mlp.process(test_input)
-    addrs = [fact['str_val'] for fact in result['texta_facts'] if fact['fact'] == 'ADDR']
+    facts = result['texta_facts']
+    addrs = [fact['str_val'] for fact in facts if fact['fact'] == 'ADDR']
    assert len(addrs) == 0
+    for fact in facts:
+        assert fact["doc_path"] == "text.text"


+def test_doc_path_parsing_inside_doc(mlp):
+    payload = {"docs": [{"text": {"subsection": "Huntington London, 64"}}], "doc_paths": ["text.subsection"]}
+    result = mlp.process_docs(**payload)[0]
+    facts = result["texta_facts"]
+    assert len(facts) > 0
+    for fact in facts:
+        assert fact["doc_path"] == "text.subsection_mlp.text"
tests/test_creating_bounded_facts.py (view file @ a5343de7)
...
@@ -85,10 +85,11 @@ def test_created_bounds():
    assert bound1 in test_text1_bounds
    for fact in result["texta_facts"]:
-        assert isinstance(fact["doc_path"], str) == True
-        assert isinstance(fact["str_val"], str) == True
-        assert isinstance(fact["spans"], str) == True
-        assert isinstance(fact["fact"], str) == True
+        assert isinstance(fact["doc_path"], str) is True
+        assert fact["doc_path"] == "text.text"
+        assert isinstance(fact["str_val"], str) is True
+        assert isinstance(fact["spans"], str) is True
+        assert isinstance(fact["fact"], str) is True


@pytest.mark.parametrize("expected_non_duplicate_facts, test_input", [
...
@@ -251,7 +252,7 @@ def test_space_between_ok(mlp, ok_spaces):
                   concat_resources=mlp_wrapper.concat_resources)
    result = doc.space_between_ok(ok_spaces[0], ok_spaces[1], ok_spaces[2])
-    assert result == True
+    assert result is True


@pytest.mark.parametrize("not_ok_spaces", [
...
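The assertions above also switch from == True to is True. Both forms pass for the boolean returned by isinstance(), but identity against the True singleton is the stricter comparison; a minimal illustration of the difference:

flag = isinstance("nõmme tänav 24", str)
assert flag == True   # passes, but compares by value (1 == True would also pass)
assert flag is True   # passes and additionally checks identity with the True singleton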
tests/test_currency_parser.py (view file @ a5343de7)
...
@@ -11,7 +11,22 @@ import pytest
])
def test_currency_parse(mlp, expected_value, test_input):
    result = mlp.process(test_input)
-    detected_facts = [fact['str_val'] for fact in result['texta_facts'] if fact['fact'] == 'CURRENCY_SUM']
+    facts = result['texta_facts']
+    detected_facts = [fact['str_val'] for fact in facts if fact['fact'] == 'CURRENCY_SUM']
    assert expected_value in detected_facts
    for detected_fact in detected_facts:
        assert detected_fact == expected_value
+    for fact in facts:
+        assert fact["doc_path"] == "text.text"


+def test_doc_path_parsing_inside_doc(mlp):
+    payload = {"docs": [{"text": {"subsection": "Eesti keeles on see 2,5 eurot."}}], "doc_paths": ["text.subsection"]}
+    results = mlp.process_docs(**payload)
+    facts = results[0]["texta_facts"]
+    assert len(facts) > 1
+    for fact in facts:
+        assert fact["doc_path"] == "text.subsection_mlp.text" or fact["doc_path"] == "text.subsection_mlp.lemmas"
tests/test_email_parser.py (view file @ a5343de7)
...
@@ -14,10 +14,11 @@ def test_emails(mlp, expected_email, test_input):
    result = mlp.process(test_input)
    detected_emails = [fact['str_val'] for fact in result['texta_facts'] if fact['fact'] == 'EMAIL']
    for fact in result["texta_facts"]:
-        assert isinstance(fact["doc_path"], str) == True
-        assert isinstance(fact["str_val"], str) == True
-        assert isinstance(fact["spans"], str) == True
-        assert isinstance(fact["fact"], str) == True
+        assert isinstance(fact["doc_path"], str) is True
+        assert fact["doc_path"] == "text.text"
+        assert isinstance(fact["str_val"], str) is True
+        assert isinstance(fact["spans"], str) is True
+        assert isinstance(fact["fact"], str) is True
    if expected_email is None:
        assert not detected_emails
...
@@ -33,6 +34,20 @@ def test_emails(mlp, expected_email, test_input):
])
def test_no_email(mlp, test_input):
    result = mlp.process(test_input)
-    emails = [fact['str_val'] for fact in result['texta_facts'] if fact['fact'] == 'EMAIL']
+    facts = result['texta_facts']
+    emails = [fact['str_val'] for fact in facts if fact['fact'] == 'EMAIL']
    assert len(emails) == 0
+    for fact in facts:
+        assert fact["doc_path"] == "text.text"


+def test_doc_path_parsing_inside_doc(mlp):
+    payload = {"docs": [{"text": {"subsection": "why not Dörte@Sörensen.example.com"}}], "doc_paths": ["text.subsection"]}
+    result = mlp.process_docs(**payload)[0]
+    facts = result["texta_facts"]
+    assert len(facts) > 0
+    for fact in facts:
+        assert fact["doc_path"] == "text.subsection_mlp.text"
tests/test_mlp.py (view file @ a5343de7)
...
@@ -35,13 +35,13 @@ def test_mlp_process(mlp: MLP):
    assert "text" in result
    mlp_text = result["text"]
    assert "lemmas" in mlp_text
-    assert isinstance(mlp_text["lemmas"], str) == True
+    assert isinstance(mlp_text["lemmas"], str) is True
    assert "lang" in mlp_text
-    assert isinstance(mlp_text["lang"], dict) == True
+    assert isinstance(mlp_text["lang"], dict) is True
    assert "texta_facts" in result
-    assert isinstance(result["texta_facts"], list) == True
+    assert isinstance(result["texta_facts"], list) is True
    if mlp_text["lang"]["analysis_lang"] in ("ru", "ar"):
        assert "transliteration" in mlp_text
...
@@ -52,23 +52,29 @@ def test_mlp_process(mlp: MLP):
    # test fact spans
    text = mlp_text["text"]
    for fact in result["texta_facts"]:
-        assert isinstance(fact["doc_path"], str) == True
-        assert isinstance(fact["str_val"], str) == True
-        assert isinstance(fact["spans"], str) == True
-        assert isinstance(fact["fact"], str) == True
-        if fact["fact"] != "BOUNDED":
-            span_list = json.loads(fact["spans"])
+        fact_path = fact["doc_path"]
+        fact_value = fact["str_val"]
+        fact_spans = fact["spans"]
+        fact_name = fact["fact"]
+        assert isinstance(fact_path, str) is True
+        assert isinstance(fact_value, str) is True
+        assert isinstance(fact_spans, str) is True
+        assert isinstance(fact_name, str) is True
+        assert fact_path == "text.text" or fact_path == "text.lemmas"
+        if fact_name != "BOUNDED":
+            span_list = json.loads(fact_spans)
            for span in span_list:
                spanned_text = text[span[0]:span[1]].lower()
                # if phones, do replaces
-                if fact["fact"] == "PHONE_strict":
+                if fact_name == "PHONE_strict":
                    patt = re.compile(r'\([^)]+\)')
                    spanned_text = re.sub(patt, '', spanned_text).replace(' ', '').replace('-', '').replace('+', '')
                # if emails, replace ' ' -> ''
-                if fact["fact"] == "EMAIL":
+                if fact_name == "EMAIL":
                    spanned_text = spanned_text.replace(' ', '')
-                str_val = fact["str_val"].lower()
+                str_val = fact_value.lower()
                # Skip checks for phones with area codes.
                # TODO Rewrite this test taking the problem with parenthesis into consideration.
...
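For reference, the normalisation applied to PHONE_strict spans in the hunk above can be traced with one of the parametrized inputs from tests/test_phone_parsers.py ('+7 (903) 474-47-20', whose parsed value is '7(903)4744720'); this sketch only restates the logic already shown in the diff:

import re

spanned_text = "+7 (903) 474-47-20".lower()
patt = re.compile(r'\([^)]+\)')
cleaned = re.sub(patt, '', spanned_text).replace(' ', '').replace('-', '').replace('+', '')
# cleaned == "74744720": the parenthesised area code is stripped, so it no longer equals
# the parser's str_val '7(903)4744720', hence the comment about skipping phones with
# area codes and the TODO about handling parentheses.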
@@ -82,7 +88,11 @@ def test_mlp_process(mlp: MLP):
def test_companies_by_list(mlp: MLP, expected_output, test_input):
    """ testing companies from a list """
    result = mlp.process(test_input)
-    entities = [fact['str_val'] for fact in result['texta_facts'] if fact['fact'] == 'COMPANY']
+    entities = []
+    for fact in result["texta_facts"]:
+        assert fact["doc_path"] == "text.text"
+        if fact["fact"] == "COMPANY":
+            entities.append(fact["str_val"])
    if expected_output is None:
        assert not entities
    else:
...
@@ -123,6 +133,7 @@ def test_existing_facts_not_being_overwritten(mlp: MLP):
    assert len(facts) == 2  # One BOUNDED added
    for fact in facts:
        assert fact["str_val"] in ("Edgar Savisaar", "nõmme tänav 24")
+        assert fact["doc_path"] == "texts_mlp.text"


def test_removal_of_duplicate_facts(mlp: MLP):
...
@@ -136,6 +147,7 @@ def test_removal_of_duplicate_facts(mlp: MLP):
    assert len(facts) == 1
    assert fact["str_val"] == "nõmme tänav 24"
+    assert fact["doc_path"] == "texts_mlp.text"


def test_processing_docs_with_missing_docpath(mlp: MLP):
...
tests/test_namemail_parser.py (view file @ a5343de7)
...
@@ -4,18 +4,21 @@ import pytest
@pytest.mark.parametrize("expected_namemail, test_input", [
    ('Керсти Кальюлайд kaljulaidkersti@yandex.ru', 'Отправлено с iPhone06.08.2015, в 20:05, Керсти Кальюлайд kaljulaidkersti@yandex.ru написал(а): '),
    ('Керсти Кальюлайд < kersti1298@mail.ru', 'Воскресенье, 17 января 2016, 1:10 +03:00 от Керсти Кальюлайд < kersti1298@mail.ru >:'),
    # ('Павлович Данилов <danilov64@bk.ru>', 'Кому: Павел Павлович Данилов <danilov64@bk.ru>\nДата: Среда, 10 декабря 2014, 0:31 +03:00'),
    ('Tuule Tormi < tormituule@gmail.com', 'От кого: Tuule Tormi < tormituule@gmail.com >'),
    ('Ove Üllar < 123ove@gmail.com', 'Ove Üllar < 123ove@gmail.com >'),
    ('Антон Казарезов antonkazarezov@mail.ru', 'С уважением,\nАнтон Казарезов\nantonkazarezov@mail.ru\n37259087634')
])
def test_single_namemail(mlp, expected_namemail, test_input):
    """ test for single namemails """
    print(test_input)
    result = mlp.process(test_input, analyzers=("lemmas", "pos_tags", "transliteration", "ner", "contacts", "entities", "namemail"))
    print(result)
-    detected_phone = [fact['str_val'] for fact in result['texta_facts'] if fact['fact'] == 'NAMEMAIL']
+    facts = result['texta_facts']
+    detected_phone = [fact['str_val'] for fact in facts if fact['fact'] == 'NAMEMAIL']
    assert detected_phone == [expected_namemail]
+    for fact in facts:
+        assert fact["doc_path"] == "text.text"


@pytest.mark.parametrize("expected_namemails, test_input", [
...
@@ -24,8 +27,11 @@ def test_single_namemail(mlp, expected_namemail, test_input):
def test_multiple_namemails(mlp, expected_namemails, test_input):
    """ test for multiple potential namemails """
    result = mlp.process(test_input)
-    detected_phones = [fact['str_val'] for fact in result['texta_facts'] if fact['fact'] == 'NAMEMAIL']
+    facts = result['texta_facts']
+    detected_phones = [fact['str_val'] for fact in facts if fact['fact'] == 'NAMEMAIL']
    assert sorted(detected_phones) == sorted(expected_namemails)
+    for fact in facts:
+        assert fact["doc_path"] == "text.text"


@pytest.mark.parametrize("test_input", [
...
@@ -34,9 +40,23 @@ def test_multiple_namemails(mlp, expected_namemails, test_input):
    'This email was sent to amanda@mail.me',
    'С уважением, Михайл.\n\n\nFrom: 123456789@mail.ru'
])
def test_no_namemails(mlp, test_input):
    """ test for no potential namemails"""
    result = mlp.process(test_input)
-    phones = [fact['str_val'] for fact in result['texta_facts'] if fact['fact'] == 'NAMEMAIL']
+    facts = result['texta_facts']
+    phones = [fact['str_val'] for fact in facts if fact['fact'] == 'NAMEMAIL']
    assert len(phones) == 0
+    for fact in facts:
+        assert fact["doc_path"] == "text.text"


+def test_doc_path_parsing_inside_doc(mlp):
+    payload = {"docs": [{"text": {"subsection": "To make sure you can receive our emails, please add noreply@actuallyreply.com to your [trusted contacts]"}}], "doc_paths": ["text.subsection"]}
+    result = mlp.process_docs(**payload)[0]
+    facts = result["texta_facts"]
+    assert len(facts) > 0
+    for fact in facts:
+        assert fact["doc_path"] == "text.subsection_mlp.text"
tests/test_phone_parsers.py (view file @ a5343de7)
import pytest


@pytest.mark.parametrize("expected_phone, test_input", [
    ('74956456601', 'tema number pole +7 495-645-6601'),
    ('7(903)4744720', '+7 (903) 474-47-20'),
...
@@ -16,8 +17,11 @@ import pytest
def test_single_phone(mlp, expected_phone, test_input):
    """ test for single phone numbers """
    result = mlp.process(test_input)
-    detected_phone = [fact['str_val'] for fact in result['texta_facts'] if fact['fact'] == 'PHONE_strict']
+    facts = result["texta_facts"]
+    detected_phone = [fact['str_val'] for fact in facts if fact['fact'] == 'PHONE_strict']
    assert detected_phone == [expected_phone]
+    for fact in facts:
+        assert fact["doc_path"] == "text.text"


@pytest.mark.parametrize("expected_phones, test_input", [
...
@@ -28,8 +32,11 @@ def test_single_phone(mlp, expected_phone, test_input):
def test_multiple_phones(mlp, expected_phones, test_input):
    """ test for multiple potential phone numbers """
    result = mlp.process(test_input)
-    detected_phones = [fact['str_val'] for fact in result['texta_facts'] if fact['fact'] == 'PHONE_strict']
+    facts = result['texta_facts']
+    detected_phones = [fact['str_val'] for fact in facts if fact['fact'] == 'PHONE_strict']
    assert sorted(detected_phones) == sorted(expected_phones)
+    for fact in facts:
+        assert fact["doc_path"] == "text.text"


@pytest.mark.parametrize("test_input", [
...
@@ -53,5 +60,20 @@ def test_no_phones(mlp, test_input):
    """ test for no potential phones"""
    result = mlp.process(test_input)
    print(result['texta_facts'])
-    phones = [fact['str_val'] for fact in result['texta_facts'] if fact['fact'] == 'PHONE_strict']
+    facts = result['texta_facts']
+    phones = [fact['str_val'] for fact in facts if fact['fact'] == 'PHONE_strict']
    assert len(phones) == 0
+    for fact in facts:
+        assert fact["doc_path"] == "text.text"


+def test_doc_path_parsing_inside_doc(mlp):
+    payload = {"docs": [{"text": {"subsection": "tema number pole +7 495-645-6601 (7085), vaid on hoopis +372 5012 3123"}}], "doc_paths": ["text.subsection"]}
+    result = mlp.process_docs(**payload)[0]
+    facts = result["texta_facts"]
+    assert len(facts) > 0
+    for fact in facts:
+        assert fact["doc_path"] == "text.subsection_mlp.text"
texta_mlp/document.py (view file @ a5343de7)
...
@@ -12,6 +12,7 @@ from .parsers import AddressParser, ContactEmailNamePairParser, ContactEmailPars
    ContactPhoneParserHighRecall, ContactPhoneParserStrict
from .russian_transliterator import Transliterate


russian_transliterator = Transliterate()
...
@@ -37,6 +38,7 @@ class Document:
    CLOSE_FACT_DISTANCE = 150


    def __init__(
            self,
            original_text: str,
...
@@ -73,6 +75,16 @@ class Document:
        self.__handle_existing_facts()
        self.words()

+    def __get_doc_path(self, field: str) -> str:
+        """
+        :param field: Whether the doc_path uses the text or lemmas field.
+        :return: MLP representation of the doc_path
+        """
+        content = f"{self.doc_path}_mlp.{field}"
+        return content

    def __handle_existing_facts(self):
        """
        Add existing texta_facts inside the document into the private
...
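The new private helper __get_doc_path centralises how fact paths are built from the document's doc_path. For the nested field exercised by the tests (doc_path == "text.subsection"), the values it produces are exactly what the new doc-path tests assert on; a trivial sketch of the resulting strings:

doc_path = "text.subsection"
assert f"{doc_path}_mlp.text" == "text.subsection_mlp.text"      # path used for text-based facts
assert f"{doc_path}_mlp.lemmas" == "text.subsection_mlp.lemmas"  # path used for lemma-based entity facts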
@@ -84,6 +96,7 @@ class Document:
        for fact in facts:
            self.add_fact(fact)


    @staticmethod
    def remove_duplicate_facts(facts: List[dict]):
        if facts:
...
@@ -93,14 +106,17 @@ class Document:
        else:
            return []


    def facts_to_json(self) -> dict:
        facts = [fact.to_json() for fact in self.__texta_facts]
        unique_facts = Document.remove_duplicate_facts(facts)
        return {"texta_facts": unique_facts}


    def add_fact(self, fact: Fact):
        self.__texta_facts.append(fact)


    def document_to_json(self, use_default_doc_path=True) -> dict:
        """
        :param use_default_doc_path: Normal string values will be given the default path for facts but for dictionary input you already have them.
...
@@ -115,6 +131,7 @@ class Document:
        return nested_dict_wrapper.convert()


    def to_json(self, use_default_doc_path=True) -> dict:
        container = dict()
        container["text"] = self.get_words()
...
@@ -131,25 +148,32 @@ class Document:
                fact["doc_path"] = "text.text"
        return {"text": container, **texta_facts}


    def lemmas(self):
        self.__lemmas = [word.lemma.replace("_", "") if word and word.lemma else "X" for word in self.stanza_words]


    def get_lemma(self) -> str:
        return " ".join([a.strip() for a in self.__lemmas])


    def words(self):
        self.__words = [word.text for word in self.stanza_words]


    def get_words(self) -> str:
        return " ".join(self.__words)


    def pos_tags(self):
        self.__pos_tags = [word.xpos if word and word.xpos and word.xpos != "_" else "X" if word.xpos == "_" else "X" for word in self.stanza_words]


    def get_pos_tags(self) -> str:
        return " ".join([a.strip() for a in self.__pos_tags])


    def entities(self):
        """
        Retrieves list-based entities.
...
@@ -166,7 +190,7 @@ class Document:
            new_fact = Fact(
                fact_type=entity_type,
                fact_value=entity_value["value"],
-                doc_path=f"{self.doc_path}_mlp.text",
+                doc_path=self.__get_doc_path("text"),
                spans=[[entity_value["span"][0], entity_value["span"][1]]]
            )
            self.__texta_facts.append(new_fact)
...
@@ -176,7 +200,7 @@ class Document:
            new_fact = Fact(
                fact_type=entity_type,
                fact_value=entity_value["value"],
-                doc_path=f"{self.doc_path}_mlp.lemmas",
+                doc_path=self.__get_doc_path("lemmas"),
                spans=[[entity_value["span"][0], entity_value["span"][1]]]
            )
            self.__texta_facts.append(new_fact)
...
@@ -184,6 +208,7 @@ class Document:
        # declare the entities processed
        self.entities_processed = True


    def currency_sum(self):
        """
        Extracts currency + sum and sum + currency patterns from text using regexp.
...
@@ -205,38 +230,44 @@ class Document:
            new_fact = Fact(
                fact_type="CURRENCY_SUM",
                fact_value=fact_value,
-                doc_path=self.doc_path,
+                doc_path=self.__get_doc_path("text"),
                spans=[match.start(), match.end()]
            )
            self.__texta_facts.append(new_fact)


    def emails(self):
        text = self.get_words()
        emails = ContactEmailParser(text).parse()
-        self.__texta_facts.extend((email.to_fact(Document.FACT_NAME_EMAIL, self.doc_path) for email in emails))
+        self.__texta_facts.extend((email.to_fact(Document.FACT_NAME_EMAIL, self.__get_doc_path("text")) for email in emails))


    def phone_strict(self):
        text = self.get_words()
        phone_numbers_strict = ContactPhoneParserStrict(text).parse()
-        self.__texta_facts.extend((number.to_fact(Document.FACT_NAME_PHONE_STRICT, self.doc_path) for number in phone_numbers_strict))
+        self.__texta_facts.extend((number.to_fact(Document.FACT_NAME_PHONE_STRICT, self.__get_doc_path("text")) for number in phone_numbers_strict))


    def phone_high_recall(self):
        text = self.get_words()
        phone_numbers = ContactPhoneParserHighRecall(text, months=self.concat_resources["months"]).parse()
-        self.__texta_facts.extend((number.to_fact(Document.FACT_NAME_PHONE_HIGH_RECALL, self.doc_path) for number in phone_numbers))
+        self.__texta_facts.extend((number.to_fact(Document.FACT_NAME_PHONE_HIGH_RECALL, self.__get_doc_path("text")) for number in phone_numbers))


    def phone_high_precision(self):
        text = self.get_words()
        phone_numbers_high_precision = ContactPhoneParserHighPrecision(text).parse()
-        self.__texta_facts.extend((number.to_fact(Document.FACT_NAME_PHONE_HIGH_PRECISION, self.doc_path) for number in phone_numbers_high_precision))
+        self.__texta_facts.extend((number.to_fact(Document.FACT_NAME_PHONE_HIGH_PRECISION, self.__get_doc_path("text")) for number in phone_numbers_high_precision))


    def addresses(self):
        text = self.get_words()
        addresses = AddressParser(text, self.stanza_entities, self.dominant_language_code).parse()
-        self.__texta_facts.extend((addr.to_fact(Document.FACT_NAME_ADDRESS, self.doc_path) for addr in addresses))
+        self.__texta_facts.extend((addr.to_fact(Document.FACT_NAME_ADDRESS, self.__get_doc_path("text")) for addr in addresses))


    def transliteration(self):
        if self.dominant_language_code in Document.langs_to_transliterate:
...
@@ -247,6 +278,7 @@ class Document:
                translit_word = self._transliterate_arabic_word(word)
            self.__transliteration.append(translit_word)


    @staticmethod
    def _transliterate_russian_word(word):
        translit_word = russian_transliterator([word.text.strip()])
...
@@ -256,6 +288,7 @@ class Document:
            translit_word = word.text.strip()
        return translit_word


    @staticmethod
    def _transliterate_arabic_word(word):
        translit_word = buckwalter.transliterate(word.text.strip())
...
@@ -263,9 +296,11 @@ class Document:
            translit_word = word.text.strip()
        return translit_word


    def get_transliteration(self) -> str:
        return " ".join(['X' if not a.strip() else a for a in self.__transliteration])


    def entity_lemmas(self, entity_value):
        lemmas = ""
        splitted = entity_value.split(" ")
...
@@ -289,6 +324,7 @@ class Document:
                    return word.lemma
        return lemmas


    def ner(self):
        tokenized_text = self.get_words()
        known_entities = Document.FACT_NAMES_NER
...
@@ -316,11 +352,12 @@ class Document:
                new_fact = Fact(
                    fact_type=entity.type,
                    fact_value=entity.text,
-                    doc_path=f"{self.doc_path}_mlp.text",
+                    doc_path=self.__get_doc_path("text"),
                    spans=[best_matching_span]
                )
                self.__texta_facts.append(new_fact)


    def namemail(self):
        """
        Find name-email pairs.
...
@@ -329,7 +366,8 @@ class Document:
        text = self.get_words()
        email_name_pairs = ContactEmailNamePairParser(text).parse()
        # bounded -> str "name mail"
-        self.__texta_facts.extend((emailpair.to_fact(Document.FACT_NAME_NAMEMAIL, self.doc_path) for emailpair in email_name_pairs))
+        self.__texta_facts.extend((emailpair.to_fact(Document.FACT_NAME_NAMEMAIL, self.__get_doc_path("text")) for emailpair in email_name_pairs))


    def remove_duplicate_facts_by_span(self, facts):
        """if there are emailpairs, then:
...
@@ -352,7 +390,7 @@ class Document:
                name_fact = Fact(
                    fact_type="PER",
                    fact_value=name_fact_value.strip("><\)\(:;-\.,\!\?"),
-                    doc_path=self.doc_path,
+                    doc_path=self.__get_doc_path("text"),
                    spans=[(fact.spans[0][0], fact.spans[0][0] + len(name_fact_value))]
                )
...
@@ -360,7 +398,7 @@ class Document:
                email_fact = Fact(
                    fact_type="EMAIL",
                    fact_value=email_fact_value.strip("><\)\(:;-\.,\!\?"),
-                    doc_path=self.doc_path,
+                    doc_path=self.__get_doc_path("text"),
                    spans=[(fact.spans[0][1] - len(email_fact_value), fact.spans[0][1])]
                )
...
@@ -378,6 +416,7 @@ class Document:
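Taken together, these changes mean that facts extracted from nested documents now point at the MLP-analysed field rather than the raw input field. A usage sketch mirroring the new tests (assuming an initialised MLP instance, as provided by the test fixture):

payload = {
    "docs": [{"text": {"subsection": "tema number pole +7 495-645-6601"}}],
    "doc_paths": ["text.subsection"]
}
result = mlp.process_docs(**payload)[0]
for fact in result["texta_facts"]:
    # Before this fix, contact, address and currency facts kept the raw doc_path (e.g. "text.subsection");
    # after it, every fact points at the analysed field, e.g. "text.subsection_mlp.text".
    assert fact["doc_path"].startswith("text.subsection_mlp.")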