Commit 9269fc2e authored by Marko Kollo's avatar Marko Kollo 😄
Browse files

Fix issue with parsing non-string values in the process_docs function.

parent 889bc6a2
Pipeline #5089 passed with stages
in 20 minutes and 12 seconds
import json
import pytest
import regex as re
......@@ -218,4 +217,13 @@ def test_sentences_not_separated_with_newline(mlp: MLP):
assert mlp_result["lemmas"] == 'siin olema üks lause . see olema teine lause .'
def test_parsing_non_text_value_in_dictionary(mlp: MLP):
    """Process a document whose requested field holds an integer.

    The test passes when no item of the returned result contains "mlp".
    """
    docs = [{"non_text_field": 1324331241}]
    result = mlp.process_docs(docs, doc_paths=["non_text_field"])
    # NOTE(review): the shape of `result` is not visible here; if its items
    # are dicts this is a membership test for a key named exactly "mlp" --
    # confirm that this matches the intended check.
    assert all("mlp" not in key for key in result)
def test_parsing_empty_list_in_dictionary(mlp: MLP):
    """Process a document whose requested field holds an empty list.

    The test passes when no item of the returned result contains "mlp".
    """
    docs = [{"empty_list_field": []}]
    result = mlp.process_docs(docs, doc_paths=["empty_list_field"])
    # NOTE(review): the shape of `result` is not visible here; if its items
    # are dicts this is a membership test for a key named exactly "mlp" --
    # confirm that this matches the intended check.
    assert all("mlp" not in key for key in result)
import logging
import os
import pathlib
import shutil
from typing import List
from urllib.parse import urlparse
from urllib.request import urlopen
import regex as re
import shutil
import stanza
from bs4 import BeautifulSoup
from langdetect import detect
from pelecanus import PelicanJson
from typing import List
from urllib.parse import urlparse
from urllib.request import urlopen
from texta_mlp.document import Document
from texta_mlp.entity_mapper import EntityMapper
......@@ -323,10 +323,14 @@ class MLP:
"""
wrapper = PelicanJson(document)
doc_path_as_list = doc_path.split(".")
doc_texts = wrapper.safe_get_nested_value(doc_path_as_list, default=[])
doc_texts = [] if doc_texts is None else doc_texts
doc_texts = [doc_texts] if isinstance(doc_texts, str) else doc_texts
return doc_texts
content = wrapper.safe_get_nested_value(doc_path_as_list, default=[])
if content and isinstance(content, str):
return [content]
# Check that content is a non-empty list and that it contains only strings.
elif content and isinstance(content, list) and all([isinstance(list_content, str) for list_content in content]):
return content
else:
return []
def process_docs(self, docs: List[dict], doc_paths: List[str], analyzers=["all"]):
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment