Commit c728e137 authored by Marko Kollo's avatar Marko Kollo 😄
Browse files

Make the function responsible for parsing dictionaries public.

parent 9269fc2e
Pipeline #5159 passed with stages
in 20 minutes and 38 seconds
......@@ -7,7 +7,7 @@ import stanza
from bs4 import BeautifulSoup
from langdetect import detect
from pelecanus import PelicanJson
from typing import List
from typing import List, Optional
from urllib.parse import urlparse
from urllib.request import urlopen
......@@ -171,7 +171,7 @@ class MLP:
return text
def detect_language(self, text: str):
def detect_language(self, text: str) -> Optional[str]:
Detects language of input text.
If language not in supported list, language is defaulted or exception raised.
......@@ -314,7 +314,7 @@ class MLP:
return document["text"]["lemmas"]
def __parse_doc_texts(self, doc_path: str, document: dict) -> list:
def parse_doc_texts(self, doc_path: str, document: dict) -> list:
Function for parsing text values from a nested dictionary given a field path.
:param doc_path: Dot separated path of fields to the value we wish to parse.
......@@ -329,6 +329,9 @@ class MLP:
# Check that content is a non-empty list and that there are only strings in the list.
elif content and isinstance(content, list) and all([isinstance(list_content, str) for list_content in content]):
return content
# In case the field path is faulty and it gives you a dictionary instead.
elif isinstance(content, dict):
return []
return []
......@@ -346,7 +349,7 @@ class MLP:
for doc_path in doc_paths:
# Traverse the (possible) nested dicts and extract their text values from it as a list of strings.
# Since the nested doc_path could lead to a list, there may be multiple pieces of text that need to be processed.
doc_texts = self.__parse_doc_texts(doc_path, document)
doc_texts = self.parse_doc_texts(doc_path, document)
for raw_text in doc_texts:
analyzers = self._load_analyzers(analyzers, SUPPORTED_ANALYZERS)
doc = self.generate_document(raw_text, analyzers, document, doc_paths=doc_path)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment