Commit 0e94c4eb authored by Marko Kollo's avatar Marko Kollo 😄
Browse files

Temporarily removed memory optimization because of issues with the MWT dependency.

parent 2aa62f7d
Pipeline #5455 passed with stages
in 21 minutes and 36 seconds
import logging
import os
import pathlib
import regex as re
import shutil
from typing import List, Optional
from urllib.parse import urlparse
from urllib.request import urlopen
import regex as re
import stanza
from bs4 import BeautifulSoup
from langdetect import detect
from pelecanus import PelicanJson
from typing import List, Optional
from urllib.parse import urlparse
from urllib.request import urlopen
from texta_mlp.document import Document
from texta_mlp.entity_mapper import EntityMapper
from texta_mlp.exceptions import LanguageNotSupported
from texta_mlp.utils import parse_bool_env
......@@ -269,7 +269,6 @@ class MLP:
stanza_resource_path = pathlib.Path(self.resource_dir) / "stanza"
stanza_pipelines[lang] = stanza.Pipeline(
lang=lang,
processors=self._get_stanza_processors(lang),
dir=str(stanza_resource_path),
use_gpu=self.use_gpu,
logging_level=logging_level
......@@ -277,18 +276,6 @@ class MLP:
return stanza_pipelines
@staticmethod
def _get_stanza_processors(lang):
    """
    Return the comma-separated Stanza processor list to load for a language.

    The NER processor is requested only for languages listed in
    STANZA_NER_SUPPORT (a module-level collection defined outside this
    method); every other language gets the pipeline without it.

    :param lang: Language code checked for membership in STANZA_NER_SUPPORT.
    :return: "tokenize,pos,lemma,ner" when the language is listed in
        STANZA_NER_SUPPORT, otherwise "tokenize,pos,lemma".
    """
    # NOTE: the stray debug print(lang) was removed so that building
    # pipelines no longer writes to stdout from library code.
    if lang in STANZA_NER_SUPPORT:
        return "tokenize,pos,lemma,ner"
    return "tokenize,pos,lemma"
def process(self, raw_text: str, analyzers: list = ["all"], lang=None):
"""
Processes raw text.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment