Commit 75c1de43 authored by Raul Sirel

Fix NER: emit facts for all stanza entity types, not only the known ones. Remove dead concatenator code.

parent 064703ea
Pipeline #6910 passed with stages in 23 minutes and 11 seconds
@@ -31,13 +31,6 @@ class Document:
     FACT_NAME_PHONE_HIGH_RECALL = "PHONE_high_recall"
     FACT_NAME_PHONE_HIGH_PRECISION = "PHONE_high_precision"
     FACT_NAME_PHONE_STRICT = "PHONE_strict"
-    FACT_NAMES_NER = ("PER", "ORG", "GPE", "LOC")
-    FACT_NAME_NAMEMAIL = "NAMEMAIL"
-    FACT_NAME_BOUNDED = "BOUNDED"
-    KNOWN_ENTITIES = (FACT_NAME_EMAIL, FACT_NAME_ADDRESS, FACT_NAME_PHONE_STRICT, FACT_NAME_NAMEMAIL) + FACT_NAMES_NER
-    CLOSE_FACT_DISTANCE = 150

     def __init__(
@@ -489,33 +482,30 @@ class Document:
     def ner(self):
         tokenized_text = self.get_words()
-        known_entities = Document.FACT_NAMES_NER
         for entity in self.stanza_entities:
-            if entity.type in known_entities:
-                # finds the closest spans in tokenized text
-                # this is because stanza returns spans from non-tokenized text
-                pattern = re.compile(re.escape(entity.text))  # Use re.escape to avoid trouble with special characters existing in the text.
-                matching_tokenized_spans = [(match.start(), match.end()) for match in pattern.finditer(tokenized_text)]
-                best_matching_span = None
-                best_matching_distance = math.inf
-                non_tokenized_span = (entity.start_char, entity.end_char)
-                # matching spans are always equal or larger
-                for span in matching_tokenized_spans:
-                    span_distance = (span[0] - non_tokenized_span[0]) + (span[1] - non_tokenized_span[1])
-                    if abs(span_distance) < best_matching_distance:
-                        best_matching_distance = abs(span_distance)
-                        best_matching_span = span
-                # create and append fact
-                # ignore facts whose match fails
-                if best_matching_span:
-                    text_before_match = tokenized_text[:best_matching_span[0]]
-                    sentence_index = text_before_match.count("\n")
-                    new_fact = Fact(
-                        fact_type=entity.type,
-                        fact_value=entity.text,
-                        doc_path=self.__get_doc_path("text"),
-                        spans=[best_matching_span],
-                        sent_index=sentence_index
-                    )
-                    self.__texta_facts.append(new_fact)
+            # finds the closest spans in tokenized text
+            # this is because stanza returns spans from non-tokenized text
+            pattern = re.compile(re.escape(entity.text))  # Use re.escape to avoid trouble with special characters existing in the text.
+            matching_tokenized_spans = [(match.start(), match.end()) for match in pattern.finditer(tokenized_text)]
+            best_matching_span = None
+            best_matching_distance = math.inf
+            non_tokenized_span = (entity.start_char, entity.end_char)
+            # matching spans are always equal or larger
+            for span in matching_tokenized_spans:
+                span_distance = (span[0] - non_tokenized_span[0]) + (span[1] - non_tokenized_span[1])
+                if abs(span_distance) < best_matching_distance:
+                    best_matching_distance = abs(span_distance)
+                    best_matching_span = span
+            # create and append fact
+            # ignore facts whose match fails
+            if best_matching_span:
+                text_before_match = tokenized_text[:best_matching_span[0]]
+                sentence_index = text_before_match.count("\n")
+                new_fact = Fact(
+                    fact_type=entity.type,
+                    fact_value=entity.text,
+                    doc_path=self.__get_doc_path("text"),
+                    spans=[best_matching_span],
+                    sent_index=sentence_index
+                )
+                self.__texta_facts.append(new_fact)
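
Editorial note on the span-matching logic kept in ner(): stanza reports character offsets against the original text, while the facts must point into the tokenized text returned by get_words(), so the method re-locates each entity string and keeps the occurrence whose endpoints drift least from the original offsets. Below is a minimal standalone sketch of that idea; the helper name find_best_span and the sample strings are illustrative, not part of the codebase, and it assumes get_words() separates sentences with newlines, as the count("\n") sentence index in the diff implies.

    import math
    import re

    def find_best_span(entity_text, start_char, end_char, tokenized_text):
        # All occurrences of the entity string in the tokenized text.
        pattern = re.compile(re.escape(entity_text))
        candidates = [(m.start(), m.end()) for m in pattern.finditer(tokenized_text)]
        best_span, best_distance = None, math.inf
        for span in candidates:
            # Total drift of both endpoints relative to the original stanza offsets.
            distance = abs((span[0] - start_char) + (span[1] - end_char))
            if distance < best_distance:
                best_distance, best_span = distance, span
        return best_span  # None when the entity text cannot be found at all

    # Tokenization pushed "Kersti Kaljulaid" one character to the right (30 -> 31).
    tokenized = "Barack Obama visited Tallinn .\nKersti Kaljulaid met him ."
    best = find_best_span("Kersti Kaljulaid", 30, 46, tokenized)
    print(best)                             # (31, 47)
    print(tokenized[:best[0]].count("\n"))  # 1, i.e. the entity sits in sentence 1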
@@ -25,7 +25,6 @@ from texta_mlp.settings import (
     SUPPORTED_ANALYZERS,
     DEFAULT_ANALYZERS,
     ENTITY_MAPPER_DATA_URLS,
-    CONCATENATOR_DATA_FILES,
     USE_GPU
 )
@@ -395,43 +394,3 @@ class MLP:
             docs[actual_index] = result
         return docs
-
-    @staticmethod
-    def download_concatenator_resources(resource_dir: str, logger):
-        concat_resource_dir = pathlib.Path(resource_dir) / "concatenator"
-        concat_resource_dir.mkdir(parents=True, exist_ok=True)
-        for url in CONCATENATOR_DATA_FILES:
-            file_name = urlparse(url).path.split("/")[-1]
-            file_path = concat_resource_dir / file_name
-            if not file_path.exists():
-                if logger: logger.info(f"Downloading concatenator file {file_name} into the directory: {url}")
-                response = urlopen(url)
-                content = response.read().decode()
-                with open(file_path, "w", encoding="utf8") as fh:
-                    fh.write(content)
-
-    def _load_not_entities(self):
-        not_entities = list()
-        with open(self.not_entities_path, "r", encoding="UTF-8") as file:
-            for line in file.readlines():
-                not_entities += [line.strip().lower()]
-        return not_entities
-
-    def _load_space_between_not_ok(self):
-        space_between_not_ok = list()
-        with open(self.space_between_not_ok_path, "r", encoding="UTF-8") as file:
-            for line in file.readlines():
-                space_between_not_ok += [line.strip()]
-        return re.compile("|".join(space_between_not_ok))
-
-    def _load_months(self):
-        months = list()
-        with open(self.months_path, "r", encoding="UTF-8") as file:
-            for line in file.readlines():
-                months += [line.strip()]
-        return months
@@ -11,13 +11,6 @@ ENTITY_MAPPER_DATA_URLS = (
     "https://packages.texta.ee/texta-resources/entity_mapper/currencies.json"
 )

-# URLs for Concatenator data sources.
-CONCATENATOR_DATA_FILES = (
-    "https://packages.texta.ee/texta-resources/concatenator/months.txt",
-    "https://packages.texta.ee/texta-resources/concatenator/not_entities.txt",
-    "https://packages.texta.ee/texta-resources/concatenator/space_between_not_ok.txt",
-)

 # URLs for Custom NER model downloads.
 CUSTOM_NER_MODELS = {
     "et": "https://packages.texta.ee/texta-resources/ner_models/_estonian_nertagger.pt",