import os
import spacy
from spacy.tokens import Doc
import nltk
import nltk.data
from tommy.model.stopwords_model import StopwordsModel
from tommy.model.synonyms_model import SynonymsModel
from tommy.support.application_settings import application_settings
from tommy.support.supported_languages import SupportedLanguage
from tommy.controller.language_controller import LanguageController
class PreprocessingController:
"""A class that can preprocess text using the Dutch SpaCy pipeline."""
_stopwords_model: StopwordsModel = None
_enable_pos: bool
_synonyms_model: SynonymsModel = None
    def __init__(self) -> None:
        """Initialize the controller and load the NLTK sentence tokenizers."""
        self._pos_categories = None
        self._entity_categories = None
        self._nlp = None
        self._enable_pos = False  # set properly when a pipeline is loaded
self._language_controller = None
# load punkt tokenizers for splitting sentences
self._dutch_sent_tokenizer = self._load_nltk_sent_tokenizer(
"dutch.pickle")
self._english_sent_tokenizer = self._load_nltk_sent_tokenizer(
"english.pickle")
@staticmethod
def _load_nltk_sent_tokenizer(*path_parts) -> nltk.PunktSentenceTokenizer:
"""
Load a sentence tokenizer from nltk from the preprocessing data folder
:param path_parts: Components of the path to the desired tokenizer,
e.g., "dutch.pickle"
"""
fpath = f"file:///{os.path.join(
application_settings.data_folder,
"preprocessing_data", "nltk_downloads",
"tokenizers_punkt", *path_parts)}"
try:
tokenizer = nltk.data.load(fpath)
except LookupError:
raise LookupError(f"Could not load nltk tokenizer at path {fpath}")
return tokenizer
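    # Note on the expected on-disk layout (inferred from the path built
    # above): the Punkt pickles should live under
    # <data_folder>/preprocessing_data/nltk_downloads/tokenizers_punkt/,
    # e.g. ".../tokenizers_punkt/dutch.pickle", and are loaded through an
    # nltk "file:///..." URL.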
    def load_pipeline(self, language: SupportedLanguage) -> None:
        """Load the SpaCy pipeline for the given language and configure
        which components are excluded and whether POS filtering is used."""
        nlp: spacy.Language
match language:
case SupportedLanguage.Dutch:
self._enable_pos = True
pipeline_path = os.path.join(
application_settings.data_folder,
"preprocessing_data", "pipeline_download",
"nl_core_news_sm-3.7.0")
nlp = spacy.load(pipeline_path,
exclude=["parser", "tagger",
"attribute_ruler"])
case SupportedLanguage.English:
self._enable_pos = False
pipeline_path = os.path.join(
application_settings.data_folder,
"preprocessing_data", "pipeline_download",
"en_core_web_sm-3.7.1")
                # the tagger supposedly takes over the role of the
                # morphologizer in the English pipeline, so only the parser
                # is excluded here
nlp = spacy.load(pipeline_path, exclude=["parser"])
case _:
raise ValueError("Unsupported preprocessing language")
self._nlp = nlp
self._nlp.add_pipe("merge_entities")
self._entity_categories = {"CARDINAL", "DATE", "LAW", "MONEY",
"ORDINAL", "PERCENT", "QUANTITY", "TIME"}
self._pos_categories = {"NOUN", "PROPN", "ADJ", "ADV", "VERB"}
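    # Note: the pipelines are loaded from a local download rather than an
    # installed spaCy package, e.g. (illustrative path):
    #   <data_folder>/preprocessing_data/pipeline_download/nl_core_news_sm-3.7.0
    # The built-in "merge_entities" component merges multi-token entities
    # into single tokens, so process_tokens can filter them by entity type.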
def set_model_refs(self, stopwords_model: StopwordsModel,
synonyms_model: SynonymsModel) -> None:
self._stopwords_model = stopwords_model
self._synonyms_model = synonyms_model
def set_controller_refs(self, language_controller: LanguageController):
"""Set the reference to the language controller"""
self._language_controller = language_controller
self._language_controller.change_language_event.subscribe(
self.load_pipeline)
self.load_pipeline(self._language_controller.get_language())
def process_text(self, text: str) -> list[str]:
"""Preprocesses the given text to a list of tokens."""
        doc = self._nlp(text)
        return self.process_tokens(doc)
def split_into_sentences(self, text: str) -> list[str]:
"""Split the given text to a list of sentences."""
match self._language_controller.get_language():
case SupportedLanguage.Dutch:
tokenizer = self._dutch_sent_tokenizer
case SupportedLanguage.English:
tokenizer = self._english_sent_tokenizer
case _:
raise ValueError("Current language is not supported by NLTK"
" sentence splitter.")
return tokenizer.tokenize(text)
def process_tokens(self, doc: Doc) -> list[str]:
"""
        Processes the tokens produced by the SpaCy pipeline.
        :param doc: The Doc produced by the SpaCy pipeline
        :return list[str]: The processed tokens
"""
        # Filtering steps that need token-level information: drop tokens in
        # the excluded entity categories, whitespace-only lemmas and, when
        # POS filtering is enabled, tokens outside the allowed POS tags.
lemmas = [token.lemma_ for token in doc if
token.ent_type_ not in self._entity_categories and
not str.isspace(token.lemma_) and (
not self._enable_pos or
token.pos_ in self._pos_categories)]
        # Lowercase the lemmas and drop those shorter than three characters.
lemmas = [lemma.lower() for lemma in lemmas if len(lemma) > 2]
# Apply synonyms and filter stopwords.
lemmas = self.apply_synonyms(lemmas)
lemmas = self.filter_stopwords(lemmas)
return lemmas
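    # Illustrative example (not verified output): with POS filtering enabled,
    # a sentence like "De 3 katten liepen snel weg" would drop the cardinal
    # "3", keep content-word lemmas such as "kat", "lopen" and "snel",
    # lowercase them, map them through the synonyms model and finally remove
    # any configured stopwords.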
def apply_synonyms(self, tokens: list[str]) -> list[str]:
"""
Applies synonyms to the given list of tokens.
:param tokens: The list of tokens
:return: The list of tokens where tokens are mapped to their synonyms
"""
        return [self._synonyms_model.get(token, token) for token in tokens]
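    # e.g. (illustrative): if the synonyms model maps "auto" -> "wagen", then
    # apply_synonyms(["auto", "fiets"]) returns ["wagen", "fiets"]; tokens
    # without an entry are passed through unchanged.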
def filter_stopwords(self, tokens: list[str]) -> list[str]:
"""
Removes all stopwords from the given list of tokens.
:param tokens: The list of tokens
:return: The list of tokens without stopwords
"""
return [token for token in tokens
if token not in self._stopwords_model]
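

# Minimal wiring sketch (illustrative only). It assumes StopwordsModel,
# SynonymsModel and LanguageController can be constructed without arguments,
# which may not match how the application normally wires these objects.
if __name__ == "__main__":
    _preprocessing = PreprocessingController()
    _preprocessing.set_model_refs(StopwordsModel(), SynonymsModel())
    _preprocessing.set_controller_refs(LanguageController())
    print(_preprocessing.split_into_sentences(
        "Dit is een zin. Dit is nog een zin."))
    print(_preprocessing.process_text("De kat zat rustig op de mat."))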
"""
This program has been developed by students from the bachelor Computer Science
at Utrecht University within the Software Project course.
© Copyright Utrecht University
(Department of Information and Computing Sciences)
"""