import os
import spacy
from spacy.tokens import Doc
import nltk
import nltk.data
from tommy.model.stopwords_model import StopwordsModel
from tommy.model.synonyms_model import SynonymsModel
from tommy.support.application_settings import application_settings
from tommy.support.supported_languages import SupportedLanguage
from tommy.controller.language_controller import LanguageController
[docs]
class PreprocessingController:
    """A class that can preprocess text using the Dutch SpaCy pipeline."""
    _stopwords_model: StopwordsModel = None
    _enable_pos: bool
    _synonyms_model: SynonymsModel = None
    def __init__(self) -> None:
        self._pos_categories = None
        self._entity_categories = None
        self._nlp = None
        self._enable_pos: bool
        self._language_controller = None
        # load punkt tokenizers for splitting sentences
        self._dutch_sent_tokenizer = self._load_nltk_sent_tokenizer(
            "dutch.pickle")
        self._english_sent_tokenizer = self._load_nltk_sent_tokenizer(
            "english.pickle")
    @staticmethod
    def _load_nltk_sent_tokenizer(*path_parts) -> nltk.PunktSentenceTokenizer:
        """
        Load a sentence tokenizer from nltk from the preprocessing data folder
        :param path_parts: Components of the path to the desired tokenizer,
            e.g., "dutch.pickle"
        """
        fpath = f"file:///{os.path.join(
                           application_settings.data_folder,
                           "preprocessing_data", "nltk_downloads", 
                           "tokenizers_punkt", *path_parts)}"
        try:
            tokenizer = nltk.data.load(fpath)
        except LookupError:
            raise LookupError(f"Could not load nltk tokenizer at path {fpath}")
        return tokenizer
[docs]
    def load_pipeline(self, language: SupportedLanguage) -> None:
        nlp: spacy.Language
        match language:
            case SupportedLanguage.Dutch:
                self._enable_pos = True
                pipeline_path = os.path.join(
                    application_settings.data_folder,
                    "preprocessing_data", "pipeline_download",
                    "nl_core_news_sm-3.7.0")
                nlp = spacy.load(pipeline_path,
                                 exclude=["parser", "tagger",
                                          "attribute_ruler"])
            case SupportedLanguage.English:
                self._enable_pos = False
                pipeline_path = os.path.join(
                    application_settings.data_folder,
                    "preprocessing_data", "pipeline_download",
                    "en_core_web_sm-3.7.1")
                # tagger is taking over the role of the morphologizer (
                # supposedly)
                nlp = spacy.load(pipeline_path, exclude=["parser"])
            case _:
                raise ValueError("Unsupported preprocessing language")
        self._nlp = nlp
        self._nlp.add_pipe("merge_entities")
        self._entity_categories = {"CARDINAL", "DATE", "LAW", "MONEY",
                                   "ORDINAL", "PERCENT", "QUANTITY", "TIME"}
        self._pos_categories = {"NOUN", "PROPN", "ADJ", "ADV", "VERB"} 
[docs]
    def set_model_refs(self, stopwords_model: StopwordsModel,
                       synonyms_model: SynonymsModel) -> None:
        self._stopwords_model = stopwords_model
        self._synonyms_model = synonyms_model 
[docs]
    def set_controller_refs(self, language_controller: LanguageController):
        """Set the reference to the language controller"""
        self._language_controller = language_controller
        self._language_controller.change_language_event.subscribe(
            self.load_pipeline)
        self.load_pipeline(self._language_controller.get_language()) 
[docs]
    def process_text(self, text: str) -> list[str]:
        """Preprocesses the given text to a list of tokens."""
        tokens = self._nlp(text)
        tokens = self.process_tokens(tokens)
        return tokens 
[docs]
    def split_into_sentences(self, text: str) -> list[str]:
        """Split the given text to a list of sentences."""
        match self._language_controller.get_language():
            case SupportedLanguage.Dutch:
                tokenizer = self._dutch_sent_tokenizer
            case SupportedLanguage.English:
                tokenizer = self._english_sent_tokenizer
            case _:
                raise ValueError("Current language is not supported by NLTK"
                                 " sentence splitter.")
        return tokenizer.tokenize(text) 
[docs]
    def process_tokens(self, doc: Doc) -> list[str]:
        """
        Processes the tokens given by the SpaCy pipeline.
        :param doc: The tokens given by processing of the Dutch SpaCy pipeline
        :return list[str]: The processed tokens
        """
        # All steps that require token-level information.
        lemmas = [token.lemma_ for token in doc if
                  token.ent_type_ not in self._entity_categories and
                  not str.isspace(token.lemma_) and (
                          not self._enable_pos or
                          token.pos_ in self._pos_categories)]
        # Take the lemmas.
        lemmas = [lemma.lower() for lemma in lemmas if len(lemma) > 2]
        # Apply synonyms and filter stopwords.
        lemmas = self.apply_synonyms(lemmas)
        lemmas = self.filter_stopwords(lemmas)
        return lemmas 
[docs]
    def apply_synonyms(self, tokens: list[str]) -> list[str]:
        """
        Applies synonyms to the given list of tokens.
        :param tokens: The list of tokens
        :return: The list of tokens where tokens are mapped to their synonyms
        """
        return (list(map(
            lambda token: self._synonyms_model.get(token, token), tokens))) 
[docs]
    def filter_stopwords(self, tokens: list[str]) -> list[str]:
        """
        Removes all stopwords from the given list of tokens.
        :param tokens: The list of tokens
        :return: The list of tokens without stopwords
        """
        return [token for token in tokens
                if token not in self._stopwords_model] 
 
"""
This program has been developed by students from the bachelor Computer Science
at Utrecht University within the Software Project course.
© Copyright Utrecht University
(Department of Information and Computing Sciences)
"""