import os
from collections.abc import Generator
from gensim.corpora import Dictionary
from tommy.controller.file_import.generic_file_importer import (
GenericFileImporter)
from tommy.controller.file_import.metadata import Metadata
from tommy.controller.file_import.processed_body import ProcessedBody
from tommy.controller.file_import.processed_corpus import ProcessedCorpus
from tommy.controller.file_import.processed_file import ProcessedFile
from tommy.controller.file_import.raw_body import RawBody
from tommy.controller.file_import.raw_file import RawFile
from tommy.controller.project_settings_controller import (
ProjectSettingsController)
from tommy.controller.preprocessing_controller import PreprocessingController
from tommy.support.event_handler import EventHandler
from tommy.model.corpus_model import CorpusModel
from tommy.view.error_view import ErrorView
class CorpusController:
    """
    The corpus controller class is responsible for handling interactions with
    the corpus model.

    It reads the raw files from the input folder configured in the project
    settings, has their text pre-processed by the preprocessing controller
    and stores the resulting processed corpus and the dictionary in the
    corpus model.
    """
    _corpus_model: CorpusModel = None
    _project_settings_controller: ProjectSettingsController = None
    _preprocessing_controller: PreprocessingController = None
    # Created once, when the class body is executed, and therefore shared by
    # every CorpusController instance.
    fileParsers: GenericFileImporter = GenericFileImporter()
    _metadata_changed_event: EventHandler[[Metadata]] = None
    corpus_version_id: int = -1

    @property
    def metadata_changed_event(self) -> EventHandler[[Metadata]]:
        """
        This event is meant to be triggered every time the metadata of the
        corpus is changed, so the UI can update itself to show the metadata

        :return: The event handler for changes of the corpus metadata
        """
        return self._metadata_changed_event

    def __init__(self) -> None:
        """Initialize corpus controller and eventhandler for metadata"""
        super().__init__()
        self._metadata_changed_event = EventHandler[[Metadata]]()

    def set_controller_refs(self,
                            project_settings_controller:
                            ProjectSettingsController,
                            preprocessing_controller:
                            PreprocessingController) -> None:
        """
        Sets the references to the project settings controller and the
        preprocessing controller, and subscribes to the event that the
        project settings controller exposes for changes of the input folder
        path

        :param project_settings_controller: the project settings controller
        :param preprocessing_controller: the preprocessing controller
        :return: None
        """
        self._project_settings_controller = project_settings_controller
        self._preprocessing_controller = preprocessing_controller
        # NOTE(review): on_input_folder_path_changed is not defined in the
        # visible part of this class -- confirm it exists, otherwise this
        # line raises an AttributeError.
        project_settings_controller.input_folder_path_changed_event.subscribe(
            self.on_input_folder_path_changed)

    def set_model_refs(self, corpus_model: CorpusModel) -> None:
        """
        Sets the reference to the corpus model

        :param corpus_model: The corpus model
        :return: None
        """
        self._corpus_model = corpus_model

    def change_config_model_refs(self, corpus_model: CorpusModel) -> None:
        """
        Replaces the reference to the corpus model and then extracts and
        stores the metadata for the current input folder

        :param corpus_model: The corpus model to use from now on
        :return: None
        """
        self._corpus_model = corpus_model
        input_folder = (
            self._project_settings_controller.get_input_folder_path())
        # NOTE(review): extract_and_store_metadata is not defined in the
        # visible part of this class -- confirm it exists.
        self.extract_and_store_metadata(input_folder)

    def _read_files(self, path: str,
                    show_error: bool) -> Generator[RawFile, None, None]:
        """
        Yields the contents of all compatible files in a given directory
        and all its subdirectories.

        Files whose name starts with a dot are skipped. A problem that occurs
        while importing a file is not raised; it is collected as a message
        and the walk continues with the next file.

        :param path: The string of the path to the directory. An empty
            string yields nothing
        :param show_error: Whether the collected problems should be shown in
            an ErrorView. This only happens once the generator has been
            consumed completely
        :return: A generator yielding File
            objects
        """
        if path == "":
            return

        errors = []
        for root, _dirs, files in os.walk(path):
            for file in files:
                if file.startswith('.'):
                    continue

                file_path = os.path.join(root, file)
                try:
                    yield from self.fileParsers.import_file(file_path)
                except NotImplementedError:
                    errors.append(f"{file} bestaat uit een niet ondersteund "
                                  f"file format. pad: "
                                  f"{file_path}")
                except UnicodeDecodeError as e:
                    errors.append(f"Dit bestand kon niet worden gedecodeerd: "
                                  f"{file}. Probleem: {e}")
                except Warning as e:
                    errors.append(f"Waarschuwing bij bestand '{file}': {e}")
                except ExceptionGroup as e:
                    # ExceptionGroup derives from Exception, so this clause
                    # has to stay above the generic handler below.
                    error_lines = "\n".join(str(error) for error in
                                            e.exceptions)
                    errors.append(f"Er zijn meerdere fouten opgetreden bij "
                                  f"het laden van dit bestand: {file}. "
                                  f"Problemen:\n{error_lines}")
                except Exception as e:
                    # Best effort: one unreadable file must not abort the
                    # import of the remaining files.
                    errors.append(f"Er is een probleem opgetreden bij het "
                                  f"laden van dit bestand: "
                                  f" {file}. Probleem: {e}")

        if show_error and errors:
            ErrorView("Er is een probleem opgetreden bij het "
                      "importeren van de volgende bestanden:", errors)

    def _read_files_from_input_folder(self) -> Generator[RawFile, None, None]:
        """
        Private method to read all files in the folder specified in the
        project settings. Import problems are not shown to the user here.

        :return: A generator that iterates over the raw
            file contents and their metadata.
        """
        path = self._project_settings_controller.get_input_folder_path()
        return self._read_files(path, False)

    def get_raw_bodies(self) -> Generator[RawBody, None, None]:
        """
        Get a generator that reads all the raw file contents from the input
        folder

        :return: A generator for just the contents of the raw corpus,
            but without the metadata
        """
        files = self._read_files_from_input_folder()
        return (file.body for file in files)

    def get_raw_files(self) -> Generator[RawFile, None, None]:
        """
        Get a generator that reads all the raw file contents and their metadata
        from the input folder

        :return: A generator of the raw corpus
        """
        return self._read_files_from_input_folder()

    def get_processed_corpus(self) -> ProcessedCorpus:
        """
        Get the processed corpus. If the corpus model does not hold a
        (truthy) processed corpus yet, the corpus is pre-processed first and
        the result of that is returned.

        :return: The pre-processed files and a reference to their metadata
        """
        if not self._corpus_model.processed_corpus:
            return self.preprocess_corpus()
        return self._corpus_model.processed_corpus

    def preprocess_corpus(self) -> ProcessedCorpus:
        """
        Preprocess the raw files of the input folder and save the resulting
        corpus in the corpus model

        :return: The processed corpus that was stored in the corpus model
        """
        processed_files = [
            ProcessedFile(
                doc.metadata,
                ProcessedBody(
                    self._preprocessing_controller.process_text(
                        doc.body.body)))
            for doc in self.get_raw_files()]
        processed_corpus = ProcessedCorpus(processed_files)
        self._corpus_model.processed_corpus = processed_corpus
        return processed_corpus

    def get_dictionary(self) -> Dictionary:
        """
        Get the dictionary corresponding to the bag-of-words representation of
        the pre-processed documents, as stored in the corpus model.

        :return: the dictionary of the pre-processed documents
        """
        return self._corpus_model.dictionary

    def set_dictionary(self, dictionary: Dictionary) -> None:
        """
        Set the dictionary corresponding to the bag-of-words representation of
        the pre-processed documents.

        :param dictionary: corpora.Dictionary: the dictionary of the
            pre-processed documents
        :return: None
        """
        self._corpus_model.dictionary = dictionary
"""
This program has been developed by students from the bachelor Computer Science
at Utrecht University within the Software Project course.
© Copyright Utrecht University
(Department of Information and Computing Sciences)
"""