from tommy.controller.corpus_controller import CorpusController
from tommy.controller.model_parameters_controller import (
ModelParametersController,
ModelType)
from tommy.controller.corpus_controller import CorpusController
from tommy.controller.synonyms_controller import SynonymsController
from tommy.model.config_model import ConfigModel
from tommy.controller.result_interfaces.document_topics_interface import \
DocumentTopicsInterface
from tommy.controller.preprocessing_controller import PreprocessingController
from tommy.controller.stopwords_controller import StopwordsController
from tommy.controller.topic_modelling_runners.abstract_topic_runner import (
TopicRunner)
from tommy.controller.topic_modelling_runners.bertopic_runner import (
BertopicRunner)
from tommy.controller.topic_modelling_runners.lda_runner import LdaRunner
from tommy.controller.topic_modelling_runners.nmf_runner import NmfRunner
from tommy.model.config_model import ConfigModel
from tommy.model.topic_model import TopicModel
from tommy.support.async_worker import Worker
from tommy.support.event_handler import EventHandler
from tommy.support.types import Document_topics
from tommy.view.error_view import ErrorView
[docs]
class TopicModellingController:
"""
Controller that runs the selected topic modelling algorithm on a call of
train_model and supplies a topic runner object from which results can be
extracted.
"""
_stopwords_controller: StopwordsController = None
_synonyms_controller: SynonymsController = None
_preprocessing_controller = None
_model_parameters_controller: ModelParametersController = None
_topic_model: TopicModel = None
_config_model: ConfigModel = None
_corpus_controller: CorpusController = None
_start_training_model_event: EventHandler[TopicRunner] = None
_model_trained_event: EventHandler[TopicRunner] = None
_topic_model_switched_event: EventHandler[TopicRunner] = None
_calculate_document_topics_event: \
EventHandler[Document_topics] = None
@property
def start_training_model_event(self) -> EventHandler[TopicRunner]:
return self._start_training_model_event
@property
def model_trained_event(self) -> EventHandler[TopicRunner]:
return self._model_trained_event
@property
def topic_model_switched_event(self) -> EventHandler[TopicRunner]:
return self._topic_model_switched_event
@property
def calculate_topic_documents_event(self) -> (
EventHandler)[Document_topics]:
return self._calculate_document_topics_event
[docs]
def __init__(self) -> None:
"""Initialize the publisher of the topic-modelling-controller"""
super().__init__()
self._worker = None
self._start_training_model_event = EventHandler[TopicRunner]()
self._model_trained_event = EventHandler[TopicRunner]()
self._topic_model_switched_event = EventHandler[TopicRunner]()
self._calculate_document_topics_event = (
EventHandler[Document_topics]())
[docs]
def set_model_refs(self,
topic_model: TopicModel,
config_model: ConfigModel) -> None:
"""
Set the references to the topic model
:return: None
"""
self._topic_model = topic_model
self._config_model = config_model
[docs]
def on_model_swap(self) -> None:
"""
Notify the graph controller that the topic model has changed
:return: None
"""
# if the topic runner ran on an outdated corpus, we delete it.
if (self._config_model.topic_runner is not None
and self._config_model.topic_model.used_corpus_version_id
!= self._corpus_controller.corpus_version_id):
self._config_model.topic_runner = None
self._topic_model_switched_event.publish(
self._config_model.topic_runner)
[docs]
def set_controller_refs(self,
parameters_controller: ModelParametersController,
corpus_controller: CorpusController,
stopwords_controller: StopwordsController,
synonyms_controller: SynonymsController,
preprocessing_controller: PreprocessingController
) -> None:
"""Set the reference to the needed controllers"""
self._model_parameters_controller = parameters_controller
self._corpus_controller = corpus_controller
self._stopwords_controller = stopwords_controller
self._synonyms_controller = synonyms_controller
self._preprocessing_controller = preprocessing_controller
[docs]
def train_model(self) -> None:
"""
Trains the selected model from on the currently loaded data
and notifies the observers that a (new) topic runner is ready when
async training is done
:raises NotImplementedError: if selected model type is not supported
:return: None
"""
new_model_type = self._model_parameters_controller.get_model_type()
self._start_training_model_event.publish(
self._config_model.topic_runner)
def model_trained_callback():
self._model_trained_event.publish(self._config_model.topic_runner)
self._calculate_document_topics_event.publish(
self._topic_model.document_topics)
if self._corpus_controller.metadata_available() is False:
ErrorView("Er is geen data beschikbaar om een model op te "
"trainen. Zorg ervoor dat er een map met ondersteunde "
"bestanden is ingeladen. De ondersteunde bestandstypen "
"zijn:", ["txt", "pdf", "docx",
"csv, zorg ervoor dat de tekst die je wilt "
"analyseren in een kolom staat die als header "
"'body' heeft. Voor meer informatie zie de "
"website <a href='tommy.fyor.nl'>"
"tommy.fyor.nl</a>"])
model_trained_callback()
return
match new_model_type:
case ModelType.LDA:
self._worker = Worker(self._train_lda)
self._worker.finished.connect(model_trained_callback)
self._worker.start()
case ModelType.BERTopic:
self._worker = Worker(self._train_bert)
self._worker.finished.connect(model_trained_callback)
self._worker.start()
case ModelType.NMF:
self._worker = Worker(self._train_nmf)
self._worker.finished.connect(model_trained_callback)
self._worker.start()
case _:
raise NotImplementedError(
f"model type {new_model_type.name} is not supported by "
f"topic modelling controller")
def _train_lda(self) -> None:
"""
Retrieves the corpus and model parameters,
then runs the LDA model on the corpus and saves the topic runner.
:return: None
"""
corpus = self._corpus_controller.preprocess_corpus()
num_topics = self._model_parameters_controller.get_model_n_topics()
alpha_value = self._model_parameters_controller.get_model_alpha()
beta_value = self._model_parameters_controller.get_model_beta()
alpha_beta_custom_enabled = (
self._model_parameters_controller.
get_model_alpha_beta_custom_enabled())
if alpha_beta_custom_enabled:
self._config_model.topic_runner = LdaRunner(
topic_model=self._topic_model,
processed_corpus=corpus,
current_corpus_version_id=
self._corpus_controller.corpus_version_id,
num_topics=num_topics,
alpha=alpha_value,
beta=beta_value)
return
self._config_model.topic_runner = LdaRunner(
topic_model=self._topic_model,
processed_corpus=corpus,
current_corpus_version_id=
self._corpus_controller.corpus_version_id,
num_topics=num_topics)
def _train_nmf(self) -> None:
"""
Retrieves the corpus and model parameters,
then runs the NMF model on the corpus and saves the topic runner.
:return: None
"""
corpus = self._corpus_controller.preprocess_corpus()
num_topics = self._model_parameters_controller.get_model_n_topics()
self._config_model.topic_runner = NmfRunner(
topic_model=self._topic_model,
processed_corpus=corpus,
current_corpus_version_id=
self._corpus_controller.corpus_version_id,
num_topics=num_topics)
def _train_bert(self) -> None:
"""
Retrieves the raw corpus and model parameters,
then runs the BERTopic model on the corpus and saves the topic runner.
:return: None
"""
num_topics = self._model_parameters_controller.get_model_n_topics()
num_words_per_topic = (self._model_parameters_controller
.get_model_word_amount())
bert_min_df = self._model_parameters_controller.get_bert_min_df()
bert_max_features = (self._model_parameters_controller.
get_bert_max_features())
raw_docs = [document.body for document
in self._corpus_controller.get_raw_bodies()]
# split every document into sentences and add all sentences to a list
sentences = list([
sentence
for split_document in
map(self._preprocessing_controller.split_into_sentences, raw_docs)
for sentence in split_document
])
self._config_model.topic_runner = BertopicRunner(
topic_model=self._topic_model,
stopwords_controller=self._stopwords_controller,
current_corpus_version_id=
self._corpus_controller.corpus_version_id,
num_topics=num_topics,
num_words_per_topic=num_words_per_topic,
docs=raw_docs,
sentences=sentences,
min_df=bert_min_df,
max_features=bert_max_features)
"""
This program has been developed by students from the bachelor Computer Science
at Utrecht University within the Software Project course.
© Copyright Utrecht University
(Department of Information and Computing Sciences)
"""