Source code for tommy.controller.topic_modelling_runners.bertopic_runner

import string

from numpy import ndarray
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer

from tommy.controller.stopwords_controller import StopwordsController
from tommy.controller.corpus_controller import RawFile
from tommy.datatypes.topics import TopicWithScores
from tommy.model.topic_model import TopicModel
from tommy.controller.topic_modelling_runners.abstract_topic_runner import (
    TopicRunner)



[docs]
class BertopicRunner(TopicRunner):
    """
    BertopicRunner class for running the BERTopic topic modelling algorithm.
    """

    @property
    def _model(self) -> BERTopic:
        """get the bertopic model object saved in the topic_model"""
        return self._topic_model.model['model']

    @_model.setter
    def _model(self, new_model: BERTopic) -> None:
        """set the bertopic model object in the topic_model"""
        self._topic_model.model['model'] = new_model

    @property
    def num_words_per_topic(self) -> int:
        """the number of words that are calculated per topic"""
        return self._topic_model.model['num_words_per_topic']

    @property
    def max_num_topics(self) -> int:
        """the maximum number of topics that are calculated per topic"""
        return self._topic_model.model['num_topics']


[docs]
    def __init__(self, topic_model: TopicModel,
                 stopwords_controller: StopwordsController,
                 current_corpus_version_id: int,
                 num_topics: int,
                 num_words_per_topic: int,
                 docs: list[str],
                 sentences: list[str],
                 min_df: float | None,
                 max_features: int | None) -> None:
        """
        Set up the runner and immediately train a BERTopic model.
        :param topic_model: reference to the topic model where the algorithm
            and data should be saved
        :param stopwords_controller: a reference to the stopwords controller
            to extract the stopwords from
        :param current_corpus_version_id: The version identifier of the corpus
            that is used in training
        :param num_topics: the MAXIMUM number of topics to be returned from
            the analysis
        :param num_words_per_topic: the number of words per topic to be
            calculated. Values between 10-20 advised due to computation time.
        :param docs: list containing the raw bodies of files; passed on to
            train_model
        :param sentences: list containing the raw bodies of files split into
            sentences; passed on to train_model
        :param min_df: The minimal document frequency for a term to be
            included. I.E., the minimal ratio of the sentences in which the
            term needs to occur. None leaves the setting unspecified.
        :param max_features: The maximum number of terms to be included in the
            analysis. None leaves the setting unspecified.
        :return: None
        """
        super().__init__(topic_model, current_corpus_version_id)
        self._stopwords_controller = stopwords_controller

        # The storage dictionary must exist before training: train_model
        # reads both settings through the properties of this class and
        # stores the fitted model in the same dictionary.
        storage = {
            'num_words_per_topic': num_words_per_topic,
            'num_topics': num_topics,
        }
        self._topic_model.model = storage

        self.train_model(
            docs,
            sentences,
            min_df=min_df,
            max_features=max_features,
        )



[docs]
    def get_n_topics(self) -> int:
        """Returns the number of topics calculated by the model."""
        return len([... for topic_words
                    in self._model.get_topics().values()
                    if topic_words])



[docs]
    def get_model(self) -> str:
        """
        Return the name of the algorithm that this runner executes.
        :return: the constant string "BERTOPIC"
        """
        # The annotation is the builtin `str`; the `string` module is not a
        # type and must not be used as a return annotation.
        return "BERTOPIC"



[docs]
    def get_topic_with_scores(self, topic_id: int, n_words: int):
        """
        Build the topic object for one topic, holding its terms and their
        corresponding scores.
        :param topic_id: the index of the requested topic, counted over the
            topics that have a non-empty word list
        :param n_words: number of terms in the resulting topic object,
            Note: BERTopic does not support top n queries, so this value is
            not used here
        :return: topic object containing the terms and their corresponding
            scores
        """
        # Drop topics without words first, so that topic_id is an index into
        # the non-empty topics only.
        non_empty_topics = list(
            filter(None, self._model.get_topics().values()))
        words_with_scores = non_empty_topics[topic_id]

        # type hint in BERTopic's get_topics() function is incorrect
        # noinspection PyTypeChecker
        return TopicWithScores(topic_id, words_with_scores)



[docs]
    def get_topics_with_scores(self, n_words: int):
        """
        Return a list of topic objects containing the terms and their
        corresponding scores, one per topic with a non-empty word list.
        :param n_words: number of terms in the resulting topic objects,
            Note: BERTopic does not support top n queries, so this value is
            not used here
        :return: list of topic objects containing the terms and their
            corresponding scores; topic ids run from 0 up to
            get_n_topics() - 1
        """
        # Filter out empty topics BEFORE numbering them. Numbering first
        # would leave gaps in the ids whenever an empty topic precedes a
        # non-empty one, so the ids would no longer match the indices that
        # get_topic_with_scores accepts or the count from get_n_topics.
        non_empty_topics = [topic_words for topic_words
                            in self._model.get_topics().values()
                            if topic_words]

        # type hint in BERTopic's get_topics() function is incorrect
        # noinspection PyTypeChecker
        return [TopicWithScores(topic_id=topic_id,
                                top_words_with_scores=topic_words)
                for topic_id, topic_words
                in enumerate(non_empty_topics)]



[docs]
    def train_model(self, docs: list[str], sentences: list[str],
                    min_df: float | None,
                    max_features: int | None) -> None:
        """
        Fit a new BERTopic model on the sentences and store it in the topic
        model.
        :param docs: list containing the raw bodies of files as
            input data. Note: not used by this method; the model is fitted on
            the sentences only.
        :param sentences: list containing the raw bodies of files split into
            sentences as training input
        :param min_df: The minimal document frequency for a term to be
            included. I.E., the minimal ratio of the sentences in which the
            term needs to occur. When None, the option is not passed to the
            vectorizer.
        :param max_features: The maximum number of terms to be included in the
            analysis. When None, the option is not passed to the vectorizer.
        :return: None
        """
        # Only forward the options that were actually supplied, so that the
        # vectorizer falls back to its own defaults for the others.
        vectorizer_options = {
            option: value
            for option, value in (('min_df', min_df),
                                  ('max_features', max_features))
            if value is not None
        }

        excluded_words = list(self._stopwords_controller.stopwords_model)
        term_counter = CountVectorizer(
            ngram_range=(1, 3),
            stop_words=excluded_words,
            **vectorizer_options)
        class_tfidf = ClassTfidfTransformer(reduce_frequent_words=True)

        topic_extractor = BERTopic(
            vectorizer_model=term_counter,
            ctfidf_model=class_tfidf,
            top_n_words=self.num_words_per_topic,
            nr_topics=self.max_num_topics)
        self._model = topic_extractor.fit(sentences)




"""
This program has been developed by students from the bachelor Computer Science
at Utrecht University within the Software Project course.
© Copyright Utrecht University
(Department of Information and Computing Sciences)
"""