Source code for tommy.controller.topic_modelling_runners.lda_runner

import string
from collections.abc import Iterable

import numpy as np
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from numpy import ndarray

from tommy.controller.file_import.processed_corpus import ProcessedCorpus
from tommy.controller.result_interfaces.correlation_matrix_interface import (
    CorrelationMatrixInterface)
from tommy.controller.result_interfaces.document_topics_interface import (
    DocumentTopicsInterface)
from tommy.controller.result_interfaces.topic_coherence_interface import (
    TopicCoherenceInterface)
from tommy.controller.topic_modelling_runners.abstract_topic_runner import (
    TopicRunner)
from tommy.datatypes.topics import TopicWithScores
from tommy.model.topic_model import TopicModel

STANDARD_RANDOM_SEED = 42


class LdaRunner(TopicRunner,
                CorrelationMatrixInterface,
                DocumentTopicsInterface,
                TopicCoherenceInterface):
    """
    Topic runner that trains a Gensim LDA model and exposes its results.

    The trained model, the dictionary and the bag-of-words corpus are not
    stored on the runner itself; they are read from and written to the
    topic model that is passed to the constructor.
    """

    # A word counts as belonging to a topic when its probability in the
    # topic-word distribution exceeds this value (used for the Dice matrix).
    _WORD_RELEVANCE_THRESHOLD = 0.005

    _num_topics: int
    # ``None`` for alpha/beta means the prior is not passed to Gensim, so
    # Gensim falls back to its own default.
    _alpha: float
    _beta: float
    _random_seed: int

    @property
    def _dictionary(self) -> Dictionary:
        """Get the term_ids-to-terms dictionary saved in the topic model"""
        return self._topic_model.dictionary

    @_dictionary.setter
    def _dictionary(self, new_dictionary: Dictionary) -> None:
        """Set the term_ids-to-terms dictionary in the topic model"""
        self._topic_model.dictionary = new_dictionary

    @property
    def _model(self) -> LdaModel:
        """Get the LDA model that is saved in the topic model"""
        return self._topic_model.model

    @_model.setter
    def _model(self, new_model: LdaModel) -> None:
        """Set the LDA model that is saved in the topic model"""
        self._topic_model.model = new_model

    @property
    def _bags_of_words(self) -> list[list[tuple[int, int]]]:
        """Get the bags of words saved in the topic model"""
        return self._topic_model.corpus

    @_bags_of_words.setter
    def _bags_of_words(self, bag: list[list[tuple[int, int]]]) -> None:
        """Set the bags of words in the topic model"""
        self._topic_model.corpus = bag

    def __init__(self, topic_model: TopicModel,
                 processed_corpus: ProcessedCorpus,
                 current_corpus_version_id: int,
                 num_topics: int,
                 alpha: float = None,
                 beta: float = None,
                 random_seed=STANDARD_RANDOM_SEED) -> None:
        """
        Initialize the runner, train the model and compute the
        per-document topic probabilities.

        :param topic_model: Reference to the topic model where the
            algorithm and data should be saved.
        :param processed_corpus: The processed corpus
        :param current_corpus_version_id: The version identifier of the
            corpus that is used in training
        :param num_topics: Number of topics to the model.
        :param alpha: Value passed to Gensim as ``alpha``; when None,
            Gensim's default is used.
        :param beta: Value passed to Gensim as ``eta``; when None,
            Gensim's default is used.
        :param random_seed: Seed for reproducibility, defaults to 42.
        :return: None
        """
        super().__init__(topic_model, current_corpus_version_id)
        self._num_topics = num_topics
        self._alpha = alpha
        self._beta = beta
        self._random_seed = random_seed

        self.train_model(processed_corpus)
        self.calculate_document_topics(processed_corpus, topic_model)

    def _build_model(self, num_topics: int) -> LdaModel:
        """
        Train an LDA model on the stored bags of words.

        Each prior is only forwarded when it was explicitly configured, so
        a custom alpha without a beta (or the other way around) is still
        honoured and the missing one uses Gensim's default.

        :param num_topics: Number of topics the model should have
        :return: The trained LDA model
        """
        priors = {}
        if self._alpha is not None:
            priors["alpha"] = self._alpha
        if self._beta is not None:
            priors["eta"] = self._beta

        return LdaModel(corpus=self._bags_of_words,
                        id2word=self._dictionary,
                        num_topics=num_topics,
                        random_state=self._random_seed,
                        **priors)

    def train_model(self, processed_corpus: ProcessedCorpus) -> None:
        """
        Train the LDA model on the given documents and save the resulting
        model, dictionary and bags of words in the topic model, ready to
        return results.

        :param processed_corpus: The processed corpus
        :return: None
        """
        processed_bodies = [document.body.body
                            for document in processed_corpus]

        self._dictionary = Dictionary(processed_bodies)
        self._bags_of_words = [self._dictionary.doc2bow(tokens)
                               for tokens in processed_bodies]

        self._model = self._build_model(self._num_topics)

    def calculate_document_topics(self, processed_corpus: ProcessedCorpus,
                                  topic_model: TopicModel) -> None:
        """
        Calculate the topic probabilities for each document in the corpus.

        The result is stored in ``topic_model.document_topics`` as a list
        of ``(document.metadata, probabilities)`` tuples, where
        ``probabilities`` has one entry per topic.

        :param processed_corpus: The processed corpus
        :param topic_model: The topic model to save the results in
        :return: None
        """
        topic_model.document_topics = []
        n_topics = self.get_n_topics()

        for document in processed_corpus:
            # A minimum probability of 0.0 asks for every topic
            topic_correspondence = (
                self.get_document_topics(document.body.body, 0.0))

            # Topics that are not reported keep probability 0.0
            probabilities = [0.0] * n_topics
            for (topic_id, topic_probability) in topic_correspondence:
                probabilities[topic_id] = topic_probability

            topic_model.document_topics.append(
                (document.metadata, probabilities))

    def get_n_topics(self) -> int:
        """Return the number of topics the model was configured with."""
        return self._num_topics

    def get_model(self) -> str:
        """Return the name of the topic modelling algorithm."""
        return "LDA"

    def get_topic_with_scores(self, topic_id, n_words) -> TopicWithScores:
        """
        Return the top words of a single topic together with their scores.

        :param topic_id: Index of the requested topic
        :param n_words: Number of words to return for the topic
        :return: The topic with its words and scores
        """
        words_with_scores = self._model.show_topic(topicid=topic_id,
                                                   topn=n_words)
        return TopicWithScores(topic_id, words_with_scores)

    def get_topics_with_scores(self, n_words) -> list[TopicWithScores]:
        """
        Return the top words with scores for every topic of the model.

        :param n_words: Number of words to return per topic
        :return: One TopicWithScores per topic
        """
        # num_topics=-1 requests all topics; without it Gensim only
        # returns its default of 10 topics.
        return [TopicWithScores(topic_id, words_with_scores)
                for (topic_id, words_with_scores)
                in self._model.show_topics(num_topics=-1,
                                           formatted=False,
                                           num_words=n_words)]

    def get_correlation_matrix(self, **kwargs) -> ndarray:
        """
        Calculate the topic correlation matrix.

        Every topic is reduced to the set of words whose probability
        exceeds ``_WORD_RELEVANCE_THRESHOLD``; entry ``[i, j]`` is the
        Dice-Sørensen coefficient of the word sets of topics i and j.

        :return: ndarray representing the correlation matrix of topics.
        """
        topic_word_distribution = self._model.get_topics()

        # Binarize the topic-word distribution,
        # i.e. see if a word is related enough to a topic
        binary_topic_distribution = (
            topic_word_distribution > self._WORD_RELEVANCE_THRESHOLD
        ).astype(int)

        # Entry [i, j] counts the words that topics i and j share
        intersections = binary_topic_distribution @ binary_topic_distribution.T

        # Entry [i, j] is the size of set i plus the size of set j
        words_per_topic = binary_topic_distribution.sum(axis=1)
        combined_sizes = (words_per_topic[:, np.newaxis]
                          + words_per_topic[np.newaxis, :])

        # Dividing by zero is impossible, so those entries stay 0
        dice_matrix = np.zeros(intersections.shape, dtype=float)
        np.divide(2 * intersections, combined_sizes,
                  out=dice_matrix, where=combined_sizes > 0)
        return dice_matrix

    def get_document_topics(self, doc, minimum_probability):
        """
        Return the topic distribution of a single document.

        :param doc: The tokens of the document; converted to a bag of
            words with the stored dictionary
        :param minimum_probability: Passed to Gensim as the minimum
            probability a topic needs in order to be reported
        :return: The result of Gensim's ``get_document_topics``; consumed
            elsewhere in this class as (topic_id, probability) pairs
        """
        bag_of_words = self._dictionary.doc2bow(doc)
        return self._model.get_document_topics(
            bag_of_words, minimum_probability=minimum_probability)

    def get_topic_coherence(self, num_topics):
        """
        Train a fresh model with the given number of topics and return
        its u_mass coherence.

        The model saved in the topic model is left untouched.

        :param num_topics: Number of topics for the temporary model
        :return: The coherence value computed by Gensim
        """
        new_model = self._build_model(num_topics)

        coherence_model = CoherenceModel(model=new_model,
                                         corpus=self._bags_of_words,
                                         dictionary=self._dictionary,
                                         coherence='u_mass')
        return coherence_model.get_coherence()
""" This program has been developed by students from the bachelor Computer Science at Utrecht University within the Software Project course. © Copyright Utrecht University (Department of Information and Computing Sciences) """