Source code for tommy.controller.visualizations.document_topic_nx_exporter

from typing import TypeAliasType

import networkx as nx

from tommy.controller.result_interfaces.document_topics_interface import (
    DocumentTopicsInterface)
from tommy.controller.topic_modelling_runners.abstract_topic_runner import (
    TopicRunner)
from tommy.controller.visualizations.nx_exporter_on_data import (
    NxExporterOnData)
from tommy.controller.visualizations.visualization_input_datatypes import (
    ProcessedCorpus)
from tommy.support.constant_variables import plot_colors



[docs]
class DocumentTopicNxExporter(NxExporterOnData[ProcessedCorpus]):
    """
    A class for constructing a network showing the topics and the
    number of documents that contain that topic for the topics in the given
    topic runner and the given preprocessed documents; and returning it as an
    nx.Graph.
    Note: this visualization is only to be used for exporting purposes
    """
    _required_interfaces = [DocumentTopicsInterface, TopicRunner]
    name = 'Topics en documenten netwerk'

    @property
    def input_data_type(self) -> TypeAliasType:
        """
        The type of the additional data that get_nx_graph expects
        :return: the ProcessedCorpus type
        """
        required_type = ProcessedCorpus
        return required_type


[docs]
    def get_nx_graph(self,
                     topic_runner: TopicRunner | DocumentTopicsInterface,
                     data: ProcessedCorpus,
                     *,
                     minimum_probability: float = 0.05
                     ) -> nx.Graph:
        """
        Construct a document-topic nx graph representing the relations
        between documents and topics
        :param topic_runner: The topic runner (implementing
            DocumentTopicsInterface) to extract topic data from
        :param data: The preprocessed corpus containing
            all files as bags of words after preprocessing.
        :param minimum_probability: The minimum probability of a document
            belonging to a topic for that relation to be included in the
            graph. Keyword-only; defaults to 0.05, the threshold that was
            previously hard-coded, so existing calls behave the same.
        :return: nx graph representing a document-topic network plot
        """
        return self.construct_doc_topic_network(
            topic_runner, data, minimum_probability)



[docs]
    @staticmethod
    def construct_doc_topic_network(topic_runner: TopicRunner
                                                  | DocumentTopicsInterface,
                                    processed_files: ProcessedCorpus,
                                    minimum_probability: float
                                    ) -> nx.Graph:
        """
        Build the graph that links every document to the topics the topic
        runner reports for it
        :param topic_runner: The topic runner (implementing
            DocumentTopicsInterface) to extract topic data from
        :param processed_files: The preprocessed corpus containing
            all files as bags of words after preprocessing.
        :param minimum_probability: the minimum probability of a document
            belonging to a topic for it to be included; it is passed on to
            the topic runner when requesting the topics of a document.
        :return: nx graph with one colored node per topic and, for every
            document-topic pair reported by the topic runner, an edge that
            carries the topic color and the probability as its weight
        """
        network = nx.Graph()
        palette_size = len(plot_colors)

        # Topic nodes are added up front, so a topic is part of the graph
        # even when no document ends up linked to it
        network.add_nodes_from(
            (topic, {'color': plot_colors[topic % palette_size]})
            for topic in range(topic_runner.get_n_topics()))

        # Document nodes only come into existence through their edges, so a
        # document for which no topic is reported does not appear at all
        for index, processed_file in enumerate(processed_files):
            topics_of_document = topic_runner.get_document_topics(
                processed_file.body.body, minimum_probability)
            document_node = f'document:{index}'

            for topic, probability in topics_of_document:
                network.add_edge(topic,
                                 document_node,
                                 color=plot_colors[topic % palette_size],
                                 weight=probability)

        return network




"""
This program has been developed by students from the bachelor Computer Science
at Utrecht University within the Software Project course.
© Copyright Utrecht University
(Department of Information and Computing Sciences)
"""