# Source code for tommy.controller.visualizations.document_topic_network_summary_creator

import math

import matplotlib.figure
import networkx as nx
from matplotlib import pyplot as plt

from tommy.controller.result_interfaces.document_topics_interface import (
    DocumentTopicsInterface)
from tommy.controller.topic_modelling_runners.abstract_topic_runner import (
    TopicRunner)
from tommy.controller.visualizations.abstract_visualization import (
    AbstractVisualization)
from tommy.controller.visualizations.document_topic_nx_exporter import (
    DocumentTopicNxExporter)
from tommy.controller.visualizations.possible_visualization import VisGroup
from tommy.controller.visualizations.visualization_input_datatypes import (
    VisInputData, ProcessedCorpus)
from tommy.support.constant_variables import plot_colors



# [docs]
class DocumentTopicNetworkSummaryCreator(AbstractVisualization):
    """
    A class for constructing a network showing the summary of topics and the
    number of documents that contain that topic for the topics in the given
    topic runner and the given preprocessed documents; and returning it as a
    matplotlib figure.

    Topic nodes are connected to "doc_set" nodes. A doc_set node stands for a
    group of documents: either the documents that are linked to exactly one
    topic, or the documents shared by a pair of topics. The edge weight is
    the number of documents in that group.
    """
    _required_interfaces = [DocumentTopicsInterface, TopicRunner]
    # User-facing (Dutch) title; the "5%" matches the 0.05 passed to the
    # exporter in _construct_doc_topic_network.
    name = 'Topics en documenten die daar minstens 5% bij horen'
    short_tab_name = 'Doc. Netwerk'
    vis_group = VisGroup.MODEL
    needed_input_data = [VisInputData.PROCESSED_CORPUS]

    def _create_figure(self,
                       topic_runner: TopicRunner | DocumentTopicsInterface,
                       processed_corpus: ProcessedCorpus | None = None,
                       **kwargs) -> matplotlib.figure.Figure:
        """
        Construct a summarized document-topic network plot showing the
        relations between documents and topics
        :param topic_runner: The topic runner (implementing
            DocumentTopicsInterface) to extract topic data from
        :param processed_corpus: The preprocessed corpus containing
            all files as bags of words after preprocessing.
        :return: matplotlib figure showing a document-topic network plot
        :raises ValueError: If the processed_corpus argument is None
        """
        if processed_corpus is None:
            raise ValueError("Preprocessed Corpus keyword argument is "
                             "necessary in the "
                             "document_topic_network_summary_creator")

        # Construct a plot and a graph
        fig = plt.figure()
        plt.title(self.name, pad=25)
        graph = self._construct_doc_topic_network(topic_runner,
                                                  processed_corpus)

        # Get graph elements; nodes are (node, color) pairs where only topic
        # nodes carry a color attribute (doc_set nodes yield None)
        edges = graph.edges()
        nodes = graph.nodes(data="color")

        # Get scaling factor used for scaling the nodes and edges to make sure
        # these don't increase when more documents are added
        scaling_factor = self._get_scaling_doc_topic(graph)

        # Get drawing function arguments
        node_sizes = []
        node_colors = []
        for node, color in nodes:
            if color is not None:
                # Give topic nodes a constant size
                node_sizes.append(200)
                node_colors.append(color)
            else:
                # Give doc_set nodes a size scaled by the number of documents
                # they represent. Every edge of a doc_set node is created
                # with the same weight, so any neighbor gives that number.
                any_neighbor = next(iter(graph.neighbors(node)))
                node_sizes.append(graph[node][any_neighbor]["weight"]
                                  * scaling_factor * 6.9)
                node_colors.append("black")

        edge_colors = [graph[u][v]["color"] for u, v in edges]
        edge_width = [graph[u][v]["weight"] * scaling_factor * 0.69
                      for u, v in edges]

        # Calculate the weighted shortest path lengths between all pairs of
        # connected nodes
        shortest_path_lengths = dict(
            nx.shortest_path_length(graph, weight="weight"))

        # Calculate new "shortest" paths to aid visualization: stretch the
        # distances logarithmically and enforce a minimum of 15 between
        # distinct nodes. Only values are replaced, so mutating the dicts
        # while iterating over their keys is safe.
        for targets in shortest_path_lengths.values():
            for target, length in targets.items():
                # Distance from a node to itself stays 0 (log is undefined)
                if length == 0:
                    continue
                targets[target] = max(length + 5 * math.log2(length), 15)

        # Define a custom position using the new "shortest" paths
        pos = nx.kamada_kawai_layout(graph, dist=shortest_path_lengths)

        # Draw the network using the kamada-kawai algorithm to position the
        # nodes in an aesthetically pleasing way.
        nx.draw(graph,
                pos=pos,
                width=edge_width,
                node_size=node_sizes,
                edge_color=edge_colors,
                node_color=node_colors)

        # Add labels to the topic nodes (1-based for display)
        labels = {topic_id: topic_id + 1
                  for topic_id in range(topic_runner.get_n_topics())}
        nx.draw_networkx_labels(graph, pos, labels=labels)

        # Adjust the figure
        fig.subplots_adjust(left=0.15, right=0.85, top=0.85, bottom=0.15)

        # Close exactly the figure created here instead of whichever figure
        # happens to be "current" in pyplot's global state
        plt.close(fig)
        return fig

    @staticmethod
    def _construct_doc_topic_network(topic_runner: TopicRunner
                                                   | DocumentTopicsInterface,
                                     processed_files: ProcessedCorpus
                                     ) -> nx.Graph:
        """
        Construct a summarized document-topic network which is used to
        plot the relations
        :param topic_runner: The topic runner (implementing
            DocumentTopicsInterface) to extract topic data from
        :param processed_files: The preprocessed corpus containing
            all files as bags of words after preprocessing.
        :return: networkx graph containing one node per topic (with a
            "color" attribute) and "doc_set_<id>" nodes, connected by edges
            with "color" and "weight" attributes, where the weight is the
            number of documents the doc_set node represents
        """

        # Construct the full document-topic graph.
        # NOTE(review): 0.05 is presumably the minimum topic probability
        # (the 5% mentioned in `name`) - confirm against
        # DocumentTopicNxExporter.
        init_graph = DocumentTopicNxExporter.construct_doc_topic_network(
            topic_runner, processed_files, 0.05)

        # Construct simplified document topic network
        graph = nx.Graph()

        num_topics: int = topic_runner.get_n_topics()
        topic_colors = [plot_colors[topic_id % len(plot_colors)]
                        for topic_id in range(num_topics)]

        # Neighbors of every topic in the full graph, collected once so the
        # pairwise intersections below do not rebuild them for each pair
        neighbor_sets: list[set] = []

        # Add topic nodes and nodes with degree one to graph
        for topic_id in range(num_topics):
            graph.add_node(topic_id, color=topic_colors[topic_id])

            neighbors = set(init_graph.neighbors(topic_id))
            neighbor_sets.append(neighbors)

            # Neighbors that are connected to this topic only
            lonely_count = sum(1 for node in neighbors
                               if init_graph.degree(node) == 1)
            if lonely_count > 0:
                graph.add_edge(topic_id,
                               'doc_set_' + str(topic_id),
                               color=topic_colors[topic_id],
                               weight=lonely_count)

        # Add nodes shared by multiple topics. Ids 0..num_topics-1 are taken
        # by the single-topic doc_sets above; every unordered topic pair
        # consumes the next id, whether or not it ends up in the graph.
        doc_set_id = num_topics - 1
        for topic_id in range(num_topics):
            for other_id in range(topic_id + 1, num_topics):
                doc_set_id += 1

                # Calculate the intersection of two node's neighbors
                shared = neighbor_sets[topic_id] & neighbor_sets[other_id]
                if not shared:
                    continue

                # Add an edge from both topic nodes to a single "intersection"
                # node
                doc_set_node = "doc_set_" + str(doc_set_id)
                graph.add_edge(topic_id,
                               doc_set_node,
                               color=topic_colors[topic_id],
                               weight=len(shared))
                graph.add_edge(other_id,
                               doc_set_node,
                               color=topic_colors[other_id],
                               weight=len(shared))

        return graph

    @staticmethod
    def _get_scaling_doc_topic(graph: nx.Graph) -> float:
        """
        Calculates the scale factor to make sure the biggest edge in a network
        is always the same size, regardless of the maximum edge weight
        :param graph: The graph model to calculate the scale factor for
        :return: The edge scale factor; for a graph without edges the factor
            is computed as if the maximum edge weight were 1
        """

        # Find the maximum edge weight. A graph without any edges would make
        # a plain max() raise ValueError, so fall back to a weight of 1; the
        # factor is then not applied to anything.
        max_edge_weight = max(
            (weight for _, _, weight in graph.edges(data="weight")),
            default=1)

        # A constant which is multiplied by the scale factor according to an
        # edge width that is visually pleasing
        chosen_weight = 10

        scale_factor = (1 / max_edge_weight)

        return scale_factor * chosen_weight



"""
This program has been developed by students from the bachelor Computer Science
at Utrecht University within the Software Project course.
© Copyright Utrecht University
(Department of Information and Computing Sciences)
"""