Source code for tommy.controller.visualizations.document_topic_network_summary_creator

import math

import matplotlib.figure
import networkx as nx
from matplotlib import pyplot as plt

from tommy.controller.result_interfaces.document_topics_interface import (
    DocumentTopicsInterface)
from tommy.controller.topic_modelling_runners.abstract_topic_runner import (
    TopicRunner)
from tommy.controller.visualizations.abstract_visualization import (
    AbstractVisualization)
from tommy.controller.visualizations.document_topic_nx_exporter import (
    DocumentTopicNxExporter)
from tommy.controller.visualizations.possible_visualization import VisGroup
from tommy.controller.visualizations.visualization_input_datatypes import (
    VisInputData, ProcessedCorpus)
from tommy.support.constant_variables import plot_colors


class DocumentTopicNetworkSummaryCreator(AbstractVisualization):
    """
    A class for constructing a network showing the summary of topics and the
    number of documents that contain that topic for the topics in the given
    topic runner and the given preprocessed documents; and returning it as a
    matplotlib figure.
    """
    _required_interfaces = [DocumentTopicsInterface, TopicRunner]
    name = 'Topics en documenten die daar minstens 5% bij horen'
    short_tab_name = 'Doc. Netwerk'
    vis_group = VisGroup.MODEL
    needed_input_data = [VisInputData.PROCESSED_CORPUS]

    def _create_figure(self,
                       topic_runner: TopicRunner | DocumentTopicsInterface,
                       processed_corpus: ProcessedCorpus = None,
                       **kwargs) -> matplotlib.figure.Figure:
        """
        Construct a summarized document-topic network plot showing the
        relations between documents and topics

        :param topic_runner: The topic runner (implementing
            DocumentTopicsInterface) to extract topic data from
        :param processed_corpus: The preprocessed corpus containing all
            files as bags of words after preprocessing.
        :param kwargs: Additional keyword arguments; not used by this
            visualization
        :return: matplotlib figure showing a document-topic network plot
        :raises ValueError: If the processed_corpus argument is None
        """
        if processed_corpus is None:
            raise ValueError("Preprocessed Corpus keyword argument is "
                             "necessary in the "
                             "document_topic_network_summary_creator")

        # Construct a plot and a graph
        fig = plt.figure()
        plt.title(self.name, pad=25)
        graph = self._construct_doc_topic_network(topic_runner,
                                                  processed_corpus)

        # Get graph elements; nodes are (node, color) pairs where the color
        # is None for every node that was not explicitly given one (the
        # doc_set nodes, which are only created implicitly through edges)
        edges = graph.edges()
        nodes = graph.nodes(data="color")

        # Get scaling factor used for scaling the nodes and edges to make
        # sure these don't increase when more documents are added
        scaling_factor = self._get_scaling_doc_topic(graph)

        # Get drawing function arguments
        node_sizes = []
        for node, color in nodes:
            if color is not None:
                # Give topic nodes a constant size
                node_sizes.append(200)
            else:
                # Give doc_set nodes a size that scales with the weight of
                # their first edge (the number of documents they represent)
                first_neighbor = next(iter(graph.neighbors(node)))
                node_sizes.append(graph[node][first_neighbor]["weight"]
                                  * scaling_factor * 6.9)

        node_colors = [color if color is not None else "black"
                       for _, color in nodes]
        edge_colors = [graph[u][v]["color"] for u, v in edges]
        edge_width = [graph[u][v]["weight"] * scaling_factor * 0.69
                      for u, v in edges]

        # Calculate the shortest paths using dijkstra's algorithm
        shortest_path_lengths = dict(
            nx.shortest_path_length(graph, weight="weight"))

        # Calculate new "shortest" paths to aid visualization: distances are
        # stretched logarithmically and never drop below 15, the distance of
        # a node to itself (0) is left untouched
        for targets in shortest_path_lengths.values():
            for target, length in targets.items():
                if length == 0:
                    continue
                targets[target] = max(length + 5 * math.log(length, 2), 15)

        # Define a custom position using the new "shortest" paths
        pos = nx.kamada_kawai_layout(graph, dist=shortest_path_lengths)

        # Draw the network using the kamada-kawai algorithm to position the
        # nodes in an aesthetically pleasing way.
        nx.draw(graph,
                pos=pos,
                width=edge_width,
                node_size=node_sizes,
                edge_color=edge_colors,
                node_color=node_colors)

        # Add (1-based) labels to the topic nodes
        labels = {topic_id: topic_id + 1
                  for topic_id in range(topic_runner.get_n_topics())}
        nx.draw_networkx_labels(graph, pos, labels=labels)

        # Adjust the figure
        fig.subplots_adjust(left=0.15, right=0.85, top=0.85, bottom=0.15)

        # Close this specific figure so pyplot stops tracking it; the figure
        # object itself is still returned to the caller
        plt.close(fig)

        return fig

    @staticmethod
    def _construct_doc_topic_network(topic_runner: TopicRunner |
                                     DocumentTopicsInterface,
                                     processed_files: ProcessedCorpus
                                     ) -> nx.Graph:
        """
        Construct the summarized document-topic network which is used to
        plot the relations between documents and topics. Each topic becomes
        a node; documents are collapsed into "doc_set" nodes whose edge
        weights hold the number of documents they represent.

        :param topic_runner: The topic runner (implementing
            DocumentTopicsInterface) to extract topic data from
        :param processed_files: The preprocessed corpus containing all
            files as bags of words after preprocessing.
        :return: networkx graph holding the topic nodes and the doc_set
            nodes connected to them
        """
        # Construct a graph with topic nodes
        # NOTE(review): 0.05 is presumably the minimum share of a document
        # that must belong to a topic (cf. the "5%" in the plot name) -
        # confirm against DocumentTopicNxExporter
        init_graph = DocumentTopicNxExporter.construct_doc_topic_network(
            topic_runner, processed_files, 0.05)

        # Construct simplified document topic network
        graph = nx.Graph()
        num_topics: int = topic_runner.get_n_topics()

        # Add topic nodes and a doc_set node for the neighbors with degree
        # one. The neighbor set of every topic is stored so it does not have
        # to be rebuilt for every topic pair below.
        topic_neighbors = {}
        for topic_id in range(num_topics):
            topic_color = plot_colors[topic_id % len(plot_colors)]
            graph.add_node(topic_id, color=topic_color)

            neighbors = set(init_graph.neighbors(topic_id))
            topic_neighbors[topic_id] = neighbors

            lonely_nodes = [node for node in neighbors
                            if init_graph.degree(node) == 1]
            if lonely_nodes:
                graph.add_edge(topic_id,
                               'doc_set_' + str(topic_id),
                               color=topic_color,
                               weight=len(lonely_nodes))

        # Add nodes shared by multiple topics. Ids 0..num_topics-1 are
        # reserved for the doc_set nodes above, so the pairwise doc_set
        # nodes start counting at num_topics. Every pair consumes an id,
        # even when the pair shares no neighbors.
        doc_set_id = num_topics
        for topic_id in range(num_topics):
            for other_id in range(topic_id + 1, num_topics):
                # Calculate the intersection of two node's neighbors
                shared = (topic_neighbors[topic_id]
                          & topic_neighbors[other_id])

                # Add an edge from both topic nodes to a single
                # "intersection" node
                if shared:
                    doc_set_node = "doc_set_" + str(doc_set_id)
                    graph.add_edge(
                        topic_id,
                        doc_set_node,
                        color=plot_colors[topic_id % len(plot_colors)],
                        weight=len(shared))
                    graph.add_edge(
                        other_id,
                        doc_set_node,
                        color=plot_colors[other_id % len(plot_colors)],
                        weight=len(shared))
                doc_set_id += 1

        return graph

    @staticmethod
    def _get_scaling_doc_topic(graph: nx.Graph) -> float:
        """
        Calculates the scale factor to make sure the biggest edge in a
        network is always the same size, regardless of the maximum edge
        weight

        :param graph: The graph model to calculate the scale factor for
        :return: The edge scale factor; when the graph has no edges the
            factor for a maximum edge weight of 1 is returned
        """
        # A constant which is multiplied by the scale factor according to an
        # edge width that is visually pleasing
        chosen_weight = 10

        # Find the maximum edge weight
        weights = [weight for _, _, weight in graph.edges(data="weight")]

        # A graph without edges has nothing to scale; return a neutral
        # factor instead of letting max() fail on an empty sequence
        if not weights:
            return float(chosen_weight)

        max_edge_weight = max(weights)
        scale_factor = (1 / max_edge_weight)
        return scale_factor * chosen_weight
""" This program has been developed by students from the bachelor Computer Science at Utrecht University within the Software Project course. © Copyright Utrecht University (Department of Information and Computing Sciences) """