Source code for tommy.controller.visualizations.documents_over_time_creator

from datetime import datetime

import matplotlib.dates
import matplotlib.figure
import pandas as pd
from matplotlib import pyplot as plt

from tommy.controller.result_interfaces.document_topics_interface import (
    DocumentTopicsInterface)
from tommy.controller.topic_modelling_runners.abstract_topic_runner import (
    TopicRunner)
from tommy.controller.visualizations.abstract_visualization import (
    AbstractVisualization)
from tommy.controller.visualizations.possible_visualization import VisGroup
from tommy.controller.visualizations.visualization_input_datatypes import (
    VisInputData, ProcessedCorpus)
from tommy.support.constant_variables import plot_colors



[docs]
class DocumentsOverTimeCreator(AbstractVisualization):
    """
    Visualization that draws one line per topic showing the summed
    document-topic weights over time.

    Only documents whose metadata contains a date contribute to the plot.
    When no document has a date, a figure with an explanatory text is
    returned instead.
    """

    _required_interfaces = [TopicRunner, DocumentTopicsInterface]
    # User-facing (Dutch) names: "Documents over time".
    name = "Documenten over tijd"
    short_tab_name = "Doc. over tijd"
    vis_group = VisGroup.MODEL
    needed_input_data = [VisInputData.PROCESSED_CORPUS]

    def _create_figure(self,
                       topic_runner: TopicRunner | DocumentTopicsInterface,
                       processed_corpus: ProcessedCorpus = None,
                       **kwargs
                       ) -> matplotlib.figure.Figure:
        """
        Construct a document over time plot for the given topic runner and
        return it as a matplotlib figure.

        :param topic_runner: The topic model to construct the plot for. This
            should implement the DocumentTopicsInterface
        :param processed_corpus: The preprocessed corpus containing all files
            as bags of words after preprocessing. ``None`` is treated as an
            empty corpus.
        :param kwargs: Accepted for interface compatibility; not used here.
        :return: Matplotlib figure showing a topic over time plot for all
            topics, or a figure with an explanatory text when no document
            in the corpus has a date.
        """
        documents = () if processed_corpus is None else processed_corpus

        # Query the model once per dated document and reuse the result for
        # every topic. This avoids n_topics redundant inference calls per
        # document and keeps the weights of one document consistent across
        # all plotted lines.
        plot_dates = []
        weights_per_document = []
        for document in documents:
            if document.metadata.date is None:
                continue

            # Second argument is presumably the minimum probability
            # threshold (0.0 = keep every topic) -- TODO confirm.
            topics = topic_runner.get_document_topics(document.body.body,
                                                      0.0)

            # Map topic id -> weight; the first entry wins when an id
            # occurs more than once.
            weights = {}
            for entry in topics:
                weights.setdefault(entry[0], entry[1])

            plot_dates.append(datetime.combine(document.metadata.date,
                                               datetime.min.time()))
            weights_per_document.append(weights)

        # If no dates available, show it on screen. This is checked before
        # a figure is created so no unused figure is left open in pyplot.
        if not plot_dates:
            return self._get_no_dates_available_screen()

        # Construct a plot and axes
        fig, ax = plt.subplots()

        n_topics = topic_runner.get_n_topics()
        for topic_id in range(n_topics):
            # Topics missing from a document's result count as weight 0.0
            df = pd.DataFrame({
                "date": plot_dates,
                "probability": [weights.get(topic_id, 0.0)
                                for weights in weights_per_document]})

            # Sort and group all dates
            df = df.groupby("date", as_index=False).sum()
            df = df.sort_values(by="date", ascending=True)
            grouped_df = self._group_df(df)

            # Plot graph; colors repeat when there are more topics than
            # available plot colors, labels are 1-based topic numbers.
            ax.plot(grouped_df["date"],
                    grouped_df["probability"],
                    color=plot_colors[topic_id % len(plot_colors)],
                    label=topic_id + 1)

        # Add labels and title to plot (once, on the explicit axes instead
        # of pyplot's implicit current figure)
        ax.set_title(self.name, pad=25)
        ax.set_xlabel("Datum")
        ax.set_ylabel("Som gewichten")
        ax.tick_params(axis="x", labelrotation=30)

        # A legend without any plotted line would only trigger a warning
        if n_topics > 0:
            ax.legend()

        fig.subplots_adjust(left=0.2, bottom=0.2, right=0.8, top=0.8)

        return fig

    @staticmethod
    def _get_no_dates_available_screen() -> matplotlib.figure.Figure:
        """Returns a figure showing a text that there are no dates
        available."""
        fig = plt.figure()
        fig.text(0.5,
                 0.5,
                 "Er zijn geen datums in de dataset om te laten zien",
                 horizontalalignment='center',
                 verticalalignment='center')

        fig.subplots_adjust(
            left=0.15, right=0.85, top=0.85, bottom=0.15)

        # Close exactly this figure in pyplot, not whichever figure happens
        # to be current; the Figure object itself is still returned.
        plt.close(fig)
        return fig

    @staticmethod
    def _get_valid_offsets() -> list[str]:
        """Returns the possible groupings for grouping dates, as pandas
        frequency strings ordered from coarsest to finest."""
        return ["YE", "6ME", "2ME", "ME", "2W", "W", "D", "6h", "h", "min",
                "s"]

    @staticmethod
    def _group_df(df: pd.DataFrame) -> pd.DataFrame:
        """
        Returns a grouped dataframe for the plot over time.

        The valid offsets are tried in order and the first grouping that
        yields at least 12 rows is returned. When no offset reaches 12
        rows, the data is grouped using the "ME" offset.

        :param df: Dataframe with a "date" column and a "probability"
            column.
        :return: Dataframe with the "probability" values summed per group.
        """
        offsets = DocumentsOverTimeCreator._get_valid_offsets()

        # NOTE(review): frequency-based grouping appears to also emit the
        # empty bins between the first and last date -- confirm.
        for offset in offsets:
            new_df = df.groupby([pd.Grouper(key='date', freq=offset)],
                                as_index=False)["probability"].sum()
            if new_df.shape[0] >= 12:
                return new_df

        # No offset produced enough data points; fall back to "ME"
        return df.groupby([pd.Grouper(key='date', freq="ME")],
                          as_index=False)["probability"].sum()


"""
This program has been developed by students from the bachelor Computer Science
at Utrecht University within the Software Project course.
© Copyright Utrecht University
(Department of Information and Computing Sciences)
"""