from datetime import datetime
import matplotlib.dates
import matplotlib.figure
import pandas as pd
from matplotlib import pyplot as plt
from tommy.controller.result_interfaces.document_topics_interface import (
DocumentTopicsInterface)
from tommy.controller.topic_modelling_runners.abstract_topic_runner import (
TopicRunner)
from tommy.controller.visualizations.abstract_visualization import (
AbstractVisualization)
from tommy.controller.visualizations.possible_visualization import VisGroup
from tommy.controller.visualizations.visualization_input_datatypes import (
VisInputData, ProcessedCorpus)
from tommy.support.constant_variables import plot_colors
class DocumentsOverTimeCreator(AbstractVisualization):
    """
    Visualization plotting, for every topic, the summed document-topic
    weights over time.

    Only documents whose metadata has a date contribute to the plot. The
    time axis is binned by ``_group_df``, which picks the coarsest
    frequency that still yields enough data points.
    """
    _required_interfaces = [TopicRunner, DocumentTopicsInterface]
    name = "Documenten over tijd"
    short_tab_name = "Doc. over tijd"
    vis_group = VisGroup.MODEL
    needed_input_data = [VisInputData.PROCESSED_CORPUS]

    # Minimum number of rows a grouping must produce before it is used
    _min_data_points = 12

    def _create_figure(self,
                       topic_runner: TopicRunner | DocumentTopicsInterface,
                       processed_corpus: ProcessedCorpus = None,
                       **kwargs
                       ) -> matplotlib.figure.Figure:
        """
        Construct a document over time plot for the given topic runner and
        return it as a matplotlib figure.

        :param topic_runner: The topic model to construct the plot for. This
            should implement the DocumentTopicsInterface
        :param processed_corpus: The preprocessed corpus containing all files
            as bags of words after preprocessing. ``None`` is treated as an
            empty corpus.
        :return: Matplotlib figure showing a topic over time plot for all
            topics, or a figure with an explanatory text when no document
            in the corpus has a date.
        """
        dated_documents = self._collect_dated_topic_weights(
            topic_runner, processed_corpus)

        # Decide this before creating the plot, so that no unused figure
        # is left registered in pyplot when there is nothing to show
        if not dated_documents:
            return self._get_no_dates_available_screen()

        document_dates = [date for date, _ in dated_documents]

        # Construct a plot and axes
        fig, ax = plt.subplots()

        for topic_id in range(topic_runner.get_n_topics()):
            # A topic that is not reported for a document counts as 0.0
            df = pd.DataFrame({
                "date": document_dates,
                "probability": [weights.get(topic_id, 0.0)
                                for _, weights in dated_documents],
            })

            # Sum the weights of documents sharing a date, sort them
            # chronologically and bin them for plotting
            df = df.groupby("date", as_index=False).sum()
            df = df.sort_values(by="date", ascending=True)
            grouped_df = self._group_df(df)

            # Plot graph; topics are labelled starting from 1
            ax.plot(grouped_df["date"],
                    grouped_df["probability"],
                    color=plot_colors[topic_id % len(plot_colors)],
                    label=topic_id + 1)

        # Add labels and title to plot
        plt.title(self.name, pad=25)
        plt.xlabel("Datum")
        plt.ylabel("Som gewichten")
        plt.xticks(rotation=30)
        ax.legend()

        fig.subplots_adjust(left=0.2, bottom=0.2, right=0.8, top=0.8)

        return fig

    @staticmethod
    def _collect_dated_topic_weights(
            topic_runner: TopicRunner | DocumentTopicsInterface,
            processed_corpus: ProcessedCorpus
    ) -> list[tuple[datetime, dict]]:
        """
        Query the topic weights once for every document that has a date.

        :param topic_runner: The topic model used to get the topics of
            each document.
        :param processed_corpus: The corpus to iterate over, or ``None``.
        :return: One ``(date, weights)`` pair per dated document, in corpus
            order, where ``weights`` maps a topic id to its weight in that
            document.
        """
        if processed_corpus is None:
            return []

        dated_documents = []
        for document in processed_corpus:
            if document.metadata.date is None:
                continue

            # Normalize the date to a datetime at midnight
            current_date = datetime.combine(document.metadata.date,
                                            datetime.min.time())
            topics = topic_runner.get_document_topics(document.body.body,
                                                      0.0)

            # Keep the first entry reported for each topic id
            weights = {}
            for entry in topics:
                weights.setdefault(entry[0], entry[1])

            dated_documents.append((current_date, weights))

        return dated_documents

    @staticmethod
    def _get_no_dates_available_screen() -> matplotlib.figure.Figure:
        """Returns a figure showing a text that there are no dates
        available."""
        fig = plt.figure()
        fig.text(0.5,
                 0.5,
                 "Er zijn geen datums in de dataset om te laten zien",
                 horizontalalignment='center',
                 verticalalignment='center')
        fig.subplots_adjust(left=0.15, right=0.85, top=0.85, bottom=0.15)

        # Remove the figure from pyplot's registry; the figure object
        # itself is still returned to the caller
        plt.close(fig)
        return fig

    @staticmethod
    def _get_valid_offsets() -> list[str]:
        """Returns the possible groupings for grouping dates, ordered from
        the coarsest to the finest frequency"""
        return ["YE", "6ME", "2ME", "ME", "2W", "W", "D", "6h", "h", "min",
                "s"]

    @staticmethod
    def _group_df(df: pd.DataFrame) -> pd.DataFrame:
        """
        Returns a grouped dataframe for the plot over time.

        The valid offsets are tried in order and the first one whose
        grouping has at least ``_min_data_points`` rows is used. When no
        offset qualifies, the data is grouped per month end.

        :param df: Dataframe with a ``date`` and a ``probability`` column.
        :return: The dataframe grouped on ``date`` with the ``probability``
            values of each group summed.
        """
        minimum_rows = DocumentsOverTimeCreator._min_data_points

        for offset in DocumentsOverTimeCreator._get_valid_offsets():
            new_df = df.groupby([pd.Grouper(key='date', freq=offset)],
                                as_index=False)["probability"].sum()
            if len(new_df) >= minimum_rows:
                return new_df

        return df.groupby([pd.Grouper(key='date', freq="ME")],
                          as_index=False)["probability"].sum()
"""
This program has been developed by students from the bachelor Computer Science
at Utrecht University within the Software Project course.
© Copyright Utrecht University
(Department of Information and Computing Sciences)
"""