Source code for tommy.controller.visualizations.document_word_count_creator

import matplotlib.figure
from matplotlib import pyplot as plt
from matplotlib.ticker import MaxNLocator, AutoMinorLocator

from tommy.controller.topic_modelling_runners.abstract_topic_runner import (
    TopicRunner)
from tommy.controller.visualizations.abstract_visualization import (
    AbstractVisualization)
from tommy.controller.visualizations.possible_visualization import VisGroup
from tommy.controller.visualizations.visualization_input_datatypes import (
    VisInputData, MetadataCorpus)
from tommy.support.constant_variables import prim_col_red


class DocumentWordCountCreator(AbstractVisualization):
    """
    Build a histogram of the number of words per document.

    The histogram is returned as a matplotlib figure.
    """
    # This visualization lists no required topic-runner interfaces.
    _required_interfaces = []
    # Display strings (Dutch); they are runtime text and must stay as-is.
    name = 'Distributie aantal woorden per document'
    short_tab_name = 'Woordaantal'
    vis_group = VisGroup.CORPUS
    # Only the corpus metadata is requested as input.
    needed_input_data = [VisInputData.METADATA_CORPUS]

    def _create_figure(self,
                       topic_runner: TopicRunner,
                       metadata_corpus: MetadataCorpus = None,
                       **kwargs) -> matplotlib.figure.Figure:
        """
        Construct a histogram of the word count of every document in the
        given corpus.

        :param topic_runner: The topic runner; it is not read by this
            visualization.
        :param metadata_corpus: The metadata of the corpus, iterated to
            obtain one length value per document.
        :param kwargs: Additional keyword arguments; ignored here.
        :return: matplotlib figure showing a word count histogram
        :raises ValueError: If the metadata_corpus argument is None
        """
        if metadata_corpus is None:
            raise ValueError("Metadata Corpus keyword argument is necessary in"
                             " the document_word_count_creator")

        # NOTE(review): assumes every item yielded by the corpus exposes a
        # numeric ``length`` attribute holding its word count -- confirm
        # against MetadataCorpus.
        document_counts = [document.length for document in metadata_corpus]

        fig, ax = plt.subplots()
        try:
            # Draw through the explicit Axes/Figure objects instead of the
            # implicit pyplot "current figure", so the result cannot be
            # affected by other figures that happen to be open.
            ax.hist(document_counts, bins=150, color=f"{prim_col_red}")

            # Add margins and labels to the plot
            ax.margins(x=0.02)
            ax.set_xlabel("Aantal woorden per document")
            ax.set_ylabel("Aantal documenten")
            ax.set_title("Distributie aantal woorden per document", pad=25)

            # Use MaxNLocator to keep the number of major ticks manageable
            # and restricted to integer positions
            ax.xaxis.set_major_locator(MaxNLocator(integer=True, nbins=10))
            ax.yaxis.set_major_locator(MaxNLocator(integer=True, nbins=10))

            # Use AutoMinorLocator to add minor ticks
            ax.xaxis.set_minor_locator(AutoMinorLocator())
            ax.yaxis.set_minor_locator(AutoMinorLocator())

            # Rotate the x tick labels to prevent overlapping
            ax.tick_params(axis="x", labelrotation=30)

            fig.subplots_adjust(
                left=0.15, right=0.85, top=0.85, bottom=0.15)
        finally:
            # Unregister exactly this figure from pyplot, also when plotting
            # fails, so figures do not pile up in pyplot's global registry.
            # The Figure object itself stays usable by the caller.
            plt.close(fig)

        return fig
""" This program has been developed by students from the bachelor Computer Science at Utrecht University within the Software Project course. © Copyright Utrecht University (Department of Information and Computing Sciences) """