Source code for tommy.model.corpus_model

from __future__ import annotations

from copy import copy

from gensim.corpora import Dictionary

from tommy.controller.file_import.metadata import Metadata
from tommy.controller.file_import.processed_corpus import ProcessedCorpus


class CorpusModel:
    """
    Store the data about the documents in the input folder.

    The model is only meant to be accessed through the CorpusController
    class. The raw corpus data is not stored as it wouldn't fit in memory;
    the processed corpus is stored in the ProcessedCorpus class.
    """

    # Class-level defaults: an instance that was not derived from another
    # model reads ``None`` for both attributes until they are assigned.
    metadata: list[Metadata] | None = None
    dictionary: Dictionary | None = None
    # Declared only; every instance receives its own value in ``__init__``.
    processed_corpus: ProcessedCorpus

    def __init__(self, derive_from: CorpusModel | None = None) -> None:
        """
        Initialize the corpus model with an empty ProcessedCorpus, so files
        can be added to the processed corpus after pre-processing.

        :param derive_from: Optional model to take the metadata and the
            dictionary from. Its processed corpus is not carried over.
        """
        # A fresh processed corpus is always created, even when deriving.
        self.processed_corpus = ProcessedCorpus()

        if derive_from is not None:
            # ``copy`` is shallow: the new model gets its own list and
            # dictionary objects, but the elements inside are shared with
            # ``derive_from``.
            self.metadata = copy(derive_from.metadata)
            self.dictionary = copy(derive_from.dictionary)
""" This program has been developed by students from the bachelor Computer Science at Utrecht University within the Software Project course. © Copyright Utrecht University (Department of Information and Computing Sciences) """