Source code for tommy.controller.file_import.csv_file_importer

import csv
import os.path
from datetime import datetime
from os import stat
from typing import Generator

from tommy.controller.file_import import file_importer_base
from tommy.controller.file_import.metadata import Metadata
from tommy.controller.file_import.raw_body import RawBody
from tommy.controller.file_import.raw_file import RawFile



[docs]
class CsvFileImporter(file_importer_base.FileImporterBase):
    """
    Handles importing of csv files.

    A CSV file is accepted when its header row contains every entry of
    ``mandatory_fields`` exactly once (headers are compared in lower case).
    Each data row of an accepted file is loaded as one document.
    """
    # Lower-case column names that must occur exactly once in the header row
    # and must hold a non-empty value in every row that is loaded.
    mandatory_fields: list[str] = ['body']


[docs]
    def __init__(self) -> None:
        """
        Initializes a new instance of the class.
        """
        pass



[docs]
    def compatible_file(self, path: str) -> bool:
        """
        A CSV file is compatible with this parser if and only if
        the first row of the CSV file contains all mandatory headers

        :param path: The string path to the CSV file to be checked for
                     compatibility.
        :return: bool: True if the file is compatible, False otherwise.
        """
        if not path.endswith('.csv'):
            return False

        with open(path, 'r', newline="", encoding='utf-8-sig') as csvfile:
            csv_reader = csv.DictReader(csvfile, delimiter=',')

            # To check whether each mandatory header exists and is unique,
            # we keep an array of occurrences of all mandatory headers
            mandatory_fields_counts = [0] * len(self.mandatory_fields)
            for header in csv_reader.fieldnames:
                if header.lower() in self.mandatory_fields:
                    mandatory_fields_counts[self.mandatory_fields.index(
                        header.lower())] += 1

            if mandatory_fields_counts == [1] * len(self.mandatory_fields):
                return True

        missing_headers = [header for count, header
                           in zip(mandatory_fields_counts,
                                  self.mandatory_fields)
                           if count == 0]
        duplicate_headers = [header for count, header
                             in zip(mandatory_fields_counts,
                                    self.mandatory_fields)
                             if count > 1]

        if missing_headers and duplicate_headers:
            raise ValueError(f"CSV bestand mist de volgende verplichte"
                             f" headers: {missing_headers}\n"
                             f"En heeft de volgende duplicate headers: "
                             f"{duplicate_headers}")
        if missing_headers:
            raise ValueError(f"CSV bestand mist de volgende verplichte"
                             f" headers: {missing_headers}")
        if duplicate_headers:
            raise ValueError(f"CSV bestand heeft de volgende duplicate"
                             f" headers: {duplicate_headers}")



[docs]
    def load_file(self, path: str) -> Generator[RawFile, None, None]:
        """
        Loads a CSV file and yields File objects.

        :param path: The string path to the CSV file.
        :return: File: A File object generated from each row of the CSV.
        """
        with open(path, 'r', newline="", encoding='utf-8-sig') as csvfile:
            reader = csv.DictReader(csvfile)
            reader.fieldnames = [str(header).lower() for header in
                                 reader.fieldnames]
            row: dict

            row_index = 1  # Only used for debugging
            errors = []
            for row in reader:
                # Remove empty fields
                for key, value in row.items():
                    if (not isinstance(value, str) or value == "" or
                            value.isspace()):
                        row[key] = None
                try:
                    correct_date_format, file = self.generate_file(row, path,
                                                                   row_index)
                    yield file
                    if not correct_date_format:
                        errors.append(
                            SyntaxWarning(
                                f"De datum van document {row_index} kon niet "
                                f"worden geïnterpreteerd: '{row.get('date')}'."
                                f" Dit bestand is zonder datum ingeladen."))
                except Exception as e:
                    errors.append(e)
                row_index += 1

        if errors:
            if len(errors) == 1:
                raise errors[0]
            else:
                raise ExceptionGroup("Er zijn meerdere fouten opgetreden "
                                     "bij het laden van het bestand: ", errors)



[docs]
    def generate_file(self, file: dict, path: str, row_index: int) -> (
            tuple[bool, RawFile]):
        """
        Generates a File object from a CSV row.

        :param file: A dictionary representing a row of CSV data.
        :param path: The string path to the CSV file.
        :param row_index: The index of the row in the csv file. Used for
        debugging and error presentation to the user.
        :return: A tuple of a boolean and a RawFile object. The boolean is
        False if the datetime could not be parsed, and True if the datetime
        was successfully parsed or if the datetime does not exist for that
        document. A RawFile object generated from the CSV row containing
        metadata and the raw text of the file.
        """
        for key in self.mandatory_fields:
            if file.get(key) is None:
                raise KeyError(f"De kolom '{key}' is verplicht, maar is niet "
                               f"gevonden voor document {row_index}")

        file_date_str: str = file.get("date")
        file_date: datetime
        correct_date_format: bool
        if file_date_str is None:
            file_date = None
            correct_date_format = True
        else:
            file_date = self.parse_date(file_date_str)
            correct_date_format = file_date is not None
        dict_title = file.get("title")
        alt_title = os.path.basename(path).replace('.csv', '')
        file_title = alt_title if dict_title is None else dict_title
        return correct_date_format, RawFile(
            metadata=Metadata(author=file.get("author"),
                              title=file_title, date=file_date,
                              url=file.get("url"), path=os.path.relpath(path),
                              format="csv",
                              length=len(file.get("body").split(" ")),
                              name=os.path.relpath(path).split(".")[0],
                              size=stat(path).st_size),
            body=RawBody(body=file.get("body").strip()))




"""
This program has been developed by students from the bachelor Computer Science
at Utrecht University within the Software Project course.
© Copyright Utrecht University
(Department of Information and Computing Sciences)
"""