Source code for tommy.controller.file_import.pdf_file_importer

import os.path
from os import stat
from typing import Generator

from pypdf import PdfReader, DocumentInformation

from tommy.controller.file_import import file_importer_base
from tommy.controller.file_import.metadata import Metadata
from tommy.controller.file_import.raw_body import RawBody
from tommy.controller.file_import.raw_file import RawFile


[docs] class PdfFileImporter(file_importer_base.FileImporterBase): """ Handles importing of PDF files """
[docs] def __init__(self) -> None: """ Initializes a new instance of the class. """ pass
[docs] def compatible_file(self, path: str) -> bool: """ A PDF file is compatible with this parser if and only if the file extension is .pdf and the file is not corrupted. :param path: The string path to the PDF file to be checked for compatibility. :return: bool: True if the file is compatible, False otherwise. """ if not path.endswith('.pdf'): return False with open(path, 'rb') as file: PdfReader(file) return True
[docs] def load_file(self, path: str) -> Generator[RawFile, None, None]: """ Loads a PDF file and yields File objects. :param path: The string path to the PDF file. :return: File: A File object generated from each page of the PDF. """ try: with open(path, 'rb') as file: pdf = PdfReader(file) metadata = pdf.metadata # Bundle all pages together into one document text = '' for page in pdf.pages: text += page.extract_text() except Exception as e: raise Exception(f"kon niet correct gelezen worden: {e}") yield self.generate_file(text, path, metadata)
[docs] @staticmethod def generate_file(file: str, path: str, metadata: DocumentInformation) -> ( RawFile): """ Generates a File object from a PDF page. :param file: A string representing a page of PDF data. :param path: The string path to the PDF file. :param metadata: The metadata of the PDF file. :return: A RawFile object generated from the PDF page containing metadata and the raw text of the file. """ alt_title = os.path.basename(path).replace('.pdf', '') try: date = metadata.creation_date except Exception: # If unable to get the modification time, don't set a date date = None return RawFile( metadata=Metadata(author=metadata.get('/Author', None), title=metadata.get('/Title', alt_title), date=date, path=os.path.relpath(path), format="pdf", length=len(file.split(" ")), name=alt_title, size=stat(path).st_size), body=RawBody(body=file))
""" This program has been developed by students from the bachelor Computer Science at Utrecht University within the Software Project course. © Copyright Utrecht University (Department of Information and Computing Sciences) """