Source code for cvfe.data.pdf

__all__ = ["PDFIO", "XFAPDF", "CanadaXFA"]

import re
import xml.etree.ElementTree as et
from enum import Enum
from typing import Any

import pypdf

from cvfe.data import functional
from cvfe.data.constant import DocTypes



[docs]
class PDFIO:
    """Base class for dealing with PDF files

    For each type of PDF, let's say XFA files, one needs to extend
    this class and override abstract methods like :func:`extract_raw_content`
    to generate a string of the content of the PDF in a format that
    can be used by the other classes (e.g. `XML`). For instance,
    see :class:`XFAPDF` for the extension of this class.

    """

    def __init__(self) -> None:
        pass

    def extract_raw_content(self, pdf_path: str) -> str:
        """Extracts unprocessed data from a PDF file

        Args:
            pdf_path (str): Path to the pdf file

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """

        raise NotImplementedError

    def find_in_dict(self, needle: Any, haystack: Any) -> Any:
        """Looks for the value of a key inside a nested dictionary

        The search is depth-first and follows the iteration order of each
        dictionary; the first match wins. Only values that are ``dict``
        instances are descended into.

        Args:
            needle (Any): Key to look for
            haystack (Any): Dictionary to look in. Can be a dict inside
                another dict

        Returns:
            Any: The value of key ``needle``, or ``None`` if the key is not
            found (or is only found with a ``None`` value in nested dicts).
        """
        for key in haystack:
            try:
                value = haystack[key]
            except Exception:
                # Best-effort lookup: an entry that cannot be read is skipped.
                # ``Exception`` (not a bare ``except``) so that
                # KeyboardInterrupt/SystemExit still propagate.
                continue
            if key == needle:
                return value
            if isinstance(value, dict):
                found = self.find_in_dict(needle, value)
                if found is not None:
                    return found
        return None





[docs]
class XFAPDF(PDFIO):
    """Contains functions and utility tools for dealing with XFA PDF documents."""

    def __init__(self) -> None:
        super().__init__()

    def extract_raw_content(self, pdf_path: str) -> str:
        """Extracts RAW content of XFA PDF files which are in XML format

        Args:
            pdf_path (str): path to the pdf file

        Reference:

            * https://towardsdatascience.com/how-to-extract-data-from-pdf-forms-using-python-10b5e5f26f70

        Raises:
            AttributeError: If no ``/XFA`` entry is found (the lookup
                returns ``None``).
            ValueError: If the XFA array has no ``datasets`` entry.

        Returns:
            str: XFA object of the pdf file in XML format. Note that this is
            ``str()`` applied to a ``bytes`` object, i.e. the text still
            carries the ``b'...'`` wrapper and escaped newlines; see
            :func:`clean_xml_for_csv` for the cleanup.
        """

        # The reader pulls objects from the stream on demand, so every access
        # to the PDF must happen while the file is still open.
        with open(pdf_path, "rb") as pdf_file:
            pdf = pypdf.PdfReader(stream=pdf_file, strict=True)
            xfa = self.find_in_dict("/XFA", pdf.resolved_objects)
            # `datasets` keyword contains filled forms in XFA array; the
            # stream holding the data is the element right after the keyword
            raw = xfa[xfa.index("datasets") + 1].get_object().get_data()

        # Kept as `str(bytes)` on purpose: downstream cleaning strips the
        # artifacts of this exact representation.
        return str(raw)

    def clean_xml_for_csv(self, xml: str, type: Enum) -> str:
        """Cleans the XML file extracted from XFA forms

        Since each form has its own format and issues, this method needs
        to be implemented uniquely for each unique file/form which needs
        to be specified using argument ``type`` that can be populated from
        :class:`DocTypes <cvfe.data.constant.DocTypes>`.

        Args:
            xml (str): XML content
            type (Enum): type of the document defined
                in :class:`DocTypes <cvfe.data.constant.DocTypes>`

        Raises:
            NotImplementedError: Always; subclasses must override this method.

        Returns:
            str: cleaned XML content to be used in CSV file
        """

        raise NotImplementedError

    def flatten_dict_basic(self, d: dict) -> dict:
        """Takes a (nested) dictionary and flattens it

        Keys of nested dictionaries are joined to their parent key with a
        ``.`` (so keys of nested dictionaries must be strings); non-dict
        values are kept as they are.

        Args:
            d (dict): A dictionary

        References:

            * https://stackoverflow.com/questions/38852822/how-to-flatten-xml-file-in-python

        Returns:
            dict: A flattened dictionary
        """
        flat: dict = {}
        for key, value in d.items():
            if isinstance(value, dict):
                # Recurse first, then prefix every flattened child key
                for sub_key, sub_value in self.flatten_dict_basic(value).items():
                    flat[key + "." + sub_key] = sub_value
            else:
                flat[key] = value
        return flat

    def flatten_dict(self, d: dict) -> dict:
        """Takes a (nested) multilevel dictionary and flattens it

        The final keys are ``key.key...`` and values are the leaf values of dictionary

        Args:
            d (dict): A dictionary

        References:

            * https://stackoverflow.com/a/67744709/18971263

        Returns:
            dict: A flattened dictionary
        """
        return functional.flatten_dict(dictionary=d)

    def xml_to_flattened_dict(self, xml: str) -> dict:
        """Takes a (nested) XML and converts it to a flattened dictionary

        The final keys are ``key.key...`` and values are the leaf values of XML tree

        Args:
            xml (str): A XML string

        Returns:
            dict: A flattened dictionary
        """
        return functional.xml_to_flattened_dict(xml=xml)





[docs]
class CanadaXFA(XFAPDF):
    """Handles Canada XFA PDF files"""

    def __init__(self) -> None:
        super().__init__()

    @staticmethod
    def _strip_bytes_repr_artifacts(xml: str) -> str:
        """Removes the leftovers of a stringified ``bytes`` object from ``xml``

        Drops the leading ``b'\\n`` marker, every single quote and every
        literal backslash-``n`` sequence.
        """
        xml = re.sub(r"b'\\n", "", xml)
        xml = re.sub(r"'", "", xml)
        xml = re.sub(r"\\n", "", xml)
        return xml

    def clean_xml_for_csv(self, xml: str, type: Enum) -> str:
        """Hardcoded cleaning of Canada XFA XML files to be XML compatible with CSV

        Args:
            xml (str): XML content
            type (Enum): type of the document defined
                in :class:`DocTypes <cvfe.data.constant.DocTypes>`

        Returns:
            str: cleaned XML content to be used in CSV file. For any ``type``
            other than ``CANADA_5257E`` and ``CANADA_5645E`` the input is
            returned unchanged.
        """
        if type == DocTypes.CANADA_5257E:
            # remove bad characters
            xml = self._strip_bytes_repr_artifacts(xml)

            # remove 9000 lines of redundant info for '5257e' doc; forms
            # without a top-level `LOVFile` element are left as they are
            root = et.fromstring(xml)
            lov_file = root.find("LOVFile")
            if lov_file is not None:
                root.remove(lov_file)

            # NOTE(review): `str()` on the serialized bytes yields their repr
            # (``b'...'``), which is why the declaration, quotes and escaped
            # newlines have to be stripped again below. Kept as-is so the
            # output format does not change for downstream consumers.
            xml = str(et.tostring(root, encoding="utf8", method="xml"))
            # parsing through ElementTree adds bad characters too
            xml = re.sub(r"b'<\?xml version=\\'1.0\\' encoding=\\'utf8\\'\?>", "", xml)
            xml = re.sub(r"'", "", xml)
            xml = re.sub(r"\\n[ ]*", "", xml)

        elif type == DocTypes.CANADA_5645E:
            # remove bad characters
            xml = self._strip_bytes_repr_artifacts(xml)

        return xml